%pylab inline
import scipy.stats as stats

figsize(12.5, 9)
norm_pdf = stats.norm.pdf

plt.subplot(311)
x = np.linspace(0, 60000, 200)
sp1 = plt.fill_between(x, 0, norm_pdf(x, 35000, 7500),
                       color="#348ABD", lw=3, alpha=0.6,
                       label="historical total prices")
p1 = Rectangle((0, 0), 1, 1, fc=sp1.get_facecolor()[0])
plt.legend([p1], [sp1.get_label()])

plt.subplot(312)
x = np.linspace(0, 10000, 200)
sp2 = plt.fill_between(x, 0, norm_pdf(x, 3000, 500),
                       color="#A60628", lw=3, alpha=0.6,
                       label="snowblower price guess")
p2 = Rectangle((0, 0), 1, 1, fc=sp2.get_facecolor()[0])
plt.legend([p2], [sp2.get_label()])

plt.subplot(313)
x = np.linspace(0, 25000, 200)
sp3 = plt.fill_between(x, 0, norm_pdf(x, 12000, 3000),
                       color="#7A68A6", lw=3, alpha=0.6,
                       label="Trip price guess")
plt.autoscale(tight=True)
p3 = Rectangle((0, 0), 1, 1, fc=sp3.get_facecolor()[0])
plt.legend([p3], [sp3.get_label()])

import pymc as mc

data_mu = [3e3, 12e3]
data_std = [5e2, 3e3]

mu_prior = 35e3
std_prior = 75e2
true_price = mc.Normal("true_price", mu_prior, 1.0/std_prior**2)

prize_1 = mc.Normal("first_prize", data_mu[0], 1.0/data_std[0]**2)
prize_2 = mc.Normal("second_prize", data_mu[1], 1.0/data_std[1]**2)
price_estimate = prize_1 + prize_2

@mc.potential
def error(true_price=true_price, price_estimate=price_estimate):
    return mc.normal_like(true_price, price_estimate, 1/(3e3)**2)

model = mc.Model([true_price, prize_1, prize_2, price_estimate, error])
mcmc = mc.MCMC(model)
mcmc.sample(50000, 10000)

price_trace = mcmc.trace("true_price")[:]

figsize(12.5, 4)

x = np.linspace(5000, 40000)
plt.plot(x, stats.norm.pdf(x, 35000, 7500), c="k", lw=2,
         label="prior dist. of suite price")

_hist = plt.hist(price_trace, bins=35, normed=True, histtype="stepfilled")
plt.title("Posterior of the true price estimate")
plt.vlines(mu_prior, 0, 1.1*np.max(_hist[0]),
           label="prior's mean", linestyles="--")
plt.vlines(price_trace.mean(), 0, 1.1*np.max(_hist[0]),
           label="posterior's mean", linestyles="-.")
plt.legend(loc="upper left")

figsize(12.5, 7)

# numpy-friendly showdown_loss
def showdown_loss(guess, true_price, risk=80000):
    loss = np.zeros_like(true_price)
    ix = true_price < guess
    loss[~ix] = np.abs(guess - true_price[~ix])
    close_mask = abs(true_price - guess) <= 250
    loss[close_mask] = -2*true_price[close_mask]
    loss[ix] = risk
    return loss

guesses = np.linspace(5000, 50000, 70)
risks = np.linspace(30000, 150000, 6)
expected_loss = lambda guess, risk: \
    showdown_loss(guess, price_trace, risk).mean()

for _p in risks:
    results = [expected_loss(_g, _p) for _g in guesses]
    plt.plot(guesses, results, label="%d" % _p)

plt.title("Expected loss of different guesses, \nvarious risk-levels of overestimating")
plt.legend(loc="upper left", title="Risk parameter")
plt.xlabel("price bid")
plt.ylabel("expected loss")
plt.xlim(5000, 30000);

import scipy.optimize as sop

ax = plt.subplot(111)
for _p in risks:
    _color = ax._get_lines.color_cycle.next()
    _min_results = sop.fmin(expected_loss, 15000, args=(_p,), disp=False)
    _results = [expected_loss(_g, _p) for _g in guesses]
    plt.plot(guesses, _results, color=_color)
    plt.scatter(_min_results, 0, s=60,
                color=_color, label="%d" % _p)
    plt.vlines(_min_results, 0, 120000, color=_color, linestyles="--")
    print "minimum at risk %d: %.2f" % (_p, _min_results)

plt.title("Expected loss & Bayes actions of different guesses, \nvarious risk-levels of overestimating")
plt.legend(loc="upper left", scatterpoints=1,
           title="Bayes action at risk:")
plt.xlabel("price guess")
plt.ylabel("expected loss")
plt.xlim(7000, 30000)
plt.ylim(-1000, 80000)
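# Optional sanity check (a minimal sketch, not part of the original analysis;
# it reuses price_trace and sop from above): under plain squared-error loss the
# Bayes action is simply the posterior mean, so a numerical minimizer should
# land very close to price_trace.mean().
squared_loss = lambda guess: ((guess - price_trace)**2).mean()
print "squared-error Bayes action:", sop.fmin(squared_loss, 15000, disp=False)
print "posterior mean:            ", price_trace.mean()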
title = "Bayes action at risk:") plt.xlabel("price guess") plt.ylabel("expected loss") plt.xlim( 7000, 30000 ) plt.ylim( -1000, 80000) figsize(12.5, 4) def stock_loss( true_return, yhat, alpha = 100. ): if true_return*yhat < 0: #opposite signs, not good return alpha*yhat**2 - sign( true_return )*yhat \ + abs( true_return ) else: return abs( true_return - yhat ) true_value = .05 pred = np.linspace( -.04, .12, 75 ) plt.plot( pred, [stock_loss( true_value, _p) for _p in pred], \ label = "Loss associated with\n prediction if true value = 0.05", lw =3 ) plt.vlines( 0, 0, .25, linestyles="--" ) plt.xlabel( "prediction") plt.ylabel( "loss" ) plt.xlim( -0.04, .12 ) plt.ylim( 0, 0.25) true_value = -.02 plt.plot( pred, [stock_loss( true_value, _p) for _p in pred], alpha = 0.6, \ label = "Loss associated with\n prediction if true value = -0.02", lw =3) plt.legend() plt.title("Stock returns loss if true value = 0.05, -0.02" ) ## Code to create artificial data N = 100 X = 0.025*np.random.randn(N ) Y = 0.5*X + 0.01*np.random.randn( N) ls_coef_ = np.cov( X, Y )[0,1]/np.var(X) ls_intercept = Y.mean() - ls_coef_*X.mean() plt.scatter( X, Y, c="k") plt.xlabel("trading signal") plt.ylabel("returns") plt.title( "Empirical returns vs trading signal" ) plt.plot( X, ls_coef_*X + ls_intercept, label = "Least-squares line") plt.xlim( X.min(), X.max()) plt.ylim( Y.min(), Y.max() ) plt.legend( loc="upper left" ) import pymc as mc from pymc.Matplot import plot as mcplot std = mc.Uniform( "std", 0, 100, trace = False ) #this needs to be explained. @mc.deterministic def prec( U = std ): return 1.0/( U )**2 beta = mc.Normal( "beta", 0, 0.0001 ) alpha = mc.Normal( "alpha", 0, 0.0001 ) @mc.deterministic def mean( X = X, alpha = alpha, beta = beta ): return alpha + beta*X obs = mc.Normal( "obs", mean, prec, value = Y, observed = True) model = mc.Model( {"obs":obs, "beta_0":beta, "alpha_0":alpha, "prec":prec} ) mcmc = mc.MCMC( model ) mcmc.sample( 100000, 80000) mcplot( mcmc ) figsize( 12.5, 6 ) from scipy.optimize import fmin def stock_loss( price, pred, coef = 500): """vectorized for numpy""" sol = np.zeros_like(price) ix = price*pred < 0 sol[ix] = coef*pred**2 - sign(price[ix])*pred + abs(price[ix]) sol[ ~ix ] = abs( price[~ix] - pred ) return sol tau_samples = mcmc.trace( "prec" )[:] alpha_samples = mcmc.trace( "alpha" )[:] beta_samples = mcmc.trace( "beta" )[:] N = tau_samples.shape[0] noise = 1./np.sqrt(tau_samples)*np.random.randn(N) possible_outcomes = lambda signal: alpha_samples + beta_samples*signal \ + noise opt_predictions = np.zeros(50) trading_signals = np.linspace( X.min(), X.max(), 50 ) for i, _signal in enumerate( trading_signals ): _possible_outcomes = possible_outcomes( _signal ) tomin = lambda pred: stock_loss( _possible_outcomes, pred).mean() opt_predictions[i] = fmin( tomin, 0, disp = False ) plt.xlabel("trading signal") plt.ylabel("prediction") plt.title( "Least-squares prediction vs. Bayes action prediction" ) plt.plot( X, ls_coef_*X + ls_intercept, label ="Least-squares prediction") plt.xlim( X.min(), X.max()) plot( trading_signals, opt_predictions, label ="Bayes action prediction") plt.legend( loc="upper left" ) from draw_sky2 import draw_sky n_sky = 3 #choose a file/sky to examine. 
from draw_sky2 import draw_sky

n_sky = 3  # choose a file/sky to examine.
data = np.genfromtxt("data/Train_Skies/Train_Skies/\
Training_Sky%d.csv" % (n_sky),
                     dtype=None,
                     skip_header=1,
                     delimiter=",",
                     usecols=[1, 2, 3, 4])
print "Data on galaxies in sky %d." % n_sky
print "position_x, position_y, e_1, e_2 "
print data[:3]

fig = draw_sky(data)
plt.title("Galaxy positions and ellipticities of sky %d." % n_sky)
plt.xlabel("x-position")
plt.ylabel("y-position");

def euclidean_distance(x, y):
    return np.sqrt(((x - y)**2).sum(axis=1))

def f_distance(gxy_pos, halo_pos, c):
    # foo_position should be a 2-d numpy array
    return np.maximum(euclidean_distance(gxy_pos, halo_pos), c)[:, None]

def tangential_distance(glxy_position, halo_position):
    # foo_position should be a 2-d numpy array
    delta = glxy_position - halo_position
    t = (2*np.arctan(delta[:, 1]/delta[:, 0]))[:, None]
    return np.concatenate([-np.cos(t), -np.sin(t)], axis=1)

import pymc as mc

# set the size of the halo's mass
mass_large = mc.Uniform("mass_large", 40, 180, trace=False)

# set the initial prior position of the halo; it's a 2-d Uniform dist.
halo_position = mc.Uniform("halo_position", 0, 4200, size=(1, 2))

@mc.deterministic
def mean(mass=mass_large, h_pos=halo_position, glx_pos=data[:, :2]):
    return mass/f_distance(glx_pos, h_pos, 240)*\
        tangential_distance(glx_pos, h_pos)

ellpty = mc.Normal("ellipcity", mean, 1./0.05, observed=True,
                   value=data[:, 2:])
model = mc.Model([ellpty, mean, halo_position, mass_large])
mcmc = mc.MCMC(model)
map_ = mc.MAP(model)
map_.fit()
mcmc.sample(200000, 140000, 3)

t = mcmc.trace("halo_position")[:].reshape(20000, 2)

fig = draw_sky(data)
plt.title("Galaxy positions and ellipticities of sky %d." % n_sky)
plt.xlabel("x-position")
plt.ylabel("y-position")
plt.scatter(t[:, 0], t[:, 1], alpha=0.015, c="r")
plt.xlim(0, 4200)
plt.ylim(0, 4200);

halo_data = np.genfromtxt("data/Training_halos.csv",
                          delimiter=",",
                          usecols=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                          skip_header=1)
print halo_data[n_sky-1]

fig = draw_sky(data)
plt.title("Galaxy positions and ellipticities of sky %d." % n_sky)
plt.xlabel("x-position")
plt.ylabel("y-position")
plt.scatter(t[:, 0], t[:, 1], alpha=0.015, c="r")
plt.scatter(halo_data[n_sky-1][3], halo_data[n_sky-1][4],
            label="True halo position",
            c="k", s=70)
plt.legend(scatterpoints=1, loc="lower left")
plt.xlim(0, 4200)
plt.ylim(0, 4200);

print "True halo location:", halo_data[n_sky-1][3], halo_data[n_sky-1][4]

mean_posterior = t.mean(axis=0).reshape(1, 2)
print mean_posterior

from DarkWorldsMetric import main_score

_halo_data = halo_data[n_sky-1]

nhalo_all = _halo_data[0].reshape(1, 1)
x_true_all = _halo_data[3].reshape(1, 1)
y_true_all = _halo_data[4].reshape(1, 1)
x_ref_all = _halo_data[1].reshape(1, 1)
y_ref_all = _halo_data[2].reshape(1, 1)
sky_prediction = mean_posterior

print "Using the mean:"
main_score(nhalo_all, x_true_all, y_true_all,
           x_ref_all, y_ref_all, sky_prediction)

# what's a bad score?
print
random_guess = np.random.randint(0, 4200, size=(1, 2))
print "Using a random location:", random_guess
main_score(nhalo_all, x_true_all, y_true_all,
           x_ref_all, y_ref_all, random_guess)
print
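# Optional sanity check (a minimal sketch, not in the original; it reuses
# mean_posterior, x_true_all and y_true_all from above): how far, in sky
# coordinates, is the posterior-mean estimate from the true halo position?
_true_pos = np.array([x_true_all[0, 0], y_true_all[0, 0]])
print "distance from posterior mean to true halo:", \
    np.sqrt(((mean_posterior[0] - _true_pos)**2).sum())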
from pymc.Matplot import plot as mcplot

def halo_posteriors(n_halos_in_sky, galaxy_data,
                    samples=5e5, burn_in=34e4, thin=4):
    # set the size of the halo's mass
    """
    exp_mass_large = mc.Uniform("exp_mass_large", 40, 180)

    @mc.deterministic
    def mass_large(exp_mass_large=exp_mass_large):
        return np.log(exp_mass_large)
    """
    mass_large = mc.Uniform("mass_large", 40, 180)

    mass_small_1 = 20
    mass_small_2 = 20

    masses = np.array([mass_large, mass_small_1, mass_small_2], dtype=object)

    # set the initial prior positions of the halos; it's a 2-d Uniform dist.
    halo_positions = mc.Uniform("halo_positions", 0, 4200,
                                size=(n_halos_in_sky, 2))  # notice this size

    fdist_constants = np.array([240, 70, 70])

    @mc.deterministic
    def mean(mass=masses, h_pos=halo_positions, glx_pos=galaxy_data[:, :2],
             n_halos_in_sky=n_halos_in_sky):
        _sum = 0
        for i in range(n_halos_in_sky):
            _sum += mass[i]/f_distance(glx_pos, h_pos[i, :], fdist_constants[i])*\
                tangential_distance(glx_pos, h_pos[i, :])
        return _sum

    ellpty = mc.Normal("ellipcity", mean, 1./0.05, observed=True,
                       value=galaxy_data[:, 2:])
    model = mc.Model([ellpty, mean, halo_positions, mass_large])

    map_ = mc.MAP(model)
    map_.fit(method="fmin_powell")

    mcmc = mc.MCMC(model)
    mcmc.sample(samples, burn_in, thin)
    return mcmc.trace("halo_positions")[:]

n_sky = 215
data = np.genfromtxt("data/Train_Skies/Train_Skies/\
Training_Sky%d.csv" % (n_sky),
                     dtype=None,
                     skip_header=1,
                     delimiter=",",
                     usecols=[1, 2, 3, 4])

# there are 3 halos in this file.
samples = 10.5e5
traces = halo_posteriors(3, data, samples=samples,
                         burn_in=9.5e5,
                         thin=10)

fig = draw_sky(data)
plt.title("Galaxy positions and ellipticities of sky %d." % n_sky)
plt.xlabel("x-position")
plt.ylabel("y-position")

colors = ["#467821", "#A60628", "#7A68A6"]

for i in range(traces.shape[1]):
    plt.scatter(traces[:, i, 0], traces[:, i, 1],
                c=colors[i], alpha=0.02)

for i in range(traces.shape[1]):
    plt.scatter(halo_data[n_sky-1][3 + 2*i], halo_data[n_sky-1][4 + 2*i],
                label="True halo position", c="k", s=90)

# plt.legend(scatterpoints = 1)
plt.xlim(0, 4200)
plt.ylim(0, 4200);

_halo_data = halo_data[n_sky-1]
print traces.shape

mean_posterior = traces.mean(axis=0).reshape(1, -1)
print mean_posterior

nhalo_all = _halo_data[0].reshape(1, 1)
x_true_all = _halo_data[3].reshape(1, 1)
y_true_all = _halo_data[4].reshape(1, 1)
x_ref_all = _halo_data[1].reshape(1, 1)
y_ref_all = _halo_data[2].reshape(1, 1)
sky_prediction = mean_posterior

print "Using the mean:"
main_score([1], x_true_all, y_true_all,
           x_ref_all, y_ref_all, sky_prediction)

# what's a bad score?
print
random_guess = np.random.randint(0, 4200, size=(1, 2))
print "Using a random location:", random_guess
main_score([1], x_true_all, y_true_all,
           x_ref_all, y_ref_all, random_guess)
print

from IPython.core.display import HTML
def css_styling():
    styles = open("../styles/custom.css", "r").read()
    return HTML(styles)
css_styling()