From Kaggle
In order to achieve this we have created a simulated data set with 200 variables and 20,000 cases. An ‘equation’ based on this data was created in order to generate a Target to be predicted. Given all 20,000 cases, the problem is very easy to solve – but you are only given the Target value for 250 cases – the task is to build a model that gives the best predictions on the remaining 19,750 cases.
import requests
import zipfile
import StringIO

# Download the compressed dataset and open the zip archive in memory.
url = "https://dl.dropbox.com/s/lnly9gw8pb1xhir/overfitting.zip"
results = requests.get(url)
z = zipfile.ZipFile(StringIO.StringIO(results.content))
z.extractall()
z.namelist()
['overfitting.csv']
d = z.open('overfitting.csv')
d.readline()
'case_id,train,Target_Practice,Target_Leaderboard,Target_Evaluate,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200\r\n'
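As a quick sanity check (added here, not in the original notebook), the header can be split to confirm the layout: 5 metadata fields followed by the 200 var_i columns, 205 columns in total.

# Re-open the file so the header line is available again.
cols = z.open('overfitting.csv').readline().strip().split(',')
print len(cols)    # expect 205
print cols[:5]     # the five metadata fields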
import numpy as np

# Load the full dataset into a (20000, 205) array, skipping the header row.
data = np.loadtxt("overfitting.csv", delimiter=",", skiprows=1)
print """
There are also 5 other fields,
case_id - 1 to 20,000, a unique identifier for each row
train - 1/0, this is a flag for the first 250 rows which are the training dataset
Target_Practice - we have provided all 20,000 Targets for this model, so you can develop your method completely off line.
Target_Leaderboard - only 250 Targets are provided. You submit your predictions for the remaining 19,750 to the Kaggle leaderboard.
Target_Evaluate - again only 250 Targets are provided. Those competitors who beat the 'benchmark' on the Leaderboard will be asked to make one further submission for the Evaluation model.
"""
data.shape
(20000L, 205L)
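A small added check that the metadata columns are laid out as described: column 0 is case_id running from 1 to 20,000, and column 1 is the train flag marking exactly 250 rows.

print data[:, 0].min(), data[:, 0].max()   # case_id: expect 1.0 20000.0
print int( data[:, 1].sum() )              # number of training rows: expect 250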
# Column 1 is the train flag: 1 for the 250 training rows, 0 for the rest.
ix_training = data[:,1] == 1
ix_testing = data[:,1] == 0

# Columns 5 onwards hold the 200 predictor variables; column 2 is Target_Practice.
training_data = data[ ix_training, 5: ]
testing_data = data[ ix_testing, 5: ]

training_labels = data[ ix_training, 2]
testing_labels = data[ ix_testing, 2]
print "training:", training_data.shape, training_labels.shape
print "testing: ", testing_data.shape, testing_labels.shape
training: (250L, 200L) (250L,)
testing:  (19750L, 200L) (19750L,)
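Before modelling, it is worth a quick look at the 250 training labels (an added check, not part of the original): they should be binary, and the mean gives the class balance.

print np.unique( training_labels )   # expect [ 0.  1.]
print training_labels.mean()         # fraction of positive cases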
He mentions that the X variables are drawn from a Uniform distribution. Let's investigate this:
figsize( 12, 4 )
# Pool all 250 x 200 training values into one histogram.
hist( training_data.flatten() )
print training_data.shape[0]*training_data.shape[1]
50000
Looks about right: the histogram is roughly flat, as we would expect from a Uniform distribution.
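For a more quantitative check than eyeballing the histogram, one could run a Kolmogorov-Smirnov test against a Uniform(0, 1) reference (an added aside; it assumes the variables are meant to be uniform on [0, 1]):

from scipy import stats

# Test the pooled training values against a Uniform(0, 1) distribution.
# A large p-value means we cannot reject uniformity.
D, p = stats.kstest( training_data.flatten(), "uniform" )
print D, p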
import pymc as mc

# A spike-and-slab-style prior: a Bernoulli switch for whether each of the
# 200 variables enters the model, and a Uniform(0, 1) prior on its coefficient.
to_include = mc.Bernoulli( "to_include", 0.5, size=200 )
coef = mc.Uniform( "coefs", 0, 1, size=200 )

@mc.deterministic
def Z( coef=coef, to_include=to_include, data=training_data ):
    # Linear score of the included variables, centred at zero.
    ym = np.dot( to_include*data, coef )
    return ym - ym.mean()

@mc.deterministic
def T( z=Z ):
    # Squash the score into a probability: 0.045 if z < 0, 0.945 if z > 0.
    return 0.45*( np.sign(z) + 1.1 )

obs = mc.Bernoulli( "obs", T, value=training_labels, observed=True )

model = mc.Model( [to_include, coef, Z, T, obs] )
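To make the role of T concrete: np.sign returns -1 or +1, so T collapses every score into one of just two probabilities, 0.045 or 0.945. In other words the likelihood is a noisy step function of the linear score (the line below is only an illustration, not part of the model):

print 0.45*( np.sign( np.array([-2.3, 0.8]) ) + 1.1 )   # -> [ 0.045  0.945]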
# Fit the MAP estimate first, to use as a starting point for the MCMC sampler.
map_ = mc.MAP( model )
map_.fit()
Warning: Stochastic to_include's value is neither numerical nor array with floating-point dtype. Recommend fitting method fmin (default).
mcmc = mc.MCMC( model )
# 100,000 iterations, discarding the first 90,000 as burn-in (no thinning).
mcmc.sample( 100000, 90000, 1 )
[****************100%******************] 100000 of 100000 complete
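Since 90,000 of the 100,000 samples are discarded as burn-in, the traces hold the last 10,000 samples. A quick shape check (added here; it assumes the 250 training cases above):

print mcmc.trace("T")[:].shape   # expect (10000, 250)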
# In-sample accuracy using the final sampled value of T.
(np.round(T.value) == training_labels ).mean()
0.73999999999999999
t_trace = mcmc.trace("T")[:]
(np.round( t_trace[-500:-400,:]).mean(axis=0) == training_labels ).mean()
0.72399999999999998
# Posterior-mean (rounded) prediction for each training case, averaged over all retained samples.
t_mean = np.round( t_trace ).mean(axis=0)
# Each row is one retained posterior sample of T; each column is a training case.
imshow( t_trace[-10000:,:], aspect="auto" )
colorbar()
<matplotlib.colorbar.Colorbar instance at 0x0000000013270208>
figsize( 23, 8)
coef_trace = mcmc.trace("coefs")[:]
# Trace of the 200 coefficients: each row is a posterior sample, each column a variable.
imshow( coef_trace[-10000:,:], aspect="auto", cmap=pyplot.cm.RdBu, interpolation="none" )
<matplotlib.image.AxesImage at 0x19ce2780>
include_trace = mcmc.trace("to_include")[:]
figsize( 23, 8 )
# Trace of the inclusion indicators: each column shows whether that variable was
# included (1) or excluded (0) at each retained MCMC step.
imshow( include_trace[-10000:,:], aspect="auto", interpolation="none" )
<matplotlib.image.AxesImage at 0x18d8ef60>
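A natural way to summarise the to_include trace (an added sketch, not part of the original notebook) is to average it over the retained samples: this gives a posterior inclusion probability for each of the 200 variables, which can then be thresholded to list the variables the model believes matter.

# Posterior probability that each variable is included in the model.
inclusion_prob = include_trace.mean(axis=0)

# Variables with high posterior inclusion probability (the 0.9 threshold is arbitrary).
print np.nonzero( inclusion_prob > 0.9 )[0] + 1   # +1 to match the var_i naming, which starts at 1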