"""Exploratory look at the Algebra 2005-2006 training data.

Loads the training table, reports the number of distinct students, plots the
distribution (histogram and cumulative curve) of the time taken to reach a
correct step, and ranks problems by their median correct-step duration.

Originally written as a notebook: the bare expressions (``traindata.head()``,
``csd.describe()``, ...) only display something when run in an interactive
cell; they are harmless no-ops when executed as a plain script.
"""
import operator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Get the data: Algebra 2005-2006
train_filepath = 'data/algebra0506/algebra_2005_2006_train.txt'
test_filepath = 'data/algebra0506/algebra_2005_2006_test.txt'
traindata = pd.read_table(train_filepath)

# Inspect some of the training data
traindata.head()

# Take the column of anonymized student IDs and count the number of unique
# entries
print('Number of students: ', len(np.unique(traindata['Anon Student Id'])))

csd = traindata['Correct Step Duration (sec)']
csd.describe()

# Equivalent of the ``%matplotlib inline`` notebook magic, written as a normal
# call so the file remains valid Python. Outside IPython ``get_ipython`` is not
# defined, and there is no inline backend to select, so it is skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Histogram of the time to a correct step, restricted to 0-100 s and
# normalized to a density. ``density=True`` replaces the ``normed=True``
# keyword, which newer Matplotlib releases no longer accept.
hist = plt.hist(np.array(csd.dropna()), bins=100, density=True, log=False,
                range=(0, 100))
plt.xlabel('Time to correct answer (sec)')
plt.ylabel('Fraction')
plt.show()

counts, bins = hist[0], hist[1]
# Weight each density by its bin width before accumulating so the curve is a
# cumulative fraction for any bin width (the current bins are 1 s wide, so the
# values are unchanged). The fraction is relative to the samples that fall
# inside the plotted 0-100 s range.
cdf = np.cumsum(counts * np.diff(bins))
plt.plot(bins[1:], cdf)
plt.xlabel('Time to correct answer (sec)')
plt.ylabel('Cumulative fraction')
plt.axis((0, 100, 0, 1.0))
plt.show()

# The unique identifier for each problem is the 'Problem Name'
problems = traindata['Problem Name']
# Get just the uniques
problems = np.unique(problems)
print('Number of unique problems: ', len(problems))

# Median correct-step duration per problem, computed in a single grouped pass
# rather than re-filtering the whole table once per problem.
pmedian_times = (
    traindata.groupby('Problem Name')['Correct Step Duration (sec)']
    .median()
    .to_dict()
)

# (problem, median) pairs ordered from slowest to fastest median.
sorted_times = sorted(pmedian_times.items(), key=operator.itemgetter(1),
                      reverse=True)

traindata.columns
traindata['Step Name']
traindata.columns

# Environment check, kept last so that a missing scikit-learn installation
# does not prevent the analysis above from running.
import sklearn
sklearn.__version__