import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Get the data: Algebra 2005-2006
train_filepath = 'data/algebra0506/algebra_2005_2006_train.txt'
test_filepath  = 'data/algebra0506/algebra_2005_2006_test.txt'

# pd.read_table parses tab-delimited text by default.
traindata = pd.read_table(train_filepath)

# Inspect some of the training data. A bare expression like this is only
# rendered when it is the last statement of an interactive (IPython/Jupyter)
# cell; run as a plain script it displays nothing.
traindata.head()

# Take the column of anonymized student IDs and count the number of unique
# entries. The single-argument print(...) form runs on both Python 2 and
# Python 3 and emits exactly the text the old `print a, b` statement produced.
print('Number of students:  %s' % len(np.unique(traindata['Anon Student Id'])))

# Time taken on correctly answered steps; summary statistics for a quick look
# (again only displayed in an interactive session).
csd = traindata['Correct Step Duration (sec)']
csd.describe()

%matplotlib inline
hist = plt.hist(np.array(csd.dropna()),bins=100,normed=True,log=False,range=(0,100))
plt.xlabel('Time to correct answer (sec)')
plt.ylabel('Fraction')
plt.show()

counts, bins = hist[0], hist[1]
cdf = np.cumsum(counts)
plt.plot(bins[1::], cdf)
plt.xlabel('Time to correct answer (sec)')
plt.ylabel('Cumulative fraction')
plt.axis((0,100,0,1.0))
plt.show()

# The unique identifier for each problem is the 'Problem Name'
problems = traindata['Problem Name']

# Get just the uniques
problems = np.unique(problems)
print('Number of unique problems:  %s' % len(problems))

# Median correct-step duration per problem, computed in a single grouped pass
# instead of re-filtering the whole frame once per problem.
# NOTE(review): groupby skips rows whose 'Problem Name' is missing; this
# assumes every row has a problem name -- confirm against the data.
pmedian_times = (
    traindata.groupby('Problem Name')['Correct Step Duration (sec)']
    .median()
    .to_dict()
)

import operator

# Problems ordered from slowest to fastest median. A problem with no recorded
# correct-step duration has a NaN median; NaN does not compare consistently,
# so those entries are kept out of the sort and appended at the end.
# `.items()` is used because `.iteritems()` does not exist on Python 3 dicts.
timed = [(name, t) for name, t in pmedian_times.items() if not np.isnan(t)]
untimed = [(name, t) for name, t in pmedian_times.items() if np.isnan(t)]
sorted_times = sorted(timed, key=operator.itemgetter(1), reverse=True) + untimed

# Interactive inspection left over from the notebook. Each bare expression
# below is only displayed when it is the last statement of an
# IPython/Jupyter cell; run as a plain script these lines print nothing.

# List the column labels of the training frame.
traindata.columns

# Look at the 'Step Name' column.
traindata['Step Name']

# Same column listing as above, repeated.
traindata.columns

# Check which scikit-learn version is installed.
import sklearn

sklearn.__version__