We need an in-house testing system to validate our machine learning algorithm. We need this in order to iterate towards better solutions. I am basing this in-house testing system on the Yu et al. JMLR Workshop and Conference Proceedings paper that the winning team submitted. The leaderboard contains the full list of submissions and links to papers.
In the Yu et al. paper, the main reason why they built their own testing system instead of just submitting to their answers and having the KDD Cup server score it was to avoid overfitting the solution.
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.cross_validation import cross_val_score
# Get the data: Algebra 2005-2006 (A56) and/or Algebra 2008-2009 (A89)
a56_train_filepath = 'data/algebra0506/algebra_2005_2006_train.txt'
#a89_train_filepath = 'data/algebra0809/algebra_2008_2009_train.txt'
a56data = pd.read_table(a56_train_filepath)
#a89data = pd.read_table(a89_train_filepath)
hierarchy = a56data['Problem Hierarchy']
units, sections = [], []
for i in range(len(hierarchy)):
units.append(hierarchy[i].split(',')[0].strip())
sections.append(hierarchy[i].split(',')[1].strip())
# Now add 'Units' and 'Sections' as columns within the dataframe
a56data['Problem Unit'] = pd.Series(units, index=a56data.index)
a56data['Problem Section'] = pd.Series(sections, index=a56data.index)
# Rearrange order of columns
cols = a56data.columns.tolist()
cols = cols[0:3]+cols[-2::]+cols[3:-2]
a56data = a56data[cols]
df = a56data
cats = ['Anon Student Id', 'Problem Hierarchy', 'Problem Unit', 'Problem Section', 'Problem Name']
sids = list(set(df['Anon Student Id']))
sid_dict = {}
for idx,sid in enumerate(sids):
sid_dict[idx] = sid
df.loc[df['Anon Student Id'] == sid,'Anon Student Id'] = idx
cat = 'Problem Hierarchy'
prhs = list(set(df[cat]))
prh_dict = {}
for idx,prh in enumerate(prhs):
prh_dict[idx] = prh
df.loc[df[cat] == prh,cat] = idx
cat = 'Problem Unit'
prus = list(set(df[cat]))
pru_dict = {}
for idx,pru in enumerate(prus):
pru_dict[idx] = pru
df.loc[df[cat] == pru,cat] = idx
cat = 'Problem Section'
prss = list(set(df[cat]))
prs_dict = {}
for idx,prs in enumerate(prss):
prs_dict[idx] = prs
df.loc[df[cat] == prs,cat] = idx
cat = 'Problem Name'
prns = list(set(df[cat]))
prn_dict = {}
for idx,prn in enumerate(prns):
prn_dict[idx] = prn
df.loc[df[cat] == prn,cat] = idx
cat = 'Step Name'
stns = list(set(df[cat]))
stn_dict = {}
for idx,stn in enumerate(stns):
stn_dict[idx] = stn
df.loc[df[cat] == stn,cat] = idx
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-160-531a7478134b> in <module>() 4 for idx,stn in enumerate(stns): 5 stn_dict[idx] = stn ----> 6 df.loc[df[cat] == stn,cat] = idx /Users/mikhail/anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in __setitem__(self, key, value) 96 indexer = self._convert_to_indexer(key, is_setter=True) 97 ---> 98 self._setitem_with_indexer(indexer, value) 99 100 def _has_valid_type(self, k, axis): /Users/mikhail/anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _setitem_with_indexer(self, indexer, value) 404 # scalar 405 for item in labels: --> 406 setter(item, value) 407 408 else: KeyboardInterrupt:
df
Row | Anon Student Id | Problem Hierarchy | Problem Unit | Problem Section | Problem Name | Problem View | Step Name | Step Start Time | First Transaction Time | Correct Transaction Time | Step End Time | Step Duration (sec) | Correct Step Duration (sec) | Error Step Duration (sec) | Correct First Attempt | Incorrects | Hints | Corrects | KC(Default) | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 104 | 87 | 3 | 17 | 1046 | 1 | 3(x+2) = 15 | 2005-09-09 12:24:35.0 | 2005-09-09 12:24:49.0 | 2005-09-09 12:25:15.0 | 2005-09-09 12:25:15.0 | 40 | NaN | 40 | 0 | 2 | 3 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
1 | 2 | 104 | 87 | 3 | 17 | 1046 | 1 | x+2 = 5 | 2005-09-09 12:25:15.0 | 2005-09-09 12:25:31.0 | 2005-09-09 12:25:31.0 | 2005-09-09 12:25:31.0 | 16 | 16 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
2 | 3 | 104 | 87 | 3 | 17 | 346 | 1 | 2-8y = -4 | 2005-09-09 12:25:36.0 | 2005-09-09 12:25:43.0 | 2005-09-09 12:26:12.0 | 2005-09-09 12:26:12.0 | 36 | NaN | 36 | 0 | 2 | 3 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
3 | 4 | 104 | 87 | 3 | 17 | 346 | 1 | -8y = -6 | 2005-09-09 12:26:12.0 | 2005-09-09 12:26:34.0 | 2005-09-09 12:26:34.0 | 2005-09-09 12:26:34.0 | 22 | 22 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove coefficient; {ax+b=c, divid... | ... |
4 | 5 | 104 | 87 | 3 | 17 | 346 | 2 | -7y-5 = -4 | 2005-09-09 12:26:38.0 | 2005-09-09 12:28:36.0 | 2005-09-09 12:28:36.0 | 2005-09-09 12:28:36.0 | 118 | 118 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
5 | 6 | 104 | 87 | 3 | 17 | 346 | 2 | -7y = 1 | 2005-09-09 12:28:36.0 | 2005-09-09 12:28:43.0 | 2005-09-09 12:28:51.0 | 2005-09-09 12:28:51.0 | 15 | NaN | 15 | 0 | 1 | 0 | 1 | [SkillRule: Remove coefficient; {ax+b=c, divid... | ... |
6 | 7 | 104 | 87 | 3 | 17 | 346 | 3 | 7y+4 = 7 | 2005-09-09 12:28:57.0 | 2005-09-09 12:29:09.0 | 2005-09-09 12:29:09.0 | 2005-09-09 12:29:09.0 | 12 | 12 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
7 | 8 | 104 | 87 | 3 | 17 | 346 | 3 | 7y = 3 | 2005-09-09 12:29:09.0 | 2005-09-09 12:29:14.0 | 2005-09-09 12:29:14.0 | 2005-09-09 12:29:14.0 | 5 | 5 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove positive coefficient; {ax/b... | ... |
8 | 9 | 104 | 87 | 3 | 17 | 346 | 4 | -5+9y = -6 | 2005-09-09 12:29:19.0 | 2005-09-09 12:29:31.0 | 2005-09-09 12:29:31.0 | 2005-09-09 12:29:31.0 | 12 | 12 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
9 | 10 | 104 | 87 | 3 | 17 | 346 | 4 | 9y = -1 | 2005-09-09 12:29:31.0 | 2005-09-09 12:29:36.0 | 2005-09-09 12:29:36.0 | 2005-09-09 12:29:36.0 | 5 | 5 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove positive coefficient; {ax/b... | ... |
10 | 11 | 104 | 90 | 3 | 15 | 346 | 1 | -7-3x = -2 | 2005-09-09 12:29:41.0 | 2005-09-09 12:30:27.0 | 2005-09-09 12:30:27.0 | 2005-09-09 12:30:27.0 | 46 | 46 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
11 | 12 | 104 | 90 | 3 | 15 | 346 | 1 | -7-3x+7 = -2+7 | 2005-09-09 12:30:27.0 | 2005-09-09 12:30:34.0 | 2005-09-09 12:30:45.0 | 2005-09-09 12:30:49.0 | 22 | NaN | 22 | 0 | 1 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
12 | 13 | 104 | 90 | 3 | 15 | 346 | 1 | -3x = 5 | 2005-09-09 12:30:49.0 | 2005-09-09 12:31:04.0 | 2005-09-09 12:31:04.0 | 2005-09-09 12:31:04.0 | 15 | 15 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove coefficient; {ax+b=c, divid... | ... |
13 | 14 | 104 | 90 | 3 | 15 | 346 | 1 | -3x/-3 = 5/-3 | 2005-09-09 12:31:04.0 | 2005-09-09 12:31:07.0 | 2005-09-09 12:31:07.0 | 2005-09-09 12:31:12.0 | 8 | 8 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Multiply/Divide; [Typein Skill: {R... | ... |
14 | 15 | 104 | 90 | 3 | 15 | 346 | 2 | -9 = 8y+9 | 2005-09-09 12:31:16.0 | 2005-09-09 12:31:29.0 | 2005-09-09 12:31:29.0 | 2005-09-09 12:31:29.0 | 13 | 13 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
15 | 16 | 104 | 90 | 3 | 15 | 346 | 2 | -9-9 = 8y+9-9 | 2005-09-09 12:31:29.0 | 2005-09-09 12:31:32.0 | 2005-09-09 12:31:32.0 | 2005-09-09 12:31:39.0 | 10 | 10 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
16 | 17 | 104 | 90 | 3 | 15 | 346 | 2 | -18 = 8y | 2005-09-09 12:31:39.0 | 2005-09-09 12:31:44.0 | 2005-09-09 12:31:44.0 | 2005-09-09 12:31:44.0 | 5 | 5 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove positive coefficient; {ax/b... | ... |
17 | 18 | 104 | 90 | 3 | 15 | 346 | 2 | -18/8 = 8y/8 | 2005-09-09 12:31:44.0 | 2005-09-09 12:31:46.0 | 2005-09-09 12:31:46.0 | 2005-09-09 12:32:56.0 | 72 | 72 | NaN | 1 | 2 | 0 | 2 | [SkillRule: Multiply/Divide; [Typein Skill: {R... | ... |
18 | 19 | 104 | 90 | 3 | 15 | 346 | 3 | -2-2x = 9 | 2005-09-09 12:33:01.0 | 2005-09-09 12:33:22.0 | 2005-09-09 12:33:32.0 | 2005-09-09 12:33:32.0 | 31 | NaN | 31 | 0 | 1 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
19 | 20 | 104 | 90 | 3 | 15 | 346 | 3 | -2-2x+2 = 9+2 | 2005-09-09 12:33:32.0 | 2005-09-09 12:33:37.0 | 2005-09-09 12:33:37.0 | 2005-09-09 12:33:40.0 | 8 | 8 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
20 | 21 | 104 | 90 | 3 | 15 | 346 | 3 | -2x = 11 | 2005-09-09 12:33:40.0 | 2005-09-09 12:33:46.0 | 2005-09-09 12:33:46.0 | 2005-09-09 12:33:46.0 | 6 | 6 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove coefficient; {ax+b=c, divid... | ... |
21 | 22 | 104 | 90 | 3 | 15 | 346 | 3 | -2x/-2 = 11/-2 | 2005-09-09 12:33:46.0 | 2005-09-09 12:33:51.0 | 2005-09-09 12:33:51.0 | 2005-09-09 12:33:55.0 | 9 | 9 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Multiply/Divide; [Typein Skill: {R... | ... |
22 | 23 | 104 | 90 | 3 | 15 | 346 | 4 | 4+4y = -6 | 2005-09-09 12:33:59.0 | 2005-09-09 12:34:06.0 | 2005-09-09 12:34:06.0 | 2005-09-09 12:34:06.0 | 7 | 7 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
23 | 24 | 104 | 90 | 3 | 15 | 346 | 4 | 4+4y-4 = -6-4 | 2005-09-09 12:34:06.0 | 2005-09-09 12:34:09.0 | 2005-09-09 12:34:09.0 | 2005-09-09 12:34:17.0 | 11 | 11 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
24 | 25 | 104 | 90 | 3 | 15 | 346 | 4 | 4y = -6-4 | 2005-09-09 12:34:17.0 | 2005-09-09 12:34:25.0 | 2005-09-09 12:34:25.0 | 2005-09-09 12:34:25.0 | 8 | 8 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Consolidate vars with coeff; CLT] | ... |
25 | 26 | 104 | 90 | 3 | 15 | 346 | 4 | FinalAnswer | 2005-09-09 12:34:25.0 | 2005-09-09 12:34:29.0 | 2005-09-09 12:34:33.0 | 2005-09-09 12:34:33.0 | 8 | NaN | 8 | 0 | 1 | 0 | 1 | combine-like-terms-sp | ... |
26 | 27 | 104 | 90 | 3 | 15 | 346 | 4 | 4y = -10 | 2005-09-09 12:34:33.0 | 2005-09-09 12:34:42.0 | 2005-09-09 12:34:42.0 | 2005-09-09 12:34:42.0 | 9 | 9 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove positive coefficient; {ax/b... | ... |
27 | 28 | 104 | 90 | 3 | 15 | 346 | 4 | 4y/4 = -10/4 | 2005-09-09 12:34:42.0 | 2005-09-09 12:34:46.0 | 2005-09-09 12:34:46.0 | 2005-09-09 12:35:02.0 | 20 | 20 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Multiply/Divide; [Typein Skill: {R... | ... |
28 | 29 | 104 | 89 | 3 | 16 | 668 | 1 | -7 = -5(y+7) | 2005-09-09 12:35:08.0 | 2005-09-09 12:36:27.0 | 2005-09-09 12:37:01.0 | 2005-09-09 12:37:01.0 | 113 | NaN | 113 | 0 | 2 | 3 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
29 | 30 | 104 | 89 | 3 | 16 | 668 | 1 | 7/5 = y+7 | 2005-09-09 12:37:01.0 | 2005-09-09 12:37:09.0 | 2005-09-09 12:37:09.0 | 2005-09-09 12:37:09.0 | 8 | 8 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
30 | 31 | 104 | 89 | 3 | 16 | 668 | 2 | -7(x+9) = -5 | 2005-09-09 12:37:17.0 | 2005-09-09 12:38:15.0 | 2005-09-09 12:38:15.0 | 2005-09-09 12:38:15.0 | 58 | 58 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
31 | 32 | 104 | 89 | 3 | 16 | 668 | 2 | x+9 = 5/7 | 2005-09-09 12:38:15.0 | 2005-09-09 12:38:21.0 | 2005-09-09 12:38:21.0 | 2005-09-09 12:38:21.0 | 6 | 6 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
32 | 33 | 104 | 89 | 3 | 16 | 668 | 3 | 5 = 8(y+1) | 2005-09-09 12:38:25.0 | 2005-09-09 12:38:33.0 | 2005-09-09 12:38:33.0 | 2005-09-09 12:38:33.0 | 8 | 8 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
33 | 34 | 104 | 89 | 3 | 16 | 668 | 3 | 5/8 = y+1 | 2005-09-09 12:38:33.0 | 2005-09-09 12:38:40.0 | 2005-09-09 12:38:40.0 | 2005-09-09 12:38:40.0 | 7 | 7 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
34 | 35 | 104 | 89 | 3 | 16 | 668 | 4 | 0 = -3(x-5) | 2005-09-09 12:38:44.0 | 2005-09-09 12:38:49.0 | 2005-09-09 12:38:49.0 | 2005-09-09 12:38:49.0 | 5 | 5 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
35 | 36 | 104 | 89 | 3 | 16 | 668 | 4 | 0 = x-5 | 2005-09-09 12:38:49.0 | 2005-09-09 12:38:56.0 | 2005-09-09 12:38:56.0 | 2005-09-09 12:38:56.0 | 7 | 7 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
36 | 37 | 104 | 84 | 3 | 13 | 668 | 1 | -5(y-10) = 3 | 2005-09-09 12:39:01.0 | 2005-09-09 12:39:07.0 | 2005-09-09 12:39:07.0 | 2005-09-09 12:39:07.0 | 6 | 6 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
37 | 38 | 104 | 84 | 3 | 13 | 668 | 1 | -5(y-10)/-5 = 3/-5 | 2005-09-09 12:39:07.0 | 2005-09-09 12:39:13.0 | 2005-09-09 12:39:13.0 | 2005-09-09 12:39:18.0 | 11 | 11 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Calculate Eliminate Parens; [Typei... | ... |
38 | 39 | 104 | 84 | 3 | 13 | 668 | 1 | y-10 = 3/-5 | 2005-09-09 12:39:18.0 | 2005-09-09 12:39:26.0 | 2005-09-09 12:39:26.0 | 2005-09-09 12:39:26.0 | 8 | 8 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
39 | 40 | 104 | 84 | 3 | 13 | 668 | 1 | y-10+10 = 3/-5+10 | 2005-09-09 12:39:26.0 | 2005-09-09 12:39:29.0 | 2005-09-09 12:39:29.0 | 2005-09-09 12:39:36.0 | 10 | 10 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
40 | 41 | 104 | 84 | 3 | 13 | 668 | 1 | y = 3/-5+10 | 2005-09-09 12:39:36.0 | 2005-09-09 12:39:44.0 | 2005-09-09 12:40:14.0 | 2005-09-09 12:40:14.0 | 38 | NaN | 38 | 0 | 1 | 3 | 1 | NaN | ... |
41 | 42 | 104 | 84 | 3 | 13 | 668 | 1 | FinalAnswer | 2005-09-09 12:40:14.0 | 2005-09-09 12:40:36.0 | 2005-09-09 12:40:36.0 | 2005-09-09 12:41:14.0 | 39 | 39 | NaN | 1 | 0 | 0 | 2 | simplify-fractions-sp~~combine-like-terms-sp | ... |
42 | 43 | 104 | 84 | 3 | 13 | 668 | 1 | y = -3/5+10 | 2005-09-09 12:40:36.0 | 2005-09-09 12:40:57.0 | 2005-09-09 12:40:57.0 | 2005-09-09 12:40:57.0 | 21 | 21 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Consolidate vars, no coeff; CLT] | ... |
43 | 44 | 104 | 84 | 3 | 13 | 668 | 2 | 4(x-4) = -8 | 2005-09-09 12:41:48.0 | 2005-09-09 12:42:13.0 | 2005-09-09 12:42:13.0 | 2005-09-09 12:42:13.0 | 25 | 25 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
44 | 45 | 104 | 84 | 3 | 13 | 668 | 2 | 4(x-4)/4 = -8/4 | 2005-09-09 12:42:13.0 | 2005-09-09 12:42:18.0 | 2005-09-09 12:42:18.0 | 2005-09-09 12:42:23.0 | 10 | 10 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Calculate Eliminate Parens; [Typei... | ... |
45 | 46 | 104 | 84 | 3 | 13 | 668 | 2 | x-4 = -8/4 | 2005-09-09 12:42:23.0 | 2005-09-09 12:42:32.0 | 2005-09-09 12:42:32.0 | 2005-09-09 12:42:32.0 | 9 | 9 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
46 | 47 | 104 | 84 | 3 | 13 | 668 | 2 | x-4+4 = -8/4+4 | 2005-09-09 12:42:32.0 | 2005-09-09 12:42:35.0 | 2005-09-09 12:42:35.0 | 2005-09-09 12:42:55.0 | 23 | 23 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
47 | 48 | 104 | 84 | 3 | 13 | 668 | 2 | x = -8/4+4 | 2005-09-09 12:42:55.0 | 2005-09-09 12:43:02.0 | 2005-09-09 12:43:02.0 | 2005-09-09 12:43:02.0 | 7 | 7 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Consolidate vars, no coeff; CLT] | ... |
48 | 49 | 104 | 84 | 3 | 13 | 668 | 2 | FinalAnswer | 2005-09-09 12:43:02.0 | 2005-09-09 12:43:25.0 | 2005-09-09 12:43:25.0 | 2005-09-09 12:43:25.0 | 23 | 23 | NaN | 1 | 0 | 0 | 1 | simplify-fractions-sp~~combine-like-terms-sp | ... |
49 | 50 | 104 | 84 | 3 | 13 | 668 | 3 | 5 = 4(x-3) | 2005-09-09 12:43:34.0 | 2005-09-09 12:43:41.0 | 2005-09-09 12:43:41.0 | 2005-09-09 12:43:41.0 | 7 | 7 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
50 | 51 | 104 | 84 | 3 | 13 | 668 | 3 | 5/4 = 4(x-3)/4 | 2005-09-09 12:43:41.0 | 2005-09-09 12:43:45.0 | 2005-09-09 12:43:45.0 | 2005-09-09 12:43:49.0 | 8 | 8 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Calculate Eliminate Parens; [Typei... | ... |
51 | 52 | 104 | 84 | 3 | 13 | 668 | 3 | 5/4 = x-3 | 2005-09-09 12:43:49.0 | 2005-09-09 12:44:09.0 | 2005-09-09 12:44:09.0 | 2005-09-09 12:44:09.0 | 20 | 20 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
52 | 53 | 104 | 84 | 3 | 13 | 668 | 3 | 5/4+3 = x-3+3 | 2005-09-09 12:44:09.0 | 2005-09-09 12:44:12.0 | 2005-09-09 12:44:12.0 | 2005-09-09 12:44:19.0 | 10 | 10 | NaN | 1 | 0 | 0 | 2 | [SkillRule: Add/Subtract; [Typein Skill: {Isol... | ... |
53 | 54 | 104 | 84 | 3 | 13 | 668 | 3 | 5/4+3 = x | 2005-09-09 12:44:19.0 | 2005-09-09 12:44:35.0 | 2005-09-09 12:44:35.0 | 2005-09-09 12:44:35.0 | 16 | 16 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Consolidate vars, no coeff; CLT] | ... |
54 | 55 | 104 | 84 | 3 | 13 | 668 | 3 | FinalAnswer | 2005-09-09 12:44:35.0 | 2005-09-09 12:44:48.0 | 2005-09-09 12:44:48.0 | 2005-09-09 12:44:48.0 | 13 | 13 | NaN | 1 | 0 | 0 | 1 | simplify-fractions-sp~~combine-like-terms-sp | ... |
55 | 56 | 104 | 83 | 3 | 14 | 936 | 1 | 0.1 = -42.3(y-83.7) | 2005-09-09 12:44:58.0 | 2005-09-09 12:45:42.0 | 2005-09-09 12:45:42.0 | 2005-09-09 12:45:42.0 | 44 | 44 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
56 | 57 | 104 | 83 | 3 | 14 | 936 | 1 | -0.00236407 = y-83.7 | 2005-09-09 12:45:42.0 | 2005-09-09 12:45:56.0 | 2005-09-09 12:45:56.0 | 2005-09-09 12:45:56.0 | 14 | 14 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
57 | 58 | 104 | 83 | 3 | 14 | 667 | 1 | 1.8(y-9.8) = -2.4 | 2005-09-09 12:46:01.0 | 2005-09-09 12:46:48.0 | 2005-09-09 12:46:57.0 | 2005-09-09 12:46:57.0 | 56 | NaN | 56 | 0 | 1 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
58 | 59 | 104 | 83 | 3 | 14 | 667 | 1 | y-9.8 = -1.33333333 | 2005-09-09 12:46:57.0 | 2005-09-09 12:47:49.0 | 2005-09-09 12:48:21.0 | 2005-09-09 12:48:21.0 | 84 | NaN | 84 | 0 | 1 | 0 | 1 | [SkillRule: Remove constant; {ax+b=c, positive... | ... |
59 | 60 | 104 | 83 | 3 | 14 | 667 | 2 | -5.5(y-1.9) = -9.6 | 2005-09-09 12:48:26.0 | 2005-09-09 12:49:24.0 | 2005-09-09 12:49:24.0 | 2005-09-09 12:49:24.0 | 58 | 58 | NaN | 1 | 0 | 0 | 1 | [SkillRule: Eliminate Parens; {CLT nested; CLT... | ... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
809694 rows × 21 columns
# Create an empty testing dataframe
testdf = pd.DataFrame(columns=df.columns)
# Create the testing set
unique_units = list(set(df['Problem Unit']))
for i in range(len(unique_units)):
# Get the last problem of the current problem unit
lastProb = list(df[df['Problem Unit'] == unique_units[i]]['Problem Name'])[-1]
# Get all the rows corresponding to the last problem for the given problem unit
lastProbRows = a56data[(df['Problem Unit'] == unique_units[i]) & (df['Problem Name']==lastProb)]
# Concatenate test dataframe with the rows just found
testdf = pd.concat([testdf,lastProbRows])
# Create a training dataframe that is equal to original dataframe with all the test cases removed
trainIndex = df.index - testdf.index
traindf = df.loc[trainIndex]
# Get the target feature within the test set: the Correct First Attmpt
CFAs = np.array(testdf['Correct First Attempt'])
# Define a helper function for calculating the root-mean-square error
def RMSE(p,y):
''' The Root-Mean-Square Error takes the predicted values p for the target
variable y and takes the square root of the mean of the square of their
differences. '''
return np.sqrt(np.sum(np.square(p-y))/len(y))
# Test the RMSE for an array of all zeros
p = np.zeros(len(CFAs))
print 'An array of all zeros gives an RMSE of:',RMSE(p,CFAs)
# Test the RMSE for an array of all ones
p = np.ones(len(CFAs))
print 'An array of all ones gives an RMSE of:',RMSE(p,CFAs)
# Test the RMSE for an array of random 0s and 1s
p = np.random.randint(0,2,len(CFAs)).astype(float)
print 'An array of random ones and zeros gives an RMSE of:',RMSE(p,CFAs)
An array of all zeros gives an RMSE of: 0.863841709437 An array of all ones gives an RMSE of: 0.503763338322 An array of random ones and zeros gives an RMSE of: 0.70685723912
def error_metrics(p,yy):
'''Calculates the error metrics, i.e. the precision and recall.
Precision = True positives / Predicted positives
Recall = True positives / Actual positives'''
predicted_positives = len(p[p==1])
actual_positives = len(yy[yy==1])
# The predicted values for when actual values are 1
pp = p[yy==1]
# True positives are when these predicted values are also 1
true_positives = len(pp[pp==1])
false_positives = len(yy) - true_positives
precision = float(true_positives) / float(predicted_positives)
recall = float(true_positives) / float(actual_positives)
F_1score = 2.0 * precision * recall / (precision + recall)
print 'Root-mean-square error: ', RMSE(p,yy)
print '\nPrecision: Of all predicted CFAs, what fraction actually succeeded?'
print precision
print '\nRecall: Of all actual CFAs, what fraction did we predict correctly?'
print recall
print '\nF_1 Score: ', F_1score
traindf.columns
Index([u'Row', u'Anon Student Id', u'Problem Hierarchy', u'Problem Unit', u'Problem Section', u'Problem Name', u'Problem View', u'Step Name', u'Step Start Time', u'First Transaction Time', u'Correct Transaction Time', u'Step End Time', u'Step Duration (sec)', u'Correct Step Duration (sec)', u'Error Step Duration (sec)', u'Correct First Attempt', u'Incorrects', u'Hints', u'Corrects', u'KC(Default)', u'Opportunity(Default)'], dtype='object')
# Define a helper function to normalize the feature matrix X
import numba
def autonorm(X):
''' Calculates the mean and range of values of each column
in the matrix (features) subtracts the mean from each value
and divides by the range, thereby normalizing all values to
fall between -1 and 1.'''
x_means = np.mean(X,axis=0)
x_means = np.ones(np.shape(X))*x_means
x_maxs = np.max(X,axis=0)
x_mins = np.min(X,axis=0)
x_range = x_maxs - x_mins
X_normd = (X - x_means) / x_range
return X_normd
autonorm_jit = numba.jit(autonorm)
features_to_norm = ['Step Duration (sec)','Hints','Problem View']
category_features = ['Anon Student Id', 'Problem Hierarchy', 'Problem Unit', 'Problem Section', 'Problem Name']
target_feature = ['Correct First Attempt']
features = features_to_norm + category_features
all_features = features_to_norm + category_features + target_feature
X = traindf[all_features].dropna()
y = np.array(X[target_feature]).astype(int).ravel()
X_to_norm = np.array(X[features_to_norm])
X_nonnorm = np.array(X[category_features])
X_to_norm = autonorm(X_to_norm)
X = np.concatenate((X_to_norm,X_nonnorm), axis=1)
XX = testdf[all_features].dropna()
yy = np.array(XX[target_feature]).astype(int).ravel()
XX_to_norm = np.array(XX[features_to_norm])
XX_nonnorm = np.array(XX[category_features])
XX_to_norm = autonorm(XX_to_norm)
XX = np.concatenate((XX_to_norm,XX_nonnorm), axis=1)
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(X,y)
p = model.predict(XX).astype(float)
error_metrics(p,yy)
Root-mean-square error: 0.59582966275 Precision: Of all predicted CFAs, what fraction actually succeeded? 0.823697236354 Recall: Of all actual CFAs, what fraction did we predict correctly? 0.667139953785 F_1 Score: 0.737198320284
scores = cross_val_score(model, X, y)
print 'Accuracy: {0:5.2f} (+/-{1:5.2f})'.format(scores.mean(), scores.std()*2)
Accuracy: 0.78 (+/- 0.01)
from sklearn.ensemble import RandomForestClassifier
n_ests = 70
model = RandomForestClassifier(n_estimators=n_ests, criterion="entropy", max_features=None)
model = model.fit(X,y)
p = model.predict(XX).astype(float)
error_metrics(p,yy)
importances = model.feature_importances_
n_feats = len(features)
feat_std = np.std([tree.feature_importances_ for tree in model.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("\nFeature ranking:")
for f in range(n_feats):
print '{0:2} - {1:20}: {2:5.4f} (std: {3:5.4f})'.format(f+1,features[indices[f]],importances[indices[f]],feat_std[indices[f]])
Root-mean-square error: 0.500098691559 Precision: Of all predicted CFAs, what fraction actually succeeded? 0.834598896018 Recall: Of all actual CFAs, what fraction did we predict correctly? 0.829254711991 F_1 Score: 0.831918221428 Feature ranking: 1 - Anon Student Id : 0.2705 (std: 0.0026) 2 - Hints : 0.2593 (std: 0.0011) 3 - Step Duration (sec) : 0.2261 (std: 0.0032) 4 - Problem Name : 0.1125 (std: 0.0022) 5 - Problem View : 0.0444 (std: 0.0010) 6 - Problem Section : 0.0358 (std: 0.0014) 7 - Problem Hierarchy : 0.0306 (std: 0.0011) 8 - Problem Unit : 0.0208 (std: 0.0011)
scores = cross_val_score(model, X, y)
print 'Accuracy: {0:5.2f} (+/-{1:5.2f})'.format(scores.mean(), scores.std()*2)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-162-8de7c3d2ecea> in <module>() ----> 1 scores = cross_val_score(model, X, y) 2 print 'Accuracy: {0:5.2f} (+/-{1:5.2f})'.format(scores.mean(), scores.std()*2) /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, score_func, pre_dispatch) 1149 train, test, verbose, None, 1150 fit_params) -> 1151 for train, test in cv) 1152 return np.array(scores)[:, 0] 1153 /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable) 651 self._iterating = True 652 for function, args, kwargs in iterable: --> 653 self.dispatch(function, args, kwargs) 654 655 if pre_dispatch == "all" or n_jobs == 1: /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch(self, func, args, kwargs) 398 """ 399 if self._pool is None: --> 400 job = ImmediateApply(func, args, kwargs) 401 index = len(self._jobs) 402 if not _verbosity_filter(index, self.verbose): /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, func, args, kwargs) 136 # Don't delay the application, to avoid keeping the input 137 # arguments in memory --> 138 self.results = func(*args, **kwargs) 139 140 def get(self): /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters) 1237 estimator.fit(X_train, **fit_params) 1238 else: -> 1239 estimator.fit(X_train, y_train, **fit_params) 1240 test_score = _score(estimator, X_test, y_test, scorer) 1241 if return_train_score: /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight) 277 sample_weight, 278 verbose=self.verbose) --> 279 for i in range(n_jobs)) 280 281 # Reduce /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable) 651 self._iterating = True 652 for function, args, kwargs in iterable: --> 653 self.dispatch(function, args, kwargs) 654 655 if pre_dispatch == "all" or n_jobs == 1: /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch(self, func, args, kwargs) 398 """ 399 if self._pool is None: --> 400 job = ImmediateApply(func, args, kwargs) 401 index = len(self._jobs) 402 if not _verbosity_filter(index, self.verbose): /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, func, args, kwargs) 136 # Don't delay the application, to avoid keeping the input 137 # arguments in memory --> 138 self.results = func(*args, **kwargs) 139 140 def get(self): /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _parallel_build_trees(trees, forest, X, y, sample_weight, verbose) 87 tree.fit(X, y, 88 sample_weight=curr_sample_weight, ---> 89 check_input=False) 90 91 tree.indices_ = sample_counts > 0. /Users/mikhail/anaconda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self, X, y, sample_mask, X_argsorted, check_input, sample_weight) 265 max_leaf_nodes) 266 --> 267 builder.build(self.tree_, X, y, sample_weight) 268 269 if self.n_outputs_ == 1: KeyboardInterrupt: