import pandas as pd import numpy as np import matplotlib.pyplot as plt # Get the data: Algebra 2005-2006 train_filepath = 'data/algebra0506/algebra_2005_2006_train.txt' test_filepath = 'data/algebra0506/algebra_2005_2006_test.txt' traindata = pd.read_table(train_filepath) # What does the training data look like? traindata.head() # Let's look at the columns traindata.columns # Create empty list KCs = [] # Grab the column of Knowledge Components, dropping all NaNs KCcol = traindata['KC(Default)'] KCcol = list(KCcol.dropna()) # Loop over every database entry, read the skills, split on '~~' separator, and append to list for i in range(len(KCcol)): skills = KCcol[i].split('~~') for skill in skills: KCs.append(skill) # Convert to set, which keeps only unique entries, then convert back to list KCs = list(set(KCs)) # Print length print 'The total number of unique skills is: ',len(KCs)