import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Get the data: Algebra 2005-2006
train_filepath = 'data/algebra0506/algebra_2005_2006_train.txt'
# The test path is defined here but is not read in this section.
test_filepath  = 'data/algebra0506/algebra_2005_2006_test.txt'
traindata = pd.read_table(train_filepath)

# What does the training data look like?
# NOTE: a bare expression only displays its value in an interactive
# session/notebook; when run as a plain script the result is discarded.
traindata.head()

# Let's look at the columns
traindata.columns

# Grab the column of Knowledge Components, dropping all NaNs
KCcol = traindata['KC(Default)']
KCcol = list(KCcol.dropna())

# Each remaining entry is split on the '~~' separator and its skills are
# added to a set, so duplicates are discarded as they are read rather than
# being accumulated in a list first.
unique_skills = set()
for entry in KCcol:
    unique_skills.update(entry.split('~~'))

# Convert back to a list; the element order of a set is arbitrary.
KCs = list(unique_skills)

# Print length. A single parenthesised argument works under both Python 2
# and Python 3; the two spaces after the colon reproduce the output of the
# original Python 2 print statement.
print('The total number of unique skills is:  %d' % len(KCs))