#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Add the facets overview python code to the python path
import sys
sys.path.append('./python')
# In[2]:
# Load UCI census train and test data into dataframes.
import pandas as pd
features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
"Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
"Hours per week", "Country", "Target"]
train_data = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
names=features,
sep=r'\s*,\s*',
engine='python',
na_values="?")
test_data = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
names=features,
sep=r'\s*,\s*',
skiprows=[0],
engine='python',
na_values="?")
# In[3]:
# Calculate the feature statistics proto from the datasets and stringify it for use in facets overview
from generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
import base64
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},
{'name': 'test', 'table': test_data}])
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")
# In[4]:
# Display the facets overview visualization for this data
from IPython.core.display import display, HTML
HTML_TEMPLATE = """
"""
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))