ML Pipeline with Sklearn¶

In [1]:

# load sample dataset
import pandas as pd
import seaborn as sns

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from yellowbrick.regressor import PredictionError

df = pd.read_csv('./diabetes.csv')
print(df.shape)
df

(768, 9)

Out[1]:

	Number of times pregnant	Plasma glucose concentration a 2 hours in an oral glucose tolerance test	Diastolic blood pressure (mm Hg)	Triceps skin fold thickness (mm)	2-Hour serum insulin (mu U/ml)	Body mass index (weight in kg/(height in m)^2)	Diabetes pedigree function	Age (years)	Class variable
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1
...	...	...	...	...	...	...	...	...	...
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

768 rows × 9 columns

In [2]:

# Renaming the column Class variable

df = df.rename(columns={'Class variable': 'Class_variable'})
df

Out[2]:

	Number of times pregnant	Plasma glucose concentration a 2 hours in an oral glucose tolerance test	Diastolic blood pressure (mm Hg)	Triceps skin fold thickness (mm)	2-Hour serum insulin (mu U/ml)	Body mass index (weight in kg/(height in m)^2)	Diabetes pedigree function	Age (years)	Class_variable
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1
...	...	...	...	...	...	...	...	...	...
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

768 rows × 9 columns

In [3]:

# simple check for nulls
df.isna().sum()[df.isna().sum() > 0]

Out[3]:

Series([], dtype: int64)

In [9]:

# eda (automated)
# profile.to_widgets() -- research to fix...

profile = ProfileReport(df)
profile.to_notebook_iframe()
profile.to_file('./reg_diabetes.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [ ]:

# take care of any missing values 
# n/a in this case

In [10]:

# set aside and save unseen data set
data_unseen = df.sample(n=100, random_state=42)
data        = df.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('./diabetes_unseen.csv', index=False)

Data for model: (668, 9),
Data for unseen predictions: (100, 9)

In [11]:

# data.columns!='Class_variable'
X = data.loc[: , data.columns!='Class_variable']
y = data.loc[: , data.columns=='Class_variable']

In [12]:

Out[12]:

	Number of times pregnant	Plasma glucose concentration a 2 hours in an oral glucose tolerance test	Diastolic blood pressure (mm Hg)	Triceps skin fold thickness (mm)	2-Hour serum insulin (mu U/ml)	Body mass index (weight in kg/(height in m)^2)	Diabetes pedigree function	Age (years)
0	6	148	72	35	0	33.6	0.627	50
1	1	85	66	29	0	26.6	0.351	31
2	8	183	64	0	0	23.3	0.672	32
3	1	89	66	23	94	28.1	0.167	21
4	0	137	40	35	168	43.1	2.288	33
...	...	...	...	...	...	...	...	...
762	9	89	62	0	0	22.5	0.142	33
763	10	101	76	48	180	32.9	0.171	63
765	5	121	72	23	112	26.2	0.245	30
766	1	126	60	0	0	30.1	0.349	47
767	1	93	70	31	0	30.4	0.315	23

668 rows × 8 columns

In [15]:

# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:

# encoding 
# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(num_cols, '\n', cat_cols)

['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)'] 
 []

In [17]:

# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
num_pipe

Out[17]:

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler())])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [18]:

# pipeline for categorical columns
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)
cat_pipe

Out[18]:

Pipeline(steps=[('simpleimputer',
                 SimpleImputer(fill_value='N/A', strategy='constant')),
                ('onehotencoder',
                 OneHotEncoder(handle_unknown='ignore', sparse=False))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [19]:

# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

Out[19]:

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Number of times pregnant',
                                  'Plasma glucose concentration a 2 hours in '
                                  'an oral glucose tolerance test',
                                  'Diastolic blood pressure (mm Hg)',
                                  'Triceps skin fold thickness (mm)',
                                  '2-Hour serum insulin (mu U/ml)',
                                  'Body mass index (weight in kg/(height in '
                                  'm)^2)',
                                  'Diabetes pedigree function',
                                  'Age (years)']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='N/A',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 [])])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

ColumnTransformer

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Number of times pregnant',
                                  'Plasma glucose concentration a 2 hours in '
                                  'an oral glucose tolerance test',
                                  'Diastolic blood pressure (mm Hg)',
                                  'Triceps skin fold thickness (mm)',
                                  '2-Hour serum insulin (mu U/ml)',
                                  'Body mass index (weight in kg/(height in '
                                  'm)^2)',
                                  'Diabetes pedigree function',
                                  'Age (years)']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='N/A',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 [])])

num

['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']

SimpleImputer

SimpleImputer(strategy='median')

StandardScaler

StandardScaler()

cat

[]

SimpleImputer

SimpleImputer(fill_value='N/A', strategy='constant')

OneHotEncoder

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [20]:

# build the model
gbr_diabetes = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbr_diabetes

Out[20]:

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Number of times pregnant',
                                                   'Plasma glucose '
                                                   'concentration a 2 hours in '
                                                   'an oral glucose tolerance '
                                                   'test',
                                                   'Diastolic blood pressure '
                                                   '(mm Hg)',
                                                   'Triceps skin fold '
                                                   'thickness (mm)',
                                                   '2-Hour s...nsulin (mu '
                                                   'U/ml)',
                                                   'Body mass index (weight in '
                                                   'kg/(height in m)^2)',
                                                   'Diabetes pedigree function',
                                                   'Age (years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='N/A',
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  [])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(random_state=42))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Number of times pregnant',
                                                   'Plasma glucose '
                                                   'concentration a 2 hours in '
                                                   'an oral glucose tolerance '
                                                   'test',
                                                   'Diastolic blood pressure '
                                                   '(mm Hg)',
                                                   'Triceps skin fold '
                                                   'thickness (mm)',
                                                   '2-Hour s...nsulin (mu '
                                                   'U/ml)',
                                                   'Body mass index (weight in '
                                                   'kg/(height in m)^2)',
                                                   'Diabetes pedigree function',
                                                   'Age (years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='N/A',
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  [])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(random_state=42))])

columntransformer: ColumnTransformer

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Number of times pregnant',
                                  'Plasma glucose concentration a 2 hours in '
                                  'an oral glucose tolerance test',
                                  'Diastolic blood pressure (mm Hg)',
                                  'Triceps skin fold thickness (mm)',
                                  '2-Hour serum insulin (mu U/ml)',
                                  'Body mass index (weight in kg/(height in '
                                  'm)^2)',
                                  'Diabetes pedigree function',
                                  'Age (years)']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='N/A',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 [])])

num

['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']

SimpleImputer

SimpleImputer(strategy='median')

StandardScaler

StandardScaler()

cat

[]

SimpleImputer

SimpleImputer(fill_value='N/A', strategy='constant')

OneHotEncoder

OneHotEncoder(handle_unknown='ignore', sparse=False)

GradientBoostingRegressor

GradientBoostingRegressor(random_state=42)

In [22]:

# train the model
gbr_diabetes.fit(X_train, y_train)

C:\Users\owner\miniconda3\envs\pc3\lib\site-packages\sklearn\ensemble\_gb.py:437: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

Out[22]:

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Number of times pregnant',
                                                   'Plasma glucose '
                                                   'concentration a 2 hours in '
                                                   'an oral glucose tolerance '
                                                   'test',
                                                   'Diastolic blood pressure '
                                                   '(mm Hg)',
                                                   'Triceps skin fold '
                                                   'thickness (mm)',
                                                   '2-Hour s...nsulin (mu '
                                                   'U/ml)',
                                                   'Body mass index (weight in '
                                                   'kg/(height in m)^2)',
                                                   'Diabetes pedigree function',
                                                   'Age (years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='N/A',
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  [])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(random_state=42))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Number of times pregnant',
                                                   'Plasma glucose '
                                                   'concentration a 2 hours in '
                                                   'an oral glucose tolerance '
                                                   'test',
                                                   'Diastolic blood pressure '
                                                   '(mm Hg)',
                                                   'Triceps skin fold '
                                                   'thickness (mm)',
                                                   '2-Hour s...nsulin (mu '
                                                   'U/ml)',
                                                   'Body mass index (weight in '
                                                   'kg/(height in m)^2)',
                                                   'Diabetes pedigree function',
                                                   'Age (years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='N/A',
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  [])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(random_state=42))])

columntransformer: ColumnTransformer

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Number of times pregnant',
                                  'Plasma glucose concentration a 2 hours in '
                                  'an oral glucose tolerance test',
                                  'Diastolic blood pressure (mm Hg)',
                                  'Triceps skin fold thickness (mm)',
                                  '2-Hour serum insulin (mu U/ml)',
                                  'Body mass index (weight in kg/(height in '
                                  'm)^2)',
                                  'Diabetes pedigree function',
                                  'Age (years)']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='N/A',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 [])])

num

['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']

SimpleImputer

SimpleImputer(strategy='median')

StandardScaler

StandardScaler()

cat

[]

SimpleImputer

SimpleImputer(fill_value='N/A', strategy='constant')

OneHotEncoder

OneHotEncoder(handle_unknown='ignore', sparse=False)

GradientBoostingRegressor

GradientBoostingRegressor(random_state=42)

In [24]:

# make predictions on the test set
y_pred = gbr_diabetes.predict(X_test)

In [25]:

# measure accuracy
print('R2:', r2_score(y_test, y_pred))

R2: 0.22415907844020777

In [26]:

# done manually to break out the example above
y_test['y_pred'] = y_pred
test_scores = y_test.copy()
test_scores

Out[26]:

	Class_variable	y_pred
418	0	0.068579
180	0	-0.010718
556	0	0.107478
601	0	-0.013441
317	1	0.827088
...	...	...
622	0	0.469301
608	0	0.374756
638	1	0.308118
247	0	0.356902
19	1	0.490692

134 rows × 2 columns

In [27]:

r2 = r2_score(test_scores['Class_variable'], test_scores['y_pred'])
mae = mean_absolute_error(test_scores['Class_variable'], test_scores['y_pred'])
mean_act = test_scores['Class_variable'].mean()
mean_pred = test_scores['y_pred'].mean()
mape = mean_absolute_percentage_error(test_scores['Class_variable'], test_scores['y_pred'])
print(f'R2: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

R2: 0.22415907844020777
mae: 0.33678577565359047
act_mean: 0.39552238805970147
pred_mean: 0.3679732831419842
mape: 691517409333630.6

In [28]:

import joblib
joblib.dump(gbr_diabetes, './diabetes.pkl')
print(gbr_diabetes)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Number of times pregnant',
                                                   'Plasma glucose '
                                                   'concentration a 2 hours in '
                                                   'an oral glucose tolerance '
                                                   'test',
                                                   'Diastolic blood pressure '
                                                   '(mm Hg)',
                                                   'Triceps skin fold '
                                                   'thickness (mm)',
                                                   '2-Hour s...nsulin (mu '
                                                   'U/ml)',
                                                   'Body mass index (weight in '
                                                   'kg/(height in m)^2)',
                                                   'Diabetes pedigree function',
                                                   'Age (years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='N/A',
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  [])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(random_state=42))])

In [ ]: