# imports, then load the sample dataset
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from yellowbrick.regressor import PredictionError
# Read the diabetes dataset from the working directory into a DataFrame,
# print its (rows, columns) shape, and display it.
df = pd.read_csv('./diabetes.csv')
print(df.shape)
df
(768, 9)
Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
# Rename the 'Class variable' column to 'Class_variable' (no space in the
# name); every later cell refers to the target by the new name.
df = df.rename(columns={'Class variable': 'Class_variable'})
df
Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class_variable | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
# Count missing values per column once, then show only the columns that
# actually contain any (an empty Series means there are none).
null_counts = df.isna().sum()
null_counts[null_counts > 0]
Series([], dtype: int64)
# Automated EDA: build a profiling report for the whole DataFrame, render it
# inline in the notebook, and also write it to a standalone HTML file.
# TODO: profile.to_widgets() is disabled here -- needs research to fix.
profile = ProfileReport(df)
profile.to_notebook_iframe()
profile.to_file('./reg_diabetes.html')
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Missing-value handling: nothing to do, the null check above came back empty.
# Set aside 100 randomly sampled rows (fixed seed) as an "unseen" set, save
# them to disk, and keep the remaining rows for modelling.
holdout_rows = df.sample(n=100, random_state=42).index
data_unseen = df.loc[holdout_rows]
data = df.drop(index=holdout_rows)
data_unseen.to_csv('./diabetes_unseen.csv', index=False)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
Data for model: (668, 9), Data for unseen predictions: (100, 9)
# Separate the predictors from the target column.
# y is deliberately kept as a one-column DataFrame (not a Series).
target_col = 'Class_variable'
X = data.drop(columns=target_col)
y = data[[target_col]]
X
Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | |
---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
762 | 9 | 89 | 62 | 0 | 0 | 22.5 | 0.142 | 33 |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 |
668 rows × 8 columns
# Hold out 20% of the modelling data as a test set (fixed seed so the split
# is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Group the feature names by dtype for the encoding step: object columns are
# treated as categorical, every other dtype as numeric.
num_cols = list(X_train.select_dtypes(exclude=['object']).columns)
cat_cols = list(X_train.select_dtypes(include=['object']).columns)
print(num_cols, '\n', cat_cols)
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)'] []
# Numeric preprocessing: fill gaps with the column median, then standardise.
# The step names are spelled out explicitly and are the same lowercase
# class names that make_pipeline generates.
num_pipe = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='median')),
    ('standardscaler', StandardScaler()),
])
num_pipe
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())])
SimpleImputer(strategy='median')
StandardScaler()
# Categorical preprocessing: replace missing values with the literal 'N/A',
# then one-hot encode into a dense array, ignoring categories unseen at fit.
# The step names are the same lowercase class names make_pipeline generates.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- revisit this when upgrading the environment.
cat_pipe = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])
cat_pipe
Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))])
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Route each column group through its own preprocessing pipeline:
# numeric columns -> num_pipe, categorical columns -> cat_pipe.
full_pipe = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ]
)
full_pipe
ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose concentration a 2 hours in ' 'an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in ' 'm)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose concentration a 2 hours in ' 'an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in ' 'm)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Assemble the end-to-end model: column preprocessing followed by a
# gradient-boosting regressor with a fixed seed.
# NOTE(review): the displayed Class_variable values are all 0/1, so this
# regressor emits continuous scores rather than class labels -- confirm
# that a regressor (not a classifier) is what is intended here.
regressor = GradientBoostingRegressor(random_state=42)
gbr_diabetes = make_pipeline(full_pipe, regressor)
gbr_diabetes
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose ' 'concentration a 2 hours in ' 'an oral glucose tolerance ' 'test', 'Diastolic blood pressure ' '(mm Hg)', 'Triceps skin fold ' 'thickness (mm)', '2-Hour s...nsulin (mu ' 'U/ml)', 'Body mass index (weight in ' 'kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])), ('gradientboostingregressor', GradientBoostingRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose ' 'concentration a 2 hours in ' 'an oral glucose tolerance ' 'test', 'Diastolic blood pressure ' '(mm Hg)', 'Triceps skin fold ' 'thickness (mm)', '2-Hour s...nsulin (mu ' 'U/ml)', 'Body mass index (weight in ' 'kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])), ('gradientboostingregressor', GradientBoostingRegressor(random_state=42))])
ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose concentration a 2 hours in ' 'an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in ' 'm)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
GradientBoostingRegressor(random_state=42)
# Train the full pipeline on the training split.
# y_train is a one-column DataFrame; the regressor expects a 1-D target of
# shape (n_samples,), so flatten it here to avoid the DataConversionWarning
# that passing a column vector triggers.
gbr_diabetes.fit(X_train, y_train.values.ravel())
C:\Users\owner\miniconda3\envs\pc3\lib\site-packages\sklearn\ensemble\_gb.py:437: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose ' 'concentration a 2 hours in ' 'an oral glucose tolerance ' 'test', 'Diastolic blood pressure ' '(mm Hg)', 'Triceps skin fold ' 'thickness (mm)', '2-Hour s...nsulin (mu ' 'U/ml)', 'Body mass index (weight in ' 'kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])), ('gradientboostingregressor', GradientBoostingRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose ' 'concentration a 2 hours in ' 'an oral glucose tolerance ' 'test', 'Diastolic blood pressure ' '(mm Hg)', 'Triceps skin fold ' 'thickness (mm)', '2-Hour s...nsulin (mu ' 'U/ml)', 'Body mass index (weight in ' 'kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])), ('gradientboostingregressor', GradientBoostingRegressor(random_state=42))])
ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose concentration a 2 hours in ' 'an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in ' 'm)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])
['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
SimpleImputer(strategy='median')
StandardScaler()
[]
SimpleImputer(fill_value='N/A', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
GradientBoostingRegressor(random_state=42)
# Predict on the held-out test split.
y_pred = gbr_diabetes.predict(X_test)

# Report the coefficient of determination (R^2) of the predictions.
test_r2 = r2_score(y_test, y_pred)
print('R2:', test_r2)
R2: 0.22415907844020777
# Build a side-by-side table of actual vs. predicted values so the metrics
# can be broken out manually.
# Copy first and add the prediction column to the copy: assigning into
# y_test directly would leave it with two columns and break any later
# re-run of r2_score(y_test, y_pred).
test_scores = y_test.copy()
test_scores['y_pred'] = y_pred
test_scores
Class_variable | y_pred | |
---|---|---|
418 | 0 | 0.068579 |
180 | 0 | -0.010718 |
556 | 0 | 0.107478 |
601 | 0 | -0.013441 |
317 | 1 | 0.827088 |
... | ... | ... |
622 | 0 | 0.469301 |
608 | 0 | 0.374756 |
638 | 1 | 0.308118 |
247 | 0 | 0.356902 |
19 | 1 | 0.490692 |
134 rows × 2 columns
# Recompute the evaluation metrics from the actual/predicted table.
actual = test_scores['Class_variable']
predicted = test_scores['y_pred']

r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mean_act = actual.mean()
mean_pred = predicted.mean()

# MAPE divides each error by |actual|. When any actual value is zero the
# result blows up to an astronomically large, meaningless number, so report
# NaN in that case instead of a misleading figure.
if (actual != 0).all():
    mape = mean_absolute_percentage_error(actual, predicted)
else:
    mape = float('nan')

print(f'R2: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')
R2: 0.22415907844020777 mae: 0.33678577565359047 act_mean: 0.39552238805970147 pred_mean: 0.3679732831419842 mape: 691517409333630.6
import joblib

# Serialise the fitted pipeline (preprocessing + regressor) to disk so it can
# be reloaded later, then print its structure as a record of what was saved.
model_path = './diabetes.pkl'
joblib.dump(gbr_diabetes, model_path)
print(gbr_diabetes)
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['Number of times pregnant', 'Plasma glucose ' 'concentration a 2 hours in ' 'an oral glucose tolerance ' 'test', 'Diastolic blood pressure ' '(mm Hg)', 'Triceps skin fold ' 'thickness (mm)', '2-Hour s...nsulin (mu ' 'U/ml)', 'Body mass index (weight in ' 'kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']), ('cat', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='N/A', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))]), [])])), ('gradientboostingregressor', GradientBoostingRegressor(random_state=42))])