#!/usr/bin/env python
# coding: utf-8

# ## ML Pipeline with Sklearn

# In[1]:

# Load libraries and the sample diabetes dataset.
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# Consolidated: the three metrics were previously imported one per line.
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
from yellowbrick.regressor import PredictionError

df = pd.read_csv('./diabetes.csv')
print(df.shape)
df

# In[2]:

# Rename 'Class variable' so the target column name is a valid identifier
# (no embedded space) for the rest of the pipeline.
df = df.rename(columns={'Class variable': 'Class_variable'})
df

# In[3]:

# Simple check for nulls: compute the per-column null counts once
# (the original recomputed df.isna().sum() twice in the same expression),
# then show only the columns that actually contain missing values.
null_counts = df.isna().sum()
null_counts[null_counts > 0]

# In[9]:

# eda (automated)
# profile.to_widgets() -- research to fix...
# Automated EDA report: render inline and persist to HTML.
profile = ProfileReport(df)
profile.to_notebook_iframe()
profile.to_file('./reg_diabetes.html')

# In[ ]:

# take care of any missing values
# n/a in this case

# In[10]:

# Set aside and save an unseen data set for later out-of-sample predictions.
data_unseen = df.sample(n=100, random_state=42)
data = df.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('./diabetes_unseen.csv', index=False)

# In[11]:

# Split features / target. The target is selected as a 1-D Series (the
# original used data.loc[:, data.columns=='Class_variable'], a one-column
# DataFrame, which makes estimators emit DataConversionWarning on fit).
X = data.drop(columns='Class_variable')
y = data['Class_variable']

# In[12]:

X

# In[15]:

# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# In[16]:

# encoding: get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(num_cols, '\n', cat_cols)

# In[17]:

# pipeline for numerical columns: median-impute, then standardize
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
num_pipe

# In[18]:

# Pipeline for categorical columns: constant-impute, then one-hot encode.
# FIX: the `sparse=` keyword was removed from OneHotEncoder in
# scikit-learn 1.4; `sparse_output=` (introduced in 1.2) is the supported
# spelling for requesting a dense output array.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)
cat_pipe

# In[19]:

# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

# In[20]:

# build the model: preprocessing + gradient-boosted regressor
gbr_diabetes = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbr_diabetes

# In[22]:

# train the model
gbr_diabetes.fit(X_train, y_train)

# In[24]:

# make predictions on the test set
y_pred = gbr_diabetes.predict(X_test)

# In[25]:

# measure accuracy
print('R2:', r2_score(y_test, y_pred))

# In[26]:

# Build a scoring frame instead of assigning into y_test in place: the
# original `y_test['y_pred'] = y_pred` wrote into a train_test_split
# slice (SettingWithCopyWarning) and silently mutated y_test for any
# later cell that reuses it.
test_scores = y_test.to_frame(name='Class_variable').copy()
test_scores['y_pred'] = y_pred
test_scores

# In[27]:

# Break out the individual metrics from the frame built above.
r2 = r2_score(test_scores['Class_variable'], test_scores['y_pred'])
mae = mean_absolute_error(test_scores['Class_variable'], test_scores['y_pred'])
mean_act = test_scores['Class_variable'].mean()
mean_pred = test_scores['y_pred'].mean()
mape = mean_absolute_percentage_error(test_scores['Class_variable'], test_scores['y_pred'])
print(f'R2: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

# In[28]:

# Persist the fitted pipeline for reuse.
import joblib
joblib.dump(gbr_diabetes, './diabetes.pkl')
print(gbr_diabetes)

# In[ ]: