#!/usr/bin/env python
# coding: utf-8

# ## ML Pipeline with Sklearn

# In[1]:

# Load libraries and the sample diabetes dataset.
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# Consolidated: the three metrics were previously imported one per line.
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
from yellowbrick.regressor import PredictionError

df = pd.read_csv('./diabetes.csv')
print(df.shape)
df

# In[2]:

# Rename 'Class variable' so the target column name is a valid identifier
# (no embedded space) for the rest of the pipeline.
df = df.rename(columns={'Class variable': 'Class_variable'})
df

# In[3]:

# Simple check for nulls: compute the per-column null counts once
# (the original recomputed df.isna().sum() twice in the same expression),
# then show only the columns that actually contain missing values.
null_counts = df.isna().sum()
null_counts[null_counts > 0]

# In[9]:

# eda (automated)
# profile.to_widgets() -- research to fix...
# Automated EDA report: render inline and persist to HTML.
profile = ProfileReport(df)
profile.to_notebook_iframe()
profile.to_file('./reg_diabetes.html')

# In[ ]:

# take care of any missing values
# n/a in this case

# In[10]:

# Set aside and save an unseen data set for later out-of-sample predictions.
data_unseen = df.sample(n=100, random_state=42)
data = df.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('./diabetes_unseen.csv', index=False)

# In[11]:

# Split features / target. The target is selected as a 1-D Series (the
# original used data.loc[:, data.columns=='Class_variable'], a one-column
# DataFrame, which makes estimators emit DataConversionWarning on fit).
X = data.drop(columns='Class_variable')
y = data['Class_variable']

# In[12]:

X

# In[15]:

# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# In[16]:

# encoding: get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(num_cols, '\n', cat_cols)

# In[17]:

# pipeline for numerical columns: median-impute, then standardize
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
num_pipe

# In[18]:

# Pipeline for categorical columns: constant-impute, then one-hot encode.
# FIX: the `sparse=` keyword was removed from OneHotEncoder in
# scikit-learn 1.4; `sparse_output=` (introduced in 1.2) is the supported
# spelling for requesting a dense output array.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)
cat_pipe

# In[19]:

# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

# In[20]:

# build the model: preprocessing + gradient-boosted regressor
gbr_diabetes = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbr_diabetes

# In[22]:

# train the model
gbr_diabetes.fit(X_train, y_train)

# In[24]:

# make predictions on the test set
y_pred = gbr_diabetes.predict(X_test)

# In[25]:

# measure accuracy
print('R2:', r2_score(y_test, y_pred))

# In[26]:

# Build a scoring frame instead of assigning into y_test in place: the
# original `y_test['y_pred'] = y_pred` wrote into a train_test_split
# slice (SettingWithCopyWarning) and silently mutated y_test for any
# later cell that reuses it.
test_scores = y_test.to_frame(name='Class_variable').copy()
test_scores['y_pred'] = y_pred
test_scores

# In[27]:

# Break out the individual metrics from the frame built above.
r2 = r2_score(test_scores['Class_variable'], test_scores['y_pred'])
mae = mean_absolute_error(test_scores['Class_variable'], test_scores['y_pred'])
mean_act = test_scores['Class_variable'].mean()
mean_pred = test_scores['y_pred'].mean()
mape = mean_absolute_percentage_error(test_scores['Class_variable'], test_scores['y_pred'])
print(f'R2: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

# In[28]:

# Persist the fitted pipeline for reuse.
import joblib
joblib.dump(gbr_diabetes, './diabetes.pkl')
print(gbr_diabetes)

# In[ ]: