#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


import os
import sys
import warnings

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error as mse
from scipy.stats import entropy
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from causalml.inference.meta import LRSRegressor
from causalml.inference.meta import XGBTRegressor, MLPTRegressor
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
from causalml.inference.nn import DragonNet
from causalml.match import NearestNeighborMatch, MatchOptimizer, create_table_one
from causalml.propensity import ElasticNetPropensityModel
from causalml.dataset.regression import *
from causalml.metrics import *

get_ipython().run_line_magic('matplotlib', 'inline')

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
sns.set_palette('Paired')
plt.rcParams['figure.figsize'] = (12, 8)


# # IHDP semi-synthetic dataset
#
# Hill (2011) introduced a semi-synthetic dataset constructed from the Infant Health
# and Development Program (IHDP). It is based on a randomized experiment
# investigating the effect of home visits by specialists on future cognitive
# test scores, and it has 747 observations (rows). Because both the factual and
# the counterfactual outcome are simulated for every unit, the true treatment
# effect is known exactly, and the IHDP simulation is considered the de facto
# standard benchmark for neural-network-based treatment effect estimation methods.
#
# The original [paper](https://arxiv.org/pdf/1906.02120.pdf) uses 1,000 realizations
# from the NPCI package, but for illustration purposes we use a single dataset
# (realization) as an example below.
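# As a sketch only (not part of the original analysis): if additional realization
# files following the same naming pattern as `ihdp_npci_3.csv` were available
# locally (an assumption), the per-realization true ATEs could be aggregated to
# mimic the paper's multi-realization evaluation.

# In[ ]:


# Sketch: average the true ATE over whatever realization files are present.
# The file-naming pattern is assumed from the single file used below.
realization_ates = []
for i in range(1, 11):
    path = f'data/ihdp_npci_{i}.csv'
    if not os.path.exists(path):
        continue
    df_i = pd.read_csv(path, header=None)
    df_i.columns = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1"] + [f'x{j}' for j in range(1, 26)]
    # True ITE per unit: outcome under treatment minus outcome under control
    tau_i = np.where(df_i['treatment'] == 1,
                     df_i['y_factual'] - df_i['y_cfactual'],
                     df_i['y_cfactual'] - df_i['y_factual'])
    realization_ates.append(tau_i.mean())

if realization_ates:
    print(f'ATE over {len(realization_ates)} realization(s): '
          f'{np.mean(realization_ates):.3f} +/- {np.std(realization_ates):.3f}')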
# In[3]:


df = pd.read_csv('data/ihdp_npci_3.csv', header=None)
cols = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1"] + [f'x{i}' for i in range(1, 26)]
df.columns = cols


# In[4]:


df.shape


# In[5]:


df.head()


# In[6]:


# Share of treated vs. control units
df['treatment'].value_counts(normalize=True)


# In[7]:


X = df.loc[:, 'x1':]
treatment = df['treatment']
y = df['y_factual']

# True ITE: both potential outcomes are available in this semi-synthetic data,
# so the unit-level treatment effect can be computed exactly.
tau = df.apply(lambda d: d['y_factual'] - d['y_cfactual'] if d['treatment'] == 1
               else d['y_cfactual'] - d['y_factual'], axis=1)


# In[8]:


# Alternative propensity model using scikit-learn directly:
# p_model = LogisticRegressionCV(penalty='elasticnet', solver='saga', l1_ratios=np.linspace(0, 1, 5),
#                                cv=StratifiedKFold(n_splits=4, shuffle=True))
# p_model.fit(X, treatment)
# p = p_model.predict_proba(X)[:, 1]


# In[9]:


# Estimate propensity scores P(treatment=1 | X), needed by the X- and R-learners
p_model = ElasticNetPropensityModel()
p = p_model.fit_predict(X, treatment)


# In[10]:


# Meta-learners with LightGBM base models
s_learner = BaseSRegressor(LGBMRegressor())
s_ate = s_learner.estimate_ate(X, treatment, y)[0]
s_ite = s_learner.fit_predict(X, treatment, y)

t_learner = BaseTRegressor(LGBMRegressor())
t_ate = t_learner.estimate_ate(X, treatment, y)[0][0]
t_ite = t_learner.fit_predict(X, treatment, y)

x_learner = BaseXRegressor(LGBMRegressor())
x_ate = x_learner.estimate_ate(X, treatment, y, p)[0][0]
x_ite = x_learner.fit_predict(X, treatment, y, p)

r_learner = BaseRRegressor(LGBMRegressor())
r_ate = r_learner.estimate_ate(X, treatment, y, p)[0][0]
r_ite = r_learner.fit_predict(X, treatment, y, p)


# In[11]:


dragon = DragonNet(neurons_per_layer=200, targeted_reg=True)
dragon_ite = dragon.fit_predict(X, treatment, y, return_components=False)
dragon_ate = dragon_ite.mean()


# In[12]:


df_preds = pd.DataFrame([s_ite.ravel(),
                         t_ite.ravel(),
                         x_ite.ravel(),
                         r_ite.ravel(),
                         dragon_ite.ravel(),
                         tau.ravel(),
                         treatment.ravel(),
                         y.ravel()],
                        index=['S', 'T', 'X', 'R', 'dragonnet', 'tau', 'w', 'y']).T

df_cumgain = get_cumgain(df_preds)


# In[13]:


df_result = pd.DataFrame([s_ate, t_ate, x_ate, r_ate, dragon_ate, tau.mean()],
                         index=['S', 'T', 'X', 'R', 'dragonnet', 'actual'], columns=['ATE'])
df_result['MAE'] = [mean_absolute_error(t, p)
                    for t, p in zip([s_ite, t_ite, x_ite, r_ite, dragon_ite],
                                    [tau.values.reshape(-1, 1)] * 5)] + [None]
df_result['AUUC'] = auuc_score(df_preds)


# In[14]:


df_result


# In[15]:


plot_gain(df_preds)
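# `get_cumgain`, `auuc_score`, and `plot_gain` come from `causalml.metrics`.
# As a rough, illustrative sketch (simplified relative to causalml's actual
# implementation, and not part of the original notebook), a cumulative gain
# curve for one score can be computed by ranking units by predicted uplift and
# tracking the difference in mean outcomes between treated and control units
# among the top-k:

# In[ ]:


def simple_cumgain(ite_pred, w, y):
    """Illustrative cumulative gain: (mean treated outcome - mean control
    outcome) among the top-k units by predicted uplift, scaled by k."""
    order = np.argsort(-np.ravel(ite_pred))            # rank by predicted uplift, descending
    w_s = np.asarray(w)[order]
    y_s = np.asarray(y)[order]
    k = np.arange(1, len(y_s) + 1)
    n_treated = np.clip(np.cumsum(w_s), 1, None)       # avoid division by zero
    n_control = np.clip(np.cumsum(1 - w_s), 1, None)
    lift = np.cumsum(y_s * w_s) / n_treated - np.cumsum(y_s * (1 - w_s)) / n_control
    return lift * k


# e.g. compare against the S-learner's column of df_cumgain
cumgain_s = simple_cumgain(s_ite, treatment, y)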
# # `causalml` Synthetic Data Generation Method

# In[16]:


y, X, w, tau, b, e = simulate_nuisance_and_easy_treatment(n=1000)

X_train, X_val, y_train, y_val, w_train, w_val, tau_train, tau_val, b_train, b_val, e_train, e_val = \
    train_test_split(X, y, w, tau, b, e, test_size=0.2, random_state=123, shuffle=True)

preds_dict_train = {}
preds_dict_valid = {}

preds_dict_train['Actuals'] = tau_train
preds_dict_valid['Actuals'] = tau_val

preds_dict_train['generated_data'] = {
    'y': y_train,
    'X': X_train,
    'w': w_train,
    'tau': tau_train,
    'b': b_train,
    'e': e_train}
preds_dict_valid['generated_data'] = {
    'y': y_val,
    'X': X_val,
    'w': w_val,
    'tau': tau_val,
    'b': b_val,
    'e': e_val}

# Predict p_hat because e would not be directly observed in real life
p_model = ElasticNetPropensityModel()
p_hat_train = p_model.fit_predict(X_train, w_train)
p_hat_val = p_model.fit_predict(X_val, w_val)

for base_learner, label_l in zip([BaseSRegressor, BaseTRegressor, BaseXRegressor, BaseRRegressor],
                                 ['S', 'T', 'X', 'R']):
    for model, label_m in zip([LinearRegression, XGBRegressor], ['LR', 'XGB']):
        # The R-learner needs to fit on p_hat; the others take it (or the
        # training data) at predict time, hence the try/except below
        if label_l != 'R':
            learner = base_learner(model())
            # fit the model on training data only
            learner.fit(X=X_train, treatment=w_train, y=y_train)
            try:
                preds_dict_train['{} Learner ({})'.format(label_l, label_m)] = \
                    learner.predict(X=X_train, p=p_hat_train).flatten()
                preds_dict_valid['{} Learner ({})'.format(label_l, label_m)] = \
                    learner.predict(X=X_val, p=p_hat_val).flatten()
            except TypeError:
                preds_dict_train['{} Learner ({})'.format(label_l, label_m)] = \
                    learner.predict(X=X_train, treatment=w_train, y=y_train).flatten()
                preds_dict_valid['{} Learner ({})'.format(label_l, label_m)] = \
                    learner.predict(X=X_val, treatment=w_val, y=y_val).flatten()
        else:
            learner = base_learner(model())
            learner.fit(X=X_train, p=p_hat_train, treatment=w_train, y=y_train)
            preds_dict_train['{} Learner ({})'.format(label_l, label_m)] = \
                learner.predict(X=X_train).flatten()
            preds_dict_valid['{} Learner ({})'.format(label_l, label_m)] = \
                learner.predict(X=X_val).flatten()

learner = DragonNet(verbose=False)
learner.fit(X_train, treatment=w_train, y=y_train)
preds_dict_train['DragonNet'] = learner.predict_tau(X=X_train).flatten()
preds_dict_valid['DragonNet'] = learner.predict_tau(X=X_val).flatten()


# In[17]:


actuals_train = preds_dict_train['Actuals']
actuals_validation = preds_dict_valid['Actuals']

synthetic_summary_train = pd.DataFrame({label: [preds.mean(), mse(preds, actuals_train)]
                                        for label, preds in preds_dict_train.items()
                                        if 'generated' not in label.lower()},
                                       index=['ATE', 'MSE']).T
synthetic_summary_train['Abs % Error of ATE'] = np.abs(
    (synthetic_summary_train['ATE'] / synthetic_summary_train.loc['Actuals', 'ATE']) - 1)

synthetic_summary_validation = pd.DataFrame({label: [preds.mean(), mse(preds, actuals_validation)]
                                             for label, preds in preds_dict_valid.items()
                                             if 'generated' not in label.lower()},
                                            index=['ATE', 'MSE']).T
synthetic_summary_validation['Abs % Error of ATE'] = np.abs(
    (synthetic_summary_validation['ATE'] / synthetic_summary_validation.loc['Actuals', 'ATE']) - 1)

# calculate KL divergence for training
for label in synthetic_summary_train.index:
    stacked_values = np.hstack((preds_dict_train[label], actuals_train))
    stacked_low = np.percentile(stacked_values, 0.1)
    stacked_high = np.percentile(stacked_values, 99.9)
    bins = np.linspace(stacked_low, stacked_high, 100)

    distr = np.histogram(preds_dict_train[label], bins=bins)[0]
    distr = np.clip(distr / distr.sum(), 0.001, 0.999)
    true_distr = np.histogram(actuals_train, bins=bins)[0]
    true_distr = np.clip(true_distr / true_distr.sum(), 0.001, 0.999)

    kl = entropy(distr, true_distr)
    synthetic_summary_train.loc[label, 'KL Divergence'] = kl

# calculate KL divergence for validation
for label in synthetic_summary_validation.index:
    stacked_values = np.hstack((preds_dict_valid[label], actuals_validation))
    stacked_low = np.percentile(stacked_values, 0.1)
    stacked_high = np.percentile(stacked_values, 99.9)
    bins = np.linspace(stacked_low, stacked_high, 100)

    distr = np.histogram(preds_dict_valid[label], bins=bins)[0]
    distr = np.clip(distr / distr.sum(), 0.001, 0.999)
    true_distr = np.histogram(actuals_validation, bins=bins)[0]
    true_distr = np.clip(true_distr / true_distr.sum(), 0.001, 0.999)

    kl = entropy(distr, true_distr)
    synthetic_summary_validation.loc[label, 'KL Divergence'] = kl
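# The two loops above apply identical logic to the training and validation
# dictionaries. As a refactoring sketch (the helper name `kl_divergence_score`
# is hypothetical, not a causalml API), the shared logic could be factored out:

# In[ ]:


def kl_divergence_score(preds, actuals, n_bins=100, eps=0.001):
    """Histogram-based KL divergence between two empirical distributions,
    binned between the 0.1st and 99.9th percentiles of the pooled values."""
    stacked = np.hstack((preds, actuals))
    bins = np.linspace(np.percentile(stacked, 0.1), np.percentile(stacked, 99.9), n_bins)
    p = np.histogram(preds, bins=bins)[0]
    q = np.histogram(actuals, bins=bins)[0]
    p = np.clip(p / p.sum(), eps, 1 - eps)
    q = np.clip(q / q.sum(), eps, 1 - eps)
    return entropy(p, q)


# Equivalent to one iteration of the loops above, e.g.:
# kl = kl_divergence_score(preds_dict_train[label], actuals_train)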
# In[18]:


df_preds_train = pd.DataFrame([preds_dict_train['S Learner (LR)'].ravel(),
                               preds_dict_train['S Learner (XGB)'].ravel(),
                               preds_dict_train['T Learner (LR)'].ravel(),
                               preds_dict_train['T Learner (XGB)'].ravel(),
                               preds_dict_train['X Learner (LR)'].ravel(),
                               preds_dict_train['X Learner (XGB)'].ravel(),
                               preds_dict_train['R Learner (LR)'].ravel(),
                               preds_dict_train['R Learner (XGB)'].ravel(),
                               preds_dict_train['DragonNet'].ravel(),
                               preds_dict_train['generated_data']['tau'].ravel(),
                               preds_dict_train['generated_data']['w'].ravel(),
                               preds_dict_train['generated_data']['y'].ravel()],
                              index=['S Learner (LR)', 'S Learner (XGB)',
                                     'T Learner (LR)', 'T Learner (XGB)',
                                     'X Learner (LR)', 'X Learner (XGB)',
                                     'R Learner (LR)', 'R Learner (XGB)',
                                     'DragonNet', 'tau', 'w', 'y']).T

# align AUUC scores with the summary table, dropping auuc_score's final entry
synthetic_summary_train['AUUC'] = auuc_score(df_preds_train).iloc[:-1]


# In[19]:


df_preds_validation = pd.DataFrame([preds_dict_valid['S Learner (LR)'].ravel(),
                                    preds_dict_valid['S Learner (XGB)'].ravel(),
                                    preds_dict_valid['T Learner (LR)'].ravel(),
                                    preds_dict_valid['T Learner (XGB)'].ravel(),
                                    preds_dict_valid['X Learner (LR)'].ravel(),
                                    preds_dict_valid['X Learner (XGB)'].ravel(),
                                    preds_dict_valid['R Learner (LR)'].ravel(),
                                    preds_dict_valid['R Learner (XGB)'].ravel(),
                                    preds_dict_valid['DragonNet'].ravel(),
                                    preds_dict_valid['generated_data']['tau'].ravel(),
                                    preds_dict_valid['generated_data']['w'].ravel(),
                                    preds_dict_valid['generated_data']['y'].ravel()],
                                   index=['S Learner (LR)', 'S Learner (XGB)',
                                          'T Learner (LR)', 'T Learner (XGB)',
                                          'X Learner (LR)', 'X Learner (XGB)',
                                          'R Learner (LR)', 'R Learner (XGB)',
                                          'DragonNet', 'tau', 'w', 'y']).T

synthetic_summary_validation['AUUC'] = auuc_score(df_preds_validation).iloc[:-1]


# In[20]:


synthetic_summary_train


# In[21]:


synthetic_summary_validation


# In[22]:


plot_gain(df_preds_train)


# In[23]:


plot_gain(df_preds_validation)
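# Cells 18 and 19 above assemble the same prediction frame twice by hand. As a
# compact refactoring sketch (the helper name `build_preds_frame` is
# hypothetical, not a causalml API), the construction could be shared:

# In[ ]:


def build_preds_frame(preds_dict):
    """Collect learner ITE predictions plus the tau/w/y columns into the
    frame that plot_gain and auuc_score expect."""
    gen = preds_dict['generated_data']
    data = {label: np.ravel(preds) for label, preds in preds_dict.items()
            if label not in ('Actuals', 'generated_data')}
    data.update({'tau': np.ravel(gen['tau']),
                 'w': np.ravel(gen['w']),
                 'y': np.ravel(gen['y'])})
    return pd.DataFrame(data)


# e.g. plot_gain(build_preds_frame(preds_dict_train))


# In[ ]: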