#!/usr/bin/env python
# coding: utf-8

# # TMLE Example Notebook
#
# This notebook demonstrates the issue of using uplift curves without knowing
# the true treatment effect, and how to solve it by using TMLE as a proxy of
# the true treatment effect.

# In[1]:

# Notebook-only conveniences: auto-reload edited modules and inline plots.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

import os

# Run from the repository root so relative paths resolve consistently.
base_path = os.path.abspath("../")
os.chdir(base_path)

# In[3]:

import logging
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
import sys
import warnings

warnings.simplefilter("ignore", UserWarning)

from lightgbm import LGBMRegressor

# In[7]:

import causalml
from causalml.dataset import synthetic_data
from causalml.inference.meta import BaseXRegressor, TMLELearner
from causalml.metrics.visualize import *
from causalml.propensity import calibrate

print(causalml.__version__)

# In[8]:

# Surface causalml's debug logging in the notebook output.
logger = logging.getLogger('causalml')
logger.setLevel(logging.DEBUG)

plt.style.use('fivethirtyeight')

# ## Generating Synthetic Data

# In[9]:

# Generate synthetic data using mode 1.
# Returns: outcome y, features X, treatment assignment, true effect tau,
# expected outcome b, and true propensity e.
y, X, treatment, tau, b, e = synthetic_data(mode=1, n=1000000, p=10, sigma=5.)
# In[10]:

# 50/50 split; every array is split with the same random permutation so the
# rows stay aligned across y, e, treatment, tau, and b.
(X_train, X_test, y_train, y_test, e_train, e_test,
 treatment_train, treatment_test, tau_train, tau_test,
 b_train, b_test) = train_test_split(
    X, y, e, treatment, tau, b, test_size=0.5, random_state=42)

# ## Calculating Individual Treatment Effect (ITE/CATE)

# In[12]:

# X Learner with LightGBM base learners; p is the true propensity here.
learner_x = BaseXRegressor(learner=LGBMRegressor())
learner_x.fit(X=X_train, treatment=treatment_train, y=y_train)
cate_x_test = learner_x.predict(X=X_test, p=e_test,
                                treatment=treatment_test).flatten()

# In[13]:

alpha = 0.2
bins = 30

plt.figure(figsize=(12, 8))
plt.hist(cate_x_test, alpha=alpha, bins=bins, label='X Learner')
plt.hist(tau_test, alpha=alpha, bins=bins, label='Actual')
plt.title('Distribution of CATE Predictions by X-Learner and Actual')
plt.xlabel('Individual Treatment Effect (ITE/CATE)')
plt.ylabel('# of Samples')
_ = plt.legend()

# ## Validating CATE without TMLE

# In[14]:

# 'Actual' duplicates tau so the ground truth is scored like any other model.
df = pd.DataFrame({'y': y_test,
                   'w': treatment_test,
                   'tau': tau_test,
                   'X-Learner': cate_x_test,
                   'Actual': tau_test})

# ### Uplift Curve With Ground Truth
#
# If the true treatment effect is known, as in simulations, the uplift curve
# of a model uses the cumulative sum of the treatment effect sorted by the
# model's CATE estimate.
#
# In the figure below, the uplift curve of X-learner shows positive lift close
# to the optimal lift by the ground truth.

# In[15]:

plot(df, outcome_col='y', treatment_col='w', treatment_effect_col='tau')

# ### Uplift Curve Without Ground Truth
#
# If the true treatment effect is unknown, as in practice, the uplift curve of
# a model uses the cumulative mean difference of outcome in the treatment and
# control group sorted by the model's CATE estimate.
#
# In the figure below, the uplift curves of X-learner as well as the ground
# truth incorrectly show no lift.
# In[16]:

# Without the 'tau' column, plot() falls back to the observed-outcome
# difference, which shows no lift here.
plot(df.drop('tau', axis=1), outcome_col='y', treatment_col='w')

# ## TMLE

# ### Uplift Curve with TMLE as Ground Truth
#
# By using TMLE as a proxy of the ground truth, the uplift curves of X-learner
# and the ground truth become close to the original using the ground truth.

# In[17]:

n_fold = 5
kf = KFold(n_splits=n_fold)

# In[18]:

df = pd.DataFrame({'y': y_test,
                   'w': treatment_test,
                   'p': e_test,
                   'X-Learner': cate_x_test,
                   'Actual': tau_test})

# In[19]:

# Copy each feature into its own column; TMLE needs the covariates by name.
inference_cols = []
for i in range(X_test.shape[1]):
    col = 'col_' + str(i)
    df[col] = X_test[:, i]
    inference_cols.append(col)

# In[20]:

df.head()

# In[21]:

tmle_df = get_tmlegain(df, inference_col=inference_cols, outcome_col='y',
                       treatment_col='w', p_col='p', n_segment=5, cv=kf,
                       calibrate_propensity=True, ci=False)

# In[22]:

tmle_df

# #### Uplift Curve without CI
#
# Here we can directly use the plot_tmlegain() function to generate the
# results and plot the uplift curve.

# In[23]:

plot_tmlegain(df, inference_col=inference_cols, outcome_col='y',
              treatment_col='w', p_col='p', n_segment=5, cv=kf,
              calibrate_propensity=True, ci=False)

# We also provide the API call directly with plot() by passing kind='gain'
# together with tmle=True.

# In[24]:

plot(df, kind='gain', tmle=True, inference_col=inference_cols,
     outcome_col='y', treatment_col='w', p_col='p', n_segment=5, cv=kf,
     calibrate_propensity=True, ci=False)

# #### AUUC Score

# In[25]:

auuc_score(df, tmle=True, inference_col=inference_cols, outcome_col='y',
           treatment_col='w', p_col='p', n_segment=5, cv=kf,
           calibrate_propensity=True, ci=False)

# #### Uplift Curve with CI

# In[25]:

tmle_df = get_tmlegain(df, inference_col=inference_cols, outcome_col='y',
                       treatment_col='w', p_col='p', n_segment=5, cv=kf,
                       calibrate_propensity=True, ci=True)

# In[26]:

tmle_df

# In[27]:

plot_tmlegain(df, inference_col=inference_cols, outcome_col='y',
              treatment_col='w', p_col='p', n_segment=5, cv=kf,
              calibrate_propensity=True, ci=True)

# In[29]:

plot(df, kind='gain', tmle=True, inference_col=inference_cols,
     outcome_col='y', treatment_col='w', p_col='p', n_segment=5, cv=kf,
     calibrate_propensity=True, ci=True)

# ### Qini Curve with TMLE as Ground Truth

# #### Qini Curve without CI

# In[30]:

qini = get_tmleqini(df, inference_col=inference_cols, outcome_col='y',
                    treatment_col='w', p_col='p', n_segment=5, cv=kf,
                    calibrate_propensity=True, ci=False)

# In[31]:

qini

# In[32]:

plot_tmleqini(df, inference_col=inference_cols, outcome_col='y',
              treatment_col='w', p_col='p', n_segment=5, cv=kf,
              calibrate_propensity=True, ci=False)

# We also provide the API call directly with plot() by passing kind='qini'
# together with tmle=True.

# In[34]:

plot(df, kind='qini', tmle=True, inference_col=inference_cols,
     outcome_col='y', treatment_col='w', p_col='p', n_segment=5, cv=kf,
     calibrate_propensity=True, ci=False)

# #### Qini Score

# In[26]:

qini_score(df, tmle=True, inference_col=inference_cols, outcome_col='y',
           treatment_col='w', p_col='p', n_segment=5, cv=kf,
           calibrate_propensity=True, ci=False)

# #### Qini Curve with CI

# In[36]:

qini = get_tmleqini(df, inference_col=inference_cols, outcome_col='y',
                    treatment_col='w', p_col='p', n_segment=5, cv=kf,
                    calibrate_propensity=True, ci=True)

# In[37]:

qini

# In[38]:

plot_tmleqini(df, inference_col=inference_cols, outcome_col='y',
              treatment_col='w', p_col='p', n_segment=5, cv=kf,
              calibrate_propensity=True, ci=True)

# In[39]:

plot(df, kind='qini', tmle=True, inference_col=inference_cols,
     outcome_col='y', treatment_col='w', p_col='p', n_segment=5, cv=kf,
     calibrate_propensity=True, ci=True)