This example notebook illustrates how to visualize uplift trees for interpretation and diagnosis.

These visualization functions work only for tree-based algorithms; they currently do NOT support meta-learner algorithms.

This notebook shows how to use the visualizations for:

- Uplift Tree and Uplift Random Forest
- Training and Validation Data
- One Treatment Group and Multiple Treatment Groups
from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot
import numpy as np
import pandas as pd
from IPython.display import Image
from sklearn.model_selection import train_test_split
## One Treatment Group

# Data generation
df, x_names = make_uplift_classification()
df.head()
# Keep only the control and treatment1 groups for the single-treatment example
df = df[df['treatment_group_key'].isin(['control', 'treatment1'])]
# Look at the conversion rate and sample size in each group
df.pivot_table(values='conversion',
               index='treatment_group_key',
               aggfunc=[np.mean, np.size],
               margins=True)
| treatment_group_key | mean (conversion) | size (conversion) |
|---|---|---|
| control | 0.5110 | 1000 |
| treatment1 | 0.5140 | 1000 |
| All | 0.5125 | 2000 |
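Before fitting any tree, it can help to look at the overall, unsegmented uplift implied by these group means. The snippet below is not part of the original notebook; it is a minimal sketch that reuses the df generated above.

# Naive overall uplift: difference in mean conversion between treatment1 and control
rates = df.groupby('treatment_group_key')['conversion'].mean()
print('Overall uplift (treatment1 vs. control): {:.4f}'.format(rates['treatment1'] - rates['control']))

With the simulated data above this is roughly 0.514 - 0.511 = 0.003; the uplift tree adds value by finding subgroups whose uplift deviates from this small average.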
# Split data to training and testing samples for model validation (next section)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)
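The random split above is fine here because the two groups are the same size. If the treatment assignment were imbalanced, the split could be stratified by treatment group instead; this optional variant (note the different variable names) is not used in the rest of the notebook.

# Optional: stratified split that preserves the treatment/control proportions
df_train_strat, df_test_strat = train_test_split(df, test_size=0.2, random_state=111,
                                                 stratify=df['treatment_group_key'])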
### Uplift Tree

# Train uplift tree
uplift_model = UpliftTreeClassifier(max_depth=5,
                                    min_samples_leaf=200,
                                    min_samples_treatment=50,
                                    n_reg=100,
                                    evaluationFunction='KL',
                                    control_name='control')
uplift_model.fit(df_train[x_names].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values)
<causalml.inference.tree.models.UpliftTreeClassifier at 0x7f9820592128>
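The evaluationFunction='KL' argument above selects a split criterion built on the Kullback-Leibler divergence between the treatment and control outcome distributions in a node. The toy function below only illustrates that building block for a binary outcome; it is not the library's actual implementation, which also handles multiple treatment groups, split gains, and regularization.

def kl_divergence_binary(p_treatment, p_control, eps=1e-6):
    # KL divergence between two Bernoulli distributions with success
    # probabilities p_treatment and p_control (clipped to avoid log(0))
    p_treatment = np.clip(p_treatment, eps, 1 - eps)
    p_control = np.clip(p_control, eps, 1 - eps)
    return (p_treatment * np.log(p_treatment / p_control)
            + (1 - p_treatment) * np.log((1 - p_treatment) / (1 - p_control)))

# Nodes where treatment and control conversion rates diverge score higher
print(kl_divergence_binary(0.63, 0.44))  # clearly different rates -> larger divergence
print(kl_divergence_binary(0.54, 0.53))  # nearly identical rates -> divergence close to 0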
# Print uplift tree as a string
result = uplift_tree_string(uplift_model.fitted_uplift_tree, x_names)
x18_uplift_increase >= -1.3785915096595742?
  yes -> x1_informative >= -0.6364361308885705?
    yes -> x7_irrelevant >= -0.7090765407021462?
      yes -> x19_increase_mix >= -0.8244948824290843?
        yes -> {'treatment1': 0.542857, 'control': 0.531469}
        no  -> {'treatment1': 0.62963, 'control': 0.436782}
      no  -> {'treatment1': 0.449612, 'control': 0.495798}
    no  -> {'treatment1': 0.46988, 'control': 0.52381}
  no  -> {'treatment1': 0.388489, 'control': 0.559055}
# Plot uplift tree
graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, x_names)
Image(graph.create_png())
### Fill the trained tree with the testing data set

Note that the validation uplift score will update.

# The uplift score based on the testing dataset is shown as the validation uplift score in the tree nodes
uplift_model.fill(X=df_test[x_names].values, treatment=df_test['treatment_group_key'].values, y=df_test['conversion'].values)
# Plot uplift tree
graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, x_names)
Image(graph.create_png())
### Uplift Random Forest

# Split data to training and testing samples for model validation (next section)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)
# Train uplift tree
uplift_model = UpliftRandomForestClassifier(n_estimators=5,
                                            max_depth=5,
                                            min_samples_leaf=200,
                                            min_samples_treatment=50,
                                            n_reg=100,
                                            evaluationFunction='KL',
                                            control_name='control')
uplift_model.fit(df_train[x_names].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values)
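Besides inspecting individual trees, the fitted forest can score new observations directly. This is a minimal sketch assuming the usual UpliftRandomForestClassifier.predict interface, which returns one predicted uplift column per treatment group; the exact output shape and column ordering may differ across causalml versions.

# Predicted treatment effects for the held-out data (one column per non-control treatment group)
y_pred = uplift_model.predict(df_test[x_names].values)
print(y_pred.shape)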
# Specify a tree in the random forest (the index can be any integer from 0 to n_estimators-1)
uplift_tree = uplift_model.uplift_forest[0]
# Print uplift tree as a string
result = uplift_tree_string(uplift_tree.fitted_uplift_tree, x_names)
x18_uplift_increase >= -1.4802946520331732?
  yes -> x16_increase_mix >= 1.028652295155747?
    yes -> x5_informative >= 1.1517351173273966?
      yes -> {'treatment1': 0.646018, 'control': 0.25}
      no  -> {'treatment1': 0.525547, 'control': 0.411765}
    no  -> x17_uplift_increase >= -0.9531241143484912?
      yes -> {'control': 0.397959, 'treatment1': 0.513661}
      no  -> x15_uplift_increase >= -0.2021677782274923?
        yes -> {'control': 0.611511, 'treatment1': 0.417323}
        no  -> {'treatment1': 0.546154, 'control': 0.575342}
  no  -> {'treatment1': 0.407767, 'control': 0.529412}
# Plot uplift tree
graph = uplift_tree_plot(uplift_tree.fitted_uplift_tree, x_names)
Image(graph.create_png())
### Fill the trained tree with the testing data set

# The uplift score based on the testing dataset is shown as the validation uplift score in the tree nodes
uplift_tree.fill(X=df_test[x_names].values, treatment=df_test['treatment_group_key'].values, y=df_test['conversion'].values)
# Plot uplift tree
graph = uplift_tree_plot(uplift_tree.fitted_uplift_tree, x_names)
Image(graph.create_png())
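The same plotting calls work for every tree in the forest, so all of them can be rendered in one pass, for example to save each tree as its own image file (the file names here are arbitrary).

# Render and save every tree in the fitted forest
for i, tree in enumerate(uplift_model.uplift_forest):
    graph = uplift_tree_plot(tree.fitted_uplift_tree, x_names)
    graph.write_png('uplift_tree_{}.png'.format(i))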
## Multiple Treatment Groups

# Data generation
df, x_names = make_uplift_classification()
# Look at the conversion rate and sample size in each group
df.pivot_table(values='conversion',
               index='treatment_group_key',
               aggfunc=[np.mean, np.size],
               margins=True)
| treatment_group_key | mean (conversion) | size (conversion) |
|---|---|---|
| control | 0.511 | 1000 |
| treatment1 | 0.514 | 1000 |
| treatment2 | 0.559 | 1000 |
| treatment3 | 0.600 | 1000 |
| All | 0.546 | 4000 |
# Split data to training and testing samples for model validation (next section)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)
# Train uplift tree
uplift_model = UpliftTreeClassifier(max_depth=3,
                                    min_samples_leaf=200,
                                    min_samples_treatment=50,
                                    n_reg=100,
                                    evaluationFunction='KL',
                                    control_name='control')
uplift_model.fit(df_train[x_names].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values)
<causalml.inference.tree.models.UpliftTreeClassifier at 0x7f97d18c6e10>
# Plot uplift tree
# The uplift score represents the best uplift score among all treatment effects
graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, x_names)
Image(graph.create_png())
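The node-level "best uplift score" has a per-sample analogue: score the validation data and recommend, for each sample, the treatment with the largest predicted uplift. The sketch below assumes UpliftTreeClassifier.predict returns one column of predicted uplift per treatment group; both the return format and the column-to-treatment mapping should be checked against the causalml version in use.

# Recommend, for each validation sample, the treatment with the largest predicted uplift
pred = np.asarray(uplift_model.predict(df_test[x_names].values))
best_treatment_idx = np.argmax(pred, axis=1)  # column order follows the classifier's treatment group ordering
print(np.bincount(best_treatment_idx))        # how often each treatment is recommended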
# Save the graph as pdf
graph.write_pdf("tbc.pdf")
# Save the graph as png
graph.write_png("tbc.png")
True