This example notebook illustrates how to visualize uplift trees for interpretation and diagnosis.
These visualization functions work only for tree-based algorithms:
Currently, they do NOT support meta-learner algorithms
This notebook will show how to use visualization for:
Uplift Tree and Uplift Random Forest
Training and Validation Data
One Treatment Group and Multiple Treatment Groups
from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot
The sklearn.utils.testing module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
import numpy as np
import pandas as pd
from IPython.display import Image
from sklearn.model_selection import train_test_split
# Generate the synthetic uplift classification data set and its feature names
df, x_names = make_uplift_classification()
# Replace the generated feature names with short, index-based ones
# (feature_0, feature_1, ...) so the tree visualizations are easier to read
x_names_new = [f'feature_{idx}' for idx, _ in enumerate(x_names)]
rename_dict = dict(zip(x_names, x_names_new))
df = df.rename(columns=rename_dict)
x_names = x_names_new
df.head()
# Keep only the control group and the first treatment group
# for the single-treatment examples that follow
df = df[df['treatment_group_key'].isin(['control', 'treatment1'])]
# Summarize each treatment group: mean of `conversion` (the conversion rate)
# and the number of rows; margins=True appends an overall "All" row
pd.pivot_table(
    df,
    values='conversion',
    index='treatment_group_key',
    aggfunc=[np.mean, np.size],
    margins=True,
)
mean | size | |
---|---|---|
conversion | conversion | |
treatment_group_key | ||
control | 0.5110 | 1000 |
treatment1 | 0.5140 | 1000 |
All | 0.5125 | 2000 |
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible. The test set is used for model validation in a later section.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)
# Configure a single uplift tree and fit it on the training rows
uplift_model = UpliftTreeClassifier(
    max_depth=4,
    min_samples_leaf=200,
    min_samples_treatment=50,
    n_reg=100,
    evaluationFunction='KL',
    control_name='control',
)
uplift_model.fit(
    df_train[x_names].values,
    treatment=df_train['treatment_group_key'].values,
    y=df_train['conversion'].values,
)
# Print the fitted uplift tree as a string
result = uplift_tree_string(uplift_model.fitted_uplift_tree, x_names)
feature_17 >= -0.44234212654232735? yes -> feature_10 >= 1.020659213325515? yes -> {'treatment1': 0.606557, 'control': 0.381356} no -> {'treatment1': 0.526786, 'control': 0.507812} no -> feature_9 >= 0.8142773340486678? yes -> {'treatment1': 0.61, 'control': 0.459677} no -> feature_4 >= 0.280545459525536? yes -> {'treatment1': 0.41433, 'control': 0.552288} no -> {'treatment1': 0.574803, 'control': 0.507042}
# Build a graph of the fitted uplift tree and display it inline as a PNG image
graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, x_names)
Image(data=graph.create_png())
Note that after the tree is filled with the testing data set, the validation uplift score shown in the tree nodes is updated.
### Fill the trained tree with the testing data set
# The uplift score computed on the testing data set is shown as the
# validation uplift score in the tree nodes
uplift_model.fill(
    X=df_test[x_names].values,
    treatment=df_test['treatment_group_key'].values,
    y=df_test['conversion'].values,
)
# Re-plot the uplift tree so the nodes show the validation uplift score
graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, x_names)
Image(data=graph.create_png())
# Hold out 20% of the rows as a test set (same settings and seed as the
# single-tree example); the test set is used for validation further below
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)
# Configure an uplift random forest of 5 trees and fit it on the training rows
uplift_model = UpliftRandomForestClassifier(
    n_estimators=5,
    max_depth=5,
    min_samples_leaf=200,
    min_samples_treatment=50,
    n_reg=100,
    evaluationFunction='KL',
    control_name='control',
)
uplift_model.fit(
    df_train[x_names].values,
    treatment=df_train['treatment_group_key'].values,
    y=df_train['conversion'].values,
)
# Pick one tree from the forest to inspect
# (the index can be any integer from 0 to n_estimators-1)
uplift_tree = uplift_model.uplift_forest[0]
# Print the selected uplift tree as a string
result = uplift_tree_string(uplift_tree.fitted_uplift_tree, x_names)
feature_18 >= -1.286536457386194? yes -> feature_9 >= 0.8272685423844407? yes -> {'control': 0.518519, 'treatment1': 0.661017} no -> feature_4 >= 0.22640723668841609? yes -> feature_6 >= -0.06339335394661161? yes -> {'control': 0.540373, 'treatment1': 0.539474} no -> {'control': 0.601227, 'treatment1': 0.393333} no -> {'control': 0.539394, 'treatment1': 0.557692} no -> feature_13 >= 0.8875002743438594? yes -> {'control': 0.414414, 'treatment1': 0.671756} no -> {'control': 0.544643, 'treatment1': 0.512}
# Build a graph of the selected forest tree and display it inline as a PNG image
graph = uplift_tree_plot(uplift_tree.fitted_uplift_tree, x_names)
Image(data=graph.create_png())
### Fill the trained tree with the testing data set
# The uplift score computed on the testing data set is shown as the
# validation uplift score in the tree nodes
uplift_tree.fill(
    X=df_test[x_names].values,
    treatment=df_test['treatment_group_key'].values,
    y=df_test['conversion'].values,
)
# Re-plot the tree so the nodes show the validation uplift score
graph = uplift_tree_plot(uplift_tree.fitted_uplift_tree, x_names)
Image(data=graph.create_png())
# Regenerate the synthetic data set; this time every treatment group is kept
# and the generated feature names are used as-is
df, x_names = make_uplift_classification()
# Summarize each treatment group: mean of `conversion` (the conversion rate)
# and the number of rows; margins=True appends an overall "All" row
pd.pivot_table(
    df,
    values='conversion',
    index='treatment_group_key',
    aggfunc=[np.mean, np.size],
    margins=True,
)
mean | size | |
---|---|---|
conversion | conversion | |
treatment_group_key | ||
control | 0.511 | 1000 |
treatment1 | 0.514 | 1000 |
treatment2 | 0.559 | 1000 |
treatment3 | 0.600 | 1000 |
All | 0.546 | 4000 |
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)
# Configure a single uplift tree and fit it on the multi-treatment training rows
uplift_model = UpliftTreeClassifier(
    max_depth=3,
    min_samples_leaf=200,
    min_samples_treatment=50,
    n_reg=100,
    evaluationFunction='KL',
    control_name='control',
)
uplift_model.fit(
    df_train[x_names].values,
    treatment=df_train['treatment_group_key'].values,
    y=df_train['conversion'].values,
)
# Plot the uplift tree and display it inline as a PNG image.
# The uplift score represents the best uplift score among all treatment effects.
graph = uplift_tree_plot(uplift_model.fitted_uplift_tree, x_names)
Image(data=graph.create_png())
# Export the current `graph` object to a PDF file named tbc.pdf
graph.write_pdf("tbc.pdf")
# Export the same `graph` object to a PNG image named tbc.png
graph.write_png("tbc.png")
True