We demonstrate the use of the two-stage least squares (2SLS) method from the causalml package to estimate the average treatment effect, first on semi-synthetic data and then on fully synthetic data.
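For context (this is the standard textbook construction; the exact internals of `IVRegressor` may differ slightly): 2SLS handles an endogenous treatment by first projecting it onto the instruments, then regressing the outcome on the fitted values. With regressors $W = [T, X]$ and instruments $Z = [w, X]$, the estimator is

$$\hat\beta_{2SLS} = \left(W^\top P_Z W\right)^{-1} W^\top P_Z\, y, \qquad P_Z = Z\left(Z^\top Z\right)^{-1} Z^\top.$$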
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os

# Run from the repository root so the relative data path below resolves
base_path = os.path.abspath("../")
os.chdir(base_path)

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

import causalml
from causalml.inference.iv import IVRegressor
We first build semi-synthetic data on top of the real NLSYM (Card) covariates. The data generation mechanism is described in Syrgkanis et al., "Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments" (2019).
df = pd.read_csv("examples/data/card.csv")
df.head()
|   | id | nearc2 | nearc4 | educ | age | fatheduc | motheduc | weight | momdad14 | sinmom14 | ... | smsa66 | wage | enroll | kww | iq | married | libcrd14 | exper | lwage | expersq |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 0 | 0 | 7 | 29 | NaN | NaN | 158413 | 1 | 0 | ... | 1 | 548 | 0 | 15.0 | NaN | 1.0 | 0.0 | 16 | 6.306275 | 256 |
| 1 | 3 | 0 | 0 | 12 | 27 | 8.0 | 8.0 | 380166 | 1 | 0 | ... | 1 | 481 | 0 | 35.0 | 93.0 | 1.0 | 1.0 | 9 | 6.175867 | 81 |
| 2 | 4 | 0 | 0 | 12 | 34 | 14.0 | 12.0 | 367470 | 1 | 0 | ... | 1 | 721 | 0 | 42.0 | 103.0 | 1.0 | 1.0 | 16 | 6.580639 | 256 |
| 3 | 5 | 1 | 1 | 11 | 27 | 11.0 | 12.0 | 380166 | 1 | 0 | ... | 1 | 250 | 0 | 25.0 | 88.0 | 1.0 | 1.0 | 10 | 5.521461 | 100 |
| 4 | 6 | 1 | 1 | 12 | 34 | 8.0 | 7.0 | 367470 | 1 | 0 | ... | 1 | 729 | 0 | 34.0 | 108.0 | 1.0 | 0.0 | 16 | 6.591674 | 256 |

5 rows × 34 columns
df.columns.values
array(['id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc', 'motheduc', 'weight', 'momdad14', 'sinmom14', 'step14', 'reg661', 'reg662', 'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668', 'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66', 'wage', 'enroll', 'kww', 'iq', 'married', 'libcrd14', 'exper', 'lwage', 'expersq'], dtype=object)
# Keep observations with at least 6 years of education
data_filter = df['educ'] >= 6
# outcome variable: log wage
y = df[data_filter]['lwage'].values
# treatment variable: years of schooling
treatment = df[data_filter]['educ'].values
# instrumental variable: grew up near a 4-year college
w = df[data_filter]['nearc4'].values
# .copy() avoids pandas' SettingWithCopyWarning in the imputation below
Xdf = df[data_filter][['fatheduc', 'motheduc', 'momdad14', 'sinmom14', 'reg661', 'reg662',
                       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
                       'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66',
                       'exper', 'expersq']].copy()
# Impute missing parental education with the column mean
Xdf['fatheduc'] = Xdf['fatheduc'].fillna(Xdf['fatheduc'].mean())
Xdf['motheduc'] = Xdf['motheduc'].fillna(Xdf['motheduc'].mean())
# Standardize the continuous covariates; the remaining columns are binary indicators
Xscale = Xdf.copy()
Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']] = StandardScaler().fit_transform(
    Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']])
X = Xscale.values
Xscale.describe()
|       | fatheduc | motheduc | momdad14 | sinmom14 | reg661 | reg662 | reg663 | reg664 | reg665 | reg666 | reg667 | reg668 | reg669 | south66 | black | smsa | south | smsa66 | exper | expersq |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.991000e+03 | 2.991000e+03 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2.991000e+03 | 2.991000e+03 |
| mean | -3.529069e-16 | -1.704346e-15 | 0.790371 | 0.100301 | 0.046807 | 0.161484 | 0.196924 | 0.064527 | 0.205951 | 0.094952 | 0.109997 | 0.028419 | 0.090939 | 0.410899 | 0.231361 | 0.715145 | 0.400201 | 0.651622 | 4.285921e-16 | 3.040029e-17 |
| std | 1.000167e+00 | 1.000167e+00 | 0.407112 | 0.300451 | 0.211261 | 0.368039 | 0.397741 | 0.245730 | 0.404463 | 0.293197 | 0.312938 | 0.166193 | 0.287571 | 0.492079 | 0.421773 | 0.451421 | 0.490021 | 0.476536 | 1.000167e+00 | 1.000167e+00 |
| min | -3.101056e+00 | -3.502453e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -2.159127e+00 | -1.147691e+00 |
| 25% | -6.303764e-01 | -4.656485e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -6.858865e-01 | -7.077287e-01 |
| 50% | 0.000000e+00 | 2.091970e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | -1.948066e-01 | -3.655360e-01 |
| 75% | 6.049634e-01 | 5.466197e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 5.418134e-01 | 3.310707e-01 |
| max | 2.457973e+00 | 2.571156e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.242753e+00 | 4.767355e+00 |
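The `semi_synth_nlsym` function below keeps the real covariates but simulates treatment and outcome. In our notation (a paraphrase of the code, with $x_m$ = motheduc, $x_s$ = sinmom14, $z$ = nearc4):

$$\nu \sim U(0,1), \quad c_0 \sim U(0.2, 0.3), \quad T = c_0\, x_m z + x_m + \nu,$$

$$\theta(x) = 0.1 + 0.05\, x_m - 0.1\, x_s, \qquad y = \theta(x)\,(T + \nu) + 0.05\, x_m + \varepsilon, \quad \varepsilon \sim N(0, 0.1^2).$$

Because $\nu$ enters both $T$ and $y$, it acts as an unobserved confounder, while the instrument $z$ shifts $T$ only through the compliance term $c_0 x_m z$.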
def semi_synth_nlsym(X, w, random_seed=None):
    """Simulate treatment and outcome on top of the real NLSYM covariates.

    X[:, 1] is mother's education (motheduc), X[:, 3] is the single-mom
    indicator (sinmom14), and w is the instrument (nearc4).
    """
    np.random.seed(random_seed)
    nobs = X.shape[0]
    nv = np.random.uniform(0, 1, size=nobs)  # unobserved confounder
    c0 = np.random.uniform(0.2, 0.3)
    C = c0 * X[:, 1]
    # Treatment compliance depends on mother's education
    treatment = C * w + X[:, 1] + nv
    # Treatment effect depends on mother's education and single-mom family at age 14
    theta = 0.1 + 0.05 * X[:, 1] - 0.1 * X[:, 3]
    # Additional direct effect of mother's education on the outcome
    f = 0.05 * X[:, 1]
    y = theta * (treatment + nv) + f + np.random.normal(0, 0.1, size=nobs)
    return y, treatment, theta
y_sim, treatment_sim, theta = semi_synth_nlsym(Xdf.values, w)
# True average treatment effect
theta.mean()
0.6089706667314586
# 2SLS estimate
iv_fit = IVRegressor()
iv_fit.fit(X, treatment_sim, y_sim, w)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
(0.6611532131769402, 0.013922622951893662)
# OLS estimate (biased, since the treatment is endogenous)
ols_fit = sm.OLS(y_sim, sm.add_constant(np.c_[treatment_sim, X], prepend=False)).fit()
# With prepend=False the constant is appended last, so params[0] is the treatment coefficient
(ols_fit.params[0], ols_fit.bse[0])
(0.7501211540497275, 0.012800163754977008)
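The 2SLS estimate (≈0.66) is close to the true average effect (≈0.61), while the OLS estimate (≈0.75) is biased upward because the compliance noise `nv` affects both treatment and outcome.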
We now turn to fully synthetic data. The data generation mechanism is described in Hong et al., "Semiparametric Efficiency in Nonlinear LATE Models" (2010).
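Concretely, the `synthetic_data` function below (our paraphrase of the code) draws $x \sim U(0,1)$, $v \sim N(0,1)$, and a binary instrument $z \sim \mathrm{Bernoulli}(\Phi(x))$, and sets the potential treatments

$$d_1 = \mathbf{1}\{\gamma_0 + \gamma_1 x + \delta + v \ge 0\}, \qquad d_0 = \mathbf{1}\{\gamma_0 + \gamma_1 x + v \ge 0\},$$

with Poisson potential outcomes $y_1, y_0$ that share an extra count component for always-takers ($d_1 = d_0 = 1$) and never-takers ($d_1 = d_0 = 0$). The observed data are $d = d_1 z + d_0(1-z)$ and $y = y_1 d + y_0(1-d)$, and the true LATE is $E[y_1 - y_0 \mid d_1 > d_0]$, the average effect among compliers.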
def synthetic_data(n=10000, random_seed=None):
    """Generate data from the nonlinear LATE model of Hong et al. (2010)."""
    np.random.seed(random_seed)
    # Latent treatment choices under z=1 (d1) and z=0 (d0)
    gamma0 = -0.5
    gamma1 = 1.0
    delta = 1.0
    x = np.random.uniform(size=n)
    v = np.random.normal(size=n)
    d1 = (gamma0 + x * gamma1 + delta + v >= 0).astype(float)
    d0 = (gamma0 + x * gamma1 + v >= 0).astype(float)
    # Poisson potential outcomes; always-takers (d1=d0=1) and
    # never-takers (d1=d0=0) receive an extra common component
    alpha = 1.0
    beta = 0.5
    lambda11 = 2.0
    lambda00 = 1.0
    xi1 = np.random.poisson(np.exp(alpha + x * beta))
    xi2 = np.random.poisson(np.exp(x * beta))
    xi3 = np.random.poisson(np.exp(lambda11), size=n)
    xi4 = np.random.poisson(np.exp(lambda00), size=n)
    y1 = xi1 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    y0 = xi2 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    # Binary instrument, observed treatment, and observed outcome
    z = np.random.binomial(1, stats.norm.cdf(x))
    d = d1 * z + d0 * (1 - z)
    y = y1 * d + y0 * (1 - d)
    # Also return the true LATE among compliers (d1 > d0)
    return y, x, d, z, y1[d1 > d0].mean() - y0[d1 > d0].mean()
y, x, d, z, late = synthetic_data()
# True LATE (average effect among compliers)
late
2.1789099526066353
# 2SLS estimate
iv_fit = IVRegressor()
iv_fit.fit(x, d, y, z)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
(2.1900472390231775, 0.2623695460540134)
# OLS estimate (biased by self-selection into treatment)
ols_fit = sm.OLS(y, sm.add_constant(np.c_[d, x], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
(5.3482879532439975, 0.09201397327077365)
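Here the contrast is stark: 2SLS (≈2.19) recovers the true LATE (≈2.18) almost exactly, while OLS (≈5.35) is severely biased because individuals self-select into treatment through the same latent index that shifts their outcome levels.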