In [1]:

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt

import pandas as pd

In [2]:

churn_data = pd.read_csv('https://raw.githubusercontent.com/'
                         'treselle-systems/customer_churn_analysis/'
                         'master/WA_Fn-UseC_-Telco-Customer-Churn.csv')
churn_data.head()

Out[2]:

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	...	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	Yes	No	1	No	No phone service	DSL	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	No	No	34	Yes	No	DSL	Yes	...	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	Male	No	No	2	Yes	No	DSL	Yes	...	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	7795-CFOCW	Male	No	No	45	No	No phone service	DSL	Yes	...	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	9237-HQITU	Female	No	No	2	Yes	No	Fiber optic	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes

5 rows × 21 columns

In [3]:

churn_data = churn_data.set_index('customerID')
churn_data = churn_data.drop(['TotalCharges'], axis=1)

# The dataset is naturally hierarchical: some columns only apply to some users. Ex, if you don't have internet 
# then the column OnlineBackup isn't applicable, as it's value is "No internet service". We 
# are going to map this back to No. We will treat the hierachical nature by stratifying on the 
# different services a user may have. 
churn_data = churn_data.applymap(lambda x: "No" if str(x).startswith("No ") else x)
churn_data['Churn'] = (churn_data['Churn'] == "Yes")
strata_cols = ['InternetService', 'StreamingMovies', 'StreamingTV', 'PhoneService']
print(churn_data.columns)

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'Churn'],
      dtype='object')

In [4]:

from lifelines import CoxPHFitter

cph = CoxPHFitter().fit(churn_data, 'tenure', 'Churn', 
                        formula="gender + SeniorCitizen + Partner + Dependents  + MultipleLines + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges",
                        strata=strata_cols)

In [5]:

cph

Out[5]:

<lifelines.CoxPHFitter: fitted with 7043 total observations, 5174 right-censored observations>

In [6]:

cph.print_summary()

model	lifelines.CoxPHFitter
duration col	'tenure'
event col	'Churn'
strata	[InternetService, StreamingMovies, StreamingTV...
baseline estimation	breslow
number of observations	7043
number of events observed	1869
partial log-likelihood	-10106.05
time fit was run	2020-08-21 13:31:13 UTC

	coef	exp(coef)	se(coef)	coef lower 95%	coef upper 95%	exp(coef) lower 95%	exp(coef) upper 95%	z	p	-log2(p)
gender[T.Male]	-0.09	0.91	0.05	-0.18	-0.00	0.83	1.00	-1.98	0.05	4.40
Partner[T.Yes]	-0.53	0.59	0.06	-0.64	-0.42	0.53	0.66	-9.57	<0.005	69.69
Dependents[T.Yes]	-0.05	0.95	0.07	-0.18	0.09	0.83	1.09	-0.67	0.50	0.99
MultipleLines[T.Yes]	-0.44	0.64	0.13	-0.69	-0.19	0.50	0.83	-3.47	<0.005	10.89
OnlineSecurity[T.Yes]	-0.62	0.54	0.13	-0.88	-0.36	0.41	0.70	-4.65	<0.005	18.20
OnlineBackup[T.Yes]	-0.65	0.52	0.13	-0.90	-0.40	0.41	0.67	-5.09	<0.005	21.39
DeviceProtection[T.Yes]	-0.32	0.73	0.13	-0.56	-0.07	0.57	0.93	-2.51	0.01	6.39
TechSupport[T.Yes]	-0.40	0.67	0.13	-0.66	-0.14	0.52	0.87	-2.99	<0.005	8.49
Contract[T.One year]	-1.59	0.20	0.09	-1.77	-1.42	0.17	0.24	-17.82	<0.005	233.65
Contract[T.Two year]	-3.11	0.04	0.17	-3.45	-2.77	0.03	0.06	-18.00	<0.005	238.16
PaperlessBilling[T.Yes]	0.19	1.21	0.06	0.08	0.30	1.08	1.35	3.29	<0.005	9.97
PaymentMethod[T.Credit card (automatic)]	-0.11	0.90	0.09	-0.29	0.07	0.75	1.07	-1.18	0.24	2.06
PaymentMethod[T.Electronic check]	0.56	1.76	0.07	0.42	0.70	1.53	2.02	7.88	<0.005	48.06
PaymentMethod[T.Mailed check]	0.51	1.66	0.09	0.33	0.68	1.39	1.98	5.65	<0.005	25.85
SeniorCitizen	-0.06	0.94	0.06	-0.17	0.05	0.84	1.06	-1.00	0.32	1.67
MonthlyCharges	-0.01	0.99	0.02	-0.05	0.04	0.95	1.04	-0.24	0.81	0.31

Concordance	0.83
Partial AIC	20244.10
log-likelihood ratio test	2614.83 on 16 df
-log2(p) of ll-ratio test	inf

In [7]:

ax = plt.subplots(figsize=(8, 6))
cph.plot(ax=ax[1])

Out[7]:

<AxesSubplot:xlabel='log(HR) (95% CI)'>

In [8]:

cph.plot_partial_effects_on_outcome('Contract', values=["Month-to-month", "One year", "Two year"], plot_baseline=False);

In [ ]: