%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
import pandas as pd
# Location of the Telco customer-churn CSV, kept in a name so the read call stays short.
churn_csv_url = (
    'https://raw.githubusercontent.com/'
    'treselle-systems/customer_churn_analysis/'
    'master/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)
churn_data = pd.read_csv(churn_csv_url)
# Preview the first rows to check the columns that were loaded.
churn_data.head()
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
# Label rows by customer ID and remove the TotalCharges column in one chained step.
churn_data = churn_data.set_index('customerID').drop(columns=['TotalCharges'])
# The dataset is naturally hierarchical: some columns only apply to some users. E.g., if you don't have internet
# then the column OnlineBackup isn't applicable, as its value is "No internet service". We
# are going to map this back to No. We will treat the hierarchical nature by stratifying on the
# different services a user may have.
# Collapse placeholder values such as "No internet service" / "No phone service"
# into a plain "No". Values that do not start with "No " (including numbers) pass
# through unchanged.
# Column-wise Series.map is used instead of DataFrame.applymap, which is deprecated
# in pandas >= 2.1; the element-wise result is the same.
churn_data = churn_data.apply(
    lambda column: column.map(
        lambda value: "No" if str(value).startswith("No ") else value
    )
)

# Columns used to stratify the model rather than being one-hot encoded.
strata_cols = ['InternetService', 'StreamingMovies', 'StreamingTV', 'PhoneService']

# One-hot encode every column except the strata columns, 'tenure' and
# 'MonthlyCharges'; drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(
    churn_data,
    columns=churn_data.columns.difference(strata_cols + ['tenure', 'MonthlyCharges']),
    drop_first=True,
)
from lifelines import CoxPHFitter

# Fit a Cox model stratified on the service columns: 'tenure' is the duration
# column and 'Churn_Yes' is the event indicator.
cph = CoxPHFitter()
cph.fit(df, duration_col='tenure', event_col='Churn_Yes', strata=strata_cols)
# Leave the fitted model as the cell's last expression so its repr is shown.
cph
<lifelines.CoxPHFitter: fitted with 7043 observations, 5174 censored>
# Print the fitted model's summary: fit metadata followed by the per-covariate
# coefficient table (coef, exp(coef), standard error, z, p, confidence bounds).
cph.print_summary()
<lifelines.CoxPHFitter: fitted with 7043 observations, 5174 censored> duration col = 'tenure' event col = 'Churn_Yes' strata = ['InternetService', 'StreamingMovies', 'StreamingTV', 'PhoneService'] number of subjects = 7043 number of events = 1869 log-likelihood = -10106.05 time fit was run = 2019-05-01 18:48:41 UTC --- coef exp(coef) se(coef) z p -log2(p) lower 0.95 upper 0.95 3 MonthlyCharges -0.01 0.99 0.02 -0.24 0.81 0.31 -0.05 0.04 Contract_One year -1.59 0.20 0.09 -17.82 <0.005 233.65 -1.77 -1.42 Contract_Two year -3.11 0.04 0.17 -18.00 <0.005 238.16 -3.45 -2.77 Dependents_Yes -0.05 0.95 0.07 -0.67 0.50 0.99 -0.18 0.09 DeviceProtection_Yes -0.32 0.73 0.13 -2.51 0.01 6.39 -0.56 -0.07 MultipleLines_Yes -0.44 0.64 0.13 -3.47 <0.005 10.89 -0.69 -0.19 OnlineBackup_Yes -0.65 0.52 0.13 -5.09 <0.005 21.39 -0.90 -0.40 OnlineSecurity_Yes -0.62 0.54 0.13 -4.65 <0.005 18.20 -0.88 -0.36 PaperlessBilling_Yes 0.19 1.21 0.06 3.29 <0.005 9.97 0.08 0.30 Partner_Yes -0.53 0.59 0.06 -9.57 <0.005 69.69 -0.64 -0.42 PaymentMethod_Credit card (automatic) -0.11 0.90 0.09 -1.18 0.24 2.06 -0.29 0.07 PaymentMethod_Electronic check 0.56 1.76 0.07 7.88 <0.005 48.06 0.42 0.70 PaymentMethod_Mailed check 0.51 1.66 0.09 5.65 <0.005 25.85 0.33 0.68 SeniorCitizen_1 -0.06 0.94 0.06 -1.00 0.32 1.67 -0.17 0.05 TechSupport_Yes -0.40 0.67 0.13 -2.99 <0.005 8.49 -0.66 -0.14 gender_Male -0.09 0.91 0.05 -1.98 0.05 4.40 -0.18 -0.00 --- Concordance = 0.83 Log-likelihood ratio test = 2614.83 on 16 df, -log2(p)=inf
# plt.subplots returns a (figure, axes) pair. Unpack it so the Axes is passed by
# name instead of through an opaque [1] index on a tuple called `ax`.
fig, ax = plt.subplots(figsize=(8, 6))
# Draw the fitted model's plot onto the 8x6 axes created above.
cph.plot(ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x11ce2dac8>
# Plot the model's output with the 'Contract_Two year' covariate set to 0 and to 1
# (presumably predicted survival curves for each value — confirm against lifelines docs).
# NOTE(review): newer lifelines releases appear to rename this method to
# plot_partial_effects_on_outcome — confirm against the installed version.
# The trailing semicolon suppresses the notebook's echo of the return value.
cph.plot_covariate_groups('Contract_Two year', values=[0, 1]);