%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
from datetime import date, datetime
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
plt.style.use('seaborn-deep')
Quitting is death, all else is censoring. This is different than the original article's author's rules, who stated that switching roles within a cabinent is an "event".
raw_df = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/cabinet-turnover/cabinet-turnover.csv",
na_values=['Still in office', '#VALUE!']
)
TODAY = datetime.today().date()
INAUG_DATES = {
'Trump': date(2017, 1, 20),
'Obama': date(2009, 1, 20),
'Bush 43': date(2001, 1, 20),
'Clinton': date(1993, 1, 20),
'Bush 41': date(1989, 1, 20),
'Reagan': date(1981, 1, 20),
'Carter': date(1977, 1, 20)
}
presidential_terms = pd.DataFrame(list(INAUG_DATES.items()))
presidential_terms.columns = ['president', 'president_start_date']
presidential_terms['president_end_date'] = presidential_terms['president_start_date'].shift(1).fillna(TODAY)
presidential_terms
president | president_start_date | president_end_date | |
---|---|---|---|
0 | Trump | 2017-01-20 | 2020-07-26 |
1 | Obama | 2009-01-20 | 2017-01-20 |
2 | Bush 43 | 2001-01-20 | 2009-01-20 |
3 | Clinton | 1993-01-20 | 2001-01-20 |
4 | Bush 41 | 1989-01-20 | 1993-01-20 |
5 | Reagan | 1981-01-20 | 1989-01-20 |
6 | Carter | 1977-01-20 | 1981-01-20 |
def fill_end(series):
end, president = series
if pd.notnull(end) and end.endswith('admin'):
next_pres ,_ = end.split(' ')
if next_pres == 'Bush':
next_pres = next_pres + ' 43' if president == 'Clinton' else next_pres + ' 41'
return INAUG_DATES[next_pres].strftime('%m/%d/%y')
else:
return end
def fill_start(series):
end, president = series
if pd.notnull(end) and end.endswith('admin'):
prev_pres ,_ = end.split(' ')
if prev_pres == 'Bush':
prev_pres = prev_pres + ' 43' if president == 'Obama' else prev_pres + ' 41'
return INAUG_DATES[president].strftime('%m/%d/%y')
else:
return end
raw_df['end'] = raw_df[['end', 'president']].apply(fill_end, axis=1)
raw_df['start'] = raw_df[['start', 'president']].apply(fill_start, axis=1)
raw_df['end'] = pd.to_datetime(raw_df['end']).dt.date
raw_df['end'] = raw_df['end'].fillna(TODAY)
raw_df['start'] = pd.to_datetime(raw_df['start']).dt.date
raw_df = raw_df.merge(presidential_terms, left_on='president', right_on='president')
raw_df['event'] = (raw_df['end'] < raw_df['president_end_date']) & pd.notnull(raw_df['end'])
# we need to "collapse" individuals into rows, because they may change positions, but that's not quitting...
def collapse(df):
return df.groupby('appointee', as_index=False).aggregate({
'start': 'min', 'end': 'max', 'event': 'all', 'president': 'last', 'president_end_date': 'last'
})
raw_df = raw_df.groupby('president', as_index=False).apply(collapse).reset_index(drop=True)
raw_df['T'] = (raw_df['end'] - raw_df['start']).dt.days
raw_df.tail(20)
appointee | start | end | event | president | president_end_date | T | |
---|---|---|---|---|---|---|---|
267 | Jeff Sessions | 2017-02-09 | 2018-11-07 | True | Trump | 2020-07-26 | 636 |
268 | Jim Mattis | 2017-01-20 | 2018-12-31 | True | Trump | 2020-07-26 | 710 |
269 | John Kelly | 2017-01-20 | 2018-12-31 | True | Trump | 2020-07-26 | 710 |
270 | Kirstjen Nielsen | 2017-12-06 | 2020-07-26 | False | Trump | 2020-07-26 | 963 |
271 | Linda McMahon | 2017-02-14 | 2020-07-26 | False | Trump | 2020-07-26 | 1258 |
272 | Mick Mulvaney | 2017-02-16 | 2020-07-26 | False | Trump | 2020-07-26 | 1256 |
273 | Mike Pence | 2017-01-20 | 2020-07-26 | False | Trump | 2020-07-26 | 1283 |
274 | Mike Pompeo | 2017-01-23 | 2020-07-26 | False | Trump | 2020-07-26 | 1280 |
275 | Nikki Haley | 2017-01-27 | 2018-12-31 | True | Trump | 2020-07-26 | 703 |
276 | Reince Priebus | 2017-01-20 | 2017-07-28 | True | Trump | 2020-07-26 | 189 |
277 | Rex Tillerson | 2017-02-01 | 2018-03-31 | True | Trump | 2020-07-26 | 423 |
278 | Rick Perry | 2017-03-02 | 2020-07-26 | False | Trump | 2020-07-26 | 1242 |
279 | Robert Lighthizer | 2017-05-15 | 2020-07-26 | False | Trump | 2020-07-26 | 1168 |
280 | Robert Wilkie | 2018-07-30 | 2020-07-26 | False | Trump | 2020-07-26 | 727 |
281 | Ryan Zinke | 2017-03-01 | 2019-01-02 | True | Trump | 2020-07-26 | 672 |
282 | Scott Pruitt | 2017-02-17 | 2018-07-06 | True | Trump | 2020-07-26 | 504 |
283 | Sonny Perdue | 2017-04-25 | 2020-07-26 | False | Trump | 2020-07-26 | 1188 |
284 | Steve Mnuchin | 2017-02-13 | 2020-07-26 | False | Trump | 2020-07-26 | 1259 |
285 | Tom Price | 2017-02-10 | 2017-09-29 | True | Trump | 2020-07-26 | 231 |
286 | Wilbur Ross | 2017-02-28 | 2020-07-26 | False | Trump | 2020-07-26 | 1244 |
naf = NelsonAalenFitter()
ax = naf.fit(raw_df['T'],raw_df['event']).plot()
from lifelines import PiecewiseExponentialFitter
pf = PiecewiseExponentialFitter(breakpoints=[1440, 1500])
pf.fit(raw_df['T'], raw_df['event'])
pf.plot(ax=ax)
pf.print_summary(4)
model | lifelines.PiecewiseExponentialFitter |
---|---|
number of observations | 287 |
number of events observed | 158 |
log-likelihood | -1313.4569 |
hypothesis | lambda_0_ != 1, lambda_1_ != 1, lambda_2_ != 1 |
coef | se(coef) | coef lower 95% | coef upper 95% | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|
lambda_0_ | 3067.6037 | 310.1869 | 2459.6485 | 3675.5588 | 9.8863 | <5e-05 | 74.1495 |
lambda_1_ | 153.3781 | 29.9346 | 94.7075 | 212.0488 | 5.0904 | <5e-05 | 21.4161 |
lambda_2_ | 1038.9781 | 168.4720 | 708.7791 | 1369.1771 | 6.1611 | <5e-05 | 30.3667 |
AIC | 2632.9137 |
---|
kmf = KaplanMeierFitter()
ax = plt.subplot()
for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
kmf.fit(df_['T'], df_['event'], label=name)
ax = kmf.plot(ax=ax, ci_show=False)
ax = plt.subplot()
for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
kmf.fit(df_['T'], df_['event'], label=name)
if name == 'Trump':
ax = kmf.plot(ax=ax, color='r')
else:
ax = kmf.plot(ax=ax, color='grey', alpha=0.5, ci_show=False)
raw_df[['president','event', 'T']]
naf = NelsonAalenFitter()
ax = plt.subplot()
for name, df_ in raw_df[['president','event', 'T']].groupby('president'):
if name in ['Trump', 'Carter']:
naf.fit(df_['T'], df_['event'], label=name)
ax = naf.plot(ax=ax)
raw_df['year'] = raw_df['start'].apply(lambda d: int(d.year))
regression_df = raw_df[['president', 'T', 'event', 'year']]
cph = CoxPHFitter()
cph.fit(regression_df, 'T', 'event', formula="president + bs(year, df=3)")
cph.print_summary(3)
--------------------------------------------------------------------------- LinAlgError Traceback (most recent call last) ~/code/lifelines/lifelines/fitters/coxph_fitter.py in _newton_rhapson_for_efron_model(self, X, T, E, weights, entries, initial_point, step_size, precision, show_progress, max_steps) 988 try: --> 989 inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False) 990 except (ValueError, LinAlgError) as e: ~/venvs/data/lib/python3.7/site-packages/scipy/linalg/basic.py in solve(a, b, sym_pos, lower, overwrite_a, overwrite_b, debug, check_finite, assume_a, transposed) 247 overwrite_b=overwrite_b) --> 248 _solve_check(n, info) 249 rcond, info = pocon(lu, anorm) ~/venvs/data/lib/python3.7/site-packages/scipy/linalg/basic.py in _solve_check(n, info, lamch, rcond) 28 elif 0 < info: ---> 29 raise LinAlgError('Matrix is singular.') 30 LinAlgError: Matrix is singular. During handling of the above exception, another exception occurred: ConvergenceError Traceback (most recent call last) <ipython-input-15-f5c8742b8449> in <module> 1 cph = CoxPHFitter() ----> 2 cph.fit(regression_df, 'T', 'event', formula="president + bs(year, df=3)") 3 cph.print_summary(3) ~/code/lifelines/lifelines/utils/__init__.py in f(model, *args, **kwargs) 52 def f(model, *args, **kwargs): 53 cls.set_censoring_type(model, cls.RIGHT) ---> 54 return function(model, *args, **kwargs) 55 56 return f ~/code/lifelines/lifelines/fitters/coxph_fitter.py in fit(self, df, duration_col, event_col, show_progress, initial_point, strata, step_size, weights_col, cluster_col, robust, batch_mode, timeline, formula, entry_col) 284 timeline=timeline, 285 formula=formula, --> 286 entry_col=entry_col, 287 ) 288 return self ~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model(self, *args, **kwargs) 303 def _fit_model(self, *args, **kwargs): 304 if self.baseline_estimation_method == "breslow": --> 305 return self._fit_model_breslow(*args, **kwargs) 306 elif self.baseline_estimation_method == "spline": 307 return self._fit_model_spline(*args, **kwargs) ~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model_breslow(self, *args, **kwargs) 313 penalizer=self.penalizer, l1_ratio=self.l1_ratio, strata=self.strata, alpha=self.alpha, label=self._label 314 ) --> 315 model.fit(*args, **kwargs) 316 return model 317 ~/code/lifelines/lifelines/utils/__init__.py in f(model, *args, **kwargs) 52 def f(model, *args, **kwargs): 53 cls.set_censoring_type(model, cls.RIGHT) ---> 54 return function(model, *args, **kwargs) 55 56 return f ~/code/lifelines/lifelines/fitters/coxph_fitter.py in fit(self, df, duration_col, event_col, show_progress, initial_point, strata, step_size, weights_col, cluster_col, robust, batch_mode, timeline, formula, entry_col) 726 initial_point=initial_point, 727 show_progress=show_progress, --> 728 step_size=step_size, 729 ) 730 ~/code/lifelines/lifelines/fitters/coxph_fitter.py in _fit_model(self, X, T, E, weights, entries, initial_point, step_size, show_progress) 845 ): 846 beta_, ll_, hessian_ = self._newton_rhapson_for_efron_model( --> 847 X, T, E, weights, entries, initial_point=initial_point, step_size=step_size, show_progress=show_progress 848 ) 849 ~/code/lifelines/lifelines/fitters/coxph_fitter.py in _newton_rhapson_for_efron_model(self, X, T, E, weights, entries, initial_point, step_size, precision, show_progress, max_steps) 1000 CONVERGENCE_DOCS 1001 ), -> 1002 e, 1003 ) 1004 else: ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.
cph.check_assumptions(regression_df)
cph.plot_covariate_groups("year", values=np.linspace(1977, 2016, 5), y="cumulative_hazard")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-13-9b2470913cd3> in <module> ----> 1 cph.plot_covariate_groups("year", values=np.linspace(1977, 2016, 5), y="cumulative_hazard") ~/code/lifelines/lifelines/fitters/coxph_fitter.py in __getattr__(self, attr) 292 raise AttributeError("Must call `fit` first.") 293 --> 294 if hasattr(self._model, attr): 295 return getattr(self._model, attr) 296 ~/code/lifelines/lifelines/fitters/coxph_fitter.py in __getattr__(self, attr) 290 def __getattr__(self, attr): 291 if attr == "_model": --> 292 raise AttributeError("Must call `fit` first.") 293 294 if hasattr(self._model, attr): AttributeError: Must call `fit` first.
from lifelines import *
wf = WeibullAFTFitter(penalizer=0.0)
wf.fit(regression_df, 'T', 'event')
wf.print_summary(3)
model | lifelines.WeibullAFTFitter |
---|---|
duration col | 'T' |
event col | 'event' |
number of observations | 287 |
number of events observed | 158 |
log-likelihood | -1331.256 |
time fit was run | 2020-07-12 23:59:57 UTC |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | z | p | -log2(p) | ||
---|---|---|---|---|---|---|---|---|---|---|---|
param | covariate | ||||||||||
lambda_ | Intercept | 46.049 | 9.973e+19 | 54.083 | -59.951 | 152.049 | 0.000 | 1.081e+66 | 0.851 | 0.395 | 1.342 |
president[T.Bush 43] | 0.369 | 1.446 | 0.399 | -0.413 | 1.151 | 0.662 | 3.161 | 0.925 | 0.355 | 1.494 | |
president[T.Carter] | -0.316 | 0.729 | 0.375 | -1.051 | 0.419 | 0.350 | 1.521 | -0.842 | 0.400 | 1.322 | |
president[T.Clinton] | 0.326 | 1.385 | 0.213 | -0.092 | 0.744 | 0.912 | 2.104 | 1.527 | 0.127 | 2.980 | |
president[T.Obama] | 0.685 | 1.985 | 0.599 | -0.488 | 1.859 | 0.614 | 6.418 | 1.145 | 0.252 | 1.986 | |
president[T.Reagan] | 0.077 | 1.080 | 0.251 | -0.415 | 0.569 | 0.661 | 1.766 | 0.307 | 0.759 | 0.398 | |
president[T.Trump] | 0.605 | 1.831 | 0.784 | -0.931 | 2.141 | 0.394 | 8.505 | 0.772 | 0.440 | 1.184 | |
year | -0.019 | 0.981 | 0.027 | -0.073 | 0.034 | 0.930 | 1.034 | -0.715 | 0.474 | 1.076 | |
rho_ | Intercept | 0.669 | 1.952 | 0.068 | 0.535 | 0.803 | 1.707 | 2.232 | 9.782 | <0.0005 | 72.648 |
Concordance | 0.570 |
---|---|
AIC | 2680.513 |
log-likelihood ratio test | 6.980 on 7 df |
-log2(p) of ll-ratio test | 1.214 |
lnf = LogNormalAFTFitter(penalizer=0.0000)
lnf.fit(regression_df, 'T', 'event')
lnf.print_summary(3)
model | lifelines.LogNormalAFTFitter |
---|---|
duration col | 'T' |
event col | 'event' |
number of observations | 287 |
number of events observed | 158 |
log-likelihood | -1338.519 |
time fit was run | 2020-07-12 23:59:58 UTC |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | z | p | -log2(p) | ||
---|---|---|---|---|---|---|---|---|---|---|---|
param | covariate | ||||||||||
mu_ | Intercept | 57.104 | 6.308e+24 | 56.572 | -53.774 | 167.982 | 0.000 | 8.988e+72 | 1.009 | 0.313 | 1.677 |
president[T.Bush 43] | 0.503 | 1.654 | 0.432 | -0.343 | 1.350 | 0.709 | 3.856 | 1.165 | 0.244 | 2.035 | |
president[T.Carter] | -0.408 | 0.665 | 0.398 | -1.189 | 0.372 | 0.305 | 1.451 | -1.025 | 0.305 | 1.711 | |
president[T.Clinton] | 0.280 | 1.323 | 0.246 | -0.202 | 0.762 | 0.817 | 2.144 | 1.139 | 0.255 | 1.973 | |
president[T.Obama] | 0.811 | 2.250 | 0.629 | -0.423 | 2.044 | 0.655 | 7.725 | 1.288 | 0.198 | 2.339 | |
president[T.Reagan] | 0.017 | 1.017 | 0.278 | -0.529 | 0.562 | 0.589 | 1.754 | 0.060 | 0.952 | 0.071 | |
president[T.Trump] | 0.684 | 1.982 | 0.815 | -0.913 | 2.282 | 0.401 | 9.798 | 0.839 | 0.401 | 1.318 | |
year | -0.025 | 0.975 | 0.028 | -0.081 | 0.031 | 0.922 | 1.031 | -0.883 | 0.377 | 1.407 | |
sigma_ | Intercept | -0.293 | 0.746 | 0.060 | -0.411 | -0.175 | 0.663 | 0.839 | -4.869 | <0.0005 | 19.768 |
Concordance | 0.587 |
---|---|
AIC | 2695.039 |
log-likelihood ratio test | 6.228 on 7 df |
-log2(p) of ll-ratio test | 0.962 |
llf = LogLogisticAFTFitter(penalizer=0.000)
llf.fit(regression_df, 'T', 'event')
llf.print_summary(3)
model | lifelines.LogLogisticAFTFitter |
---|---|
duration col | 'T' |
event col | 'event' |
number of observations | 287 |
number of events observed | 158 |
log-likelihood | -1333.497 |
time fit was run | 2020-07-12 23:59:58 UTC |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | z | p | -log2(p) | ||
---|---|---|---|---|---|---|---|---|---|---|---|
param | covariate | ||||||||||
alpha_ | Intercept | 7.282 | 1453.611 | 56.981 | -104.399 | 118.963 | 0.000 | 4.622e+51 | 0.128 | 0.898 | 0.155 |
president[T.Bush 43] | 0.103 | 1.108 | 0.425 | -0.731 | 0.937 | 0.481 | 2.552 | 0.242 | 0.809 | 0.306 | |
president[T.Carter] | -0.166 | 0.847 | 0.395 | -0.940 | 0.608 | 0.390 | 1.837 | -0.421 | 0.674 | 0.569 | |
president[T.Clinton] | 0.110 | 1.117 | 0.238 | -0.356 | 0.577 | 0.700 | 1.781 | 0.464 | 0.643 | 0.638 | |
president[T.Obama] | 0.250 | 1.285 | 0.630 | -0.985 | 1.486 | 0.373 | 4.420 | 0.397 | 0.691 | 0.533 | |
president[T.Reagan] | 0.149 | 1.161 | 0.265 | -0.371 | 0.669 | 0.690 | 1.953 | 0.562 | 0.574 | 0.801 | |
president[T.Trump] | -0.034 | 0.966 | 0.826 | -1.653 | 1.585 | 0.191 | 4.878 | -0.041 | 0.967 | 0.048 | |
year | -0.000 | 1.000 | 0.029 | -0.056 | 0.056 | 0.945 | 1.058 | -0.002 | 0.999 | 0.002 | |
beta_ | Intercept | 0.892 | 2.440 | 0.069 | 0.757 | 1.027 | 2.131 | 2.794 | 12.909 | <0.0005 | 124.239 |
Concordance | 0.581 |
---|---|
AIC | 2684.993 |
log-likelihood ratio test | 6.110 on 7 df |
-log2(p) of ll-ratio test | 0.924 |