#!/usr/bin/env python # coding: utf-8 # In[1]: from IPython.display import Image Image('../../Python_probability_statistics_machine_learning_2E.png',width=200) # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd from matplotlib.pylab import subplots from matplotlib.patches import Circle, Rectangle # ### Survival curves # # The problem is to estimate the length of time units (e.g., # subjects, # individuals, components) exist in a cohort over time. For example, # consider the # following data. The rows are the days in a 30-day period and the # columns are the # individual units. For example, these could be five patients who # all receive a # particular treatment on day 0 and then *survive* (indicated by # `1`) the next 30 # days on not (indicated by `0`) # In[3]: d = pd.DataFrame(index=range(1,8), columns=['A','B','C','D','E' ], data=1) d.loc[3:,'A']=0 d.loc[6:,'B']=0 d.loc[5:,'C']=0 d.loc[4:,'D']=0 d.index.name='day' d # Importantly, survival is a one-way street --- once a subject is *dead*, then # that subject cannot return to the experiment. This is important because survival # analysis is also applied to component-failure or other topics where this fact is # not so obvious. The following chart shows the survival status of each of the # subjects for all seven days. The blue circles indicate that the subject is alive # and the red squares indicate death of the subject. # In[4]: fig,ax =subplots() _=ax.axis((-0.5,6.5,-1,5)) for ii,jj in zip(*np.where(d.values)): _=ax.add_patch(Circle((ii,jj),0.25)) for ii,jj in zip(*np.where(d.values==0)): _=ax.add_patch(Rectangle((ii-0.25,jj-0.25),0.5,0.5,color='red',alpha=.8)) _=ax.set_yticklabels(['','A','B','C','D','E','',''],fontsize=12); _=ax.invert_yaxis() _=ax.set_xlabel('Days',fontsize=16) _=ax.set_ylabel('Subject',fontsize=16) _=ax.set_title('Survival Chart',fontsize=16) _=ax.spines['right'].set_visible(0) _=ax.spines['top'].set_visible(0) ax.set_aspect(1) fig.savefig('fig-statistics/survival_analysis_001a.png') # # # # #
# #

The red squares # indicate a dead subject, and the blue a living subject.

# # # # In[5]: fig,axs=subplots(5,1,sharex=True,sharey=True) fig.set_size_inches((10,6)) for i,j in enumerate(d.columns): _=d[j].plot(ax=axs[i],marker='o') _=axs[i].axis(ymax=1.1,ymin=-.1) _=axs[i].set_ylabel(j) # In[6]: fig,ax=subplots() _=(d.sum(axis=1)/d.shape[1]).plot(ax=ax,marker='o') _=ax.set_ylabel('Survival probability',fontsize=18) _=ax.axis(ymin=0,ymax=1.1) fig.savefig('fig-statistics/survival_analysis_001.png') # # #
# #

The survival probability decreases by # day.

# # # # # # There is another important recursive perspective on this # calculation. Imagine # there is a life raft containing $[A,B,C,D,E]$. Everyone # survives until day two # when \emph{A} dies. This leaves four in the life raft # $[B,C,D,E]$. Thus, from the # perspective of day one, the survival probability is # the probability of surviving # just up until day two and then surviving day two, # $\mathbb{P}_S(t\ge 2) # =\mathbb{P}(t\notin [0,2)\vert # t<2)\mathbb{P}_S(t=2)=(1)(4/5)=4/5$. In words, # this means that surviving past # the second day is the product of surviving the # second day itself and not having # a death up to that point (i.e., surviving up to # that point). Using this # recursive approach, the survival probability for the # third day is # $\mathbb{P}_S(t\ge 3) =\mathbb{P}_S(t> 3)\mathbb{P}_S(t=3)=(4/5)(3/4)=3/5$. # Recall that just before the third day, the # life raft contains $[B,C,D,E]$ and # on the third day we have $[B,C,E]$. Thus, # from the perspective of just before # the third day there are four survivors in # the raft and on the third day there # are three $3/4$. Using this recursive # argument generates the same plot and comes # in handy with censoring. # # ### Censoring and truncation # # Censoring occurs when a # subject leaves (right censoring) or enters (left # censoring) the study. There are # two general types of right censoring. # The so-called Type I right censoring is # when a subject randomly drops # out of the study. # This random drop-out is another # statistical effect that has to be accounted for # in estimating survival. Type II # right censoring occurs when the study is # terminated when enough specific random # events occur. # # Likewise, left censoring occurs when a subject enters the study # prior to a certain # date, but exactly when this happened is unknown. This happens # in study designs # involving two separate studies stages. For example, a subject # might enroll in the # first selection process but be ineligible for the second # process. Specifically, # suppose a study concerns drug use and certain subjects # have used the drug before # the study but are unable to report exactly when. These # subjects are left # censored. Left truncation (a.k.a. staggered entry, delayed # entry) is similar # except the date of entry is known. For example, a subject that # starts taking a drug # after being initially left out of the study. # # Right # censoring is the most common so let's consider an example. Let's estimate # the # survival function given the following survival times in days: # # $$ # \{ 1, 2,3^+,4,5,6^+,7,8 \} # $$ # # where the censored survival times are indicated by the # plus symbol. As before, # the survival time at the $0^{th}$ day is $8/8=1$, the # first day is $7/8$, the # second day = $(7/8)(6/7)$. Now, we come to the first # right censored entry. The # survival time for the third day is $(7/8)(6/7)(5/5) = # (7/8)(6/7)$. Thus, the # subject who dropped out is not considered *dead* and # cannot be counted as such # but is considered just *absent* as far as the # functional estimation of the # probabilities goes. Continuing for the fourth day, # we have # $(7/8)(6/7)(5/5)(4/5)$, the fifth day, $(7/8)(6/7)(5/5)(4/5)(3/4)$, the # sixth # (right censored) day $(7/8)(6/7)(5/5)(4/5)(3/4)(2/2)$, and so on. We can # summarize this in the following table: # # ### Hazard functions and their # properties # # Generally, the *survival function* is a continuous function of time # $S(t) = # \mathbb{P}(T>t)$ where $T$ is the event time (e.g., time of death). Note # that # the cumulative density function, $F(t)=\mathbb{P}(T\le t)=1-S(t)$ and # $f(t)=\frac{dF(t)}{dt}$ is the usual probability density function. The so-called # *hazard function* is the instantaneous rate of failure at time $t$, # # $$ # h(t) = \frac{f(t)}{S(t)} = \lim_{\Delta t \rightarrow 0}\frac{\mathbb{P}(T # \in (t,t+\Delta t]|T\ge t)}{\Delta t} # $$ # # Note that is a continuous-limit version of # the calculation we performed above. # In words, it says given the event time $T\ge # t$ (subject has survived up to # $t$), what is the probability of the event # occurring in the differential # interval $\Delta t$ for a vanishingly small # $\Delta t$. Note that this is not # the usual derivative-slope from calculus # because there is no difference term in # the numerator. The hazard function is # also called the *force of mortality*, # *intensity rate*, or the # *instantaneous risk*. Informally, you can think of the # hazard function as # encapsulating the two issues we are most concerned about: # deaths and the # population at risk for those deaths. Loosely speaking, the # probability density # function in the numerator represents the probability of a # death occurring in a # small differential interval. However, we are not # particularly interested in # unqualified deaths, but only deaths that can happen # to a specific at-risk # population. Returning to our lifeboat analogy, suppose # there are 1000 people in # the lifeboat and the probability of anybody falling off # the lifeboat is 1/1000. # Two things are happening here: (1) the probability of # the bad event is small and # (2) there are a lot of subjects over which to spread # the probability of that bad # event. This means that the hazard rate for any # particular individual is small. # On the other hand, if there are only two # subjects in the life raft and the # probability of falling off is 3/4, then the # hazard rate is high because not only # is the unfortunate event probable, the risk # of that unfortunate event is shared # by only two subjects. # # It is a mathematical # fact that, # # $$ # h(t) = \frac{-d \log S(t)}{dt} # $$ # # This leads to the following interpretation # # $$ # S(t) = \exp\left( -\int_0^t h(u)du\right) := \exp(-H(t)) # $$ # # where $H(t)$ is the *cumulative hazard function*. Note that $H(t)=-\log S(t)$. # Consider a subject whose survival time is 5 years. For this subject to have died # at # the fifth year, it had to be alive during the fourth year. Thus, the *hazard* # at # 5 years is the failure rate per-year, conditioned on the fact that the # subject # survived until the fourth year. Note that this is *not* the same as the # unconditional failure rate per year at the fifth year, because the unconditional # rate applies to all units at time zero and does not use information about # survival up to that point gleaned from the other units. Thus, the *hazard # function* can be thought of as the point-wise unconditional probability of # experiencing the event, scaled by the fraction of survivors up to that point. # ## Example # # To get a sense of this, let's consider the example where the # probability # density function is exponential with parameter $\lambda$, # $f(t)=\lambda # \exp(-t\lambda),\; \forall t>0$. This makes $S(t) = 1- F(t) = # \exp(-t\lambda)$ # and then the hazard function becomes $h(t)=\lambda$, namely a # constant. To see # this, recall that the exponential distribution is the only # continuous # distribution that has no memory: # # $$ # \mathbb{P}(X\le u+t \vert X>u) = 1-\exp(-\lambda t) = \mathbb{P}(X\le t) # $$ # # This means no matter how long we have # been waiting for a death to occur, the # probability of a death from that point # onward is the same - thus the hazard # function is a constant. # # # ### Expectations # # Given all these definitions, it is # an exercise in integration by parts to show # that the expected life remaining is # the following: # # $$ # \mathbb{E}(T) = \int_0^\infty S(u) du # $$ # # This is equivalent to the following: # # $$ # \mathbb{E}(T \big\vert t=0) = \int_0^\infty S(u) du # $$ # # and we can likewise express the expected remaining life at $t$ as the # following, # # $$ # \mathbb{E}(T \big\vert T\ge t) = \frac{\int_t^\infty S(u) du}{S(t)} # $$ # # ### Parametric regression models # # Because we are interested in how study # parameters affect survival, we need a # model that can accommodate regression in # exogenous (independent) variables ($\mathbf{x}$). # # $$ # h(t\vert \mathbf{x})= h_o(t)\exp(\mathbf{x}^T \mathbf{\boldsymbol{\beta}}) # $$ # # where $\boldsymbol{\beta}$ are the regression coefficients and $h_o(t)$ is the # baseline instantaneous hazard function. Because the hazard function is always # nonnegative, the the effects of the covariates enter through the exponential # function. These kinds of models are called *proportional hazard rate models*. # If # the baseline function is a constant ($\lambda$), then this reduces to the # *exponential regression model* given by the following: # # $$ # h(t\vert \mathbf{x}) = \lambda \exp(\mathbf{x}^T # \mathbf{\boldsymbol{\beta}}) # $$ # # ### Cox proportional hazards model # # The tricky part about the above proportional # hazard rate model is the # specification of the baseline instantaneous hazard # function. In many cases, we # are not so interested in the absolute hazard # function (or its correctness), but # rather a comparison of such hazard functions # between two study populations. The # Cox model emphasizes this comparison by using # a maximum likelihood algorithm for # a partial likelihood function. There is a lot # to keep track of in this model, so # let's try the mechanics first to get a feel # for what is going on. # # Let $j$ denote the $j^{th}$ failure time, assuming that # failure times are sorted # in increasing order. The hazard function for subject # $i$ at failure time $j$ is # $h_i(t_j)$. Using the general proportional hazards # model, we have # # $$ # h_i(t_j) = h_0(t_j)\exp(z_i \beta) := h_0(t_j)\psi_i # $$ # # To keep it simple, we have $z_i \in \{0,1\}$ that indicates membership in the # experimental group ($z_{i}=1$) or the control group ($z_i=0$). # Consider the # first failure time, $t_1$ for subject $i$ failing is the hazard # function # $h_i(t_1)=h_0(t_1)\psi_{i}$. From the definitions, the probability that # subject # $i$ is the one who fails is the following: # # $$ # p_1 = \frac{h_{i}(t_1)}{\sum h_k(t_1)}= \frac{h_{0}(t_1)\psi_i}{\sum h_0(t_1) # \psi_k} # $$ # # where the summation is over all surviving units up to that point. Note that the # baseline hazard cancels out and gives the following, # # $$ # p_1=\frac{\psi_{i}}{\sum_k \psi_k} # $$ # # We can keep computing this for the other failure times to obtain # $\{p_1,p_2,\ldots\,p_D\}$. The product of all of these is the *partial # likelihood*, $L(\psi) = p_1\cdot p_2\cdots p_D$. The next step is to maximize # this partial likelihood (usually logarithm of the partial likelihood) over # $\beta$. There are a lot of numerical issues to keep track of here. Fortunately, # the Python `lifelines` # module can keep this all straight for us. # # Let's see how # this works using the `Rossi` dataset that is available in # lifelines. # In[7]: from lifelines.datasets import load_rossi from lifelines import CoxPHFitter, KaplanMeierFitter rossi_dataset = load_rossi() # The Rossi dataset dataset concerns prison recidivism. The `fin` variable # indicates whether or not the subjects received financial assistance upon # discharge from prison. # # * `week`: week of first arrest after release, or # censoring time. # # * `arrest`: the event indicator, equal to 1 for those arrested # during the period of the study and 0 for those who were not arrested. # # * `fin`: # a factor, with levels yes if the individual received financial aid after release # from prison, and no if he did not; financial aid was a randomly assigned factor # manipulated by the researchers. # # * `age`: in years at the time of release. # # * # `race`: a factor with levels black and other. # # * `wexp`: a factor with levels # yes if the individual had full-time work experience prior to incarceration and # no if he did not. # # * `mar`: a factor with levels married if the individual was # married at the time of release and not married if he was not. # # * `paro`: a # factor coded yes if the individual was released on parole and no if he was not. # * `prio`: number of prior convictions. # # * `educ`: education, a categorical # variable coded numerically, with codes 2 (grade 6 or less), 3 (grades 6 through # 9), 4 (grades 10 and 11), 5 (grade 12), or 6 (some post-secondary). # # * `emp1` - # `emp52`: factors coded yes if the individual was employed in the corresponding # week of the study and no otherwise. # In[8]: rossi_dataset.head() # Now, we just have to set up the calculation in `lifelines`, using the # `scikit- # learn` style. The `lifelines` module handles the censoring # issues. # In[9]: cph = CoxPHFitter() cph.fit(rossi_dataset, duration_col='week', event_col='arrest') cph.print_summary() # access the results using cph.summary # The values in the summary are plotted in # [Figure](#fig:survival_analysis_002). # # #
# #

This shows the fitted coefficients # from the summary table for each covariate.

# # # # In[10]: ax=cph.plot(); ax.figure.savefig('fig-statistics/survival_analysis_002.png') # The Cox proportional hazards model object from `lifelines` allows us to predict # the survival function for an individual with given covariates, assuming that # the # individual just entered the study. For example, for the first individual # (i.e., # row) in the `rossi_dataset`, we can use the model to predict the # survival # function for that individual. # In[11]: cph.predict_survival_function(rossi_dataset.iloc[0,:]).head() # This result is plotted in [Figure](#fig:survival_analysis_003). # In[12]: fig,ax =subplots() _=cph.predict_survival_function(rossi_dataset.iloc[0,:]).plot(ax=ax) _=ax.set_ylabel('Survival probability',fontsize=16) _=ax.set_title('Survival probability for $0^{th}$ subject',fontsize=18) fig.savefig('fig-statistics/survival_analysis_003.png') # # # # # # # # # # # # # # # # # #
# #

The Cox # proportional hazards model can predict the survival probability for an # individual based on their covariates.

# # # # In[ ]: