#!/usr/bin/env python # coding: utf-8 # In[6]: from IPython.display import Image Image('../../../python_for_probability_statistics_and_machine_learning.jpg') # [Python for Probability, Statistics, and Machine Learning](https://www.springer.com/fr/book/9783319307152) # In[7]: from __future__ import division get_ipython().run_line_magic('pylab', 'inline') # In a previous coin-flipping discussion, we discussed estimation of the # underlying probability of getting a heads. There, we derived the # estimator as # $$ # \hat{p}_n = \frac{1}{n}\sum_{i=1}^n X_i # $$ # where $X_i\in \lbrace 0,1 \rbrace$. Confidence intervals allow us to # estimate how close we can get to the true value that we are estimating. # Logically, that seems strange, doesn't it? We really don't know the exact value # of what we are estimating (otherwise, why estimate it?), and yet, somehow we # know how close we can get to something we admit we don't know? Ultimately, we # want to make statements like the *probability of the value in a certain # interval is 90\%*. Unfortunately, that is something we will not be able to say # using our methods. Note that Bayesian estimation gets closer to this statement # by using *credible intervals*, but that is a story for another day. In our # situation, the best we can do is say roughly the following: *if we ran the # experiment multiple times, then the confidence interval would trap the true # parameter 90\% of the time*. # # Let's return to our coin-flipping example and see this in action. 
# One way to get at a confidence interval is to use Hoeffding's inequality
# from the section ref{ch:prob:sec:ineq} specialized to our Bernoulli
# variables as
#
# $$
# \mathbb{P}(\mid \hat{p}_n-p\mid >\epsilon) \leq 2 \exp(-2n \epsilon^2)
# $$
#
# Now, we can form the interval
# $\mathbb{I}=[\hat{p}_n-\epsilon_n,\hat{p}_n+\epsilon_n]$, where $\epsilon_n$ is
# carefully constructed as
#
# $$
# \epsilon_n = \sqrt{ \frac{1}{2 n}\log\frac{2}{\alpha}}
# $$
#
# which makes the right-side of the Hoeffding inequality equal to
# $\alpha$. Thus, we finally have
#
# $$
# \mathbb{P}(p \notin \mathbb{I}) = \mathbb{P}\left(\mid \hat{p}_n-p\mid >\epsilon_n\right) \leq \alpha
# $$
#
# Thus, $ \mathbb{P}(p \in \mathbb{I}) \geq 1-\alpha$. As a numerical example,
# let's take $n=100$, $\alpha=0.05$; then plugging into everything we have
# gives $\epsilon_n=0.136$. So, the 95\% confidence interval here is therefore
#
# $$
# \mathbb{I}=[\hat{p}_n-\epsilon_n,\hat{p}_n+\epsilon_n] = [\hat{p}_n-0.136,\hat{p}_n+0.136]
# $$
#
# The following code sample is a simulation to see if we can really trap
# the underlying parameter in our confidence interval.

# In[8]:

from scipy import stats
import numpy as np

# Simulate 200 independent estimation trials of a fair coin's p.
b = stats.bernoulli(.5)  # fair coin distribution (true p = 0.5)
nsamples = 100           # flip it nsamples times for 200 estimates
# Each of the 200 columns holds one trial of `nsamples` flips.
xs = b.rvs(nsamples*200).reshape(nsamples, -1)
phat = np.mean(xs, axis=0)  # estimated p, one per trial (shape (200,))
# Half-width of the 95% Hoeffding interval:
# epsilon_n = sqrt(log(2/alpha)/(2*n)) with alpha = 0.05, n = nsamples.
epsilon_n = np.sqrt(np.log(2/0.05)/2/nsamples)
# Fraction (as a percentage) of the 200 intervals
# [phat - epsilon_n, phat + epsilon_n] that contain the true p = 0.5.
pct = np.logical_and(phat - epsilon_n <= 0.5,
                     0.5 <= (epsilon_n + phat)).mean()*100
# FIX: the original used a Python 2 print statement, which is a SyntaxError
# under Python 3 (the runtime this nbconvert export otherwise targets).
print('Interval trapped correct value ', pct, '% of the time')

# The result shows that the estimator and the corresponding
# interval was able to trap the true value at least 95\% of the time. This is how
# to interpret the action of confidence intervals.
#
# However, the usual practice is to not use Hoeffding's inequality and
# instead use arguments around asymptotic normality.
# The definition of the standard error is the following:
#
# $$
# \texttt{se} = \sqrt{\mathbb{V}(\hat{\theta}_n)}
# $$
#
# where $\hat{\theta}_n$ is the point-estimator for the parameter
# $\theta$, given $n$ samples of data $X_n$, and $\mathbb{V}(\hat{\theta}_n)$ is
# the variance of $\hat{\theta}_n$. Likewise, the estimated standard error is
# $\widehat{\texttt{se}}$. For example, in our coin-flipping example, the estimator
# was $\hat{p}=\sum X_i/n$ with corresponding variance
# $\mathbb{V}(\hat{p}_n)=p(1-p)/n$. Plugging in the point estimate gives us the
# estimated standard error: $\widehat{\texttt{se}}=\sqrt{\hat{p}(1-\hat{p})/n}$.
# Because maximum likelihood estimators are asymptotically normal [^mle], we know
# that $\hat{p}_n \sim \mathcal{N}(p,\widehat{\texttt{se}}^2)$. Thus, if we want
# a $1-\alpha$ confidence interval, we can compute
#
# [^mle]: Certain technical regularity conditions must hold for this property
# of maximum likelihood estimator to work. See
# [[wasserman2004all]](#wasserman2004all) for more details.
#
# $$
# \mathbb{P}(\mid \hat{p}_n-p\mid < \xi)> 1-\alpha
# $$
#
# but since we know that $ (\hat{p}_n-p)$ is asymptotically normal,
# $\mathcal{N}(0,\widehat{\texttt{se}}^2)$, we can instead compute
#
# $$
# \int_{-\xi}^{\xi} \mathcal{N}(0,\widehat{\texttt{se}}^2) dx > 1-\alpha
# $$
#
# This looks ugly to compute because we need to find $\xi$, but
# Scipy has everything we need for this.

# In[9]:

# compute estimated se for all 200 trials: se_hat = sqrt(phat*(1-phat)/n)
se = np.sqrt(phat*(1-phat)/xs.shape[0])
# generate random variable for trial 0
rv = stats.norm(0, se[0])
# compute 95% confidence interval for that trial 0 (notebook display cell)
np.array(rv.interval(0.95)) + phat[0]

def compute_CI(i, alpha=0.95):
    """Return the asymptotic-normal confidence interval for estimate(s) ``i``.

    Parameters
    ----------
    i : float or ndarray
        Point estimate(s) of the Bernoulli parameter (e.g. ``phat``).
    alpha : float, optional
        Confidence level of the interval; defaults to 0.95 (a 95%
        interval), matching the hard-coded value this helper replaced.

    Returns
    -------
    tuple
        ``(lower, upper)`` interval endpoints, each the same shape as ``i``.

    Notes
    -----
    Uses the module-level ``xs`` array for the per-trial sample size
    (``xs.shape[0]`` flips per trial).
    """
    return stats.norm.interval(alpha, loc=i,
                               scale=np.sqrt(i*(1-i)/xs.shape[0]))

lower, upper = compute_CI(phat)
# # The gray circles are the point estimates, bounded above and below by both the asymptotic (normal) confidence intervals and the Hoeffding intervals. The asymptotic intervals are tighter because the asymptotic-normality assumptions underpinning them are valid for these estimates.
#