#!/usr/bin/env python
# coding: utf-8

# In[27]:


from IPython.display import Image
Image('../../Python_probability_statistics_machine_learning_2E.png',width=200)


# # Useful Distributions
#
# ## Normal Distribution
#
# Without a doubt, the normal (Gaussian) distribution is the most important and foundational probability distribution. The one-dimensional form is the following:
#
# $$
# f(x) =\frac{e^{-\frac{(x-\mu)^2}{2\sigma^2}}}{\sqrt{2\pi\sigma^2 } }
# $$
#
# where $\mathbb{E}(x)=\mu$ and $\mathbb{V}(x)=\sigma^2$. The multidimensional version for $\mathbf{x}\in \mathbb{R}^n$ is the following,
#
# $$
# f(\mathbf{x}) = \frac{1}{\det(2\pi \mathbf{R})^{\frac{1}{2}}} e^{-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^T \mathbf{R}^{-1}(\mathbf{x}-\boldsymbol{\mu})}
# $$
#
# where $\mathbf{R}$ is the covariance matrix with entries
#
# $$
# R_{i,j} = \mathbb{E}\left[ (x_i-\bar{x_i})(x_j-\bar{x_j}) \right]
# $$
#
# A key property of the normal distribution is that it is completely specified by its first two moments. Another key property is that the normal distribution is preserved under linear transformations. For example,
#
# $$
# \mathbf{y} = \mathbf{A x}
# $$
#
# means $\mathbf{y}\sim \mathcal{N}(\mathbf{A}\boldsymbol{\mu}_x,\mathbf{A} \mathbf{R}_x \mathbf{A}^T)$, where $\boldsymbol{\mu}_x$ and $\mathbf{R}_x$ are the mean and covariance of $\mathbf{x}$. This means that it is easy to do linear algebra and matrix operations with normally distributed random variables. There are many intuitive geometric relationships that are preserved with normally distributed random variables, as discussed in the Gauss-Markov chapter.
#
# ## Multinomial Distribution
#
# The Multinomial distribution generalizes the Binomial distribution. Recall that the Binomial distribution characterizes the number of heads obtained in $n$ trials. Consider the problem of $n$ balls to be divided among $r$ available bins where each bin may accommodate more than one ball. For example, suppose $n=10$ and $r=3$, then one possible valid configuration is $\mathbf{N}_{10}=[3,3,4]$. The probability that a ball lands in the $i^{th}$ bin is $p_i$, where $\sum p_i=1$. The Multinomial distribution characterizes the probability distribution of $\mathbf{N}_n$. The Binomial distribution is a special case of the Multinomial distribution with $r=2$. The Multinomial distribution is implemented in the `scipy.stats` module as shown below,

# In[28]:


from scipy.stats import multinomial
rv = multinomial(10,[1/3]*3)
rv.rvs(4)


# Note that the sum across the columns is always $n$

# In[29]:


rv.rvs(10).sum(axis=1)


# To derive the probability mass function, we define the *occupancy vector*, $\mathbf{e}_i\in \mathbb{R}^r$, which is a binary vector with exactly one non-zero component (i.e., a unit vector). Then, the $\mathbf{N}_n$ vector can be written as the sum of $n$ vectors $\mathbf{X}$, each drawn from the set $\lbrace \mathbf{e}_j \rbrace_{j=1}^r$,
#
# $$
# \mathbf{N}_n = \sum_{i=1}^n \mathbf{X}_i
# $$
#
# where the probability $\mathbb{P}(\mathbf{X}=\mathbf{e}_j)=p_j$. Thus, $\mathbf{N}_n$ has a discrete distribution over the set of vectors with non-negative components that sum to $n$. Because the $\mathbf{X}$ vectors are independent and identically distributed, the probability of any particular $\mathbf{N}_n=[ x_1,x_2,\cdots,x_r ]^\top=\mathbf{x}$ is
#
# $$
# \mathbb{P}(\mathbf{N}_n=\mathbf{x}) = C_n p_1^{x_1} p_2^{x_2}\cdots p_r^{x_r}
# $$
#
# where $C_n$ is a combinatorial factor that counts the number of ways the $n$ balls can be assigned to produce the counts $x_1,\ldots,x_r$.
# Consider that there are $\binom{n}{x_1}$ ways that the first component can be chosen. This leaves $n-x_1$ balls for the rest of the vector components. Thus, the second component has $\binom{n-x_1}{x_2}$ ways to choose its balls. Following the same pattern, the third component has $\binom{n-x_1-x_2}{x_3}$ ways and so forth,
#
# $$
# C_n = \binom{n}{x_1}\binom{n-x_1}{x_2} \binom{n-x_1-x_2}{x_3} \cdots \binom{n-x_1-x_2-\cdots-x_{r-1}}{x_r}
# $$
#
# which simplifies to the following,
#
# $$
# C_n = \frac{n!}{x_1! \cdots x_r!}
# $$
#
# Thus, the probability mass function for the Multinomial distribution is the following,
#
# $$
# \mathbb{P}(\mathbf{N}_n=\mathbf{x}) = \frac{n!}{x_1! \cdots x_r!} p_1^{x_1} p_2^{x_2}\cdots p_r^{x_r}
# $$
#
# The expectation of this distribution is the following,
#
# $$
# \mathbb{E}(\mathbf{N}_n) = \sum_{i=1}^n \mathbb{E}(\mathbf{X}_i)
# $$
#
# by the linearity of the expectation. Then,
#
# $$
# \mathbb{E}(\mathbf{X}_i) = \sum_{j=1}^r p_j \mathbf{e}_j = \mathbf{I}\mathbf{p}=\mathbf{p}
# $$
#
# where $p_j$ are the components of the vector $\mathbf{p}$ and $\mathbf{I}$ is the identity matrix. Then, because this is the same for every $\mathbf{X}_i$, we have
#
# $$
# \mathbb{E}(\mathbf{N}_n) = n \mathbf{p}
# $$
#
# For the covariance of $\mathbf{N}_n$, we need to compute the following,
#
# $$
# \textrm{Cov}(\mathbf{N}_n) = \mathbb{E}\left(\mathbf{N}_n \mathbf{N}_n^\top\right) - \mathbb{E}(\mathbf{N}_n) \mathbb{E}(\mathbf{N}_n)^\top
# $$
#
# For the first term on the right, we have
#
# $$
# \mathbb{E}\left(\mathbf{N}_n \mathbf{N}_n^\top\right)=\mathbb{E}\left(\left(\sum_{i=1}^n \mathbf{X}_i\right)\left(\sum_{j=1}^n \mathbf{X}_j^\top\right) \right)
# $$
#
# For $i=j$, we have
#
# $$
# \mathbb{E}(\mathbf{X}_i \mathbf{X}_i^\top) = \textrm{diag}(\mathbf{p})
# $$
#
# and for $i\neq j$, we have
#
# $$
# \mathbb{E}(\mathbf{X}_i \mathbf{X}_j^\top) = \mathbf{p}\mathbf{p}^\top
# $$
#
# Note that this term also has non-zero diagonal entries. Then, combining the above two equations gives the following,
#
# $$
# \mathbb{E}(\mathbf{N}_n\mathbf{N}_n^\top) = n \textrm{diag}(\mathbf{p}) + (n^2-n) \mathbf{p}\mathbf{p}^\top
# $$
#
# Now, we can assemble the covariance matrix,
#
# $$
# \textrm{Cov}(\mathbf{N}_n) = n \textrm{diag}(\mathbf{p}) + (n^2-n) \mathbf{p}\mathbf{p}^\top - n^2 \mathbf{p} \mathbf{p}^\top = n \textrm{diag}(\mathbf{p})-n \mathbf{p} \mathbf{p}^\top
# $$
#
# Specifically, the off-diagonal terms are $-n p_i p_j$ and the diagonal terms are $n p_i (1-p_i)$.
#
# ## Chi-Square Distribution
#
# The $\chi^2$ distribution appears in many different contexts so it's worth understanding. Suppose we have $n$ independent random variables $X_i$ such that $X_i\sim \mathcal{N}(0,1)$. We are interested in the random variable $R = \sqrt{\sum_i X_i^2}$. The joint probability density of the $X_i$ is the following,
#
# $$
# f_{\mathbf{X}}(\mathbf{X}) = \frac{e^{-\frac{1}{2}\sum_i X_i^2}}{(2\pi)^{\frac{n}{2}}}
# $$
#
# where $\mathbf{X}$ represents a vector of the $X_i$ random variables. You can think of $R$ as the radius of an $n$-dimensional sphere. The volume of this sphere is given by the following formula,
#
# $$
# V_n(R) = \frac{\pi^{\frac{n}{2}}}{\Gamma(\frac{n}{2}+1)} R^n
# $$
#
# To reduce the amount of notation we define,
#
# $$
# A := \frac{\pi^{\frac{n}{2}}}{\Gamma(\frac{n}{2}+1)}
# $$
#
# The differential of this volume is the following,
#
# $$
# dV_n(R)= n A R^{n-1} dR
# $$
#
# In terms of the $X_i$ coordinates, the probability (as always) integrates out to one,
#
# $$
# \int f_{\mathbf{X}}(\mathbf{X})\, dV_n(\mathbf{X}) = 1
# $$
#
# In terms of $R$, the change of variable provides,
#
# $$
# \int f_{\mathbf{X}}(R)\, n A R^{n-1}\, dR = 1
# $$
#
# Thus,
#
# $$
# f_R(R):=f_{\mathbf{X}}(R)\, n A R^{n-1} = n A R^{n-1}\frac{e^{-\frac{1}{2}R^2}}{(2\pi)^{\frac{n}{2}}}
# $$
#
# But we are interested in the distribution of $Y=R^2$. Using the same technique again,
#
# $$
# \int f_{R}(R)\, dR =\int f_{R}(\sqrt{Y}) \frac{dY}{2\sqrt{Y}}
# $$
#
# Finally,
#
# $$
# f_Y(Y) := n A Y^{\frac{n-1}{2}}\frac{e^{-\frac{1}{2}Y}}{(2\pi)^{\frac{n}{2}}} \frac{1}{2\sqrt{Y}}
# $$
#
# Then, finally substituting back in $A$ gives the $\chi^2$ distribution with $n$ degrees of freedom,
#
# $$
# f_Y(Y) = n \frac{\pi^{\frac{n}{2}}}{\Gamma(\frac{n}{2}+1)} Y^{n/2-1}\frac{e^{-\frac{1}{2}Y}}{(2\pi)^{\frac{n}{2}}} \frac{1}{2}= \frac{2^{-\frac{n}{2}-1} n }{\Gamma \left(\frac{n}{2}+1\right)}e^{-Y/2} Y^{\frac{n}{2}-1}
# $$
#
# which, because $\Gamma(\frac{n}{2}+1)=\frac{n}{2}\Gamma(\frac{n}{2})$, is the familiar form $\frac{1}{2^{n/2}\Gamma(n/2)} Y^{\frac{n}{2}-1} e^{-Y/2}$.
#
# **Example:** Hypothesis testing is a common application of the $\chi^2$ distribution. Consider Table [1](#tab:diagnosisTable) which tabulates the infection status of a certain population. The hypothesis is that these data are distributed according to the multinomial distribution with the following rates for each group: $p_1=1/4$ (mild infection), $p_2=1/4$ (strong infection), and $p_3=1/2$ (no infection). Suppose $n_i$ is the count of persons in the $i^{th}$ column and $\sum_{i} n_i=n=684$. Let $k$ denote the number of columns. Then, in order to apply the Central Limit Theorem, we want to sum the $n_i$ random variables, but these all sum to $n$, a constant, which prohibits using the theorem. Instead, suppose we sum the $n_i$ variables up to $k-1$ terms. Then,
#
# $$
# z = \sum_{i=1}^{k-1} n_i
# $$
#
# is asymptotically normally distributed by the theorem with mean $\mathbb{E}(z) = \sum_{i=1}^{k-1} n p_i$. Using our previous results and notation for multinomial random variables, we can write this as
#
# $$
# z = [\mathbf{1}_{k-1}^\top,0]\mathbf{N}_n
# $$
#
# where $\mathbf{1}_{k-1}$ is a vector of all ones of length $k-1$ and $\mathbf{N}_n\in \mathbb{R}^{k}$. With this notation, we have
#
# $$
# \mathbb{E}(z)= n [\mathbf{1}_{k-1}^\top,0] \mathbf{p} = \sum_{i=1}^{k-1} n p_i = n(1-p_k)
# $$
#
# We can get the variance of $z$ using the same method,
#
# $$
# \mathbb{V}(z) =[\mathbf{1}_{k-1}^\top,0]\textrm{Cov}(\mathbf{N}_n)[\mathbf{1}_{k-1}^\top,0]^\top
# $$
#
# which gives,
#
# $$
# \mathbb{V}(z) =[\mathbf{1}_{k-1}^\top,0](n\textrm{diag}(\mathbf{p})-n\mathbf{p}\mathbf{p}^\top )[\mathbf{1}_{k-1}^\top,0]^\top
# $$
#
# The variance is then,
#
# $$
# \mathbb{V}(z) = n (1-p_k)p_k
# $$
#
# With the mean and variance established, we can subtract the hypothesized mean for each column and create the transformed variable,
#
# $$
# z^\prime = \sum_{i=1}^{k-1} \frac{n_i-n p_i}{\sqrt{n (1-p_k)p_k}} \sim \mathcal{N}(0,1)
# $$
#
# by the Central Limit Theorem. Likewise,
#
# $$
# \sum_{i=1}^{k-1} \frac{(n_i-n p_i)^2}{n (1-p_k)p_k} \sim \chi_{k-1}^2
# $$
#
# $$
# \begin{table}[]
# \centering
# \caption{Diagnosis Table}
# \label{tab:diagnosisTable} \tag{1}
# \begin{tabular}{lllll}
# \cline{1-4}
# \multicolumn{1}{|l|}{Mild Infection} & \multicolumn{1}{l|}{Strong Infection} & \multicolumn{1}{l|}{No infection} & \multicolumn{1}{l|}{Total} & \\ \cline{1-4}
# \multicolumn{1}{|c|}{128} & \multicolumn{1}{c|}{136} & \multicolumn{1}{c|}{420} & \multicolumn{1}{c|}{684} & \\ \cline{1-4}
# \end{tabular}
# \end{table}
# $$
#
# With all that established, we can test the hypothesis that the data in the table follow the hypothesized multinomial distribution.

# In[30]:


from scipy import stats
n = 684
p1 = p2 = 1/4
p3 = 1/2
v = n*p3*(1-p3)         # variance term n*p_k*(1-p_k)
z = (128-n*p1)**2/v + (136-n*p2)**2/v
1-stats.chi2(2).cdf(z)  # p-value with k-1=2 degrees of freedom


# This value is very low and suggests that the hypothesized multinomial distribution is not a good fit for these data. Note that this approximation only works when `n` is large in comparison to the number of columns in the table.
#
# ## Poisson and Exponential Distributions
#
# The Poisson distribution for a random variable $X$ represents the number of outcomes occurring in a given time interval ($t$),
#
# $$
# p(x;\lambda t) = \frac{e^{-\lambda t}(\lambda t)^x}{x!}
# $$
#
# The Poisson distribution is closely related to the binomial distribution, $b(k;n,p)$, where $p$ is small and $n$ is large. That is, when there is a low-probability event but many trials, $n$. Recall that the binomial distribution is the following,
#
# $$
# b(k;n,p) =\binom{n}{k} p^k (1-p)^{n-k}
# $$
#
# For $k=0$ and with $\lambda := n p$, we have $b(0;n,p) = (1-p)^n = \left( 1-\frac{\lambda}{n} \right)^n$. Taking the logarithm of both sides, we obtain
#
# $$
# \log b(0;n,p) = n \log\left( 1-\frac{\lambda}{n} \right)
# $$
#
# Then, the Taylor expansion of this gives the following,
#
# $$
# \log b(0;n,p) \approx -\lambda - \frac{\lambda^2}{2 n} - \cdots
# $$
#
# For large $n$, this results in,
#
# $$
# b(0;n,p) \approx e^{-\lambda}
# $$
#
# A similar argument for general $k$ leads to the Poisson distribution. Conveniently, we have $\mathbb{E}(X) = \mathbb{V}(X)= \lambda t$. For example, suppose that the average number of vehicles passing under a toll-gate per hour is 3. Then, the probability that 6 vehicles pass under the gate in a given hour is $p(x=6;\lambda t= 3) = \frac{81}{80 e^3}\approx 0.05$.
#
# The Poisson distribution is available from the `scipy.stats` module. The following code computes the last result,

# In[31]:


from scipy.stats import poisson
x = poisson(3)
print(x.pmf(6))


# The Poisson distribution is important for applications involving reliability and queueing. The Poisson distribution is used to compute the probability of specific numbers of events during a particular time period. In many cases, the waiting time $X$ is itself the random variable. For example, we might be interested in understanding the time $X$ between arrivals of vehicles at a checkpoint. With the Poisson distribution, the probability of *no* events occurring in the span of time up to time $t$ is given by the following,
#
# $$
# p(0;\lambda t) = e^{-\lambda t}
# $$
#
# Now, suppose $X$ is the time to the first event.
# The probability that the length of time until the first event will exceed $x$ is given by the following,
#
# $$
# \mathbb{P}(X>x) = e^{-\lambda x}
# $$
#
# Then, the cumulative distribution function is given by the following,
#
# $$
# \mathbb{P}(0\le X\le x) = F_X (x) = 1-e^{- \lambda x}
# $$
#
# Taking the derivative gives the *exponential* distribution,
#
# $$
# f_X(x) = \lambda e^{-\lambda x}
# $$
#
# where $\mathbb{E}(X) = 1/\lambda $ and $\mathbb{V}(X)=\frac{1}{\lambda^2}$. For example, suppose we want to know the probability of a certain component lasting beyond $T=10$ years where $T$ is modeled as an exponential random variable with $1/\lambda=5$ years. Then, we have $1-F_X(10) = e^{-2} \approx 0.135 $. The exponential distribution is available in the `scipy.stats` module. The following code computes the result of the example above. Note that the parameters are described in slightly different terms than above, as described in the corresponding documentation for `expon`.

# In[32]:


from scipy.stats import expon
x = expon(0,5) # create random variable object with scale = 1/lambda = 5
print(1 - x.cdf(10))


# ## Gamma Distribution
#
# We have previously discussed how the exponential distribution can be created from the Poisson events. The exponential distribution has the *memoryless* property, namely,
#
# $$
# \mathbb{P}(T>t_0+t\vert T>t_0) = \mathbb{P}(T>t)
# $$
#
# For example, given $T$ as the random variable representing the time until failure, this means that a component that has survived up through $t_0$ has the same probability of lasting $t$ units beyond that point as a brand-new component. To derive this result, it is easier to compute the complementary event,
#
# $$
# \mathbb{P}(t_0 < T \le t_0+t \vert T>t_0) = \frac{F_T(t_0+t)-F_T(t_0)}{1-F_T(t_0)} = \frac{e^{-\lambda t_0}-e^{-\lambda (t_0+t)}}{e^{-\lambda t_0}} = 1-e^{-\lambda t}
# $$
#
# which is $\mathbb{P}(T\le t)$, so that $\mathbb{P}(T>t_0+t\vert T>t_0)= e^{-\lambda t}=\mathbb{P}(T>t)$, as claimed.
#
# ## Dirichlet Distribution
#
# **Figure:** One thousand samples from a Dirichlet distribution with $\boldsymbol{\alpha} = [1,1,1]$
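# As a minimal sketch of how such samples can be produced, the `dirichlet` object from `scipy.stats` (the same object used for the density plot below) can draw one thousand samples with $\boldsymbol{\alpha}=[1,1,1]$. Each sample has three non-negative components that sum to one, so scattering the first two coordinates places every point inside the triangle $x+y\le 1$,

# In[ ]:


from scipy.stats import dirichlet
from matplotlib.pylab import subplots

d = dirichlet([1,1,1])        # alpha=[1,1,1] gives a uniform density over the simplex
samples = d.rvs(1000)         # 1000 rows; each row sums to one
fig,ax = subplots()
_=ax.scatter(samples[:,0],samples[:,1],s=5,alpha=0.5)
_=ax.plot([0,1],[1,0],'k--')  # boundary x+y=1 of the projected simplex
ax.set_aspect(1)
_=ax.set_xlabel('x',fontsize=18)
_=ax.set_ylabel('y',fontsize=18)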
#
# Notice that the generated samples lie on the triangular simplex shown. The corners of the triangle correspond to each of the components of $\boldsymbol{\mu}$. Using a non-uniform $\boldsymbol{\alpha}=[2,3,4]$ vector, we can visualize the probability density function using the `pdf` method on the `dirichlet` object as shown in [Figure](#fig:Dirichlet_002). By choosing the $\boldsymbol{\alpha}\in \mathbb{R}^3$, the peak of the density function can be moved within the corresponding triangular simplex.

# In[37]:


import numpy as np
from matplotlib.pylab import cm, subplots
from scipy.stats import dirichlet
X,Y = np.meshgrid(np.linspace(.01,1,50),np.linspace(.01,1,50))
d = dirichlet([2,3,4])
idx=(X+Y<1)   # keep only grid points inside the simplex
f=d.pdf(np.vstack([X[idx],Y[idx],1-X[idx]-Y[idx]]))
Z = idx*0+ np.nan
Z[idx] = f    # density on the simplex; NaN outside
fig,ax=subplots()
_=ax.contourf(X,Y,Z,cmap=cm.viridis)
ax.set_aspect(1)
_=ax.set_xlabel('x',fontsize=18)
_=ax.set_ylabel('y',fontsize=18)
fig.savefig('fig-probability/Dirichlet_002.png')


# **Figure:** Probability density function for the Dirichlet distribution with $\boldsymbol{\alpha}=[2,3,4]$
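# The claim that the peak of the density moves with $\boldsymbol{\alpha}$ can be checked numerically. The following is a small sketch (using the standard mode formula $(\alpha_i-1)/(\alpha_0-K)$ for $\alpha_i>1$, where $\alpha_0=\sum_i\alpha_i$ and $K$ is the number of components) that locates the peak of the contour plot above,

# In[ ]:


import numpy as np
from scipy.stats import dirichlet

alpha = np.array([2,3,4])
mode = (alpha-1)/(alpha.sum()-len(alpha)) # mode when every alpha_i > 1
print(mode)                               # [1/6, 1/3, 1/2]
print(dirichlet(alpha).pdf(mode))         # density value at the peak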
# We have seen that the Beta distribution generalizes the uniform distribution over the unit interval. Likewise, the Dirichlet distribution generalizes the Beta distribution to a vector with components in the unit interval. Recall that the binomial distribution and the Beta distribution form a conjugate pair for Bayesian inference because with $p\sim \texttt{Beta} $,
#
# $$
# X\vert p \sim \texttt{Binomial}(n,p)
# $$
#
# That is, the data, conditioned on $p$, are binomially distributed. Analogously, the multinomial distribution and the Dirichlet distribution also form such a conjugate pair with multinomial parameter $p\sim \texttt{Dirichlet} $,
#
# $$
# X\vert p \sim \texttt{multinomial}(n,p)
# $$
#
# For this reason, the Dirichlet-multinomial pair is popular in machine learning text processing because non-zero probabilities can be assigned to words that do not appear in particular documents, which helps generalization performance.
#
# ## Negative Binomial Distribution
#
# The negative binomial distribution is used to characterize the number of trials needed until a specified number of failures ($n$) occurs. For example, suppose `1` indicates failure and `0` indicates success, with $\mathbb{P}(1)=1/3$. Then the negative binomial distribution characterizes the probability of a sequence of length six that has two (`n=2`) failures, with the sequence terminating in a failure (e.g., `001001`). The length of the sequence is six, so the number of successes is $k=6-2=4$ and, for the negative binomial distribution, $\mathbb{P}(k=4)=\frac{80}{729}$.
#
# The probability mass function is the following:
#
# $$
# \mathbb{P}(k) = \binom{n+k-1}{n-1} p^n (1-p)^k
# $$
#
# where $p$ is the probability of failure and $k$ is the number of successes observed before the $n^{th}$ failure. The mean and variance of this distribution are the following:
#
# $$
# \mathbb{E}(k) =\frac{n (1-p)}{p}
# $$
#
# $$
# \mathbb{V}(k) = \frac{n (1-p)}{p^2}
# $$
#
# The following simulation shows an example sequence generated for the negative binomial distribution.

# In[38]:


import random
n=2    # num of failures
p=1/3  # prob of failure
nc = 0 # counter
seq= []
while nc< n:
    v,=random.choices([0,1],[1-p,p])
    seq.append(v)
    nc += (v == 1)
seq,len(seq)


# Keep in mind that the negative binomial distribution characterizes the family of such sequences with the specified number of failures.
#
# ## Negative Multinomial Distribution
#
# The discrete negative multinomial distribution is an extension of the negative binomial distribution to account for more than two possible outcomes. That is, there are other alternatives whose respective probabilities sum to one minus the failure probability, $p_{f} = 1-\sum_{i=1}^m p_i$. For example, a random sample from this distribution with parameters $n=2$ (number of observed failures) and with $p_a= \frac{1}{3}, p_b=\frac{1}{2}$ means that the failure probability is $p_f=\frac{1}{6}$. Thus, a sample from this distribution like $[ 2,9]$ means that `2` of the $a$ objects were observed in the sequence, `9` of the $b$ objects were observed, and there were two failure symbols (say, `F`), with the second one terminating the sequence.
#
# The probability mass function is the following:
#
# $$
# \mathbb{P}(\mathbf{k})= (n)_{\sum_{i=1}^m k_i}\, p_f^{n} \prod_{i=1}^m \frac{p_i^{k_i}}{k_i!}
# $$
#
# where $p_f$ is the probability of failure and the other $p_i$ terms are the probabilities of the other alternatives in the sequence.
# The $(a)_n$ notation is the rising factorial function (e.g., $(a)_3 = a (a+1)(a+2)$). The mean and variance of this distribution are the following:
#
# $$
# \mathbb{E}(\mathbf{k}) =\frac{n}{p_f} \mathbf{p}
# $$
#
# $$
# \mathbb{V}(\mathbf{k}) = \frac{n}{p_f^2} \mathbf{p} \mathbf{p}^T + \frac{n}{p_f}\textrm{diag}(\mathbf{p})
# $$
#
# The following simulation shows the sequences generated for the negative multinomial distribution.

# In[39]:


import random
from collections import Counter
n=2          # num of failure items
p=[1/3,1/2]  # prob of other non-failure items
items = ['a','b','F'] # F marks failure item
nc = 0 # counter
seq= []
while nc< n:
    v,=random.choices(items,p+[1-sum(p)])
    seq.append(v)
    nc += (v == 'F')
c=Counter(seq)
print(c)


# The counts of the non-failure items in the `Counter` dictionary above make up the $\mathbf{k}$ vector in the probability mass function for the negative multinomial distribution. Importantly, the probability mass function gives not the probability of a particular sequence, but of the family of sequences with the same corresponding `Counter` values. The probability mass function implemented in Python is the following,

# In[40]:


from scipy.special import factorial, poch
import numpy as np

def negative_multinom_pdf(p,k,n=2):
    # p : probabilities of the non-failure items
    # k : observed counts of the non-failure items
    # n : number of failures that terminate the sequence
    assert len(k) == len(p)
    pf = 1 - sum(p)  # failure probability
    term = [pi**ki for pi,ki in zip(p,k)]
    num = poch(n,sum(k))*pf**n*np.prod(term)  # poch is the rising factorial (n)_{sum k}
    den = np.prod([factorial(ki) for ki in k])
    return num/den


# Evaluating this with the prior `Counter` result,

# In[41]:


negative_multinom_pdf([1/3,1/2],[c['a'],c['b']])
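# As a sanity check, we can re-run the simulation above many times and compare the empirical frequency of a particular count vector, say $\mathbf{k}=[2,1]$, with the value returned by `negative_multinom_pdf`. The following is a minimal sketch of such a check using a small helper, `sample_counts`, that repeats the simulation loop from the earlier cell,

# In[ ]:


import random
from collections import Counter

def sample_counts(n=2, p=(1/3,1/2), items=('a','b','F')):
    # simulate one sequence that stops at the n-th failure and
    # return the counts of the non-failure items
    weights = list(p) + [1-sum(p)]
    nc, seq = 0, []
    while nc < n:
        v, = random.choices(items, weights)
        seq.append(v)
        nc += (v == 'F')
    c = Counter(seq)
    return (c['a'], c['b'])

trials = 50000
freq = Counter(sample_counts() for _ in range(trials))
print(freq[(2,1)]/trials)                      # empirical frequency of k=[2,1]
print(negative_multinom_pdf([1/3,1/2],[2,1]))  # pmf value for comparison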