#!/usr/bin/env python
# coding: utf-8

# In[3]:

from IPython.display import Image
Image('../../Python_probability_statistics_machine_learning_2E.png',width=200)

# In[4]:

get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib.pylab import subplots
import numpy as np
from sklearn.datasets import make_blobs

# Clustering is the simplest member of a family of machine learning methods
# that do not require supervision to learn from data. Unsupervised methods
# have training sets that do not include a target variable. Instead, these
# methods rely upon a meaningful metric to group data into clusters. Because
# very few assumptions are built into the method itself, clustering is an
# excellent method for exploratory data analysis. In this section, we focus
# on the popular K-means clustering method that is available in Scikit-learn.
#
# Let's manufacture some data to get going with `make_blobs` from
# Scikit-learn. [Figure](#fig:clustering_001) shows some example clusters in
# two dimensions.

# In[5]:

fig,ax = subplots()
X, y = make_blobs(n_samples=300, centers=5,
                  random_state=0, cluster_std=0.5)
_=ax.scatter(X[:,0],X[:,1],c=y,s=50,cmap='cool',alpha=.8)
_=ax.set_xlabel('x-coordinate',fontsize=16)
_=ax.set_ylabel('y-coordinate',fontsize=16)
_=ax.axis(xmax=4,ymin=-2)
ax.tick_params(labelsize='x-large')
ax.set_aspect(1/1.6)
fig.savefig('fig-machine_learning/clustering_001.png')
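# As a quick sanity check (a minimal sketch using only the arrays generated
# above), `make_blobs` returns the sample coordinates and one integer label
# per sample indicating which generated center it came from,

# In[ ]:

print(X.shape)       # (300, 2) -- 300 two-dimensional samples
print(np.unique(y))  # [0 1 2 3 4] -- one integer label per generated center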
# <!-- FIGURE: fig-machine_learning/clustering_001.png <div id="fig:clustering_001"></div> -->
#
# The four clusters are pretty easy to see in this example, and we want
# clustering methods to determine the extent and number of such clusters
# automatically.

# Clustering methods work by minimizing the following objective function,
#
# $$
# J = \sum_k \sum_{i \in C_k} \Vert \mathbf{x}_i - \boldsymbol{\mu}_k \Vert^2
# $$
#
# where $C_k$ is the set of indices of the samples assigned to the $k^{th}$
# cluster. The *distortion* for the $k^{th}$ cluster is the summand,
#
# $$
# \sum_{i \in C_k} \Vert \mathbf{x}_i - \boldsymbol{\mu}_k \Vert^2
# $$
#
# Thus, clustering algorithms work to minimize this by adjusting the centers
# of the individual clusters, $\boldsymbol{\mu}_k$. Intuitively, each
# $\boldsymbol{\mu}_k$ is the *center of mass* of the points in its cluster.
# The Euclidean distance is the typical metric used for this,
#
# $$
# \Vert \mathbf{x} \Vert^2 = \sum_i x_i^2
# $$
#
# There are many clever algorithms that can solve this problem for the best
# $\boldsymbol{\mu}_k$ cluster-centers. The K-means algorithm starts with a
# user-specified number of clusters, $K$, to optimize over. This is
# implemented in Scikit-learn with the `KMeans` object that follows the
# usual fitting conventions,

# In[6]:

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)

# where we have chosen $K=4$. How do we choose the value of $K$? This is the
# eternal question of generalization versus approximation --- too many
# clusters provide great approximation but bad generalization. One way to
# approach this problem is to compute the mean distortion for increasingly
# larger values of $K$ until it no longer makes sense. To do this, we take
# every data point and compute its distance to the centers of all the
# clusters. Then, we keep the smallest of these values for each data point
# and average them over all data points. This gives us an idea of the
# overall mean performance for the $K$ clusters. The following code computes
# this explicitly.
#
# **Programming Tip.**
#
# The `cdist` function from Scipy computes all the pairwise distances
# between the two input collections according to the specified metric.

# In[7]:

from scipy.spatial.distance import cdist
m_distortions=[]
for k in range(1,7):
    kmeans = KMeans(n_clusters=k)
    _=kmeans.fit(X)
    # distance from every sample to every cluster-center
    tmp=cdist(X,kmeans.cluster_centers_,'euclidean')
    # nearest-center distance per sample, averaged over all samples
    m_distortions.append(sum(np.min(tmp,axis=1))/X.shape[0])

# In[8]:

fig,ax = subplots()
fig.set_size_inches((8,5))
_=ax.plot(range(1,7),m_distortions,'-o',ms=10,color='gray')
_=ax.set_xlabel('K',fontsize=16)
_=ax.set_ylabel('Mean Distortion',fontsize=16)
ax.tick_params(labelsize='x-large')
fig.savefig('fig-machine_learning/clustering_002.png')
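# The mean distortion can be cross-checked without `cdist`. The following is
# a minimal sketch (assuming the `X` generated above) that uses
# `KMeans.transform`, which returns the distance from every sample to every
# cluster-center, and that also verifies the center-of-mass interpretation
# of `cluster_centers_`,

# In[ ]:

kmeans = KMeans(n_clusters=4).fit(X)
# nearest cluster-center distance per sample, averaged over the samples
print(kmeans.transform(X).min(axis=1).mean())
# at convergence, each center equals the mean of its assigned samples
centers = np.array([X[kmeans.labels_==k].mean(axis=0) for k in range(4)])
print(np.abs(centers - kmeans.cluster_centers_).max())  # approximately zero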
# <!-- FIGURE: fig-machine_learning/clustering_002.png <div id="fig:clustering_002"></div> -->
#
# The mean distortion shows that there is diminishing value in using more
# clusters.

# Note that the code above uses the `cluster_centers_` attribute, which
# holds the centers estimated by the K-means algorithm. The resulting
# [Figure](#fig:clustering_002) shows the point of diminishing returns for
# additional clusters.
#
# Another figure-of-merit is the silhouette coefficient, which measures how
# compact and separated the individual clusters are. To compute the
# silhouette coefficient, we need to compute the mean intra-cluster distance
# for each sample ($a_i$) and the mean distance to the next nearest cluster
# ($b_i$). Then, the silhouette coefficient for the $i^{th}$ sample is
#
# $$
# \texttt{sc}_i = \frac{b_i-a_i}{\max(a_i,b_i)}
# $$
#
# The mean silhouette coefficient is just the mean of these values over all
# the samples. The best value is one and the worst is negative one. Values
# near zero indicate overlapping clusters, and negative values show that
# samples have been assigned to the wrong cluster. This figure-of-merit is
# implemented in Scikit-learn as in the following,

# In[9]:

from sklearn.metrics import silhouette_score

# In[10]:

def scatter_fit(X,y,ax):
    # uses the kmeans object from the enclosing scope
    _=kmeans.fit(X)
    _=ax.scatter(X[:,0],X[:,1],c=y,s=50,cmap='gray',marker='.')
    _=ax.set_title('silhouette={:.3f}'.format(silhouette_score(X,kmeans.labels_)))

kmeans = KMeans(n_clusters=2)  # each panel below has two generated centers
fig,axs = subplots(2,2,sharex=True,sharey=True)
np.random.seed(12)
ax=axs[0,0]
X,y=make_blobs(centers=[[0,0],[3,0]],n_samples=100)
scatter_fit(X,y,ax)
ax=axs[0,1]
X,y=make_blobs(centers=[[0,0],[10,0]],n_samples=100)
scatter_fit(X,y,ax)
ax=axs[1,0]
X,y=make_blobs(centers=[[0,0],[3,0]],n_samples=100,cluster_std=[.5,.5])
scatter_fit(X,y,ax)
ax=axs[1,1]
X,y=make_blobs(centers=[[0,0],[10,0]],n_samples=100,cluster_std=[.5,.5])
scatter_fit(X,y,ax)
fig.savefig('fig-machine_learning/clustering_003.png')

# [Figure](#fig:clustering_003) shows how the silhouette coefficient varies
# as the clusters become more dispersed and/or closer together.
# <!-- FIGURE: fig-machine_learning/clustering_003.png <div id="fig:clustering_003"></div> -->
#
# The silhouette coefficient varies as the clusters move closer together and
# become more compact.

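# The silhouette definition above is easy to check directly. The following
# is a minimal sketch (the helper `mean_silhouette` is hypothetical, and it
# assumes every cluster contains at least two samples) that computes the
# mean silhouette coefficient from its definition; its output should closely
# match `silhouette_score` on the data from the last panel above,

# In[ ]:

def mean_silhouette(X, labels):
    D = cdist(X, X)  # all pairwise sample distances
    scores = []
    for i in range(len(X)):
        same = labels == labels[i]
        same[i] = False              # exclude the sample itself from a_i
        a = D[i, same].mean()        # mean intra-cluster distance, a_i
        b = min(D[i, labels == k].mean()    # mean distance to the nearest
                for k in np.unique(labels)  # other cluster, b_i
                if k != labels[i])
        scores.append((b - a)/max(a, b))
    return np.mean(scores)

print(mean_silhouette(X, kmeans.labels_))   # by-hand computation
print(silhouette_score(X, kmeans.labels_))  # Scikit-learn's computation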
# K-means is easy to understand and to implement, but can be sensitive to
# the initial choice of cluster-centers. The default initialization method
# in Scikit-learn uses a very effective and clever randomization to come up
# with the initial cluster-centers. Nonetheless, to see why initialization
# can cause instability with K-means, consider
# [Figure](#fig:clustering_004). In [Figure](#fig:clustering_004), there are
# two large clusters on the left and a very sparse cluster on the far right.
# The large circles at the centers are the cluster-centers that K-means
# found. Given $K=2$, how should the cluster-centers be chosen?
# Intuitively, the first two clusters should share a cluster-center
# somewhere between them, and the sparse cluster on the right should have
# its own cluster-center [^kmeans]. Why isn't this happening?
#
# [^kmeans]: Note that we are using the `init='random'` keyword argument for
# this example in order to illustrate this.

# In[11]:

X,y = make_blobs(centers=[[0,0],[5,0]],random_state=100,n_samples=200)
Xx,yx = make_blobs(centers=[[20,0]],random_state=100,n_samples=3)
X=np.vstack([X,Xx])
y=np.hstack([y,yx+2])
fig,axs=subplots(2,1,sharex=True,sharey=True)
ax=axs[0]
_=ax.scatter(X[:,0],X[:,1],c=y,s=50,cmap='gray',marker='.',alpha=.3)
kmeans = KMeans(n_clusters=2,random_state=123,init='random')
_=kmeans.fit(X)
_=ax.set_aspect(1)
_=ax.plot(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],
          'o',color='gray',ms=15,alpha=.5)
# same setup, but with ten samples in the sparse cluster instead of three
X,y = make_blobs(centers=[[0,0],[5,0]],random_state=100,n_samples=200)
Xx,yx = make_blobs(centers=[[20,0]],random_state=100,n_samples=10)
X=np.vstack([X,Xx])
y=np.hstack([y,yx+2])
ax=axs[1]
_=ax.scatter(X[:,0],X[:,1],c=y,s=50,cmap='gray',marker='.',alpha=.3)
kmeans = KMeans(n_clusters=2,random_state=123,init='random')
_=kmeans.fit(X)
_=ax.set_aspect(1)
_=ax.plot(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],
          'o',color='gray',ms=15,alpha=.8)
fig.savefig('fig-machine_learning/clustering_004.png')
# <!-- FIGURE: fig-machine_learning/clustering_004.png <div id="fig:clustering_004"></div> -->
#
# The large circles indicate the cluster-centers found by the K-means
# algorithm.

# The problem is that the objective function for K-means is trading off the
# distance to the far-off sparse cluster against its small size. If we keep
# increasing the number of samples in the sparse cluster on the right, then
# K-means will move the cluster-centers out to meet them, as shown in
# [Figure](#fig:clustering_004). That is, if one of the initial
# cluster-centers were right in the middle of the sparse cluster, the
# algorithm would have immediately captured it and then moved the next
# cluster-center to the middle of the other two clusters (bottom panel of
# [Figure](#fig:clustering_004)). Without some thoughtful initialization,
# this may not happen and the sparse cluster is merged into the middle
# cluster (top panel of [Figure](#fig:clustering_004)). Furthermore, such
# problems are hard to visualize with high-dimensional clusters.
# Nonetheless, K-means is generally very fast, easy-to-interpret, and easy
# to understand. It is straightforward to parallelize using the `n_jobs`
# keyword argument so that many initial cluster-centers can be easily
# evaluated. Many extensions of K-means use different metrics beyond
# Euclidean and incorporate adaptive weighting of features. This enables the
# clusters to have ellipsoidal instead of spherical shapes.
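# To see this initialization sensitivity numerically, the following is a
# minimal sketch (assuming the `X` with the ten-sample sparse cluster from
# the last code block) that runs several single random initializations and
# compares the converged objective function, which Scikit-learn exposes as
# `inertia_`,

# In[ ]:

for seed in range(5):
    km = KMeans(n_clusters=2, init='random', n_init=1,
                random_state=seed).fit(X)
    print(seed, km.inertia_)  # different local minima of J are possible

# Increasing `n_init` keeps the best of many such restarts, which is why a
# careful default initialization rarely exhibits this instability.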