
Demonstration of k-means and hierarchical clustering

Information on the iris data here: https://en.wikipedia.org/wiki/Iris_flower_data_set

Information on the sklearn machine learning library here: http://scikit-learn.org/

We evaluate the different clustering methods using the adjusted Rand index (ARI): it equals 1 when a clustering agrees perfectly with the true labels, is close to 0 for a random labelling, and can even be slightly negative. This evaluation is only possible when the true class labels are known.
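As a quick illustration, the adjusted_rand_score function from sklearn.metrics (the function used throughout this notebook) is invariant to how the clusters are numbered:

from sklearn.metrics import adjusted_rand_score

# identical partitions score 1, even with the label values swapped
print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))   # 1.0
# a maximally disagreeing partition can score below 0
print(adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]))   # -0.5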

# from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import visualisation
import clustering
# use PCA from sklearn
from sklearn.decomposition import PCA
# plotly
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=False)

doInteractive = True

def interactivePlots(fig, axes):
    # helper function to decide whether to use plotly interactive plots or not
    if doInteractive:
        plotly.offline.iplot_mpl(fig, show_link=False, strip_style=True)  # offline ipython notebook

Import Iris dataset from sklearn and put into a pandas DataFrame

# Iris data
from sklearn import datasets

data = datasets.load_iris()
irislabels = np.array(['setosa']*50 + ['versicolor']*50 + ['virginica']*50)
irisdata = pd.DataFrame(data.data,
                        columns=['Sepal length', 'Sepal width', 'Petal length', 'Petal width'],
                        index=irislabels)
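Note that the hard-coded irislabels array relies on load_iris returning the samples in species order, 50 of each. A short sketch that derives the same labels from the dataset's own target encoding instead, avoiding that assumption:

# build the label array from data.target (integers 0-2) and data.target_names
irislabels = np.array([data.target_names[t] for t in data.target])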

Data summary: rows are samples, columns are features

print('data shape\n', irisdata.shape)
irisdata.head()

data shape
 (150, 4)
Sepal length Sepal width Petal length Petal width
setosa 5.1 3.5 1.4 0.2
setosa 4.9 3.0 1.4 0.2
setosa 4.7 3.2 1.3 0.2
setosa 4.6 3.1 1.5 0.2
setosa 5.0 3.6 1.4 0.2

Calculate PCA so that we can visualise the clusters of data in 2D

We will only use PCA to help visualise the data; it plays no role in any of the clustering algorithms used here.

Wt, scores, fracs = visualisation.do_pca(irisdata)
scores = scores / abs(scores).max().max()  # rescale the scores so the largest magnitude is 1
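visualisation is a local course module, so do_pca is not available outside this notebook. A minimal stand-in using sklearn.decomposition.PCA; the return format (loadings, a scores DataFrame, explained-variance fractions) is an assumption based on how the values are used below:

# hypothetical equivalent of visualisation.do_pca
def do_pca_sketch(df):
    pca = PCA()                            # PCA was imported from sklearn above
    scores = pca.fit_transform(df.values)  # project samples onto the components
    scores = pd.DataFrame(scores, index=df.index,
                          columns=['PC%d' % (i + 1) for i in range(scores.shape[1])])
    return pca.components_, scores, pca.explained_variance_ratio_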

Run the k-means algorithm

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=0).fit(irisdata.values)
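Besides the labels used below, the fitted KMeans object exposes the cluster centroids and the within-cluster sum of squares (the quantity k-means minimises), both useful sanity checks:

# one centroid per cluster, in the original 4-dimensional feature space
print(kmeans.cluster_centers_)
# total within-cluster sum of squared distances to the closest centroid
print(kmeans.inertia_)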

Return the cluster labels for each data item

kmeanslabels = kmeans.labels_
print(kmeanslabels)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2 2 0]

Compare cluster labels and class labels on the PCA projection

Note that the clustering was not done in the PCA space; it was done in the original 4-dimensional data space.

What does the Adjusted Rand Index tell us here?
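Before turning to the interactive plot below, the index can be computed directly. The ARI compares partitions, so it does not matter that k-means numbers its clusters 0-2 while the true labels are species names:

from sklearn.metrics import adjusted_rand_score

# agreement between the k-means partition and the true species labels
print(adjusted_rand_score(kmeanslabels, irislabels))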

from __future__ import print_function
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from sklearn.metrics import adjusted_rand_score

def plotinteractPCA(feature='kmeans', XaxisPC=1, YaxisPC=2):
    traceGene = list()
    if feature == 'class' or feature == 'kmeans':
        labelsPlot = np.unique(kmeanslabels)
        labelsAll = kmeanslabels
        if feature == 'class':
            labelsPlot = np.unique(irislabels)
            labelsAll = irislabels
        ari = adjusted_rand_score(labelsAll, irislabels)
        for lbl in labelsPlot:
            traceGene.append(go.Scatter(
                x=scores[labelsAll == lbl].iloc[:, XaxisPC-1],
                y=scores[labelsAll == lbl].iloc[:, YaxisPC-1],
                mode='markers',
                name=lbl,
                text=lbl))
        layout = go.Layout(showlegend=True,
                           title='Interactive PCA plot, AdjRand Index %.2f' % ari,
                           annotations=list(),
                           legend={'orientation': 'h'})
    else:
        # colour the scores by the selected feature; XaxisPC-1/YaxisPC-1 to
        # match the 1-based sliders, as in the branch above
        traceGene.append(go.Scatter(
            x=scores.iloc[:, XaxisPC-1],
            y=scores.iloc[:, YaxisPC-1],
            mode='markers',
            marker={'color': irisdata[feature].values, 'size': 10,
                    'colorscale': 'Viridis', 'colorbar': {'title': 'expression'}},
            text=irislabels,
            name='Expression'))
        layout = go.Layout(showlegend=True,
                           title='Interactive PCA plot',
                           annotations=list(),
                           legend={'orientation': 'h'})
    fig = go.Figure(data=traceGene, layout=layout)
    iplot(fig)

_ = interact(plotinteractPCA,
             feature=['kmeans', 'class'] + list(irisdata.columns.values),
             XaxisPC=(1, 4), YaxisPC=(2, 4))
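The plot above requires a frontend with ipywidgets support. Where widgets are unavailable, for example in a static render of the notebook, a plain matplotlib version of the same comparison can be sketched as follows, using only the variables defined above:

fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True, sharey=True)
for ax, (labels, name) in zip(axes, [(kmeanslabels, 'k-means'), (irislabels, 'class')]):
    for lbl in np.unique(labels):
        # select the PC1/PC2 scores of the points carrying this label
        ax.scatter(scores[labels == lbl].iloc[:, 0],
                   scores[labels == lbl].iloc[:, 1], label=str(lbl), s=15)
    ax.set_title(name)
    ax.set_xlabel('PC1')
    ax.legend()
axes[0].set_ylabel('PC2')
plt.show()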


Hierarchical clustering

For a very nice tutorial see this page.

from scipy.cluster.hierarchy import dendrogram, linkage

np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation
%matplotlib inline

# generate the linkage matrix
X = irisdata.values
print('Shape', X.shape)
Z = linkage(X, method='ward', metric='euclidean')
print(Z)
Shape (150, 4)
[[  9.       34.        0.        2.     ]
 [ 37.      150.        0.        3.     ]
 [101.      142.        0.        2.     ]
 [  7.       39.        0.1       2.     ]
 [  0.       17.        0.1       2.     ]
 ...
 [292.      293.        6.39941  64.     ]
 [295.      296.       12.3004  100.     ]
 [294.      297.       32.42801 150.     ]]

For an explanation of other linkage methods and metrics see this page.

For an explanation of the different distance metrics see here.
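The metric choices offered in the widget further below come from scipy.spatial.distance. As a quick illustration of how the choice of metric changes the pairwise distances that linkage works from:

from scipy.spatial.distance import pdist

# condensed pairwise distances between the first three samples
print(pdist(irisdata.values[:3], metric='euclidean'))
print(pdist(irisdata.values[:3], metric='cityblock'))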

Let us now look at the linkage matrix:

Z[:5,:]
array([[  9. ,  34. ,   0. ,   2. ],
       [ 37. , 150. ,   0. ,   3. ],
       [101. , 142. ,   0. ,   2. ],
       [  7. ,  39. ,   0.1,   2. ],
       [  0. ,  17. ,   0.1,   2. ]])

The last column gives the size of the newly formed cluster at each step of the algorithm. Already in the second step we see a cluster of three points.

The first two columns are the indices of the merged items. So in the first step, point 9 and point 34 are merged into a single cluster. Any index at or above the data size (here we have 150 points) denotes an existing cluster rather than an original point. The second row shows 37 and 150 being merged, but point 150 does not exist, since the data indices run from 0 to 149 (remember that Python uses 0-based counting!). So this row denotes the merging of point 37 with a new 'point' 150, which represents the cluster containing 9 and 34. Every time a cluster is formed it is given the next index, so the cluster containing 9, 34 and 37 is numbered 151, and so on.
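To recover which original points an internal cluster index refers to, scipy's to_tree function can be used. For example, cluster 151 should contain points 9, 34 and 37:

from scipy.cluster.hierarchy import to_tree

# rd=True also returns a list of ClusterNode objects indexed by cluster id
root, nodelist = to_tree(Z, rd=True)
print(sorted(nodelist[151].pre_order()))   # [9, 34, 37]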

Now let us plot a dendrogram summarising the clustering.

For a particular cutoff a specific number of clusters is defined. We show these clusters on the PCA plot below the dendrogram.
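The plotting code below cuts the tree with a local course helper (clustering.GetClusters); the standard scipy equivalent is fcluster, which assigns a flat cluster label to every point given a distance threshold:

from scipy.cluster.hierarchy import fcluster

# cut the (unnormalised) ward tree at distance 10, giving three clusters here
flat_labels = fcluster(Z, t=10, criterion='distance')
print(np.unique(flat_labels))
print(adjusted_rand_score(flat_labels, irislabels))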

def plotinteractHierarchicalClustering(method='ward', metric='euclidean',
                                       XaxisPC=1, YaxisPC=2, distanceThreshold=0.3):
    if method in ['centroid', 'median', 'ward'] and metric != 'euclidean':
        raise NameError("Methods 'centroid', 'median' and 'ward' are correctly defined "
                        "only if the Euclidean pairwise metric is used.")
    X = irisdata.values.copy()
    Z = linkage(X, method=method, metric=metric)
    Z[:, 2] = Z[:, 2] / Z[:, 2].max()  # rescale distances to [0, 1]
    plt.figure(figsize=(25, 10))
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    ax = plt.gca()
    ax.tick_params(axis='y', which='major', labelsize=20)
    ax.set_ylabel('Distance', fontsize=20)
    ax.set_xlabel('Sample Index', fontsize=20)
    ax.set_title('Hierarchical Clustering Dendrogram', fontsize=20)
    plt.axhline(y=distanceThreshold, c='k')
    plt.show()
    # do clustering plot
    traceGene = list()
    cid = clustering.GetClusters(Z.copy(), X.copy(), distanceThreshold)
    labelsPlot = np.unique(cid)
    labelsAll = cid
    ari = adjusted_rand_score(labelsAll, irislabels)
    for lbl in labelsPlot:
        traceGene.append(go.Scatter(
            x=scores[labelsAll == lbl].iloc[:, XaxisPC-1],
            y=scores[labelsAll == lbl].iloc[:, YaxisPC-1],
            mode='markers',
            name=lbl,
            text=lbl))
    layout = go.Layout(showlegend=True,
                       title='Threshold=%.2f, Number of clusters=%g, AdjRand Index %.2f'
                             % (distanceThreshold, labelsPlot.size, ari),
                       annotations=list(),
                       legend={'orientation': 'h'})
    fig = go.Figure(data=traceGene, layout=layout)
    iplot(fig)

_ = interact(plotinteractHierarchicalClustering,
             method=['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median'],
             metric=['euclidean', 'minkowski', 'cityblock', 'seuclidean', 'sqeuclidean',
                     'cosine', 'correlation', 'hamming', 'jaccard', 'chebyshev',
                     'canberra', 'braycurtis', 'mahalanobis', 'wminkowski'],
             XaxisPC=(1, 4), YaxisPC=(2, 4), distanceThreshold=(0.0, 1.0, 0.01))
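One way to compare linkage methods quantitatively is the cophenetic correlation coefficient, which measures how faithfully the dendrogram distances reflect the original pairwise distances (closer to 1 is better); a short sketch using scipy:

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# correlation between original pairwise distances and the dendrogram's
# cophenetic distances for the ward linkage
c, coph_dists = cophenet(linkage(irisdata.values, method='ward'), pdist(irisdata.values))
print(c)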
