| Hosted by CoCalc | Download
Kernel: Python 3 (system-wide)
# Imports: array math, plotting, SciPy hierarchical clustering, sklearn clustering.
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# Fake data: 21 exam grades (percent), listed in descending order.
gradesy = np.array([94.28379291, 86.98982195, 86.63611546, 85.7203357,
                    84.52017914, 83.11406572, 82.5180994, 80.0522343,
                    77.76367201, 76.87708018, 74.87290704, 74.43641397,
                    70.92109845, 70.74501915, 69.955087, 68.72333003,
                    67.43377364, 65.31794233, 65.19391681, 64.07626518,
                    46.06843416])

# The data is really 1-D, but build (n, 2) points with x = 0 so the same
# array feeds both the clustering and a 2-D scatter plot.
# np.column_stack replaces the original concatenate + two manual reshapes.
gradesx = np.zeros(gradesy.size)
grades = np.column_stack((gradesx, gradesy))

# Plot the fake data before clustering to see the input.
plt.scatter(grades[:, 0], grades[:, 1], s=100)
<matplotlib.collections.PathCollection at 0x7f822b9a7f28>
Image in a Jupyter notebook
# Dendrogram of the Ward linkage: visual aid for choosing a cluster count.
dendrogram = sch.dendrogram(sch.linkage(grades, method='ward'))

# Configure bottom-up (agglomerative) clustering with the chosen count.
# FIX: the original passed affinity='euclidean', a keyword deprecated in
# scikit-learn 1.2 and removed in 1.4. Ward linkage only supports Euclidean
# distance (and 'euclidean' was the default), so omitting it is
# behavior-identical on old versions and keeps the cell working on new ones.
clusters = 11
hc = AgglomerativeClustering(n_clusters=clusters, linkage='ward')
Image in a Jupyter notebook
# 2-D scatter of the clusters, colored by label. (All x-values are zero, so
# this is really a 1-D strip — not the best visualization, but it shows
# membership at a glance.)
y_hc = hc.fit_predict(grades)
for i in range(clusters):  # range(0, n) is just range(n)
    plt.scatter(grades[y_hc == i, 0], grades[y_hc == i, 1], s=100)
    # Print the grade values assigned to cluster i (column 1 holds grades).
    print("Cluster ", i, " ", grades[y_hc == i, 1])
Cluster 0 [65.31794233 65.19391681 64.07626518] Cluster 1 [68.72333003 67.43377364] Cluster 2 [85.7203357 84.52017914] Cluster 3 [70.92109845 70.74501915 69.955087 ] Cluster 4 [86.98982195 86.63611546] Cluster 5 [46.06843416] Cluster 6 [74.87290704 74.43641397] Cluster 7 [83.11406572 82.5180994 ] Cluster 8 [77.76367201 76.87708018] Cluster 9 [94.28379291] Cluster 10 [80.0522343]
Image in a Jupyter notebook