
Seminar + assignment (at the end)

Kernel: Python 3 (Anaconda 5)

K-means
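Before reaching for the scikit-learn implementation, recall what K-means does: it alternates between assigning every point to its nearest centroid and moving each centroid to the mean of the points assigned to it, until the centroids stop changing. A minimal NumPy sketch of this Lloyd iteration (the function and its parameters are illustrative, not part of the seminar code; empty clusters are not handled):

import numpy as np

def kmeans_sketch(X, k, n_iter=100, seed=0):
    rng = np.random.RandomState(seed)
    # Initialize centroids as k distinct points drawn from the data
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Assignment step: index of the nearest centroid for every point
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Update step: each centroid moves to the mean of its assigned points
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers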

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cluster as skcl
attr_names = """sepal length
sepal width
petal length
petal width
class"""
attr_names = attr_names.split("\n")
iris_df = pd.read_csv("data/iris.data", sep=',', names=attr_names)
class_num = iris_df["class"]
class_names = list(class_num.unique())
print(class_names)
# Encode each class name as an integer 1..3
class_num = class_num.apply(lambda val: class_names.index(val) + 1)
['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
X1 = iris_df[attr_names[0]].values
X2 = iris_df[attr_names[1]].values
X3 = iris_df[attr_names[2]].values
X4 = iris_df[attr_names[3]].values
X = np.c_[X1, X2, X3, X4]
print(X.shape)
(150, 4)
Y = class_num.values
# Cluster on two features only: sepal width and petal length
X23 = np.c_[X2, X3]
cl = skcl.KMeans(n_clusters=3)
cl.fit(X23)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
Z = cl.predict(X23)
plt.figure(figsize=(7,3))
# Left panel: true classes; right panel: K-means clusters, centroids in red
plt.subplot(1,2,1)
plt.scatter(X2, X3, c=Y, edgecolors='k')
plt.subplot(1,2,2)
plt.scatter(X2, X3, c=Z, edgecolors='k')
centers = cl.cluster_centers_
plt.scatter(centers[:,0], centers[:,1], c='r', s=81)
plt.tight_layout()
plt.show()
[Figure: left — true iris classes; right — K-means clusters (k=3) with red centroids, in the (sepal width, petal length) plane]
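Note that K-means cluster indices are arbitrary, so the colors in the two panels need not match even when the grouping is essentially the same. To compare the partition against the true labels without worrying about relabeling, a permutation-invariant measure such as the adjusted Rand index can be used (an illustrative sketch; the seminar itself does not compute this):

import sklearn.metrics as metrics

# Adjusted Rand index: 1.0 for identical partitions up to relabeling, ~0 for random ones
print(metrics.adjusted_rand_score(Y, Z))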
import sklearn.decomposition as decomposition
pca = decomposition.PCA(n_components=2)
U = pca.fit_transform(X)
cl2 = skcl.KMeans(n_clusters=3)
cl2.fit(U)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
Z2 = cl2.predict(U)
U1, U2 = U[:,0], U[:,1]
plt.figure(figsize=(7,3))
plt.subplot(1,2,1)
plt.scatter(U1, U2, c=Y, edgecolors='k')
plt.subplot(1,2,2)
plt.scatter(U1, U2, c=Z2, edgecolors='k')
centers2 = cl2.cluster_centers_
plt.scatter(centers2[:,0], centers2[:,1], c='r', s=81)
plt.tight_layout()
plt.show()
[Figure: true classes vs. K-means clusters (k=3) in the plane of the first two principal components, centroids in red]
cl3 = skcl.KMeans(n_clusters=2)
cl3.fit(U)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
Z3 = cl3.predict(U)
U1, U2 = U[:,0], U[:,1]
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.scatter(U1, U2, c=Y, edgecolors='k')
plt.subplot(1,2,2)
plt.scatter(U1, U2, c=Z3, edgecolors='k')
centers3 = cl3.cluster_centers_
plt.scatter(centers3[:,0], centers3[:,1], c='r', s=81)
plt.tight_layout()
plt.show()
[Figure: true classes vs. K-means clusters (k=2) in the PCA plane, centroids in red]
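The notebook tries both n_clusters=3 and n_clusters=2. A common heuristic for choosing the number of clusters is the elbow method: plot the within-cluster sum of squared distances (exposed by scikit-learn as KMeans.inertia_) against k and look for the bend. A sketch using the notebook's existing imports (not part of the original seminar):

inertias = []
ks = range(1, 8)
for k in ks:
    km = skcl.KMeans(n_clusters=k)
    km.fit(U)
    inertias.append(km.inertia_)  # within-cluster sum of squared distances

plt.figure(figsize=(5, 3))
plt.plot(list(ks), inertias, marker='o')
plt.xlabel("n_clusters")
plt.ylabel("inertia")
plt.grid(True)
plt.show()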

Task

Split the data from the wine dataset into clusters (the number of clusters equals the number of classes). Before clustering, find the principal components; perform the clustering with respect to the selected principal components.

attrs = ["class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
         "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins",
         "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"]
wine_df = pd.read_csv("data/wine.data", sep=",", names=attrs)
class_num = wine_df["class"]
class_names = list(class_num.unique())
print(class_names)
class_num = class_num.apply(lambda val: class_names.index(val) + 1)
[1, 2, 3]
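The task ties the number of clusters to the number of classes, so the cluster count can be read off the data rather than hard-coded. A small illustrative snippet (the solution below simply uses the literal 3):

n_clusters = len(class_names)  # 3 for the wine dataset, matching the class count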
# Take the 13 feature columns and min-max scale each one to [0, 1],
# so that no single attribute dominates the principal components
X = wine_df[attrs[1:]].values.astype(float)
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
print(X.shape)
Y = class_num.values
# Fit a full 13-component PCA first to inspect the variance spectrum
pca = decomposition.PCA(n_components=13)
U = pca.fit_transform(X)
pca.explained_variance_ratio_
(178, 13)
array([0.40749485, 0.18970352, 0.08561671, 0.07426678, 0.05565301, 0.04658837, 0.03663929, 0.02408789, 0.02274371, 0.02250965, 0.01381292, 0.01273236, 0.00815095])
plt.figure(figsize=(5,3))
plt.bar(range(1, 14), pca.explained_variance_ratio_)
plt.grid(True)
plt.minorticks_on()
plt.show()
[Figure: bar chart of the explained variance ratio for each of the 13 principal components]
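The first two components together explain roughly 0.41 + 0.19 ≈ 0.60 of the variance, which motivates projecting onto two components below. The cumulative shares can be checked directly with a one-liner (a small sketch, not part of the original notebook):

np.cumsum(pca.explained_variance_ratio_)  # cumulative variance explained by the first j components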
pca = decomposition.PCA(n_components=2)
U = pca.fit_transform(X)
cl2 = skcl.KMeans(n_clusters=3)
cl2.fit(U)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
Z2 = cl2.predict(U)
U1, U2 = U[:,0], U[:,1]
plt.figure(figsize=(7,3))
plt.subplot(1,2,1)
plt.scatter(U1, U2, c=Y, edgecolors='k')
plt.subplot(1,2,2)
plt.scatter(U1, U2, c=Z2, edgecolors='k')
centers2 = cl2.cluster_centers_
plt.scatter(centers2[:,0], centers2[:,1], c='r', s=81)
plt.tight_layout()
plt.show()
[Figure: true wine classes vs. K-means clusters (k=3) in the plane of the first two principal components, centroids in red]
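As with the iris example, the agreement between the K-means partition and the true wine classes can be quantified with a permutation-invariant score such as the adjusted Rand index (a hedged sketch, not part of the original assignment):

import sklearn.metrics as metrics

# 1.0 means the partitions coincide up to relabeling; values near 0 indicate chance-level agreement
print(metrics.adjusted_rand_score(Y, Z2))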