SharedInício.ipynbOpen in CoCalc
Intro naive ao PCA

Scikit Learn is an advanced machine learning library.

Its main design idea is to fit a prediction model to a dataset through an API that is used consistently across all methods.

In the first example here, we give the classical K-Means algorithm a hard time: 5 random blobs but only 3 clusters to look for.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Generate 1000 sample points around 5 blob centres, then deliberately ask
# K-Means for only 3 clusters so that some blobs have to be merged.
X, y = make_blobs(n_samples=1000, centers=5)
kmeans = KMeans(n_clusters=3)
y_pred = kmeans.fit_predict(X)
# Scatter the first two coordinates, coloured by the cluster K-Means assigned.
plt.scatter(X[:, 0], X[:, 1], c=y_pred)

<matplotlib.collections.PathCollection at 0x7f931cb6c470>

Setup

import numpy as np
from sklearn.decomposition import PCA
# Six 2-D sample points, one per row.
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
])
# Keep both components so nothing is discarded by the projection.
pca = PCA(n_components=2)
pca.fit(X)
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,  svd_solver='auto', tol=0.0, whiten=False)
# Echo the input samples, followed by two per-component statistics of the fitted PCA.
print(X)
# Fraction of the total variance attributed to each principal component.
print(pca.explained_variance_ratio_)
# Singular values corresponding to each principal component.
print(pca.singular_values_)

[[-1 -1] [-2 -1] [-3 -2] [ 1 1] [ 2 1] [ 3 2]] [ 0.99244289 0.00755711] [ 6.30061232 0.54980396]
A = np.random.randn(9, 6)

A

array([[ 0.14951303, -2.07990586, -0.11172609, 0.20282039, -0.68910226, -1.9354933 ], [ 0.55589371, -0.36440374, -0.49965389, 1.75017318, 0.60128138, 0.8083439 ], [ 1.59974156, 1.38593198, 0.04118741, 1.58465053, -0.32814035, 0.38628408], [ 2.34546148, -0.84474695, -0.13565193, 0.36467729, -1.06126775, -0.73609866], [ 0.67086539, 1.29918519, -0.66543366, -0.36079058, 0.49720726, -1.09133919], [-1.27017016, 0.91480503, 1.55474907, -2.34989812, -2.03220673, -0.01429826], [ 0.20249946, 0.46564405, -0.96683714, -0.28651864, 0.29876689, 0.0228112 ], [-0.45950656, 1.84434408, -0.29457464, 0.54662151, 0.88931201, -0.38255987], [ 0.65336634, -1.08585497, -1.43459617, -0.02805966, -0.83315637, 0.64720964]])
from sympy import pprint

pprint(A)

[[ 0.14951303 -2.07990586 -0.11172609 0.20282039 -0.68910226 -1.9354933 ] [ 0.55589371 -0.36440374 -0.49965389 1.75017318 0.60128138 0.8083439 ] [ 1.59974156 1.38593198 0.04118741 1.58465053 -0.32814035 0.38628408] [ 2.34546148 -0.84474695 -0.13565193 0.36467729 -1.06126775 -0.73609866] [ 0.67086539 1.29918519 -0.66543366 -0.36079058 0.49720726 -1.09133919] [-1.27017016 0.91480503 1.55474907 -2.34989812 -2.03220673 -0.01429826] [ 0.20249946 0.46564405 -0.96683714 -0.28651864 0.29876689 0.0228112 ] [-0.45950656 1.84434408 -0.29457464 0.54662151 0.88931201 -0.38255987] [ 0.65336634 -1.08585497 -1.43459617 -0.02805966 -0.83315637 0.64720964]]


# Full SVD of A. With full_matrices=True both outer factors are square.
# NOTE: np.linalg.svd returns the right factor already (conjugate-)transposed,
# so the name V here actually holds V^H; s is a 1-D array of singular values,
# not a matrix.
U, s, V = np.linalg.svd(A, full_matrices=True)
# Display the three shapes to make the factor dimensions explicit.
U.shape, V.shape, s.shape

((9, 9), (6, 6), (6,))
pprint(s)

[ 4.70459985 4.11279482 2.89975439 2.49793917 1.95932535 0.9420679 ]
# Build the 9x6 singular-value matrix: diag(s) (6x6) on top and three all-zero
# rows underneath, so that it conforms with U (9x9) on the left and V (6x6)
# on the right.
S = np.vstack([np.diag(s), np.zeros((3, 6))])

pprint(S)

[[ 4.70459985 0. 0. 0. 0. 0. ] [ 0. 4.11279482 0. 0. 0. 0. ] [ 0. 0. 2.89975439 0. 0. 0. ] [ 0. 0. 0. 2.49793917 0. 0. ] [ 0. 0. 0. 0. 1.95932535 0. ] [ 0. 0. 0. 0. 0. 0.9420679 ] [ 0. 0. 0. 0. 0. 0. ] [ 0. 0. 0. 0. 0. 0. ] [ 0. 0. 0. 0. 0. 0. ]]
type(S)

numpy.ndarray
from numpy import matmul

# Sanity check: the product U * S * V should reproduce A.
# V is used as-is (no transpose) because np.linalg.svd already returns the
# transposed right factor.
pprint(matmul(U,matmul(S, V)))

[[ 0.14951303 -2.07990586 -0.11172609 0.20282039 -0.68910226 -1.9354933 ] [ 0.55589371 -0.36440374 -0.49965389 1.75017318 0.60128138 0.8083439 ] [ 1.59974156 1.38593198 0.04118741 1.58465053 -0.32814035 0.38628408] [ 2.34546148 -0.84474695 -0.13565193 0.36467729 -1.06126775 -0.73609866] [ 0.67086539 1.29918519 -0.66543366 -0.36079058 0.49720726 -1.09133919] [-1.27017016 0.91480503 1.55474907 -2.34989812 -2.03220673 -0.01429826] [ 0.20249946 0.46564405 -0.96683714 -0.28651864 0.29876689 0.0228112 ] [-0.45950656 1.84434408 -0.29457464 0.54662151 0.88931201 -0.38255987] [ 0.65336634 -1.08585497 -1.43459617 -0.02805966 -0.83315637 0.64720964]]
pprint(A)

[[ 0.14951303 -2.07990586 -0.11172609 0.20282039 -0.68910226 -1.9354933 ] [ 0.55589371 -0.36440374 -0.49965389 1.75017318 0.60128138 0.8083439 ] [ 1.59974156 1.38593198 0.04118741 1.58465053 -0.32814035 0.38628408] [ 2.34546148 -0.84474695 -0.13565193 0.36467729 -1.06126775 -0.73609866] [ 0.67086539 1.29918519 -0.66543366 -0.36079058 0.49720726 -1.09133919] [-1.27017016 0.91480503 1.55474907 -2.34989812 -2.03220673 -0.01429826] [ 0.20249946 0.46564405 -0.96683714 -0.28651864 0.29876689 0.0228112 ] [-0.45950656 1.84434408 -0.29457464 0.54662151 0.88931201 -0.38255987] [ 0.65336634 -1.08585497 -1.43459617 -0.02805966 -0.83315637 0.64720964]]
U.shape[0]

9
def my_svd(A):
    """Return the full SVD of a 2-D matrix with the singular values as a matrix.

    ``np.linalg.svd`` hands back the singular values as a 1-D array. This
    wrapper places them on the main diagonal of an ``(m, n)`` matrix ``S`` so
    that ``U @ S @ Vh`` reproduces ``A`` directly.

    Parameters
    ----------
    A : array_like, shape (m, n)
        Matrix to decompose. Anything ``np.asanyarray`` accepts (nested
        lists, ndarrays, ...) is allowed.

    Returns
    -------
    U : ndarray, shape (m, m)
        Left singular vectors, stored as columns.
    S : ndarray, shape (m, n)
        Rectangular matrix that is zero everywhere except for the singular
        values (in descending order) on its main diagonal.
    Vh : ndarray, shape (n, n)
        Right factor as returned by ``np.linalg.svd``, i.e. already
        (conjugate-)transposed. Do not transpose it again when rebuilding
        ``A``.

    Raises
    ------
    ValueError
        If ``A`` is not two-dimensional.
    """
    A = np.asanyarray(A)
    if A.ndim != 2:
        raise ValueError(
            "my_svd expects a 2-D matrix, got an array with %d dimension(s)" % A.ndim
        )

    m, n = A.shape
    U, s, Vh = np.linalg.svd(A, full_matrices=True)

    # s has min(m, n) entries; embed them in the top-left square of S.
    k = s.shape[0]
    S = np.zeros((m, n))
    S[:k, :k] = np.diag(s)
    return U, S, Vh

import numpy as np
A = np.random.randn(3, 5)

my_svd(A)

(array([[-0.61065421, 0.29544672, 0.73471945], [ 0.04664036, 0.93960405, -0.33907065], [-0.79052268, -0.17278734, -0.58755291]]), array([[ 4.20973742, 0. , 0. , 0. , 0. ], [ 0. , 2.76957395, 0. , 0. , 0. ], [ 0. , 0. , 1.86906999, 0. , 0. ]]), array([[-0.27331948, 0.33380402, -0.02231331, 0.79648941, -0.42305801], [ 0.11214579, -0.51175169, 0.7833416 , 0.10645057, -0.31713994], [ 0.47375278, 0.03935592, 0.16734891, 0.51790589, 0.69121432], [ 0.28391369, -0.67456705, -0.59820857, 0.25561576, -0.20287781], [-0.77953262, -0.41242634, 0.00434816, 0.14389915, 0.44889591]]))
uns=np.ones((3,1))

uns

array([[ 1.], [ 1.], [ 1.]])
uns.shape

(3, 1)
uns.transpose()

array([[ 1., 1., 1.]])
from numpy import matmul

matmul(uns, uns.transpose())

array([[ 1., 1., 1.], [ 1., 1., 1.], [ 1., 1., 1.]])
Uns=np.ones((3,3))

Uns

array([[ 1., 1., 1.], [ 1., 1., 1.], [ 1., 1., 1.]])
# Fresh random 5x3 data matrix; n is its number of columns.
A = np.random.randn(5, 3)
n = A.shape[1]

# Mean of every row as a (5, 1) column vector: multiplying by a column of
# ones sums each row, and the 1/n factor turns the sum into an average.
column_of_ones = np.ones((n, 1))
media = (1 / n) * (A @ column_of_ones)

media

array([[-0.14710569], [-0.93993508], [ 0.17662994], [ 0.02092006], [ 0.82216025]])
media.shape

(5, 1)

$A$ é a matriz dos atributos; $A_c$ é a matriz centrada: $A_c = A - \mathrm{media}\,[1, \dots, 1]$, onde $\mathrm{media}$ é o vetor-coluna com a média de cada linha de $A$.

# Centre the data: media times a row of ones replicates the column vector of
# row means across all n columns, and that matrix is subtracted from A.
row_of_ones = np.ones((1, n))
Ac = A - media @ row_of_ones

Ac

array([[-1.92334498, 0.9847132 , 0.93863178], [-0.25581362, -0.85593803, 1.11175164], [ 0.78284708, -1.61886148, 0.8360144 ], [ 0.8300604 , -0.58065568, -0.24940472], [ 0.13466784, -0.1774775 , 0.04280966]])

Covariância $\Sigma = \frac{1}{n} A_c A_c^T.$

# Covariance matrix of the centred data, using the 1/n normalisation.
Sigma = (1 / n) * (Ac @ Ac.T)
Sigma

array([[ 1.84998187, 0.23089659, -0.77169653, -0.80079034, -0.13119822], [ 0.23089659, 0.67802074, 0.70494085, 0.00246281, 0.05501786], [-0.77169653, 0.70494085, 1.31082737, 0.46043518, 0.14284177], [-0.80079034, 0.00246281, 0.46043518, 0.362788 , 0.06805295], [-0.13119822, 0.05501786, 0.14284177, 0.06805295, 0.01715545]])
my_svd(Sigma)

(array([[-0.74180012, -0.46975394, 0.2690093 , -0.38937913, 0.07128612], [ 0.10673064, -0.6721524 , 0.16727602, 0.70934079, -0.07532667], [ 0.55543022, -0.56126746, -0.25057434, -0.55935268, -0.02851645], [ 0.35253976, 0.10655834, 0.91456776, -0.16694197, 0.00756512], [ 0.0745261 , -0.03415468, -0.02076163, 0.06689177, 0.99416989]]), array([[ 2.78833162e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 1.43044182e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00, 1.21600495e-16, 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.96208428e-17, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.86762520e-18]]), array([[-0.74180012, 0.10673064, 0.55543022, 0.35253976, 0.0745261 ], [-0.46975394, -0.6721524 , -0.56126746, 0.10655834, -0.03415468], [ 0.3929592 , -0.10278289, -0.03062826, 0.91275679, -0.03091438], [ 0.26133121, -0.71692996, 0.61280343, -0.17624886, -0.10547853], [ 0.07970496, -0.11075571, 0.00315661, -0.01313114, 0.99055434]]))
# Keep only the singular-value matrix of Sigma; the two outer factors are discarded.
# Ac has just 3 columns and each of its rows sums to zero, so Sigma has rank
# at most 2; any further diagonal entries are round-off noise.
_, Sing, _ = my_svd(Sigma)

Sing

array([[ 2.78833162e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 1.43044182e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00, 1.21600495e-16, 0.00000000e+00, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.96208428e-17, 0.00000000e+00], [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.86762520e-18]])