# basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
# general machine learning
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# unsupervised learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity
from sklearn.ensemble import IsolationForest
Dimension Reduction
X, _ = make_blobs(
    n_samples=500,
    centers=4,
    n_features=50,
    cluster_std=5,
    random_state=42,
)
X
array([[ 5.47268545, 2.36711978, -7.72797716, ..., 11.56267621,
5.3875766 , 8.30440778],
[ 1.85492413, 4.18350251, 4.24293671, ..., 6.95231996,
-0.14559701, -1.61552343],
[ -4.83822547, 8.26009757, 4.27014101, ..., 1.90689712,
1.85337817, 7.16225743],
...,
[ 22.32156843, 7.59791014, 11.85630655, ..., -7.07516877,
-14.93508452, -5.03480831],
[ 6.99270309, -11.19575062, -3.5367364 , ..., 7.54436995,
17.31625101, 0.1853057 ],
[-13.91218603, 0.6172048 , -4.80263491, ..., -7.4577458 ,
-8.18493172, -4.08366717]], shape=(500, 50))
X.shape
(500, 50)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
plt.show()
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 1],
    y=X[:, 2],
    ax=ax,
)
ax.set_xlabel(r"$x_2$")
ax.set_ylabel(r"$x_3$")
plt.show()
pca = PCA()
_ = pca.fit(X)
X_pca = pca.transform(X)
X_pca
array([[-21.0132157 , 0.58429056, -18.78127557, ..., -3.43557718,
-3.48778491, 7.13500179],
[ -8.29402824, 41.09833267, 20.51146456, ..., -0.68948175,
-6.51053673, 2.54956227],
[ -6.5519595 , 30.98821707, 9.65805067, ..., 1.23333536,
3.86083456, 0.83170031],
...,
[-14.29741386, -37.31812582, 36.25339557, ..., 2.14258928,
1.06942156, -2.05336295],
[-20.74836199, 2.01953335, -37.94579109, ..., -0.77644235,
4.32287409, -3.04784811],
[ 43.92071368, -2.57280644, -8.61724191, ..., -1.65243415,
-9.02562484, 6.99464932]], shape=(500, 50))
X_pca[:10, :3]
array([[-21.0132157 , 0.58429056, -18.78127557],
[ -8.29402824, 41.09833267, 20.51146456],
[ -6.5519595 , 30.98821707, 9.65805067],
[ 44.20977982, -10.11196016, 1.68990725],
[ 38.05703987, -14.70385106, -11.51850903],
[ 41.83695478, 1.76656913, -8.26814285],
[ 24.56272454, -0.90306868, -8.59899694],
[ -5.68329991, 26.30511016, 10.74802756],
[ -3.13380174, 27.50701824, 12.32952769],
[ 47.81897616, -8.86343347, -9.26382513]])
X_pca.shape
(500, 50)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    ax=ax,
)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 1],
    y=X_pca[:, 2],
    ax=ax,
)
ax.set_xlabel("Principal Component 2")
ax.set_ylabel("Principal Component 3")
plt.show()
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 2],
    y=X_pca[:, 3],
    ax=ax,
)
ax.set_xlabel("Principal Component 3")
ax.set_ylabel("Principal Component 4")
plt.show()
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 9],
    y=X_pca[:, 10],
    ax=ax,
)
ax.set_xlabel("Principal Component 10")
ax.set_ylabel("Principal Component 11")
plt.show()
# center the data
X_centered = X - np.mean(X, axis=0)

# compute the covariance matrix
cov_matrix = np.cov(X_centered, rowvar=False)

# compute the eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# sort the eigenvectors by eigenvalues in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]

# project the data
X_pca_numpy = np.dot(X_centered, eigenvectors)

# print the first 10 rows of the first three principal components
X_pca_numpy[:10, :3]
array([[-21.0132157 , -0.58429056, -18.78127557],
[ -8.29402824, -41.09833267, 20.51146456],
[ -6.5519595 , -30.98821707, 9.65805067],
[ 44.20977982, 10.11196016, 1.68990725],
[ 38.05703987, 14.70385106, -11.51850903],
[ 41.83695478, -1.76656913, -8.26814285],
[ 24.56272454, 0.90306868, -8.59899694],
[ -5.68329991, -26.30511016, 10.74802756],
[ -3.13380174, -27.50701824, 12.32952769],
[ 47.81897616, 8.86343347, -9.26382513]])
np.allclose(X_pca[:, :1], X_pca_numpy[:, :1])
True
np.allclose(X_pca[:, 1:2], -X_pca_numpy[:, 1:2])
True
pca = PCA(n_components=2)
_ = pca.fit(X)
X_pca = pca.transform(X)
X_pca.shape
(500, 2)
pca = PCA(n_components=0.95)
_ = pca.fit(X)
X_pca = pca.transform(X)
X_pca.shape
(500, 42)
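Setting n_components to a value between 0 and 1 keeps the smallest number of components whose cumulative explained variance reaches that fraction. As a quick sanity check, a minimal sketch assuming the pca object fitted just above is still in scope:

# the 42 retained components should together explain at least 95% of the variance
print(len(pca.explained_variance_ratio_))
print(pca.explained_variance_ratio_.sum())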
# create a synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=50,
    n_informative=10,
    n_redundant=10,
    random_state=42,
)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# create and fit logistic regression
logistic = LogisticRegression(max_iter=1000, random_state=42)
logistic.fit(X_train, y_train)

# evaluate the logistic regression
accuracy = logistic.score(X_test, y_test)
print(f"Test Accuracy without PCA: {accuracy:.2f}")
# create a pipeline with PCA and Logistic Regression
pipeline = Pipeline(
    [
        ("pca", PCA(n_components=10)),
        ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# fit the pipeline with PCA
pipeline.fit(X_train, y_train)

# evaluate the pipeline with PCA
accuracy_with_pca = pipeline.score(X_test, y_test)
print(f"Test Accuracy with PCA: {accuracy_with_pca:.2f}")
Test Accuracy without PCA: 0.80
Test Accuracy with PCA: 0.82
Clustering
K-Means
To fit \(K\)-Means, we minimize the cost function, also known as the within-cluster sum of squares:
\[ C(\pmb x_i, \pmb\mu_1, \ldots, \pmb\mu_K, \pmb r_1, \ldots, \pmb r_K) = \sum_{i = 1}^{n}\sum_{k = 1}^{K} r_{ik} || \pmb x_i - \pmb\mu_k || ^ 2 \]
The responsibilities, \(r_{ik}\), are defined as:
\[ r_{ik} = \begin{cases} 1 & \text{if } \pmb x_i \text{ is closest to } \pmb\mu_k\\ 0 & \text{otherwise} \end{cases} \]
This means that \(r_{ik}\) is 1 if the center of cluster \(k\) is the closest center to data point \(\pmb x_i\), and 0 otherwise.
Assuming \(\pmb x_i\) is a \(p\)-dimensional vector, we have:
\[ || \pmb x_i - \pmb\mu_k || ^ 2 = (x_{i1} - \mu_{k1}) ^ 2 + \ldots + (x_{ip} - \mu_{kp}) ^ 2. \]
This quantity is the squared Euclidean distance (the squared L2 norm) between data point \(\pmb x_i\) and the center of cluster \(k\), \(\pmb\mu_k\). It measures the “closeness” of \(\pmb x_i\) to \(\pmb \mu_k\). The goal of \(K\)-Means is to minimize this distance for all data points and their assigned clusters, hence minimizing the within-cluster sum of squares.
This is easy! Simply… assign each data point to its own cluster! But that is silly and useless. So instead, we will use the Expectation–Maximization (EM) Algorithm to fit \(K\)-Means, after first choosing a value of \(K\).
To perform \(K\)-Means, first choose \(K\), the number of clusters to learn. Initialize a random center for each cluster.
EM Algorithm for \(K\)-Means
- Pre-select a value of \(K\), the number of clusters to learn.
- Randomly initialize a center for each cluster.
- Repeat the E and M steps until convergence.
- E-Step: Update the responsibilities. That is, assign each data point to the cluster that has the closest center. \[ r_{ik} = \begin{cases} 1 & \text{if } \pmb x_i \text{ is closest to } \pmb\mu_k\\ 0 & \text{otherwise} \end{cases} \]
- If there are no updates to the responsibilities, the algorithm has converged.
- M-Step: Update the cluster centers \(\pmb \mu_k\) by calculating the mean of all data points assigned to cluster \(k\). \[ \pmb\mu_k = \frac{\sum_{i=1}^n r_{ik} \pmb x_i}{\sum_{i=1}^n r_{ik}} \]
Because of the random initialization, \(K\)-Means is often run multiple times by default, and the “best” outcome (the one with the lowest cost) is chosen.
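To make the E and M steps concrete, here is a minimal from-scratch sketch in NumPy. It is an illustration only, not the implementation scikit-learn uses; the function name kmeans_em is made up, and it assumes a 2-D array X of shape (n, p) along with the np import from the top of the page.

def kmeans_em(X, K, n_iter=100, seed=0):
    # randomly initialize the centers by picking K distinct data points
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=K, replace=False)]
    for _ in range(n_iter):
        # E-step: assign each point to its closest center (the responsibilities)
        dists = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
        labels = dists.argmin(axis=1)
        # M-step: move each center to the mean of its assigned points
        # (empty clusters are not handled in this sketch)
        new_centers = np.array([X[labels == k].mean(axis=0) for k in range(K)])
        # stop once the centers (and hence the assignments) no longer change
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return centers, labels

On well-separated data its centers should land close to KMeans(n_clusters=K).fit(X).cluster_centers_, up to the ordering of the clusters.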
X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    cluster_std=1,
    random_state=3,
)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
plt.show()
km = KMeans()
_ = km.fit(X)
clusters = km.predict(X)
clusters
array([0, 2, 0, 2, 3, 1, 3, 0, 3, 3, 2, 3, 3, 4, 0, 3, 0, 0, 6, 4, 2, 2,
3, 0, 7, 3, 1, 5, 7, 2, 2, 0, 2, 2, 5, 5, 2, 3, 7, 7, 1, 4, 1, 3,
2, 5, 2, 6, 3, 5, 0, 0, 0, 3, 5, 0, 3, 2, 3, 2, 0, 0, 6, 3, 2, 4,
4, 6, 5, 7, 5, 5, 0, 0, 2, 2, 3, 2, 2, 2, 3, 7, 2, 3, 1, 5, 2, 2,
3, 2, 4, 3, 0, 1, 3, 5, 7, 5, 2, 3, 3, 2, 5, 5, 3, 0, 7, 3, 3, 5,
2, 3, 5, 7, 1, 6, 1, 1, 5, 0, 6, 1, 6, 4, 3, 7, 4, 7, 5, 3, 7, 0,
0, 0, 6, 2, 7, 4, 5, 1, 3, 1, 3, 7, 0, 7, 2, 1, 3, 5, 0, 1, 5, 5,
4, 3, 3, 2, 3, 1, 2, 3, 2, 4, 2, 2, 2, 7, 6, 5, 6, 5, 4, 0, 5, 6,
0, 3, 7, 2, 7, 3, 5, 6, 2, 3, 5, 2, 1, 0, 1, 5, 0, 4, 3, 6, 3, 5,
5, 3, 6, 7, 4, 4, 0, 6, 7, 1, 2, 5, 4, 4, 7, 7, 5, 4, 0, 2, 1, 5,
4, 2, 7, 2, 2, 0, 7, 2, 0, 2, 3, 3, 5, 2, 2, 7, 3, 7, 3, 2, 2, 2,
3, 6, 3, 7, 0, 7, 7, 1, 0, 0, 3, 5, 0, 1, 5, 3, 4, 2, 0, 0, 3, 1,
4, 0, 2, 2, 5, 0, 5, 0, 1, 3, 7, 5, 3, 3, 3, 3, 5, 4, 0, 0, 5, 1,
3, 2, 2, 4, 5, 3, 3, 3, 2, 3, 5, 2, 3, 1, 5, 3, 2, 4, 0, 4, 5, 3,
2, 6, 6, 5, 0, 1, 0, 4, 1, 2, 6, 0, 5, 5, 2, 2, 1, 4, 2, 3, 7, 2,
3, 1, 0, 0, 2, 2, 4, 2, 2, 5, 6, 1, 3, 1, 7, 2, 3, 3, 5, 2, 2, 7,
5, 1, 0, 2, 2, 1, 4, 6, 5, 0, 7, 4, 3, 1, 3, 7, 1, 6, 6, 0, 5, 3,
5, 7, 7, 0, 0, 5, 0, 5, 1, 5, 0, 6, 7, 3, 3, 3, 2, 1, 3, 1, 6, 4,
0, 2, 3, 3, 3, 7, 2, 5, 3, 5, 2, 4, 3, 7, 3, 4, 3, 2, 2, 3, 0, 0,
2, 2, 3, 3, 7, 1, 0, 7, 5, 7, 0, 7, 2, 7, 3, 2, 2, 1, 2, 0, 5, 5,
4, 7, 1, 5, 1, 2, 5, 2, 3, 0, 3, 4, 5, 5, 1, 5, 1, 7, 2, 6, 2, 3,
3, 2, 7, 3, 7, 7, 3, 6, 0, 7, 3, 3, 0, 5, 5, 7, 2, 0, 2, 5, 6, 5,
7, 2, 2, 1, 0, 5, 0, 4, 5, 3, 1, 3, 6, 5, 1, 4], dtype=int32)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
plt.show()
km3 = KMeans(n_clusters=3)
km3.fit(X)
clusters = km3.predict(X)
clusters
array([2, 1, 2, 1, 2, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 1,
2, 2, 0, 2, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 2,
1, 1, 1, 0, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 0, 2, 1, 0,
0, 0, 1, 0, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1,
2, 1, 0, 2, 2, 0, 2, 1, 0, 1, 1, 2, 2, 2, 1, 1, 2, 2, 0, 2, 2, 1,
1, 2, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 2,
2, 2, 0, 1, 0, 0, 1, 0, 2, 0, 2, 0, 2, 0, 1, 0, 2, 1, 2, 0, 1, 1,
0, 2, 2, 1, 2, 0, 1, 2, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 2, 1, 0,
2, 2, 0, 1, 0, 2, 1, 0, 1, 2, 1, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1,
1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1,
0, 1, 0, 1, 1, 2, 0, 1, 2, 1, 2, 2, 1, 1, 1, 0, 2, 0, 2, 1, 1, 1,
2, 0, 2, 0, 2, 0, 0, 0, 2, 2, 2, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0,
0, 2, 1, 1, 1, 2, 1, 2, 0, 2, 0, 1, 2, 2, 2, 2, 1, 0, 2, 2, 1, 0,
2, 1, 1, 0, 1, 2, 2, 2, 1, 2, 1, 1, 2, 0, 1, 2, 1, 0, 2, 0, 1, 2,
1, 0, 0, 1, 2, 0, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 0, 1, 2, 0, 1,
2, 0, 2, 2, 1, 1, 0, 1, 1, 1, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 1, 0,
1, 0, 2, 1, 1, 0, 0, 0, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 1, 2,
1, 0, 0, 2, 2, 1, 2, 1, 0, 1, 2, 0, 0, 2, 2, 2, 1, 0, 2, 0, 0, 0,
2, 1, 2, 2, 2, 0, 1, 1, 2, 1, 1, 0, 2, 0, 2, 0, 2, 1, 1, 2, 2, 2,
1, 1, 2, 2, 0, 0, 2, 0, 1, 0, 2, 0, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1,
0, 0, 0, 1, 0, 1, 1, 1, 2, 2, 2, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 2,
2, 1, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 1, 1, 0, 1, 2, 1, 1, 0, 1,
0, 1, 1, 0, 2, 1, 2, 0, 1, 2, 0, 2, 0, 1, 0, 0], dtype=int32)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
# check center of each learned cluster
km3.cluster_centers_
array([[ 7.67197216, 8.11134342],
[ 1.01450711, 4.20318963],
[-4.18726897, 0.27543154]])
# check cost
km3.inertia_
1005.8251148922151
cost = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X)
    cost.append(kmeans.inertia_)
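These inertia values are typically inspected with an “elbow” plot; a minimal sketch, reusing the cost list collected in the loop above:

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, 11), cost, marker="o")
ax.set_xlabel(r"$K$")
ax.set_ylabel("Cost (inertia)")
plt.show()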
Alternative Clustering Methods
X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    cluster_std=1,
    random_state=3,
)
DBSCAN
dbscan = DBSCAN()
clusters = dbscan.fit_predict(X)
clusters
array([ 0, 1, 0, -1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0,
0, -1, 2, 1, 1, 0, 0, 2, 0, 2, 1, 2, 1, -1, 0, 1, 1,
-1, 1, 1, 0, 2, 2, 2, 2, 2, 0, 1, 1, 1, 2, 0, 1, 0,
0, 0, 0, 1, 3, -1, 1, 0, 1, 0, 0, 2, 0, 1, 2, 2, -1,
1, 2, 1, 1, 0, 0, -1, -1, 0, 1, 1, 1, 0, -1, 1, 0, -1,
1, -1, 1, 0, 1, 2, 0, 3, 2, 0, 1, 2, 1, 1, 0, 0, -1,
1, 1, 0, 0, 2, 0, 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 1,
3, 2, 2, 2, 2, 0, 2, 2, 2, 1, 0, 2, 0, 0, 0, 2, 1,
2, 2, 1, 2, 0, 2, 0, 2, 0, 2, -1, 2, 0, -1, 0, 2, 1,
1, 2, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 1, 1, 2, 2, 1,
-1, 1, 2, 0, 1, 2, 3, 0, 2, 1, 2, 0, 1, 2, 1, 0, 1,
1, 2, 0, 2, 1, -1, -1, 0, 2, 0, 1, 1, 0, 2, 2, 2, 2,
0, 2, 2, -1, 1, 1, 2, 2, 2, 2, 1, 2, 3, 1, -1, 1, 2,
1, 2, 1, 1, 0, 2, 1, 0, 1, -1, 0, 1, 1, 1, 2, 0, 2,
0, 1, 1, 1, 0, 2, 0, 2, 0, -1, 2, 2, 0, 0, 0, 1, 0,
2, 1, 0, 2, 1, 0, 0, 0, 2, 2, 0, 1, 1, -1, 0, 1, 0,
2, 0, 2, -1, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, -1, 1, 1,
2, 1, 0, 0, 0, 1, 0, 1, 1, 0, 2, 1, 0, 1, 2, 0, 2,
1, 0, 1, 2, 2, 1, 0, -1, 0, 2, 2, 1, 2, -1, 1, 1, 1,
1, 2, 2, 1, 0, 2, -1, 0, 2, 0, 0, 1, 1, 2, 1, 1, 1,
2, 2, 0, 2, 2, 1, 0, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1,
2, 2, 2, 1, 0, 2, 2, 0, 2, -1, 2, 2, 2, 2, 0, 1, 0,
1, 2, 2, 0, 0, 1, 0, 1, 2, 1, 0, 2, 2, 0, 0, 0, 1,
2, 0, 2, 2, 2, 0, 1, 0, 0, 0, 2, 1, 1, 0, 1, 1, 2,
0, 2, 0, 2, -1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 2, -1, 0,
2, 1, 2, 0, 2, 1, 2, -1, 1, 1, 2, 1, 0, -1, 1, 2, 2,
2, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 1, 2, -1, 2, 2, 1,
2, 1, 0, 0, 1, 2, 0, 2, 2, 0, -1, 0, 2, 0, 0, 0, 1,
1, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 0, 1, 3, 2, -1,
0, 2, 0, -1, 1, -1, 2])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
dbscan = DBSCAN(min_samples=25, eps=1.5)
clusters = dbscan.fit_predict(X)
clusters
array([ 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0,
0, 2, 2, 1, 1, 0, 0, 2, 0, 2, 1, 2, 1, 1, 0, 1, 1,
1, 1, 1, 0, 2, 2, 2, 2, 2, 0, 1, 1, 1, 2, 0, 1, 0,
0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 2, 2, 2,
1, 2, 1, 1, 0, 0, 1, -1, 0, 1, 1, 1, 0, -1, 1, 0, 2,
1, 1, 1, 0, 1, 2, 0, 0, 2, 0, 1, 2, 1, 1, 0, 0, -1,
1, 1, 0, 0, 2, 0, 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 1,
0, 2, 2, 2, 2, 0, 2, 2, 2, 1, 0, 2, 0, 0, 0, 2, 1,
2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 1, 0, 2, 1,
1, 2, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 1, 1, 2, 2, 1,
2, 1, 2, 0, 1, 2, 0, 0, 2, 1, 2, 0, 1, 2, 1, 0, 1,
1, 2, 0, 2, 1, 0, 2, 0, 2, 0, 1, 1, 0, 2, 2, 2, 2,
0, 2, 2, -1, 1, 1, 2, 2, 2, 2, 1, 2, 0, 1, 2, 1, 2,
1, 2, 1, 1, 0, 2, 1, 0, 1, 0, 0, 1, 1, 1, 2, 0, 2,
0, 1, 1, 1, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 1, 0,
2, 1, 0, 2, 1, 0, 0, 0, 2, 2, 0, 1, 1, 1, 0, 1, 0,
2, 0, 2, 1, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 1,
2, 1, 0, 0, 0, 1, 0, 1, 1, 0, 2, 1, 0, 1, 2, 0, 2,
1, 0, 1, 2, 2, 1, 0, 2, 0, 2, 2, 1, 2, 0, 1, 1, 1,
1, 2, 2, 1, 0, 2, 1, 0, 2, 0, 0, 1, 1, 2, 1, 1, 1,
2, 2, 0, 2, 2, 1, 0, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1,
2, 2, 2, 1, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 1, 0,
1, 2, 2, 0, 0, 1, 0, 1, 2, 1, 0, 2, 2, 0, 0, 0, 1,
2, 0, 2, 2, 2, 0, 1, 0, 0, 0, 2, 1, 1, 0, 1, 1, 2,
0, 2, 0, 2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0,
2, 1, 2, 0, 2, 1, 2, 0, 1, 1, 2, 1, 0, 1, 1, 2, 2,
2, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 1, 2, 1, 2, 2, 1,
2, 1, 0, 0, 1, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 1,
1, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 0, 1, 0, 2, 1,
0, 2, 0, 2, 1, 2, 2])
Agglomerative Clustering
ac = AgglomerativeClustering()
clusters = ac.fit_predict(X)
clusters
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
ac = AgglomerativeClustering(n_clusters=3)
clusters = ac.fit_predict(X)
clusters
array([1, 2, 1, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 0, 0, 2, 2,
1, 1, 0, 1, 0, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 1, 0, 0, 0, 0, 0, 1,
2, 2, 2, 0, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 0, 1, 2, 0,
0, 0, 2, 0, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 0, 2, 1, 0, 2, 2, 2,
1, 2, 0, 1, 1, 0, 1, 2, 0, 2, 2, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 2,
2, 1, 2, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1,
1, 1, 0, 2, 0, 0, 2, 0, 1, 0, 1, 0, 1, 0, 2, 0, 1, 2, 1, 0, 2, 2,
0, 1, 1, 2, 1, 0, 2, 1, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 1, 2, 0,
1, 1, 0, 2, 0, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 2,
2, 1, 0, 0, 0, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 1, 2, 0, 2,
0, 2, 0, 2, 2, 1, 0, 2, 1, 2, 1, 1, 2, 2, 2, 0, 1, 0, 1, 2, 2, 2,
1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 1, 0,
0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 0, 2, 1, 1, 1, 1, 2, 0, 1, 1, 2, 0,
1, 2, 2, 0, 2, 1, 1, 1, 2, 1, 2, 2, 1, 0, 2, 1, 2, 0, 1, 0, 2, 1,
2, 0, 0, 2, 1, 0, 1, 0, 0, 2, 0, 1, 2, 2, 2, 2, 0, 0, 2, 1, 0, 2,
1, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0,
2, 0, 1, 2, 2, 0, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 2, 1,
2, 0, 0, 1, 1, 2, 1, 2, 0, 2, 1, 0, 0, 1, 1, 1, 2, 0, 1, 0, 0, 0,
1, 2, 1, 1, 1, 0, 2, 2, 1, 2, 2, 0, 1, 0, 1, 0, 1, 2, 2, 1, 1, 1,
2, 2, 1, 1, 0, 0, 1, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 1, 2, 2,
0, 0, 0, 2, 0, 2, 2, 2, 1, 1, 1, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 1,
1, 2, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2, 2, 0, 2, 1, 2, 2, 0, 2,
0, 2, 2, 0, 1, 2, 1, 0, 2, 1, 0, 1, 0, 2, 0, 0])
Density Estimation
Kernel Density Estimation
\[ f_h(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right) \]
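Here \(K\) is the kernel function and \(h\) is the bandwidth. As an illustration of the formula (not the scikit-learn implementation used next), here is a minimal NumPy sketch with a standard Gaussian kernel; the name kde_pdf is made up for this example.

def kde_pdf(x_grid, sample, h=1.0):
    # f_h(x) = 1 / (n * h) * sum_i K((x - x_i) / h), with K the standard normal pdf
    n = len(sample)
    u = (x_grid[:, None] - sample[None, :]) / h
    return norm.pdf(u).sum(axis=1) / (n * h)

With h equal to 1 this should agree with KernelDensity(bandwidth=1) below, whose default kernel is also Gaussian.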
# generate a small sample
X = np.array([-2, -1.5, -1.3, -1, -0.5, 2, 5, 6]).reshape(-1, 1)

# fit KDE
kde = KernelDensity(bandwidth=1)
_ = kde.fit(X)

# get pdf values for the plot x values
x = np.linspace(X.min() - 3, X.max() + 3, 1000)
logprob = kde.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)
# create rug plot
fig, ax = plt.subplots(figsize=(8, 4))
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    height=0.1,
    linewidth=2,
    expand_margins=False,
    zorder=2,
    ax=ax,
)

# add individual kernels
for x_i in X.ravel():
    kernel = norm.pdf(x, loc=x_i, scale=kde.bandwidth)
    ax.plot(
        x,
        kernel,
        color="lightblue",
        alpha=0.65,
        linestyle="--",
        zorder=1,
    )

# add the KDE
ax.plot(x, pdf, color="darkorange")
plt.show()
X, _ = make_blobs(
    n_samples=250,
    centers=2,
    n_features=1,
    cluster_std=[1.5, 3],
    random_state=42,
)
X[:10]
array([[-2.78768609],
[ 6.09024112],
[-1.62891198],
[ 9.6636619 ],
[-3.00069084],
[-2.50152744],
[-1.9456506 ],
[10.69663971],
[-1.90312134],
[-3.56227726]])
fig, ax = plt.subplots()
sns.histplot(
    X.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
plt.show()
kde = KernelDensity(bandwidth=1)
_ = kde.fit(X)
# get pdf values for the plot x values
x = np.linspace(X.min(), X.max(), 1000)
logprob = kde.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)
fig, ax = plt.subplots()
sns.histplot(
    X.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
ax.plot(x, pdf, color="darkorange")
plt.show()
print(np.exp(kde.score_samples([[10]])))
print(np.mean(norm.pdf(10 - X.ravel())))
[0.07041698]
0.07041697626037091
# generate new samples from learned distribution
X_new = kde.sample(
    n_samples=250,
    random_state=42,
)
Gaussian Mixture Models
gmm = GaussianMixture(n_components=2)
gmm.fit(X)
gmm.predict(X)
array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
0, 0, 0, 1, 1, 1, 1, 0])
gmm.predict_proba(X[:10])
array([[9.99809500e-01, 1.90499762e-04],
[1.73596228e-08, 9.99999983e-01],
[9.98965880e-01, 1.03411986e-03],
[6.62325206e-17, 1.00000000e+00],
[9.99852179e-01, 1.47821159e-04],
[9.99724537e-01, 2.75463373e-04],
[9.99381880e-01, 6.18119575e-04],
[9.58091584e-20, 1.00000000e+00],
[9.99339166e-01, 6.60834258e-04],
[9.99917508e-01, 8.24920374e-05]])
gmm._estimate_log_prob(X[:10])
array([[ -1.26869862, -9.84679292],
[-20.38619544, -2.52950181],
[ -1.51297646, -8.3985716 ],
[-39.27823941, -2.03730453],
[ -1.29732827, -10.12911304],
[ -1.26618292, -9.47538851],
[ -1.37908791, -8.77972343],
[-45.93686441, -2.15736095],
[ -1.39413066, -8.72790231],
[ -1.48228032, -10.89743182]])
gmm.weights_
array([0.49689374, 0.50310626])
gmm.means_
array([[-2.62713984],
[ 9.12611073]])
gmm.covariances_
array([[[1.98684369]],
[[9.06906997]]])
np.sum(np.exp(gmm._estimate_log_prob(X[:10])) * gmm.weights_, axis=1)
array([0.13975181, 0.04009692, 0.10955584, 0.06559483, 0.1358017 ,
0.14011574, 0.12519927, 0.05817411, 0.12333529, 0.11286342])
np.exp(gmm.score_samples(X[:10]))
array([0.13975181, 0.04009692, 0.10955584, 0.06559483, 0.1358017 ,
0.14011574, 0.12519927, 0.05817411, 0.12333529, 0.11286342])
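The same values can be reproduced without the private _estimate_log_prob helper, using only the fitted weights_, means_, and covariances_; a sketch assuming this one-dimensional, two-component fit:

# mixture density: weighted sum of the two Gaussian component densities
np.sum(
    gmm.weights_
    * norm.pdf(X[:10], loc=gmm.means_.ravel(), scale=np.sqrt(gmm.covariances_.ravel())),
    axis=1,
)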
# get pdf values for the plot x values
x = np.linspace(X.min(), X.max(), 1000)
logprob = gmm.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)
# create plot
fig, ax = plt.subplots()
sns.histplot(
    X.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
ax.plot(x, pdf, color="darkorange")
plt.show()
gmm.sample(n_samples=10)
(array([[-4.91697739],
[-1.54392333],
[-2.33796974],
[-2.59626579],
[ 9.94695185],
[14.05432476],
[10.7947898 ],
[ 8.91455004],
[ 8.26899985],
[10.46576932]]),
array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]))
X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    cluster_std=[0.5, 2, 5],
    random_state=3,
)
X.shape
(500, 2)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
km3 = KMeans(n_clusters=3)
km3.fit(X)
clusters = km3.predict(X)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
clusters = gmm.predict(X)
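As with \(K\)-Means above, the GMM assignments can be visualized with the same scatter plot recipe (a repeat of the earlier plotting code, shown here for completeness):

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()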
Outlier Detection
X, _ = make_blobs(
    n_samples=500,
    centers=3,
    n_features=2,
    cluster_std=1.5,
    random_state=42,
)
outliers = np.random.RandomState(42).uniform(
    low=-20,
    high=20,
    size=(25, 2),
)
X = np.vstack((X, outliers))
X
array([[ -5.1557172 , -7.93487423],
[ 0.59454887, 1.89172738],
[ 7.92458324, 0.76072226],
...,
[ -9.64880074, 6.50089137],
[ -7.53155696, 0.80272085],
[ 1.86841117, -12.60582178]], shape=(525, 2))
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
iso = IsolationForest(
    random_state=42,
)
_ = iso.fit(X)
inout = iso.predict(X)
inout
array([ 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1,
1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1,
1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1, 1,
1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1,
1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(inout),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
iso = IsolationForest(
    contamination=0.025,
    random_state=42,
)
_ = iso.fit(X)
inout = iso.predict(X)
inout
array([ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1,
-1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1])