Unsupervised Learning

Learning Without a Supervisor

Modified: April 23, 2025

This note is under active development. Some additional information can be found in the following scribbles.

# basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# general machine learning
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# unsupervised learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity
from sklearn.ensemble import IsolationForest

Dimension Reduction

X, _ = make_blobs(
    n_samples=500,
    centers=4,
    n_features=50,
    cluster_std=5,
    random_state=42,
)
X
array([[  5.47268545,   2.36711978,  -7.72797716, ...,  11.56267621,
          5.3875766 ,   8.30440778],
       [  1.85492413,   4.18350251,   4.24293671, ...,   6.95231996,
         -0.14559701,  -1.61552343],
       [ -4.83822547,   8.26009757,   4.27014101, ...,   1.90689712,
          1.85337817,   7.16225743],
       ...,
       [ 22.32156843,   7.59791014,  11.85630655, ...,  -7.07516877,
        -14.93508452,  -5.03480831],
       [  6.99270309, -11.19575062,  -3.5367364 , ...,   7.54436995,
         17.31625101,   0.1853057 ],
       [-13.91218603,   0.6172048 ,  -4.80263491, ...,  -7.4577458 ,
         -8.18493172,  -4.08366717]], shape=(500, 50))
X.shape
(500, 50)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
plt.show()

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 1],
    y=X[:, 2],
    ax=ax,
)
ax.set_xlabel(r"$x_2$")
ax.set_ylabel(r"$x_3$")
plt.show()

pca = PCA()
_ = pca.fit(X)
X_pca = pca.transform(X)
X_pca
array([[-21.0132157 ,   0.58429056, -18.78127557, ...,  -3.43557718,
         -3.48778491,   7.13500179],
       [ -8.29402824,  41.09833267,  20.51146456, ...,  -0.68948175,
         -6.51053673,   2.54956227],
       [ -6.5519595 ,  30.98821707,   9.65805067, ...,   1.23333536,
          3.86083456,   0.83170031],
       ...,
       [-14.29741386, -37.31812582,  36.25339557, ...,   2.14258928,
          1.06942156,  -2.05336295],
       [-20.74836199,   2.01953335, -37.94579109, ...,  -0.77644235,
          4.32287409,  -3.04784811],
       [ 43.92071368,  -2.57280644,  -8.61724191, ...,  -1.65243415,
         -9.02562484,   6.99464932]], shape=(500, 50))
X_pca[:10, :3]
array([[-21.0132157 ,   0.58429056, -18.78127557],
       [ -8.29402824,  41.09833267,  20.51146456],
       [ -6.5519595 ,  30.98821707,   9.65805067],
       [ 44.20977982, -10.11196016,   1.68990725],
       [ 38.05703987, -14.70385106, -11.51850903],
       [ 41.83695478,   1.76656913,  -8.26814285],
       [ 24.56272454,  -0.90306868,  -8.59899694],
       [ -5.68329991,  26.30511016,  10.74802756],
       [ -3.13380174,  27.50701824,  12.32952769],
       [ 47.81897616,  -8.86343347,  -9.26382513]])
X_pca.shape
(500, 50)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    ax=ax,
)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 1],
    y=X_pca[:, 2],
    ax=ax,
)
ax.set_xlabel("Principal Component 2")
ax.set_ylabel("Principal Component 3")
plt.show()

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 2],
    y=X_pca[:, 3],
    ax=ax,
)
ax.set_xlabel("Principal Component 3")
ax.set_ylabel("Principal Component 4")
plt.show()

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 9],
    y=X_pca[:, 10],
    ax=ax,
)
ax.set_xlabel("Principal Component 10")
ax.set_ylabel("Principal Component 11")
plt.show()

# center the data
X_centered = X - np.mean(X, axis=0)

# compute the covariance matrix
cov_matrix = np.cov(X_centered, rowvar=False)

# compute the eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# sort the eigenvectors by eigenvalues in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]

# project the data
X_pca_numpy = np.dot(X_centered, eigenvectors)

# print the first 10 rows of the first three principal components
X_pca_numpy[:10, :3]
array([[-21.0132157 ,  -0.58429056, -18.78127557],
       [ -8.29402824, -41.09833267,  20.51146456],
       [ -6.5519595 , -30.98821707,   9.65805067],
       [ 44.20977982,  10.11196016,   1.68990725],
       [ 38.05703987,  14.70385106, -11.51850903],
       [ 41.83695478,  -1.76656913,  -8.26814285],
       [ 24.56272454,   0.90306868,  -8.59899694],
       [ -5.68329991, -26.30511016,  10.74802756],
       [ -3.13380174, -27.50701824,  12.32952769],
       [ 47.81897616,   8.86343347,  -9.26382513]])
np.allclose(X_pca[:,:1], X_pca_numpy[:,:1])
True
np.allclose(X_pca[:,1:2], -X_pca_numpy[:,1:2])
True
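
The first principal component matches exactly, while the second differs only by a sign flip. This is expected: principal directions are eigenvectors, which are only defined up to sign, so scikit-learn and the manual eigendecomposition may return any given component negated. A quick way to compare every component at once while ignoring signs is sketched below:

# each column should agree up to a possible sign flip,
# so compare the absolute values of the projections
np.allclose(np.abs(X_pca), np.abs(X_pca_numpy))
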
pca = PCA(n_components=2)
_ = pca.fit(X)
X_pca = pca.transform(X)
X_pca.shape
(500, 2)
pca = PCA(n_components=0.95)
_ = pca.fit(X)
X_pca = pca.transform(X)
X_pca.shape
(500, 42)
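
Passing a float between 0 and 1 as n_components keeps the smallest number of leading components whose cumulative explained variance reaches that fraction; here, 42 components are needed to retain 95% of the variance. A sketch of how to inspect this directly via the explained_variance_ratio_ attribute:

# refit with all components to inspect the full variance profile
pca_full = PCA()
_ = pca_full.fit(X)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, len(cum_var) + 1), cum_var, marker="o")
ax.axhline(0.95, color="darkorange", linestyle="--")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
plt.show()
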
# create a synthetic dataset
X, y = make_classification(
    n_samples=1000,
    n_features=50,
    n_informative=10,
    n_redundant=10,
    random_state=42,
)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# create and fit logistic regression
logistic = LogisticRegression(max_iter=1000, random_state=42)
logistic.fit(X_train, y_train)

# evaluate the logistic regression
accuracy = logistic.score(X_test, y_test)
print(f"Test Accuracy without PCA: {accuracy:.2f}")

# create a pipeline with PCA and Logistic Regression
pipeline = Pipeline(
    [
        ("pca", PCA(n_components=10)),
        ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# fit the pipeline with PCA
pipeline.fit(X_train, y_train)

# evaluate the pipeline with PCA
accuracy_with_pca = pipeline.score(X_test, y_test)
print(f"Test Accuracy with PCA: {accuracy_with_pca:.2f}")
Test Accuracy without PCA: 0.80
Test Accuracy with PCA: 0.82

Clustering

K-Means

To fit \(K\)-Means, we minimize the cost function, also known as the within-cluster sum of squares:

\[ C(\pmb x_1, \ldots, \pmb x_n, \pmb\mu_1, \ldots, \pmb\mu_K, \pmb r_1, \ldots, \pmb r_K) = \sum_{i = 1}^{n}\sum_{k = 1}^{K} r_{ik} || \pmb x_i - \pmb\mu_k || ^ 2 \]

The responsibilities, \(r_{ik}\), are defined as:

\[ r_{ik} = \begin{cases} 1 & \text{if } \pmb x_i \text{ is closest to } \pmb\mu_k\\ 0 & \text{otherwise} \end{cases} \]

That is, \(r_{ik}\) is 1 if the center of cluster \(k\) is the closest center to data point \(\pmb x_i\), and 0 otherwise.

Assuming \(\pmb x_i\) is a \(p\)-dimensional vector, we have:

\[ || \pmb x_i - \pmb\mu_k || ^ 2 = (x_{i1} - \mu_{k1}) ^ 2 + \ldots + (x_{ip} - \mu_{kp}) ^ 2. \]

This quantity is the squared Euclidean distance (the squared L2 norm of the difference) between data point \(\pmb x_i\) and the center of cluster \(k\), \(\pmb\mu_k\). It measures the “closeness” of \(\pmb x_i\) to \(\pmb \mu_k\). The goal of \(K\)-Means is to minimize this distance over all data points and their assigned clusters, hence minimizing the within-cluster sum of squares.
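
As a quick numerical check of this quantity, the squared distance is just the sum of squared coordinate-wise differences (a small sketch with made-up three-dimensional vectors):

# squared Euclidean distance between two made-up points
x_i = np.array([1.0, 2.0, 3.0])
mu_k = np.array([0.0, 2.0, 5.0])
np.sum((x_i - mu_k) ** 2)  # (1 - 0)^2 + (2 - 2)^2 + (3 - 5)^2 = 5.0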

Minimizing this cost is easy! Simply… assign each data point to its own cluster (that is, let \(K = n\))! But that is silly and useless. So instead, we will use the Expectation–Maximization (EM) algorithm to fit \(K\)-Means, after first choosing a value of \(K\).

To perform \(K\)-Means, we first choose \(K\) and initialize a random center for each cluster; the full procedure is summarized below.

EM Algorithm for \(K\)-Means

  1. Pre-select a value of \(K\), the number of clusters to learn.
  2. Randomly initialize a center for each cluster.
  3. Repeat the E and M steps until convergence.
    • E-Step: Update the responsibilities. That is, assign each data point to the cluster that has the closest center. \[ r_{ik} = \begin{cases} 1 & \text{if } \pmb x_i \text{ is closest to } \pmb\mu_k\\ 0 & \text{otherwise} \end{cases} \]
    • If there are no updates to the responsibilities, the algorithm has converged.
    • M-Step: Update the cluster centers \(\pmb \mu_k\) by calculating the mean of all data points assigned to cluster \(k\). \[ \pmb\mu_k = \frac{\sum_{i=1}^n r_{ik} \pmb x_i}{\sum_{i=1}^n r_{ik}} \]

Because of the random initialization, \(K\)-Means is often run multiple times by default, and the “best” outcome (the one with the lowest cost) is chosen.
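
The E and M steps translate almost directly into numpy. Below is a minimal from-scratch sketch of the algorithm (not the scikit-learn implementation; it uses purely random initialization, a single run, and does not handle the edge case of a cluster losing all of its points):

def kmeans_em(X, k, n_iter=100, seed=42):
    """Minimal K-Means via the E and M steps described above."""
    rng = np.random.default_rng(seed)

    # initialize centers by picking k data points at random
    centers = X[rng.choice(len(X), size=k, replace=False)]

    for _ in range(n_iter):
        # E-step: assign each point to its closest center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = np.argmin(dists, axis=1)

        # M-step: move each center to the mean of its assigned points
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])

        # converged if the centers (and thus the responsibilities) stop changing
        if np.allclose(new_centers, centers):
            break
        centers = new_centers

    return centers, labels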

X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    cluster_std=1,
    random_state=3,
)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
plt.show()

km = KMeans()
_ = km.fit(X)
clusters = km.predict(X)
clusters
array([0, 2, 0, 2, 3, 1, 3, 0, 3, 3, 2, 3, 3, 4, 0, 3, 0, 0, 6, 4, 2, 2,
       3, 0, 7, 3, 1, 5, 7, 2, 2, 0, 2, 2, 5, 5, 2, 3, 7, 7, 1, 4, 1, 3,
       2, 5, 2, 6, 3, 5, 0, 0, 0, 3, 5, 0, 3, 2, 3, 2, 0, 0, 6, 3, 2, 4,
       4, 6, 5, 7, 5, 5, 0, 0, 2, 2, 3, 2, 2, 2, 3, 7, 2, 3, 1, 5, 2, 2,
       3, 2, 4, 3, 0, 1, 3, 5, 7, 5, 2, 3, 3, 2, 5, 5, 3, 0, 7, 3, 3, 5,
       2, 3, 5, 7, 1, 6, 1, 1, 5, 0, 6, 1, 6, 4, 3, 7, 4, 7, 5, 3, 7, 0,
       0, 0, 6, 2, 7, 4, 5, 1, 3, 1, 3, 7, 0, 7, 2, 1, 3, 5, 0, 1, 5, 5,
       4, 3, 3, 2, 3, 1, 2, 3, 2, 4, 2, 2, 2, 7, 6, 5, 6, 5, 4, 0, 5, 6,
       0, 3, 7, 2, 7, 3, 5, 6, 2, 3, 5, 2, 1, 0, 1, 5, 0, 4, 3, 6, 3, 5,
       5, 3, 6, 7, 4, 4, 0, 6, 7, 1, 2, 5, 4, 4, 7, 7, 5, 4, 0, 2, 1, 5,
       4, 2, 7, 2, 2, 0, 7, 2, 0, 2, 3, 3, 5, 2, 2, 7, 3, 7, 3, 2, 2, 2,
       3, 6, 3, 7, 0, 7, 7, 1, 0, 0, 3, 5, 0, 1, 5, 3, 4, 2, 0, 0, 3, 1,
       4, 0, 2, 2, 5, 0, 5, 0, 1, 3, 7, 5, 3, 3, 3, 3, 5, 4, 0, 0, 5, 1,
       3, 2, 2, 4, 5, 3, 3, 3, 2, 3, 5, 2, 3, 1, 5, 3, 2, 4, 0, 4, 5, 3,
       2, 6, 6, 5, 0, 1, 0, 4, 1, 2, 6, 0, 5, 5, 2, 2, 1, 4, 2, 3, 7, 2,
       3, 1, 0, 0, 2, 2, 4, 2, 2, 5, 6, 1, 3, 1, 7, 2, 3, 3, 5, 2, 2, 7,
       5, 1, 0, 2, 2, 1, 4, 6, 5, 0, 7, 4, 3, 1, 3, 7, 1, 6, 6, 0, 5, 3,
       5, 7, 7, 0, 0, 5, 0, 5, 1, 5, 0, 6, 7, 3, 3, 3, 2, 1, 3, 1, 6, 4,
       0, 2, 3, 3, 3, 7, 2, 5, 3, 5, 2, 4, 3, 7, 3, 4, 3, 2, 2, 3, 0, 0,
       2, 2, 3, 3, 7, 1, 0, 7, 5, 7, 0, 7, 2, 7, 3, 2, 2, 1, 2, 0, 5, 5,
       4, 7, 1, 5, 1, 2, 5, 2, 3, 0, 3, 4, 5, 5, 1, 5, 1, 7, 2, 6, 2, 3,
       3, 2, 7, 3, 7, 7, 3, 6, 0, 7, 3, 3, 0, 5, 5, 7, 2, 0, 2, 5, 6, 5,
       7, 2, 2, 1, 0, 5, 0, 4, 5, 3, 1, 3, 6, 5, 1, 4], dtype=int32)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
ax.set_xlabel(r"$x_1$")
ax.set_ylabel(r"$x_2$")
plt.show()
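
With no arguments, KMeans falls back to its default of n_clusters=8, which is why eight clusters appear above. The value used is stored on the fitted estimator:

# number of clusters requested (the scikit-learn default is 8)
km.n_clusters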

km3 = KMeans(n_clusters=3)
km3.fit(X)
clusters = km3.predict(X)
clusters
array([2, 1, 2, 1, 2, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 1,
       2, 2, 0, 2, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 2,
       1, 1, 1, 0, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 0, 2, 1, 0,
       0, 0, 1, 0, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1,
       2, 1, 0, 2, 2, 0, 2, 1, 0, 1, 1, 2, 2, 2, 1, 1, 2, 2, 0, 2, 2, 1,
       1, 2, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 2,
       2, 2, 0, 1, 0, 0, 1, 0, 2, 0, 2, 0, 2, 0, 1, 0, 2, 1, 2, 0, 1, 1,
       0, 2, 2, 1, 2, 0, 1, 2, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 2, 1, 0,
       2, 2, 0, 1, 0, 2, 1, 0, 1, 2, 1, 1, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1,
       1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1,
       0, 1, 0, 1, 1, 2, 0, 1, 2, 1, 2, 2, 1, 1, 1, 0, 2, 0, 2, 1, 1, 1,
       2, 0, 2, 0, 2, 0, 0, 0, 2, 2, 2, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0,
       0, 2, 1, 1, 1, 2, 1, 2, 0, 2, 0, 1, 2, 2, 2, 2, 1, 0, 2, 2, 1, 0,
       2, 1, 1, 0, 1, 2, 2, 2, 1, 2, 1, 1, 2, 0, 1, 2, 1, 0, 2, 0, 1, 2,
       1, 0, 0, 1, 2, 0, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 0, 1, 2, 0, 1,
       2, 0, 2, 2, 1, 1, 0, 1, 1, 1, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 1, 0,
       1, 0, 2, 1, 1, 0, 0, 0, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 1, 2,
       1, 0, 0, 2, 2, 1, 2, 1, 0, 1, 2, 0, 0, 2, 2, 2, 1, 0, 2, 0, 0, 0,
       2, 1, 2, 2, 2, 0, 1, 1, 2, 1, 1, 0, 2, 0, 2, 0, 2, 1, 1, 2, 2, 2,
       1, 1, 2, 2, 0, 0, 2, 0, 1, 0, 2, 0, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 2, 2, 2, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 2,
       2, 1, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 1, 1, 0, 1, 2, 1, 1, 0, 1,
       0, 1, 1, 0, 2, 1, 2, 0, 1, 2, 0, 2, 0, 1, 0, 0], dtype=int32)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()

# check center of each learned cluster
km3.cluster_centers_
array([[ 7.67197216,  8.11134342],
       [ 1.01450711,  4.20318963],
       [-4.18726897,  0.27543154]])
# check cost
km3.inertia_
1005.8251148922151
cost = []

for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X)
    cost.append(kmeans.inertia_)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, 11), cost, marker="o")
ax.set_title("Elbow Plot")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Cost (Within Cluster Sum of Squares)")
plt.show()

Alternative Clustering Methods

X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    cluster_std=1,
    random_state=3,
)

DBSCAN

dbscan = DBSCAN()
clusters = dbscan.fit_predict(X)
clusters
array([ 0,  1,  0, -1,  0,  2,  0,  0,  0,  0,  1,  0,  0,  2,  0,  0,  0,
        0, -1,  2,  1,  1,  0,  0,  2,  0,  2,  1,  2,  1, -1,  0,  1,  1,
       -1,  1,  1,  0,  2,  2,  2,  2,  2,  0,  1,  1,  1,  2,  0,  1,  0,
        0,  0,  0,  1,  3, -1,  1,  0,  1,  0,  0,  2,  0,  1,  2,  2, -1,
        1,  2,  1,  1,  0,  0, -1, -1,  0,  1,  1,  1,  0, -1,  1,  0, -1,
        1, -1,  1,  0,  1,  2,  0,  3,  2,  0,  1,  2,  1,  1,  0,  0, -1,
        1,  1,  0,  0,  2,  0,  0,  1,  1,  0,  1,  2,  2,  2,  2,  2,  1,
        3,  2,  2,  2,  2,  0,  2,  2,  2,  1,  0,  2,  0,  0,  0,  2,  1,
        2,  2,  1,  2,  0,  2,  0,  2,  0,  2, -1,  2,  0, -1,  0,  2,  1,
        1,  2,  0,  0,  1,  0,  2,  1,  0,  1,  2,  1,  1,  1,  2,  2,  1,
       -1,  1,  2,  0,  1,  2,  3,  0,  2,  1,  2,  0,  1,  2,  1,  0,  1,
        1,  2,  0,  2,  1, -1, -1,  0,  2,  0,  1,  1,  0,  2,  2,  2,  2,
        0,  2,  2, -1,  1,  1,  2,  2,  2,  2,  1,  2,  3,  1, -1,  1,  2,
        1,  2,  1,  1,  0,  2,  1,  0,  1, -1,  0,  1,  1,  1,  2,  0,  2,
        0,  1,  1,  1,  0,  2,  0,  2,  0, -1,  2,  2,  0,  0,  0,  1,  0,
        2,  1,  0,  2,  1,  0,  0,  0,  2,  2,  0,  1,  1, -1,  0,  1,  0,
        2,  0,  2, -1,  0,  0,  0,  0,  1,  2,  0,  0,  1,  2, -1,  1,  1,
        2,  1,  0,  0,  0,  1,  0,  1,  1,  0,  2,  1,  0,  1,  2,  0,  2,
        1,  0,  1,  2,  2,  1,  0, -1,  0,  2,  2,  1,  2, -1,  1,  1,  1,
        1,  2,  2,  1,  0,  2, -1,  0,  2,  0,  0,  1,  1,  2,  1,  1,  1,
        2,  2,  0,  2,  2,  1,  0,  0,  1,  1,  1,  2,  1,  2,  0,  1,  1,
        2,  2,  2,  1,  0,  2,  2,  0,  2, -1,  2,  2,  2,  2,  0,  1,  0,
        1,  2,  2,  0,  0,  1,  0,  1,  2,  1,  0,  2,  2,  0,  0,  0,  1,
        2,  0,  2,  2,  2,  0,  1,  0,  0,  0,  2,  1,  1,  0,  1,  1,  2,
        0,  2,  0,  2, -1,  1,  1,  0,  0,  0,  1,  1,  0,  0,  2, -1,  0,
        2,  1,  2,  0,  2,  1,  2, -1,  1,  1,  2,  1,  0, -1,  1,  2,  2,
        2,  1,  2,  1,  1,  1,  0,  0,  0,  2,  1,  1,  2, -1,  2,  2,  1,
        2,  1,  0,  0,  1,  2,  0,  2,  2,  0, -1,  0,  2,  0,  0,  0,  1,
        1,  2,  1,  0,  1,  1,  2,  1,  2,  1,  1,  2,  0,  1,  3,  2, -1,
        0,  2,  0, -1,  1, -1,  2])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()

dbscan = DBSCAN(min_samples=25, eps=1.5)
clusters = dbscan.fit_predict(X)
clusters
array([ 0,  1,  0,  1,  0,  2,  0,  0,  0,  0,  1,  0,  0,  2,  0,  0,  0,
        0,  2,  2,  1,  1,  0,  0,  2,  0,  2,  1,  2,  1,  1,  0,  1,  1,
        1,  1,  1,  0,  2,  2,  2,  2,  2,  0,  1,  1,  1,  2,  0,  1,  0,
        0,  0,  0,  1,  0,  0,  1,  0,  1,  0,  0,  2,  0,  1,  2,  2,  2,
        1,  2,  1,  1,  0,  0,  1, -1,  0,  1,  1,  1,  0, -1,  1,  0,  2,
        1,  1,  1,  0,  1,  2,  0,  0,  2,  0,  1,  2,  1,  1,  0,  0, -1,
        1,  1,  0,  0,  2,  0,  0,  1,  1,  0,  1,  2,  2,  2,  2,  2,  1,
        0,  2,  2,  2,  2,  0,  2,  2,  2,  1,  0,  2,  0,  0,  0,  2,  1,
        2,  2,  1,  2,  0,  2,  0,  2,  0,  2,  1,  2,  0,  1,  0,  2,  1,
        1,  2,  0,  0,  1,  0,  2,  1,  0,  1,  2,  1,  1,  1,  2,  2,  1,
        2,  1,  2,  0,  1,  2,  0,  0,  2,  1,  2,  0,  1,  2,  1,  0,  1,
        1,  2,  0,  2,  1,  0,  2,  0,  2,  0,  1,  1,  0,  2,  2,  2,  2,
        0,  2,  2, -1,  1,  1,  2,  2,  2,  2,  1,  2,  0,  1,  2,  1,  2,
        1,  2,  1,  1,  0,  2,  1,  0,  1,  0,  0,  1,  1,  1,  2,  0,  2,
        0,  1,  1,  1,  0,  2,  0,  2,  0,  2,  2,  2,  0,  0,  0,  1,  0,
        2,  1,  0,  2,  1,  0,  0,  0,  2,  2,  0,  1,  1,  1,  0,  1,  0,
        2,  0,  2,  1,  0,  0,  0,  0,  1,  2,  0,  0,  1,  2,  0,  1,  1,
        2,  1,  0,  0,  0,  1,  0,  1,  1,  0,  2,  1,  0,  1,  2,  0,  2,
        1,  0,  1,  2,  2,  1,  0,  2,  0,  2,  2,  1,  2,  0,  1,  1,  1,
        1,  2,  2,  1,  0,  2,  1,  0,  2,  0,  0,  1,  1,  2,  1,  1,  1,
        2,  2,  0,  2,  2,  1,  0,  0,  1,  1,  1,  2,  1,  2,  0,  1,  1,
        2,  2,  2,  1,  0,  2,  2,  0,  2,  0,  2,  2,  2,  2,  0,  1,  0,
        1,  2,  2,  0,  0,  1,  0,  1,  2,  1,  0,  2,  2,  0,  0,  0,  1,
        2,  0,  2,  2,  2,  0,  1,  0,  0,  0,  2,  1,  1,  0,  1,  1,  2,
        0,  2,  0,  2,  0,  1,  1,  0,  0,  0,  1,  1,  0,  0,  2,  2,  0,
        2,  1,  2,  0,  2,  1,  2,  0,  1,  1,  2,  1,  0,  1,  1,  2,  2,
        2,  1,  2,  1,  1,  1,  0,  0,  0,  2,  1,  1,  2,  1,  2,  2,  1,
        2,  1,  0,  0,  1,  2,  0,  2,  2,  0,  2,  0,  2,  0,  0,  0,  1,
        1,  2,  1,  0,  1,  1,  2,  1,  2,  1,  1,  2,  0,  1,  0,  2,  1,
        0,  2,  0,  2,  1,  2,  2])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
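
DBSCAN labels points that do not belong to any sufficiently dense region as noise, encoded with the cluster label -1; eps (the neighborhood radius) and min_samples (the number of neighbors needed for a core point) control what counts as dense. A quick way to see the breakdown of the labels just computed:

# count how many points fall in each cluster (-1 indicates noise)
labels, counts = np.unique(clusters, return_counts=True)
dict(zip(labels.tolist(), counts.tolist()))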

Agglomerative Clustering

ac = AgglomerativeClustering()
clusters = ac.fit_predict(X)
clusters
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()

ac = AgglomerativeClustering(n_clusters=3)
clusters = ac.fit_predict(X)
clusters
array([1, 2, 1, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 0, 0, 2, 2,
       1, 1, 0, 1, 0, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 1, 0, 0, 0, 0, 0, 1,
       2, 2, 2, 0, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 0, 1, 2, 0,
       0, 0, 2, 0, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 0, 2, 1, 0, 2, 2, 2,
       1, 2, 0, 1, 1, 0, 1, 2, 0, 2, 2, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 2,
       2, 1, 2, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1,
       1, 1, 0, 2, 0, 0, 2, 0, 1, 0, 1, 0, 1, 0, 2, 0, 1, 2, 1, 0, 2, 2,
       0, 1, 1, 2, 1, 0, 2, 1, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 1, 2, 0,
       1, 1, 0, 2, 0, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 2, 1, 0, 1, 0, 1, 2,
       2, 1, 0, 0, 0, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 1, 2, 0, 2,
       0, 2, 0, 2, 2, 1, 0, 2, 1, 2, 1, 1, 2, 2, 2, 0, 1, 0, 1, 2, 2, 2,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 1, 0,
       0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 0, 2, 1, 1, 1, 1, 2, 0, 1, 1, 2, 0,
       1, 2, 2, 0, 2, 1, 1, 1, 2, 1, 2, 2, 1, 0, 2, 1, 2, 0, 1, 0, 2, 1,
       2, 0, 0, 2, 1, 0, 1, 0, 0, 2, 0, 1, 2, 2, 2, 2, 0, 0, 2, 1, 0, 2,
       1, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0,
       2, 0, 1, 2, 2, 0, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 2, 1,
       2, 0, 0, 1, 1, 2, 1, 2, 0, 2, 1, 0, 0, 1, 1, 1, 2, 0, 1, 0, 0, 0,
       1, 2, 1, 1, 1, 0, 2, 2, 1, 2, 2, 0, 1, 0, 1, 0, 1, 2, 2, 1, 1, 1,
       2, 2, 1, 1, 0, 0, 1, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 1, 2, 2,
       0, 0, 0, 2, 0, 2, 2, 2, 1, 1, 1, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 1,
       1, 2, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2, 2, 0, 2, 1, 2, 2, 0, 2,
       0, 2, 2, 0, 1, 2, 1, 0, 2, 1, 0, 1, 0, 2, 0, 0])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
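
Agglomerative clustering builds a hierarchy by repeatedly merging the closest pair of clusters, and that hierarchy can be visualized as a dendrogram. scikit-learn does not plot dendrograms directly, but scipy can; a sketch using Ward linkage (the scikit-learn default) follows:

from scipy.cluster.hierarchy import dendrogram, linkage

# build the full merge hierarchy with Ward linkage
Z = linkage(X, method="ward")

# show only the last 20 merges of the hierarchy
fig, ax = plt.subplots(figsize=(8, 4))
dendrogram(Z, truncate_mode="lastp", p=20, ax=ax)
ax.set_xlabel("Merged Clusters")
ax.set_ylabel("Distance")
plt.show()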

Density Estimation

Kernel Density Estimation

\[ f_h(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right) \]
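
Here, \(K\) is a kernel function, \(h\) is the bandwidth, and \(n\) is the number of observations. With a Gaussian kernel (the KernelDensity default used below), the estimate is simply an average of normal densities centered at each observation, as in this small sketch:

def kde_pdf(x, data, h):
    """Gaussian KDE evaluated at the points in x, with bandwidth h."""
    # average of Gaussian bumps centered at each observation, scaled by 1 / h
    return np.mean(norm.pdf((x[:, None] - data[None, :]) / h), axis=1) / h

With \(h = 1\), this reduces to np.mean(norm.pdf(x - data)), which is exactly the manual check made against score_samples further down.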

# generate a small sample
X = np.array([-2, -1.5, -1.3, -1, -0.5, 2, 5, 6]).reshape(-1, 1)

# fit KDE
kde = KernelDensity(bandwidth=1)
_ = kde.fit(X)

# get pdf values for the plot x values
x = np.linspace(X.min() - 3, X.max() + 3, 1000)
logprob = kde.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)

# create rug plot
fig, ax = plt.subplots(figsize=(8, 4))
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    height=0.1,
    linewidth=2,
    expand_margins=False,
    zorder=2,
    ax=ax,
)

# add individual kernels
for x_i in X.ravel():
    kernel = norm.pdf(x, loc=x_i, scale=kde.bandwidth)
    ax.plot(
        x,
        kernel,
        color="lightblue",
        alpha=0.65,
        linestyle="--",
        zorder=1,
    )

# add the KDE
ax.plot(x, pdf, color="darkorange")
plt.show()

X, _ = make_blobs(
    n_samples=250,
    centers=2,
    n_features=1,
    cluster_std=[1.5, 3],
    random_state=42,
)
X[:10]
array([[-2.78768609],
       [ 6.09024112],
       [-1.62891198],
       [ 9.6636619 ],
       [-3.00069084],
       [-2.50152744],
       [-1.9456506 ],
       [10.69663971],
       [-1.90312134],
       [-3.56227726]])
fig, ax = plt.subplots()
sns.histplot(
    X.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
plt.show()

kde = KernelDensity(bandwidth=1)
_ = kde.fit(X)
# get pdf values for the plot x values
x = np.linspace(X.min(), X.max(), 1000)
logprob = kde.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)

fig, ax = plt.subplots()
sns.histplot(
    X.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
ax.plot(x, pdf, color="darkorange")
plt.show()

print(np.exp(kde.score_samples([[10]])))
print(np.mean(norm.pdf(10 - X.ravel())))
[0.07041698]
0.07041697626037091
# generate new samples from learned distribution
X_new = kde.sample(
    n_samples=250,
    random_state=42,
)
fig, ax = plt.subplots()
sns.histplot(
    X_new.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
plt.show()

Gaussian Mixture Models

gmm = GaussianMixture(n_components=2)
gmm.fit(X)
gmm.predict(X)
array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0])
gmm.predict_proba(X[:10])
array([[9.99809500e-01, 1.90499762e-04],
       [1.73596228e-08, 9.99999983e-01],
       [9.98965880e-01, 1.03411986e-03],
       [6.62325206e-17, 1.00000000e+00],
       [9.99852179e-01, 1.47821159e-04],
       [9.99724537e-01, 2.75463373e-04],
       [9.99381880e-01, 6.18119575e-04],
       [9.58091584e-20, 1.00000000e+00],
       [9.99339166e-01, 6.60834258e-04],
       [9.99917508e-01, 8.24920374e-05]])
gmm._estimate_log_prob(X[:10])
array([[ -1.26869862,  -9.84679292],
       [-20.38619544,  -2.52950181],
       [ -1.51297646,  -8.3985716 ],
       [-39.27823941,  -2.03730453],
       [ -1.29732827, -10.12911304],
       [ -1.26618292,  -9.47538851],
       [ -1.37908791,  -8.77972343],
       [-45.93686441,  -2.15736095],
       [ -1.39413066,  -8.72790231],
       [ -1.48228032, -10.89743182]])
gmm.weights_
array([0.49689374, 0.50310626])
gmm.means_
array([[-2.62713984],
       [ 9.12611073]])
gmm.covariances_
array([[[1.98684369]],

       [[9.06906997]]])
np.sum(np.exp(gmm._estimate_log_prob(X[:10])) * gmm.weights_, axis=1)
array([0.13975181, 0.04009692, 0.10955584, 0.06559483, 0.1358017 ,
       0.14011574, 0.12519927, 0.05817411, 0.12333529, 0.11286342])
np.exp(gmm.score_samples(X[:10]))
array([0.13975181, 0.04009692, 0.10955584, 0.06559483, 0.1358017 ,
       0.14011574, 0.12519927, 0.05817411, 0.12333529, 0.11286342])
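
_estimate_log_prob is a private method, so the same check can be made with the public attributes: the mixture density is a weighted sum of the component normal densities, \(\sum_k w_k \, \mathcal{N}(x \mid \mu_k, \sigma_k^2)\). A sketch using the fitted weights, means, and (one-dimensional) covariances:

# mixture pdf from the public attributes
means = gmm.means_.ravel()
stds = np.sqrt(gmm.covariances_).ravel()
np.sum(gmm.weights_ * norm.pdf(X[:10], loc=means, scale=stds), axis=1)

This should reproduce the np.exp(gmm.score_samples(X[:10])) values above.
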
# get pdf values for the plot x values
x = np.linspace(X.min(), X.max(), 1000)
logprob = gmm.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)

# create plot
fig, ax = plt.subplots()
sns.histplot(
    X.ravel(),
    bins=20,
    stat="density",
    color="lightgrey",
    ax=ax,
)
sns.rugplot(
    X.ravel(),
    color="dodgerblue",
    ax=ax,
)
ax.plot(x, pdf, color="darkorange")
plt.show()

gmm.sample(n_samples=10)
(array([[-4.91697739],
        [-1.54392333],
        [-2.33796974],
        [-2.59626579],
        [ 9.94695185],
        [14.05432476],
        [10.7947898 ],
        [ 8.91455004],
        [ 8.26899985],
        [10.46576932]]),
 array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]))
X, _ = make_blobs(
    n_samples=500,
    n_features=2,
    cluster_std=[0.5, 2, 5],
    random_state=3,
)
X.shape
(500, 2)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()

km3 = KMeans(n_clusters=3)
km3.fit(X)
clusters = km3.predict(X)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()

gmm = GaussianMixture(n_components=3)
gmm.fit(X)
clusters = gmm.predict(X)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(clusters),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
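
K-Means implicitly assumes roughly spherical clusters with similar spread, while a Gaussian mixture learns a separate mean, covariance matrix, and weight for each component, so it can adapt to blobs with very different standard deviations like those generated above. The learned component covariances can be inspected directly:

# one full covariance matrix per component (shape: n_components x 2 x 2)
gmm.covariances_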

Outlier Detection

X, _ = make_blobs(
    n_samples=500,
    centers=3,
    n_features=2,
    cluster_std=1.5,
    random_state=42,
)
outliers = np.random.RandomState(42).uniform(
    low=-20,
    high=20,
    size=(25, 2),
)
X = np.vstack((X, outliers))
X
array([[ -5.1557172 ,  -7.93487423],
       [  0.59454887,   1.89172738],
       [  7.92458324,   0.76072226],
       ...,
       [ -9.64880074,   6.50089137],
       [ -7.53155696,   0.80272085],
       [  1.86841117, -12.60582178]], shape=(525, 2))
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()

iso = IsolationForest(
    random_state=42,
)
_ = iso.fit(X)
inout = iso.predict(X)
inout
array([ 1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(inout),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()
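
By default, IsolationForest uses contamination="auto", which sets the decision threshold on the anomaly score itself rather than flagging a fixed proportion of points. The continuous scores are available through decision_function (negative values are predicted as outliers), which can help pick a contamination value before refitting, as is done next. A sketch using the forest fit above:

# anomaly scores: the more negative, the more isolated the point
scores = iso.decision_function(X)

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(scores, bins=30, color="lightgrey", ax=ax)
ax.axvline(0, color="darkorange", linestyle="--")
ax.set_xlabel("Isolation Forest Decision Score")
plt.show()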

iso = IsolationForest(
    contamination=0.025,
    random_state=42,
)
_ = iso.fit(X)
inout = iso.predict(X)
inout
array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
       -1,  1,  1,  1, -1, -1, -1, -1, -1,  1, -1, -1,  1,  1,  1])
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=pd.Categorical(inout),
    ax=ax,
)
plt.xlabel(r"$x_1$")
plt.ylabel(r"$x_2$")
plt.show()