# basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# model imports
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
# metric imports
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import accuracy_score
# model selection imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# preprocessing imports
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
# data imports
from sklearn.datasets import make_friedman1
Model Flexibility
A model’s flexibility determines how well the model can learn the training data.
- A “flexible” model can learn “complex” patterns in the data, but is also more likely to overfit.
- An “inflexible” model is less likely to overfit, but may not be able to learn the true underlying patterns in the data.
Given a particular dataset, when fitting a model, you are essentially trying to find a model that is flexible enough to learn the underlying patterns in the data, but not so flexible that it learns the noise in the data.
How do we control a model’s flexibility? Our main tool for controlling a model’s flexibility is the available hyperparameters (tuning parameters).
Let’s investigate with \(k\) for \(k\)-nearest neighbors.
# simulate data: 1000 samples from the Friedman #1 regression problem,
# with a fixed seed so the results are reproducible
X, y = make_friedman1(
    n_samples=1000,
    noise=0.25,
    random_state=42,
)
# split the data: hold out 25% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# define range of k values to search over (odd values 1, 3, ..., 151)
k_values = range(1, 152, 2)

# initialize storage for train RMSE values
train_rmse = []

# initialize storage for test RMSE values
test_rmse = []
# fit models and calculate train and test RMSE for each value of k
for k in k_values:
    # initialize model, with the current k
    knn = KNeighborsRegressor(n_neighbors=k)

    # fit the model to the (validation) train data
    knn.fit(X_train, y_train)

    # get train predictions
    y_train_pred = knn.predict(X_train)

    # calculate (and store) train RMSE
    train_rmse.append(root_mean_squared_error(y_train, y_train_pred))

    # get test predictions
    y_test_pred = knn.predict(X_test)

    # calculate (and store) test RMSE
    test_rmse.append(root_mean_squared_error(y_test, y_test_pred))
# plot RMSE against k: one curve for train error, one for test error
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, train_rmse, marker="o", label="Train RMSE")
ax.plot(k_values, test_rmse, marker="o", label="Test RMSE")
ax.set_xlabel("k")
ax.set_ylabel("RMSE")
ax.set_title("k versus Error")
ax.legend()
plt.show()
How does \(k\) relate to model flexibility?
- A small \(k\) results in a more flexible model.
- A large \(k\) results in a less flexible model.
That is, as \(k\) increases, the model becomes less flexible.
How do we know this? Note that as \(k\) increases, the train RMSE increases. Said in reverse, as \(k\) decreases, the train RMSE decreases, and thus the model better learns the training data.
Overfitting
What is overfitting?
As the name subtly suggests, overfitting occurs when a model has learned too much. In particular, it has learned the training data so well, that beyond learning the underlying patterns in the data, it has also learned the noise in the data.
How do we know when overfitting has occurred?
- The model has a low training error, relative to training error for other models.
- The model has a high test error, relative to test error for other models.
Bias-Variance Tradeoff
There are three sources of error in a model for supervised learning:
- Bias
- Variance
- Noise
The bias and variance make up the reducible error in a model. The noise is also called the irreducible error.
The bias of a model is the error due to the model’s assumptions or lack of flexibility. It is systematic error due to the model’s inability to (fully) learn the true underlying patterns in the data.
The variance of a model is the error due to the model’s sensitivity to the training data. It is the error due to the model (partially) learning some noise in the training data, and thus the model would change if the training data were changed.
Both bias and variance are related to the model’s flexibility.
- As flexibility increases, bias decreases.
- As flexibility increases, variance increases.
def simulate_sin_data(n, sd, seed):
    """Simulate noisy observations of a sine wave.

    Parameters
    ----------
    n : int
        Number of samples to draw.
    sd : float
        Standard deviation of the Gaussian noise added to the signal.
    seed : int
        Seed passed to ``np.random.seed`` so the draw is reproducible.

    Returns
    -------
    X : ndarray of shape (n, 1)
        Inputs drawn uniformly from [-2*pi, 2*pi).
    y : ndarray of shape (n,)
        ``sin(X)`` plus zero-mean normal noise with scale ``sd``.
    """
    # NOTE: this seeds NumPy's global random state (a module-wide side effect)
    np.random.seed(seed)
    X = np.random.uniform(
        low=-2 * np.pi,
        high=2 * np.pi,
        size=(n, 1),
    )
    # flatten the signal so it matches the 1-D noise vector
    signal = np.sin(X).ravel()
    noise = np.random.normal(
        loc=0,
        scale=sd,
        size=n,
    )
    y = signal + noise
    return X, y
def plot_four_simulations_and_predictions(n_neighbors):
    """Plot four simulated sine datasets with a KNN fit on each.

    Draws a 2x2 grid. Each panel shows the true sine curve, one simulated
    dataset (seeds 0 through 3), and the predictions of a
    ``KNeighborsRegressor`` fit to that dataset.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbors (k) passed to ``KNeighborsRegressor``.
    """
    fig, ax = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle("Simulated Sine Wave Data")
    # dense grid of inputs, as a column vector, for drawing smooth curves
    x_plot = np.linspace(-2 * np.pi, 2 * np.pi, 1000).reshape((-1, 1))
    for i in range(2):
        for j in range(2):
            ax[i, j].plot(x_plot, np.sin(x_plot), label="True Signal")
            # a distinct seed per panel: 0, 1, 2, 3
            X, y = simulate_sin_data(n=100, sd=0.25, seed=i * 2 + j)
            ax[i, j].scatter(X, y, c="gray")
            knn = KNeighborsRegressor(n_neighbors=n_neighbors)
            knn.fit(X, y)
            ax[i, j].plot(
                x_plot,
                knn.predict(x_plot),
                label="Predicted Signal",
                linestyle="--",
            )
            ax[i, j].legend()
    plt.show()
def plot_many_simulations_and_predictions_with_average(n_neighbors):
    """Plot KNN fits from 25 simulated datasets together with their average.

    For seeds 0 through 24, simulates a sine dataset, fits a
    ``KNeighborsRegressor``, and draws its predictions as a faint gray
    curve. The pointwise mean of the 25 prediction curves is drawn on top,
    alongside the true sine signal.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbors (k) passed to ``KNeighborsRegressor``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    fig.suptitle("Simulated Sine Wave Data")
    # dense grid of inputs, as a column vector, for drawing smooth curves
    x_plot = np.linspace(-2 * np.pi, 2 * np.pi, 1000).reshape((-1, 1))
    ax.plot(x_plot, np.sin(x_plot), color="tab:blue", label="True Signal")
    predictions = []
    for seed in range(25):
        X, y = simulate_sin_data(n=100, sd=0.25, seed=seed)
        knn = KNeighborsRegressor(n_neighbors=n_neighbors)
        knn.fit(X, y)
        # predict once and reuse for both the plot and the running average
        y_plot = knn.predict(x_plot)
        ax.plot(x_plot, y_plot, color="gray", alpha=0.25)
        predictions.append(y_plot)
    # pointwise mean across the 25 fitted curves
    avg_prediction = np.mean(predictions, axis=0)
    ax.plot(
        x_plot,
        avg_prediction,
        color="tab:orange",
        linestyle="--",
        label="Average Prediction",
    )
    ax.legend()
    plt.show()