# Cross-Validation

In [None]:
# basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

# model imports
from sklearn.neighbors import KNeighborsClassifier

# metric imports
from sklearn.metrics import accuracy_score

# model selection imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# preprocessing imports
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

# data imports
from palmerpenguins import load_penguins

## Palmer Penguins

In [None]:
# load the Palmer Penguins dataset (via palmerpenguins.load_penguins)
penguins = load_penguins()

In [None]:
# hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
penguins_train, penguins_test = train_test_split(penguins, test_size=0.20, random_state=42)

In [None]:
# show the training split; as the cell's last expression it is rendered as output
penguins_train

In [None]:
# bill length vs. bill depth on the training split, colored by species
plot = sns.jointplot(
    data=penguins_train, x="bill_length_mm", y="bill_depth_mm", hue="species",
    edgecolor="k", alpha=0.75, space=0,
)

# human-readable axis labels and legend for the joint (central) axes
plot.set_axis_labels(xlabel="Bill Length (mm)", ylabel="Bill Depth (mm)")
plot.ax_joint.legend(title="Species", loc="lower left")

plt.show()

In [None]:
# split the training data again: 80% "validation-train", 20% validation
# (same seed as the outer split for reproducibility)
penguins_vtrain, penguins_validation = train_test_split(penguins_train, test_size=0.20, random_state=42)

In [None]:
# column to predict
target = "species"

# predictor columns, grouped by the preprocessing they receive
numeric_features = ["bill_length_mm", "bill_depth_mm", "body_mass_g"]
categorical_features = ["sex"]

# all predictors: numeric columns first, then categorical
features = [*numeric_features, *categorical_features]

In [None]:
# predictors / target for the full training split and for the held-out test split
X_train, y_train = penguins_train[features], penguins_train[target]
X_test, y_test = penguins_test[features], penguins_test[target]

In [None]:
# predictors / target for the validation-train and validation splits
X_vtrain, y_vtrain = penguins_vtrain[features], penguins_vtrain[target]
X_validation, y_validation = penguins_validation[features], penguins_validation[target]

In [None]:
# numeric columns: fill missing values with the median, then standardize
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# categorical columns: fill missing values with the most frequent level, then one-hot encode
categorical_transformer = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder())

# route each group of columns to its transformer; any other column is dropped
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="drop",
)

In [None]:
# candidate values of k (number of neighbors) to compare
param_grid = [1, 5, 10, 15, 20, 25, 50]

# validation accuracy of each candidate, collected in the same order as param_grid
validation_accuracy = []

In [None]:
# Score every candidate k on the validation split.
#
# The accumulator is (re)created here, in the same cell that fills it, so the
# cell is idempotent: re-running it would otherwise keep appending to the
# existing list, leaving more scores than candidates in param_grid and breaking
# any later step that pairs the two up.
validation_accuracy = []
for k in param_grid:
    # preprocessing + k-nearest-neighbors classifier for this candidate
    mod = make_pipeline(
        preprocessor,
        KNeighborsClassifier(n_neighbors=k),
    )
    # fit on the validation-train split only; the validation split stays unseen
    mod.fit(X_vtrain, y_vtrain)
    y_pred = mod.predict(X_validation)
    # scores are stored in the same order as param_grid
    validation_accuracy.append(accuracy_score(y_validation, y_pred))

In [None]:
# tabulate each candidate k next to its validation accuracy
validation_results = pd.DataFrame({"k": param_grid, "Accuracy": validation_accuracy})

# plain-text table followed by a blank line
print(validation_results)
print()

In [None]:
# Pick the k with the highest validation accuracy. np.argmax returns the first
# maximum it encounters, so searching the reversed arrays makes ties resolve
# to the largest k among the best-scoring candidates.
ks_reversed = np.flip(param_grid)
accuracy_reversed = np.flip(validation_accuracy)
k_best = ks_reversed[np.argmax(accuracy_reversed)]

In [None]:
# rebuild the pipeline with the selected k ...
final_model = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=k_best))

# ... and refit it on the full training data (validation-train + validation rows)
final_model.fit(X_train, y_train)

In [None]:
# evaluate the final model on the held-out test split
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

# report the test accuracy
print(f"Test Accuracy: {test_accuracy}")