# Preprocessing

In [None]:
# basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning imports
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# preprocessing imports
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# data imports
from palmerpenguins import load_penguins

## Simulated Data

In [None]:
def simulate_mixed_data(n_samples=10, missing_rate=0.2, random_state=42):
    """Simulate a small DataFrame with mixed feature types and missing values.

    The frame has four feature columns followed by a target column:

    - ``num_big``: normal draws with mean 1000 and standard deviation 100
    - ``num_small``: normal draws with mean 0 and standard deviation 1
    - ``cat_str``: categories drawn from ``"A"``, ``"B"``, ``"C"``
    - ``cat_num``: categories drawn from ``0`` and ``1``
    - ``target``: labels drawn from ``0`` and ``1`` (never set to missing)

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    missing_rate : float
        Per-cell probability that a feature value is replaced with NaN.
    random_state : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    pandas.DataFrame
        The simulated data. An integer feature column that receives at least
        one missing value is returned as float, since NaN has no integer
        representation.

    Notes
    -----
    This function reseeds NumPy's *global* random number generator, so it
    also affects later ``np.random`` calls made by the caller.
    """
    # set random seed for reproducibility
    np.random.seed(random_state)

    # create DataFrame with mixed data types
    df = pd.DataFrame(
        {
            "num_big": np.random.normal(1000, 100, n_samples),
            "num_small": np.random.normal(0, 1, n_samples),
            "cat_str": np.random.choice(["A", "B", "C"], n_samples),
            "cat_num": np.random.choice([0, 1], n_samples),
            "target": np.random.choice([0, 1], n_samples),
        }
    )

    # add missing values to feature columns only (the last column is the target)
    for col in df.columns[:-1]:
        # always draw the mask so the random stream does not depend on the data
        missing_mask = np.random.random(n_samples) < missing_rate
        if missing_mask.any():
            # integer columns cannot hold NaN; cast explicitly rather than
            # relying on pandas' deprecated silent upcast during assignment
            if pd.api.types.is_integer_dtype(df[col]):
                df[col] = df[col].astype(float)
            df.loc[missing_mask, col] = np.nan

    # return the DataFrame with mixed feature types
    return df

## Palmer Penguins

In [None]:
# load the Palmer Penguins data via the palmerpenguins package
penguins = load_penguins()

In [None]:
# hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
penguins_train, penguins_test = train_test_split(penguins, test_size=0.20, random_state=42)

In [None]:
# display the training split (a notebook renders the last expression in a cell)
penguins_train

In [None]:
# joint plot of bill length against bill depth on the training split,
# colored by species, with no gap between the joint and marginal axes
plot = sns.jointplot(
    x="bill_length_mm", y="bill_depth_mm", hue="species",
    data=penguins_train,
    space=0,
    alpha=0.75, edgecolor="k",
)

# readable axis labels in place of the raw column names
plot.set_axis_labels("Bill Length (mm)", "Bill Depth (mm)")

# titled legend on the joint axes, placed in the lower-left corner
plot.ax_joint.legend(loc="lower left", title="Species")

plt.show()