# basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# machine learning imports
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# preprocessing imports
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
# data imports
from palmerpenguins import load_penguins
In these notes, we will cover various preprocessing techniques and their applications, including:
- Numeric Scaling: Standardizing numeric features to have a mean of 0 and a standard deviation of 1.
- Categorical Encoding: Converting categorical features into (multiple) numeric features using one-hot encoding.
- Imputation: Handling missing data by replacing missing values.
- Pipelines: Combining preprocessing steps and machine learning models for seamless application.
We will first illustrate each technique on simulated data, then bring them all together in a complete example using the Palmer Penguins dataset.
Motivation
# define sample size
n_samples = 10

# set random seed for reproducibility
np.random.seed(42)

# create feature variables of different types
num_feat_big = np.random.normal(loc=1000, scale=100, size=n_samples)
num_feat_small = np.random.normal(loc=0, scale=1, size=n_samples)
cat_feat_letters = np.random.choice(["A", "B", "C"], size=n_samples)
cat_feat_binary = np.random.choice([0, 1], size=n_samples)
target_binary = np.random.choice([0, 1], size=n_samples)

pd.DataFrame(
    {
        "num_feat_big": num_feat_big,
        "num_feat_small": num_feat_small,
        "cat_feat_letters": cat_feat_letters,
        "cat_feat_binary": cat_feat_binary,
        "target": target_binary,
    }
)
| | num_feat_big | num_feat_small | cat_feat_letters | cat_feat_binary | target |
|---|---|---|---|---|---|
0 | 1049.671415 | -0.463418 | B | 0 | 1 |
1 | 986.173570 | -0.465730 | B | 0 | 0 |
2 | 1064.768854 | 0.241962 | C | 0 | 1 |
3 | 1152.302986 | -1.913280 | B | 0 | 0 |
4 | 976.584663 | -1.724918 | C | 1 | 1 |
5 | 976.586304 | -0.562288 | C | 1 | 1 |
6 | 1157.921282 | -1.012831 | A | 0 | 1 |
7 | 1076.743473 | 0.314247 | C | 1 | 0 |
8 | 953.052561 | -0.908024 | A | 1 | 1 |
9 | 1054.256004 | -1.412304 | C | 1 | 0 |
Why do we need to preprocess data? Two reasons:
- Preprocessing is done to make modeling possible.
- Preprocessing is done to improve model performance.
Consider the data frame above. We have numeric features with different scales. We also have two categorical features, one that is currently encoded as letters, that is, as strings. There are none here, but we could also have missing values that need to be imputed.
Now consider modeling this data with \(k\)-nearest neighbors.
Until we have dealt with any missing data, we cannot fit the model. Additionally, something will need to be done to the cat_feat_letters
variable, as we cannot include values like “B” in distance calculations. These transformations would make modeling possible.
Notice that the two numeric features are on different scales. We could consider scaling them, thus putting them on the same scale. This is an example of a transformation that could improve model performance. (It isn’t a guarantee, but it could help.)
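As a rough sketch of the scale problem, consider the Euclidean distance between the first two simulated rows: the difference in num_feat_big is vastly larger than the difference in num_feat_small, so it dominates the distance almost entirely.

# rough sketch: the raw Euclidean distance between rows 0 and 1 is driven
# almost entirely by num_feat_big, since its values are orders of magnitude larger
point_a = np.array([num_feat_big[0], num_feat_small[0]])
point_b = np.array([num_feat_big[1], num_feat_small[1]])
print(np.linalg.norm(point_a - point_b))
print(np.abs(num_feat_big[0] - num_feat_big[1]))

The two printed values should be nearly identical, which is exactly the issue: num_feat_small barely contributes to the distance at all.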
Numeric Scaling
df_numeric = pd.DataFrame(
    {
        "num_feat_big": num_feat_big,
        "num_feat_small": num_feat_small,
    }
)
df_numeric
| | num_feat_big | num_feat_small |
|---|---|---|
0 | 1049.671415 | -0.463418 |
1 | 986.173570 | -0.465730 |
2 | 1064.768854 | 0.241962 |
3 | 1152.302986 | -1.913280 |
4 | 976.584663 | -1.724918 |
5 | 976.586304 | -0.562288 |
6 | 1157.921282 | -1.012831 |
7 | 1076.743473 | 0.314247 |
8 | 953.052561 | -0.908024 |
9 | 1054.256004 | -1.412304 |
standard_scaler = StandardScaler()
_ = standard_scaler.fit(df_numeric)
print(standard_scaler.transform(df_numeric))
[[ 0.07093253 0.45668015]
[-0.85481899 0.45345355]
[ 0.29104199 1.44107232]
[ 1.56722474 -1.56667381]
[-0.99461815 -1.30380487]
[-0.99459421 0.31870247]
[ 1.64913533 -0.31005311]
[ 0.46562306 1.54194965]
[-1.33769874 -0.16378976]
[ 0.13777244 -0.86753659]]
print(standard_scaler.mean_)
print(standard_scaler.scale_)
[ 1.04480611e+03 -7.90658235e-01]
[68.59059303 0.71656397]
arr = standard_scaler.transform(df_numeric)
col_mean = np.mean(arr, axis=0)
col_sd = np.std(arr, axis=0)

print(f"Column means: {col_mean}")
print(f"Column standard deviations: {col_sd}")
Column means: [ 1.49880108e-15 -4.44089210e-17]
Column standard deviations: [1. 1.]
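Equivalently, we can reproduce the standardized values by hand from the fitted attributes, since the transformation is just \(z = (x - \bar{x}) / s\) applied column by column.

# quick check: transform() is (x - mean_) / scale_, applied column by column
manual = (df_numeric - standard_scaler.mean_) / standard_scaler.scale_
print(np.allclose(manual, standard_scaler.transform(df_numeric)))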
Categorical Encoding
df_categorical = pd.DataFrame(
    {
        "cat_feat_letters": cat_feat_letters,
        "cat_feat_binary": cat_feat_binary,
    }
)
df_categorical
| | cat_feat_letters | cat_feat_binary |
|---|---|---|
0 | B | 0 |
1 | B | 0 |
2 | C | 0 |
3 | B | 0 |
4 | C | 1 |
5 | C | 1 |
6 | A | 0 |
7 | C | 1 |
8 | A | 1 |
9 | C | 1 |
one_hot_encoder = OneHotEncoder(
    handle_unknown="infrequent_if_exist",
)
_ = one_hot_encoder.fit(df_categorical)
print(one_hot_encoder.transform(df_categorical).todense())
[[0. 1. 0. 1. 0.]
[0. 1. 0. 1. 0.]
[0. 0. 1. 1. 0.]
[0. 1. 0. 1. 0.]
[0. 0. 1. 0. 1.]
[0. 0. 1. 0. 1.]
[1. 0. 0. 1. 0.]
[0. 0. 1. 0. 1.]
[1. 0. 0. 0. 1.]
[0. 0. 1. 0. 1.]]
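To see which encoded column corresponds to which original category, the fitted encoder can report its output column names (get_feature_names_out is available in recent versions of scikit-learn).

# map the five encoded columns back to the original categories
print(one_hot_encoder.get_feature_names_out())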
dummy_encoder = OneHotEncoder(
    handle_unknown="infrequent_if_exist",
    drop="first",
)
_ = dummy_encoder.fit(df_categorical)
print(dummy_encoder.transform(df_categorical).todense())
[[1. 0. 0.]
[1. 0. 0.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 1.]
[0. 1. 1.]
[0. 0. 0.]
[0. 1. 1.]
[0. 0. 1.]
[0. 1. 1.]]
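As a sketch of what handle_unknown="infrequent_if_exist" buys us, we can transform a hypothetical level "D" that was never seen during fitting. Because no infrequent category was configured here, the encoder should fall back to ignore-style behavior, so the unseen level should encode as all zeros in the cat_feat_letters columns (a warning about unknown categories may be emitted).

# hypothetical unseen level "D": with no infrequent category configured,
# it should map to all zeros in the cat_feat_letters columns
new_row = pd.DataFrame({"cat_feat_letters": ["D"], "cat_feat_binary": [0]})
print(one_hot_encoder.transform(new_row).todense())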
Imputation
Numeric Features
1, 2], "num_feat_big"] = np.nan
df_numeric.loc[[ df_numeric
| | num_feat_big | num_feat_small |
|---|---|---|
0 | 1049.671415 | -0.463418 |
1 | NaN | -0.465730 |
2 | NaN | 0.241962 |
3 | 1152.302986 | -1.913280 |
4 | 976.584663 | -1.724918 |
5 | 976.586304 | -0.562288 |
6 | 1157.921282 | -1.012831 |
7 | 1076.743473 | 0.314247 |
8 | 953.052561 | -0.908024 |
9 | 1054.256004 | -1.412304 |
= SimpleImputer(strategy="median")
simple_imputer = simple_imputer.fit(df_numeric)
_ print(simple_imputer.transform(df_numeric))
[[ 1.04967142e+03 -4.63417693e-01]
[ 1.05196371e+03 -4.65729754e-01]
[ 1.05196371e+03 2.41962272e-01]
[ 1.15230299e+03 -1.91328024e+00]
[ 9.76584663e+02 -1.72491783e+00]
[ 9.76586304e+02 -5.62287529e-01]
[ 1.15792128e+03 -1.01283112e+00]
[ 1.07674347e+03 3.14247333e-01]
[ 9.53052561e+02 -9.08024076e-01]
[ 1.05425600e+03 -1.41230370e+00]]
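The fill values the imputer learned are stored in its statistics_ attribute; for strategy="median" they should match the column medians computed while ignoring the missing entries.

# learned fill values vs. medians computed with the NaNs ignored
print(simple_imputer.statistics_)
print(np.nanmedian(df_numeric, axis=0))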
Categorical Features
3, 4], "cat_feat_letters"] = np.nan
df_categorical.loc[[ df_categorical
| | cat_feat_letters | cat_feat_binary |
|---|---|---|
0 | B | 0 |
1 | B | 0 |
2 | C | 0 |
3 | NaN | 0 |
4 | NaN | 1 |
5 | C | 1 |
6 | A | 0 |
7 | C | 1 |
8 | A | 1 |
9 | C | 1 |
= SimpleImputer(strategy="most_frequent")
simple_imputer = simple_imputer.fit(df_categorical)
_ print(simple_imputer.transform(df_categorical))
[['B' 0]
['B' 0]
['C' 0]
['C' 0]
['C' 1]
['C' 1]
['A' 0]
['C' 1]
['A' 1]
['C' 1]]
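Again, the learned fill values are available in statistics_; with strategy="most_frequent" these are the per-column modes.

# most frequent value per column, used to fill the NaNs above
print(simple_imputer.statistics_)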
Pipelines
df_all = pd.concat([df_numeric, df_categorical], axis=1)
df_all["target"] = target_binary
df_all
| | num_feat_big | num_feat_small | cat_feat_letters | cat_feat_binary | target |
|---|---|---|---|---|---|
0 | 1049.671415 | -0.463418 | B | 0 | 1 |
1 | NaN | -0.465730 | B | 0 | 0 |
2 | NaN | 0.241962 | C | 0 | 1 |
3 | 1152.302986 | -1.913280 | NaN | 0 | 0 |
4 | 976.584663 | -1.724918 | NaN | 1 | 1 |
5 | 976.586304 | -0.562288 | C | 1 | 1 |
6 | 1157.921282 | -1.012831 | A | 0 | 1 |
7 | 1076.743473 | 0.314247 | C | 1 | 0 |
8 | 953.052561 | -0.908024 | A | 1 | 1 |
9 | 1054.256004 | -1.412304 | C | 1 | 0 |
= ["num_feat_big", "num_feat_small"]
numeric_features = ["cat_feat_letters", "cat_feat_binary"]
categorical_features = numeric_features + categorical_features
features = "target" target
= df_all[features]
X = df_all[target] y
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="infrequent_if_exist"),
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="drop",
)
preprocessor.fit(X)
preprocessor.transform(X)
array([[-0.0066037 , 0.45668015, 0. , 1. , 0. ,
1. , 0. ],
[ 0.02834041, 0.45345355, 0. , 1. , 0. ,
1. , 0. ],
[ 0.02834041, 1.44107232, 0. , 0. , 1. ,
1. , 0. ],
[ 1.55792867, -1.56667381, 0. , 0. , 1. ,
1. , 0. ],
[-1.12075006, -1.30380487, 0. , 0. , 1. ,
0. , 1. ],
[-1.12072503, 0.31870247, 0. , 0. , 1. ,
0. , 1. ],
[ 1.64357488, -0.31005311, 1. , 0. , 0. ,
1. , 0. ],
[ 0.40608715, 1.54194965, 0. , 0. , 1. ,
0. , 1. ],
[-1.47947724, -0.16378976, 1. , 0. , 0. ,
0. , 1. ],
[ 0.06328452, -0.86753659, 0. , 0. , 1. ,
0. , 1. ]])
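In recent versions of scikit-learn, the fitted column transformer can also report the names of the columns it produces, which makes the array above easier to read.

# names of the transformed columns, in the same order as the array above
# (requires a scikit-learn version where every step implements get_feature_names_out)
print(preprocessor.get_feature_names_out())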
mod = make_pipeline(
    preprocessor,
    KNeighborsClassifier(n_neighbors=5),
)
mod.fit(X, y)
mod.predict(X)
array([1, 1, 1, 0, 1, 1, 0, 1, 1, 1])
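These predictions are made on the same rows used to fit the model, so any accuracy computed from them is optimistic; we compute it here only to confirm that the full pipeline runs end to end. Proper evaluation with held-out data follows in the example below.

# resubstitution accuracy: optimistic, shown only to confirm the pipeline runs end to end
print(accuracy_score(y, mod.predict(X)))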
Example: Palmer Penguins
penguins = load_penguins()
penguins_train, penguins_test = train_test_split(
    penguins,
    test_size=0.35,
    random_state=42,
)
penguins_train
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
|---|---|---|---|---|---|---|---|---|
292 | Chinstrap | Dream | 50.3 | 20.0 | 197.0 | 3300.0 | male | 2007 |
302 | Chinstrap | Dream | 50.5 | 18.4 | 200.0 | 3400.0 | female | 2008 |
56 | Adelie | Biscoe | 39.0 | 17.5 | 186.0 | 3550.0 | female | 2008 |
271 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN | 2009 |
10 | Adelie | Torgersen | 37.8 | 17.1 | 186.0 | 3300.0 | NaN | 2007 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
188 | Gentoo | Biscoe | 42.6 | 13.7 | 213.0 | 4950.0 | female | 2008 |
71 | Adelie | Torgersen | 39.7 | 18.4 | 190.0 | 3900.0 | male | 2008 |
106 | Adelie | Biscoe | 38.6 | 17.2 | 199.0 | 3750.0 | female | 2009 |
270 | Gentoo | Biscoe | 47.2 | 13.7 | 214.0 | 4925.0 | female | 2009 |
102 | Adelie | Biscoe | 37.7 | 16.0 | 183.0 | 3075.0 | female | 2009 |
223 rows × 8 columns
plot = sns.jointplot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    hue="species",
    space=0,
    zorder=2,
)
plot.set_axis_labels(
    xlabel="Bill Length (mm)",
    ylabel="Bill Depth (mm)",
)
plot.figure.suptitle(
    t="Palmer Penguins",
    y=1.02,
)
plot.ax_joint.legend(
    title="Species",
    loc="lower left",
)
plot.ax_joint.grid(
    color="lightgrey",
    linestyle="--",
    linewidth=0.75,
    zorder=1,
)
plot.figure.set_size_inches(
    w=8,
    h=8,
)
We further split the training data into a smaller training set and a validation set, which we will use to select \(k\).

penguins_vtrain, penguins_validation = train_test_split(
    penguins_train,
    test_size=0.35,
    random_state=42,
)
For this example, we use three numeric features and one categorical feature, with species as the target.

numeric_features = [
    "bill_length_mm",
    "bill_depth_mm",
    "body_mass_g",
]
categorical_features = ["sex"]
features = numeric_features + categorical_features
target = "species"

X_train = penguins_train[features]
y_train = penguins_train["species"]

X_test = penguins_test[features]
y_test = penguins_test["species"]

X_vtrain = penguins_vtrain[features]
y_vtrain = penguins_vtrain["species"]

X_validation = penguins_validation[features]
y_validation = penguins_validation["species"]
# define preprocessing for numeric features
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# define preprocessing for categorical features
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="infrequent_if_exist"),
)

# create general preprocessor
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="drop",
)
# try many values of k for knn
validation_accuracy_scores = []
param_grid = [1, 5, 10, 15, 20, 25, 50, 100]
for k in param_grid:
    mod = make_pipeline(
        preprocessor,
        KNeighborsClassifier(n_neighbors=k),
    )
    mod.fit(X_vtrain, y_vtrain)
    y_pred = mod.predict(X_validation)
    validation_accuracy_score = accuracy_score(y_validation, y_pred)
    validation_accuracy_scores.append(validation_accuracy_score)

# get best k from validation process
k_best = param_grid[np.argmax(validation_accuracy_scores)]

# arrange and print validation results
validation_results = pd.DataFrame(
    {
        "k": param_grid,
        "Accuracy": validation_accuracy_scores,
    }
)
print(validation_results)
print(f"")
k Accuracy
0 1 0.974684
1 5 0.974684
2 10 0.962025
3 15 0.962025
4 20 0.962025
5 25 0.962025
6 50 0.911392
7 100 0.746835
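Note that \(k = 1\) and \(k = 5\) are tied for the best validation accuracy; because np.argmax returns the first index of the maximum, k_best resolves to the smaller of the tied values. We can confirm by printing it.

# np.argmax breaks the tie in favor of the first (smallest) k in param_grid
print(k_best)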
# fit the model with the best k on the full training data
final_model = make_pipeline(
    preprocessor,
    KNeighborsClassifier(n_neighbors=k_best),
)
final_model.fit(X_train, y_train)

# predict on the test data
y_test_pred = final_model.predict(X_test)

# calculate and print the test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")
Test Accuracy: 0.9834710743801653
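As an optional extra check, a confusion matrix gives a per-species breakdown of the test predictions rather than a single accuracy number.

# per-species breakdown of the test-set predictions
from sklearn.metrics import confusion_matrix
print(final_model.classes_)
print(confusion_matrix(y_test, y_test_pred, labels=final_model.classes_))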