# basics
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# machine learning
from sklearn.datasets import make_blobs, make_circles
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import accuracy_score
Binary Classification Setup
Logistic regression is a model for a binary target. Recall that in a binary setting, we often encode the target as \(Y \in \{0, 1\}\). Because we are mostly concerned with \(P[Y = 1 \mid \pmb{X} = \pmb{x}]\), we define a useful shorthand notation.
\[ p(\pmb{x}) = P[Y = 1 \mid \pmb{X} = \pmb{x}] \]
Because the two conditional probabilities must sum to one,
\[ P[Y = 0 \mid \pmb{X} = \pmb{x}] + P[Y = 1 \mid \pmb{X} = \pmb{x}] = 1 \]
the probability of the other class follows immediately:
\[ 1 - p(\pmb{x}) = P[Y = 0 \mid \pmb{X} = \pmb{x}] \]
Logistic Regression Model
The logistic regression model assumes that the log-odds of \(Y = 1\) are a linear function of the features:
\[ \log\left(\frac{p(\pmb{x})}{1 - p(\pmb{x})}\right) = \beta_0 + \beta_1 x_1 + \ldots + \beta_p x_p \]
Here the quantity inside the logarithm is the odds of \(Y = 1\) versus \(Y = 0\):
\[ \frac{p(\pmb{x})}{1 - p(\pmb{x})} = \frac{P[Y = 1 \mid \pmb{X} = \pmb{x}]}{P[Y = 0 \mid \pmb{X} = \pmb{x}]} \]
Solving for \(p(\pmb{x})\) gives
\[ p(\pmb{x}) = P[Y = 1 \mid \pmb{X} = \pmb{x}] = \frac{e^{\beta_0 + \beta_1 x_{1} + \cdots + \beta_p x_p}}{1 + e^{\beta_0 + \beta_1 x_{1} + \cdots + \beta_p x_p}} \]
Two functions are worth naming. The logit function maps a probability to its log-odds,
\[ \text{logit}(\xi) = \log\left(\frac{\xi}{1 - \xi}\right) \]
and its inverse, the sigmoid (logistic) function, maps the real line back to \((0, 1)\):
\[ \sigma(\xi) = \text{logit}^{-1}(\xi) = \frac{e^\xi}{1 + e^{\xi}} = \frac{1}{1 + e^{-\xi}} \]
Writing the linear predictor as
\[ \eta(\pmb{x}) = \beta_0 + \beta_1 x_1 + \ldots + \beta_p x_p \]
the model can be stated compactly as
\[ \log\left(\frac{p(\pmb{x})}{1 - p(\pmb{x})}\right) = \eta(\pmb{x}) \]
or, equivalently,
\[ p(\pmb{x}) = \sigma(\eta(\pmb{x})) = \frac{e^{\eta(\pmb{x})}}{1 + e^{\eta(\pmb{x})}} = \frac{1}{1 + e^{-\eta(\pmb{x})}} = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \ldots + \beta_p x_p)}} \]
Altogether, logistic regression assumes
\[ Y \mid \pmb{X} = \pmb{x} \sim \text{Bernoulli}(p(\pmb{x})) \]
Examples in Python
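Before turning to scikit-learn, a minimal numpy sketch illustrates the logit and sigmoid functions and how a binary target arises from this model. The coefficient values below are made up purely for illustration, not estimates from any data.

# sketch: logit, sigmoid, and simulating Y | X = x ~ Bernoulli(p(x))
rng = np.random.default_rng(1)

def sigmoid(xi):
    # inverse of the logit, maps the real line to (0, 1)
    return 1 / (1 + np.exp(-xi))

def logit(xi):
    # log-odds of a probability
    return np.log(xi / (1 - xi))

# verify that logit and sigmoid are inverses on a grid of values
grid = np.linspace(-3, 3, 7)
assert np.allclose(logit(sigmoid(grid)), grid)

# simulate a single-feature example with made-up coefficients
beta_0, beta_1 = -1.0, 2.0
x = rng.normal(size=250)
p = sigmoid(beta_0 + beta_1 * x)  # p(x) = sigma(eta(x))
y_sim = rng.binomial(n=1, p=p)    # Y | X = x ~ Bernoulli(p(x))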
# helper function to convert numpy (two column) X and y to a pandas df
# useful for printing and plotting
def numpy_to_df(X, y):
    df = pd.DataFrame(
        {
            "x1": X[:, 0],
            "x2": X[:, 1],
            "y": y,
        }
    )
    return df
# helper function to create a custom decision region plot
def plot_decision_regions(mod, X, y):

    # define colors for points
    illini_cmap = ListedColormap(
        [
            "#1D58A7",  # illini industrial blue
            "#FF5F05",  # illini orange
        ]
    )

    # define colors for decision regions
    light_illini_cmap = ListedColormap(
        [
            "#BFC9D9",  # illini industrial blue (lighter)
            "#FCCFB5",  # a lighter orange
        ]
    )

    # create figure and axes
    _, ax = plt.subplots(figsize=(8, 6))

    # start with sklearn default plot
    DecisionBoundaryDisplay.from_estimator(
        mod,
        X,
        cmap=light_illini_cmap,
        ax=ax,
        grid_resolution=1000,
        response_method="predict",
        plot_method="pcolormesh",
        shading="auto",
        xlabel="$x_1$",
        ylabel="$x_2$",
    )

    # add points to plot
    for i, shape in enumerate(["o", "s"]):
        mask = y == i
        ax.scatter(
            X[mask, 0],
            X[mask, 1],
            color=illini_cmap(i),
            edgecolors="white",
            linewidth=1,
            marker=shape,
            label=f"Class {i}",
        )

    # add title and legend
    _ = ax.set_title("Logistic Regression Decision Boundary")
    _ = ax.legend(loc="best")

    # show plot
    plt.show()
# make "blob" data for classification
X, y = make_blobs(
    n_samples=200,
    centers=2,
    cluster_std=3.5,
    random_state=42,
)
# split the "blog" data
= train_test_split(
X_train, X_test, y_train, y_test
X,
y,=0.50,
test_size=42,
random_state
)
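A quick, optional sanity check of the class balance in the training split (output not shown):

# how many training observations fall in each class?
np.unique(y_train, return_counts=True)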
# temporarily create df for easy plotting and printing
train_df = numpy_to_df(X_train, y_train)
train_df
|     | x1    | x2    | y   |
|-----|-------|-------|-----|
| 0   | 5.62  | -0.21 | 1   |
| 1   | -5.98 | 7.03  | 0   |
| 2   | -5.62 | 6.16  | 0   |
| 3   | -0.43 | -2.95 | 1   |
| 4   | -8.14 | 9.66  | 0   |
| ... | ...   | ...   | ... |
| 95  | 7.98  | 4.25  | 1   |
| 96  | -6.70 | -1.61 | 1   |
| 97  | -3.18 | 10.07 | 0   |
| 98  | 2.32  | 4.96  | 1   |
| 99  | 12.11 | -4.86 | 1   |

100 rows × 3 columns
# (quickly) create a "joint" plot
_ = sns.jointplot(data=train_df, x="x1", y="x2", hue="y")
plt.show()
# initialize and fit logistic regression
logistic = LogisticRegression(penalty=None)
_ = logistic.fit(X_train, y_train)
# estimated coefficients
logistic.intercept_, logistic.coef_
(array([3.63406579]), array([[ 0.80226887, -0.69174668]]))
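These estimates can be plugged into the sigmoid formula above by hand; the result should match what predict_proba returns. A small verification sketch using the fitted objects above (the names eta_hat and p_hat are introduced here only for illustration):

# reconstruct P[Y = 1 | X = x] directly from the fitted coefficients
eta_hat = logistic.intercept_ + X_test @ logistic.coef_.T  # linear predictor
p_hat = 1 / (1 + np.exp(-eta_hat))                         # sigma(eta)
# should agree with the class 1 column of predict_proba
np.allclose(p_hat.ravel(), logistic.predict_proba(X_test)[:, 1])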
# estimated conditional probabilities (first 10 samples from test set)
np.round(logistic.predict_proba(X_test)[range(10)], 3)
array([[0.982, 0.018],
[0.628, 0.372],
[1. , 0. ],
[0.992, 0.008],
[1. , 0. ],
[0.798, 0.202],
[0.35 , 0.65 ],
[0. , 1. ],
[1. , 0. ],
[0.012, 0.988]])
# predictions (classifications) on the test data
logistic.predict(X_test)
array([0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0])
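These classifications are just the estimated conditional probabilities thresholded at 0.5, which can be checked directly with the fitted model above (a small sketch, output not shown):

# predict should agree with thresholding P[Y = 1 | X = x] at 0.5
p_hat = logistic.predict_proba(X_test)[:, 1]
np.array_equal(logistic.predict(X_test), (p_hat > 0.5).astype(int))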
# train accuracy
accuracy_score(y_train, logistic.predict(X_train))
# test accuracy
accuracy_score(y_test, logistic.predict(X_test))
0.92
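Equivalently, accuracy is just the proportion of correct classifications, so it can also be computed directly with numpy:

# test accuracy computed "by hand" as the proportion of correct predictions
np.mean(logistic.predict(X_test) == y_test)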
# create "second order" features (without bias/intercept, it will be "added" later)
poly = PolynomialFeatures(degree=2, include_bias=False)

# "learn" the transformation then apply it to the train and test data
_ = poly.fit(X_train)
X_train_trans = poly.transform(X_train)
X_test_trans = poly.transform(X_test)
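To see exactly which features the transformation created, get_feature_names_out (available in recent versions of scikit-learn) can be queried; the feature names passed in are only for readability:

# inspect the generated second-order features: x1, x2, x1^2, x1*x2, x2^2
poly.get_feature_names_out(["x1", "x2"])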
# initialize and fit logistic regression with higher order terms
logistic_higher = LogisticRegression(penalty=None)
_ = logistic_higher.fit(X_train_trans, y_train)
# estimated coefficients
logistic_higher.intercept_, logistic_higher.coef_
(array([6.73899122]),
array([[ 1.22657355, -1.90522748, 0.06187221, -0.0475074 , 0.07781114]]))
# train accuracy
accuracy_score(y_train, logistic_higher.predict(X_train_trans))
0.96
# test accuracy
accuracy_score(y_test, logistic_higher.predict(X_test_trans))
0.94
# refit the higher-order model using a pipeline
# this is helpful for plotting the decision boundary, and also better in practice
pipeline = Pipeline(
    [
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("logistic", LogisticRegression(penalty=None)),
    ]
)
pipeline.fit(X_train, y_train)
Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('logistic', LogisticRegression(penalty=None))])
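Because the fitted pipeline behaves like any other estimator with a predict method, it can be passed straight to the plotting helper defined earlier; a brief usage sketch (plot not shown here):

# visualize the decision regions of the higher-order model on the blob training data
plot_decision_regions(pipeline, X_train, y_train)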
# make and split "circle" data for classification
X, y = make_circles(
    n_samples=200,
    random_state=42,
    noise=0.05,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=42,
)
# temporarily create df for easy plotting
train_df = numpy_to_df(X_train, y_train)
train_df
|     | x1    | x2    | y   |
|-----|-------|-------|-----|
| 0   | -0.19 | 0.86  | 1   |
| 1   | -0.81 | -0.30 | 0   |
| 2   | 0.40  | -0.62 | 1   |
| 3   | -0.52 | 0.95  | 0   |
| 4   | -0.80 | 0.21  | 1   |
| ... | ...   | ...   | ... |
| 95  | 0.70  | 0.68  | 0   |
| 96  | 0.49  | -0.59 | 1   |
| 97  | 0.83  | 0.01  | 1   |
| 98  | 0.80  | -0.46 | 1   |
| 99  | -0.79 | 0.36  | 1   |

100 rows × 3 columns
# (quickly) create a "joint" plot
_ = sns.jointplot(data=train_df, x="x1", y="x2", hue="y")
# initialize and fit logistic regression
logistic = LogisticRegression(penalty=None)
_ = logistic.fit(X_train, y_train)
# plot decision region for initial model
plot_decision_regions(logistic, X_train, y_train)
# setup a higher-order model using a pipeline
pipeline = Pipeline(
    [
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("logistic", LogisticRegression(penalty=None)),
    ]
)
# fit a higher-order model using a pipeline
pipeline.fit(X_train, y_train)
Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('logistic', LogisticRegression(penalty=None))])
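As with the blob data, the fitted pipeline can be handed to the plotting helper, and its test accuracy can be obtained with score; a brief usage sketch (output not shown):

# decision regions and test accuracy for the higher-order model on the circle data
plot_decision_regions(pipeline, X_train, y_train)
pipeline.score(X_test, y_test)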