# Decision Trees for Classification

In [None]:
# basics
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Heart Disease Data

In [3]:
# Read the heart data from the remote parquet file into a DataFrame.
# Needs network access when the cell runs.
heart_url = "https://notes.cs307.org/data/heart.parquet"
heart = pd.read_parquet(heart_url)

In [4]:
# Split the frame into the response (the "num" column) and the features
# (every other column).
target_col = "num"
y = heart[target_col]
X = heart.drop(columns=[target_col])

In [5]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions similar in both pieces, and the fixed seed makes the split
# repeatable across runs.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# Preview the training features; pandas abbreviates the display to the first
# and last few rows. Note the missing values (NaN) in several columns.
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,location
343,53.0,1.0,2.0,120.0,0.0,0.0,0.0,95.0,0.0,0.0,2.0,,3.0,ch
743,74.0,1.0,3.0,,0.0,0.0,0.0,,,,,,,va
349,53.0,1.0,4.0,130.0,0.0,0.0,2.0,135.0,1.0,1.0,2.0,,7.0,ch
708,47.0,0.0,4.0,120.0,205.0,0.0,0.0,98.0,1.0,2.0,2.0,,6.0,hu
913,62.0,1.0,4.0,158.0,170.0,0.0,1.0,138.0,1.0,0.0,,,,va
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,cl
242,49.0,0.0,4.0,130.0,269.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,cl
55,54.0,1.0,4.0,124.0,266.0,0.0,2.0,109.0,1.0,2.2,2.0,1.0,7.0,cl
21,58.0,0.0,1.0,150.0,283.0,1.0,2.0,162.0,0.0,1.0,1.0,0.0,3.0,cl


In [None]:
# Feature columns grouped by how the preprocessor treats them.
# Order matters: it determines the order of the transformed output columns.

# Columns handled by the categorical (one-hot encoding) transformer.
cat_cols = [
    "location",
    "sex",
    "cp",
    "fbs",
    "exang",
    "slope",
    "thal",
    "restecg",
]

# Columns handled by the numeric (impute + scale) transformer.
num_cols = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
    "ca",
]

In [None]:
# Numeric transformer: fill missing values with the column median, then
# standardize each column.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical transformer: one-hot encode, dropping the first level of each
# feature and returning a dense array. No imputation step is applied to the
# categorical columns here.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(drop="first", sparse_output=False)),
    ]
)

# Column transformer: route each group of columns to its transformer.
# Any column not named in num_cols or cat_cols is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="passthrough",
)

In [None]:
# Build and tune the model that the rest of the notebook refers to as `mod`.
# Without this step `mod` is undefined and the notebook fails on
# Restart & Run All. The step names must stay "preprocessor" and "dt"
# because the lookups below (via `named_steps`) rely on them.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("dt", DecisionTreeClassifier(random_state=42)),
    ]
)

# Starting grid for the tree's complexity; adjust as needed. Depth is kept
# small so the plotted tree stays readable.
param_grid = {
    "dt__max_depth": [2, 3, 4, 5, 6],
    "dt__min_samples_leaf": [1, 5, 10, 20],
}

# 5-fold cross-validated search; the best pipeline is refit on all of the
# training data and exposed as `mod.best_estimator_`.
mod = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy")
mod.fit(X_train, y_train)

# Pull the fitted tree and the post-preprocessing feature names out of the
# selected pipeline so the plot can label each split.
dt_mod = mod.best_estimator_.named_steps["dt"]
feature_names = mod.best_estimator_.named_steps["preprocessor"].get_feature_names_out()

# Draw the selected tree on an explicit axes so it can be titled.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(dt_mod, filled=True, fontsize=8, feature_names=feature_names, ax=ax)
ax.set_title("Decision tree selected by grid search")
plt.show()

In [None]:
# Evaluate on the held-out test set with a confusion matrix.
# NOTE(review): relies on `mod` being a fitted estimator exposing `predict`
# and `classes_`.
class_labels = mod.classes_
y_pred = mod.predict(X_test)

# Fix the row/column order to the model's class order so that the matrix and
# its tick labels agree.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()