# Machine Learning - Classification
## Quick start (Logistic Regression)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
# Load the breast-cancer dataset as a feature matrix and a label vector.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% for testing; stratifying on y keeps the class ratio the same
# in both splits, and the fixed seed makes the split reproducible.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The scaler is a pipeline step, so it is fitted on the training split only
# and then re-applied unchanged whenever the pipeline predicts.
scaler = StandardScaler()
clf = LogisticRegression(max_iter=1000)
pipe = make_pipeline(scaler, clf)
pipe.fit(Xtr, ytr)

# Column 1 of predict_proba is the probability of the second class in
# `classes_` (label 1 for this dataset); ROC AUC is computed from that score.
proba = pipe.predict_proba(Xte)[:, 1]
auc = roc_auc_score(yte, proba)
print({"roc_auc": auc})

# Per-class precision / recall / F1 using the pipeline's default predictions.
yhat = pipe.predict(Xte)
print(classification_report(yte, yhat))
## Class imbalance & thresholding
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict

# "balanced" weights are n_samples / (n_classes * count(class)), so the rarer
# class contributes proportionally more to the loss.
classes = np.unique(ytr)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=ytr)
class_weight = dict(zip(classes, weights))
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=2000, class_weight=class_weight),
)

# Choose the decision threshold on out-of-fold predictions from the TRAINING
# split. Sweeping thresholds against the test labels would leak test
# information into model selection and make the reported F1 optimistic.
oof_proba = cross_val_predict(pipe, Xtr, ytr, cv=5, method="predict_proba")[:, 1]
best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.1, 0.9, 17):
    f1 = f1_score(ytr, (oof_proba >= t).astype(int))
    # Strict ">" keeps the lowest threshold when several tie on F1.
    if f1 > best_f1:
        best_f1, best_t = f1, t

# Refit on the whole training split, then score the held-out test set exactly
# once at the threshold chosen above.
pipe.fit(Xtr, ytr)
proba = pipe.predict_proba(Xte)[:, 1]
preds = (proba >= best_t).astype(int)
test_f1 = f1_score(yte, preds)

# "best_f1" is the out-of-fold training F1 at the chosen threshold;
# "test_f1" is the unbiased estimate on the held-out data.
print({"best_threshold": best_t, "best_f1": best_f1, "test_f1": test_f1})
## Calibration
from sklearn.calibration import CalibratedClassifierCV
# Calibrate a logistic-regression classifier with 5-fold cross-validation:
# each fold fits a clone of the estimator and a calibrator on held-out data,
# and predictions average over the per-fold calibrated models.
base = LogisticRegression(max_iter=2000)

# Scale inside the calibrated estimator, as the other sections do. This way
# every CV fold fits its own scaler (no leakage across folds) and the solver
# sees standardized features, which it needs to converge reliably.
scaled_base = make_pipeline(StandardScaler(), base)

# NOTE(review): scikit-learn advises against isotonic calibration with few
# calibration samples (well under ~1000) because it tends to overfit;
# method="sigmoid" may be the safer choice for a training split this small.
cal = CalibratedClassifierCV(scaled_base, cv=5, method="isotonic")
cal.fit(Xtr, ytr)

# Calibrated probability of the second class in `classes_` for the test split.
prob = cal.predict_proba(Xte)[:, 1]