Machine Learning - Model Selection & Validation
Pipeline + GridSearchCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# Column groups routed to the two preprocessing branches below.
# NOTE(review): X_train / y_train are not defined in this snippet; presumably
# X_train is a DataFrame containing these four columns - confirm with caller.
num_cols = ["age", "income"]
cat_cols = ["country", "segment"]

# Numeric branch: impute with SimpleImputer's defaults, then standardise.
numeric_branch = Pipeline(
    steps=[
        ("imp", SimpleImputer()),
        ("sc", StandardScaler()),
    ]
)

# Categorical branch: impute with the most frequent value, then one-hot
# encode with handle_unknown="ignore".
categorical_branch = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ]
)

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    ]
)

# Preprocessing lives inside the pipeline, so it is re-fitted together with
# the classifier on every fit call issued by the search.
pipe = Pipeline(
    steps=[
        ("pre", pre),
        ("clf", LogisticRegression(max_iter=2000)),
    ]
)

# Keys use the "<step>__<param>" form to address the "clf" step.
param_grid = {
    "clf__C": [0.1, 1.0, 10.0],
    "clf__penalty": ["l2"],
}

# Shuffled 5-fold stratified splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
)
search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print(search.best_params_, search.best_score_)
RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
# Sample the "clf" step's C from loguniform(0.001, 100.0) rather than
# enumerating a fixed grid.
param_dist = {
    "clf__C": loguniform(0.001, 100.0),
}

# Reuses the `pipe` estimator and `cv` splitter defined earlier in the file;
# random_state=42 fixes the search's own seed.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=25,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42,
)
rs.fit(X_train, y_train)

# Report the best sampled parameters and their cross-validated score.
print(rs.best_params_, rs.best_score_)