Scikit-learn Common Patterns

Common patterns and workflows for scikit-learn: preprocessing, model training, evaluation, and pipelines.

Installation

pip install scikit-learn numpy pandas matplotlib

Basic Workflow

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Load data
X, y = load_data()  # Features and target

# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Preprocess
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# 5. Predict
y_pred = model.predict(X_test_scaled)

# 6. Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))

Data Preprocessing

Scaling and Normalization

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    Normalizer, MaxAbsScaler
)

# StandardScaler: mean=0, std=1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# MinMaxScaler: scale to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)

# MinMaxScaler: custom range
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X_train)

# RobustScaler: robust to outliers (uses median and IQR)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_train)

# Normalizer: scale samples to unit norm
normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit_transform(X_train)

# MaxAbsScaler: scale by maximum absolute value
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_train)

Encoding Categorical Variables

from sklearn.preprocessing import (
    LabelEncoder, OneHotEncoder, OrdinalEncoder
)

# LabelEncoder: for target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # ['cat', 'dog'] -> [0, 1]
y_decoded = le.inverse_transform(y_encoded)

# OneHotEncoder: for features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[['category']])

# OrdinalEncoder: for ordinal features
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
X_encoded = encoder.fit_transform(X[['priority']])

# pd.get_dummies (pandas alternative)
import pandas as pd
X_encoded = pd.get_dummies(X, columns=['category'], drop_first=True)
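
Two details that often matter in practice: keeping track of the generated column names, and tolerating categories that only appear at prediction time. A minimal sketch, assuming X is a pandas DataFrame with the 'category' column used above:

# Recover readable column names and ignore unseen categories at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[['category']])
feature_names = encoder.get_feature_names_out(['category'])
encoded_df = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)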

Handling Missing Values

from sklearn.impute import SimpleImputer, KNNImputer

# SimpleImputer: mean, median, most_frequent, constant
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)

# KNNImputer: impute using k-nearest neighbors
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
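
When the fact that a value was missing is itself informative, the imputer can append binary missing-value flags to its output. A short sketch using SimpleImputer's add_indicator option:

# add_indicator=True appends one 0/1 column per feature that had missing
# values, so a downstream model can learn from "was missing" directly
imputer = SimpleImputer(strategy='median', add_indicator=True)
X_imputed = imputer.fit_transform(X)  # imputed features + indicator columns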

Feature Engineering

from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Power transformation (make data more Gaussian)
pt = PowerTransformer(method='yeo-johnson')
X_transformed = pt.fit_transform(X)

# Box-Cox (requires positive data)
pt = PowerTransformer(method='box-cox')
X_transformed = pt.fit_transform(X)
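
For simple custom transformations, such as log-scaling skewed features, FunctionTransformer wraps an arbitrary function so it can be used like any other transformer (including inside a pipeline). A minimal sketch, assuming non-negative numeric features:

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Wrap np.log1p so the log transform behaves like a fitted transformer
log_transformer = FunctionTransformer(np.log1p)
X_logged = log_transformer.fit_transform(X)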

Feature Selection

from sklearn.feature_selection import (
    SelectKBest, SelectPercentile, RFE, RFECV,
    SelectFromModel, VarianceThreshold, f_classif, mutual_info_classif
)

# SelectKBest: select top k features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# SelectPercentile: select top percentile
selector = SelectPercentile(score_func=mutual_info_classif, percentile=20)
X_selected = selector.fit_transform(X, y)

# RFE: Recursive Feature Elimination
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier(n_estimators=100)
selector = RFE(estimator, n_features_to_select=10)
X_selected = selector.fit_transform(X, y)

# RFECV: RFE with cross-validation
selector = RFECV(estimator, step=1, cv=5)
X_selected = selector.fit_transform(X, y)
print(f"Optimal features: {selector.n_features_}")

# SelectFromModel: based on feature importances
selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
X_selected = selector.fit_transform(X, y)

# VarianceThreshold: remove low-variance features
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)

Model Training

Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Decision Tree
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Gradient Boosting
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SVM
model = SVC(kernel='rbf', C=1.0, random_state=42)
model.fit(X_train, y_train)

# K-Nearest Neighbors
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Naive Bayes
model = GaussianNB()
model.fit(X_train, y_train)
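
A quick way to compare several of these classifiers on the same data is to loop over them with cross_val_score. A short sketch, assuming the X_train and y_train used above:

from sklearn.model_selection import cross_val_score

models = {
    'logreg': LogisticRegression(max_iter=1000, random_state=42),
    'random_forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=5),
}

for name, clf in models.items():
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")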

Regression

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Ridge Regression (L2 regularization)
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Lasso Regression (L1 regularization)
model = Lasso(alpha=1.0)
model.fit(X_train, y_train)

# ElasticNet (L1 + L2)
model = ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(X_train, y_train)

# Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SVR
model = SVR(kernel='rbf', C=1.0)
model.fit(X_train, y_train)

Model Evaluation

Classification Metrics

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score
)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # For binary classification

# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification report
print(classification_report(y_test, y_pred))

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {auc:.3f}")

# Plot ROC curve
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)
plt.plot(recall, precision, label=f'AP = {ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
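
The precision, recall, and F1 calls above use the binary defaults; for multiclass targets an averaging strategy has to be chosen explicitly, and ROC-AUC needs per-class probabilities. A short sketch:

# Multiclass averaging (macro = unweighted mean over classes)
print(f"Macro F1: {f1_score(y_test, y_pred, average='macro'):.3f}")
print(f"Weighted F1: {f1_score(y_test, y_pred, average='weighted'):.3f}")

# Multiclass ROC-AUC, one-vs-rest
y_proba = model.predict_proba(X_test)
print(f"ROC-AUC (OvR): {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")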

Regression Metrics

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)

y_pred = model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R²: {r2:.3f}")
print(f"MAPE: {mape:.3f}")

# Plot predictions vs actual
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Predictions vs Actual')
plt.show()

Cross-Validation

from sklearn.model_selection import (
    cross_val_score, cross_validate, KFold, StratifiedKFold,
    TimeSeriesSplit, LeaveOneOut
)

# Simple cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")

# Multiple metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric in scoring:
    print(f"{metric}: {scores[f'test_{metric}'].mean():.3f}")

# K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf)

# Stratified K-Fold (preserves class distribution)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)

# Time Series Split (no shuffle)
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=tscv)

# Leave-One-Out
loo = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=loo)
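
When per-fold control is needed (custom metrics, fold-level logging), the splitters can also be iterated by hand. A minimal sketch, assuming X and y are NumPy arrays (use .iloc indexing for DataFrames):

import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for train_idx, val_idx in kf.split(X):
    # Fit on this fold's training rows, score on its validation rows
    model.fit(X[train_idx], y[train_idx])
    fold_scores.append(model.score(X[val_idx], y[val_idx]))
print(f"Manual CV: {np.mean(fold_scores):.3f}")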

Hyperparameter Tuning

Grid Search

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Randomized Search

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.9)
}

# Random search
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

Pipelines

Basic Pipeline

from sklearn.pipeline import Pipeline

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Cross-validation on pipeline
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"CV Accuracy: {scores.mean():.3f}")

Pipeline with Feature Selection

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(k=10)),
    ('classifier', LogisticRegression())
])

pipeline.fit(X_train, y_train)

Column Transformer

from sklearn.compose import ColumnTransformer

# Define transformers for different column types
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
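
Two options worth knowing: remainder='passthrough' keeps any columns not listed in a transformer (by default they are dropped), and handle_unknown='ignore' stops OneHotEncoder from raising on categories seen only at predict time. A short sketch:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # pass remaining columns through unchanged
)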

Pipeline with Grid Search

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Parameters of pipeline steps are addressed as <step_name>__<parameter>
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Model Persistence

import joblib
import pickle

# Save model with joblib (recommended)
joblib.dump(model, 'model.joblib')
loaded_model = joblib.load('model.joblib')

# Save model with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Save pipeline
joblib.dump(pipeline, 'pipeline.joblib')
loaded_pipeline = joblib.load('pipeline.joblib')
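
Pickled estimators are only guaranteed to load correctly under the scikit-learn version they were saved with, so it is worth storing the version alongside the artifact. A minimal sketch of that pattern:

import sklearn
import joblib

# Bundle the model with the library version used to train it
joblib.dump({'model': model, 'sklearn_version': sklearn.__version__},
            'model_bundle.joblib')

bundle = joblib.load('model_bundle.joblib')
if bundle['sklearn_version'] != sklearn.__version__:
    print("Warning: model was trained with a different scikit-learn version")
loaded_model = bundle['model']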

Common Patterns

Handling Imbalanced Data

import numpy as np
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE          # pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler

# Class weights
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
model = RandomForestClassifier(class_weight='balanced')

# SMOTE (oversampling)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
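
Resampling before cross-validation leaks information into the validation folds, so when tuning with CV it is safer to put SMOTE inside an imbalanced-learn pipeline, which resamples only each fold's training portion. A minimal sketch:

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# SMOTE runs inside each fold, on that fold's training data only
imb_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000))
])
scores = cross_val_score(imb_pipeline, X_train, y_train, cv=5, scoring='f1')
print(f"CV F1 with SMOTE: {scores.mean():.3f}")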

Learning Curves

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, val_scores = learning_curve(
    model, X, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10)
)

plt.plot(train_sizes, train_scores.mean(axis=1), label='Training score')
plt.plot(train_sizes, val_scores.mean(axis=1), label='Validation score')
plt.xlabel('Training Set Size')
plt.ylabel('Score')
plt.legend()
plt.show()
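
A closely related tool is validation_curve, which varies a single hyperparameter instead of the training-set size. A short sketch, assuming model is a tree-based estimator that exposes a max_depth parameter:

from sklearn.model_selection import validation_curve

param_range = [1, 3, 5, 7, 10]
train_scores, val_scores = validation_curve(
    model, X, y,
    param_name='max_depth', param_range=param_range,
    cv=5, n_jobs=-1
)
plt.plot(param_range, train_scores.mean(axis=1), label='Training score')
plt.plot(param_range, val_scores.mean(axis=1), label='Validation score')
plt.xlabel('max_depth')
plt.ylabel('Score')
plt.legend()
plt.show()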

Feature Importances

import numpy as np
import matplotlib.pyplot as plt

# For tree-based models (assumes X is a pandas DataFrame)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.tight_layout()
plt.show()
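
Impurity-based importances only exist for tree models and can overstate high-cardinality features; permutation importance works for any fitted estimator and is measured on held-out data. A short sketch, assuming the fitted model and DataFrame X from above:

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the drop in score
result = permutation_importance(
    model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)
for idx in result.importances_mean.argsort()[::-1][:10]:
    print(f"{X.columns[idx]}: {result.importances_mean[idx]:.3f} "
          f"+/- {result.importances_std[idx]:.3f}")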
