Scikit-learn Common Patterns

Common patterns and workflows for scikit-learn: preprocessing, model training, evaluation, and pipelines.

Installation

pip install scikit-learn numpy pandas matplotlib

Basic Workflow

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Load data
X, y = load_data()  # Features and target

# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Preprocess
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# 5. Predict
y_pred = model.predict(X_test_scaled)

# 6. Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))

Data Preprocessing

Scaling and Normalization

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    Normalizer, MaxAbsScaler
)

# StandardScaler: mean=0, std=1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# MinMaxScaler: scale to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)

# MinMaxScaler: custom range
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X_train)

# RobustScaler: robust to outliers (uses median and IQR)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_train)

# Normalizer: scale samples to unit norm
normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit_transform(X_train)

# MaxAbsScaler: scale by maximum absolute value
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_train)

Encoding Categorical Variables

from sklearn.preprocessing import (
    LabelEncoder, OneHotEncoder, OrdinalEncoder
)

# LabelEncoder: for target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # ['cat', 'dog'] -> [0, 1]
y_decoded = le.inverse_transform(y_encoded)

# OneHotEncoder: for features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[['category']])

# OrdinalEncoder: for ordinal features
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
X_encoded = encoder.fit_transform(X[['priority']])

# pd.get_dummies (pandas alternative)
import pandas as pd
X_encoded = pd.get_dummies(X, columns=['category'], drop_first=True)
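
Two details that often matter in practice: keeping track of the generated column names, and tolerating categories that only appear at prediction time. A minimal sketch, assuming X is a pandas DataFrame with the 'category' column used above:

# Recover readable column names and ignore unseen categories at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[['category']])
feature_names = encoder.get_feature_names_out(['category'])
encoded_df = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)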

Handling Missing Values

from sklearn.impute import SimpleImputer, KNNImputer

# SimpleImputer: mean, median, most_frequent, constant
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)

# KNNImputer: impute using k-nearest neighbors
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
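
When the fact that a value was missing is itself informative, the imputer can append binary missing-value flags to its output. A short sketch using SimpleImputer's add_indicator option:

# add_indicator=True appends one 0/1 column per feature that had missing
# values, so a downstream model can learn from "was missing" directly
imputer = SimpleImputer(strategy='median', add_indicator=True)
X_imputed = imputer.fit_transform(X)  # imputed features + indicator columns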

Feature Engineering

from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Power transformation (make data more Gaussian)
pt = PowerTransformer(method='yeo-johnson')
X_transformed = pt.fit_transform(X)

# Box-Cox (requires positive data)
pt = PowerTransformer(method='box-cox')
X_transformed = pt.fit_transform(X)
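
For simple custom transformations, such as log-scaling skewed features, FunctionTransformer wraps an arbitrary function so it can be used like any other transformer (including inside a pipeline). A minimal sketch, assuming non-negative numeric features:

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Wrap np.log1p so the log transform behaves like a fitted transformer
log_transformer = FunctionTransformer(np.log1p)
X_logged = log_transformer.fit_transform(X)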

Feature Selection

from sklearn.feature_selection import (
    SelectKBest, SelectPercentile, RFE, RFECV,
    SelectFromModel, VarianceThreshold, f_classif, mutual_info_classif
)

# SelectKBest: select top k features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# SelectPercentile: select top percentile
selector = SelectPercentile(score_func=mutual_info_classif, percentile=20)
X_selected = selector.fit_transform(X, y)

# RFE: Recursive Feature Elimination
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier(n_estimators=100)
selector = RFE(estimator, n_features_to_select=10)
X_selected = selector.fit_transform(X, y)

# RFECV: RFE with cross-validation
selector = RFECV(estimator, step=1, cv=5)
X_selected = selector.fit_transform(X, y)
print(f"Optimal features: {selector.n_features_}")

# SelectFromModel: based on feature importances
selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
X_selected = selector.fit_transform(X, y)

# VarianceThreshold: remove low-variance features
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)

Model Training

Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Decision Tree
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Gradient Boosting
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SVM
model = SVC(kernel='rbf', C=1.0, random_state=42)
model.fit(X_train, y_train)

# K-Nearest Neighbors
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Naive Bayes
model = GaussianNB()
model.fit(X_train, y_train)
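
A quick way to compare several of these classifiers on the same data is to loop over them with cross_val_score. A short sketch, assuming the X_train and y_train used above:

from sklearn.model_selection import cross_val_score

models = {
    'logreg': LogisticRegression(max_iter=1000, random_state=42),
    'random_forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=5),
}

for name, clf in models.items():
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")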

Regression

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Ridge Regression (L2 regularization)
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Lasso Regression (L1 regularization)
model = Lasso(alpha=1.0)
model.fit(X_train, y_train)

# ElasticNet (L1 + L2)
model = ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(X_train, y_train)

# Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SVR
model = SVR(kernel='rbf', C=1.0)
model.fit(X_train, y_train)

Model Evaluation

Classification Metrics

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score
)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # For binary classification

# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification report
print(classification_report(y_test, y_pred))

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {auc:.3f}")

# Plot ROC curve
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)
plt.plot(recall, precision, label=f'AP = {ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
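
The precision, recall, and F1 calls above use the binary defaults; for multiclass targets an averaging strategy has to be chosen explicitly, and ROC-AUC needs per-class probabilities. A short sketch:

# Multiclass averaging (macro = unweighted mean over classes)
print(f"Macro F1: {f1_score(y_test, y_pred, average='macro'):.3f}")
print(f"Weighted F1: {f1_score(y_test, y_pred, average='weighted'):.3f}")

# Multiclass ROC-AUC, one-vs-rest
y_proba = model.predict_proba(X_test)
print(f"ROC-AUC (OvR): {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")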

Regression Metrics

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)

y_pred = model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R²: {r2:.3f}")
print(f"MAPE: {mape:.3f}")

# Plot predictions vs actual
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Predictions vs Actual')
plt.show()

Cross-Validation

from sklearn.model_selection import (
    cross_val_score, cross_validate, KFold, StratifiedKFold,
    TimeSeriesSplit, LeaveOneOut
)

# Simple cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")

# Multiple metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric in scoring:
    print(f"{metric}: {scores[f'test_{metric}'].mean():.3f}")

# K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf)

# Stratified K-Fold (preserves class distribution)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)

# Time Series Split (no shuffle)
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=tscv)

# Leave-One-Out
loo = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=loo)
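
When per-fold control is needed (custom metrics, fold-level logging), the splitters can also be iterated by hand. A minimal sketch, assuming X and y are NumPy arrays (use .iloc indexing for DataFrames):

import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for train_idx, val_idx in kf.split(X):
    # Fit on this fold's training rows, score on its validation rows
    model.fit(X[train_idx], y[train_idx])
    fold_scores.append(model.score(X[val_idx], y[val_idx]))
print(f"Manual CV: {np.mean(fold_scores):.3f}")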

Hyperparameter Tuning

Grid Search

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Randomized Search

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.9)
}

# Random search
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

Pipelines

Basic Pipeline

from sklearn.pipeline import Pipeline

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Cross-validation on pipeline
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"CV Accuracy: {scores.mean():.3f}")

Pipeline with Feature Selection

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(k=10)),
    ('classifier', LogisticRegression())
])

pipeline.fit(X_train, y_train)

Column Transformer

from sklearn.compose import ColumnTransformer

# Define transformers for different column types
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
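
Two options worth knowing: remainder='passthrough' keeps any columns not listed in a transformer (by default they are dropped), and handle_unknown='ignore' stops OneHotEncoder from raising on categories seen only at predict time. A short sketch:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # pass remaining columns through unchanged
)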

Pipeline with Grid Search

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Parameters of pipeline steps are addressed as <step_name>__<parameter>
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Model Persistence

import joblib
import pickle

# Save model with joblib (recommended)
joblib.dump(model, 'model.joblib')
loaded_model = joblib.load('model.joblib')

# Save model with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Save pipeline
joblib.dump(pipeline, 'pipeline.joblib')
loaded_pipeline = joblib.load('pipeline.joblib')
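
Pickled estimators are only guaranteed to load correctly under the scikit-learn version they were saved with, so it is worth storing the version alongside the artifact. A minimal sketch of that pattern:

import sklearn
import joblib

# Bundle the model with the library version used to train it
joblib.dump({'model': model, 'sklearn_version': sklearn.__version__},
            'model_bundle.joblib')

bundle = joblib.load('model_bundle.joblib')
if bundle['sklearn_version'] != sklearn.__version__:
    print("Warning: model was trained with a different scikit-learn version")
loaded_model = bundle['model']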

Common Patterns

Handling Imbalanced Data

import numpy as np
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE          # pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler

# Class weights
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
model = RandomForestClassifier(class_weight='balanced')

# SMOTE (oversampling)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
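
Resampling before cross-validation leaks information into the validation folds, so when tuning with CV it is safer to put SMOTE inside an imbalanced-learn pipeline, which resamples only each fold's training portion. A minimal sketch:

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# SMOTE runs inside each fold, on that fold's training data only
imb_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000))
])
scores = cross_val_score(imb_pipeline, X_train, y_train, cv=5, scoring='f1')
print(f"CV F1 with SMOTE: {scores.mean():.3f}")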

Learning Curves

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, val_scores = learning_curve(
    model, X, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10)
)

plt.plot(train_sizes, train_scores.mean(axis=1), label='Training score')
plt.plot(train_sizes, val_scores.mean(axis=1), label='Validation score')
plt.xlabel('Training Set Size')
plt.ylabel('Score')
plt.legend()
plt.show()
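
A closely related tool is validation_curve, which varies a single hyperparameter instead of the training-set size. A short sketch, assuming model is a tree-based estimator that exposes a max_depth parameter:

from sklearn.model_selection import validation_curve

param_range = [1, 3, 5, 7, 10]
train_scores, val_scores = validation_curve(
    model, X, y,
    param_name='max_depth', param_range=param_range,
    cv=5, n_jobs=-1
)
plt.plot(param_range, train_scores.mean(axis=1), label='Training score')
plt.plot(param_range, val_scores.mean(axis=1), label='Validation score')
plt.xlabel('max_depth')
plt.ylabel('Score')
plt.legend()
plt.show()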

Feature Importances

import numpy as np
import matplotlib.pyplot as plt

# For tree-based models (assumes X is a pandas DataFrame)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.tight_layout()
plt.show()
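
Impurity-based importances only exist for tree models and can overstate high-cardinality features; permutation importance works for any fitted estimator and is measured on held-out data. A short sketch, assuming the fitted model and DataFrame X from above:

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the drop in score
result = permutation_importance(
    model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)
for idx in result.importances_mean.argsort()[::-1][:10]:
    print(f"{X.columns[idx]}: {result.importances_mean[idx]:.3f} "
          f"+/- {result.importances_std[idx]:.3f}")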
