Scikit-learn Common Patterns
Common patterns and workflows for scikit-learn: preprocessing, model training, evaluation, and pipelines.
Installation
pip install scikit-learn numpy pandas matplotlib
Basic Workflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Load data
X, y = load_data()  # Features and target

# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Preprocess
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# 5. Predict
y_pred = model.predict(X_test_scaled)

# 6. Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))
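If no real dataset is at hand, a synthetic one can stand in for the load_data() placeholder above; a minimal sketch using make_classification (the sample and feature counts are arbitrary):

from sklearn.datasets import make_classification

# Hypothetical stand-in for load_data(): 1,000 samples, 20 features, binary target
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           random_state=42)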
Data Preprocessing
Scaling and Normalization
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    Normalizer, MaxAbsScaler
)

# StandardScaler: mean=0, std=1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# MinMaxScaler: scale to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)

# MinMaxScaler: custom range
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X_train)

# RobustScaler: robust to outliers (uses median and IQR)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_train)

# Normalizer: scale samples to unit norm
normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit_transform(X_train)

# MaxAbsScaler: scale by maximum absolute value
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_train)
Encoding Categorical Variables
from sklearn.preprocessing import (
    LabelEncoder, OneHotEncoder, OrdinalEncoder
)

# LabelEncoder: for target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # ['cat', 'dog'] -> [0, 1]
y_decoded = le.inverse_transform(y_encoded)

# OneHotEncoder: for features
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[['category']])

# OrdinalEncoder: for ordinal features
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
X_encoded = encoder.fit_transform(X[['priority']])

# pd.get_dummies (pandas alternative)
import pandas as pd
X_encoded = pd.get_dummies(X, columns=['category'], drop_first=True)
Handling Missing Values
from sklearn.impute import SimpleImputer, KNNImputer

# SimpleImputer: mean, median, most_frequent, constant
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)

# KNNImputer: impute using k-nearest neighbors
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
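scikit-learn also ships a multivariate imputer, IterativeImputer, which models each feature with missing values as a function of the other features; it is still flagged experimental and needs an explicit enabling import. A brief sketch:

# IterativeImputer: model each feature from the others (experimental API)
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=42)
X_imputed = imputer.fit_transform(X)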
Feature Engineering
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Power transformation (make data more Gaussian)
pt = PowerTransformer(method='yeo-johnson')
X_transformed = pt.fit_transform(X)

# Box-Cox (requires strictly positive data)
pt = PowerTransformer(method='box-cox')
X_transformed = pt.fit_transform(X)
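For ad-hoc transformations such as a log transform, FunctionTransformer wraps any callable so it can also be used as a pipeline step; a minimal sketch:

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Log-transform skewed, non-negative features (log1p handles zeros)
log_transformer = FunctionTransformer(np.log1p)
X_logged = log_transformer.fit_transform(X)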
Feature Selection
from sklearn.feature_selection import (
    SelectKBest, SelectPercentile, RFE, RFECV,
    SelectFromModel, VarianceThreshold, f_classif, mutual_info_classif
)

# SelectKBest: select top k features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# SelectPercentile: select top percentile
selector = SelectPercentile(score_func=mutual_info_classif, percentile=20)
X_selected = selector.fit_transform(X, y)

# RFE: Recursive Feature Elimination
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier(n_estimators=100)
selector = RFE(estimator, n_features_to_select=10)
X_selected = selector.fit_transform(X, y)

# RFECV: RFE with cross-validation
selector = RFECV(estimator, step=1, cv=5)
X_selected = selector.fit_transform(X, y)
print(f"Optimal features: {selector.n_features_}")

# SelectFromModel: based on feature importances
selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
X_selected = selector.fit_transform(X, y)

# VarianceThreshold: remove low-variance features
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)
Model Training
Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Decision Tree
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Gradient Boosting
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SVM
model = SVC(kernel='rbf', C=1.0, random_state=42)
model.fit(X_train, y_train)

# K-Nearest Neighbors
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Naive Bayes
model = GaussianNB()
model.fit(X_train, y_train)
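Several of these classifiers can also be combined into a single ensemble with VotingClassifier; a sketch with an arbitrary choice of members (soft voting averages predicted probabilities, so every member must support predict_proba):

from sklearn.ensemble import VotingClassifier

model = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('nb', GaussianNB())
    ],
    voting='soft'
)
model.fit(X_train, y_train)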
Regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Ridge Regression (L2 regularization)
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Lasso Regression (L1 regularization)
model = Lasso(alpha=1.0)
model.fit(X_train, y_train)

# ElasticNet (L1 + L2)
model = ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(X_train, y_train)

# Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SVR
model = SVR(kernel='rbf', C=1.0)
model.fit(X_train, y_train)
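On larger tabular datasets, the histogram-based HistGradientBoostingRegressor is usually much faster than GradientBoostingRegressor and handles missing values natively; a brief sketch:

from sklearn.ensemble import HistGradientBoostingRegressor

model = HistGradientBoostingRegressor(max_iter=100, random_state=42)
model.fit(X_train, y_train)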
Model Evaluation
Classification Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score
)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # For binary classification

# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification report
print(classification_report(y_test, y_pred))

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {auc:.3f}")

# Plot ROC curve
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)
plt.plot(recall, precision, label=f'AP = {ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
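Recent scikit-learn versions also provide display helpers that build these plots directly from predictions or a fitted estimator, which saves the manual matplotlib calls; a short sketch:

from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.show()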
Regression Metrics
import numpy as np
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)

y_pred = model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R²: {r2:.3f}")
print(f"MAPE: {mape:.3f}")

# Plot predictions vs actual
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Predictions vs Actual')
plt.show()
Cross-Validation
from sklearn.model_selection import (
    cross_val_score, cross_validate, KFold, StratifiedKFold,
    TimeSeriesSplit, LeaveOneOut
)

# Simple cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")

# Multiple metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric in scoring:
    print(f"{metric}: {scores[f'test_{metric}'].mean():.3f}")

# K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf)

# Stratified K-Fold (preserves class distribution)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)

# Time Series Split (no shuffle)
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=tscv)

# Leave-One-Out
loo = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=loo)
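cross_val_predict returns an out-of-fold prediction for every sample, which is handy for building a confusion matrix or residual plot from cross-validated output; a minimal sketch, reusing confusion_matrix imported earlier:

from sklearn.model_selection import cross_val_predict

# Each prediction comes from the fold in which that sample was held out
y_pred_cv = cross_val_predict(model, X, y, cv=5)
print(confusion_matrix(y, y_pred_cv))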
Hyperparameter Tuning
Grid Search
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
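The full search results live in cv_results_, which is convenient to inspect as a DataFrame; a short sketch:

import pandas as pd

results = pd.DataFrame(grid_search.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score')
      .head())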
Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.9)
}

# Random search
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)
Pipelines
Basic Pipeline
from sklearn.pipeline import Pipeline

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Cross-validation on pipeline
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"CV Accuracy: {scores.mean():.3f}")
Pipeline with Feature Selection
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(k=10)),
    ('classifier', LogisticRegression())
])

pipeline.fit(X_train, y_train)
Column Transformer
from sklearn.compose import ColumnTransformer

# Define transformers for different column types
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
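By default, columns not listed in any transformer are dropped; remainder='passthrough' keeps them, and get_feature_names_out() reports the column names after transformation. A brief sketch:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ],
    remainder='passthrough'  # keep unlisted columns unchanged
)
preprocessor.fit(X_train)
print(preprocessor.get_feature_names_out())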
Pipeline with Grid Search
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
Model Persistence
import joblib
import pickle

# Save model with joblib (recommended)
joblib.dump(model, 'model.joblib')
loaded_model = joblib.load('model.joblib')

# Save model with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Save pipeline
joblib.dump(pipeline, 'pipeline.joblib')
loaded_pipeline = joblib.load('pipeline.joblib')
Common Patterns
Handling Imbalanced Data
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Class weights: compute per-class weights explicitly...
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
# ...or simply let the estimator compute them with class_weight='balanced'
model = RandomForestClassifier(class_weight='balanced')

# SMOTE (oversampling)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
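Resampling should only ever be applied to the training folds; the imbalanced-learn pipeline takes care of this during cross-validation, unlike a plain scikit-learn Pipeline. A sketch:

from imblearn.pipeline import Pipeline as ImbPipeline

pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
print(f"CV F1: {scores.mean():.3f}")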
Learning Curves
from sklearn.model_selection import learning_curve

train_sizes, train_scores, val_scores = learning_curve(
    model, X, y, cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10)
)

plt.plot(train_sizes, train_scores.mean(axis=1), label='Training score')
plt.plot(train_sizes, val_scores.mean(axis=1), label='Validation score')
plt.xlabel('Training Set Size')
plt.ylabel('Score')
plt.legend()
plt.show()
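validation_curve is the analogous helper for sweeping a single hyperparameter instead of the training-set size; a sketch varying max_depth (the parameter name assumes a tree-based estimator such as the random forest used above):

from sklearn.model_selection import validation_curve

param_range = [2, 4, 6, 8, 10]
train_scores, val_scores = validation_curve(
    model, X, y, param_name='max_depth', param_range=param_range, cv=5
)

plt.plot(param_range, train_scores.mean(axis=1), label='Training score')
plt.plot(param_range, val_scores.mean(axis=1), label='Validation score')
plt.xlabel('max_depth')
plt.ylabel('Score')
plt.legend()
plt.show()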
Feature Importances
# For tree-based models
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.tight_layout()
plt.show()
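Impurity-based importances can be biased toward high-cardinality features; permutation_importance instead measures the drop in score when each feature is shuffled on held-out data and works with any fitted estimator. A sketch, assuming X is a DataFrame as above:

from sklearn.inspection import permutation_importance

result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
for i in result.importances_mean.argsort()[::-1]:
    print(f"{X.columns[i]}: {result.importances_mean[i]:.3f} +/- {result.importances_std[i]:.3f}")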