Pandas DataFrames Essential Patterns
Essential patterns for working with Pandas DataFrames: creation, manipulation, filtering, aggregation, and transformation.
Installation
pip install pandas numpy
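As a quick orientation, here is a minimal end-to-end sketch touching several of the patterns covered below (the names, ages, and cities are made-up sample values):

import pandas as pd

# Tiny illustrative dataset (hypothetical values)
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Dana'],
    'age': [25, 30, 35, 40],
    'city': ['NYC', 'LA', 'NYC', 'Chicago']
})

# Filter, derive a column, then aggregate per city
adults = df[df['age'] >= 30]
adults = adults.assign(decade=(adults['age'] // 10) * 10)
print(adults.groupby('city')['age'].mean())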
Creating DataFrames
From Dictionary
import pandas as pd
import numpy as np

# From dict of lists
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['NYC', 'LA', 'Chicago']
})

# From list of dicts
data = [
    {'name': 'Alice', 'age': 25, 'city': 'NYC'},
    {'name': 'Bob', 'age': 30, 'city': 'LA'},
    {'name': 'Charlie', 'age': 35, 'city': 'Chicago'}
]
df = pd.DataFrame(data)

# From dict of Series
df = pd.DataFrame({
    'A': pd.Series([1, 2, 3]),
    'B': pd.Series([4, 5, 6])
})
From Files
# CSV
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', sep=';', encoding='utf-8')
df = pd.read_csv('data.csv', parse_dates=['date_column'])

# Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# JSON
df = pd.read_json('data.json')
df = pd.read_json('data.json', orient='records')

# SQL
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql_query('SELECT * FROM table_name', conn)

# Parquet
df = pd.read_parquet('data.parquet')

# From clipboard
df = pd.read_clipboard()
Generate Sample Data
# Random data
df = pd.DataFrame({
    'A': np.random.randn(100),
    'B': np.random.randint(0, 100, 100),
    'C': np.random.choice(['X', 'Y', 'Z'], 100)
})

# Date range
dates = pd.date_range('2024-01-01', periods=100, freq='D')
df = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(100)
})

# From NumPy array
arr = np.random.randn(5, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
Inspecting DataFrames
# Basic info
df.head()            # First 5 rows
df.head(10)          # First 10 rows
df.tail()            # Last 5 rows
df.sample(5)         # Random 5 rows
df.shape             # (rows, columns)
df.columns           # Column names
df.dtypes            # Data types
df.info()            # Summary info
df.describe()        # Statistical summary
df.memory_usage()    # Memory usage

# Quick stats
df['column'].value_counts()
df['column'].nunique()
df['column'].unique()
df.isnull().sum()       # Count nulls per column
df.duplicated().sum()   # Count duplicates
Selecting Data
Column Selection
# Single column (returns Series)
df['name']
df.name   # Attribute access (only if the name is a valid identifier and doesn't clash with a DataFrame attribute)

# Multiple columns (returns DataFrame)
df[['name', 'age']]

# Select by dtype
df.select_dtypes(include=['int64', 'float64'])
df.select_dtypes(exclude=['object'])

# Column slicing
df.iloc[:, 0:3]   # First 3 columns
Row Selection
# By position
df.iloc[0]           # First row
df.iloc[0:5]         # First 5 rows
df.iloc[[0, 2, 4]]   # Specific rows

# By label
df.loc[0]            # Row with index label 0
df.loc[0:5]          # Rows 0 through 5 (inclusive)
df.loc[[0, 2, 4]]    # Specific rows

# Boolean indexing
df[df['age'] > 30]
df[df['city'] == 'NYC']
df[(df['age'] > 25) & (df['city'] == 'NYC')]
df[df['age'].between(25, 35)]
df[df['name'].str.contains('Alice')]
df[df['name'].isin(['Alice', 'Bob'])]
df[~df['name'].isin(['Alice'])]   # NOT in list

# Query method
df.query('age > 30')
df.query('age > 30 and city == "NYC"')
df.query('name in ["Alice", "Bob"]')
Combined Selection
# Row and column
df.loc[0:5, ['name', 'age']]
df.iloc[0:5, 0:2]

# Boolean + columns
df.loc[df['age'] > 30, ['name', 'city']]
Filtering Patterns
# Multiple conditions
mask = (df['age'] > 25) & (df['city'].isin(['NYC', 'LA']))
filtered = df[mask]

# Filter by string patterns
df[df['name'].str.startswith('A')]
df[df['name'].str.endswith('e')]
df[df['name'].str.contains('li', case=False)]
df[df['email'].str.match(r'.*@gmail\.com')]

# Filter by null values
df[df['column'].isnull()]
df[df['column'].notnull()]
df.dropna()                    # Drop rows with any null
df.dropna(subset=['column'])   # Drop if specific column is null
df.dropna(how='all')           # Drop only if all values are null

# Filter by index
df[df.index.isin([0, 2, 4])]
df.loc['2024-01-01':'2024-01-31']   # Date range (requires a DatetimeIndex)

# Top/bottom N
df.nlargest(10, 'age')
df.nsmallest(10, 'age')

# Sample
df.sample(n=10)                    # Random 10 rows
df.sample(frac=0.1)                # Random 10%
df.sample(n=10, random_state=42)   # Reproducible
Adding/Modifying Columns
# Simple assignment
df['new_col'] = 0
df['new_col'] = df['age'] * 2
df['full_name'] = df['first_name'] + ' ' + df['last_name']

# Conditional assignment
df['category'] = df['age'].apply(lambda x: 'young' if x < 30 else 'old')
df['category'] = np.where(df['age'] < 30, 'young', 'old')
df['category'] = pd.cut(df['age'], bins=[0, 30, 60, 100],
                        labels=['young', 'middle', 'senior'])

# Multiple conditions
conditions = [
    df['age'] < 25,
    (df['age'] >= 25) & (df['age'] < 40),
    df['age'] >= 40
]
choices = ['young', 'middle', 'senior']
df['age_group'] = np.select(conditions, choices, default='unknown')

# Apply function
df['age_squared'] = df['age'].apply(lambda x: x**2)
df['name_length'] = df['name'].str.len()

# Map values
mapping = {'NYC': 'New York', 'LA': 'Los Angeles'}
df['city_full'] = df['city'].map(mapping)

# Replace values
df['city'] = df['city'].replace({'NYC': 'New York', 'LA': 'Los Angeles'})

# Insert at specific position
df.insert(1, 'new_col', 0)   # Insert at position 1

# Rename columns
df.rename(columns={'old_name': 'new_name'}, inplace=True)
df.columns = ['col1', 'col2', 'col3']   # Rename all
Aggregation and Grouping
GroupBy Operations
# Basic groupby
grouped = df.groupby('city')
grouped.size()                    # Count per group
grouped.mean(numeric_only=True)   # Mean of numeric columns (numeric_only avoids errors on mixed dtypes)
grouped.sum(numeric_only=True)
grouped.count()
grouped.min()
grouped.max()
grouped.std(numeric_only=True)

# Multiple columns
df.groupby(['city', 'age_group']).size()

# Specific aggregation
df.groupby('city')['age'].mean()
df.groupby('city')['age'].agg(['mean', 'min', 'max', 'std'])

# Multiple aggregations
df.groupby('city').agg({
    'age': ['mean', 'min', 'max'],
    'salary': ['sum', 'mean']
})

# Named/custom aggregation
df.groupby('city')['age'].agg(
    mean_age='mean',
    min_age='min',
    max_age='max',
    range_age=lambda x: x.max() - x.min()
)

# Apply custom function
def custom_func(group):
    return group['age'].max() - group['age'].min()

df.groupby('city').apply(custom_func)

# Transform (keeps original shape)
df['age_normalized'] = df.groupby('city')['age'].transform(
    lambda x: (x - x.mean()) / x.std()
)

# Filter groups
df.groupby('city').filter(lambda x: len(x) > 10)
df.groupby('city').filter(lambda x: x['age'].mean() > 30)
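To make the named-aggregation and transform patterns above concrete, here is a small self-contained sketch on a made-up frame (the salary figures are purely illustrative):

import pandas as pd

people = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA', 'LA'],
    'age': [25, 35, 30, 40, 50],
    'salary': [70, 90, 80, 100, 120]   # hypothetical values
})

# Named aggregation: one row per city, clearly labelled output columns
summary = people.groupby('city').agg(
    mean_age=('age', 'mean'),
    total_salary=('salary', 'sum')
)

# Transform keeps the original length, so the result aligns row-for-row
people['salary_vs_city_mean'] = (
    people['salary'] - people.groupby('city')['salary'].transform('mean')
)
print(summary)
print(people)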
Pivot Tables
# Basic pivot
pivot = df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc='mean'
)

# Multiple aggregations
pivot = df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc=['mean', 'sum', 'count']
)

# Multiple values
pivot = df.pivot_table(
    values=['salary', 'age'],
    index='city',
    columns='department',
    aggfunc='mean'
)

# With margins (totals)
pivot = df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc='mean',
    margins=True
)

# Cross-tabulation
pd.crosstab(df['city'], df['department'])
pd.crosstab(df['city'], df['department'], normalize='all')   # Proportions of the grand total
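The pivot snippets above assume department and salary columns that are not defined elsewhere on this page; a self-contained sketch with made-up data might look like this:

import pandas as pd

staff = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA'],
    'department': ['Eng', 'Sales', 'Eng', 'Sales'],
    'salary': [120, 90, 110, 85]   # hypothetical values
})

# One row per city, one column per department, mean salary in each cell
pivot = staff.pivot_table(values='salary', index='city',
                          columns='department', aggfunc='mean')
print(pivot)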
Sorting
# Sort by single column
df.sort_values('age')
df.sort_values('age', ascending=False)

# Sort by multiple columns
df.sort_values(['city', 'age'], ascending=[True, False])

# Sort by index
df.sort_index()
df.sort_index(ascending=False)

# In-place sorting
df.sort_values('age', inplace=True)
Merging and Joining
# Merge (SQL-like join)
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Inner join (default)
pd.merge(df1, df2, on='key')

# Left join
pd.merge(df1, df2, on='key', how='left')

# Right join
pd.merge(df1, df2, on='key', how='right')

# Outer join
pd.merge(df1, df2, on='key', how='outer')

# Multiple keys
pd.merge(df1, df2, on=['key1', 'key2'])

# Different column names
pd.merge(df1, df2, left_on='key1', right_on='key2')

# Merge on index
pd.merge(df1, df2, left_index=True, right_index=True)

# Concatenate
pd.concat([df1, df2])                      # Vertical (rows)
pd.concat([df1, df2], axis=1)              # Horizontal (columns)
pd.concat([df1, df2], ignore_index=True)   # Reset index

# Append (deprecated, use concat)
pd.concat([df1, df2], ignore_index=True)

# Join (on index; overlapping column names need suffixes)
df1.join(df2, how='left', lsuffix='_left', rsuffix='_right')
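When debugging joins it can help to see which side each row came from; merge accepts indicator=True, which adds a _merge column. A short sketch (redefining df1 and df2 so it runs on its own):

import pandas as pd

df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# '_merge' is 'both', 'left_only', or 'right_only' for each output row
merged = pd.merge(df1, df2, on='key', how='outer', indicator=True)
print(merged)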
Reshaping
# Melt (wide to long)
df_wide = pd.DataFrame({
    'id': [1, 2],
    'A': [10, 20],
    'B': [30, 40]
})
df_long = df_wide.melt(id_vars=['id'], var_name='variable', value_name='value')

# Pivot (long to wide)
df_wide = df_long.pivot(index='id', columns='variable', values='value')

# Stack/Unstack
df.stack()     # Columns to rows
df.unstack()   # Rows to columns

# Transpose
df.T
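stack/unstack are easiest to see with a MultiIndex. A small sketch (made-up sales data) that turns a grouped result into a wide table and back:

import pandas as pd

sales = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA'],
    'quarter': ['Q1', 'Q2', 'Q1', 'Q2'],
    'revenue': [10, 12, 8, 9]   # hypothetical values
})

# Grouping by two keys gives a Series with a MultiIndex
totals = sales.groupby(['city', 'quarter'])['revenue'].sum()

wide = totals.unstack('quarter')   # quarters become columns
long_again = wide.stack()          # back to the MultiIndex Series
print(wide)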
Handling Missing Data
# Detect missing
df.isnull()
df.notnull()
df.isnull().sum()   # Count per column

# Drop missing
df.dropna()                    # Drop rows with any null
df.dropna(axis=1)              # Drop columns with any null
df.dropna(how='all')           # Drop only if all values null
df.dropna(subset=['column'])   # Drop if specific column null
df.dropna(thresh=2)            # Keep rows with at least 2 non-null values

# Fill missing
df.fillna(0)                              # Fill with constant
df.ffill()                                # Forward fill (fillna(method='ffill') is deprecated)
df.bfill()                                # Backward fill
df.fillna(df.mean(numeric_only=True))     # Fill with column means
df.fillna({'col1': 0, 'col2': 'unknown'})   # Different values per column

# Interpolate
df['column'].interpolate()
df['column'].interpolate(method='linear')
df['column'].interpolate(method='polynomial', order=2)   # Requires SciPy

# Replace
df.replace(np.nan, 0)
df.replace([0, -999], np.nan)
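A small sketch contrasting forward fill with linear interpolation on a made-up series:

import pandas as pd
import numpy as np

s = pd.Series([1.0, np.nan, np.nan, 4.0])

print(s.ffill())         # 1.0, 1.0, 1.0, 4.0 (repeats the last seen value)
print(s.interpolate())   # 1.0, 2.0, 3.0, 4.0 (linear between known points)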
Time Series Operations
# Create datetime index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Resample
df.resample('D').mean()    # Daily average
df.resample('W').sum()     # Weekly sum
df.resample('M').last()    # Monthly last value ('M' is being replaced by 'ME' in pandas >= 2.2)
df.resample('Q').agg({'value': 'sum', 'count': 'size'})   # Quarterly ('Q' -> 'QE' likewise)

# Rolling window
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()
df['rolling_sum'] = df['value'].rolling(window=7).sum()

# Expanding window
df['expanding_mean'] = df['value'].expanding().mean()

# Shift
df['prev_value'] = df['value'].shift(1)       # Previous row
df['next_value'] = df['value'].shift(-1)      # Next row
df['pct_change'] = df['value'].pct_change()   # Percentage change

# Date operations
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek
df['quarter'] = df.index.quarter

# Filter by date
df['2024-01-01':'2024-01-31']
df.loc['2024-01']   # All of January 2024
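A self-contained sketch of the resample and rolling patterns on synthetic daily data:

import pandas as pd
import numpy as np

dates = pd.date_range('2024-01-01', periods=60, freq='D')
ts = pd.DataFrame({'value': np.random.randn(60)}, index=dates)

weekly = ts.resample('W').mean()                          # one row per week
ts['rolling_7d'] = ts['value'].rolling(window=7).mean()   # 7-day moving average
print(weekly.head())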
String Operations
# String methods (vectorized)
df['name'].str.lower()
df['name'].str.upper()
df['name'].str.title()
df['name'].str.strip()
df['name'].str.replace('old', 'new')
df['name'].str.split(' ')
df['name'].str.split(' ', expand=True)   # Split into columns
df['name'].str.len()
df['name'].str.contains('pattern')
df['name'].str.startswith('A')
df['name'].str.endswith('e')
df['name'].str.extract(r'(\d+)')   # Regex extraction
df['name'].str.findall(r'\d+')
df['name'].str.count('a')
df['name'].str.slice(0, 3)   # First 3 characters
Performance Optimization
# Use categorical for repeated strings
df['category'] = df['category'].astype('category')

# Downcast numeric types
df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
df['float_col'] = pd.to_numeric(df['float_col'], downcast='float')

# Use chunks for large files
chunks = pd.read_csv('large_file.csv', chunksize=10000)
for chunk in chunks:
    process(chunk)

# Vectorized operations (avoid loops)
# ❌ Slow
for i in range(len(df)):
    df.loc[i, 'new_col'] = df.loc[i, 'col1'] * 2

# ✅ Fast
df['new_col'] = df['col1'] * 2

# Use query for complex filters
df.query('age > 30 and city == "NYC"')   # Often faster than chained boolean masks on large frames

# Use eval for complex calculations
df.eval('new_col = col1 + col2 * col3')

# Copy vs subset
df_copy = df.copy()                # Independent full copy
df_subset = df[['col1', 'col2']]   # New DataFrame with the selected columns (not a writable view of df)
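A sketch of the chunked-reading pattern that aggregates as it goes, so the full file never sits in memory (large_file.csv and the amount column are hypothetical):

import pandas as pd

total = 0.0
rows = 0
# Process the CSV in 100k-row chunks; only running totals stay in memory
for chunk in pd.read_csv('large_file.csv', chunksize=100_000):
    total += chunk['amount'].sum()   # 'amount' is a hypothetical column
    rows += len(chunk)

print(total / rows if rows else float('nan'))   # overall mean of 'amount'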
Export Data
# CSV
df.to_csv('output.csv', index=False)
df.to_csv('output.csv', sep=';', encoding='utf-8')

# Excel
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)

# JSON
df.to_json('output.json', orient='records')

# SQL
df.to_sql('table_name', conn, if_exists='replace', index=False)

# Parquet
df.to_parquet('output.parquet', compression='gzip')

# HTML
df.to_html('output.html')

# Clipboard
df.to_clipboard()
Common Patterns
Remove Duplicates
# Drop duplicate rows
df.drop_duplicates()
df.drop_duplicates(subset=['column'])
df.drop_duplicates(keep='first')   # Keep first occurrence
df.drop_duplicates(keep='last')    # Keep last occurrence
df.drop_duplicates(keep=False)     # Remove all duplicates
Reset Index
df.reset_index(drop=True)   # Drop old index
df.reset_index()            # Keep old index as column
Set Index
df.set_index('column')
df.set_index(['col1', 'col2'])   # MultiIndex
Chain Operations
result = (df
    .query('age > 25')
    .groupby('city')
    .agg({'salary': 'mean', 'age': 'max'})
    .reset_index()
    .sort_values('salary', ascending=False)
    .head(10)
)
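The chained pipeline above assumes salary and age columns; a runnable version on a tiny made-up frame:

import pandas as pd

staff = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA', 'Chicago'],
    'age': [26, 41, 29, 35, 52],
    'salary': [95, 130, 88, 120, 140]   # hypothetical values
})

top_cities = (staff
    .query('age > 25')
    .groupby('city')
    .agg({'salary': 'mean', 'age': 'max'})
    .reset_index()
    .sort_values('salary', ascending=False)
    .head(10)
)
print(top_cities)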
Conditional Replacement
# Replace values conditionally
df.loc[df['age'] > 30, 'category'] = 'senior'
df.loc[df['city'] == 'NYC', 'region'] = 'East'