Pandas DataFrames Essential Patterns

Essential patterns for working with Pandas DataFrames: creation, manipulation, filtering, aggregation, and transformation.

Installation

pip install pandas numpy

Creating DataFrames

From Dictionary

import pandas as pd
import numpy as np

# From dict of lists
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['NYC', 'LA', 'Chicago']
})

# From list of dicts
data = [
    {'name': 'Alice', 'age': 25, 'city': 'NYC'},
    {'name': 'Bob', 'age': 30, 'city': 'LA'},
    {'name': 'Charlie', 'age': 35, 'city': 'Chicago'}
]
df = pd.DataFrame(data)

# From dict of Series
df = pd.DataFrame({
    'A': pd.Series([1, 2, 3]),
    'B': pd.Series([4, 5, 6])
})

From Files

# CSV
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', sep=';', encoding='utf-8')
df = pd.read_csv('data.csv', parse_dates=['date_column'])

# Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# JSON
df = pd.read_json('data.json')
df = pd.read_json('data.json', orient='records')

# SQL
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql_query('SELECT * FROM table_name', conn)

# Parquet
df = pd.read_parquet('data.parquet')

# From clipboard
df = pd.read_clipboard()

Generate Sample Data

# Random data
df = pd.DataFrame({
    'A': np.random.randn(100),
    'B': np.random.randint(0, 100, 100),
    'C': np.random.choice(['X', 'Y', 'Z'], 100)
})

# Date range
dates = pd.date_range('2024-01-01', periods=100, freq='D')
df = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(100)
})

# From NumPy array
arr = np.random.randn(5, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])

Inspecting DataFrames

# Basic info
df.head()           # First 5 rows
df.head(10)         # First 10 rows
df.tail()           # Last 5 rows
df.sample(5)        # Random 5 rows
df.shape            # (rows, columns)
df.columns          # Column names
df.dtypes           # Data types
df.info()           # Summary info
df.describe()       # Statistical summary
df.memory_usage()   # Memory usage

# Quick stats
df['column'].value_counts()
df['column'].nunique()
df['column'].unique()
df.isnull().sum()   # Count nulls per column
df.duplicated().sum()  # Count duplicates
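
As a quick sanity check, here is a minimal, self-contained sketch of these inspection calls on a slight variant of the earlier name/age/city frame (Charlie moved to NYC so value_counts has something to count):

import pandas as pd

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['NYC', 'LA', 'NYC']
})

print(df.shape)                   # (3, 3)
print(df['city'].value_counts())  # NYC: 2, LA: 1
print(df.isnull().sum())          # 0 nulls in every column
df.info()                         # dtypes and non-null counts per column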

Selecting Data

Column Selection

# Single column (returns Series)
df['name']
df.name  # Attribute access (only if the name is a valid identifier and not an existing attribute)

# Multiple columns (returns DataFrame)
df[['name', 'age']]

# Select by dtype
df.select_dtypes(include=['int64', 'float64'])
df.select_dtypes(exclude=['object'])

# Column slicing
df.iloc[:, 0:3]  # First 3 columns

Row Selection

# By index
df.iloc[0]          # First row
df.iloc[0:5]        # First 5 rows
df.iloc[[0, 2, 4]]  # Specific rows

# By label
df.loc[0]           # Row with index 0
df.loc[0:5]         # Rows 0 through 5 (inclusive)
df.loc[[0, 2, 4]]   # Specific rows

# Boolean indexing
df[df['age'] > 30]
df[df['city'] == 'NYC']
df[(df['age'] > 25) & (df['city'] == 'NYC')]
df[df['age'].between(25, 35)]
df[df['name'].str.contains('Alice')]
df[df['name'].isin(['Alice', 'Bob'])]
df[~df['name'].isin(['Alice'])]  # NOT in list

# Query method
df.query('age > 30')
df.query('age > 30 and city == "NYC"')
df.query('name in ["Alice", "Bob"]')

Combined Selection

# Row and column
df.loc[0:5, ['name', 'age']]
df.iloc[0:5, 0:2]

# Boolean + columns
df.loc[df['age'] > 30, ['name', 'city']]

Filtering Patterns

# Multiple conditions
mask = (df['age'] > 25) & (df['city'].isin(['NYC', 'LA']))
filtered = df[mask]

# Filter by string patterns
df[df['name'].str.startswith('A')]
df[df['name'].str.endswith('e')]
df[df['name'].str.contains('li', case=False)]
df[df['email'].str.match(r'.*@gmail\.com')]

# Filter by null values
df[df['column'].isnull()]
df[df['column'].notnull()]
df.dropna()  # Drop rows with any null
df.dropna(subset=['column'])  # Drop if specific column is null
df.dropna(how='all')  # Drop only if all values are null

# Filter by index
df[df.index.isin([0, 2, 4])]
df.loc['2024-01-01':'2024-01-31']  # Date range (requires a DatetimeIndex)

# Top/bottom N
df.nlargest(10, 'age')
df.nsmallest(10, 'age')

# Sample
df.sample(n=10)  # Random 10 rows
df.sample(frac=0.1)  # Random 10%
df.sample(n=10, random_state=42)  # Reproducible

Adding/Modifying Columns

# Simple assignment
df['new_col'] = 0
df['new_col'] = df['age'] * 2
df['full_name'] = df['first_name'] + ' ' + df['last_name']

# Conditional assignment
df['category'] = df['age'].apply(lambda x: 'young' if x < 30 else 'old')
df['category'] = np.where(df['age'] < 30, 'young', 'old')
df['category'] = pd.cut(df['age'], bins=[0, 30, 60, 100],
                        labels=['young', 'middle', 'senior'])

# Multiple conditions
conditions = [
    df['age'] < 25,
    (df['age'] >= 25) & (df['age'] < 40),
    df['age'] >= 40
]
choices = ['young', 'middle', 'senior']
df['age_group'] = np.select(conditions, choices, default='unknown')

# Apply function
df['age_squared'] = df['age'].apply(lambda x: x**2)
df['name_length'] = df['name'].str.len()

# Map values
mapping = {'NYC': 'New York', 'LA': 'Los Angeles'}
df['city_full'] = df['city'].map(mapping)

# Replace values
df['city'] = df['city'].replace({'NYC': 'New York', 'LA': 'Los Angeles'})

# Insert at specific position
df.insert(1, 'new_col', 0)  # Insert at position 1

# Rename columns
df.rename(columns={'old_name': 'new_name'}, inplace=True)
df.columns = ['col1', 'col2', 'col3']  # Rename all

Aggregation and Grouping

GroupBy Operations

# Basic groupby
grouped = df.groupby('city')
grouped.size()  # Count per group
grouped.mean(numeric_only=True)  # Mean of numeric columns
grouped.sum(numeric_only=True)
grouped.count()
grouped.min()
grouped.max()
grouped.std(numeric_only=True)

# Multiple columns
df.groupby(['city', 'age_group']).size()

# Specific aggregation
df.groupby('city')['age'].mean()
df.groupby('city')['age'].agg(['mean', 'min', 'max', 'std'])

# Multiple aggregations
df.groupby('city').agg({
    'age': ['mean', 'min', 'max'],
    'salary': ['sum', 'mean']
})

# Named/custom aggregation
df.groupby('city')['age'].agg(
    mean_age='mean',
    min_age='min',
    max_age='max',
    range_age=lambda x: x.max() - x.min()
)

# Apply custom function
def custom_func(group):
    return group['age'].max() - group['age'].min()

df.groupby('city').apply(custom_func)

# Transform (keeps original shape)
df['age_normalized'] = df.groupby('city')['age'].transform(
    lambda x: (x - x.mean()) / x.std()
)

# Filter groups
df.groupby('city').filter(lambda x: len(x) > 10)
df.groupby('city').filter(lambda x: x['age'].mean() > 30)
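
The snippets above reference columns such as salary that are not in the small example frame; here is a minimal self-contained sketch (made-up data) showing named aggregation and transform end to end:

import pandas as pd

df = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA', 'LA'],
    'age': [25, 35, 30, 40, 50],
    'salary': [70, 90, 65, 85, 95]
})

# Named aggregation on one column
summary = df.groupby('city')['age'].agg(
    mean_age='mean',
    age_range=lambda x: x.max() - x.min()
)
print(summary)

# transform keeps the original row count, so the result aligns with df
df['salary_vs_city_mean'] = df['salary'] - df.groupby('city')['salary'].transform('mean')
print(df)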

Pivot Tables

# Basic pivot
pivot = df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc='mean'
)

# Multiple aggregations
pivot = df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc=['mean', 'sum', 'count']
)

# Multiple values
pivot = df.pivot_table(
    values=['salary', 'age'],
    index='city',
    columns='department',
    aggfunc='mean'
)

# With margins (totals)
pivot = df.pivot_table(
    values='salary',
    index='city',
    columns='department',
    aggfunc='mean',
    margins=True
)

# Cross-tabulation
pd.crosstab(df['city'], df['department'])
pd.crosstab(df['city'], df['department'], normalize='all')  # Proportions of the grand total
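
For context, a small self-contained pivot example (hypothetical city/department/salary data):

import pandas as pd

df = pd.DataFrame({
    'city': ['NYC', 'NYC', 'LA', 'LA'],
    'department': ['Eng', 'Sales', 'Eng', 'Sales'],
    'salary': [100, 80, 95, 75]
})

pivot = df.pivot_table(values='salary', index='city',
                       columns='department', aggfunc='mean', margins=True)
print(pivot)  # Rows: cities (+ 'All'); columns: departments (+ 'All'); cells: mean salary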

Sorting

# Sort by single column
df.sort_values('age')
df.sort_values('age', ascending=False)

# Sort by multiple columns
df.sort_values(['city', 'age'], ascending=[True, False])

# Sort by index
df.sort_index()
df.sort_index(ascending=False)

# In-place sorting
df.sort_values('age', inplace=True)

Merging and Joining

# Merge (SQL-like join)
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Inner join (default)
pd.merge(df1, df2, on='key')

# Left join
pd.merge(df1, df2, on='key', how='left')

# Right join
pd.merge(df1, df2, on='key', how='right')

# Outer join
pd.merge(df1, df2, on='key', how='outer')

# Multiple keys
pd.merge(df1, df2, on=['key1', 'key2'])

# Different column names
pd.merge(df1, df2, left_on='key1', right_on='key2')

# Merge on index
pd.merge(df1, df2, left_index=True, right_index=True)

# Concatenate
pd.concat([df1, df2])  # Vertical (rows)
pd.concat([df1, df2], axis=1)  # Horizontal (columns)
pd.concat([df1, df2], ignore_index=True)  # Reset index

# DataFrame.append is deprecated; use concat instead
pd.concat([df1, df2], ignore_index=True)

# Join (on index; overlapping column names need suffixes)
df1.join(df2, how='left', lsuffix='_1', rsuffix='_2')

Reshaping

# Melt (wide to long)
df_wide = pd.DataFrame({
    'id': [1, 2],
    'A': [10, 20],
    'B': [30, 40]
})
df_long = df_wide.melt(id_vars=['id'], var_name='variable', value_name='value')

# Pivot (long to wide)
df_wide = df_long.pivot(index='id', columns='variable', values='value')

# Stack/Unstack
df.stack()    # Columns to rows
df.unstack()  # Rows to columns

# Transpose
df.T

Handling Missing Data

# Detect missing
df.isnull()
df.notnull()
df.isnull().sum()  # Count per column

# Drop missing
df.dropna()  # Drop rows with any null
df.dropna(axis=1)  # Drop columns with any null
df.dropna(how='all')  # Drop only if all values null
df.dropna(subset=['column'])  # Drop if specific column null
df.dropna(thresh=2)  # Keep rows with at least 2 non-null values

# Fill missing
df.fillna(0)  # Fill with constant
df.ffill()  # Forward fill (fillna(method='ffill') is deprecated)
df.bfill()  # Backward fill
df.fillna(df.mean(numeric_only=True))  # Fill with column means
df.fillna({'col1': 0, 'col2': 'unknown'})  # Different values per column

# Interpolate
df['column'].interpolate()
df['column'].interpolate(method='linear')
df['column'].interpolate(method='polynomial', order=2)

# Replace
df.replace(np.nan, 0)
df.replace([0, -999], np.nan)

Time Series Operations

# Create datetime index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Resample
df.resample('D').mean()  # Daily average
df.resample('W').sum()   # Weekly sum
df.resample('M').last()  # Monthly last value
df.resample('Q').agg({'value': 'sum', 'count': 'size'})

# Rolling window
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()
df['rolling_sum'] = df['value'].rolling(window=7).sum()

# Expanding window
df['expanding_mean'] = df['value'].expanding().mean()

# Shift
df['prev_value'] = df['value'].shift(1)  # Previous row
df['next_value'] = df['value'].shift(-1)  # Next row
df['pct_change'] = df['value'].pct_change()  # Percentage change

# Date operations
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek
df['quarter'] = df.index.quarter

# Filter by date
df['2024-01-01':'2024-01-31']
df.loc['2024-01']  # All of January 2024
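
A short end-to-end sketch with generated data, tying the pieces above together (column and variable names here are illustrative):

import numpy as np
import pandas as pd

idx = pd.date_range('2024-01-01', periods=60, freq='D')
ts = pd.DataFrame({'value': np.random.randn(60).cumsum()}, index=idx)

weekly = ts.resample('W').mean()                         # one row per week
ts['rolling_7d'] = ts['value'].rolling(window=7).mean()  # NaN for the first 6 rows
january = ts.loc['2024-01']                              # partial-string date filter
print(weekly.head())
print(january.tail())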

String Operations

# String methods (vectorized)
df['name'].str.lower()
df['name'].str.upper()
df['name'].str.title()
df['name'].str.strip()
df['name'].str.replace('old', 'new')
df['name'].str.split(' ')
df['name'].str.split(' ', expand=True)  # Split into columns
df['name'].str.len()
df['name'].str.contains('pattern')
df['name'].str.startswith('A')
df['name'].str.endswith('e')
df['name'].str.extract(r'(\d+)')  # Regex extraction
df['name'].str.findall(r'\d+')
df['name'].str.count('a')
df['name'].str.slice(0, 3)  # First 3 characters

Performance Optimization

# Use categorical for repeated strings
df['category'] = df['category'].astype('category')

# Downcast numeric types
df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
df['float_col'] = pd.to_numeric(df['float_col'], downcast='float')

# Use chunks for large files
chunks = pd.read_csv('large_file.csv', chunksize=10000)
for chunk in chunks:
    process(chunk)

# Vectorized operations (avoid loops)
# ❌ Slow
for i in range(len(df)):
    df.loc[i, 'new_col'] = df.loc[i, 'col1'] * 2

# ✅ Fast
df['new_col'] = df['col1'] * 2

# Use query for complex filters
df.query('age > 30 and city == "NYC"')  # Can be faster than boolean indexing on large DataFrames

# Use eval for complex calculations
df.eval('new_col = col1 + col2 * col3')

# Copy vs slice
df_copy = df.copy()  # Explicit, independent copy
df_sub = df[['col1', 'col2']].copy()  # Copy a subset before modifying it to avoid SettingWithCopyWarning
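
A rough way to see the effect of the categorical and downcast tips on memory (numbers vary with the data; this is just an illustrative sketch):

import numpy as np
import pandas as pd

n = 100_000
df = pd.DataFrame({
    'category': np.random.choice(['X', 'Y', 'Z'], n),
    'int_col': np.random.randint(0, 100, n)
})

before = df.memory_usage(deep=True).sum()
df['category'] = df['category'].astype('category')
df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
after = df.memory_usage(deep=True).sum()
print(f'{int(before):,} bytes -> {int(after):,} bytes')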

Export Data

# CSV
df.to_csv('output.csv', index=False)
df.to_csv('output.csv', sep=';', encoding='utf-8')

# Excel
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)

# JSON
df.to_json('output.json', orient='records')

# SQL
df.to_sql('table_name', conn, if_exists='replace', index=False)

# Parquet
df.to_parquet('output.parquet', compression='gzip')

# HTML
df.to_html('output.html')

# Clipboard
df.to_clipboard()

Common Patterns

Remove Duplicates

# Drop duplicate rows
df.drop_duplicates()
df.drop_duplicates(subset=['column'])
df.drop_duplicates(keep='first')  # Keep first occurrence
df.drop_duplicates(keep='last')   # Keep last occurrence
df.drop_duplicates(keep=False)    # Drop every row that has a duplicate

Reset Index

df.reset_index(drop=True)  # Drop old index
df.reset_index()  # Keep old index as column

Set Index

df.set_index('column')
df.set_index(['col1', 'col2'])  # MultiIndex

Chain Operations

result = (df
    .query('age > 25')
    .groupby('city')
    .agg({'salary': 'mean', 'age': 'max'})
    .reset_index()
    .sort_values('salary', ascending=False)
    .head(10)
)

Conditional Replacement

# Replace values conditionally
df.loc[df['age'] > 30, 'category'] = 'senior'
df.loc[df['city'] == 'NYC', 'region'] = 'East'
