Complete Pandas Tutorial
Table of Contents
1. Introduction & Setup
2. Basic Data Structures
3. Data Loading & Saving
4. Data Inspection & Exploration
5. Data Selection & Indexing
6. Data Cleaning
7. Data Transformation
8. Grouping & Aggregation
9. Merging & Joining
10. Time Series Analysis
11. Visualization with Pandas
12. Advanced Operations
13. Performance Optimization
14. Real-World Projects
1. Introduction & Setup
Pandas is a powerful Python library for data manipulation and analysis. It provides the data structures and functions needed to work with structured data seamlessly.
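A quick, self-contained taste of what that looks like in practice (the column names here are purely illustrative):
import pandas as pd

# A small table of structured data
df = pd.DataFrame({'city': ['NYC', 'LA', 'NYC'], 'sales': [250, 310, 180]})

# Average sales per city in a single line
print(df.groupby('city')['sales'].mean())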
Installation
pip install pandas numpy matplotlib seaborn
Basic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
2. Basic Data Structures
Series
# Creating Series
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3})
# Series properties
print(s2.index)   # Index
print(s2.values)  # Values
print(s2.dtype)   # Data type
print(s2.shape)   # Shape
print(s2.size)    # Size
DataFrame
# Creating DataFrames
df1 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'City': ['NYC', 'LA', 'Chicago', 'Houston']
})
# From lists
df2 = pd.DataFrame([
    ['Alice', 25, 'NYC'],
    ['Bob', 30, 'LA']
], columns=['Name', 'Age', 'City'])
# DataFrame properties
print(df1.shape)    # (rows, columns)
print(df1.columns)  # Column names
print(df1.index)    # Row indices
print(df1.dtypes)   # Data types
df1.info()          # Summary info (prints directly; no need to wrap in print)
3. Data Loading & Saving
Reading Data
# CSV files
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', index_col=0)               # Set first column as index
df = pd.read_csv('data.csv', usecols=['col1', 'col2'])  # Select specific columns
df = pd.read_csv('data.csv', nrows=1000)                # Read first 1000 rows
# Excel files
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
df = pd.read_excel('data.xlsx', sheet_name=0)           # By index
# JSON files
df = pd.read_json('data.json')
df = pd.read_json('data.json', orient='records')
# SQL databases
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql_query('SELECT * FROM table_name', conn)
# Parquet files
df = pd.read_parquet('data.parquet')
# Text files
df = pd.read_csv('data.txt', delimiter='\t')            # Tab-separated
df = pd.read_csv('data.txt', delimiter='|')             # Pipe-separated
Saving Data
# CSV
df.to_csv('output.csv', index=False)
df.to_csv('output.csv', columns=['col1', 'col2'])  # Specific columns
# Excel
df.to_excel('output.xlsx', sheet_name='Data', index=False)
# JSON
df.to_json('output.json', orient='records')
# Parquet
df.to_parquet('output.parquet')
# SQL
df.to_sql('table_name', conn, if_exists='replace', index=False)
4. Data Inspection & Exploration
Basic Information
# Shape and structure
df.shape                    # (rows, columns)
df.info()                   # Data types and memory usage
df.describe()               # Statistical summary
df.describe(include='all')  # All columns including non-numeric
# Sampling
df.sample()                 # Random single row
df.sample(5)                # Random 5 rows
df.sample(frac=0.1)         # Random 10% of data
# Duplicates
df.duplicated().sum()                 # Count duplicate rows
df.duplicated(subset=['col1']).sum()  # Duplicates based on specific columns
df[df.duplicated()]                   # Show duplicate rows
# Unique values
df['column'].unique() # Unique values
df['column'].nunique() # Count of unique values
df['column'].value_counts() # Value frequency counts
df['column'].value_counts(normalize=True) # Proportions
Memory Usage
df.memory_usage() # Memory usage by column
df.memory_usage(deep=True) # Deep memory usage
df.info(memory_usage='deep')
5. Data Selection & Indexing
Column Selection
# Single column
df['Name'] # Returns Series
df[['Name']] # Returns DataFrame
# Multiple columns
df[['Name', 'Age']]
cols = ['Name', 'Age']
df[cols]
# Column slicing
df.loc[:, 'Name':'City']  # All rows, columns from Name to City
Row Selection
# By index position
df.iloc[0]    # First row
df.iloc[0:3]  # First 3 rows
df.iloc[-1]   # Last row
# By index label
df.loc[0]     # Row with index 0
df.loc[0:2]   # Rows with index 0 to 2
# Multiple rows
df.iloc[[0, 2, 4]]  # Rows at positions 0, 2, 4
df.loc[[0, 2, 4]]   # Rows with indices 0, 2, 4
Boolean Indexing
# Single condition
df[df['Age'] > 30]
df[df['City'] == 'NYC']
df[df['Name'].str.startswith('A')]
# Multiple conditions
df[(df['Age'] > 25) & (df['City'] == 'NYC')]
df[(df['Age'] < 25) | (df['Age'] > 35)]
df[df['Age'].between(25, 35)]
# Using isin()
df[df['City'].isin(['NYC', 'LA'])]
df[~df['City'].isin(['NYC', 'LA'])] # NOT in
# Using query()
df.query('Age > 30')
df.query('Age > 30 & City == "NYC"')
df.query('City in ["NYC", "LA"]')
Advanced Indexing
# Set index
df_indexed = df.set_index('Name')
df_indexed.loc['Alice']
# Reset index
df_reset = df_indexed.reset_index()
# Multi-level indexing
df_multi = df.set_index(['City', 'Name'])
df_multi.loc[('NYC', 'Alice')]
# Cross-section
df_multi.xs('NYC', level='City')
6. Data Cleaning
Handling Missing Values
# Detect missing values
df.isnull()  # Boolean mask of missing values
df.isna()    # Same as isnull()
df.notna()   # Opposite of isna()
# Interpolation
df.interpolate()                              # Linear interpolation
df.interpolate(method='polynomial', order=2)  # Polynomial interpolation
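Detection and interpolation are usually paired with dropping or filling missing values; a brief sketch (the column names follow the earlier examples):
# Drop rows or columns containing missing values
df.dropna()                # Drop rows with any NaN
df.dropna(axis=1)          # Drop columns with any NaN
df.dropna(subset=['Age'])  # Only consider specific columns
# Fill missing values
df['Age'].fillna(df['Age'].mean())        # Fill with the column mean
df.fillna({'Age': 0, 'City': 'Unknown'})  # Column-specific fill values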
Handling Duplicates
# Remove duplicates
df.drop_duplicates()
df.drop_duplicates(subset=['col1']) # Based on specific columns
df.drop_duplicates(keep='first') # Keep first occurrence
df.drop_duplicates(keep='last') # Keep last occurrence
df.drop_duplicates(keep=False) # Remove all duplicates
Data Type Conversion
# Convert to categorical
df['Category'] = df['Category'].astype('category')
String Cleaning
# String methods
df['Name'].str.lower()   # Lowercase
df['Name'].str.upper()   # Uppercase
df['Name'].str.title()   # Title case
df['Name'].str.strip()   # Remove whitespace
df['Name'].str.replace('old', 'new')  # Replace text
# String operations
df['Name'].str.len()                # String length
df['Name'].str.contains('pattern')  # Contains pattern
df['Name'].str.startswith('A')      # Starts with
df['Name'].str.endswith('son')      # Ends with
df['Name'].str.extract(r'(\w+)')    # Extract pattern
# Split strings
df['Name'].str.split()                  # Split on whitespace
df['Name'].str.split(' ', expand=True)  # Split into columns
Handling Outliers
# IQR method: compute bounds, then filter (the 1.5 * IQR rule shown here is one common convention)
Q1 = df['column'].quantile(0.25)
Q3 = df['column'].quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
df_clean = df[(df['column'] >= lower_bound) & (df['column'] <= upper_bound)]
# Z-score method
from scipy import stats
z_scores = np.abs(stats.zscore(df['column']))
df_clean = df[z_scores < 3]
7. Data Transformation
Adding and Modifying Columns
# Add new columns
df['New_Column'] = 0
df['Age_Plus_10'] = df['Age'] + 10
df['Full_Name'] = df['First_Name'] + ' ' + df['Last_Name']
# Multiple conditions
conditions = [
    df['Age'] <= 25,
    (df['Age'] > 25) & (df['Age'] <= 35),
    df['Age'] > 35
]
choices = ['Young', 'Middle', 'Old']
df['Age_Category'] = np.select(conditions, choices, default='Unknown')
# Using apply()
df['Age_Squared'] = df['Age'].apply(lambda x: x**2)
df['Name_Length'] = df['Name'].apply(len)
# Using map()
mapping = {1: 'One', 2: 'Two', 3: 'Three'}
df['Number_Word'] = df['Number'].map(mapping)
Renaming
# Rename columns
df.rename(columns={'old_name': 'new_name'})
df.rename(columns={'col1': 'Column1', 'col2': 'Column2'})
# Rename index
df.rename(index={0: 'first', 1: 'second'})
Sorting
# Sort by single column
df.sort_values('Age')
df.sort_values('Age', ascending=False)
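Sorting by several columns at once works the same way; a short sketch using the example columns from earlier:
# Sort by multiple columns with mixed directions
df.sort_values(['City', 'Age'], ascending=[True, False])
# Sort by the index instead of a column
df.sort_index()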
Reshaping Data
# Melt (wide to long)
df_melted = pd.melt(df, id_vars=['Name'], value_vars=['Math', 'Science'])
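The reverse direction (long to wide) is handled by pivot_table; a minimal sketch, assuming the melted frame above with pandas' default 'variable' and 'value' column names:
# Pivot (long to wide): one row per Name, one column per subject
df_wide = df_melted.pivot_table(index='Name', columns='variable', values='value')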
# Quantile-based binning
df['Age_Quantiles'] = pd.qcut(df['Age'], q=4)
# Custom binning
bins = [0, 18, 30, 50, 100]
labels = ['Child', 'Young Adult', 'Adult', 'Senior']
df['Age_Category'] = pd.cut(df['Age'], bins=bins, labels=labels)
8. Grouping & Aggregation
Basic Grouping
# Group by single column
grouped = df.groupby('City')
grouped.size()         # Count of rows per group
grouped.count()        # Count of non-null values per group
grouped['Age'].sum()   # Sum per group
grouped['Age'].mean()  # Mean per group
grouped['Age'].std()   # Standard deviation per group
Multiple Aggregations
# Multiple aggregation functions
df.groupby('City').agg({
    'Age': ['mean', 'std', 'min', 'max'],
    'Salary': ['sum', 'mean']
})
# Named aggregations
df.groupby('City').agg(
    avg_age=('Age', 'mean'),
    total_salary=('Salary', 'sum'),
    count=('Name', 'count')
)
# Custom aggregation functions (example implementation)
def age_range(x):
    return x.max() - x.min()
df.groupby('City').agg({
    'Age': [age_range, 'mean'],
    'Salary': 'sum'
})
# Filter groups
df.groupby('City').filter(lambda x: len(x) > 2)            # Groups with more than 2 members
df.groupby('City').filter(lambda x: x['Age'].mean() > 30)  # Groups with mean age > 30
# Apply a custom per-group summary (example implementation)
def group_summary(group):
    return pd.Series({'count': len(group), 'avg_age': group['Age'].mean()})
df.groupby('City').apply(group_summary)
# Expanding windows
df['Expanding_Mean'] = df['Value'].expanding().mean()
df['Expanding_Sum'] = df['Value'].expanding().sum()
9. Merging & Joining
Merging
# Inner join (default)
df_merged = pd.merge(df1, df2, on='key_column')
# Multiple keys
df_merged = pd.merge(df1, df2, on=['key1', 'key2'])
# Index-based merging
df_merged = pd.merge(df1, df2, left_index=True, right_index=True)
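The how parameter controls which rows survive the merge; a brief sketch of the other join types (the id column in the last line is illustrative):
# Keep all rows from the left frame, the right frame, or both
df_left = pd.merge(df1, df2, on='key_column', how='left')
df_right = pd.merge(df1, df2, on='key_column', how='right')
df_outer = pd.merge(df1, df2, on='key_column', how='outer')
# Join on differently named key columns
df_named = pd.merge(df1, df2, left_on='id', right_on='key_column')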
Advanced Joining
# Join method (similar to merge but index-based)
df_joined = df1.join(df2, how='left')
df_joined = df1.join(df2, on='key_column')
# Cross join
df1['key'] = 1
df2['key'] = 1
df_cross = pd.merge(df1, df2, on='key').drop('key', axis=1)
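In pandas 1.2 and later the dummy-key trick is unnecessary, since merge supports a cross join directly:
# Cartesian product of df1 and df2 (pandas >= 1.2)
df_cross = pd.merge(df1, df2, how='cross')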
10. Time Series Analysis
Date and Time Handling
# Convert to datetime
df['date'] = pd.to_datetime(df['date_string'])
df['date'] = pd.to_datetime(df['date_string'], format='%Y-%m-%d')
# Time-based selection
df.loc['2023']                     # All data from 2023 (requires a DatetimeIndex)
df.loc['2023-01']                  # January 2023
df.loc['2023-01-01':'2023-01-31']  # Date range
# Time components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.dayofweek
df['quarter'] = df['date'].dt.quarter
# Percentage change
df['pct_change'] = df['value'].pct_change()
# Cumulative operations
df['cumsum'] = df['value'].cumsum()
df['cumprod'] = df['value'].cumprod()
# Time-based grouping
df.groupby(df.index.month).mean()       # Group by month (requires a DatetimeIndex)
df.groupby(pd.Grouper(freq='M')).sum()  # Group by month-end
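When the dates live in the index, resample offers an equivalent and often more readable way to do calendar-based grouping; a short sketch (the value column is illustrative):
# Resample requires a DatetimeIndex
df['value'].resample('M').sum()                 # Monthly totals
df['value'].resample('W').mean()                # Weekly means
df['value'].resample('Q').agg(['sum', 'mean'])  # Quarterly summary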
# Holiday handling
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2023-01-01', end='2023-12-31')
11. Visualization with Pandas
Basic Plotting
import matplotlib.pyplot as plt
# Line plot
df['value'].plot()
df.plot(x='date', y='value')
# Multiple lines
df[['col1', 'col2']].plot()
Advanced Plotting
# Subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
df['col1'].plot(ax=axes[0, 0], kind='line')
df['col2'].plot(ax=axes[0, 1], kind='bar')
df.plot(ax=axes[1, 0], kind='scatter', x='x', y='y')
df['col4'].plot(ax=axes[1, 1], kind='hist')
# Customization
df['value'].plot(
    title='My Plot',
    xlabel='X Label',
    ylabel='Y Label',
    color='red',
    style='--',
    figsize=(10, 6)
)
# Group plotting
df.groupby('category')['value'].plot(legend=True)
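seaborn, imported in the setup section, also accepts DataFrames directly and is convenient for grouped comparisons; a brief sketch with illustrative column names:
# Bar chart of mean value per category (with confidence intervals)
sns.barplot(data=df, x='category', y='value')
plt.show()
# Distribution of a numeric column split by group
sns.boxplot(data=df, x='category', y='value')
plt.show()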
12. Advanced Operations
Apply Functions
# Apply to Series
df['column'].apply(lambda x: x**2)
df['text'].apply(str.upper)  # Apply a plain Python function
df['text'].apply(len)
# Apply to DataFrame
df.apply(lambda x: x.max() - x.min())          # By column (axis=0)
df.apply(lambda x: x.max() - x.min(), axis=1)  # By row (axis=1)
# Apply with extra keyword arguments (example helper)
def custom_function(x, multiplier=1):
    return x * multiplier
df['column'].apply(custom_function, multiplier=2)
# Replace values
df['column'].replace(0, np.nan)                     # Replace single value
df['column'].replace([0, 1], [np.nan, 99])          # Replace multiple values
df.replace({'col1': {0: np.nan}, 'col2': {1: 99}})  # Column-specific replacement
Window Functions
# Ranking
df['rank'] = df['score'].rank()
df['rank_pct'] = df['score'].rank(pct=True)
df['rank_dense'] = df['score'].rank(method='dense')
# Percentiles
df['percentile'] = df['score'].rank(pct=True)
# Cumulative functions
df['cumsum'] = df['value'].cumsum()
df['cumprod'] = df['value'].cumprod()
df['cummax'] = df['value'].cummax()
df['cummin'] = df['value'].cummin()
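Rolling windows are the other half of the window-function toolbox; a short sketch (column names are illustrative):
# Fixed-size rolling windows
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()
df['rolling_sum'] = df['value'].rolling(window=30, min_periods=1).sum()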
MultiIndex Operations
# Create MultiIndex
df_multi = df.set_index(['level1', 'level2'])
# Access levels
df_multi.index.get_level_values(0) # Get level 0 values
df_multi.index.get_level_values('level1') # Get by name
# Swap levels
df_multi.swaplevel(0, 1)
# Cross-section
df_multi.xs('value', level='level1')
df_multi.xs(('val1', 'val2'), level=['level1', 'level2'])
Custom Aggregations
# Custom aggregation functions
def q75(x):
    return x.quantile(0.75)
def custom_stats(x):
    return pd.Series({
        'min': x.min(),
        'max': x.max(),
        'range': x.max() - x.min(),
        'q75': x.quantile(0.75)
    })
df.groupby('category').agg({
    'value': [q75, 'mean', 'std'],
    'count': 'sum'
})
df.groupby('category')['value'].apply(custom_stats)
13. Performance Optimization
Memory Optimization
# Check memory usage
df.info(memory_usage='deep')
df.memory_usage(deep=True)
Efficient Operations
# Use vectorized operations instead of loops
# Slow
result = []
for val in df['column']:
    result.append(val * 2)
df['new_col'] = result
# Fast: vectorized arithmetic
df['new_col'] = df['column'] * 2
# Fast: vectorized conditional assignment
df.loc[df['condition'] > 5, 'result'] = 'high'
# Fast: query() for complex filtering
df.query('A > 5 & B < 10 & C == "value"')
# Combine results of chunked processing
# (chunks is assumed to be an iterator, e.g. from pd.read_csv(path, chunksize=100_000))
result = pd.concat(chunks)
Parallel Processing
# Using multiprocessing with apply
from multiprocessing import Pool
import numpy as np
def parallel_apply(df_split):
    return df_split.apply(lambda x: x**2)
# Split dataframe
df_split = np.array_split(df, 4)
# Process in parallel
with Pool(processes=4) as pool:
    results = pool.map(parallel_apply, df_split)
# Combine results
df_result = pd.concat(results)
14. Real-World Projects
Project 1: Sales Data Analysis
# Create sample sales data
np.random.seed(42)
dates = pd.date_range('2023-01-01', '2023-12-31', freq='D')
products = ['Product A', 'Product B', 'Product C', 'Product D']
regions = ['North', 'South', 'East', 'West']
sales_data = []
for date in dates:
    for _ in range(np.random.randint(5, 15)):  # 5-15 transactions per day
        sales_data.append({
            'date': date,
            'product': np.random.choice(products),
            'region': np.random.choice(regions),
            'sales_amount': np.random.normal(1000, 300),
            'quantity': np.random.randint(1, 10)
        })
df_sales = pd.DataFrame(sales_data)
df_sales['date'] = pd.to_datetime(df_sales['date'])
# Analysis Tasks
# 1. Monthly sales trends
monthly_sales = df_sales.groupby(df_sales['date'].dt.to_period('M')).agg({
    'sales_amount': 'sum',
    'quantity': 'sum'
}).reset_index()
# 3. Regional analysis
regional_analysis = df_sales.groupby('region').agg({
    'sales_amount': ['sum', 'mean'],
    'quantity': 'sum'
}).round(2)
# 4. Seasonal patterns
df_sales['month'] = df_sales['date'].dt.month
df_sales['quarter'] = df_sales['date'].dt.quarter
seasonal_patterns = df_sales.groupby(['quarter', 'product'])['sales_amount'].sum().unstack()
Project 2: Customer Data Cleaning
# Create sample customer data (customer_data is assumed to be a list of dicts, built like sales_data above)
df_customers = pd.DataFrame(customer_data)
# 2. Duplicates
df_customers = pd.concat([df_customers, df_customers.iloc[:20]], ignore_index=True)
# 3. Outliers
outlier_idx = np.random.choice(df_customers.index, size=10)
df_customers.loc[outlier_idx, 'age'] = np.random.choice([150, -5, 200], size=10)
# 4. Inconsistent formatting
fmt_idx = np.random.choice(df_customers.index, size=50)
df_customers.loc[fmt_idx, 'city'] = df_customers.loc[fmt_idx, 'city'].str.upper()
def clean_customer_data(df):
    df_clean = df.copy()
    # 1. Remove duplicates
    df_clean = df_clean.drop_duplicates()
    # 2. Handle missing emails
    df_clean['email'] = df_clean['email'].fillna(
        'customer' + df_clean['customer_id'].astype(str) + '@example.com')
    # 3. Remove impossible ages, then bucket them (bins and labels are illustrative assumptions)
    df_clean = df_clean[df_clean['age'].between(0, 120)]
    df_clean['age_group'] = pd.cut(df_clean['age'], bins=[0, 30, 50, 120],
                                   labels=['Young', 'Middle-aged', 'Senior'])
    # 4. Bucket purchase amounts
    df_clean['purchase_category'] = pd.cut(df_clean['purchase_amount'],
                                           bins=3, labels=['Low', 'Medium', 'High'])
    return df_clean
df_customers_clean = clean_customer_data(df_customers)
# Customer Analysis
customer_analysis = {
    'total_customers': len(df_customers_clean),
    'avg_age': df_customers_clean['age'].mean(),
    'total_revenue': df_customers_clean['purchase_amount'].sum(),
    'avg_purchase': df_customers_clean['purchase_amount'].mean()
}
age_group_analysis = df_customers_clean.groupby('age_group').agg({
    'customer_id': 'count',
    'purchase_amount': ['sum', 'mean']
}).round(2)
city_analysis = df_customers_clean.groupby('city').agg({
    'customer_id': 'count',
    'purchase_amount': ['sum', 'mean']
}).round(2)
Project 3: Financial Time Series Analysis
# Create sample price data (the random-walk construction of prices is an assumption, not from the original)
prices = 100 + np.cumsum(np.random.normal(0, 1, len(dates)))
df_financial = pd.DataFrame({
    'date': dates,
    'price': prices[:len(dates)],
    'volume': np.random.lognormal(10, 0.5, len(dates))
})
df_financial['date'] = pd.to_datetime(df_financial['date'])
df_financial = df_financial.set_index('date')
def calculate_moving_averages(df):
    df['ma_20'] = df['price'].rolling(window=20).mean()
    df['ma_50'] = df['price'].rolling(window=50).mean()
    df['ma_200'] = df['price'].rolling(window=200).mean()
    return df
def calculate_technical_indicators(df):
    # RSI (Relative Strength Index)
    delta = df['price'].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))
    # Bollinger Bands
    df['bb_middle'] = df['price'].rolling(window=20).mean()
    bb_std = df['price'].rolling(window=20).std()
    df['bb_upper'] = df['bb_middle'] + (bb_std * 2)
    df['bb_lower'] = df['bb_middle'] - (bb_std * 2)
    return df
# Performance metrics
def calculate_performance_metrics(df):
    df['daily_return'] = df['price'].pct_change()
    total_return = (df['price'].iloc[-1] / df['price'].iloc[0] - 1) * 100
    annual_return = (df['price'].iloc[-1] / df['price'].iloc[0]) ** (365.25 / len(df)) - 1
    annual_volatility = df['daily_return'].std() * np.sqrt(252)
    sharpe_ratio = annual_return / annual_volatility
    max_drawdown = ((df['price'] / df['price'].expanding().max()) - 1).min()
    return {
        'total_return_%': round(total_return, 2),
        'annual_return_%': round(annual_return * 100, 2),
        'annual_volatility_%': round(annual_volatility * 100, 2),
        'sharpe_ratio': round(sharpe_ratio, 2),
        'max_drawdown_%': round(max_drawdown * 100, 2)
    }
performance_metrics = calculate_performance_metrics(df_financial)
print("\nYearly Returns:")
for year, return_val in yearly_returns.items():
print(f"{[Link]}: {return_val:.2%}")
Project 4: E-commerce Order Analysis
# Build the orders DataFrame (orders_data is assumed to be generated by earlier, omitted code)
df_orders = pd.DataFrame(orders_data)
df_orders['order_date'] = pd.to_datetime(df_orders['order_date'])
def comprehensive_ecommerce_analysis(df):
    analysis_results = {}
    # 1. Revenue Analysis
    analysis_results['total_revenue'] = df['total_amount'].sum()
    analysis_results['avg_order_value'] = df['total_amount'].mean()
    analysis_results['total_orders'] = len(df)
    # 2. Monthly revenue (grouping choice is an assumption)
    monthly_revenue = df.groupby(df['order_date'].dt.to_period('M'))['total_amount'].sum()
    # 3. Category Performance
    category_performance = df.groupby('category').agg({
        'total_amount': ['sum', 'mean', 'count'],
        'quantity': 'sum',
        'discount_amount': 'sum'
    }).round(2)
    # 4. Customer segmentation (RFM; customer_df with r/f/m scores is assumed to come from omitted code)
    customer_df['rfm_score'] = customer_df['r_score'].astype(str) + \
                               customer_df['f_score'].astype(str) + \
                               customer_df['m_score'].astype(str)
    # 5. Seasonal Analysis
    df['month'] = df['order_date'].dt.month
    df['quarter'] = df['order_date'].dt.quarter
    df['day_of_week'] = df['order_date'].dt.dayofweek
    seasonal_analysis = {
        'monthly': df.groupby('month')['total_amount'].sum(),
        'quarterly': df.groupby('quarter')['total_amount'].sum(),
        'weekly': df.groupby('day_of_week')['total_amount'].sum()
    }
    # 6. Product Analysis
    product_analysis = df.groupby('product_id').agg({
        'total_amount': 'sum',
        'quantity': 'sum',
        'order_id': 'count'
    }).sort_values('total_amount', ascending=False)
    return {
        'summary': analysis_results,
        'monthly_revenue': monthly_revenue,
        'category_performance': category_performance,
        'customer_segments': customer_df['segment'].value_counts(),
        'seasonal_analysis': seasonal_analysis,
        'top_products': product_analysis.head(10)
    }
print("\nCustomer Segments:")
print(ecommerce_results['customer_segments'])
Best Practices
2. Error Handling
# Always handle potential errors
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError:
    print("File not found!")
except pd.errors.EmptyDataError:
    print("File is empty!")
3. Data Validation
def validate_dataframe(df, required_columns=None, date_columns=None):
    """Validate DataFrame structure and content"""
    return df
# Usage
df = validate_dataframe(df,
required_columns=['date', 'amount'],
date_columns=['date'])
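The body of such a validator is project-specific; here is a minimal sketch of the kind of checks it might perform (the rules and error messages are assumptions, not from the original):
def validate_dataframe(df, required_columns=None, date_columns=None):
    """Validate DataFrame structure and content (illustrative checks)."""
    if df.empty:
        raise ValueError("DataFrame is empty")
    if required_columns:
        missing = set(required_columns) - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")
    if date_columns:
        for col in date_columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    return df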
4. Memory Management
# Use categorical data for repeated strings
df['category'] = df['category'].astype('category')
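Downcasting numeric columns is another easy win; a brief sketch (column names are illustrative):
# Downcast numeric columns to the smallest dtype that can hold the data
df['count_col'] = pd.to_numeric(df['count_col'], downcast='integer')
df['price_col'] = pd.to_numeric(df['price_col'], downcast='float')
# Compare memory usage before and after
print(df.memory_usage(deep=True).sum())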
Conclusion
This comprehensive guide covers pandas from basic concepts to the advanced techniques used in professional data analysis. The key to mastering pandas is regular, hands-on practice with the operations covered here.
Remember that pandas is constantly evolving, so stay up to date with the latest versions and features. The official pandas documentation is an excellent resource for detailed information on specific functions and methods.