Pandas Cheatsheet
(quick-reference pages)
Setup
import pandas as pd
import numpy as np
pd.__version__
Core Objects
# Series: 1D labeled array
s = pd.Series([10, 20, 30], index=['a','b','c'])
Quick Inspect
df.head(3); df.tail(3)
df.shape; df.columns; df.index; df.dtypes
df.info() # schema
df.describe(include='all') # stats
Select Columns/Rows
df['A'] # Series
df[['A','B']] # subset columns
df.loc['row_label', 'A'] # label-based
df.iloc[0:5, 0:2] # position-based
df.at['row_label','A'] # scalar fast (label)
df.iat[0,0] # scalar fast (pos)
Filter Rows
df[df['A'] > 2]
df[(df['A'] > 1) & (df['B'] < 6)]
df.query('A > 1 and B < 6') # expressive
Missing Data
df.isna(); df.notna()
df.fillna(0); df.ffill() # forward-fill (fillna(method='ffill') is deprecated)
df.dropna() # drop rows
df.dropna(axis=1) # drop cols
df.interpolate() # numeric
Sorting
df.sort_values(by=['A','B'], ascending=[True, False])
df.sort_index()
Groupby / Aggregation
g = df.groupby('A') # groupby key
g['B'].agg(['mean','max','count'])
df.groupby('key').agg(total=('val','sum'), avg=('val','mean'))
df.groupby(['k1','k2'], dropna=False).size()
Categoricals
df['cat'] = pd.Categorical(df['cat'], categories=['low','med','high'], ordered=True)
df['cat'] = df['cat'].cat.reorder_categories(['low','med','high']) # returns new Series; must assign
Apply / Map
df['A'].map({1:'one',2:'two'})
df['B'].apply(lambda x: x**2) # Series-wise
df.apply(np.sum, axis=0) # column-wise
df.apply(np.sum, axis=1) # row-wise
Descriptive Stats
df.mean(numeric_only=True); df.std()
df['A'].quantile([0.25, 0.5, 0.75])
df.corr(numeric_only=True); df.cov()
df.value_counts(dropna=False)
Indexing Utilities
df.set_index('id', inplace=False)
df.reset_index(drop=True)
df.reindex(range(10)).reindex(columns=['A','B','C'])
df.swaplevel(0,1, axis=0) # MultiIndex
df.sort_index(level=[0,1])
Plotting (quick)
# Matplotlib must be installed; Pandas uses it under the hood
df.plot() # line by default
df['A'].plot(kind='hist') # histogram
df.plot(kind='scatter', x='A', y='B')
Export / Save
df.to_csv('out.csv', index=False)
df.to_excel('out.xlsx', index=False)
df.to_parquet('out.parquet')
df.to_json('out.json', orient='records', lines=True)
df.to_sql('table', conn, if_exists='replace', index=False)
Performance Tips
# Prefer vectorization over Python loops
# Use categoricals for low-cardinality strings
# For large CSVs: use dtype=, usecols=, chunksize=
# Consider Parquet for fast IO and types
# Use pd.options.mode.copy_on_write = True # (pandas 2.0+)