DATA MANIPULATION WITH PANDAS

1) Inspecting a DataFrame

# edited/added
import pandas as pd
homelessness = pd.read_csv('homelessness.csv', index_col=0)

# Print the head of the homelessness data
print(homelessness.head())
# Print information about homelessness
print(homelessness.info())
# Print the shape of homelessness
print(homelessness.shape)
# Print a description of homelessness
print(homelessness.describe())

2) Parts of a DataFrame

# Import pandas using the alias pd
import pandas as pd
# Print the values of homelessness
print(homelessness.values)
# Print the column index of homelessness
print(homelessness.columns)
# Print the row index of homelessness
print(homelessness.index)

3) Sorting rows

# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")
# Print the top few rows
print(homelessness_ind.head())

# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())

# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())

4) Subsetting columns

# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())

# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())

# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]
# Print the head of the result
print(ind_state.head())
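Aside (not part of the original exercises): single brackets return a Series while double brackets return a one-column DataFrame; a quick check on the columns used above:

# Single brackets -> pandas Series
print(type(homelessness["individuals"]))
# Double brackets -> one-column DataFrame
print(type(homelessness[["individuals"]]))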
Subsetting rows

# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"] > 10000]
# See the result
print(ind_gt_10k)

# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]
# See the result
print(mountain_reg)

# Filter for rows where family_members is less than 1000
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) & (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)

Subsetting rows by categorical variables

# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic") | (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)

# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)
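Aside (not from the course solutions): the same row filters can be written with .query(), which reads closer to plain English; a sketch using the columns and variables defined above:

# Equivalent filters written with .query()
fam_lt_1k_pac_q = homelessness.query('family_members < 1000 and region == "Pacific"')
mojave_q = homelessness.query('state in @canu')
print(fam_lt_1k_pac_q.equals(fam_lt_1k_pac))
print(mojave_q.equals(mojave_homelessness))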
Adding new columns

# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]
# See the result
print(homelessness)

Combo-attack!

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
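Aside (my addition, not from the course): the combo-attack can also be written as one method chain with .assign(); result_chained is just an illustrative name and the columns are the same ones used above:

# Chained version of the combo-attack (equivalent result)
result_chained = (
    homelessness
    .assign(indiv_per_10k=lambda df: 10000 * df["individuals"] / df["state_pop"])
    .query("indiv_per_10k > 20")
    .sort_values("indiv_per_10k", ascending=False)
    [["state", "indiv_per_10k"]]
)
print(result_chained)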
Mean and median

# edited/added
sales = pd.read_csv('sales_subset.csv', index_col=0)

# Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

Summarizing dates

# Print the maximum of the date column
print(sales["date"].max())
# Print the minimum of the date column
print(sales["date"].min())

Efficient summaries

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

# Import NumPy and create custom IQR function
import numpy as np

def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
Cumulative statistics

# edited/added
sales_1_1 = sales[(sales["department"] == 1) & (sales["store"] == 1)]

# Sort sales_1_1 by date
sales_1_1 = sales_1_1.sort_values("date")
# Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
# Get the cumulative max of weekly_sales, add as cum_max_sales col
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()
# See the columns you calculated
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])

Dropping duplicates

Remove rows of sales with duplicate pairs of store and type and save as store_types and print the head. Remove rows of sales with duplicate pairs of store and department and save as store_depts and print the head.

# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")
# Print date col of holiday_dates
print(holiday_dates["date"])

Counting categorical variables

# Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)
# Get the proportion of stores of each type
store_props = store_types["type"].value_counts(normalize=True)
print(store_props)
# Count the number of each department number and sort
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)
# Get the proportion of departments of each number and sort
dept_props_sorted = store_depts["department"].value_counts(sort=True, normalize=True)
print(dept_props_sorted)
What percent of sales occurred at each store type?

# Calc total weekly sales
sales_all = sales["weekly_sales"].sum()
# Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
# Subset for type B stores, calc total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()
# Subset for type C stores, calc total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)

Calculations with .groupby()

# Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = sales_by_type / sum(sales_by_type)
print(sales_propn_by_type)
# Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
print(sales_by_type_is_holiday)

Multiple grouped summaries

# Import numpy with the alias np
import numpy as np
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
# Print sales_stats
print(sales_stats)
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
# Print unemp_fuel_stats
print(unemp_fuel_stats)
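Aside (my addition): recent pandas versions warn when NumPy functions are passed to .agg(); string names or named aggregation produce the same summaries with cleaner column labels:

# Same summary with string aggregation names
sales_stats_str = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])
print(sales_stats_str)
# Named aggregation gives explicit output column names
sales_named = sales.groupby("type").agg(min_sales=("weekly_sales", "min"), max_sales=("weekly_sales", "max"))
print(sales_named)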
Pivoting on one variable

# Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values="weekly_sales", index="type")
# Print mean_sales_by_type
print(mean_sales_by_type)

# Import NumPy as np
import numpy as np
# Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table("weekly_sales", "type", aggfunc=[np.mean, np.median])
# Print mean_med_sales_by_type
print(mean_med_sales_by_type)

# Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table("weekly_sales", "type", "is_holiday")
# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)

Fill in missing values and sum values with pivot tables

# Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0))
# Print the mean weekly_sales by department and type; fill missing values with 0s; sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True))
Setting and removing indexes

# edited/added
temperatures = pd.read_csv('temperatures.csv', index_col=0)
temperatures['date'] = pd.to_datetime(temperatures['date'], infer_datetime_format=True)

# Look at temperatures
print(temperatures)
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
# Look at temperatures_ind
print(temperatures_ind)
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop=True))

Subsetting with .loc[]

# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]
# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

Setting multi-level indexes

# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])
Sorting by index values

# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending=[True, False]))

Slicing index values

# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()
# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])
# Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])
# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")])

Slicing in both directions

# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")])
# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"])
# Subset in both directions at once
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])
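Aside (my addition): slicing only the inner (city) level with plain .loc, as attempted with "Lahore":"Moscow" above, does not work; pd.IndexSlice can express it, assuming the index is fully sorted as it is for temperatures_srt:

# Slice the inner (city) level across all countries with pd.IndexSlice
idx = pd.IndexSlice
print(temperatures_srt.loc[idx[:, "Lahore":"Moscow"], :])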
Slicing time series

# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") & (temperatures["date"] <= "2011-12-31")]
print(temperatures_bool)
# Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()
# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])
Subsetting by row/column number

# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22, 1])
# Use slicing to get the first 5 rows
print(temperatures.iloc[:5])
# Use slicing to get columns 3 to 4
print(temperatures.iloc[:, 2:4])
# Use slicing in both directions at once
print(temperatures.iloc[:5, 2:4])

Pivot temperature by city and year

# Add a year column to temperatures
temperatures["year"] = temperatures["date"].dt.year
# Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index=["country", "city"], columns="year")
# See the result
print(temp_by_country_city_vs_year)

Subsetting pivot tables

# Subset for Egypt to India
temp_by_country_city_vs_year.loc["Egypt":"India"]
# Subset for Egypt, Cairo to India, Delhi
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")]
# Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010
# (the year labels are integers, since year came from .dt.year)
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), 2005:2010]

Calculating on a pivot table

# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])
# Get the mean temp by city
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
# Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])
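Aside (my addition): .idxmax() and .idxmin() return just the index label of the extreme value, which is a shorter way to answer the same two questions:

# Year with the highest worldwide mean temperature
print(mean_temp_by_year.idxmax())
# City with the lowest mean temperature
print(mean_temp_by_city.idxmin())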
Which avocado size is most popular?

# edited/added
import urllib.request
import pickle
avocados = pd.read_pickle("avoplotto.pkl")

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Look at the first few rows of data
print(avocados.head())
# Get the total number of avocados sold of each size
nb_sold_by_size = avocados.groupby("size")["nb_sold"].sum()
# Create a bar plot of the number of avocados sold by size
nb_sold_by_size.plot(kind="bar")
# Show the plot
plt.show()

Changes in sales over time

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Get the total number of avocados sold on each date
nb_sold_by_date = avocados.groupby("date")["nb_sold"].sum()
# Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind="line")
# Show the plot
plt.show()

Avocado supply and demand

# Scatter plot of avg_price vs. nb_sold with title
avocados.plot(x="nb_sold", y="avg_price", kind="scatter", title="Number of avocados sold vs. average price")
# Show the plot
plt.show()

Price of conventional vs. organic avocados

# Histogram of conventional avg_price
avocados[avocados["type"] == "conventional"]["avg_price"].hist()
# Histogram of organic avg_price
avocados[avocados["type"] == "organic"]["avg_price"].hist()
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()

# Modify histogram transparency to 0.5
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5)
# Modify histogram transparency to 0.5
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()

# Modify bins to 20
avocados[avocados["type"] == "conventional"]["avg_price"].hist(bins=20, alpha=0.5)
# Modify bins to 20
avocados[avocados["type"] == "organic"]["avg_price"].hist(bins=20, alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
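Aside (my addition): the two overlaid histograms can also be drawn in a loop so the styling stays in one place:

# Same overlaid histograms, written as a loop over the two types
for avocado_type in ["conventional", "organic"]:
    avocados[avocados["type"] == avocado_type]["avg_price"].hist(alpha=0.5, bins=20)
plt.legend(["conventional", "organic"])
plt.show()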
Finding missing values

# edited/added
avocados_2016 = pd.read_csv('avocados_2016.csv')
cols_with_missing = ['small_sold', 'large_sold', 'xl_sold']

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Check individual values for missing values
print(avocados_2016.isna())
# Check each column for missing values
print(avocados_2016.isna().any())
# Bar plot of missing values by variable
avocados_2016.isna().sum().plot(kind="bar")
# Show plot
plt.show()

Removing missing values

# Remove rows with missing values
avocados_complete = avocados_2016.dropna()
# Check if any columns contain missing values
print(avocados_complete.isna().any())

Replacing missing values

# List the columns with missing values
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]
# Create histograms showing the distributions of cols_with_missing
avocados_2016[cols_with_missing].hist()
# Show the plot
plt.show()
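Aside (my addition): the section is titled "Replacing missing values" but the snippet above only plots the affected columns; a typical replacement step, assuming a missing count can be treated as zero, would be:

# Fill missing sales counts with 0 (assumption: missing means no recorded sales)
avocados_filled = avocados_2016.fillna(0)
avocados_filled[cols_with_missing].hist()
plt.show()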
"date": ["2019-11-17", "2019-12-01"], # Print airline_totals_sorted
"small_sold": [10859987, 9291631], print(airline_totals_sorted)
"large_sold": [7674135, 6238096] # Save as airline_totals_sorted.csv
} airline_totals_sorted.to_csv("airline_totals_sorted.csv")
# Convert dictionary into DataFrame
avocados_2019 = pd.DataFrame(avocados_dict)
# Print the new DataFrame
print(avocados_2019)
CSV to DataFrame
# Read CSV as DataFrame called airline_bumping
airline_bumping = pd.read_csv('airline_bumping.csv') # edited/added
# Take a look at the DataFrame
print(airline_bumping.head())
# For each airline, select nb_bumped and total_passengers and sum
airline_totals = airline_bumping.groupby("airline")[["nb_bumped", "total_passenger
s"]].sum()
# Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
airline_totals["bumps_per_10k"] = airline_totals["nb_bumped"] / airline_totals["tota
l_passengers"] * 10000
# Print airline_totals
print(airline_totals)
# Print airline_totals
print(airline_totals)
DataFrame to CSV
# Create airline_totals_sorted
airline_totals_sorted = airline_totals.sort_values("bumps_per_10k", ascending=Fals
e)
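Aside (my addition): to_csv() writes the index by default, so the airline names end up as the first column of the file; a quick round-trip check, assuming the grouped index kept its "airline" name:

# Read the file back and confirm it matches what was written
airline_check = pd.read_csv("airline_totals_sorted.csv", index_col="airline")
print(airline_check.head())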