DATA MANIPULATION WITH PANDAS

1) Inspecting a DataFrame

# edited/added
import pandas as pd
homelessness = pd.read_csv('homelessness.csv', index_col=0)

# Print the head of the homelessness data
print(homelessness.head())
# Print information about homelessness
print(homelessness.info())
# Print the shape of homelessness
print(homelessness.shape)
# Print a description of homelessness
print(homelessness.describe())

2) Parts of a DataFrame

# Import pandas using the alias pd
import pandas as pd
# Print the values of homelessness
print(homelessness.values)
# Print the column index of homelessness
print(homelessness.columns)
# Print the row index of homelessness
print(homelessness.index)

3) Sorting rows

# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")
# Print the top few rows
print(homelessness_ind.head())

# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())

# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())

4) Subsetting columns

# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())

# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())

# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]
# Print the head of the result
print(ind_state.head())
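Aside (not part of the original exercises): single brackets return a Series while double brackets return a one-column DataFrame; a quick check on the columns used above:

# Single brackets -> pandas Series
print(type(homelessness["individuals"]))
# Double brackets -> one-column DataFrame
print(type(homelessness[["individuals"]]))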
Subsetting rows

# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"] > 10000]
# See the result
print(ind_gt_10k)

# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]
# See the result
print(mountain_reg)

# Filter for rows where family_members is less than 1000
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) & (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)

Subsetting rows by categorical variables

# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic") | (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)

# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)
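Aside (not from the course solutions): the same row filters can be written with .query(), which reads closer to plain English; a sketch using the columns and variables defined above:

# Equivalent filters written with .query()
fam_lt_1k_pac_q = homelessness.query('family_members < 1000 and region == "Pacific"')
mojave_q = homelessness.query('state in @canu')
print(fam_lt_1k_pac_q.equals(fam_lt_1k_pac))
print(mojave_q.equals(mojave_homelessness))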
Adding new columns

# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]
# See the result
print(homelessness)

Combo-attack!

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
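Aside (my addition, not from the course): the combo-attack can also be written as one method chain with .assign(); result_chained is just an illustrative name and the columns are the same ones used above:

# Chained version of the combo-attack (equivalent result)
result_chained = (
    homelessness
    .assign(indiv_per_10k=lambda df: 10000 * df["individuals"] / df["state_pop"])
    .query("indiv_per_10k > 20")
    .sort_values("indiv_per_10k", ascending=False)
    [["state", "indiv_per_10k"]]
)
print(result_chained)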
Mean and median

# edited/added
sales = pd.read_csv('sales_subset.csv', index_col=0)

# Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

Summarizing dates

# Print the maximum of the date column
print(sales["date"].max())
# Print the minimum of the date column
print(sales["date"].min())

Efficient summaries

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))

# A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

# Import NumPy and create custom IQR function
import numpy as np

def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
Cumulative statistics

# edited/added
sales_1_1 = sales[(sales["department"] == 1) & (sales["store"] == 1)]

# Sort sales_1_1 by date
sales_1_1 = sales_1_1.sort_values("date")
# Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
# Get the cumulative max of weekly_sales, add as cum_max_sales col
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()
# See the columns you calculated
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])

Dropping duplicates

Remove rows of sales with duplicate pairs of store and type and save as store_types and print the head. Remove rows of sales with duplicate pairs of store and department and save as store_depts and print the head.

# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")
# Print date col of holiday_dates
print(holiday_dates["date"])

Counting categorical variables

# Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)
# Get the proportion of stores of each type
store_props = store_types["type"].value_counts(normalize=True)
print(store_props)
# Count the number of each department number and sort
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)
# Get the proportion of departments of each number and sort
dept_props_sorted = store_depts["department"].value_counts(sort=True, normalize=True)
print(dept_props_sorted)
What percent of sales occurred at each store type?

# Calc total weekly sales
sales_all = sales["weekly_sales"].sum()
# Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
# Subset for type B stores, calc total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()
# Subset for type C stores, calc total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)

Calculations with .groupby()

# Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
# Get proportion for each type
sales_propn_by_type = sales_by_type / sum(sales_by_type)
print(sales_propn_by_type)
# Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
print(sales_by_type_is_holiday)

Multiple grouped summaries

# Import numpy with the alias np
import numpy as np
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
# Print sales_stats
print(sales_stats)
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
# Print unemp_fuel_stats
print(unemp_fuel_stats)
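Aside (my addition): recent pandas versions warn when NumPy functions are passed to .agg(); string names or named aggregation produce the same summaries with cleaner column labels:

# Same summary with string aggregation names
sales_stats_str = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])
print(sales_stats_str)
# Named aggregation gives explicit output column names
sales_named = sales.groupby("type").agg(min_sales=("weekly_sales", "min"), max_sales=("weekly_sales", "max"))
print(sales_named)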
Pivoting on one variable

# Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values="weekly_sales", index="type")
# Print mean_sales_by_type
print(mean_sales_by_type)

# Import NumPy as np
import numpy as np
# Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table("weekly_sales", "type", aggfunc=[np.mean, np.median])
# Print mean_med_sales_by_type
print(mean_med_sales_by_type)

# Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table("weekly_sales", "type", "is_holiday")
# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)

Fill in missing values and sum values with pivot tables

# Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0))
# Print the mean weekly_sales by department and type; fill missing values with 0s; sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True))
Setting and removing indexes

# edited/added
temperatures = pd.read_csv('temperatures.csv', index_col=0)
temperatures['date'] = pd.to_datetime(temperatures['date'], infer_datetime_format=True)

# Look at temperatures
print(temperatures)
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
# Look at temperatures_ind
print(temperatures_ind)
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop=True))

Subsetting with .loc[]

# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]
# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

Setting multi-level indexes

# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])
Sorting by index values

# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending=[True, False]))

Slicing index values

# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()
# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])
# Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])
# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", "Lahore"):("Russia", "Moscow")])

Slicing in both directions

# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")])
# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"])
# Subset in both directions at once
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])
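Aside (my addition): slicing only the inner (city) level with plain .loc, as attempted with "Lahore":"Moscow" above, does not work; pd.IndexSlice can express it, assuming the index is fully sorted as it is for temperatures_srt:

# Slice the inner (city) level across all countries with pd.IndexSlice
idx = pd.IndexSlice
print(temperatures_srt.loc[idx[:, "Lahore":"Moscow"], :])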
Slicing time series

# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") & (temperatures["date"] <= "2011-12-31")]
print(temperatures_bool)
# Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()
# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])
Subsetting by row/column number

# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22, 1])
# Use slicing to get the first 5 rows
print(temperatures.iloc[:5])
# Use slicing to get columns 3 to 4
print(temperatures.iloc[:, 2:4])
# Use slicing in both directions at once
print(temperatures.iloc[:5, 2:4])

Pivot temperature by city and year

# Add a year column to temperatures
temperatures["year"] = temperatures["date"].dt.year
# Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index=["country", "city"], columns="year")
# See the result
print(temp_by_country_city_vs_year)

Subsetting pivot tables

# Subset for Egypt to India
temp_by_country_city_vs_year.loc["Egypt":"India"]
# Subset for Egypt, Cairo to India, Delhi
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")]
# Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010
# (the year labels are integers, since year came from .dt.year)
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), 2005:2010]

Calculating on a pivot table

# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])
# Get the mean temp by city
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
# Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])
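Aside (my addition): .idxmax() and .idxmin() return just the index label of the extreme value, which is a shorter way to answer the same two questions:

# Year with the highest worldwide mean temperature
print(mean_temp_by_year.idxmax())
# City with the lowest mean temperature
print(mean_temp_by_city.idxmin())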
Which avocado size is most popular?

# edited/added
import urllib.request
import pickle
avocados = pd.read_pickle("avoplotto.pkl")

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Look at the first few rows of data
print(avocados.head())
# Get the total number of avocados sold of each size
nb_sold_by_size = avocados.groupby("size")["nb_sold"].sum()
# Create a bar plot of the number of avocados sold by size
nb_sold_by_size.plot(kind="bar")
# Show the plot
plt.show()

Changes in sales over time

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Get the total number of avocados sold on each date
nb_sold_by_date = avocados.groupby("date")["nb_sold"].sum()
# Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind="line")
# Show the plot
plt.show()

Avocado supply and demand

# Scatter plot of avg_price vs. nb_sold with title
avocados.plot(x="nb_sold", y="avg_price", kind="scatter", title="Number of avocados sold vs. average price")
# Show the plot
plt.show()

Price of conventional vs. organic avocados

# Histogram of conventional avg_price
avocados[avocados["type"] == "conventional"]["avg_price"].hist()
# Histogram of organic avg_price
avocados[avocados["type"] == "organic"]["avg_price"].hist()
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()

# Modify histogram transparency to 0.5
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5)
# Modify histogram transparency to 0.5
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()

# Modify bins to 20
avocados[avocados["type"] == "conventional"]["avg_price"].hist(bins=20, alpha=0.5)
# Modify bins to 20
avocados[avocados["type"] == "organic"]["avg_price"].hist(bins=20, alpha=0.5)
# Add a legend
plt.legend(["conventional", "organic"])
# Show the plot
plt.show()
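Aside (my addition): the two overlaid histograms can also be drawn in a loop so the styling stays in one place:

# Same overlaid histograms, written as a loop over the two types
for avocado_type in ["conventional", "organic"]:
    avocados[avocados["type"] == avocado_type]["avg_price"].hist(alpha=0.5, bins=20)
plt.legend(["conventional", "organic"])
plt.show()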
Finding missing values

# edited/added
avocados_2016 = pd.read_csv('avocados_2016.csv')
cols_with_missing = ['small_sold', 'large_sold', 'xl_sold']

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Check individual values for missing values
print(avocados_2016.isna())
# Check each column for missing values
print(avocados_2016.isna().any())
# Bar plot of missing values by variable
avocados_2016.isna().sum().plot(kind="bar")
# Show plot
plt.show()

Removing missing values

# Remove rows with missing values
avocados_complete = avocados_2016.dropna()
# Check if any columns contain missing values
print(avocados_complete.isna().any())

Replacing missing values

# List the columns with missing values
cols_with_missing = ["small_sold", "large_sold", "xl_sold"]
# Create histograms showing the distributions of cols_with_missing
avocados_2016[cols_with_missing].hist()
# Show the plot
plt.show()
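Aside (my addition): the section is titled "Replacing missing values" but the snippet above only plots the affected columns; a typical replacement step, assuming a missing count can be treated as zero, would be:

# Fill missing sales counts with 0 (assumption: missing means no recorded sales)
avocados_filled = avocados_2016.fillna(0)
avocados_filled[cols_with_missing].hist()
plt.show()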
"date": ["2019-11-17", "2019-12-01"], # Print airline_totals_sorted
"small_sold": [10859987, 9291631], print(airline_totals_sorted)
"large_sold": [7674135, 6238096] # Save as airline_totals_sorted.csv
} airline_totals_sorted.to_csv("airline_totals_sorted.csv")
# Convert dictionary into DataFrame
avocados_2019 = pd.DataFrame(avocados_dict)
# Print the new DataFrame
print(avocados_2019)
CSV to DataFrame
# Read CSV as DataFrame called airline_bumping
airline_bumping = pd.read_csv('airline_bumping.csv') # edited/added
# Take a look at the DataFrame
print(airline_bumping.head())
# For each airline, select nb_bumped and total_passengers and sum
airline_totals = airline_bumping.groupby("airline")[["nb_bumped", "total_passenger
s"]].sum()
# Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
airline_totals["bumps_per_10k"] = airline_totals["nb_bumped"] / airline_totals["tota
l_passengers"] * 10000
# Print airline_totals
print(airline_totals)
# Print airline_totals
print(airline_totals)
DataFrame to CSV
# Create airline_totals_sorted
airline_totals_sorted = airline_totals.sort_values("bumps_per_10k", ascending=Fals
e)
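Aside (my addition): to_csv() writes the index by default, so the airline names end up as the first column of the file; a quick round-trip check, assuming the grouped index kept its "airline" name:

# Read the file back and confirm it matches what was written
airline_check = pd.read_csv("airline_totals_sorted.csv", index_col="airline")
print(airline_check.head())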