sort_values("individuals")
1) Inspecting a DataFrame # Print the top few rows
print(homelessness_ind.head())
# edited/added
# Sort homelessness by descending family members
import pandas as pd
homelessness_fam = homelessness.sort_values("family_members",
homelessness = pd.read_csv('homelessness.csv', index_col=0)
ascending=False)
# Print the head of the homelessness data
# Print the top few rows
print(homelessness.head())
print(homelessness_fam.head())
# Print information about homelessness
# Sort homelessness by region, then descending family members
print(homelessness.info())
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"],
# Print the shape of homelessness ascending=[True, False])
print(homelessness.shape) # Print the top few rows
# Print a description of homelessness print(homelessness_reg_fam.head())
print(homelessness.describe())
2) Parts of a DataFrame

# Import pandas using the alias pd
import pandas as pd

# Print the values of homelessness
print(homelessness.values)

# Print the column index of homelessness
print(homelessness.columns)

# Print the row index of homelessness
print(homelessness.index)

3) Sorting rows

# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")

# Print the top few rows
print(homelessness_ind.head())

# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)

# Print the top few rows
print(homelessness_fam.head())

# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"],
                                                ascending=[True, False])

# Print the top few rows
print(homelessness_reg_fam.head())

4) Subsetting columns

# Select the individuals column
individuals = homelessness["individuals"]

# Print the head of the result
print(individuals.head())

# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]

# Print the head of the result
print(state_fam.head())

# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]

# Print the head of the result
print(ind_state.head())
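A quick sketch of what the two bracket styles above return: single brackets give a pandas Series, double brackets give a one-column DataFrame.

# Sketch: compare the two selection styles on the same column
print(type(homelessness["individuals"]))    # pandas.core.series.Series
print(type(homelessness[["individuals"]]))  # pandas.core.frame.DataFrame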
Subsetting rows

# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"] > 10000]

# See the result
print(ind_gt_10k)

# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]

# See the result
print(mountain_reg)

# Filter for rows where family_members is less than 1000
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) &
                             (homelessness["region"] == "Pacific")]

# See the result
print(fam_lt_1k_pac)

Adding new columns

# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]

# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]

# See the result
print(homelessness)
Subsetting rows by categorical variables

# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic") |
                                  (homelessness["region"] == "Mid-Atlantic")]

# See the result
print(south_mid_atlantic)

# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]

# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]

# See the result
print(mojave_homelessness)

Combo-attack!

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]

# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]

# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)

# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]

# See the result
print(result)

Mean and median

# edited/added
sales = pd.read_csv('sales_subset.csv', index_col=0)

# Print the head of the sales DataFrame
print(sales.head())
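A minimal sketch of the statistics this section title refers to, using the weekly_sales column that the pivot code below also relies on.

# Sketch: overall mean and median of weekly sales
print(sales["weekly_sales"].mean())
print(sales["weekly_sales"].median())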
Pivot tables

# Print the mean weekly_sales by department and type; fill missing values
# with 0s; sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department",
                        columns="type", fill_value=0, margins=True))

# Pivot for mean and median weekly_sales for each store type
Setting and removing indexes

# Look at temperatures_ind
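A sketch of setting and removing a single-level index on the temperatures data; the read_csv call mirrors the other datasets here, and the file name is an assumption.

# Assumed setup (file name is a guess, following the pattern of the other reads)
temperatures = pd.read_csv('temperatures.csv', index_col=0)

# Sketch: index temperatures by city, look at it, then remove the index again
temperatures_ind = temperatures.set_index("city")
print(temperatures_ind)
print(temperatures_ind.reset_index())           # keep city as a regular column
print(temperatures_ind.reset_index(drop=True))  # discard city entirely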
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")] print(temperatures_srt.loc[:, "date":"avg_temp_c"])
# Subset for rows to keep # Subset in both directions at once
print(temperatures_ind.loc[rows_to_keep]) print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"),
"date":"avg_temp_c"])
Sorting by index values
# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())

# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))

# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending=[True, False]))
Slicing index values

# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()

# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad")])

# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"])

# Subset in both directions at once
print(temperatures_srt.loc[("India", "Hyderabad"):("Iraq", "Baghdad"),
                           "date":"avg_temp_c"])

Slicing time series

# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") &
                                 (temperatures["date"] <= "2011-12-31")]
print(temperatures_bool)

# Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()

# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])

# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])
# Get the worldwide mean temp by year
# Filter for the year that had the highest mean temp
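A sketch of one way to do the two steps above, assuming the temperatures frame from the earlier sections with its date and avg_temp_c columns.

# Sketch: average avg_temp_c per year, then keep the year(s) with the highest mean
years = pd.to_datetime(temperatures["date"]).dt.year
mean_temp_by_year = temperatures.groupby(years)["avg_temp_c"].mean()
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])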
# Create a line plot of the number of avocados sold by date
# Show the plot
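A hypothetical sketch for the plotting step above: the avocados frame, its file name, and its nb_sold column are all assumptions, since only the 2019 additions appear in the sections below.

import matplotlib.pyplot as plt

# Hypothetical setup: an avocados DataFrame with "date" and "nb_sold" columns
avocados = pd.read_csv("avocados.csv", index_col=0)

# Sum the number sold on each date and draw the line plot
nb_sold_by_date = avocados.groupby("date")["nb_sold"].sum()
nb_sold_by_date.plot(kind="line")
plt.show()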
{"date": "2019-11-10", "small_sold": 10717154, "large_sold": 8561348}, # Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
] airline_totals["bumps_per_10k"] = airline_totals["nb_bumped"] /
airline_totals["total_passengers"] * 10000
# Convert list into DataFrame
# Print airline_totals
avocados_2019 = pd.DataFrame(avocados_list)
print(airline_totals)
# Print the new DataFrame
# Print airline_totals
print(avocados_2019)
print(airline_totals)
Dictionary of lists

# Create a dictionary of lists with new data
avocados_dict = {
    "date": ["2019-11-17", "2019-12-01"],
    "small_sold": [10859987, 9291631],
    "large_sold": [7674135, 6238096]
}

# Convert dictionary into DataFrame
avocados_2019 = pd.DataFrame(avocados_dict)

# Print the new DataFrame
print(avocados_2019)
CSV to DataFrame

# Create new col, bumps_per_10k: no. of bumps per 10k passengers for each airline
airline_totals["bumps_per_10k"] = (airline_totals["nb_bumped"]
                                   / airline_totals["total_passengers"] * 10000)

# Print airline_totals
print(airline_totals)

DataFrame to CSV

# Create airline_totals_sorted
airline_totals_sorted = airline_totals.sort_values("bumps_per_10k", ascending=False)

# Print airline_totals_sorted
print(airline_totals_sorted)

# Save as airline_totals_sorted.csv
airline_totals_sorted.to_csv("airline_totals_sorted.csv")
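To round off CSV to DataFrame, a sketch that reads the file written above back in; the variable name is illustrative, and index_col=0 assumes to_csv kept its default of writing the index as the first column.

# Sketch: load airline_totals_sorted.csv back into a DataFrame
airline_totals_check = pd.read_csv("airline_totals_sorted.csv", index_col=0)
print(airline_totals_check.head())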