Bar Plots, Histograms, and Distributions
# Scatter plots of bike rentals (casual and registered) against the
# working-day flag, one figure per rider type.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

for column, label in [('casual', 'Casual'), ('registered', 'Registered')]:
    plt.scatter(bike_sharing['workingday'], bike_sharing[column])
    plt.title('Working Day Vs. ' + label)
    plt.show()
2. Bar Plots
# Bar plot: average registered rentals on non-working vs. working days.
import matplotlib.pyplot as plt

day_types = ['Non-Working Day', 'Working Day']
avg_registered = [2959, 3978]  # precomputed averages from the dataset
plt.bar(day_types, avg_registered)
plt.show()
3. Customizing Bar Plots
# Average casual/registered rentals per weekday, shown as a customized bar plot.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

# It's not essential to understand how this line works; it's covered in a
# later course.  (The explanatory note previously spilled onto its own
# bare line, which is a SyntaxError — it must live in a comment.)
weekday_averages = bike_sharing.groupby('weekday').mean()[['casual', 'registered']].reset_index()

plt.bar(weekday_averages['weekday'], weekday_averages['registered'])
# Replace the numeric weekday codes (0-6) with readable day names.
plt.xticks(ticks=[0, 1, 2, 3, 4, 5, 6],
           labels=['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                   'Thursday', 'Friday', 'Saturday'],
           rotation=30)
plt.show()
4. Frequency Tables
# Frequency bar plots of the four weather codes, one figure per year.
import matplotlib.pyplot as plt

unique_values = [1, 2, 3, 4]
weather_counts = {'2011': [226, 124, 15, 0],
                  '2012': [237, 123, 6, 0]}

for year in ('2011', '2012'):
    plt.bar(unique_values, weather_counts[year])
    plt.xticks(ticks=[1,2,3,4])
    plt.title('Weather Patterns: ' + year)
    plt.ylabel('Frequency')
    plt.xlabel('Unique Values')
    plt.show()
5. Grouped Frequency Tables
# Grouped frequency tables: bin each rentals column into ten equal-width
# intervals and count rows per interval.
import pandas as pd
import matplotlib.pyplot as plt
bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])
# sort_index() orders the intervals by value rather than by frequency.
registered_freq = bike_sharing['registered'].value_counts(bins=10).sort_index()
casual_freq = bike_sharing['casual'].value_counts(bins=10).sort_index()
6. Histograms
# Histogram of daily casual rentals (default 10 bins).
import pandas as pd
import matplotlib.pyplot as plt
bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])
plt.hist(bike_sharing['casual'])
plt.show()
7. The Normal Distribution
# True/False answers for the normal-distribution exercise.
sentence_1, sentence_2, sentence_3, sentence_4, sentence_5 = (
    True, False, True, True, False)
8. The Uniform Distribution
# True/False answers for the uniform-distribution exercise.
sentence_1, sentence_2, sentence_3, sentence_4 = True, False, False, False
9. Skewed Distributions
Bar Charts - Learn about this chart and tools to create it (datavizcatalogue.com)
Histogram - Learn about this chart and tools to create it (datavizcatalogue.com)
Frequency Distribution (mathsisfun.com)
Grouped Frequency Distribution (mathsisfun.com)
Normal Distribution (mathsisfun.com)
Pandas Visualizations and Grid Charts
1. Traffic Congestions in Sao Paulo
# First look at the Sao Paulo traffic dataset (semicolon-separated).
import pandas as pd
traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic.head()
traffic.tail()
traffic.info()
3. Slowness in Traffic
# Histogram of traffic slowness, after normalizing comma decimals.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
# The slowness column uses comma decimal separators; convert to float.
slowness = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = slowness.astype(float)

plt.hist(traffic['Slowness in traffic (%)'])
plt.show()

# Exercise answers.
sentence_1, sentence_2, sentence_3 = True, True, False
4. Pandas Visualization Methods
# Same histogram drawn through the pandas Series.plot.hist() wrapper.
import matplotlib.pyplot as plt
import pandas as pd
traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
# Comma decimals -> float before plotting.
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)
traffic['Slowness in traffic (%)'].plot.hist()
plt.title('Distribution of Slowness in traffic (%)')
plt.xlabel('Slowness in traffic (%)')
plt.show()
5. Frequency of Incidents
# Horizontal bar plot of total incident counts: drop the two non-incident
# columns, then sum each remaining column.
import pandas as pd
import matplotlib.pyplot as plt
traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)
incidents = traffic.drop(['Hour (Coded)', 'Slowness in traffic (%)'],
axis=1)
incidents.sum().plot.barh()
plt.show()
# Exercise answers.
sentence_1 = False
sentence_2 = True
sentence_3 = True
6. Correlations with Traffic Slowness
# One scatter plot per incident type against traffic slowness, to eyeball
# possible correlations.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

for incident in ('Lack of electricity', 'Point of flooding', 'Semaphore off'):
    traffic.plot.scatter(x='Slowness in traffic (%)', y=incident)
    plt.show()
7. Traffic Slowness Over 20%
# Incident frequencies restricted to the rows where slowness is >= 20%.
import pandas as pd
import matplotlib.pyplot as plt
traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)
slowness_20_or_more = traffic[traffic['Slowness in traffic (%)'] >= 20]
# Keep only the incident columns before summing.
slowness_20_or_more = slowness_20_or_more.drop(['Slowness in traffic (%)',
'Hour (Coded)'], axis=1)
incident_frequencies = slowness_20_or_more.sum()
incident_frequencies.plot.barh()
plt.show()
8. How Traffic Slowness Changes
# One slowness-vs-hour line plot per weekday.  (Loop bodies had lost their
# indentation, which is a SyntaxError in Python — restored here.)
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

# The dataset stores the five weekdays as consecutive 27-row chunks.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

# Shared y-limits make the five figures directly comparable.
for day in days:
    traffic_per_day[day].plot.line(x='Hour (Coded)',
                                   y='Slowness in traffic (%)')
    plt.title(day)
    plt.ylim([0, 25])
    plt.show()
9. Comparing Graphs
# All five weekdays overlaid on one axes for comparison.  (Loop-body
# indentation restored; legend/show belong after the loop so a single
# figure is produced.)
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

# Five consecutive 27-row chunks, one per weekday.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
plt.legend()
plt.show()
10. Grid Charts
# Empty 3x2 grid of subplots.  The positions are created out of order
# (1, 2, 6, 3, 4, 5) — presumably to demonstrate that subplot slots can
# be added in any order; the grid looks the same either way.
import pandas as pd
import matplotlib.pyplot as plt
traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)
plt.figure()
plt.subplot(3, 2, 1)
plt.subplot(3, 2, 2)
plt.subplot(3, 2, 6)
plt.subplot(3, 2, 3)
plt.subplot(3, 2, 4)
plt.subplot(3, 2, 5)
plt.show()
11. Grid Charts (II)
# 3x2 grid chart: one weekday per subplot (positions 1-5).  Loop-body
# indentation restored (it had been stripped, a SyntaxError).
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

plt.figure(figsize=(10,12))
for i, day in zip(range(1,6), days):
    plt.subplot(3, 2, i)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0,25])
plt.show()
# Final grid chart: one subplot per weekday plus a sixth subplot that
# overlays all five days.  Loop-body indentation restored.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

plt.figure(figsize=(10,12))
for i, day in zip(range(1,6), days):
    plt.subplot(3, 2, i)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0,25])

# Sixth position: all five days on one axes for direct comparison.
plt.subplot(3, 2, 6)
for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
plt.ylim([0,25])
plt.legend()
plt.show()
Chart Visualization — pandas 1.4.1 documentation (pydata.org)
Small multiple - Wikipedia
Relational Plots and Multiple Variables
# First look at the housing dataset.
import pandas as pd
housing = pd.read_csv('housing.csv')
housing.head()
housing.tail()
housing.info()
2. Seaborn
# First seaborn relational plot: living area vs. sale price.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()  # apply seaborn's default look
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice')
plt.show()

# The point cloud rises left to right, i.e. a positive correlation.
correlation = 'positive'
# Relplot with a third variable: 'Overall Qual' encoded as color (hue).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice')
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
# palette='RdYlGn' maps low quality to red, high quality to green.
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
hue='Overall Qual', palette='RdYlGn')
plt.show()
# Exercise answers.
sentence_1 = True
sentence_2 = True
# Relplot with a fourth variable: 'Garage Area' encoded as marker size.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
# hue='Overall Qual', palette='RdYlGn')
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
# sizes=(1,300) sets the min/max marker area in points^2.
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
hue='Overall Qual', palette='RdYlGn',
size='Garage Area', sizes=(1,300))
plt.show()
# Exercise answers.
sentence_1 = False
sentence_2 = True
# Relplot with a fifth variable: 'Rooms' encoded as marker shape (style).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
# hue='Overall Qual', palette='RdYlGn',
# size='Garage Area', sizes=(1,300))
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
hue='Overall Qual', palette='RdYlGn',
size='Garage Area', sizes=(1,300),
style='Rooms')
plt.show()
# Exercise answers.
sentence_1 = False
sentence_2 = False
6. Variable Representation: Spatial Separation
# Relplot with a sixth variable: 'Year' encoded by spatial separation —
# one subplot column per year (col=).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
# hue='Overall Qual', palette='RdYlGn',
# size='Garage Area', sizes=(1,300),
# style='Rooms')
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
hue='Overall Qual', palette='RdYlGn',
size='Garage Area', sizes=(1,300),
style='Rooms', col='Year')
plt.show()
# Exercise answers.
sentence_1 = True
sentence_2 = True
Visualizing statistical relationships — seaborn 0.11.2 documentation (pydata.org)
Visualizing distributions of data — seaborn 0.11.2 documentation (pydata.org)
Plotting with categorical data — seaborn 0.11.2 documentation (pydata.org)
Building structured multi-plot grids — seaborn 0.11.2 documentation (pydata.org)
Guided Project: Finding Heavy Traffic Indicators on I-94
1. The I-94 Traffic Dataset
UCI Machine Learning Repository: Metro Interstate Traffic Volume Data Set
solutions/Mission524Solutions.ipynb at master · dataquestio/solutions · GitHub
2. Analyzing Traffic Volume
3. Traffic Volume: Day vs. Night
5. Time Indicators
8. Weather Indicators
9. Weather Types
Guided Project: Finding Heavy Traffic Indicators on I-94
# Horizontal bar plot: one bar per country, length = total COVID deaths.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
countries = top20_deathtoll['Country_Other']
deaths = top20_deathtoll['Total_Deaths']
plt.barh(countries, deaths)
plt.show()
3. Matplotlib Interfaces
Data Visualization With Matplotlib Course | Dataquest
4. The OO Interface
# Same bar plot through matplotlib's object-oriented interface:
# create a Figure/Axes pair and call plotting methods on the Axes.
import pandas as pd
import matplotlib.pyplot as plt
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
fig, ax = plt.subplots()
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'])
plt.show()
5. Mobile-Friendly Proportions
# Portrait (taller-than-wide) figure proportions for mobile screens.
import pandas as pd
import matplotlib.pyplot as plt
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'])
plt.show()
6. Maximizing Data-Ink
7. Erasing Non-Data Ink
# Maximize data-ink: remove spines and tick marks.  (The for-loop body
# had lost its indentation — a SyntaxError — restored here.)
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])

# Hide all four spines (pure non-data ink).
for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

# Hide the tick marks too; labels stay visible.
ax.tick_params(bottom=False, left=False)
plt.show()
8. Erasing Redundant Data-Ink
# Erase redundant data-ink: thinner bars, only three x-ticks.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
# Initial Code
# fig, ax = plt.subplots(figsize=(4.5, 6))
# ax.barh(top20_deathtoll['Country_Other'],
#         top20_deathtoll['Total_Deaths'])
# for location in ['left', 'right', 'top', 'bottom']:
#     ax.spines[location].set_visible(False)
# ax.tick_params(bottom=False, left=False)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
fig, ax = plt.subplots(figsize=(4.5, 6))
# height=0.45 thins the bars (default is 0.8).
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45)
for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)
# Three ticks are enough to read the order of magnitude.
ax.set_xticks([0, 150000, 300000])
plt.show()
9. The Direction of Reading
# Support the top-down reading direction: move the x-axis to the top and
# mute its colors.  (Loop-body indentation restored.)
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
# Initial Code
# fig, ax = plt.subplots(figsize=(4.5, 6))
# ax.barh(top20_deathtoll['Country_Other'],
#         top20_deathtoll['Total_Deaths'],
#         height=0.45)
# for location in ['left', 'right', 'top', 'bottom']:
#     ax.spines[location].set_visible(False)
# ax.tick_params(bottom=False, left=False)
# ax.set_xticks([0, 150000, 300000])
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')
for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.set_xticks([0, 150000, 300000])
# Readers scan from the top, so show the x-axis there.
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')
plt.show()
10. Title and Subtitle
# Add a bold title and a lighter subtitle with ax.text (placed in data
# coordinates, so negative x extends left of the axis).
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')
for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.text(x=-80000, y=23.5,
        s='The Death Toll Worldwide Is 1.5M+',
        weight='bold', size=17)
ax.text(x=-80000, y=22.5,
        s='Top 20 countries by death toll (December 2020)',
        size=12)
plt.show()
# Final polish: formatted x-tick labels, country names drawn left of the
# bars, and a faint vertical guide line.  (Loop indentation restored.)
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')
for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')
ax.text(x=-80000, y=23.5,
        s='The Death Toll Worldwide Is 1.5M+',
        weight='bold', size=17)
ax.text(x=-80000, y=22.5,
        s='Top 20 countries by death toll (December 2020)',
        size=12)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.set_xticklabels(['0', '150,000', '300,000'])
ax.set_yticklabels([])  # an empty list removes the labels
# Re-draw the country names ourselves, one per bar row.
country_names = top20_deathtoll['Country_Other']
for i, country in zip(range(20), country_names):
    ax.text(x=-80000, y=i-0.15, s=country)
# Faint vertical guide at the 150,000 mark.
ax.axvline(x=150000, ymin=0.045, c='grey',
           alpha=0.5)
plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)
The Lifecycle of a Plot — Matplotlib 3.5.1 documentation
Examples — Matplotlib 3.5.1 documentation
Design for an Audience
1. Data Stories
2. Grid Charts in Matplotlib
# 4x1 grid of line plots, each showing the same monthly death series.
import pandas as pd
import matplotlib.pyplot as plt
death_toll = pd.read_csv('covid_avg_deaths.csv')
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
figsize=(6,8))
ax1.plot(death_toll['Month'], death_toll['New_deaths'])
ax2.plot(death_toll['Month'], death_toll['New_deaths'])
ax3.plot(death_toll['Month'], death_toll['New_deaths'])
ax4.plot(death_toll['Month'], death_toll['New_deaths'])
plt.show()
3. Faster Workflow
# Same grid, but styled in a loop over the four Axes.  (Both loop bodies
# had lost their indentation — a SyntaxError — restored here.)
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]
for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'])
    # Strip tick labels, tick marks, and all four spines.
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
plt.show()
4. Modifying the Line Plots
# Faded full-year line on every panel, with a different month range
# highlighted per panel.  (Loop indentation restored.)
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]
for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
# Overlapping slices so consecutive highlights share an endpoint.
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
plt.show()
5. Adding Structural Elements
# Add hand-placed value annotations and period labels to the panels.
# (Loop indentation restored.)
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]
for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
# Min/max value annotations on the first panel.
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)
plt.show()
6. Title and Subtitle
WHO Coronavirus (COVID-19) Dashboard | WHO Coronavirus (COVID-19) Dashboard With Vaccination Data
# Add title and subtitle to the grid chart.  (Loop indentation restored.)
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]
for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
# Title/subtitle live on the top panel, above the plotting area.
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)
plt.show()
7. Adding a Progress Bar
# Add the faded background of a progress bar to every panel.
# (Loop indentation restored; the trailing note was a stray triple-quoted
# string and is now a comment.)
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]
for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
# Faded horizontal line spanning axes-x 0.5-0.8 = the bar's background.
for ax in axes:
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)
# Alternatively, this loop can be merged into the first styling loop above.
plt.show()
8. Completing the Progress Bar
# Complete the progress bars: draw a solid segment proportional to the
# cumulative death toll, plus the formatted number.  (Loop indentation
# restored.)
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

# Cumulative deaths at the end of each panel's period.
deaths = [2398, 126203, 227178, 295406]
proportions = [round(death/295406, 2) for death in deaths]
# The bar spans axes-x 0.5..0.8, so scale each proportion into that range.
xmax_vals = [round(0.5 + proportion * 0.3, 3) for proportion in proportions]

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]
for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
for ax, xmax, death in zip(axes, xmax_vals, deaths):
    # Faded full-length background, then the solid filled portion.
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)
    ax.axhline(y=1600, xmin=0.5, xmax=xmax,
               linewidth=6, color='#af0b1e')
    # Thousands-separated death count next to the bar.
    ax.text(7.5, 1850, format(death, ','),
            color='#af0b1e', weight='bold')
plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)
Examples — Matplotlib 3.5.1 documentation
Is Your Data Story Actually A Story? | by Joshua Smith | Nightingale | Medium
Gestalt Principles and Pre-Attentive Attributes
1. Gestalt Principles
Link:
Gestalt Principles And Pre-attentive Attributes — Gestalt Principles | Dataquest
2. Proximity
# True/False answers for the proximity exercise.
sentence_1, sentence_2, sentence_3 = True, True, False
3. Similarity
# True/False answers for the similarity exercise; the first is false
# because the grouping comes from similarity of color, not shape.
sentence_1, sentence_2, sentence_3 = False, True, True
4. Enclosure
# True/False answers for the enclosure exercise.
sentence_1, sentence_2, sentence_3 = True, True, True
5. Connection
6. Visual Hierarchy
# Visual-hierarchy answers: enclosure outranks the other principles here
# (sentences 1 and 3), and human perception is non-random (sentence 4).
sentence_1, sentence_2, sentence_3, sentence_4 = False, True, False, False
7. Pre-Attentive Attributes
# True/False answers for the pre-attentive attributes exercise.
sentence_1, sentence_2, sentence_3 = True, False, True
Pre-attentive processing - Wikipedia
Gestalt psychology - Wikipedia
Matplotlib Styles: FiveThirtyEight Case Study
# Demo of matplotlib style sheets: ggplot vs. the default style.
import matplotlib.style as style
import matplotlib.pyplot as plt  # was missing: plt is used below

style.use('ggplot')
plt.plot([2, 4, 6], [10, 15, 5])
plt.show()

style.use('default')
plt.plot([2, 4, 6], [10, 15, 5])
plt.show()
2. Wine Quality Dataset
# Correlation of every wine property with quality, for both datasets.
import pandas as pd
red_wine = pd.read_csv('winequality-red.csv', sep=';')
# [:-1] drops the last entry — presumably quality's self-correlation
# (always 1.0); verify the column order against the CSV.
red_corr = red_wine.corr()['quality'][:-1]
white_wine = pd.read_csv('winequality-white.csv', sep=';')
white_corr = white_wine.corr()['quality'][:-1]
print(white_corr)
3. FiveThirtyEight Style
# FiveThirtyEight-styled back-to-back bars.  NOTE(review): relies on
# style, plt, white_corr, and red_corr defined in earlier cells.
# Initial Code
style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(9, 5))
# ax.barh(white_corr.index, white_corr, left=2)
# ax.barh(red_corr.index, red_corr)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
# left=2 shifts the white-wine bars into their own panel on the right.
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5)
ax.grid(b=False)  # NOTE(review): `b` is deprecated since Matplotlib 3.5 (renamed `visible`)
ax.set_yticklabels([])
ax.set_xticklabels([])
plt.show()
4. Adding Y-tick Labels
# Add hand-placed property labels between the two panels, plus faint
# vertical separators.  (The labeling loop had lost its indentation — a
# SyntaxError — restored here.)  Relies on style, plt, white_corr, and
# red_corr from earlier cells.
style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)
ax.grid(b=False)  # NOTE(review): `b` is deprecated since Matplotlib 3.5 (renamed `visible`)
ax.set_yticklabels([])
ax.set_xticklabels([])
# x position per label, tuned so each is roughly centered between panels.
x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
# Faint vertical lines bounding the label column.
ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
plt.show()
5. Adding X-tick Labels
# Add hand-drawn x-axis substitutes under each panel and panel titles
# above.  (Loop indentation restored.)  Relies on style, plt, white_corr,
# and red_corr from earlier cells.
style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)
ax.grid(b=False)  # NOTE(review): `b` is deprecated since Matplotlib 3.5 (renamed `visible`)
ax.set_yticklabels([])
ax.set_xticklabels([])
x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1
ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
# Horizontal rule + "-0.5 ... +0.5" labels under each panel.
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
# Panel titles above each bar group.
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')
plt.show()
6. Adding a Signature
# Add the FiveThirtyEight-style signature bar, then title and subtitle.
# (Loop indentation restored.)  Relies on style, plt, white_corr, and
# red_corr from earlier cells.
style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)
ax.grid(b=False)  # NOTE(review): `b` is deprecated since Matplotlib 3.5 (renamed `visible`)
ax.set_yticklabels([])
ax.set_xticklabels([])
x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1
ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')
# Signature bar: light text on a dark full-width background strip.
ax.text(-0.7, -2.9,
        '©DATAQUEST' + ' '*94 + 'Source: P. Cortez et al.',
        color = '#f0f0f0', backgroundcolor = '#4d4d4d',
        size=12)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.text(-0.7, 13.5,
        'Wine Quality Most Strongly Correlated With Alcohol Level',
        size=17, weight='bold')
ax.text(-0.7, 12.7,
        'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')
plt.show()
7. Coloring Bars Differently
# Final chart: color each bar by correlation sign (blue positive, orange
# negative) for both wines.  (Loop indentation restored.)  Relies on
# style, plt, white_corr, and red_corr from earlier cells.
positive_white = white_corr >= 0
color_map_white = positive_white.map({True:'#33A1C9', False:'#ffae42'})
style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5,
        color=color_map_white)
#ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)
ax.grid(b=False)  # NOTE(review): `b` is deprecated since Matplotlib 3.5 (renamed `visible`)
ax.set_yticklabels([])
ax.set_xticklabels([])
x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1
ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')
ax.text(-0.7, -2.9, '©DATAQUEST' + ' '*92 + 'Source: P. Cortez et al.',
        color = '#f0f0f0', backgroundcolor = '#4d4d4d',
        size=12)
ax.text(-0.7, 13.5,
        'Wine Quality Most Strongly Correlated With Alcohol Level',
        size=17, weight='bold')
ax.text(-0.7, 12.7,
        'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
positive_red = red_corr >= 0
color_map_red = positive_red.map({True:'#33A1C9', False:'#ffae42'})
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1,
        color=color_map_red)
plt.show()
How to Generate FiveThirtyEight Graphs in Python – Dataquest
Guided Project: Storytelling Data Visualization on Exchange Rates
An In-Depth Style Guide for Data Science Projects – Dataquest
Latest 529 topics - Dataquest Community
Data Aggregation
World Happiness Report | Kaggle
# Per-region mean happiness, first with a manual loop, then with groupby.
# (The pasted transcript lost the loop-body indentation, which made this a
# syntax error — restored here; no logic changed.)
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
first_5 = happiness2015.head()
happiness2015.info()
mean_happiness = {}
regions = happiness2015['Region'].unique()
for r in regions:
    # One pass per region: filter the rows, average their scores.
    region_group = happiness2015[happiness2015['Region'] == r]
    region_mean = region_group['Happiness Score'].mean()
    mean_happiness[r] = region_mean
# groupby equivalents of the loop above.
grouped = happiness2015.groupby('Region')
aus_nz = grouped.get_group('Australia and New Zealand')
grouped = happiness2015.groupby('Region')
# get_group returns the same rows as direct positional selection.
north_america = happiness2015.iloc[[4,14]]
na_group = grouped.get_group('North America')
equal = north_america == na_group
grouped = happiness2015.groupby('Region')
# NOTE(review): pandas >= 2.0 raises here unless numeric_only=True is passed
# (non-numeric columns can't be averaged) — confirm the pandas version.
means = grouped.mean()
grouped = happiness2015.groupby('Region')
happy_grouped = grouped['Happiness Score']
happy_mean = happy_grouped.mean()
import numpy as np
grouped = happiness2015.groupby('Region')
happy_grouped = grouped['Happiness Score']
def dif(group):
    """Return how far the group's maximum sits above its mean."""
    highest = group.max()
    average = group.mean()
    return highest - average
# Aggregate the grouped 'Happiness Score' several ways.
happy_mean_max = happy_grouped.agg([np.mean, np.max])
# Custom aggregation: spread between each region's max and mean score.
mean_max_dif = happy_grouped.agg(dif)
# One-liner for the per-region mean (repeated in the course transcript).
happiness_means = happiness2015.groupby('Region')['Happiness Score'].mean()
happiness_means = happiness2015.groupby('Region')['Happiness Score'].mean()
print(happiness_means)
# pivot_table gives the same per-region mean plus an 'All' margin row.
pv_happiness = happiness2015.pivot_table(values='Happiness Score', index='Region',
aggfunc=np.mean, margins=True)
pv_happiness.plot(kind='barh', xlim=(0,10), title='Mean Happiness Scores by Region', legend=False)
world_mean_happiness = happiness2015['Happiness Score'].mean()
# Multi-column, multi-statistic aggregation via groupby ...
grouped = happiness2015.groupby('Region')[['Happiness Score','Family']]
happy_family_stats = grouped.agg([np.min, np.max, np.mean])
# ... and the pivot_table equivalent with margins.
pv_happy_family_stats = happiness2015.pivot_table(['Happiness Score', 'Family'], 'Region',
aggfunc=[np.min, np.max, np.mean], margins=True)
Combining Data Using Pandas
import pandas as pd
# Load the three yearly World Happiness reports and tag each with its year
# so rows stay distinguishable after concatenation.
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
happiness2016 = pd.read_csv("World_Happiness_2016.csv")
happiness2017 = pd.read_csv("World_Happiness_2017.csv")
happiness2015['Year'] = 2015
happiness2016['Year'] = 2016
happiness2017['Year'] = 2017
Combining Data Using Pandas
Merge, join, concatenate and compare — pandas 1.4.1 documentation (pydata.org)
Transforming Data with Pandas
# Shorten the verbose 2015 column names before further analysis.
# NOTE(review): the pasted transcript split the 'Trust (Government
# Corruption)' key across two lines (a syntax error) — rejoined here.
mapping = {'Economy (GDP per Capita)': 'Economy',
           'Health (Life Expectancy)': 'Health',
           'Trust (Government Corruption)': 'Trust'}
happiness2015 = happiness2015.rename(mapping, axis = 1)
def label(element):
    """Classify a factor value as 'High' (> 1) or 'Low' (otherwise)."""
    return 'High' if element > 1 else 'Low'
# map and apply are interchangeable for an element-wise function on a Series.
economy_impact_map = happiness2015['Economy'].map(label)
economy_impact_apply = happiness2015['Economy'].apply(label)
equal = economy_impact_map.equals(economy_impact_apply)
def label(element):
    """Return 'High' when the value exceeds 1, otherwise 'Low'."""
    if element <= 1:
        return 'Low'
    return 'High'
# Apply the threshold-1 labeler to every Economy value.
economy_impact_apply = happiness2015['Economy'].apply(label)
def label(element, x):
    """Return 'High' when element exceeds the threshold x, else 'Low'."""
    exceeds = element > x
    return 'High' if exceeds else 'Low'
# Extra keyword args to Series.apply are forwarded to the function (x=0.8 threshold).
economy_impact_apply = happiness2015['Economy'].apply(label, x = .8)
def label(element):
    """Label values strictly above 1 as 'High' and the rest as 'Low'."""
    exceeds_threshold = element > 1
    if exceeds_threshold:
        return 'High'
    return 'Low'
economy_apply = happiness2015['Economy'].apply(label)
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity']
# Element-wise labeling across all factor columns.
# NOTE(review): DataFrame.applymap is deprecated in pandas 2.1+ in favor of
# DataFrame.map — confirm the pandas version in use.
factors_impact = happiness2015[factors].applymap(label)
def v_counts(col):
    """Return each value's share of the column (relative frequencies)."""
    return col.value_counts() / col.size
# Column-wise relative frequencies of 'High'/'Low' per factor.
v_counts_pct = factors_impact.apply(v_counts)
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']
def percentages(col):
    """Express col as a percentage of the overall 'Happiness Score' column.

    Relies on the module-level `happiness2015` DataFrame being loaded.
    """
    ratio = col / happiness2015['Happiness Score']
    return 100 * ratio
factor_percentages = happiness2015[factors].apply(percentages)
main_cols = ['Country', 'Region', 'Happiness Rank', 'Happiness Score']
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']
melt = pd.melt(happiness2015, id_vars = main_cols, value_vars = factors)
melt['Percentage'] = round(melt['value']/melt['Happiness Score'] * 100, 2)
melt = pd.melt(happiness2015, id_vars = ['Country', 'Region', 'Happiness Rank', 'Happiness Score'],
value_vars= ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual'])
melt['Percentage'] = round(melt['value']/melt['Happiness Score'] * 100, 2)
pv_melt = melt.pivot_table(index='variable', values='value')
pv_melt.plot(kind='pie', y='value', legend=False)
Tidy Data | Journal of Statistical Software (jstatsoft.org)
Working with Strings in Pandas
# Left-join World Bank development data onto the happiness data by country name.
world_dev = pd.read_csv("World_dev.csv")
col_renaming = {'SourceOfMostRecentIncomeAndExpenditureData': 'IESurvey'}
merged = pd.merge(left=happiness2015, right=world_dev, how='left', left_on='Country',
right_on='ShortName')
merged = merged.rename(col_renaming, axis=1)
def extract_last_word(element):
    """Return the final whitespace-separated word of element, as a string."""
    words = str(element).split()
    return words[-1]
# Same "last word" extraction two ways: apply vs. vectorized string methods.
merged['Currency Apply'] = merged['CurrencyUnit'].apply(extract_last_word)
merged['Currency Vectorized'] = merged['CurrencyUnit'].str.split().str.get(-1)
print(merged['Currency Vectorized'].head())
# Distribution of currency-name lengths (NaN kept as its own bucket).
lengths = merged['CurrencyUnit'].str.len()
value_counts = lengths.value_counts(dropna=False)
# Rows whose SpecialNotes mention "national accounts" (either capitalization).
pattern = r"[Nn]ational accounts"
national_accounts = merged['SpecialNotes'].str.contains(pattern)
print(national_accounts.head())
pattern = r"[Nn]ational accounts"
# na=False treats missing notes as non-matches so the mask can index rows.
national_accounts = merged['SpecialNotes'].str.contains(r"[Nn]ational accounts", na=False)
merged_national_accounts = merged[national_accounts]
print(merged_national_accounts.head())
# Capture four-digit years (1000-2999) from the notes.
pattern =r"()"
pattern = r"([1-2][0-9]{3})"
years = merged['SpecialNotes'].str.extract(pattern)
# Named group + extractall collects every year per row, not just the first.
pattern = r"(?P<Years>[1-2][0-9]{3})"
years = merged['IESurvey'].str.extractall(pattern)
value_counts = years['Years'].value_counts()
print(value_counts)
# Survey spans like "2004/05": optional two-digit second year after a slash.
pattern = r"(?P<First_Year>[1-2][0-9]{3})/?(?P<Second_Year>[0-9]{2})?"
years = merged['IESurvey'].str.extractall(pattern)
# Rebuild the full second year by prefixing the first year's century digits.
first_two_year = years['First_Year'].str[0:2]
years['Second_Year'] = first_two_year + years['Second_Year']
# Normalize income-group labels (drop ' income' and ':', uppercase) ...
merged['IncomeGroup'] = merged['IncomeGroup'].str.replace(' income', '').str.replace(':',
'').str.upper()
# ... then plot mean happiness per income group.
pv_incomes = merged.pivot_table(values='Happiness Score', index='IncomeGroup')
pv_incomes.plot(kind='bar', rot=30, ylim=(0,10))
plt.show()
Working with text data — pandas 1.4.1 documentation (pydata.org)
6.2. re — Regular expression operations — Python 3.4.10 documentation
Working With Missing And Duplicate Data
# Compare dataset dimensions and missing-value counts across the three years.
shape_2015 = happiness2015.shape
shape_2016 = happiness2016.shape
shape_2017 = happiness2017.shape
missing_2016 = happiness2016.isnull().sum()
missing_2017 = happiness2017.isnull().sum()
# Normalize the 2017 column names: the source uses dots as separators, so
# replace '.' with ' ', collapse whitespace runs, trim, and uppercase.
# NOTE(review): the pasted transcript split the ' ' replacement literal
# across a line break (a syntax error) — rejoined here. Also note pandas'
# Series.str.replace treats the pattern as a regex by default; confirm the
# intended behavior of the '.' pattern on the pandas version in use.
happiness2017.columns = happiness2017.columns.str.replace('.', ' ').str.replace(r'\s+', ' ').str.strip().str.upper()
# Strip parentheses from the 2015/2016 column names, trim, and uppercase so
# all three years share the same column labels.
happiness2015.columns = happiness2015.columns.str.replace('(', '').str.replace(')', '').str.strip().str.upper()
happiness2016.columns = happiness2016.columns.str.replace('(', '').str.replace(')', '').str.strip().str.upper()
# Stack the three yearly tables; 2017 has no REGION column, so those rows get NaN.
combined = pd.concat([happiness2015, happiness2016, happiness2017], ignore_index=True)
missing = combined.isnull().sum()
regions_2017 = combined[combined['YEAR']==2017]['REGION']
missing = regions_2017.isnull().sum()
# Fill regions from a country->region lookup table.
# NOTE(review): `regions` is built in an earlier exercise not shown in this
# excerpt — verify it is a COUNTRY/REGION DataFrame before relying on this.
combined = pd.merge(left=combined, right=regions, on='COUNTRY', how='left')
combined = combined.drop('REGION_x', axis = 1)
missing = combined.isnull().sum()
# Uppercase country names so duplicates differing only in case collapse.
combined['COUNTRY'] = combined['COUNTRY'].str.upper()
dups = combined.duplicated(['COUNTRY', 'YEAR'])
print(combined[dups])
combined['COUNTRY'] = combined['COUNTRY'].str.upper()
combined = combined.drop_duplicates(['COUNTRY', 'YEAR'])
# Drop the score-uncertainty columns, which aren't needed for the analysis.
# NOTE(review): the pasted transcript split the 'UPPER CONFIDENCE INTERVAL'
# literal across a line break (a syntax error) — rejoined here.
columns_to_drop = ['LOWER CONFIDENCE INTERVAL', 'STANDARD ERROR',
                   'UPPER CONFIDENCE INTERVAL', 'WHISKER HIGH', 'WHISKER LOW']
combined = combined.drop(columns_to_drop, axis = 1)
missing = combined.isnull().sum()
# Keep only columns with at least 159 non-null values.
combined = combined.dropna(thresh=159, axis=1)
missing = combined.isnull().sum()
# Impute missing happiness scores with the overall mean ...
happiness_mean = combined['HAPPINESS SCORE'].mean()
print(happiness_mean)
combined['HAPPINESS SCORE UPDATED'] = combined['HAPPINESS SCORE'].fillna(happiness_mean)
print(combined['HAPPINESS SCORE UPDATED'].mean())
# ... then, as the alternative strategy, drop any remaining rows with NaN.
combined = combined.dropna()
missing = combined.isnull().sum()
Working with missing data — pandas 1.4.1 documentation (pydata.org)
Regular Expression Basics
import re
# Count Hacker News titles that mention "Python" or "python".
# (The pasted transcript lost the loop/if indentation, which made this a
# syntax error — restored here; no logic changed.)
titles = hn["title"].tolist()
python_mentions = 0
pattern = "[Pp]ython"
for t in titles:
    if re.search(pattern, t):
        python_mentions += 1
re — Regular expression operations — Python 3.10.2 documentation
RegExr: Learn, Build, & Test RegEx
Advanced Regular Expressions
import pandas as pd
import re
hn = pd.read_csv("hacker_news.csv")
titles = hn['title']
# Case-insensitive count of titles containing "SQL".
sql_pattern = r"SQL"
sql_counts = titles.str.contains(sql_pattern, flags=re.I).sum()
# Titles mentioning a specific SQL flavor (e.g. MySQL, PostgreSQL).
hn_sql = hn[hn['title'].str.contains(r"\w+SQL", flags=re.I)].copy()
hn_sql["flavor"] = hn_sql["title"].str.extract(r"(\w+SQL)", re.I, expand=False)
hn_sql["flavor"] = hn_sql["flavor"].str.lower()
# Average comment count per flavor.
sql_pivot = hn_sql.pivot_table(index="flavor",values="num_comments", aggfunc='mean')
# Python version numbers mentioned after the word "python".
pattern = r"[Pp]ython ([\d\.]+)"
py_versions = titles.str.extract(pattern, expand=False)
py_versions_freq = dict(py_versions.value_counts())
def first_10_matches(pattern):
    """Return the first 10 story titles matching the given regex pattern.

    Relies on the module-level `titles` Series being loaded.
    """
    matching = titles[titles.str.contains(pattern)]
    return matching.iloc[:10]
# Match standalone C mentions while excluding C++/C. suffixes.
# pattern = r"\b[Cc]\b"
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern)
# Lookaround version: not preceded by "Series ", not followed by + or a
# non-terminal period.
pattern = r"(?<!Series\s)\b[Cc]\b((?![+.])|\.$)"
c_mentions = titles.str.contains(pattern).sum()
# Backreference \1 finds immediately repeated words.
pattern = r"\b(\w+)\s\1\b"
repeated_words = titles[titles.str.contains(pattern)]
email_variations = pd.Series(['email', 'Email', 'e Mail',
'e mail', 'E-mail', 'e-mail',
'eMail', 'E-Mail', 'EMAIL'])
# Normalize all email spellings to "email" (optional hyphen/space after 'e').
# NOTE(review): pandas 2.0+ requires regex=True for Series.str.replace with a
# pattern — confirm the pandas version in use.
pattern = r"\be[-\s]?mail"
email_uniform = email_variations.str.replace(pattern, "email", flags=re.I)
titles_clean = titles.str.replace(pattern, "email", flags=re.I)
# Sample URLs covering http/https, mixed-case schemes, paths, and queries.
test_urls = pd.Series([
'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
'http://www.interactivedynamicvideo.com/',
'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
'HTTPS://github.com/keppel/pinn',
'Http://phys.org/news/2015-09-scale-solar-youve.html',
'https://iot.seeed.cc',
'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
'http://beta.crowdfireapp.com/?beta=agnipath',
'https://www.valid.ly?param',
'http://css-cursor.techstream.org'
])
# Extract just the domain (word chars, hyphens, dots after the scheme).
pattern = r"https?://([\w\-\.]+)"
test_urls_clean = test_urls.str.extract(pattern, flags=re.I, expand=False)
domains = hn['url'].str.extract(pattern, flags=re.I, expand=False)
top_domains = domains.value_counts().head(5)
# `test_urls` is available from the previous screen
# Three capture groups: protocol, domain, and everything after the first '/'.
pattern = r"(https?)://([\w\.\-]+)/?(.*)"
test_url_parts = test_urls.str.extract(pattern, flags=re.I)
url_parts = hn['url'].str.extract(pattern, flags=re.I)
# Named groups give the result DataFrame readable column names.
# pattern = r"(https?)://([\w\.\-]+)/?(.*)"
pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"
url_parts = hn['url'].str.extract(pattern, flags=re.I)
re — Regular expression operations — Python 3.10.2 documentation
RegExr: Learn, Build, & Test RegEx
List Comprehensions and Lambda Functions
# JSON practice: a string describing two 2018 World Cup matches.
# NOTE(review): the pasted transcript dropped the enclosing list/object
# delimiters, so json.loads() would raise — restored here so the string is
# valid JSON (a list of two match objects).
world_cup_str = """
[
    {
        "team_1": "France",
        "team_2": "Croatia",
        "game_type": "Final",
        "score" : [4, 2]
    },
    {
        "team_1": "Belgium",
        "team_2": "England",
        "game_type": "3rd/4th Playoff",
        "score" : [2, 0]
    }
]
"""
import json
# Parse the JSON string into Python objects (a list of two dicts).
world_cup_obj = json.loads(world_cup_str)
JSON
json — JSON encoder and decoder — Python 3.7.12 documentation
5. Data Structures — Python 3.10.2 documentation
4. More Control Flow Tools — Python 3.10.2 documentation
Working with Missing Data