Data Analytics with Python - Code and Visualization Examples
1. Data Gathering
import seaborn as sns
import pandas as pd
# Load seaborn's "titanic" example dataset into `df`, the frame every later step works on
DATASET_NAME = "titanic"
df = sns.load_dataset(DATASET_NAME)
# Preview the first five rows
print(df.head(n=5))
2. Data Scrubbing (Cleaning)
# Report how many missing values each column has before cleaning
print(df.isnull().sum())
# Fill missing 'age' with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['age'] selection: that chained in-place call
# acts on an intermediate object, emits a FutureWarning on recent pandas, and
# leaves `df` unchanged once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].median())
# Drop 'deck' column (too many missing values)
df.drop(columns=['deck'], inplace=True)
# Drop rows with any remaining nulls so later steps see complete rows only
df.dropna(inplace=True)
3. Descriptive Analytics
# Print summary statistics for the cleaned DataFrame
print(df.describe())
# Plot: histogram of passenger ages (30 bins) with a KDE curve overlaid
import matplotlib.pyplot as plt
import seaborn as sns
age_ax = sns.histplot(data=df, x='age', bins=30, kde=True)
# Label the figure through the Axes object returned by seaborn
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()
4. Measures of Central Tendency and Spread
# Compute the mean, median and standard deviation of the 'age' column
age_column = df['age']
mean_age, median_age, std_dev = (
    age_column.mean(),
    age_column.median(),
    age_column.std(),
)
# Emit one line per statistic (the median is printed without rounding)
print(
    f"Mean Age: {mean_age:.2f}",
    f"Median Age: {median_age}",
    f"Standard Deviation: {std_dev:.2f}",
    sep="\n",
)
Data Analytics with Python - Code and Visualization Examples
5. Inferential Statistics: T-Test
from scipy.stats import ttest_ind
# Split passenger ages into two samples by survival outcome
survived = df.loc[df['survived'] == 1, 'age']
not_survived = df.loc[df['survived'] == 0, 'age']
# Independent two-sample t-test comparing the ages of the two groups
ttest_result = ttest_ind(survived, not_survived)
t_stat, p_val = ttest_result.statistic, ttest_result.pvalue
print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_val:.4f}")
6. Data Analysis: Survival Rate by Sex
# Bar chart of the 'survived' column aggregated per 'sex' category
sex_ax = sns.barplot(data=df, x='sex', y='survived')
# Title and y-axis label are set on the Axes that seaborn drew on
sex_ax.set_title('Survival Rate by Sex')
sex_ax.set_ylabel('Survival Rate')
plt.show()
7. Clustering with K-Means
from sklearn.cluster import KMeans
# Select the two numeric features to cluster on
# NOTE(review): 'age' and 'fare' are used on their raw scales, so the feature
# with the larger numeric range will dominate the distance; consider
# standardizing first if that is not intended.
X = df[['age', 'fare']]
# Apply K-Means Clustering with three clusters.
# random_state pins the centroid initialization so cluster labels (and the
# plot colours) are reproducible between runs; n_init is set explicitly
# because its default differs across scikit-learn versions and leaving it
# unset triggers a warning on some releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# Store each row's cluster label alongside the data for plotting
df['cluster'] = kmeans.fit_predict(X)
# Plot clusters: age vs. fare, coloured by assigned cluster
plt.figure(figsize=(8,5))
sns.scatterplot(x='age', y='fare', hue='cluster', data=df, palette='Set2')
plt.title('K-Means Clustering on Age and Fare')
plt.show()