# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Sample Data (Replace with your dataset)
# A locally seeded generator makes every run produce the same synthetic table,
# so the printed statistics and the figures below are reproducible.
RANDOM_SEED = 42
N_PERIODS = 10  # number of monthly observations to synthesise
rng = np.random.default_rng(RANDOM_SEED)
data = {
    # Month-end dates. An offset object is used instead of the string alias
    # 'M', which newer pandas releases deprecate in favour of 'ME'.
    'date': pd.date_range(start='1967-07-31', periods=N_PERIODS,
                          freq=pd.offsets.MonthEnd()),
    'pce': rng.random(N_PERIODS) * 1000,  # Personal Consumption Expenditure
    'pop': rng.integers(100000, 500000, N_PERIODS),  # Population
    'psavert': rng.random(N_PERIODS) * 10,  # Personal Saving Rate
    'uempmed': rng.random(N_PERIODS) * 5,  # Median Duration of Unemployment
    'unemploy': rng.integers(2000, 5000, N_PERIODS),  # Unemployed people
    'contributors': rng.integers(50, 500, N_PERIODS),  # Contributor Activity
    'article_density': rng.random(N_PERIODS) * 100,  # Number of articles per capita
    'gdp': rng.integers(50000, 200000, N_PERIODS),  # GDP
}
df = pd.DataFrame(data)
# Standardizing numerical columns
# The column list is written once and reused for both the read and the
# write-back, so the two sides cannot drift apart.
numeric_columns = [
    'pce', 'pop', 'psavert', 'uempmed', 'unemploy',
    'contributors', 'article_density', 'gdp',
]
scaler = StandardScaler()
# fit_transform learns the scaling from these columns and returns the scaled
# values, which overwrite the originals in place.
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
# **DISPLAY TABLE DATA FIRST**
# Textual overview of the frame, printed ahead of the figures.

# Leading rows of the dataset.
head_rows = df.head()
print("🔹 First 5 Rows of the Dataset:")
print(head_rows)

# Descriptive statistics as reported by DataFrame.describe().
summary_stats = df.describe()
print("\n🔹 Summary Statistics:")
print(summary_stats)

# Pairwise correlations; the date column is dropped before computing them.
correlation_table = df.drop(columns=['date']).corr()
print("\n🔹 Correlation Matrix Table:")
print(correlation_table)
# **1. K-Means Clustering Plot**
# Columns the clusters are fitted on.
cluster_features = ['pce', 'pop', 'psavert', 'uempmed', 'unemploy']
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' across releases, so relying on the default makes the cluster
# assignment depend on the installed version (and triggers a FutureWarning on
# some of them). random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# Store each row's cluster label on the frame so it can drive the plot colour.
df['cluster'] = kmeans.fit_predict(df[cluster_features])
plt.figure(figsize=(8, 6))
# Only two of the five clustering features are shown; colour encodes the label.
sns.scatterplot(x='pce', y='unemploy', hue='cluster', data=df, palette='Set2',
                s=100)
plt.title('K-Means Clustering: PCE vs Unemployment')
plt.show()
# **2. Feature Importance (Random Forest)**
# Predict 'unemploy' from the remaining economic indicators.
X = df[['pce', 'pop', 'psavert', 'uempmed']]
y = df['unemploy']
# Hold out 20% of the rows; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
# Score the model on the held-out rows. Previously the test split was created
# but never used, so nothing indicated how far the importances could be
# trusted.
y_pred = rf_model.predict(X_test)
print("\n🔹 Random Forest Hold-out Evaluation:")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.4f}")
# One importance value per column of X, in the same order as X.columns.
importance = rf_model.feature_importances_
plt.figure(figsize=(8, 6))
sns.barplot(x=importance, y=X.columns, color='skyblue')
plt.title('Feature Importance (Random Forest)')
plt.show()
# **3. Correlation Matrix Heatmap**
# Correlations are computed without the 'date' and 'cluster' columns.
heatmap_corr = df.drop(columns=['date', 'cluster']).corr()
plt.figure(figsize=(8, 6))
# Each cell is annotated with its coefficient, formatted to two decimals.
heatmap_ax = sns.heatmap(
    heatmap_corr,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation Matrix Heatmap')
plt.show()
# **4. Box-and-Whisker Plot for Engagement Metrics**
# One box per listed column, drawn side by side on a shared axis.
boxplot_columns = ['pce', 'pop', 'psavert', 'uempmed', 'unemploy']
plt.figure(figsize=(8, 6))
box_ax = sns.boxplot(data=df[boxplot_columns], palette="Set3")
box_ax.set_title('Box-and-Whisker Plot for Engagement Metrics')
# Tilt the column labels on the x axis.
plt.xticks(rotation=45)
plt.show()
# **5. Histogram of Contributor Activity**
# Distribution of the 'contributors' column in 10 bins, with a KDE curve
# overlaid.
plt.figure(figsize=(8, 6))
hist_ax = sns.histplot(df['contributors'], bins=10, kde=True, color='purple')
hist_ax.set_title('Histogram of Contributor Activity')
hist_ax.set_xlabel('Contributor Activity')
hist_ax.set_ylabel('Frequency')
plt.show()
# **6. Scatterplot: Article Density vs. GDP**
# Plots 'article_density' on the x axis against 'gdp' on the y axis.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(x='article_density', y='gdp', data=df,
                             color='red')
scatter_ax.set_title('Scatterplot: Article Density vs. GDP')
scatter_ax.set_xlabel('Article Density')
scatter_ax.set_ylabel('GDP')
plt.show()