Farah Jahangir
Introduction to Data Mining
Project Assignment
Task 1
Part A
Code:
1. Combining the Dataset
import os
import pandas as pd
import glob

# Folder path containing your CSV files
folder_path = '/content/drive/MyDrive/dataset'

# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# List to hold all dataframes
df_list = []

# Loop through each CSV file and load it into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df_list.append(df)

# Combine all DataFrames into one
combined_data = pd.concat(df_list, ignore_index=True)

# Check the first few rows of the combined data
print(combined_data.head())
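As an optional sanity check (a sketch, not part of the pipeline above), the number of rows in the combined DataFrame can be compared against the total row count of the individual files; it assumes the csv_files, df_list, and combined_data variables defined above.

# Sketch: verify that no rows were lost or duplicated while combining
expected_rows = sum(len(d) for d in df_list)
print(f"Files read: {len(csv_files)}, expected rows: {expected_rows}, "
      f"combined rows: {len(combined_data)}")
assert len(combined_data) == expected_rows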
2. Dropping Irrelevant Columns
from sklearn.preprocessing import StandardScaler
# Drop the date column and the 9am / target columns not used for clustering
data = combined_data.drop(columns=['date', 'humidity9am', 'pressure9am', 'temp9am',
                                   'rain_today', 'rain_tomorrow', 'wind_speed9am', 'cloud9am'])
print(data.head())
3. Mapping Values
import pandas as pd
from sklearn.preprocessing import StandardScaler
# The loaded DataFrame is expected to contain the cloud cover column (named 'cloud3pm')
# Mapping of cloud cover categories to numerical values (0 to 16)
cloud_cover_mapping = {
    'Fair / Windy': 0, 'Partly Cloudy': 1, 'Partly Cloudy / Windy': 2, 'Cloudy': 3,
    'Cloudy / Windy': 4, 'Mostly Cloudy': 5, 'Mostly Cloudy / Windy': 6, 'Fog': 7,
    'Haze': 8, 'Light Rain': 9, 'Light Rain with Thunder': 10, 'Thunder': 11,
    'Rain': 12, 'Thunder / Windy': 13, 'Heavy T-Storm': 14, 'Thunder in the Vicinity': 15, 'TStorm': 16
}
# Load your dataset (replace with your actual file path)
df = pd.read_csv('/content/scaled_weather_data.csv')
# Map the 'cloud3pm' column to numerical values using the mapping
df['cloud_cover'] = df['cloud3pm'].map(cloud_cover_mapping)
# Drop the original 'cloud3pm' column with string values
df = df.drop(columns=['cloud3pm'])
# Save the modified DataFrame into a new CSV file
df.to_csv('new_weather_data.csv', index=False)
# Confirm that the data has been saved
print("Cloud cover has been mapped and saved to 'new_weather_data.csv'.")
Forming clusters:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file
# Step 2: Replace 'Blank' values with NaN for numerical columns
df.replace('Blank', np.nan, inplace=True)
# Step 3: Convert all columns to numeric, coercing any non-numeric data to NaN
df = df.apply(pd.to_numeric, errors='coerce')
# Step 4: Impute missing values with the column medians (as before)
df = df.fillna(df.median())
# Step 5: Select only numeric columns for clustering
numeric_data = df.select_dtypes(include=[np.number])
# Step 6: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)
# Step 7: Apply K-means clustering (k=3)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)
# Step 8: Get the cluster labels
df['cluster'] = kmeans.labels_
# Step 9: Report the centroids of the clusters
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=numeric_data.columns)
print("Centroids of the clusters:")
print(centroids)
# Step 10: Visualize the clusters using boxplots for selected attributes
selected_columns = ['min_temp', 'max_temp', 'rainfall', 'humidity3pm', 'wind_speed3pm', 'pressure3pm']
plt.figure(figsize=(15, 10))
for i, column in enumerate(selected_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='cluster', y=column, data=df)
    plt.title(f'Boxplot of {column} by Cluster')
plt.tight_layout()
plt.show()
# Step 11: Visualize the clusters using a scatter plot (2D projection)
# First, reduce the data to 2D for visualization using PCA (Principal Component Analysis)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
# Scatter plot of clusters in 2D space
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['cluster'], cmap='viridis', s=50)
plt.title('K-means Clustering (2D PCA projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()
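The value k=3 is fixed in the code above; as an optional check (a sketch, not part of the submitted pipeline), candidate values of k can be compared using the inertia (elbow method) and the silhouette score. It assumes the scaled_data array produced above.

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sketch: compare candidate k values by inertia and silhouette score
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(scaled_data)
    print(f"k={k}: inertia={km.inertia_:.1f}, "
          f"silhouette={silhouette_score(scaled_data, labels):.3f}")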
Results:
K-Means Clustering with K=3
Box plots:
Part B
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics.cluster import pair_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Step 1: Load and preprocess the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file
# Step 2: Replace 'Blank' values with NaN for numerical columns
df.replace('Blank', np.nan, inplace=True)
# Step 3: Convert all columns to numeric, coercing any non-numeric data to NaN
df = df.apply(pd.to_numeric, errors='coerce')
# Step 4: Impute missing values with the column medians
df = df.fillna(df.median())
# Step 5: Select only numeric columns for clustering
numeric_data = df.select_dtypes(include=[np.number])
# Step 6: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)
# Step 7: Apply DBSCAN clustering
# We try different values of eps and min_samples to get between 2 and 15 clusters
# with less than 20% outliers.
# Best configuration for DBSCAN (after tuning) -- adjust eps and min_samples
dbscan = DBSCAN(eps=0.5, min_samples=5)  # You can adjust these parameters as needed
dbscan.fit(scaled_data)
# Add cluster labels to the dataframe
df['dbscan_cluster'] = dbscan.labels_
# Identify the number of outliers (labeled as -1 in DBSCAN)
outliers = np.sum(df['dbscan_cluster'] == -1)
total_points = len(df)
outlier_percentage = outliers / total_points * 100
print(f"Outlier percentage in DBSCAN: {outlier_percentage:.2f}%")
# Check if outliers are below 20% (target condition)
if outlier_percentage > 20:
    print("Outliers exceed 20%, adjusting DBSCAN parameters.")
else:
    print("Outliers are below 20%, proceed to the next steps.")
# Step 8: Visualize the DBSCAN clusters using a scatter plot (2D PCA projection)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
# Scatter plot of DBSCAN clusters
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['dbscan_cluster'], cmap='viridis', s=50)
plt.title('DBSCAN Clustering (2D PCA projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()
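The eps and min_samples values above were chosen by manual tuning. The small sweep below is an optional sketch (not the tuning actually used) for finding configurations that meet the target of 2 to 15 clusters with under 20% outliers; it assumes the scaled_data array from above, and the candidate ranges are illustrative.

import numpy as np
from sklearn.cluster import DBSCAN

# Sketch: grid-search eps and min_samples for the cluster/outlier targets
for eps in [0.3, 0.5, 0.7, 1.0, 1.5]:
    for min_samples in [3, 5, 10]:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(scaled_data)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        noise_pct = np.mean(labels == -1) * 100
        if 2 <= n_clusters <= 15 and noise_pct < 20:
            print(f"eps={eps}, min_samples={min_samples}: "
                  f"{n_clusters} clusters, {noise_pct:.1f}% outliers")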
Results:
Visualization of DBSCAN Clustering Algorithm
Task 2
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual file path
# Step 2: Preprocess the data
df.replace('Blank', np.nan, inplace=True)  # Handle missing values
df = df.apply(pd.to_numeric, errors='coerce')  # Convert all columns to numeric
df.fillna(df.median(), inplace=True)  # Fill missing values with the median
# Select relevant columns
features = ['min_temp', 'max_temp', 'rainfall', 'wind_speed3pm', 'humidity3pm', 'pressure3pm', 'cloud_cover']
data = df[features]
# Step 3: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# --- Distance-based Outlier Detection ---
def calculate_distance_outlier_scores(data, threshold=2):
    # Calculate pairwise distances using Euclidean distance
    distances = cdist(data, data, metric='euclidean')
    # Calculate the mean distance for each point
    mean_distances = distances.mean(axis=1)
    # Outlier scores based on the mean distances, normalized to the range [0, 1]
    outlier_scores = mean_distances / mean_distances.max()
    return outlier_scores

# --- Density-based Outlier Detection ---
def calculate_density_outlier_scores(data, bandwidth=0.5):
    # Use KernelDensity to estimate density
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(data)
    # Get the log of the density for each point
    log_density = kde.score_samples(data)
    # Convert to outlier scores (higher log_density = less likely to be an outlier);
    # scaled so the densest (most normal) point gets a score of 1
    outlier_scores = -log_density / -log_density.max()
    return outlier_scores

# Step 4: Calculate OLS for both methods
distance_outlier_scores = calculate_distance_outlier_scores(scaled_data)
density_outlier_scores = calculate_density_outlier_scores(scaled_data)
# Add OLS to the dataframe
df['distance_OLS'] = distance_outlier_scores
df['density_OLS'] = density_outlier_scores
# Step 5: Sort the dataset by OLS scores and analyze the top/bottom examples
df_sorted_distance = df.sort_values(by='distance_OLS', ascending=False)
df_sorted_density = df.sort_values(by='density_OLS', ascending=False)
# Top 3 likely outliers
print("Top 3 outliers based on distance-based OLS:")
print(df_sorted_distance.head(3))
print("Top 3 outliers based on density-based OLS:")
print(df_sorted_density.head(3))
# Bottom example (most normal)
print("Most normal (bottom) based on distance-based OLS:")
print(df_sorted_distance.tail(1))
print("Most normal (bottom) based on density-based OLS:")
print(df_sorted_density.tail(1))
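A common refinement of the mean-distance score is to use the distance to the k-th nearest neighbour, which is less dominated by the overall spread of the data. The sketch below is an optional alternative (not part of the graded code); it assumes scaled_data and df from above, and k=5 is an arbitrary illustrative choice.

from sklearn.neighbors import NearestNeighbors

# Sketch: distance to the k-th nearest neighbour as an alternative distance-based OLS
k = 5
nn = NearestNeighbors(n_neighbors=k + 1).fit(scaled_data)  # +1 because each point is its own nearest neighbour
knn_distances, _ = nn.kneighbors(scaled_data)
knn_ols = knn_distances[:, k] / knn_distances[:, k].max()  # normalize to [0, 1]
df['knn_distance_OLS'] = knn_ols
print(df.sort_values('knn_distance_OLS', ascending=False).head(3))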
Results of the Outlier Detection Techniques
Top 3 outliers based on distance-based OLS:
index  min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
 2114      73.0      83.0      20.6           20.0         94.0        29.39     75.0          4.0      1.000000     2.000083
 1212      64.0      74.0      18.8            6.0         82.0        29.97     74.0          4.0      0.889223     1.988138
 3470      73.0      78.0      18.2            7.0         96.0        29.99     75.0          4.0      0.866567     1.988138

Top 3 outliers based on density-based OLS:
index  min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
 2114      73.0      83.0      20.6           20.0         94.0        29.39     75.0          4.0      1.000000     2.000083
 3024       0.0      79.0       1.2            7.0         88.0        29.83     77.0         15.0      0.383273     2.000083
 2815      75.0      81.0      11.1           22.0         90.0        29.74     77.0          4.0      0.560338     2.000083

Most normal (bottom) based on distance-based OLS:
index  min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
 2128      63.0      81.0       0.0           10.0         54.0        29.93     80.0          4.0      0.132443      1.27591

Most normal (bottom) based on density-based OLS:
index  min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
 1353      76.0      91.0       0.0           10.0         55.0        29.89     90.0          6.0      0.149656          1.0
Code for Visualizing the Results:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual file path
# Step 2: Preprocess the data
df.replace('Blank', np.nan, inplace=True)  # Handle missing values
df = df.apply(pd.to_numeric, errors='coerce')  # Convert all columns to numeric
df.fillna(df.median(), inplace=True)  # Fill missing values with the median
# Select relevant columns
features = ['min_temp', 'max_temp', 'rainfall', 'wind_speed3pm', 'humidity3pm', 'pressure3pm', 'cloud_cover']
data = df[features]
# Step 3: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# --- Distance-based Outlier Detection ---
def calculate_distance_outlier_scores(data, threshold=2):
    # Calculate pairwise distances using Euclidean distance
    distances = cdist(data, data, metric='euclidean')
    # Calculate the mean distance for each point
    mean_distances = distances.mean(axis=1)
    # Outlier scores based on the mean distances, normalized to the range [0, 1]
    outlier_scores = mean_distances / mean_distances.max()
    return outlier_scores

# --- Density-based Outlier Detection ---
def calculate_density_outlier_scores(data, bandwidth=0.5):
    # Use KernelDensity to estimate density
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(data)
    # Get the log of the density for each point
    log_density = kde.score_samples(data)
    # Convert to outlier scores (higher log_density = less likely to be an outlier);
    # scaled so the densest (most normal) point gets a score of 1
    outlier_scores = -log_density / -log_density.max()
    return outlier_scores

# Step 4: Calculate OLS for both methods
distance_outlier_scores = calculate_distance_outlier_scores(scaled_data)
density_outlier_scores = calculate_density_outlier_scores(scaled_data)
# Add OLS to the dataframe
df['distance_OLS'] = distance_outlier_scores
df['density_OLS'] = density_outlier_scores
# Step 5: Sort the dataset by OLS scores and analyze the top/bottom examples
df_sorted_distance = df.sort_values(by='distance_OLS', ascending=False)
df_sorted_density = df.sort_values(by='density_OLS', ascending=False)
# Top 3 likely outliers
print("Top 3 outliers based on distance-based OLS:")
print(df_sorted_distance.head(3))
print("Top 3 outliers based on density-based OLS:")
print(df_sorted_density.head(3))
# Bottom example (most normal)
print("Most normal (bottom) based on distance-based OLS:")
print(df_sorted_distance.tail(1))
print("Most normal (bottom) based on density-based OLS:")
print(df_sorted_density.tail(1))
Visualization Results
Comparison Between Techniques
Code:
import matplotlib.pyplot as plt
import seaborn as sns
# Step 6: Visualize the OLS scores
# Plot the distributions of the distance-based and density-based OLS
plt.figure(figsize=(14, 6))
# Distance-based OLS distribution
plt.subplot(1, 2, 1)
sns.histplot(df['distance_OLS'], kde=True, color='blue', bins=30)
plt.title('Distribution of Distance-based OLS Scores')
plt.xlabel('Distance-based OLS Score')
plt.ylabel('Frequency')
# Density-based OLS distribution
plt.subplot(1, 2, 2)
sns.histplot(df['density_OLS'], kde=True, color='green', bins=30)
plt.title('Distribution of Density-based OLS Scores')
plt.xlabel('Density-based OLS Score')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Step 7: Visualize the top 3 outliers and the bottom example
# Top 3 outliers based on distance-based OLS
top_3_distance_outliers = df_sorted_distance.head(3)
top_3_distance_outliers = top_3_distance_outliers[features + ['distance_OLS']]
# Top 3 outliers based on density-based OLS
top_3_density_outliers = df_sorted_density.head(3)
top_3_density_outliers = top_3_density_outliers[features + ['density_OLS']]
# Plot top 3 distance-based outliers
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_distance_outliers,
                color='red', s=100, label='Top 3 Distance-based Outliers')
plt.title('Top 3 Distance-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
# Plot top 3 density-based outliers
plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_density_outliers,
                color='orange', s=100, label='Top 3 Density-based Outliers')
plt.title('Top 3 Density-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
plt.tight_layout()
plt.show()
# Step 8: Scatter plot comparing distance-based and density-based OLS scores
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['distance_OLS'], y=df['density_OLS'], color='purple')
plt.title('Comparison of Distance-based vs Density-based OLS Scores')
plt.xlabel('Distance-based OLS Score')
plt.ylabel('Density-based OLS Score')
plt.tight_layout()
plt.show()
# Step 9: Visualize the most normal day (bottom example) for both OLS methods
# Most normal day based on distance-based OLS
most_normal_distance = df_sorted_distance.tail(1)
# Most normal day based on density-based OLS
most_normal_density = df_sorted_density.tail(1)
# Plot the most normal day for both distance-based and density-based OLS
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_distance,
                color='blue', s=100, label='Most Normal (Distance-based)')
plt.title('Most Normal Day (Distance-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_density,
                color='green', s=100, label='Most Normal (Density-based)')
plt.title('Most Normal Day (Density-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
plt.tight_layout()
plt.show()
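To put a number on the agreement visible in the comparison scatter plot, the two OLS rankings can be compared with a rank correlation. This is an optional sketch (not part of the code above); it assumes the df with the distance_OLS and density_OLS columns, and the top-1% cutoff is an illustrative choice.

from scipy.stats import spearmanr

# Sketch: Spearman rank correlation between the two outlier scores
# (a value near 1 means both methods rank the days similarly)
rho, p_value = spearmanr(df['distance_OLS'], df['density_OLS'])
print(f"Spearman correlation between distance-based and density-based OLS: {rho:.3f} (p={p_value:.3g})")

# Overlap between the days flagged in the top 1% by each method
n_top = max(1, len(df) // 100)
top_distance = set(df['distance_OLS'].nlargest(n_top).index)
top_density = set(df['density_OLS'].nlargest(n_top).index)
print(f"Days flagged by both methods in the top {n_top}: {len(top_distance & top_density)}")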
Results