Assignment 6
Set A
1. Write a Python program to implement the k-means algorithm to build a prediction model.
(Use the Credit Card Dataset CC GENERAL.csv; download from kaggle.com.)
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
# Load the dataset
df = pd.read_csv('CC GENERAL.csv')
# Display the first few rows of the dataset
print(df.head())
# Check for missing values and drop rows with NaN values
df.dropna(inplace=True)
# Selecting relevant features for clustering (you can choose which features to use)
features = df.drop(columns=['CUST_ID']).values # Using .values to get numpy array
# K-Means Clustering Implementation
def kmeans(X, k, max_iters=100):
    # Randomly initialize the centroids by picking k distinct data points
    centroids = X[nm.random.choice(X.shape[0], k, replace=False)]
    for _ in range(max_iters):
        # Calculate distances from every data point to every centroid
        distances = nm.linalg.norm(X[:, nm.newaxis] - centroids, axis=2)
        # Assign each point to its closest centroid
        labels = nm.argmin(distances, axis=1)
        # Recompute each centroid as the mean of its assigned points
        # (note: an empty cluster would produce NaN here)
        new_centroids = nm.array([X[labels == i].mean(axis=0) for i in range(k)])
        # Stop once the centroids have converged
        if nm.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return labels, centroids
# Specify the number of clusters
k = 4 # You can adjust this based on your analysis
# Run K-Means
labels, centroids = kmeans(features, k)
# Add cluster labels to the dataframe
df['Cluster'] = labels
# Display the first few rows with cluster labels
print(df.head())
# Optional: Plotting clusters (use only 2 features for visualization)
mtp.scatter(features[:, 0], features[:, 1], c=labels, cmap='viridis',
            marker='o', edgecolor='k')
mtp.scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, alpha=0.75,
            marker='X')  # Plot centroids
mtp.title('K-Means Clustering of Credit Card Data')
mtp.xlabel('Feature 1')
mtp.ylabel('Feature 2')
mtp.show()
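The choice k = 4 above is arbitrary. A common heuristic for picking k is the elbow method: run k-means for several values of k, plot the within-cluster sum of squared errors (inertia), and look for the "elbow" where the curve flattens. Below is a minimal sketch reusing the kmeans() function and features array defined above; the inertia() helper is added here purely for illustration and is not part of the original program.

# Elbow method sketch (inertia() is an illustrative helper, an assumption here)
def inertia(X, labels, centroids):
    # Sum of squared distances from each point to its assigned centroid
    return sum(((X[labels == i] - c) ** 2).sum() for i, c in enumerate(centroids))

ks = range(2, 9)
sse = []
for kk in ks:
    lbl, cen = kmeans(features, kk)
    sse.append(inertia(features, lbl, cen))
mtp.plot(list(ks), sse, marker='o')
mtp.title('Elbow Method for Choosing k')
mtp.xlabel('Number of clusters k')
mtp.ylabel('Within-cluster SSE (inertia)')
mtp.show()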
2. Write a Python program to implement the hierarchical agglomerative clustering algorithm.
(Download the Customer.csv dataset from github.com.)
url ---> https://gist.github.com/akuks/2e9b08cebef0181b583a1dff4a97f8a1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
# Load the dataset (use a local copy, or the raw URL from the gist above)
url = 'Customer.csv'  # Replace with the actual URL or file path
df = pd.read_csv(url)
# Display the first few rows of the dataset
print(df.head())
# Check for missing values
print(df.isnull().sum())
# Drop rows with missing values (if any)
df.dropna(inplace=True)
# Convert DOB to age - specify the date format (day first)
df['DOB'] = pd.to_datetime(df['DOB'], format='%d/%m/%y %H:%M', dayfirst=True)
df['Age'] = (pd.Timestamp.now() - df['DOB']).dt.days // 365  # Age in whole years
# Select relevant features for clustering
features = df[['Age', 'Gender']]
# One-hot encode the categorical 'Gender' feature
features_encoded = pd.get_dummies(features, columns=['Gender'], drop_first=True)
# Check the resulting DataFrame after encoding
print("Encoded Features:\n", features_encoded.head())
print("Columns after Encoding:", features_encoded.columns.tolist())
# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_encoded)
# Perform Hierarchical Agglomerative Clustering
model = AgglomerativeClustering(n_clusters=4)  # Adjust the number of clusters as needed
labels = model.fit_predict(features_scaled)
# Add cluster labels to the original dataframe
df['Cluster'] = labels
# Display the first few rows with cluster labels
print(df.head())
# Check the unique labels
print("Unique Cluster Labels:", df['Cluster'].unique())
# Optional: Plotting the clusters (use only Age and one gender column for visualization)
gender_column = features_encoded.columns[1]  # Assuming the first column is 'Age'
plt.figure(figsize=(10, 6))
plt.scatter(df['Age'], features_encoded[gender_column], c=labels, cmap='viridis',
            marker='o', edgecolor='k')
plt.title('Hierarchical Agglomerative Clustering of Customers')
plt.xlabel('Age')
plt.ylabel(gender_column) # Update the y-label to match the gender column
plt.colorbar(label='Cluster')
plt.grid(True) # Add grid for better readability
plt.show()
# Optional: Dendrogram for visualizing hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage
plt.figure(figsize=(10, 6))
linked = linkage(features_scaled, 'ward')
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.grid(True) # Add grid for better readability
plt.show()
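The choice of n_clusters=4 above was arbitrary. One way to sanity-check it is scikit-learn's silhouette score, which rewards tight, well-separated clusters (values closer to 1 are better). A minimal sketch, assuming the features_scaled array from above:

from sklearn.metrics import silhouette_score

# Compare a few cluster counts; prefer the one with the highest score
for n in range(2, 7):
    lbl = AgglomerativeClustering(n_clusters=n).fit_predict(features_scaled)
    print(f'n_clusters={n}: silhouette={silhouette_score(features_scaled, lbl):.3f}')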
Set B
1. Write a Python program to implement the k-means algorithm on a synthetic dataset.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Step 1: Generate a synthetic dataset
from sklearn.datasets import make_blobs
# Generate synthetic data with 3 clusters
n_samples = 300
n_features = 2
n_clusters = 3
random_state = 42
X, y = make_blobs(n_samples=n_samples, centers=n_clusters,
                  n_features=n_features, random_state=random_state)
# Convert to a DataFrame for easier manipulation
data = pd.DataFrame(X, columns=['Feature_1', 'Feature_2'])
# Step 2: Visualize the synthetic dataset
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature_1'], data['Feature_2'], s=30)
plt.title('Synthetic Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid()
plt.show()
# Step 3: Implement K-Means algorithm
def k_means(X, n_clusters, n_iterations=100):
    # Step 3.1: Randomly initialize the centroids from the data points
    centroids = X.sample(n_clusters).to_numpy()
    for _ in range(n_iterations):
        # Step 3.2: Assign each point to its closest centroid
        distances = np.linalg.norm(X.to_numpy()[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)
        # Step 3.3: Update centroids as the mean of the assigned points
        new_centroids = np.array([X.to_numpy()[labels == k].mean(axis=0)
                                  for k in range(n_clusters)])
        # Step 3.4: Stop once the centroids have converged
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return labels, centroids
# Step 4: Run K-Means algorithm
labels, centroids = k_means(data, n_clusters)
# Step 5: Visualize the clustering results
plt.figure(figsize=(10, 6))
plt.scatter(data['Feature_1'], data['Feature_2'], c=labels, s=30, cmap='viridis',
            marker='o', edgecolor='k')
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, alpha=0.75,
            marker='X')  # Centroids
plt.title('K-Means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid()
plt.show()
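As a sanity check on the from-scratch implementation, the same data can be clustered with scikit-learn's KMeans; the resulting centroids should land in roughly the same places, although the cluster numbering may differ. A minimal sketch:

from sklearn.cluster import KMeans

# Fit scikit-learn's KMeans on the same synthetic data for comparison
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
sk_labels = km.fit_predict(data)
print('Scratch centroids:\n', centroids)
print('sklearn centroids:\n', km.cluster_centers_)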
2. Write a Python program to implement a hierarchical clustering algorithm.
(Download the Wholesale customers data dataset from github.com.)
url ----> https://github.com/TrainingByPackt/Data-Science-with-Python/blob/master/Chapter01/Data/Wholesale%20customers%20data.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
# Step 1: Load the dataset
# Use a local copy of the CSV, or replace with the raw GitHub URL above
url = 'wholesale-customer.csv'
df = pd.read_csv(url)
# Step 2: Preprocessing the data
# Dropping non-numeric columns (Channel and Region)
data = df.drop(['Channel', 'Region'], axis=1)
# Step 3: Perform Hierarchical Clustering
# Linkage matrix
Z = linkage(data, method='ward')
# Step 4: Plotting the Dendrogram
plt.figure(figsize=(12, 8))
dendrogram(Z, truncate_mode='level', p=3)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Customers')
plt.ylabel('Distance')
plt.grid()
plt.show()
# Step 5: Form clusters
# Define the number of clusters (e.g., 3)
n_clusters = 3
clusters = fcluster(Z, n_clusters, criterion='maxclust')
# Step 6: Add cluster labels to the original DataFrame
df['Cluster'] = clusters
# Step 7: Visualize the clusters
# For visualization, let's plot the first two features (Fresh and Milk)
plt.figure(figsize=(10, 6))
plt.scatter(df['Fresh'], df['Milk'], c=df['Cluster'], cmap='viridis', s=100)
plt.title('Hierarchical Clustering of Wholesale Customers')
plt.xlabel('Fresh Products')
plt.ylabel('Milk Products')
plt.grid()
plt.colorbar(label='Cluster')
plt.show()
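Note that the wholesale features are annual spending amounts on very different scales, and Ward linkage works on Euclidean distances, so the largest-valued columns dominate the clustering above. An optional refinement (not part of the assignment) is to standardize the features first; a minimal sketch:

from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean / unit variance before linkage,
# so no single spending category dominates the distances
data_scaled = StandardScaler().fit_transform(data)
Z_scaled = linkage(data_scaled, method='ward')
clusters_scaled = fcluster(Z_scaled, n_clusters, criterion='maxclust')
print(pd.Series(clusters_scaled).value_counts())  # Cluster sizes after scaling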
Set C
1. Write a Python program to implement agglomerative clustering
(use the inbuilt Iris dataset).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
# Step 1: Load the Iris dataset
iris = load_iris()
X = iris.data # Features
y = iris.target # True labels (for comparison)
# Step 2: Perform Agglomerative Clustering
# Choose the number of clusters (e.g., 3)
n_clusters = 3
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
clusters = agg_clustering.fit_predict(X)
# Step 3: Reduce dimensions for visualization (using PCA)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Step 4: Plot the clusters
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', marker='o',
            edgecolor='k', s=100)
plt.title('Agglomerative Clustering on Iris Dataset')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid()
plt.colorbar(label='Cluster Label')
plt.show()
# Optional: Compare with true labels
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', marker='o',
            edgecolor='k', s=100)
plt.title('True Labels of Iris Dataset')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid()
plt.colorbar(label='True Label')
plt.show()
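Since the true species labels are available here, the agreement between the clustering and the labels can be quantified. The adjusted Rand index is convenient because it is invariant to how the cluster numbers are permuted. A minimal sketch:

from sklearn.metrics import adjusted_rand_score

# 1.0 = perfect agreement with the species labels, ~0.0 = random assignment
print('Adjusted Rand index:', adjusted_rand_score(y, clusters))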