0% found this document useful (0 votes)
18 views4 pages

Clustering Tutorial

Uploaded by

Koustubh Hire
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
18 views4 pages

Clustering Tutorial

Uploaded by

Koustubh Hire
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

9/16/25, 3:47 PM clustering_tutorial.ipynb - Colab

# Customer segmentation demo: cluster 20 sample customers by age and
# spending score with K-Means (k=4) and draw the clusters with their centroids.

# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Step 2: Create a sample dataset (20 customers instead of 10)
data = {
    'CustomerID': list(range(1, 21)),
    'Age': [19, 21, 20, 23, 31, 22, 35, 40, 52, 47,
            25, 29, 33, 45, 50, 28, 37, 42, 55, 60],
    'Annual_Income(k$)': [15, 15, 16, 16, 17, 17, 25, 40, 60, 55,
                          18, 20, 22, 35, 58, 30, 26, 45, 62, 70],
    'Spending_Score(1-100)': [39, 81, 6, 77, 40, 76, 50, 20, 10, 30,
                              65, 55, 25, 15, 12, 70, 45, 18, 8, 5],
}

df = pd.DataFrame(data)

print("Dataset Head:\n", df.head())

# Step 3: Select features for clustering (Age & Spending Score)
X = df[['Age', 'Spending_Score(1-100)']]

# Optional: Standardize the data so both features contribute on the same
# scale to the distances K-Means computes.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Run K-Means
# random_state fixes the initialisation so the labels are reproducible;
# n_init=10 is passed explicitly rather than relying on the library default.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

print("\nClustered Data:\n", df)

# Step 5: Visualize clusters
plt.figure(figsize=(5, 3))
colors = ['red', 'blue', 'green', 'magenta']  # one colour per cluster label 0..3
for cluster, color in enumerate(colors):
    cluster_points = df[df['Cluster'] == cluster]
    # Points are drawn in original (unscaled) units for readability.
    plt.scatter(cluster_points['Age'],
                cluster_points['Spending_Score(1-100)'],
                s=100, c=color, label=f'Cluster {cluster}')

# Plot cluster centers
# The model was fitted on scaled data, so map the centres back to
# original units before drawing them on the same axes as the points.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', marker='*', label='Centroids')

plt.xlabel("Age")
plt.ylabel("Spending Score (1-100)")
plt.title("Customer Segmentation using K-Means (20 Customers)")
# NOTE(review): the last two calls were unreadable in the extracted text;
# legend() + show() is the reconstruction implied by the labelled scatters.
plt.legend()
plt.show()

[Link] 1/4
9/16/25, 3:47 PM clustering_tutorial.ipynb - Colab

Dataset Head:
CustomerID Age Annual_Income(k$) Spending_Score(1-100)
0 1 19 15 39
1 2 21 15 81
2 3 20 16 6
3 4 23 16 77
4 5 31 17 40

Clustered Data:
CustomerID Age Annual_Income(k$) Spending_Score(1-100) Cluster
0 1 19 15 39 0
1 2 21 15 81 3
2 3 20 16 6 0
3 4 23 16 77 3
4 5 31 17 40 2
5 6 22 17 76 3
6 7 35 25 50 2
7 8 40 40 20 1
8 9 52 60 10 1
9 10 47 55 30 1
10 11 25 18 65 3
11 12 29 20 55 2
12 13 33 22 25 2
13 14 45 35 15 1
14 15 50 58 12 1
15 16 28 30 70 3
16 17 37 26 45 2
17 18 42 45 18 1
18 19 55 62 8 1
19 20 60 70 5 1

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster the 20 sample customers on three features with K-Means (k=3) and
# export both the labelled customers and the cluster centroids to CSV.
# Relies on pd, KMeans and StandardScaler imported at the top of this cell.

# Step 1: Create dataset (20 customers)
data = {
    'CustomerID': list(range(1, 21)),
    'Age': [19, 21, 20, 23, 31, 22, 35, 40, 52, 47,
            25, 29, 33, 45, 50, 28, 37, 42, 55, 60],
    'Annual_Income(k$)': [15, 15, 16, 16, 17, 17, 25, 40, 60, 55,
                          18, 20, 22, 35, 58, 30, 26, 45, 62, 70],
    'Spending_Score(1-100)': [39, 81, 6, 77, 40, 76, 50, 20, 10, 30,
                              65, 55, 25, 15, 12, 70, 45, 18, 8, 5],
}

df = pd.DataFrame(data)

# Step 2: Select features
# Kept in one list so the centroid table below is guaranteed to use the
# same column names, in the same order, as the clustered features.
feature_columns = ['Age', 'Annual_Income(k$)', 'Spending_Score(1-100)']
X = df[feature_columns]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Run K-Means
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Step 4: Get cluster centroids (convert back from scaled units)
centroids_scaled = kmeans.cluster_centers_
centroids = scaler.inverse_transform(centroids_scaled)
centroid_df = pd.DataFrame(centroids, columns=feature_columns)
# Row i of cluster_centers_ is the centre of label i, so the row position
# doubles as the cluster id.
centroid_df['Cluster'] = range(len(centroid_df))

# Step 5: Save both to CSV
# Save customers with their clusters
df.to_csv("customer_clusters.csv", index=False)

# Save cluster centroids
centroid_df.to_csv("cluster_centroids.csv", index=False)

print("✅ Files saved: customer_clusters.csv and cluster_centroids.csv")
print("\nCluster Centroids:\n", centroid_df)

✅ Files saved: customer_clusters.csv and cluster_centroids.csv

Cluster Centroids:
Age Annual_Income(k$) Spending_Score(1-100) Cluster
0 23.800000 19.200000 73.800000 0
1 48.875000 53.125000 14.750000 1
2 29.142857 20.142857 37.142857 2

# Imports for the manual, step-by-step K-Means demonstration.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

[Link] 2/4
9/16/25, 3:47 PM clustering_tutorial.ipynb - Colab

# Step 1: Create a sample dataset
data = {
    'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Age': [19, 21, 20, 23, 31, 22, 35, 40, 52, 47],
    'Annual_Income(k$)': [15, 15, 16, 16, 17, 17, 25, 40, 60, 55],
    'Spending_Score(1-100)': [39, 81, 6, 67, 40, 76, 50, 20, 10, 30],
}
df = pd.DataFrame(data)
# Only two features are used so every iteration can be drawn as a 2-D scatter.
X = df[['Age', 'Spending_Score(1-100)']]

# Standardize features
# `scaler` stays at module level: kmeans_static_plots reads it to convert
# centres back to original units for plotting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: K-Means manual iterations with plots
def kmeans_static_plots(X_scaled, X_original, n_clusters=3, max_iter=4):
    """Run K-Means by hand and show one scatter plot per iteration.

    Each iteration assigns every sample to its nearest centre (distances
    computed on ``X_scaled``), plots the assignment using ``X_original``
    coordinates, then moves each centre to the mean of its members.

    Args:
        X_scaled: 2-D NumPy array of standardized samples; used for all
            distance and mean computations.
        X_original: The same samples in original units, array-like with at
            least two columns; columns 0 and 1 are plotted.
        n_clusters: Number of centres to fit.
        max_iter: Number of assign/plot/update rounds to run. There is no
            convergence check; exactly this many figures are shown.

    Returns:
        None. The function only produces matplotlib figures.

    Note:
        Reads the module-level ``scaler`` (a fitted scaler exposing
        ``inverse_transform``) to draw centres in original units, and uses
        the module-level ``plt`` and ``np`` imports.
    """
    # Fixed seed so the randomly chosen starting centres, and therefore
    # every plot, are reproducible across runs.
    np.random.seed(99)
    # Initialize cluster centers randomly: pick n_clusters distinct samples.
    initial_idx = np.random.choice(len(X_scaled), n_clusters, replace=False)
    centers = X_scaled[initial_idx]

    # Accept any array-like (e.g. a DataFrame) for positional slicing below.
    X_original = np.asarray(X_original)
    colors = ['red', 'blue', 'green']

    for i in range(max_iter):
        # Assign clusters: broadcasting (n, 1, d) - (k, d) gives an
        # (n, k, d) array of differences; the norm over the last axis is
        # the distance of each sample to each centre.
        distances = np.linalg.norm(X_scaled[:, np.newaxis] - centers, axis=2)
        labels = np.argmin(distances, axis=1)

        # Plot
        plt.figure(figsize=(5, 4))
        for cluster in range(n_clusters):
            cluster_points = X_original[labels == cluster]
            # Cycle through the palette so n_clusters > len(colors) does
            # not raise IndexError.
            plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                        s=80, c=colors[cluster % len(colors)],
                        label=f'Cluster {cluster}')

        # Plot centers (converted back to original units to share the axes).
        centers_original = scaler.inverse_transform(centers)
        plt.scatter(centers_original[:, 0], centers_original[:, 1],
                    s=300, c='yellow', marker='*', label='Centroids')

        plt.title(f"K-Means Iteration {i+1}")
        plt.xlabel("Age")
        plt.ylabel("Spending Score (1-100)")
        # NOTE(review): the last two calls were unreadable in the extracted
        # text; legend() + show() is the reconstruction implied by the
        # labelled scatters.
        plt.legend()
        plt.show()

        # Update centers for next iteration
        new_centers = []
        for cluster in range(n_clusters):
            members = X_scaled[labels == cluster]
            if len(members) > 0:
                new_centers.append(members.mean(axis=0))
            else:
                # An empty cluster keeps its previous centre instead of
                # becoming NaN from the mean of zero rows.
                new_centers.append(centers[cluster])
        centers = np.array(new_centers)

# Step 3: Run
# Pass the unscaled features as a plain NumPy array: the function slices
# them positionally ([:, 0], [:, 1]) when plotting.
kmeans_static_plots(X_scaled, X.values, n_clusters=3, max_iter=4)


[Link] 3/4
9/16/25, 3:47 PM clustering_tutorial.ipynb - Colab

[Link] 4/4

You might also like