0% found this document useful (0 votes)

25 views13 pages

KMEANS

The document outlines a K-means clustering analysis on a dataset of 200 customers from a mall, focusing on their age, annual income, and spending score. It includes data loading, exploratory data analysis with visualizations, feature selection, scaling, and the application of the K-means algorithm to segment customers into clusters. The results include cluster visualization and a summary of average income and spending scores for each cluster.

Uploaded by

akanaguhari

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

25 views13 pages

KMEANS

Uploaded by

akanaguhari

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

You are on page 1/ 13

K-MEANS CLUSTERING

24CSEG034

2025-04-03
# Load libraries

library(ggplot2)
library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':

##
## filter, lag

## The following objects are masked from 'package:base':

##
## intersect, setdiff, setequal, union

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at

https://goo.gl/ve3WBa

# Load dataset
# file.choose() opens an interactive file picker, so this step needs a user
# to select the CSV; header = TRUE takes the column names from the first row.
mall_data <- read.csv(file.choose(), header = TRUE)
# Open the data frame in the spreadsheet-style viewer for a visual check.
View(mall_data)
# Per-column summary (quantiles and mean for numeric columns, length and
# class for character columns).
summary(mall_data)

## CustomerID Genre Age

Annual.Income..k..
## Min. : 1.00 Length:200 Min. :18.00 Min. : 15.00

## 1st Qu.: 50.75 Class :character 1st Qu.:28.75 1st Qu.: 41.50

## Median :100.50 Mode :character Median :36.00 Median : 61.50

## Mean :100.50 Mean :38.85 Mean : 60.56

## 3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00

## Max. :200.00 Max. :70.00 Max. :137.00

## Spending.Score..1.100.
## Min. : 1.00
## 1st Qu.:34.75
## Median :50.00
## Mean :50.20
## 3rd Qu.:73.00
## Max. :99.00

str(mall_data)

## 'data.frame': 200 obs. of 5 variables:

## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...

dim(mall_data)

## [1] 200 5

colnames(mall_data)

## [1] "CustomerID" "Genre" "Age"

## [4] "Annual.Income..k.." "Spending.Score..1.100."

head(mall_data)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76

sum(is.na(mall_data))

## [1] 0

sum(duplicated(mall_data))

## [1] 0

# Exploratory plots ----
# Each ggplot() call below is a top-level expression, so the plot is printed
# automatically when the script or chunk runs. Every title/axis label is kept
# inside a single-line string literal: a literal that wraps across source
# lines embeds the line break in the rendered label.

# Histogram: Age
ggplot(mall_data, aes(x = Age)) +
  geom_histogram(binwidth = 2, fill = "darkgreen", color = "white") +
  labs(title = "Distribution of Customer Ages", x = "Age", y = "Count") +
  theme_light()

# Histogram: Annual Income
ggplot(mall_data, aes(x = Annual.Income..k..)) +
  geom_histogram(binwidth = 5, fill = "darkorange", color = "white") +
  labs(
    title = "Distribution of Customer Annual Income",
    x = "Annual Income (k$)",
    y = "Count"
  ) +
  theme_minimal()

# Histogram: Spending Score
ggplot(mall_data, aes(x = Spending.Score..1.100.)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of Customer Spending Score",
    x = "Spending Score (1-100)",
    y = "Count"
  ) +
  theme_classic()

# Boxplot: Age
ggplot(mall_data, aes(y = Age)) +
  geom_boxplot(fill = "tomato", color = "black") +
  labs(title = "Boxplot of Age", y = "Age") +
  theme_bw()

# Boxplot: Annual Income
ggplot(mall_data, aes(y = Annual.Income..k..)) +
  geom_boxplot(fill = "skyblue", color = "black") +
  labs(title = "Boxplot of Annual Income", y = "Annual Income (k$)") +
  theme_light()

# Boxplot: Spending Score
ggplot(mall_data, aes(y = Spending.Score..1.100.)) +
  geom_boxplot(fill = "plum", color = "black") +
  labs(title = "Boxplot of Spending Score", y = "Spending Score (1-100)") +
  theme_minimal()

# Scatter plot: Annual Income vs Spending Score
# These are the two variables the later clustering step works on.
ggplot(
  mall_data,
  aes(x = Annual.Income..k.., y = Spending.Score..1.100.)
) +
  geom_point(color = "darkred", size = 3, alpha = 0.7) +
  labs(
    title = "Income vs Spending Score",
    x = "Annual Income (k$)",
    y = "Spending Score"
  ) +
  theme_classic()
# Customers who are strictly above the data-set median on BOTH annual income
# and spending score. The medians are computed over all rows of mall_data.
filtered_data <- filter(
  mall_data,
  Annual.Income..k.. > median(Annual.Income..k..) &
    Spending.Score..1.100. > median(Spending.Score..1.100.)
)
head(filtered_data)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

## 1 103 Male 67 62 59
## 2 104 Male 26 62 55
## 3 105 Male 49 62 56
## 4 111 Male 65 63 52
## 5 112 Female 19 63 54
## 6 118 Female 49 65 59

# Central tendency
# Mean of the Age column and median of the annual-income column.
mean_age <- mean(mall_data$Age)
median_income <- median(mall_data$Annual.Income..k..)
# Most frequent gender label. The column holding gender in this data set is
# named `Genre`; `mall_data$Gender` does not exist and evaluates to NULL,
# which makes the mode come out empty. table() counts each label, sort() puts
# the largest count first, and [1] keeps the name of that first entry.
mode_gender <- names(sort(table(mall_data$Genre), decreasing = TRUE))[1]

cat("Mean Age:", mean_age, "\n")

## Mean Age: 38.85

cat("Median Annual Income:", median_income, "\n")

## Median Annual Income: 61.5

cat("Most Common Gender:", mode_gender, "\n")

## Most Common Gender:

# Ten customers with the highest spending score: sort the whole table by
# spending score in descending order, then keep the first ten rows.
top_spending_customers <- head(
  arrange(mall_data, desc(Spending.Score..1.100.)),
  n = 10
)
print(top_spending_customers)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

## 1 12 Female 35 19 99
## 2 20 Female 35 23 98
## 3 146 Male 28 77 97
## 4 186 Male 30 99 97
## 5 128 Male 40 71 95
## 6 168 Female 33 86 95
## 7 8 Female 23 18 94
## 8 142 Male 32 75 93
## 9 164 Female 31 81 93
## 10 34 Male 18 33 92

# Clustering features: annual income and spending score only.
cluster_data <- mall_data[c("Annual.Income..k..", "Spending.Score..1.100.")]

# Centre and scale each feature column (scale() defaults) so both variables
# contribute comparably to the Euclidean distances used by k-means.
scaled_data <- scale(cluster_data)

# Elbow method
# Fit k-means for k = 1..10 and record the total within-cluster sum of squares
# (WCSS) of each fit; the "elbow" of the resulting curve suggests a value of k.
# kmeans() picks its starting centres at random, so seed the RNG first to get
# the same curve on every run.
set.seed(123)
k_values <- 1:10
# vapply() guarantees one numeric value per k (sapply()'s return type depends
# on its input).
wcss <- vapply(
  k_values,
  function(k) kmeans(scaled_data, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

# `linewidth` replaces the `size` aesthetic for lines, which is deprecated
# since ggplot2 3.4.0; `size` is still the right aesthetic for points.
ggplot(data.frame(K = k_values, WCSS = wcss), aes(x = K, y = WCSS)) +
  geom_line(color = "darkblue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  ggtitle("Elbow Method for Optimal K") +
  theme_light()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2

3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this
warning was
## generated.
# K-means clustering
# Seed the RNG so the 25 random starts are reproducible, then fit 5 clusters
# on the scaled income/spending features.
set.seed(123)
kmeans_model <- kmeans(x = scaled_data, centers = 5, nstart = 25)

# Attach each customer's cluster label to the original table as a factor.
mall_data[["Cluster"]] <- factor(kmeans_model$cluster)

head(mall_data)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

Cluster
## 1 1 Male 19 15 39
2
## 2 2 Male 21 15 81
3
## 3 3 Female 20 16 6
2
## 4 4 Female 23 16 77
3
## 5 5 Female 31 17 40
2
## 6 6 Female 22 17 76
3

# Cluster visualization
# Income against spending score on the original (unscaled) axes, with one
# colour per k-means cluster.
ggplot(
  mall_data,
  aes(x = Annual.Income..k.., y = Spending.Score..1.100., colour = Cluster)
) +
  geom_point(size = 3, alpha = 0.8) +
  labs(title = "Customer Segmentation Using K-means") +
  theme_minimal()

# Silhouette analysis
# Silhouette widths for the fitted cluster labels, using pairwise distances
# between the scaled observations (dist() defaults).
sil_score <- silhouette(
  x = kmeans_model$cluster,
  dist = dist(scaled_data)
)
# Cluster sizes, average width per cluster, and the overall distribution.
summary(sil_score)

## Silhouette of 200 units in 5 clusters from silhouette.default(x =

kmeans_model$cluster, dist = dist(scaled_data)) :
## Cluster sizes and average silhouette widths:
## 81 23 22 35 39
## 0.5978670 0.5105061 0.5982119 0.5052154 0.5107529
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02857 0.49313 0.59848 0.55466 0.66018 0.75907

fviz_silhouette(sil_score)

## cluster size ave.sil.width

## 1 1 81 0.60
## 2 2 23 0.51
## 3 3 22 0.60
## 4 4 35 0.51
## 5 5 39 0.51
# Cluster profile
# Mean annual income and mean spending score per cluster, one row per
# cluster label.
cluster_summary <- summarise(
  group_by(mall_data, Cluster),
  Average_Income = mean(Annual.Income..k..),
  Average_Spending_Score = mean(Spending.Score..1.100.)
)
print(cluster_summary)

## # A tibble: 5 × 3
## Cluster Average_Income Average_Spending_Score
## <fct> <dbl> <dbl>
## 1 1 55.3 49.5
## 2 2 26.3 20.9
## 3 3 25.7 79.4
## 4 4 88.2 17.1
## 5 5 86.5 82.1

ML Assignment No 5
No ratings yet
ML Assignment No 5
11 pages
Customer Clustering Analysis
No ratings yet
Customer Clustering Analysis
22 pages
Reading Data: #Importing Required Libraries
No ratings yet
Reading Data: #Importing Required Libraries
16 pages
K Means Clustering For Customer Data
No ratings yet
K Means Clustering For Customer Data
6 pages
Exploratory Data Analysis66
No ratings yet
Exploratory Data Analysis66
17 pages
Customer Segmentation Analysis
No ratings yet
Customer Segmentation Analysis
3 pages
Assignment ....
No ratings yet
Assignment ....
8 pages
Mall Customer Data Analysis PDF
No ratings yet
Mall Customer Data Analysis PDF
10 pages
Hierarchical Clustering in R
No ratings yet
Hierarchical Clustering in R
1 page
NN Model and Gap Statistic Analysis
80% (10)
NN Model and Gap Statistic Analysis
14 pages
K Means Clustering
100% (1)
K Means Clustering
10 pages
Customer Segmentation Analysis for Banking
100% (3)
Customer Segmentation Analysis for Banking
39 pages
Market Segmentation 25171
No ratings yet
Market Segmentation 25171
11 pages
Walmart - Ipynb - Colaboratory
No ratings yet
Walmart - Ipynb - Colaboratory
6 pages
Statistics
No ratings yet
Statistics
9 pages
BDA LabReport-9
No ratings yet
BDA LabReport-9
17 pages
Segmentation:Clustering: Krissie 2024-11-21
No ratings yet
Segmentation:Clustering: Krissie 2024-11-21
26 pages
Customer Spending Data Analysis
No ratings yet
Customer Spending Data Analysis
4 pages
Final Ca
No ratings yet
Final Ca
10 pages
Analysis
No ratings yet
Analysis
37 pages
Assignmnet 5
No ratings yet
Assignmnet 5
11 pages
Project 1 Alvaro Garcia
No ratings yet
Project 1 Alvaro Garcia
8 pages
Walmart - A Case Study
No ratings yet
Walmart - A Case Study
51 pages
Btech1010622 Lab4
No ratings yet
Btech1010622 Lab4
4 pages
Customer Segmentation via Clustering
100% (1)
Customer Segmentation via Clustering
15 pages
Consumer Spending Behavior Based On Different Categories - 5380
No ratings yet
Consumer Spending Behavior Based On Different Categories - 5380
3 pages
Teen Market Segmentation with K-Means
No ratings yet
Teen Market Segmentation with K-Means
6 pages
Data Mining
No ratings yet
Data Mining
27 pages
決策樹 R程式練習
No ratings yet
決策樹 R程式練習
11 pages
Ex No - 9
No ratings yet
Ex No - 9
10 pages
BigMart PDF
100% (1)
BigMart PDF
42 pages
Data Mining Project Report on Customer Segmentation
100% (2)
Data Mining Project Report on Customer Segmentation
26 pages
K-means Clustering Explained
No ratings yet
K-means Clustering Explained
27 pages
Exp 12 and 15
No ratings yet
Exp 12 and 15
4 pages
Stastistics and Probability With R Programming Language: Lab Report
67% (3)
Stastistics and Probability With R Programming Language: Lab Report
44 pages
ROC Curve Analysis for Customer Segmentation
100% (1)
ROC Curve Analysis for Customer Segmentation
48 pages
LP I Assignment A4 Clustering
No ratings yet
LP I Assignment A4 Clustering
13 pages
Data Mining Assignment: Sudhanva Saralaya
100% (1)
Data Mining Assignment: Sudhanva Saralaya
16 pages
Clustering Analysis: Prepared by Muralidharan N
100% (1)
Clustering Analysis: Prepared by Muralidharan N
16 pages
Unit 4
No ratings yet
Unit 4
42 pages
Klasteringggggggg
No ratings yet
Klasteringggggggg
7 pages
PT Mineblox
No ratings yet
PT Mineblox
28 pages
Decision Tree Analysis of Carseats Data
No ratings yet
Decision Tree Analysis of Carseats Data
7 pages
R Programs 2024-2025
No ratings yet
R Programs 2024-2025
13 pages
R Analysis of Buying Patterns
No ratings yet
R Analysis of Buying Patterns
3 pages
K-Means Clustering in R Analysis
No ratings yet
K-Means Clustering in R Analysis
1 page
Home Credit Default Risk Analysis
No ratings yet
Home Credit Default Risk Analysis
6 pages
K Means Clustering Customer Clustering
No ratings yet
K Means Clustering Customer Clustering
7 pages
Another Project-Creating Customer Segments
No ratings yet
Another Project-Creating Customer Segments
31 pages
Xii STD Practical 1 (1) 1
No ratings yet
Xii STD Practical 1 (1) 1
22 pages
23dscp206 Ex11
No ratings yet
23dscp206 Ex11
3 pages
K Means
No ratings yet
K Means
5 pages
Data Mining Assignment Guide
100% (1)
Data Mining Assignment Guide
21 pages
Python Machine Learning
No ratings yet
Python Machine Learning
19 pages
K-Means for Customer Segmentation
No ratings yet
K-Means for Customer Segmentation
13 pages
RAMESH
No ratings yet
RAMESH
10 pages
Data Science Project VI - Ipynb - Colaboratory
No ratings yet
Data Science Project VI - Ipynb - Colaboratory
15 pages
Intro Qugates
No ratings yet
Intro Qugates
4 pages
R Data Analysis Techniques
No ratings yet
R Data Analysis Techniques
9 pages
Discover Public Domain Books
No ratings yet
Discover Public Domain Books
639 pages
Modeling Diagrams
No ratings yet
Modeling Diagrams
96 pages
Marketing Research Methodological Foundations 10th Edition Test Bank
No ratings yet
Marketing Research Methodological Foundations 10th Edition Test Bank
6 pages
Database Reverse Engineering Based On Association Rule Mining
No ratings yet
Database Reverse Engineering Based On Association Rule Mining
6 pages
SQ L Performance Tuning
No ratings yet
SQ L Performance Tuning
15 pages
Homework #6
No ratings yet
Homework #6
2 pages
Big Data Basics for Beginners
No ratings yet
Big Data Basics for Beginners
51 pages
EDI-GENTRAN Mapping Course Launch
No ratings yet
EDI-GENTRAN Mapping Course Launch
1 page
Database Systems for Tech Experts
No ratings yet
Database Systems for Tech Experts
1 page
Understanding Entity Relationship Models
No ratings yet
Understanding Entity Relationship Models
17 pages
10 Iti
No ratings yet
10 Iti
3 pages
Module 3 - Analytics Techniques & Tools
No ratings yet
Module 3 - Analytics Techniques & Tools
74 pages
CT042-3-1-IDB-Week 4
No ratings yet
CT042-3-1-IDB-Week 4
26 pages
Cyber Gyan Virtual Internship
No ratings yet
Cyber Gyan Virtual Internship
9 pages
DocWorkflow Workbook
No ratings yet
DocWorkflow Workbook
85 pages
Literature Review Museum
100% (3)
Literature Review Museum
4 pages
Question Bank DBMS I
No ratings yet
Question Bank DBMS I
11 pages
Industry 4.0
No ratings yet
Industry 4.0
48 pages
MySQL - Beginner To Advance
No ratings yet
MySQL - Beginner To Advance
102 pages
MODEL QUESTION PAPER Dbms
100% (2)
MODEL QUESTION PAPER Dbms
3 pages
The Crowdsourced Guide To The KPMG Virtual Internship PDF
No ratings yet
The Crowdsourced Guide To The KPMG Virtual Internship PDF
15 pages
4 - DXAP2000PRE - r33 - ProdigySeriesInstallationProcedure
No ratings yet
4 - DXAP2000PRE - r33 - ProdigySeriesInstallationProcedure
40 pages
Key Advantages of GIS Technology
No ratings yet
Key Advantages of GIS Technology
17 pages
Sentiment Analysis of Movie Reviews
No ratings yet
Sentiment Analysis of Movie Reviews
39 pages
ORA-01555 Error: Causes and Solutions
No ratings yet
ORA-01555 Error: Causes and Solutions
9 pages
NLP Concepts and Techniques Guide
No ratings yet
NLP Concepts and Techniques Guide
15 pages
Himanshu Gupta Resume
No ratings yet
Himanshu Gupta Resume
2 pages
SAS® Visual Analytics 7.4 - User's Guide PDF
No ratings yet
SAS® Visual Analytics 7.4 - User's Guide PDF
680 pages
Library Services and Activity Calendar
No ratings yet
Library Services and Activity Calendar
6 pages
PregaBot PPT - R2
No ratings yet
PregaBot PPT - R2
18 pages

KMEANS

Uploaded by

KMEANS

Uploaded by

K-MEANS CLUSTERING

## The following objects are masked from 'package:stats':

## The following objects are masked from 'package:base':

## Welcome! Want to learn more? See two factoextra-related books at

## CustomerID Genre Age

## Median :100.50 Mode :character Median :36.00 Median : 61.50

## Mean :100.50 Mean :38.85 Mean : 60.56

## 3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00

## Max. :200.00 Max. :70.00 Max. :137.00

## 'data.frame': 200 obs. of 5 variables:

## [1] "CustomerID" "Genre" "Age"

## [4] "Annual.Income..k.." "Spending.Score..1.100."

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

labs(title = "Distribution of Customer Annual Income", x = "Annual

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

cat("Mean Age:", mean_age, "\n")

## Mean Age: 38.85

cat("Median Annual Income:", median_income, "\n")

## Median Annual Income: 61.5

## Most Common Gender:

# Top 10 spending customers

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

ggplot(data.frame(K = 1:10, WCSS = wcss), aes(x = K, y = WCSS)) +

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2

mall_data$Cluster <- as.factor(kmeans_model$cluster)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.

## Silhouette of 200 units in 5 clusters from silhouette.default(x =

## cluster size ave.sil.width

You might also like