K-MEANS CLUSTERING
24CSEG034
2025-04-03
# Load libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load dataset
# The CSV is picked through a file-chooser prompt; header = TRUE takes the
# column names from the first row of the file.
mall_data <- read.csv(file.choose(), header = TRUE)
# View() opens a spreadsheet-style viewer, which only makes sense with a user
# present, so it is skipped when the script runs non-interactively.
if (interactive()) {
  View(mall_data)
}
# Per-column summary statistics
summary(mall_data)
## CustomerID Genre Age Annual.Income..k..
## Min. : 1.00 Length:200 Min. :18.00 Min. : 15.00
## 1st Qu.: 50.75 Class :character 1st Qu.:28.75 1st Qu.: 41.50
## Median :100.50 Mode :character Median :36.00 Median : 61.50
## Mean :100.50 Mean :38.85 Mean : 60.56
## 3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00
## Max. :200.00 Max. :70.00 Max. :137.00
## Spending.Score..1.100.
## Min. : 1.00
## 1st Qu.:34.75
## Median :50.00
## Mean :50.20
## 3rd Qu.:73.00
## Max. :99.00
# Structure: column types plus a preview of the first values in each column
utils::str(mall_data)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
# Dimensions as (rows, columns)
dim(x = mall_data)
## [1] 200 5
# Column names
names(mall_data)
## [1] "CustomerID" "Genre" "Age"
## [4] "Annual.Income..k.." "Spending.Score..1.100."
# First six rows
head(mall_data, n = 6L)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
# Total number of missing cells, summed over all columns
sum(colSums(is.na(mall_data)))
## [1] 0
# Number of rows that repeat an earlier row
length(which(duplicated(mall_data)))
## [1] 0
# Histogram: Age
# Every label below is written on a single line: a line break inside a quoted
# label becomes part of the string and is drawn as a two-line label.
ggplot(mall_data, aes(x = Age)) +
  geom_histogram(binwidth = 2, fill = "darkgreen", color = "white") +
  labs(title = "Distribution of Customer Ages", x = "Age", y = "Count") +
  theme_light()
# Histogram: Annual Income
ggplot(mall_data, aes(x = Annual.Income..k..)) +
  geom_histogram(binwidth = 5, fill = "darkorange", color = "white") +
  labs(
    title = "Distribution of Customer Annual Income",
    x = "Annual Income (k$)",
    y = "Count"
  ) +
  theme_minimal()
# Histogram: Spending Score
ggplot(mall_data, aes(x = Spending.Score..1.100.)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of Customer Spending Score",
    x = "Spending Score (1-100)",
    y = "Count"
  ) +
  theme_classic()
# Boxplot: Age
# Axis labels are kept on a single line so that no line break ends up inside
# a label string.
ggplot(mall_data, aes(y = Age)) +
  geom_boxplot(fill = "tomato", color = "black") +
  labs(title = "Boxplot of Age", y = "Age") +
  theme_bw()
# Boxplot: Annual Income
ggplot(mall_data, aes(y = Annual.Income..k..)) +
  geom_boxplot(fill = "skyblue", color = "black") +
  labs(title = "Boxplot of Annual Income", y = "Annual Income (k$)") +
  theme_light()
# Boxplot: Spending Score
ggplot(mall_data, aes(y = Spending.Score..1.100.)) +
  geom_boxplot(fill = "plum", color = "black") +
  labs(
    title = "Boxplot of Spending Score",
    y = "Spending Score (1-100)"
  ) +
  theme_minimal()
# Scatter plot: annual income against spending score
ggplot(
  data = mall_data,
  mapping = aes(x = Annual.Income..k.., y = Spending.Score..1.100.)
) +
  geom_point(size = 3, alpha = 0.7, color = "darkred") +
  theme_classic() +
  ggtitle("Income vs Spending Score") +
  xlab("Annual Income (k$)") +
  ylab("Spending Score")
# Keep only customers whose income AND spending score both exceed the
# respective medians computed over the whole data set
filtered_data <- filter(
  mall_data,
  Annual.Income..k.. > median(Annual.Income..k..) &
    Spending.Score..1.100. > median(Spending.Score..1.100.)
)
head(filtered_data, n = 6L)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1 103 Male 67 62 59
## 2 104 Male 26 62 55
## 3 105 Male 49 62 56
## 4 111 Male 65 63 52
## 5 112 Female 19 63 54
## 6 118 Female 49 65 59
# Central tendency
mean_age <- mean(mall_data$Age)
median_income <- median(mall_data$Annual.Income..k..)
# The categorical column is named "Genre" in this data set (see the column
# listing above). `mall_data$Gender` does not exist and evaluates to NULL,
# which left the most-common value empty. which.max() returns the first
# level with the highest count.
mode_gender <- names(which.max(table(mall_data$Genre)))
cat("Mean Age:", mean_age, "\n")
## Mean Age: 38.85
cat("Median Annual Income:", median_income, "\n")
## Median Annual Income: 61.5
# NOTE: the recorded output line that follows was produced before the column
# name was corrected; re-run to refresh it.
cat("Most Common Gender:", mode_gender, "\n")
## Most Common Gender:
# Ten customers with the highest spending scores, highest first
top_spending_customers <- head(
  arrange(mall_data, desc(Spending.Score..1.100.)),
  n = 10
)
print(top_spending_customers)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1 12 Female 35 19 99
## 2 20 Female 35 23 98
## 3 146 Male 28 77 97
## 4 186 Male 30 99 97
## 5 128 Male 40 71 95
## 6 168 Female 33 86 95
## 7 8 Female 23 18 94
## 8 142 Male 32 75 93
## 9 164 Female 31 81 93
## 10 34 Male 18 33 92
# Clustering features: annual income and spending score only
cluster_data <- select(
  mall_data,
  Annual.Income..k..,
  Spending.Score..1.100.
)
# Standardise both columns (centre, then divide by the standard deviation) so
# neither one dominates the distance computation
scaled_data <- scale(cluster_data, center = TRUE, scale = TRUE)
# Elbow method
# kmeans() starts from randomly chosen centres, so the generator is seeded
# here to make the curve reproducible from run to run.
set.seed(123)
k_values <- 1:10
# Total within-cluster sum of squares for each candidate k. vapply() enforces
# exactly one numeric value per k, whereas sapply() infers the result type.
wcss <- vapply(
  k_values,
  function(k) {
    kmeans(scaled_data, centers = k, nstart = 25)$tot.withinss
  },
  numeric(1)
)
# `linewidth` replaces the `size` aesthetic for lines, which is deprecated
# since ggplot2 3.4.0; `size` is still correct for points.
ggplot(data.frame(K = k_values, WCSS = wcss), aes(x = K, y = WCSS)) +
  geom_line(color = "darkblue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  ggtitle("Elbow Method for Optimal K") +
  theme_light()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# K-means clustering: five clusters, 25 random starts, seeded for repeatability
set.seed(123)
kmeans_model <- kmeans(x = scaled_data, centers = 5, nstart = 25)
# Attach each customer's cluster label to the original data as a factor
mall_data[["Cluster"]] <- as.factor(kmeans_model[["cluster"]])
head(mall_data, n = 6L)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100. Cluster
## 1 1 Male 19 15 39 2
## 2 2 Male 21 15 81 3
## 3 3 Female 20 16 6 2
## 4 4 Female 23 16 77 3
## 5 5 Female 31 17 40 2
## 6 6 Female 22 17 76 3
# Cluster visualization: income vs spending score, coloured by cluster label
ggplot(
  data = mall_data,
  mapping = aes(
    x = Annual.Income..k..,
    y = Spending.Score..1.100.,
    color = Cluster
  )
) +
  geom_point(alpha = 0.8, size = 3) +
  theme_minimal() +
  ggtitle("Customer Segmentation Using K-means")
# Silhouette score
# Silhouette widths are computed from the k-means cluster assignments and the
# pairwise distances between the scaled observations.
sil_score <- silhouette(kmeans_model$cluster, dist(scaled_data))
# Prints cluster sizes, the average width per cluster, and summary statistics
# of the individual widths. The printout echoes the call above verbatim.
summary(sil_score)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = kmeans_model$cluster, dist = dist(scaled_data)) :
## Cluster sizes and average silhouette widths:
## 81 23 22 35 39
## 0.5978670 0.5105061 0.5982119 0.5052154 0.5107529
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02857 0.49313 0.59848 0.55466 0.66018 0.75907
# Silhouette plot; the call also prints a table of cluster size and average
# silhouette width per cluster.
fviz_silhouette(sil_score)
## cluster size ave.sil.width
## 1 1 81 0.60
## 2 2 23 0.51
## 3 3 22 0.60
## 4 4 35 0.51
## 5 5 39 0.51
# Cluster Summary: mean income and mean spending score within each cluster
cluster_summary <- summarise(
  group_by(mall_data, Cluster),
  Average_Income = mean(Annual.Income..k..),
  Average_Spending_Score = mean(Spending.Score..1.100.)
)
print(cluster_summary)
## # A tibble: 5 × 3
## Cluster Average_Income Average_Spending_Score
## <fct> <dbl> <dbl>
## 1 1 55.3 49.5
## 2 2 26.3 20.9
## 3 3 25.7 79.4
## 4 4 88.2 17.1
## 5 5 86.5 82.1