0% found this document useful (0 votes)
25 views13 pages

KMEANS

The document outlines a K-means clustering analysis on a dataset of 200 customers from a mall, focusing on their age, annual income, and spending score. It includes data loading, exploratory data analysis with visualizations, feature selection, scaling, and the application of the K-means algorithm to segment customers into clusters. The results include cluster visualization and a summary of average income and spending scores for each cluster.

Uploaded by

akanaguhari
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
25 views13 pages

KMEANS

The document outlines a K-means clustering analysis on a dataset of 200 customers from a mall, focusing on their age, annual income, and spending score. It includes data loading, exploratory data analysis with visualizations, feature selection, scaling, and the application of the K-means algorithm to segment customers into clusters. The results include cluster visualization and a summary of average income and spending scores for each cluster.

Uploaded by

akanaguhari
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 13

K-MEANS CLUSTERING

24CSEG034

2025-04-03
# Load libraries

library(ggplot2)
library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Load dataset
# file.choose() opens an interactive file picker, so this step needs an
# interactive session; the chosen CSV is read with its first row as headers.
mall_data <- read.csv(file.choose(), header = TRUE)
# Open the data in the spreadsheet-style viewer, then print per-column
# summary statistics.
View(mall_data)
summary(mall_data)

## CustomerID Genre Age Annual.Income..k..
## Min. : 1.00 Length:200 Min. :18.00 Min. : 15.00

## 1st Qu.: 50.75 Class :character 1st Qu.:28.75 1st Qu.: 41.50

## Median :100.50 Mode :character Median :36.00 Median : 61.50

## Mean :100.50 Mean :38.85 Mean : 60.56

## 3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00

## Max. :200.00 Max. :70.00 Max. :137.00

## Spending.Score..1.100.
## Min. : 1.00
## 1st Qu.:34.75
## Median :50.00
## Mean :50.20
## 3rd Qu.:73.00
## Max. :99.00

str(mall_data)

## 'data.frame': 200 obs. of 5 variables:


## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...

dim(mall_data)

## [1] 200 5

colnames(mall_data)

## [1] "CustomerID" "Genre" "Age"

## [4] "Annual.Income..k.." "Spending.Score..1.100."

head(mall_data)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.


## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76

sum(is.na(mall_data))

## [1] 0

sum(duplicated(mall_data))

## [1] 0

# Histogram: Age
# Customer ages counted in 2-year bins.
mall_data %>%
  ggplot(aes(Age)) +
  geom_histogram(binwidth = 2, color = "white", fill = "darkgreen") +
  ggtitle("Distribution of Customer Ages") +
  xlab("Age") +
  ylab("Count") +
  theme_light()
# Histogram: Annual Income
# Annual income (k$) counted in 5-unit bins.
# The x-axis label is kept on a single source line: a string literal split
# across lines embeds newline characters in the rendered label.
ggplot(mall_data, aes(x = Annual.Income..k..)) +
  geom_histogram(binwidth = 5, fill = "darkorange", color = "white") +
  labs(
    title = "Distribution of Customer Annual Income",
    x = "Annual Income (k$)",
    y = "Count"
  ) +
  theme_minimal()
# Histogram: Spending Score
# Spending score counted in 5-point bins.
mall_data %>%
  ggplot(aes(Spending.Score..1.100.)) +
  geom_histogram(binwidth = 5, color = "white", fill = "steelblue") +
  ggtitle("Distribution of Customer Spending Score") +
  xlab("Spending Score (1-100)") +
  ylab("Count") +
  theme_classic()
# Boxplot: Age
# Single box showing the spread of customer ages.
ggplot(mall_data) +
  geom_boxplot(aes(y = Age), color = "black", fill = "tomato") +
  ggtitle("Boxplot of Age") +
  ylab("Age") +
  theme_bw()
# Boxplot: Annual Income
# Single box showing the spread of annual income (k$).
ggplot(mall_data) +
  geom_boxplot(aes(y = Annual.Income..k..), color = "black", fill = "skyblue") +
  ggtitle("Boxplot of Annual Income") +
  ylab("Annual Income (k$)") +
  theme_light()
# Boxplot: Spending Score
# Single box showing the spread of the spending score.
# The y-axis label is kept on a single source line: a string literal split
# across lines embeds a newline character in the rendered label.
ggplot(mall_data, aes(y = Spending.Score..1.100.)) +
  geom_boxplot(fill = "plum", color = "black") +
  labs(
    title = "Boxplot of Spending Score",
    y = "Spending Score (1-100)"
  ) +
  theme_minimal()
# Scatter plot: annual income against spending score, one point per customer.
mall_data %>%
  ggplot(aes(Annual.Income..k.., Spending.Score..1.100.)) +
  geom_point(alpha = 0.7, size = 3, color = "darkred") +
  ggtitle("Income vs Spending Score") +
  xlab("Annual Income (k$)") +
  ylab("Spending Score") +
  theme_classic()
# Filter customers: above-median income & spending score
# Keep only rows that exceed the median on both measures, then preview them.
filtered_data <- filter(
  mall_data,
  Annual.Income..k.. > median(Annual.Income..k..) &
    Spending.Score..1.100. > median(Spending.Score..1.100.)
)
head(filtered_data)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.


## 1 103 Male 67 62 59
## 2 104 Male 26 62 55
## 3 105 Male 49 62 56
## 4 111 Male 65 63 52
## 5 112 Female 19 63 54
## 6 118 Female 49 65 59

# Central tendency
# Mean age and median annual income over all customers.
mean_age <- mean(mall_data$Age)
median_income <- median(mall_data$Annual.Income..k..)
# Most frequent gender value. The gender column in this dataset is named
# `Genre` (see colnames(mall_data)); `mall_data$Gender` does not exist and
# yields NULL, which left the mode empty. The `[1]` index must sit on the
# same line as the expression it subsets.
mode_gender <- names(sort(table(mall_data$Genre), decreasing = TRUE))[1]

cat("Mean Age:", mean_age, "\n")

## Mean Age: 38.85

cat("Median Annual Income:", median_income, "\n")

## Median Annual Income: 61.5


cat("Most Common Gender:", mode_gender, "\n")

## Most Common Gender:

# Top 10 spending customers
# Sort by spending score, highest first, and keep the first ten rows.
spending_desc <- desc(mall_data$Spending.Score..1.100.)
top_spending_customers <- head(arrange(mall_data, spending_desc), 10)
print(top_spending_customers)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.


## 1 12 Female 35 19 99
## 2 20 Female 35 23 98
## 3 146 Male 28 77 97
## 4 186 Male 30 99 97
## 5 128 Male 40 71 95
## 6 168 Female 33 86 95
## 7 8 Female 23 18 94
## 8 142 Male 32 75 93
## 9 164 Female 31 81 93
## 10 34 Male 18 33 92

# Feature selection
# Clustering uses only the income and spending-score columns.
cluster_data <- mall_data[, c("Annual.Income..k..", "Spending.Score..1.100.")]

# Scale features
# Center and scale each column so both contribute comparably to distances.
scaled_data <- scale(cluster_data)

# Elbow method
# Total within-cluster sum of squares (WCSS) for k = 1..10 clusters.
# kmeans() picks random starting centers, so the RNG is seeded here to make
# the elbow curve reproducible from run to run.
set.seed(123)
# vapply() guarantees one numeric value per k (sapply's return type is not
# fixed).
wcss <- vapply(
  1:10,
  function(k) kmeans(scaled_data, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

ggplot(data.frame(K = 1:10, WCSS = wcss), aes(x = K, y = WCSS)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(color = "darkblue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  ggtitle("Elbow Method for Optimal K") +
  theme_light()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# K-means clustering
# Seed the RNG immediately before fitting so the random starts (nstart = 25)
# and therefore the cluster numbering are reproducible.
set.seed(123)
# Fit 5 clusters on the scaled income / spending-score features.
kmeans_model <- kmeans(scaled_data, centers = 5, nstart = 25)

# Attach each customer's cluster assignment to the original data as a factor.
mall_data$Cluster <- as.factor(kmeans_model$cluster)


# Preview the data with the new Cluster column.
head(mall_data)

## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100. Cluster
## 1 1 Male 19 15 39 2
## 2 2 Male 21 15 81 3
## 3 3 Female 20 16 6 2
## 4 4 Female 23 16 77 3
## 5 5 Female 31 17 40 2
## 6 6 Female 22 17 76 3

# Cluster visualization
# Income vs spending score, with points colored by assigned cluster.
mall_data %>%
  ggplot(aes(Annual.Income..k.., Spending.Score..1.100., color = Cluster)) +
  geom_point(alpha = 0.8, size = 3) +
  labs(title = "Customer Segmentation Using K-means") +
  theme_minimal()

# Silhouette score
# silhouette() is given the cluster assignments and the pairwise distances
# between the scaled observations; summary() then reports cluster sizes and
# average silhouette widths. The printed summary echoes this call verbatim.
sil_score <- silhouette(kmeans_model$cluster, dist(scaled_data))
summary(sil_score)

## Silhouette of 200 units in 5 clusters from silhouette.default(x = kmeans_model$cluster, dist = dist(scaled_data)) :
## Cluster sizes and average silhouette widths:
## 81 23 22 35 39
## 0.5978670 0.5105061 0.5982119 0.5052154 0.5107529
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02857 0.49313 0.59848 0.55466 0.66018 0.75907

fviz_silhouette(sil_score)

## cluster size ave.sil.width


## 1 1 81 0.60
## 2 2 23 0.51
## 3 3 22 0.60
## 4 4 35 0.51
## 5 5 39 0.51
# Cluster Summary
# Per-cluster mean income and mean spending score.
by_cluster <- group_by(mall_data, Cluster)
cluster_summary <- summarise(
  by_cluster,
  Average_Income = mean(Annual.Income..k..),
  Average_Spending_Score = mean(Spending.Score..1.100.)
)
print(cluster_summary)

## # A tibble: 5 × 3
## Cluster Average_Income Average_Spending_Score
## <fct> <dbl> <dbl>
## 1 1 55.3 49.5
## 2 2 26.3 20.9
## 3 3 25.7 79.4
## 4 4 88.2 17.1
## 5 5 86.5 82.1

You might also like