##to create dataframe and import libraries
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
## Read the income dataset from the Kaggle input directory into a DataFrame.
# NOTE(review): the CSV file name was lost when this file was extracted;
# 'income.csv' is assumed -- confirm against the Kaggle dataset listing.
df = pd.read_csv('/kaggle/input/income-dataset-for-k-means/income.csv')
print(df)
## Check the first 5 rows.
# The result is deliberately NOT assigned back to `df`: doing so would
# truncate the data to five rows, and KMeans is later fitted with up to
# 9 clusters, which needs at least as many samples as clusters.
print(df.head())
## Check the basic statistics of the data.
print(df.describe())
# NOTE(review): the two attribute names below were lost in extraction;
# `shape` and `dtypes` are assumed -- confirm.
print(df.shape)
print(df.dtypes)
## Scatter plot of age against income.
plt.scatter(df.Age, df['Income($)'])
plt.xlabel('Age')
plt.ylabel('Income($)')
# Display the figure explicitly; when this file runs as a plain script
# (rather than in a notebook) nothing is shown until show() is called.
plt.show()
## Elbow method: fit KMeans for k = 1..9 and record the sum of squared
## error (SSE), read from the fitted model's `inertia_`, for each k.
sse = []
k_rng = range(1, 10)
# The feature columns do not change between iterations, so select them once.
features = df[['Age', 'Income($)']]
for k in k_rng:
    km = KMeans(n_clusters=k)
    km.fit(features)
    sse.append(km.inertia_)
## Print the SSE values, one per k in k_rng.
print(sse)
## Plot the elbow graph: SSE against the number of clusters K.
plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng, sse)
# Display the figure explicitly so the plot is also shown when this file
# runs as a plain script.
plt.show()
## Fit the final model with 3 clusters and predict the cluster index of
## every row.
# NOTE(review): presumably k=3 was read off the elbow graph -- confirm.
km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(df[['Age', 'Income($)']])
print(y_predicted)
## Store the predicted cluster number for each data point in a new column.
df['cluster'] = y_predicted
print(df.head())
## Check the cluster centers (columns follow the fitted feature order:
## Age, then Income($)).
print(km.cluster_centers_)
## Plot the data points coloured by their assigned cluster.
# Split the rows by the cluster label written into df['cluster'].
df1 = df[df.cluster == 0]
df2 = df[df.cluster == 1]
df3 = df[df.cluster == 2]
plt.scatter(df1.Age, df1['Income($)'], color='green')
plt.scatter(df2.Age, df2['Income($)'], color='red')
plt.scatter(df3.Age, df3['Income($)'], color='black')
# Overlay the fitted centroids: column 0 is Age, column 1 is Income($).
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)
plt.xlabel('Age')
plt.ylabel('Income ($)')
# NOTE(review): the final call's name was lost in extraction; legend() is
# assumed because the centroid series carries a label -- confirm.
plt.legend()
# Display the figure explicitly so the plot is also shown when this file
# runs as a plain script.
plt.show()