# ASSIGNMENT NUMBER 3 SOLUTION
#Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter)
# will list all files under the input directory.
import os
# Walk the Kaggle input directory tree and print the full path of every
# file found. os.walk yields nothing when the directory does not exist,
# so this prints nothing outside a Kaggle environment.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Install a catch-all "ignore" filter so that no warnings are shown from
# this point on.
import warnings
warnings.filterwarnings(action='ignore')
# Load the dataset from its CSV file.
data = '/content/Live.csv'
df = pd.read_csv(data)

# Remove the four columns the author treats as redundant.
df.drop(columns=['Column1', 'Column2', 'Column3', 'Column4'], inplace=True)

# Summarise the frame: column dtypes / non-null counts, then descriptive
# statistics of the numeric columns.
df.info()
df.describe()

# Inspect the distinct labels present in the status_type column.
df['status_type'].unique()

# Remove the identifier and timestamp columns, then preview the result.
df.drop(columns=['status_id', 'status_published'], inplace=True)
df.head()
# Feature matrix and target variable, with status_type label-encoded.
from sklearn.preprocessing import LabelEncoder

# NOTE(review): X is the very same object as df (no copy is made), so the
# encoding below also rewrites df, and the status_type target column
# remains part of the feature matrix -- confirm this is intended.
X = df
y = df['status_type']

# Learn the label -> integer mapping once, then apply it to both the
# feature column and the target vector.
le = LabelEncoder()
le.fit(X['status_type'])
X['status_type'] = le.transform(X['status_type'])
y = le.transform(y)
# K-Means model with two clusters
from sklearn.cluster import KMeans

# n_init is pinned explicitly: the library default changed from 10 to
# 'auto' in scikit-learn 1.4, so leaving it implicit makes the result
# depend on the installed version. 10 matches the elbow-method loop.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(X)
labels = kmeans.labels_

# Count the samples whose cluster index equals the encoded label.
# NOTE(review): cluster indices are assigned arbitrarily by K-Means, so a
# direct equality test against the encoded classes is only a rough
# indicator -- confirm this is the intended metric.
correct_labels = (y == labels).sum()
print("Result: %d out of %d samples were correctly labeled." %
      (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels / float(y.size)))
# K-Means model with 3 clusters
# n_init is pinned explicitly: the library default changed from 10 to
# 'auto' in scikit-learn 1.4, so leaving it implicit makes the result
# depend on the installed version. 10 matches the elbow-method loop.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X)
labels = kmeans.labels_

# Count the samples whose cluster index equals the encoded label.
# NOTE(review): cluster indices are assigned arbitrarily by K-Means, so a
# direct equality test against the encoded classes is only a rough
# indicator -- confirm this is the intended metric.
correct_labels = (y == labels).sum()
print("Result: %d out of %d samples were correctly labeled." %
      (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels / float(y.size)))
# Use the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans

# cs collects the inertia of a fitted model for each k from 1 to 10.
cs = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300,
                    n_init=10, random_state=0)
    kmeans.fit(X)
    cs.append(kmeans.inertia_)

# Plot inertia against the number of clusters; the "elbow" is the k after
# which the curve flattens.
plt.plot(range(1, 11), cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
# The plot above shows a kink at k=2, so k=2 can be considered a good
# number of clusters for this data.
# DATASET LINK: https://www.semanticscholar.org/paper/Dataset-on-usage-and-engagement-patterns-for-Live-Dehouche/c0ec91003f8bdce99a56fa60dc2d20268cc808b8