Tutorial 2 - Clustering (14/09/2018)
In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
import matplotlib.pyplot as plt
In [9]:
data = pd.read_csv("./driver_dataset.csv", sep='\t')
In [10]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
Driver_ID 4000 non-null int64
Distance_Feature 4000 non-null float64
Speeding_Feature 4000 non-null float64
dtypes: float64(2), int64(1)
memory usage: 93.8 KB
In [11]:
data.describe()
Out[11]:
Driver_ID Distance_Feature Speeding_Feature
count 4000.000 4000.000 4000.000
mean 3423312447.500 76.042 10.721
std 1154.845 53.470 13.709
min 3423310448.000 15.520 0.000
25% 3423311447.750 45.248 4.000
50% 3423312447.500 53.330 6.000
75% 3423313447.250 65.632 9.000
max 3423314447.000 244.790 100.000
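Note: Driver_ID is only an identifier, yet its spread (std ≈ 1155) is larger than that of either feature, so it would dominate Euclidean distances if clustered along with them. A minimal sketch of dropping it first (the column name is taken from the info() output above):
In [ ]:
# Keep only the two behavioural features; Driver_ID is an identifier, not a feature.
features = data.drop(columns=['Driver_ID'])
features.head()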
In [26]:
plt.scatter(data.iloc[:, 1:2], data.iloc[:, 2:3])
plt.xlabel(data.columns[1])
plt.ylabel(data.columns[2])
plt.show()
In [28]:
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(data)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
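Note: WCSS here is KMeans.inertia_, the within-cluster sum of squared distances of each point to its nearest centroid. A minimal sketch recomputing it by hand for one fitted model, as a sanity check:
In [ ]:
km = KMeans(n_clusters=4, init='k-means++', random_state=0).fit(data)
# squared distance from every point to every centroid, then the minimum per point
sq = ((data.values[:, None, :] - km.cluster_centers_[None, :, :]) ** 2).sum(axis=2)
print(sq.min(axis=1).sum(), km.inertia_)  # the two numbers should agree up to float error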
In [52]:
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=0)
y_kmeans = kmeans.fit_predict(data)
In [53]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (40, 40)
plt.scatter(data.iloc[:, 1], data.iloc[:, 2], c=y_kmeans)
Out[53]:
<matplotlib.collections.PathCollection at 0x7f381ee64ba8>
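Note: the fitted centroids can be overlaid on this scatter. Since the model was fitted on all three columns (Driver_ID included), each centre is 3-dimensional; a minimal sketch plotting only its two feature coordinates:
In [ ]:
centers = kmeans.cluster_centers_
plt.scatter(data.iloc[:, 1], data.iloc[:, 2], c=y_kmeans)
plt.scatter(centers[:, 1], centers[:, 2], c='red', marker='x', s=200)  # centroids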
In [47]:
from sklearn import preprocessing
# Performing Min-Max normalization
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(data.iloc[:, 1:])
dataN = pd.DataFrame(np_scaled)
dataN.head()
Out[47]:
0 1
0 0.243 0.280
1 0.161 0.250
2 0.214 0.270
3 0.175 0.220
4 0.170 0.250
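Note: MinMaxScaler rescales each column with x' = (x - min) / (max - min), mapping it onto [0, 1]. A minimal sketch checking the first row of dataN by hand:
In [ ]:
x = data.iloc[0, 1:]              # first row of the two features
lo = data.iloc[:, 1:].min()       # per-column minima
hi = data.iloc[:, 1:].max()       # per-column maxima
print((x - lo) / (hi - lo))       # should match dataN row 0: 0.243, 0.280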
In [50]:
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=0)
y2_kmeans = kmeans.fit_predict(dataN)
In [59]:
%matplotlib inline
plt.scatter(data.iloc[:, 1], data.iloc[:, 2], c=y2_kmeans)  # original features, coloured by clusters found on normalized data
Out[59]:
<matplotlib.collections.PathCollection at 0x7f381c32eda0>
In [ ]:
#DBSCAN STARTS
In [78]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.1, metric='euclidean', min_samples=5)
In [79]:
dbsc = dbscan.fit(data)
dbsc.labels_
Out[79]:
array([-1, -1, -1, ..., -1, -1, -1])
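Note: every label is -1, i.e. DBSCAN flags all 4000 points as noise: eps=0.1 is tiny next to the raw feature scales (Driver_ID alone spans roughly 4000 units). A common way to pick eps is a k-distance plot; a minimal sketch, using min_samples=5 as above:
In [ ]:
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=5).fit(data)
dist, _ = nn.kneighbors(data)          # distances include the point itself at 0
plt.plot(np.sort(dist[:, -1]))         # sorted distance to the 5th neighbour
plt.ylabel('5th-neighbour distance')   # the "knee" of this curve suggests eps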
In [80]:
plt.scatter(data.iloc[:, 1], data.iloc[:, 2], c=dbsc.labels_)
Out[80]:
<matplotlib.collections.PathCollection at 0x7f38142e7550>
In [81]:
dbsc = dbscan.fit(dataN)
dbsc.labels_
Out[81]:
array([0, 0, 0, ..., 1, 1, 1])
In [82]:
plt.scatter(data.iloc[:, 1], data.iloc[:, 2], c=dbsc.labels_)
Out[82]:
<matplotlib.collections.PathCollection at 0x7f381437b198>
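Note: on the normalized data DBSCAN does find structure (-1 still marks noise). A minimal sketch counting the clusters and noise points it returned:
In [ ]:
labels = dbsc.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # -1 is noise, not a cluster
print(n_clusters, 'clusters,', (labels == -1).sum(), 'noise points')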
In [66]:
# 'model' is not defined in the cells shown; the all -1 output below matches the
# earlier DBSCAN fit on the unnormalized data.
model.labels_
Out[66]:
array([-1, -1, -1, ..., -1, -1, -1])
In [ ]:
#AGGLOMERATIVE STARTS
In [67]:
from sklearn.cluster import AgglomerativeClustering as AC
aggclus = AC(n_clusters=4, affinity='euclidean', linkage='ward', compute_full_tree='auto')
y_aggclus = aggclus.fit_predict(data.iloc[:, 1:3])
In [68]:
y_aggclus
Out[68]:
array([3, 3, 3, ..., 1, 1, 1])
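Note: cluster label values are arbitrary, so the partitions from agglomerative clustering and k-means cannot be compared label-by-label; a permutation-invariant score such as the adjusted Rand index works instead. A minimal sketch, assuming y2_kmeans from the normalized k-means run above:
In [ ]:
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(y2_kmeans, y_aggclus))  # 1.0 would mean identical partitions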
In [69]:
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree
from scipy.cluster.hierarchy import fcluster
k = 4
linkage_matrix = linkage(dataN, "ward", metric="euclidean")
ddata = dendrogram(linkage_matrix, color_threshold=1.5)
In [83]:
ddata = dendrogram(linkage_matrix, color_threshold=1.5)
plt.figure(figsize=(5, 7))
Out[83]:
<Figure size 360x504 with 0 Axes>
<Figure size 360x504 with 0 Axes>
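Note: fcluster and cut_tree are imported above but never used; either one turns the linkage matrix into flat cluster labels for k = 4. A minimal sketch:
In [ ]:
flat = fcluster(linkage_matrix, t=k, criterion='maxclust')  # labels 1..k
flat2 = cut_tree(linkage_matrix, n_clusters=k).ravel()      # labels 0..k-1
print(np.unique(flat), np.unique(flat2))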