14/09/2018 Tutorial 2 - Clustering
In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
import matplotlib.pyplot as plt
In [9]:
data = pd.read_csv("./driver_dataset.csv", sep='\t')
In [10]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
Driver_ID 4000 non-null int64
Distance_Feature 4000 non-null float64
Speeding_Feature 4000 non-null float64
dtypes: float64(2), int64(1)
memory usage: 93.8 KB
In [11]:
data.describe()
Out[11]:
Driver_ID Distance_Feature Speeding_Feature
count 4000.000 4000.000 4000.000
mean 3423312447.500 76.042 10.721
std 1154.845 53.470 13.709
min 3423310448.000 15.520 0.000
25% 3423311447.750 45.248 4.000
50% 3423312447.500 53.330 6.000
75% 3423313447.250 65.632 9.000
max 3423314447.000 244.790 100.000
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 1/7
14/09/2018 Tutorial 2 - Clustering
In [26]:
plt.scatter(data.iloc[:,1:2], data.iloc[:,2:3])
plt.xlabel(data.columns.values[1])
plt.ylabel(data.columns.values[2])
plt.show()
In [28]:
wcss = []
for i in range(1,11):
kmeans = KMeans(n_clusters = i,init = 'k-means++',random_state = 0)
kmeans.fit(data)
wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of cluster')
plt.ylabel('WCSS')
plt.show()
In [52]:
kmeans = KMeans(n_clusters = 4,init = 'k-means++',random_state =0)
y_kmeans = kmeans.fit_predict(data)
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 2/7
14/09/2018 Tutorial 2 - Clustering
In [53]:
%matplotlib inline
plt.figsize=(40, 40)
plt.scatter(data.iloc[:,1],data.iloc[:,2], c=y_kmeans)
Out[53]:
<matplotlib.collections.PathCollection at 0x7f381ee64ba8>
In [47]:
from sklearn import preprocessing
#Performing Min_Max Normalization
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(data.iloc[:,1:])
dataN = pd.DataFrame(np_scaled)
dataN.head()
Out[47]:
0 1
0 0.243 0.280
1 0.161 0.250
2 0.214 0.270
3 0.175 0.220
4 0.170 0.250
In [50]:
kmeans = KMeans(n_clusters = 4,init = 'k-means++',random_state =0)
y2_kmeans = kmeans.fit_predict(dataN)
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 3/7
14/09/2018 Tutorial 2 - Clustering
In [59]:
%matplotlib inline
plt.scatter(data.iloc[:,1],data.iloc[:,2], c=y2_kmeans)
Out[59]:
<matplotlib.collections.PathCollection at 0x7f381c32eda0>
In [ ]:
#DBSCAN STARTS
In [78]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.1, metric='euclidean', min_samples=5)
In [79]:
dbsc = dbscan.fit(data)
dbsc.labels_
Out[79]:
array([-1, -1, -1, ..., -1, -1, -1])
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 4/7
14/09/2018 Tutorial 2 - Clustering
In [80]:
plt.scatter(data.iloc[:,1],data.iloc[:,2], c=dbsc.labels_)
Out[80]:
<matplotlib.collections.PathCollection at 0x7f38142e7550>
In [81]:
dbsc = dbscan.fit(dataN)
dbsc.labels_
Out[81]:
array([0, 0, 0, ..., 1, 1, 1])
In [82]:
plt.scatter(data.iloc[:,1],data.iloc[:,2], c=dbsc.labels_)
Out[82]:
<matplotlib.collections.PathCollection at 0x7f381437b198>
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 5/7
14/09/2018 Tutorial 2 - Clustering
In [66]:
model.labels_
Out[66]:
array([-1, -1, -1, ..., -1, -1, -1])
In [ ]:
#AGGLOMERATIVE STARTS
In [67]:
from sklearn.cluster import AgglomerativeClustering as AC
aggclus = AC(n_clusters = 4,affinity='euclidean',linkage='ward',compute_full_tree='
y_aggclus= aggclus.fit_predict(data.iloc[:,1:3])
In [68]:
y_aggclus
Out[68]:
array([3, 3, 3, ..., 1, 1, 1])
In [69]:
from scipy.cluster.hierarchy import dendrogram, linkage,cut_tree
from scipy.cluster.hierarchy import fcluster
k=4
linkage_matrix = linkage(dataN, "ward",metric="euclidean")
ddata=dendrogram(linkage_matrix,color_threshold=1.5)
In [83]:
ddata=dendrogram(linkage_matrix,color_threshold=1.5)
plt.figure(figsize=(5,7))
Out[83]:
<Figure size 360x504 with 0 Axes>
<Figure size 360x504 with 0 Axes>
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 6/7