CHAPTER 4 – CLUSTER ANALYSIS
1. PARTITIONING METHODS
Partitioning methods are a widely used family of clustering algorithms in data mining. They divide a dataset into K non-overlapping clusters, with each object assigned to exactly one cluster. K-Means, used in the program below, repeatedly assigns each point to its nearest centroid and recomputes the centroids until they stabilise.
PROGRAM
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
data = {'x': [25, 34, 22, 27, 33, 33, 31, 22, 35, 34, 67, 54, 57, 43, 50,
              57, 59, 52, 65, 47, 49, 48, 35, 33, 44, 45, 38, 43, 51, 46],
        'y': [79, 51, 53, 78, 59, 74, 73, 57, 69, 75, 51, 32, 40, 47, 53,
              36, 35, 58, 59, 50, 25, 20, 14, 12, 20, 5, 29, 27, 8, 7]}
df = pd.DataFrame(data, columns=['x', 'y'])
# K-means Clustering
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df)  # fixed random_state keeps the run reproducible
centroids = kmeans.cluster_centers_
print(centroids)
labels = kmeans.labels_
plt.scatter(df['x'], df['y'], c=labels.astype(float), s=50)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()
OUTPUT
The three centroid coordinates are printed, and a scatter plot of the points coloured by cluster is displayed with the centroids marked in red.
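The program above fixes K = 3 in advance. A common way to sanity-check that choice is the elbow method: run K-Means for several values of K and plot the within-cluster sum of squares (exposed by scikit-learn as the inertia_ attribute); the bend ("elbow") of the curve suggests a reasonable K. A minimal sketch, reusing the DataFrame df defined above (the range of K values is an arbitrary choice for illustration):
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Within-cluster sum of squares for K = 1..9 on the same df as above.
inertias = []
ks = range(1, 10)
for k in ks:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(df)
    inertias.append(km.inertia_)

plt.plot(list(ks), inertias, 'bo-')
plt.xlabel('Number of clusters K')
plt.ylabel('Inertia (within-cluster SSE)')
plt.title('Elbow Method')
plt.show()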
2. AGGLOMERATIVE CLUSTERING
Agglomerative clustering, commonly referred to as AGNES (AGglomerative NESting), works in a bottom-up manner: each observation starts as a single-element cluster (a leaf), and at each step the two most similar clusters are merged into a new, larger cluster (a node) until all points belong to one root cluster.
PROGRAM
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
X = np.array([[2, 8], [8, 15], [3, 6], [6, 9], [8, 7], [10, 10]])
# Agglomerative Clustering
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
# Ward linkage always uses Euclidean distance, so no metric argument is needed;
# the older affinity='euclidean' keyword has been removed in recent scikit-learn.
labels = cluster.fit_predict(X)
print(labels)
# Drawing Dendrograms
plt.figure(figsize=(10, 7))
plt.title("Employee Skill Dendrograms")
dend = shc.dendrogram(shc.linkage(X, method='ward'))
plt.show()
OUTPUT
[1 2 1 0 0 0]
A dendrogram of the six points is also displayed.
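The dendrogram only visualises the merge order. To cut the same hierarchy into a fixed number of flat clusters, SciPy's fcluster function can be applied to the linkage matrix. A minimal sketch using the same X as above (fcluster labels start at 1 and should correspond to the AgglomerativeClustering labels up to renaming):
import numpy as np
import scipy.cluster.hierarchy as shc

X = np.array([[2, 8], [8, 15], [3, 6], [6, 9], [8, 7], [10, 10]])
Z = shc.linkage(X, method='ward')                  # same linkage matrix as the dendrogram
flat = shc.fcluster(Z, t=3, criterion='maxclust')  # cut the tree into at most 3 clusters
print(flat)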
3. BALANCED ITERATIVE REDUCING AND CLUSTERING USING HIERARCHIES (BIRCH)
BIRCH summarises the dataset in a compact, in-memory clustering-feature (CF) tree, which makes it appropriate for very large datasets or for streaming data: it can find a good clustering solution with only a single scan of the data.
PROGRAM
from sklearn.cluster import Birch
X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
brc = Birch(branching_factor=50, n_clusters=None,
            threshold=0.5, compute_labels=True)  # n_clusters=None: the CF-tree's subclusters are returned as the final clusters
brc.fit(X)
print(brc.predict(X))
OUTPUT
[0 0 0 1 1 1]
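Because BIRCH keeps only the CF-tree summary, it can also be fed data incrementally, which is what makes the single-scan, streaming use case practical. A minimal sketch using scikit-learn's partial_fit on the same toy points, split into two chunks as if they arrived as a stream (the chunk boundary is arbitrary):
from sklearn.cluster import Birch

X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5)
brc.partial_fit(X[:3])   # first chunk of the "stream"
brc.partial_fit(X[3:])   # second chunk
print(brc.predict(X))    # expected to match the batch result, [0 0 0 1 1 1]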
4. DENSITY-BASED CLUSTERING (DBSCAN)
DBSCAN locates regions of high density that are separated from one another by regions of low density; points that fall in low-density regions and belong to no cluster are labelled as noise (label -1).
PROGRAM
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters)
print('Estimated number of noise points: %d' % n_noise)
print("Homogeneity: %0.3f" %
metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" %
metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" %
metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %
metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" %
metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(X, labels))
# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1,
len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core samples are drawn with large markers...
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
    # ...border (non-core) samples with small markers.
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters)
plt.show()
OUTPUT
Estimated number of clusters: 3
Estimated number of noise points: 18
Homogeneity: 0.953
Completeness: 0.883
V-measure: 0.917
Adjusted Rand Index: 0.952
Adjusted Mutual Information: 0.916
Silhouette Coefficient: 0.626
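The DBSCAN result is sensitive to eps. A common heuristic for choosing it is the k-distance plot: sort every point's distance to roughly its min_samples-th nearest neighbour and look for the knee of the curve, which indicates a natural eps. A minimal sketch, assuming the standardised X from the program above (n_neighbors=10 mirrors min_samples=10; kneighbors counts the point itself as its own nearest neighbour):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=10).fit(X)
distances, _ = nbrs.kneighbors(X)    # distances to the 10 nearest neighbours (self included)
k_dist = np.sort(distances[:, -1])   # farthest of those, sorted over all points
plt.plot(k_dist)
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to the 10th nearest neighbour')
plt.title('k-distance plot for choosing eps')
plt.show()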
5. OPTICS CLUSTERING VS DBSCAN CLUSTERING
The DBSCAN algorithm assumes a roughly constant density across clusters (a single global eps), whereas OPTICS orders the points by reachability distance and can therefore handle clusters of varying density; DBSCAN-style clusterings at any eps can also be extracted from the OPTICS reachability values.
PROGRAM
from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
# Generate sample data
np.random.seed(0)
n_points_per_cluster = 250
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))
clust = OPTICS(min_samples=50, xi=0.05,
min_cluster_size=0.05)
# Run the fit
clust.fit(X)
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
core_distances=clust.core_distances_,
ordering=clust.ordering_,
eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
core_distances=clust.core_distances_,
ordering=clust.ordering_,
eps=2)
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
# Reachability plot
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = space[labels == klass]
    Rk = reachability[labels == klass]
    ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1], 'k.',
alpha=0.3)
ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-',
alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.',
alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')
# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = X[clust.labels_ == klass]
    ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k.', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')
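# The DBSCAN label sets extracted above (labels_050 and labels_200) can be drawn
# next to the OPTICS result for comparison. The two panels below are an added
# sketch that fills the unused G[1, 1] and G[1, 2] cells of the grid.
ax3 = plt.subplot(G[1, 1])
colors = ['g.', 'r.', 'b.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_050 == klass]
    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')
ax4 = plt.subplot(G[1, 2])
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_200 == klass]
    ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')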
plt.tight_layout()
plt.show()
OUTPUT
A figure is displayed with the reachability plot on top and the clustering scatter plots below.