From cef8131885bc04a490341b8f643e7924550aa69b Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Tue, 16 Nov 2021 01:01:17 +0800 Subject: [PATCH 1/4] Reduce num of samples in plot-digit-linkage example --- examples/cluster/plot_digits_linkage.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index 925f5c122d73f..5f5aba6537554 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -22,6 +22,10 @@ # Authors: Gael Varoquaux # License: BSD 3 clause (C) INRIA 2014 +# We utilize the digits dataset which has 1800 entries. +# We will only use 800 entries for the sake of speeding +# up the calculations but feel free to use the whole dataset. + from time import time import numpy as np @@ -31,6 +35,7 @@ from sklearn import manifold, datasets X, y = datasets.load_digits(return_X_y=True) +X, y = X[:800], y[:800] n_samples, n_features = X.shape np.random.seed(0) @@ -65,7 +70,7 @@ def plot_clustering(X_red, labels, title=None): X_red[i, 1], str(y[i]), color=plt.cm.nipy_spectral(labels[i] / 10.0), - fontdict={"weight": "bold", "size": 9}, + fontdict={"weight": "bold", "size": 10}, ) plt.xticks([]) @@ -79,7 +84,7 @@ def plot_clustering(X_red, labels, title=None): # ---------------------------------------------------------------------- # 2D embedding of the digits dataset print("Computing embedding") -X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) +X_red = manifold.SpectralEmbedding(n_components=2, random_state=0).fit_transform(X) print("Done.") from sklearn.cluster import AgglomerativeClustering From 62feab0213c370138a49b17fe103f32a9a3a8bfa Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Tue, 16 Nov 2021 22:48:34 +0800 Subject: [PATCH 2/4] Remove unnecessary random_state --- examples/cluster/plot_digits_linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index 5f5aba6537554..a05774af1ac11 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -84,7 +84,7 @@ def plot_clustering(X_red, labels, title=None): # ---------------------------------------------------------------------- # 2D embedding of the digits dataset print("Computing embedding") -X_red = manifold.SpectralEmbedding(n_components=2, random_state=0).fit_transform(X) +X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) print("Done.") from sklearn.cluster import AgglomerativeClustering From 79a776a0d70084042fc6a47b726cf31a14f2e8f9 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Thu, 18 Nov 2021 15:21:11 +0800 Subject: [PATCH 3/4] Remove nudge_images --- examples/cluster/plot_digits_linkage.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index a05774af1ac11..bca67313d561e 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -22,41 +22,19 @@ # Authors: Gael Varoquaux # License: BSD 3 clause (C) INRIA 2014 -# We utilize the digits dataset which has 1800 entries. -# We will only use 800 entries for the sake of speeding -# up the calculations but feel free to use the whole dataset. 
- from time import time import numpy as np -from scipy import ndimage from matplotlib import pyplot as plt from sklearn import manifold, datasets X, y = datasets.load_digits(return_X_y=True) -X, y = X[:800], y[:800] n_samples, n_features = X.shape np.random.seed(0) -def nudge_images(X, y): - # Having a larger dataset shows more clearly the behavior of the - # methods, but we multiply the size of the dataset only by 2, as the - # cost of the hierarchical clustering methods are strongly - # super-linear in n_samples - shift = lambda x: ndimage.shift( - x.reshape((8, 8)), 0.3 * np.random.normal(size=2), mode="constant" - ).ravel() - X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) - Y = np.concatenate([y, y], axis=0) - return X, Y - - -X, y = nudge_images(X, y) - - # ---------------------------------------------------------------------- # Visualize the clustering def plot_clustering(X_red, labels, title=None): @@ -70,7 +48,7 @@ def plot_clustering(X_red, labels, title=None): X_red[i, 1], str(y[i]), color=plt.cm.nipy_spectral(labels[i] / 10.0), - fontdict={"weight": "bold", "size": 10}, + fontdict={"weight": "bold", "size": 9}, ) plt.xticks([]) From 35b77b7212f609796ba71b1f2fe4a57dc7bb4b1c Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Fri, 19 Nov 2021 11:04:08 +0800 Subject: [PATCH 4/4] Address PR comment, elaborate analysis --- examples/cluster/plot_digits_linkage.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index bca67313d561e..e13e83047fee3 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -12,10 +12,18 @@ What this example shows us is the behavior "rich getting richer" of agglomerative clustering that tends to create uneven cluster sizes. 
+ This behavior is pronounced for the average linkage strategy, -that ends up with a couple of singleton clusters, while in the case -of single linkage we get a single central cluster with all other clusters -being drawn from noise points around the fringes. +that ends up with a couple of clusters with few datapoints. + +The case of single linkage is even more pathological with a very +large cluster covering most digits, an intermediate size (clean) +cluster with most zero digits and all other clusters being drawn +from noise points around the fringes. + +The other linkage strategies lead to more evenly distributed +clusters that are therefore likely to be less sensitive to a +random resampling of the dataset. """