From cef8131885bc04a490341b8f643e7924550aa69b Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Tue, 16 Nov 2021 01:01:17 +0800 Subject: [PATCH 1/4] Reduce num of samples in plot-digit-linkage example --- examples/cluster/plot_digits_linkage.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index 925f5c122d73f..5f5aba6537554 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -22,6 +22,10 @@ # Authors: Gael Varoquaux # License: BSD 3 clause (C) INRIA 2014 +# We utilize the digits dataset which has 1800 entries. +# We will only use 800 entries for the sake of speeding +# up the calculations but feel free to use the whole dataset. + from time import time import numpy as np @@ -31,6 +35,7 @@ from sklearn import manifold, datasets X, y = datasets.load_digits(return_X_y=True) +X, y = X[:800], y[:800] n_samples, n_features = X.shape np.random.seed(0) @@ -65,7 +70,7 @@ def plot_clustering(X_red, labels, title=None): X_red[i, 1], str(y[i]), color=plt.cm.nipy_spectral(labels[i] / 10.0), - fontdict={"weight": "bold", "size": 9}, + fontdict={"weight": "bold", "size": 10}, ) plt.xticks([]) @@ -79,7 +84,7 @@ def plot_clustering(X_red, labels, title=None): # ---------------------------------------------------------------------- # 2D embedding of the digits dataset print("Computing embedding") -X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) +X_red = manifold.SpectralEmbedding(n_components=2, random_state=0).fit_transform(X) print("Done.") from sklearn.cluster import AgglomerativeClustering From 62feab0213c370138a49b17fe103f32a9a3a8bfa Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Tue, 16 Nov 2021 22:48:34 +0800 Subject: [PATCH 2/4] Remove unnecessary random_state --- examples/cluster/plot_digits_linkage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index 5f5aba6537554..a05774af1ac11 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -84,7 +84,7 @@ def plot_clustering(X_red, labels, title=None): # ---------------------------------------------------------------------- # 2D embedding of the digits dataset print("Computing embedding") -X_red = manifold.SpectralEmbedding(n_components=2, random_state=0).fit_transform(X) +X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) print("Done.") from sklearn.cluster import AgglomerativeClustering From 79a776a0d70084042fc6a47b726cf31a14f2e8f9 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Thu, 18 Nov 2021 15:21:11 +0800 Subject: [PATCH 3/4] Remove nudge_images --- examples/cluster/plot_digits_linkage.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index a05774af1ac11..bca67313d561e 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -22,41 +22,19 @@ # Authors: Gael Varoquaux # License: BSD 3 clause (C) INRIA 2014 -# We utilize the digits dataset which has 1800 entries. -# We will only use 800 entries for the sake of speeding -# up the calculations but feel free to use the whole dataset. 
- from time import time import numpy as np -from scipy import ndimage from matplotlib import pyplot as plt from sklearn import manifold, datasets X, y = datasets.load_digits(return_X_y=True) -X, y = X[:800], y[:800] n_samples, n_features = X.shape np.random.seed(0) -def nudge_images(X, y): - # Having a larger dataset shows more clearly the behavior of the - # methods, but we multiply the size of the dataset only by 2, as the - # cost of the hierarchical clustering methods are strongly - # super-linear in n_samples - shift = lambda x: ndimage.shift( - x.reshape((8, 8)), 0.3 * np.random.normal(size=2), mode="constant" - ).ravel() - X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) - Y = np.concatenate([y, y], axis=0) - return X, Y - - -X, y = nudge_images(X, y) - - # ---------------------------------------------------------------------- # Visualize the clustering def plot_clustering(X_red, labels, title=None): @@ -70,7 +48,7 @@ def plot_clustering(X_red, labels, title=None): X_red[i, 1], str(y[i]), color=plt.cm.nipy_spectral(labels[i] / 10.0), - fontdict={"weight": "bold", "size": 10}, + fontdict={"weight": "bold", "size": 9}, ) plt.xticks([]) From 35b77b7212f609796ba71b1f2fe4a57dc7bb4b1c Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Fri, 19 Nov 2021 11:04:08 +0800 Subject: [PATCH 4/4] Address PR comment, elaborate analysis --- examples/cluster/plot_digits_linkage.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index bca67313d561e..e13e83047fee3 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -12,10 +12,18 @@ What this example shows us is the behavior "rich getting richer" of agglomerative clustering that tends to create uneven cluster sizes. 
+ This behavior is pronounced for the average linkage strategy, -that ends up with a couple of singleton clusters, while in the case -of single linkage we get a single central cluster with all other clusters -being drawn from noise points around the fringes. +that ends up with a couple of clusters with few datapoints. + +The case of single linkage is even more pathological with a very +large cluster covering most digits, an intermediate size (clean) +cluster with most zero digits and all other clusters being drawn +from noise points around the fringes. + +The other linkage strategies lead to more evenly distributed +clusters that are therefore likely to be less sensitive to a +random resampling of the dataset. """