From 46ba348ebd06752308fb02a43293cd9ca07cef33 Mon Sep 17 00:00:00 2001 From: siavrez Date: Mon, 22 Nov 2021 00:50:22 +0330 Subject: [PATCH 1/3] Accelerate_examples Changing matplotlib.text with matplotlib.scatter for plot_digits_linkage.py --- examples/cluster/plot_digits_linkage.py | 27 +++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index e13e83047fee3..fd5ce9062f9de 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -2,35 +2,28 @@ ============================================================================= Various Agglomerative Clustering on a 2D embedding of digits ============================================================================= - An illustration of various linkage option for agglomerative clustering on a 2D embedding of the digits dataset. - The goal of this example is to show intuitively how the metrics behave, and not to find good clusters for the digits. This is why the example works on a 2D embedding. - What this example shows us is the behavior "rich getting richer" of agglomerative clustering that tends to create uneven cluster sizes. - This behavior is pronounced for the average linkage strategy, that ends up with a couple of clusters with few datapoints. - The case of single linkage is even more pathologic with a very large cluster covering most digits, an intermediate size (clean) cluster with most zero digits and all other clusters being drawn from noise points around the fringes. - The other linkage strategies lead to more evenly distributed clusters that are therefore likely to be less sensible to a random resampling of the dataset. - """ # Authors: Gael Varoquaux # License: BSD 3 clause (C) INRIA 2014 -from time import time +from time import perf_counter import numpy as np from matplotlib import pyplot as plt @@ -50,13 +43,13 @@ def plot_clustering(X_red, labels, title=None): X_red = (X_red - x_min) / (x_max - x_min) plt.figure(figsize=(6, 4)) - for i in range(X_red.shape[0]): - plt.text( - X_red[i, 0], - X_red[i, 1], - str(y[i]), - color=plt.cm.nipy_spectral(labels[i] / 10.0), - fontdict={"weight": "bold", "size": 9}, + for t in np.sort(np.unique(y)): + plt.scatter( + *X_red[y == t].T, + marker=f"${t}$", + s=50, + c=plt.cm.nipy_spectral(labels[y == t] / 10), + alpha=0.5, ) plt.xticks([]) @@ -77,9 +70,9 @@ def plot_clustering(X_red, labels, title=None): for linkage in ("ward", "average", "complete", "single"): clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) - t0 = time() + t0 = perf_counter() clustering.fit(X_red) - print("%s :\t%.2fs" % (linkage, time() - t0)) + print("%s :\t%.2fs" % (linkage, perf_counter() - t0)) plot_clustering(X_red, clustering.labels_, "%s linkage" % linkage) From 6748650497152cfb8e64560bccdf670e30c483cf Mon Sep 17 00:00:00 2001 From: siavrez Date: Mon, 22 Nov 2021 21:42:21 +0330 Subject: [PATCH 2/3] Accelerate plot_digits_linkage.py changing proposed names and changing back perf_counter to time --- examples/cluster/plot_digits_linkage.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index fd5ce9062f9de..fe043890d4d06 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -23,14 +23,15 @@ # Authors: Gael Varoquaux # License: BSD 3 clause (C) INRIA 2014 -from time import perf_counter +from time import time import numpy as np from matplotlib import pyplot as plt from sklearn import manifold, datasets -X, y = datasets.load_digits(return_X_y=True) +digits = datasets.load_digits() +X, y = digits.data, digits.target n_samples, n_features = X.shape np.random.seed(0) @@ -43,12 +44,12 @@ def plot_clustering(X_red, labels, title=None): X_red = (X_red - x_min) / (x_max - x_min) plt.figure(figsize=(6, 4)) - for t in np.sort(np.unique(y)): + for digit in digits.target_names: plt.scatter( - *X_red[y == t].T, - marker=f"${t}$", + *X_red[y == digit].T, + marker=f"${digit}$", s=50, - c=plt.cm.nipy_spectral(labels[y == t] / 10), + c=plt.cm.nipy_spectral(labels[y == digit] / 10), alpha=0.5, ) @@ -70,9 +71,9 @@ def plot_clustering(X_red, labels, title=None): for linkage in ("ward", "average", "complete", "single"): clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) - t0 = perf_counter() + t0 = time() clustering.fit(X_red) - print("%s :\t%.2fs" % (linkage, perf_counter() - t0)) + print("%s :\t%.2fs" % (linkage, time() - t0)) plot_clustering(X_red, clustering.labels_, "%s linkage" % linkage) From 0a99d084e83570e25f6fab9ae48c1c395e73fab4 Mon Sep 17 00:00:00 2001 From: siavrez Date: Thu, 25 Nov 2021 22:25:51 +0330 Subject: [PATCH 3/3] change back docstring to original state --- examples/cluster/plot_digits_linkage.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index fe043890d4d06..730f85c543356 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -2,22 +2,29 @@ ============================================================================= Various Agglomerative Clustering on a 2D embedding of digits ============================================================================= + An illustration of various linkage option for agglomerative clustering on a 2D embedding of the digits dataset. + The goal of this example is to show intuitively how the metrics behave, and not to find good clusters for the digits. This is why the example works on a 2D embedding. + What this example shows us is the behavior "rich getting richer" of agglomerative clustering that tends to create uneven cluster sizes. + This behavior is pronounced for the average linkage strategy, that ends up with a couple of clusters with few datapoints. + The case of single linkage is even more pathologic with a very large cluster covering most digits, an intermediate size (clean) cluster with most zero digits and all other clusters being drawn from noise points around the fringes. + The other linkage strategies lead to more evenly distributed clusters that are therefore likely to be less sensible to a random resampling of the dataset. + """ # Authors: Gael Varoquaux