From 16b3a8a81fa512a795c948c331d620c2996e295b Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Mon, 15 Nov 2021 09:29:16 +0100 Subject: [PATCH 1/7] ENH Improving execution speed of plot_pca_vs_fa_model_selection.py by dividing by 2 parameters n_samples, n_features, rank --- examples/decomposition/plot_pca_vs_fa_model_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py index e7802476ef341..2adbeff5cc607 100644 --- a/examples/decomposition/plot_pca_vs_fa_model_selection.py +++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py @@ -40,7 +40,7 @@ # ############################################################################# # Create the data -n_samples, n_features, rank = 1000, 50, 10 +n_samples, n_features, rank = 500, 25, 5 sigma = 1.0 rng = np.random.RandomState(42) U, _, _ = linalg.svd(rng.randn(n_features, n_features)) From 3b5a9ef3a7c8a1baa640ca7321a1ff34ac9aeeb1 Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Thu, 18 Nov 2021 15:24:30 +0100 Subject: [PATCH 2/7] ENH Reduced n_samples and batch_size optimisation to improve execution speed --- examples/cluster/plot_birch_vs_minibatchkmeans.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py index c84d06458ac3c..5c3c455521dec 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -5,9 +5,9 @@ This example compares the timing of BIRCH (with and without the global clustering step) and MiniBatchKMeans on a synthetic dataset having -100,000 samples and 2 features generated using make_blobs. +50,000 samples and 2 features generated using make_blobs.
-If ``n_clusters`` is set to None, the data is reduced from 100,000 +If ``n_clusters`` is set to None, the data is reduced from 25,000 samples to a set of 158 clusters. This can be viewed as a preprocessing step before the final (global) clustering step that further reduces these 158 clusters to 100 clusters. @@ -18,6 +18,7 @@ # Alexandre Gramfort # License: BSD 3 clause +from os import cpu_count from itertools import cycle from time import time import numpy as np @@ -35,7 +36,7 @@ n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) # Generate blobs to do a comparison between MiniBatchKMeans and BIRCH. -X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0) +X, y = make_blobs(n_samples=25000, centers=n_centres, random_state=0) # Use all colors that matplotlib provides by default. colors_ = cycle(colors.cnames.keys()) @@ -78,7 +79,7 @@ mbk = MiniBatchKMeans( init="k-means++", n_clusters=100, - batch_size=100, + batch_size=256 * cpu_count(), n_init=10, max_no_improvement=10, verbose=0, From 120566605d27f98c39046ec249f7d3ea12856ba6 Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Thu, 18 Nov 2021 15:40:44 +0100 Subject: [PATCH 3/7] ENH change in docstring to match with changes in code --- examples/cluster/plot_birch_vs_minibatchkmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py index 5c3c455521dec..8d31fcdfc7626 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -5,7 +5,7 @@ This example compares the timing of BIRCH (with and without the global clustering step) and MiniBatchKMeans on a synthetic dataset having -50,000 samples and 2 features generated using make_blobs. +25,000 samples and 2 features generated using make_blobs. If ``n_clusters`` is set to None, the data is reduced from 25,000 samples to a set of 158 clusters. 
This can be viewed as a preprocessing From 0ef18cac4f6edef07e6e6b5cbcc6791c5ea442bf Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Thu, 18 Nov 2021 16:38:49 +0100 Subject: [PATCH 4/7] Apply @ogrisel 's suggestion Co-authored-by: Olivier Grisel --- examples/cluster/plot_birch_vs_minibatchkmeans.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py index 8d31fcdfc7626..406ea53315bb2 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -7,6 +7,13 @@ clustering step) and MiniBatchKMeans on a synthetic dataset having 25,000 samples and 2 features generated using make_blobs. +Both MiniBatchKMeans and BIRCH are very scalable algorithms and could +run efficiently on hundreds of thousands or even millions of datapoints. We +chose to limit the dataset size of this example in the interest of keeping +our Continuous Integration resource usage reasonable but the interested +reader might enjoy editing this script to rerun it with a larger value for +`n_samples`. + If ``n_clusters`` is set to None, the data is reduced from 25,000 samples to a set of 158 clusters. 
This can be viewed as a preprocessing step before the final (global) clustering step that further reduces these From d8810d12ed79cef47a4ac3ae7258055fcaddf2c2 Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Thu, 18 Nov 2021 16:39:44 +0100 Subject: [PATCH 5/7] Replacing os by joblib (@ogrisel 's suggestion) Co-authored-by: Olivier Grisel --- examples/cluster/plot_birch_vs_minibatchkmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py index 406ea53315bb2..f68749650ae88 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -25,7 +25,7 @@ # Alexandre Gramfort # License: BSD 3 clause -from os import cpu_count +from joblib import cpu_count from itertools import cycle from time import time import numpy as np From cc614078a7c0e2a079a8b43a24c062fc510a1b9a Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Thu, 18 Nov 2021 16:46:33 +0100 Subject: [PATCH 6/7] ENH rename variable n_centres to n_centers --- examples/cluster/plot_birch_vs_minibatchkmeans.py | 4 ++-- examples/decomposition/plot_pca_vs_fa_model_selection.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py index f68749650ae88..13e58c2cc025e 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -40,10 +40,10 @@ xx = np.linspace(-22, 22, 10) yy = np.linspace(-22, 22, 10) xx, yy = np.meshgrid(xx, yy) -n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) +n_centers = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) # Generate blobs to do a comparison between MiniBatchKMeans and BIRCH.
-X, y = make_blobs(n_samples=25000, centers=n_centres, random_state=0) +X, y = make_blobs(n_samples=25000, centers=n_centers, random_state=0) # Use all colors that matplotlib provides by default. colors_ = cycle(colors.cnames.keys()) diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py index 2adbeff5cc607..e7802476ef341 100644 --- a/examples/decomposition/plot_pca_vs_fa_model_selection.py +++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py @@ -40,7 +40,7 @@ # ############################################################################# # Create the data -n_samples, n_features, rank = 500, 25, 5 +n_samples, n_features, rank = 1000, 50, 10 sigma = 1.0 rng = np.random.RandomState(42) U, _, _ = linalg.svd(rng.randn(n_features, n_features)) From 0a9a7e969698f43c6319a8689846ab4b426c8619 Mon Sep 17 00:00:00 2001 From: Gauthier I Date: Thu, 18 Nov 2021 17:41:39 +0100 Subject: [PATCH 7/7] Apply @adrinjalali 's suggestion Co-authored-by: Adrin Jalali --- examples/cluster/plot_birch_vs_minibatchkmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py index 13e58c2cc025e..f210cd941977a 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -7,7 +7,7 @@ clustering step) and MiniBatchKMeans on a synthetic dataset having 25,000 samples and 2 features generated using make_blobs. -Both MiniBatchKMeans and BIRCH are very scalable algorithms and could +Both ``MiniBatchKMeans`` and ``BIRCH`` are very scalable algorithms and could run efficiently on hundreds of thousands or even millions of datapoints. We chose to limit the dataset size of this example in the interest of keeping our Continuous Integration resource usage reasonable but the interested