|
5 | 5 |
|
6 | 6 | This example compares the timing of BIRCH (with and without the global
|
7 | 7 | clustering step) and MiniBatchKMeans on a synthetic dataset having
|
8 |
| -100,000 samples and 2 features generated using make_blobs. |
| 8 | +25,000 samples and 2 features generated using make_blobs. |
9 | 9 |
|
10 |
| -If ``n_clusters`` is set to None, the data is reduced from 100,000 |
| 10 | +Both ``MiniBatchKMeans`` and ``BIRCH`` are very scalable algorithms and could |
| 11 | +run efficiently on hundreds of thousands or even millions of datapoints. We |
| 12 | +chose to limit the dataset size of this example in the interest of keeping |
| 13 | +our Continuous Integration resource usage reasonable, but the interested |
| 14 | +reader might enjoy editing this script to rerun it with a larger value for |
| 15 | +``n_samples``. |
| 16 | +
|
| 17 | +If ``n_clusters`` is set to None, the data is reduced from 25,000 |
11 | 18 | samples to a set of 158 clusters. This can be viewed as a preprocessing
|
12 | 19 | step before the final (global) clustering step that further reduces these
|
13 | 20 | 158 clusters to 100 clusters.
|
|
18 | 25 | # Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
19 | 26 | # License: BSD 3 clause
|
20 | 27 |
|
| 28 | +from joblib import cpu_count |
21 | 29 | from itertools import cycle
|
22 | 30 | from time import time
|
23 | 31 | import numpy as np
|
|
32 | 40 | xx = np.linspace(-22, 22, 10)
|
33 | 41 | yy = np.linspace(-22, 22, 10)
|
34 | 42 | xx, yy = np.meshgrid(xx, yy)
|
35 |
| -n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) |
| 43 | +n_centers = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) |
36 | 44 |
|
37 | 45 | # Generate blobs to do a comparison between MiniBatchKMeans and BIRCH.
|
38 |
| -X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0) |
| 46 | +X, y = make_blobs(n_samples=25000, centers=n_centers, random_state=0) |
39 | 47 |
|
40 | 48 | # Use all colors that matplotlib provides by default.
|
41 | 49 | colors_ = cycle(colors.cnames.keys())
|
|
78 | 86 | mbk = MiniBatchKMeans(
|
79 | 87 | init="k-means++",
|
80 | 88 | n_clusters=100,
|
81 |
| - batch_size=100, |
| 89 | + batch_size=256 * cpu_count(), |
82 | 90 | n_init=10,
|
83 | 91 | max_no_improvement=10,
|
84 | 92 | verbose=0,
|
|
0 commit comments