|
12 | 12 |
|
13 | 13 | What this example shows us is the "rich getting richer" behavior of
|
14 | 14 | agglomerative clustering that tends to create uneven cluster sizes.
|
| 15 | +
|
15 | 16 | This behavior is pronounced for the average linkage strategy,
|
16 |
| -that ends up with a couple of singleton clusters, while in the case |
17 |
| -of single linkage we get a single central cluster with all other clusters |
18 |
| -being drawn from noise points around the fringes. |
| 17 | +that ends up with a couple of clusters with few datapoints. |
| 18 | +
|
| 19 | +The case of single linkage is even more pathological with a very |
| 20 | +large cluster covering most digits, an intermediate size (clean) |
| 21 | +cluster with most zero digits and all other clusters being drawn |
| 22 | +from noise points around the fringes. |
| 23 | +
|
| 24 | +The other linkage strategies lead to more evenly distributed |
| 25 | +clusters that are therefore likely to be less sensitive to a |
| 26 | +random resampling of the dataset. |
19 | 27 |
|
20 | 28 | """
|
21 | 29 |
|
|
25 | 33 | from time import time
|
26 | 34 |
|
27 | 35 | import numpy as np
|
28 |
| -from scipy import ndimage |
29 | 36 | from matplotlib import pyplot as plt
|
30 | 37 |
|
31 | 38 | from sklearn import manifold, datasets
|
|
36 | 43 | np.random.seed(0)
|
37 | 44 |
|
38 | 45 |
|
39 |
| -def nudge_images(X, y): |
40 |
| - # Having a larger dataset shows more clearly the behavior of the |
41 |
| - # methods, but we multiply the size of the dataset only by 2, as the |
42 |
| - # cost of the hierarchical clustering methods are strongly |
43 |
| - # super-linear in n_samples |
44 |
| - shift = lambda x: ndimage.shift( |
45 |
| - x.reshape((8, 8)), 0.3 * np.random.normal(size=2), mode="constant" |
46 |
| - ).ravel() |
47 |
| - X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) |
48 |
| - Y = np.concatenate([y, y], axis=0) |
49 |
| - return X, Y |
50 |
| - |
51 |
| - |
52 |
| -X, y = nudge_images(X, y) |
53 |
| - |
54 |
| - |
55 | 46 | # ----------------------------------------------------------------------
|
56 | 47 | # Visualize the clustering
|
57 | 48 | def plot_clustering(X_red, labels, title=None):
|
|
0 commit comments