Fix errorbars legend by vene · Pull Request #2 · ogrisel/scikit-learn · GitHub
Closed
68 commits
61af784
first stab at nearest center in cython (+30% perf, need check correct…
ogrisel Oct 9, 2011
6ade898
factorized label assignement as a reusable python func for the predic…
ogrisel Oct 9, 2011
ff9bd5b
use direct blas ddot call and reuse _assign_labels in predict
ogrisel Oct 10, 2011
2b04cff
FIX: broken test cause by the use of todense which return a matrix in…
ogrisel Oct 10, 2011
9aabbdb
WIP on simpler cython impl of the center update (still buggy)
ogrisel Oct 13, 2011
aa13538
compute inertia + remove code :)
ogrisel Oct 13, 2011
6cd6c30
update renamed function call
ogrisel Oct 13, 2011
87a6f5b
factorize dot product and bootstrap implementation for the dense case
ogrisel Oct 14, 2011
b7fe3bc
use cpdef + less array overhead in ddot
ogrisel Oct 15, 2011
5576ecf
started kmeans test suite refactoring
ogrisel Oct 15, 2011
6168d9c
more code factorization
ogrisel Oct 15, 2011
b2a8956
refactored the kmeans tests
ogrisel Oct 15, 2011
5f8d554
test and fix input checks for various dypes
ogrisel Oct 15, 2011
6d1dda8
much cheaper yet stable stopping criterion for the minibatch kmeans
ogrisel Oct 15, 2011
a14778e
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Oct 25, 2011
c68a368
unused import
ogrisel Oct 27, 2011
983556e
low memory computation of the square diff
ogrisel Oct 30, 2011
054b682
be more consistent with the usual behavior of fitted attributes
ogrisel Oct 30, 2011
e7c02a3
base convergence detection on EWA inertia monitoring
ogrisel Oct 30, 2011
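The EWA-based convergence detection referenced in this commit can be sketched as follows; this is an illustrative reconstruction, not the PR's actual code, and the function name, smoothing factor, and exact stopping rule are assumptions:

```python
def ewa_early_stop(batch_inertias, alpha=0.3, max_no_improvement=10):
    """Return the batch index at which training would stop, or None.

    Maintains an exponentially weighted average (EWA) of the per-batch
    inertia and stops once the EWA has failed to improve on its best
    value for `max_no_improvement` consecutive batches.
    """
    ewa = None
    best = float("inf")
    stalled = 0
    for i, inertia in enumerate(batch_inertias):
        ewa = inertia if ewa is None else alpha * inertia + (1 - alpha) * ewa
        if ewa < best:
            best, stalled = ewa, 0
        else:
            stalled += 1
            if stalled >= max_no_improvement:
                return i
    return None

# A run whose per-batch inertia deteriorates stalls quickly and stops.
print(ewa_early_stop([100.0, 50.0] + [200.0] * 10, max_no_improvement=3))  # 4
```

Smoothing the noisy per-batch inertia before checking for improvement is what makes this criterion cheap (no full-dataset pass) yet stable.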
8170fb5
various cython cleanups
ogrisel Oct 31, 2011
744072e
working in progress to make it possible to use a speedy version based…
ogrisel Nov 1, 2011
76e6197
merge master
ogrisel Nov 1, 2011
e8ddec5
preparing new stopping criterion impl
ogrisel Nov 1, 2011
562fcae
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Nov 1, 2011
7813be0
work in progress (broken tests) on early stopping with both tol and i…
ogrisel Nov 1, 2011
e13b3f0
make min_dist test more explicit
ogrisel Nov 2, 2011
77c9663
fixed broken test
ogrisel Nov 2, 2011
d17fba0
optimize label assignment for dense minibatch and new test
ogrisel Nov 2, 2011
a2d136f
fix tests
ogrisel Nov 2, 2011
c10964d
fix tests
ogrisel Nov 2, 2011
8579119
start with zero counts in tests
ogrisel Nov 2, 2011
d117ec5
fix bug: x_squared_norms should follow the shuffle...
ogrisel Nov 3, 2011
77db343
ensure that the sparse and dense variant of the minibatch update comp…
ogrisel Nov 3, 2011
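The mini-batch center update that these commits keep consistent between the dense and sparse code paths follows the usual per-center learning-rate scheme (a step of 1/count toward each assigned sample); a sketch under that assumption, with illustrative names:

```python
import numpy as np

def minibatch_update(centers, counts, X_batch, labels):
    """Update `centers` in place with one mini-batch.

    Each center moves toward its assigned samples with a step size of
    1 / (total samples seen by that center), so early batches move
    centers a lot and later batches barely perturb them.
    """
    for x, k in zip(X_batch, labels):
        counts[k] += 1
        eta = 1.0 / counts[k]
        centers[k] = (1.0 - eta) * centers[k] + eta * x
    return centers, counts

centers = np.zeros((1, 2))
counts = np.zeros(1, dtype=int)
X_batch = np.array([[2.0, 0.0], [4.0, 0.0]])
minibatch_update(centers, counts, X_batch, labels=[0, 0])
print(centers[0])  # running mean of the assigned samples: [3. 0.]
```

With this step-size schedule each center is exactly the running mean of the samples assigned to it, which is why the dense and sparse variants can be checked against each other for identical results.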
82c3f62
better default value and parameter handling for max_no_improvement
ogrisel Nov 3, 2011
a0f2598
switch to lazy sampling with explicit index to divide memory usage al…
ogrisel Nov 3, 2011
28b4d88
more code simplification
ogrisel Nov 3, 2011
9224538
started example to check the convergence stability in various settings
ogrisel Nov 5, 2011
161430c
tracking changes from master
ogrisel Nov 5, 2011
113d394
merge master
ogrisel Nov 5, 2011
37df796
implemented n_init for MiniBatchKMeans
ogrisel Nov 6, 2011
4f9f32c
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Nov 6, 2011
3d58c49
refactored the init logic for MiniBatchKMeans
ogrisel Nov 6, 2011
a68c85f
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Nov 6, 2011
8858a80
fix stability and warning in tests
ogrisel Nov 6, 2011
9fbe559
make k-means++ work on sparse input and use it as default for MB k-means
ogrisel Nov 6, 2011
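k-means++ picks each new center with probability proportional to the squared distance to the nearest already-chosen center. A dense-input sketch of that D² sampling (the commit above extends the same idea to sparse input); names are illustrative, not the library's:

```python
import numpy as np

def kmeans_plusplus(X, n_clusters, rng):
    """Pick initial centers from the rows of X with D^2 weighting."""
    n_samples = X.shape[0]
    centers = [X[rng.randint(n_samples)]]  # first center: uniform pick
    for _ in range(n_clusters - 1):
        # Squared distance from each sample to its closest chosen center.
        d2 = np.min([((X - c) ** 2).sum(axis=1) for c in centers], axis=0)
        # Sample the next center proportionally to that distance.
        centers.append(X[rng.choice(n_samples, p=d2 / d2.sum())])
    return np.array(centers)

rng = np.random.RandomState(42)
X = np.concatenate([rng.randn(50, 2), rng.randn(50, 2) + 10])
centers = kmeans_plusplus(X, n_clusters=2, rng=rng)
print(centers.shape)  # (2, 2)
```

Because the weighting favors samples far from existing centers, well-separated blobs almost always receive one center each, which is what makes it a safer default init than uniform random sampling.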
768e471
add version info in deprecation message
ogrisel Nov 6, 2011
6af7996
factorized out the early stopping logic in a dedicated method
ogrisel Nov 6, 2011
982359c
first stab at a reinit strategy that work on low dim data only
ogrisel Nov 6, 2011
f87248d
new example to emphasize issues with current naive reinit scheme on s…
ogrisel Nov 6, 2011
3f1901c
second experiment on reinit that does not work on high dim sparse dat…
ogrisel Nov 7, 2011
d9aa128
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Nov 14, 2011
e1325f2
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
ogrisel Dec 19, 2011
50af0c3
track changes from master
ogrisel Dec 19, 2011
695ce94
pep8
ogrisel Dec 19, 2011
ae26835
fix k_means docstring to better match the scikit naming conventions
ogrisel Dec 19, 2011
6db1ff8
WIP: n_init refactoring
ogrisel Dec 19, 2011
70c0aa1
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Dec 19, 2011
38d8444
scale tolerance of minibatch kmeans on CSR input variance
ogrisel Dec 19, 2011
29fa29f
delete broken example
ogrisel Dec 19, 2011
f9aacba
example script is not meant to be executed when building the doc as i…
ogrisel Dec 19, 2011
fde9807
Add score method to KMeans.
mblondel Dec 19, 2011
af05e76
Merge branch 'master' into minibatch-kmeans-optim
ogrisel Dec 20, 2011
2f3cca5
typo: accross => across
ogrisel Dec 20, 2011
da38d74
Use python int for indices and indptr of scipy sparse matrices to ens…
ogrisel Dec 20, 2011
adfae9b
Make init less expensive by default on MinibatchKMeans to avoid domin…
ogrisel Dec 20, 2011
e93482e
Fix broken duplicated / tests and more practical init
ogrisel Dec 20, 2011
712301d
Merge branch 'master' into ogrisel_minibatch-kmeans-optim
vene Dec 20, 2011
a9e013b
Fix legend for errorbar plot
vene Dec 20, 2011
4 changes: 2 additions & 2 deletions .gitignore
@@ -5,8 +5,8 @@
*.swp
.DS_Store
build
scikits/learn/datasets/__config__.py
scikits/learn/**/*.html
sklearn/datasets/__config__.py
sklearn/**/*.html

dist/
doc/_build/
121 changes: 121 additions & 0 deletions examples/cluster/kmeans_stability_low_dim_dense.py
@@ -0,0 +1,121 @@
"""
============================================================
Empirical evaluation of the impact of k-means initialization
============================================================

Evaluate the ability of k-means initialization strategies to make
the algorithm's convergence robust, as measured by the relative
standard deviation of the inertia of the clustering (i.e. the sum
of squared distances to the nearest cluster center).

The dataset used for evaluation is a 2D grid of widely spaced
isotropic Gaussian clusters.

"""
print __doc__

# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import numpy as np
import pylab as pl
import matplotlib.cm as cm

from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans

random_state = np.random.RandomState(0)

# Number of runs (with randomly generated datasets) for each strategy,
# so as to compute an estimate of the standard deviation
n_runs = 5

# k-means models can do several random inits so as to be able to trade
# CPU time for convergence robustness
n_init_range = np.array([1, 5, 10, 15, 20])

# Datasets generation parameters
n_samples_per_center = 100
grid_size = 3
scale = 0.1
n_clusters = grid_size ** 2


def make_data(random_state, n_samples_per_center, grid_size, scale):
random_state = check_random_state(random_state)
centers = np.array([[i, j]
for i in range(grid_size)
for j in range(grid_size)])
n_clusters_true, n_features = centers.shape

noise = random_state.normal(
scale=scale, size=(n_samples_per_center, centers.shape[1]))

X = np.concatenate([c + noise for c in centers])
y = np.concatenate([[i] * n_samples_per_center
for i in range(n_clusters_true)])
return shuffle(X, y, random_state=random_state)


fig = pl.figure()
plots = []
legends = []

cases = [
(KMeans, 'k-means++', {}),
(KMeans, 'random', {}),
(MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),
(MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),
]

for factory, init, params in cases:
print "Evaluation of %s with %s init" % (factory.__name__, init)
inertia = np.empty((len(n_init_range), n_runs))

for run_id in range(n_runs):
X, y = make_data(run_id, n_samples_per_center, grid_size, scale)
for i, n_init in enumerate(n_init_range):
km = factory(k=n_clusters,
init=init,
random_state=run_id,
n_init=n_init,
**params).fit(X)
inertia[i, run_id] = km.inertia_
print "Inertia for n_init=%02d, run_id=%d: %0.3f" % (
n_init, run_id, km.inertia_)

plots.append(
pl.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1)))
n_reinit = params.get('n_reinit')
if n_reinit is not None:
legends.append("%s with %s init and %d reinit" % (
factory.__name__, init, n_reinit))
else:
legends.append("%s with %s init" % (factory.__name__, init))

plots = [plot[0] for plot in plots] # take only the first line in each plot
pl.xlabel('n_init')
pl.ylabel('inertia')
pl.legend(plots, legends)
pl.title("Mean inertia for various k-means init across %d runs" % n_runs)

# Part 2: qualitative visual inspection of the convergence

X, y = make_data(random_state, n_samples_per_center, grid_size, scale)
km = MiniBatchKMeans(k=n_clusters, init='random', n_init=1,
random_state=random_state).fit(X)

fig = pl.figure()
for k in range(n_clusters):
my_members = km.labels_ == k
color = cm.spectral(float(k) / n_clusters, 1)
pl.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)
cluster_center = km.cluster_centers_[k]
pl.plot(cluster_center[0], cluster_center[1], 'o',
markerfacecolor=color, markeredgecolor='k', markersize=6)
pl.title("Example cluster allocation with a single random init\n"
"with MiniBatchKMeans")

pl.show()
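The legend fix this PR is named after comes down to matplotlib's `errorbar` returning a container (data line, caps, and error-bar collections) rather than a bare line; handing the whole container to `legend` confuses it, so the script above keeps only the first element of each plot. A minimal standalone illustration using modern `matplotlib.pyplot` names:

```python
import matplotlib
matplotlib.use("Agg")  # headless backend, no display needed
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(5)
container = plt.errorbar(x, x ** 2, yerr=0.5)
# `errorbar` returns an ErrorbarContainer, not a Line2D; its first
# element is the data line, which is what `legend` should receive.
line = container[0]
plt.legend([line], ["quadratic"])
print(type(line).__name__)  # Line2D
```

In the matplotlib of the PR's era the return value was a plain tuple, but the fix is the same: index out the line before building the legend.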
5 changes: 3 additions & 2 deletions examples/cluster/plot_mini_batch_kmeans.py
@@ -35,7 +35,7 @@
##############################################################################
# Compute clustering with Means

k_means = KMeans(init='k-means++', k=3)
k_means = KMeans(init='k-means++', k=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
@@ -46,7 +46,8 @@
##############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
mbk = MiniBatchKMeans(init='k-means++', k=3, batch_size=batch_size,
n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
8 changes: 4 additions & 4 deletions examples/document_clustering.py
@@ -49,7 +49,6 @@
print "%d categories" % len(dataset.target_names)
print

# split a training set and a test set
labels = dataset.target
true_k = np.unique(labels).shape[0]

@@ -63,10 +62,11 @@
print

###############################################################################
# Now sparse MiniBatchKmeans
# Sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
chunk_size=1000, verbose=0)
mbkm = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
init_size=1000,
batch_size=1000, verbose=1)
print "Clustering sparse data with %s" % mbkm
t0 = time()
mbkm.fit(X)
2 changes: 1 addition & 1 deletion sklearn/base.py
@@ -172,7 +172,7 @@ def _get_params(self, deep=True):
"""
out = dict()
for key in self._get_param_names():
value = getattr(self, key)
value = getattr(self, key, None)
if deep and hasattr(value, '_get_params'):
deep_items = value._get_params().items()
out.update((key + '__' + k, val) for k, val in deep_items)
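The one-line change to `_get_params` above swaps a hard attribute lookup for `getattr` with a `None` default, so introspecting an estimator that never stored one of its declared constructor parameters no longer raises. The difference in isolation, with a hypothetical estimator (not a class from the library):

```python
class Estimator:
    """Toy estimator that declares a parameter it never stores."""

    _param_names = ["alpha", "beta"]

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        # note: `beta` is listed in _param_names but never set

    def get_params(self):
        # `getattr(self, key)` would raise AttributeError for "beta";
        # the None default makes introspection robust instead.
        return {key: getattr(self, key, None) for key in self._param_names}

print(Estimator().get_params())  # {'alpha': 1.0, 'beta': None}
```

Returning `None` for the missing parameter keeps cloning and repr logic working even while an estimator is only partially initialized.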