scikit-learn
diff --git a/‎sklearn/metrics/cluster/gap_statistic.py
Lines changed: 184 additions & 0 deletions b/‎sklearn/metrics/cluster/gap_statistic.py
Lines changed: 184 additions & 0 deletions
diff --git a/‎sklearn/metrics/cluster/tests/test_gap.py
Lines changed: 33 additions & 0 deletions b/‎sklearn/metrics/cluster/tests/test_gap.py
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,184 @@
+from __future__ import division
+
+from math import sqrt, log
+
+import numpy as np
+
+from .distortion import distortion
+from sklearn import preprocessing
+from ...utils import check_random_state
+
+
+def normal_distortion(X, cluster_estimator, nb_draw=100,
+                      distortion_meth='sqeuclidean', p=2, random_state=None):
+    """
+    Distortions of clusterings fitted on standard normal reference data.
+
+    For each of the nb_draw draws, a sample with the same shape as X is
+    taken from the standard normal distribution, labelled with
+    cluster_estimator.fit_predict and scored with ``distortion``; the score
+    is divided by the number of rows of X.
+
+    Parameters
+    ----------
+    X : numpy array of shape (nb_data, nb_feature)
+        Only its shape is used.
+    cluster_estimator : estimator object
+        Must provide a method fit_predict: X -> labels. It is refitted on
+        every draw.
+    nb_draw : int
+        Number of random datasets to draw.
+    distortion_meth : callable or str
+        Forwarded unchanged to ``distortion`` (for instance 'sqeuclidean'
+        or 'cityblock').
+    p : double
+        Forwarded unchanged to ``distortion`` (the p-norm used by
+        Minkowski-like distances).
+    random_state : int, RandomState instance or None
+        Passed to check_random_state to obtain the generator used for the
+        draws.
+
+    Returns
+    -------
+    dist : list of float
+        One normalised distortion per random dataset, in drawing order.
+    """
+    rng = check_random_state(random_state)
+    shape = X.shape
+
+    def one_draw():
+        # Draw first, then cluster, then score: same order for every draw.
+        sample = rng.standard_normal(shape)
+        labels = cluster_estimator.fit_predict(sample)
+        return distortion(sample, labels, distortion_meth, p) / shape[0]
+
+    return [one_draw() for _ in range(nb_draw)]
+
+
+def uniform_distortion(X, cluster_estimator, nb_draw=100, val_min=None,
+                       val_max=None, distortion_meth='sqeuclidean', p=2,
+                       random_state=None):
+    """
+    Distortions of clusterings fitted on uniform reference data.
+
+    For each of the nb_draw draws, a sample with the same shape as X is
+    taken uniformly in the hyperrectangle [val_min, val_max] (by default
+    the smallest one containing X), labelled with
+    cluster_estimator.fit_predict and scored with ``distortion``; the score
+    is divided by the number of rows of X.
+
+    Parameters
+    ----------
+    X : numpy array of shape (nb_data, nb_feature)
+    cluster_estimator : estimator object
+        Must provide a method fit_predict: X -> labels. It is refitted on
+        every draw.
+    nb_draw : int
+        Number of random datasets to draw.
+    val_min : array of length nb_feature, optional
+        Lower bound of each feature. Defaults to the column-wise minimum
+        of X.
+    val_max : array of length nb_feature, optional
+        Upper bound of each feature. Defaults to the column-wise maximum
+        of X.
+    distortion_meth : callable or str
+        Forwarded unchanged to ``distortion`` (for instance 'sqeuclidean'
+        or 'cityblock').
+    p : double
+        Forwarded unchanged to ``distortion`` (the p-norm used by
+        Minkowski-like distances).
+    random_state : int, RandomState instance or None
+        Passed to check_random_state to obtain the generator used for the
+        draws.
+
+    Returns
+    -------
+    dist : list of float
+        One normalised distortion per random dataset, in drawing order.
+    """
+    rng = check_random_state(random_state)
+    lower = np.min(X, axis=0) if val_min is None else val_min
+    upper = np.max(X, axis=0) if val_max is None else val_max
+
+    def one_draw():
+        # Stretch a unit-cube sample onto the [lower, upper] box.
+        unit_sample = rng.uniform(size=X.shape)
+        sample = unit_sample * (upper - lower) + lower
+        labels = cluster_estimator.fit_predict(sample)
+        return distortion(sample, labels, distortion_meth, p) / X.shape[0]
+
+    return [one_draw() for _ in range(nb_draw)]
+
+
+def gap_statistic(X, cluster_estimator, k_max=None, nb_draw=10,
+                  random_state=None, draw_model='uniform',
+                  distortion_meth='sqeuclidean', p=2):
+    """
+    Estimate the number of clusters in X with the gap statistic.
+
+    The distortion of the clustered real data is compared with the
+    distortion of clustered random reference data. Let D_rand(k) be the
+    distortion of random data in k clusters and D_real(k) the distortion
+    of real data in k clusters; the gap statistic is defined as
+
+    Gap(k) = E(log(D_rand(k))) - log(D_real(k))
+
+    nb_draw random datasets shaped like X are drawn (their distribution
+    depends on draw_model). The smallest k whose gap is at least the gap
+    obtained with k + 1 clusters minus a "standard-error" safety is
+    selected. Precisely:
+
+    k_star = min_k k
+         s.t. Gap(k) >= Gap(k + 1) - s(k + 1)
+              s(k) = stdev(log(D_rand(k))) * sqrt(1 + 1 / nb_draw)
+
+    From R. Tibshirani, G. Walther and T. Hastie, Estimating the number of
+    clusters in a dataset via the Gap statistic, Journal of the Royal
+    Statistical Society: Series B (Statistical Methodology), 63(2), 411-423
+
+    Parameters
+    ----------
+    X : numpy array of shape (nb_data, nb_feature)
+    cluster_estimator : estimator object
+        Must provide set_params(n_clusters=...) and a method
+        fit_predict: X -> labels. Its n_clusters parameter is overwritten
+        and it is refitted many times.
+    k_max : int, optional
+        Largest number of clusters that can be returned. Defaults to
+        nb_data // 2.
+    nb_draw : int
+        Number of random datasets of shape (nb_data, nb_feature) drawn to
+        estimate E(log(D_rand(k))).
+    random_state : int, RandomState instance or None
+        Seed or generator used for every random draw; a fixed seed gives
+        reproducible results.
+    draw_model : 'uniform' or 'normal'
+        Distribution of the reference data. 'uniform' (default, following
+        Tibshirani et al.) draws in the bounding box of X; 'normal' scales
+        X with preprocessing.scale and draws standard normal data.
+    distortion_meth : callable or str
+        Forwarded unchanged to ``distortion`` (for instance 'sqeuclidean'
+        or 'cityblock').
+    p : double
+        Forwarded unchanged to ``distortion`` (the p-norm used by
+        Minkowski-like distances).
+
+    Returns
+    -------
+    k : int
+        Smallest k in [1, k_max] satisfying the criterion above, or 1 when
+        no such k exists.
+
+    Raises
+    ------
+    ValueError
+        If draw_model is neither 'uniform' nor 'normal'.
+    """
+    # Reject an unknown model before doing any (expensive) clustering.
+    if draw_model not in ('uniform', 'normal'):
+        raise ValueError(
+            "For gap statistic, model for random data is unknown")
+
+    rng = check_random_state(random_state)
+
+    # if no maximum number of clusters set, take datasize divided by 2
+    if not k_max:
+        k_max = X.shape[0] // 2
+    if draw_model == 'uniform':
+        val_min = np.min(X, axis=0)
+        val_max = np.max(X, axis=0)
+    else:
+        X = preprocessing.scale(X)
+
+    nb_data = X.shape[0]
+    safety_factor = sqrt(1 + 1.0 / nb_draw)
+    previous_gap = None
+    # Gap(k_max + 1) is needed to be able to accept k_max itself.
+    for k in range(1, k_max + 2):
+        cluster_estimator.set_params(n_clusters=k)
+        # Same per-sample normalisation as the reference distortions, so
+        # that both terms of the gap are on the same scale.
+        real_dist = distortion(X, cluster_estimator.fit_predict(X),
+                               distortion_meth, p) / nb_data
+        # The shared generator is handed to the helpers so that
+        # random_state really controls the reference draws.
+        if draw_model == 'uniform':
+            rand_dist = uniform_distortion(
+                X, cluster_estimator, nb_draw, val_min, val_max,
+                distortion_meth, p, random_state=rng)
+        else:
+            rand_dist = normal_distortion(
+                X, cluster_estimator, nb_draw, distortion_meth, p,
+                random_state=rng)
+        log_rand_dist = np.log(rand_dist)
+        gap = np.mean(log_rand_dist) - log(real_dist)
+        safety = np.std(log_rand_dist) * safety_factor
+        # There is no Gap(0): the criterion is only tested from k = 2 on,
+        # and the first k - 1 that satisfies it is the answer.
+        if previous_gap is not None and previous_gap >= gap - safety:
+            return k - 1
+        previous_gap = gap
+    return 1
@@ -0,0 +1,33 @@
+import numpy as np
+
+from sklearn.utils.testing import (assert_true, assert_equal)
+
+from sklearn.cluster.k_means_ import KMeans
+from sklearn.metrics.cluster.gap_statistic import (normal_distortion,
+                                                   gap_statistic)
+from sklearn.datasets import make_blobs
+
+
+def test_normal_distortion():
+    # Stub estimator: ignores the data and labels the first half of the
+    # samples 1 and the remaining ones 0.
+    class BogusCluster(object):
+        def fit_predict(self, points):
+            n_points = len(points)
+            half = n_points / 2
+            return [1 if idx < half else 0 for idx in range(n_points)]
+
+    draws = normal_distortion(np.zeros((100, 2)), BogusCluster(),
+                              nb_draw=10, random_state=0)
+    mean_dist = np.mean(draws)
+    # The test expects a mean distortion of about 1 over the 10 draws of
+    # 100 points each: it must lie strictly between .9 and 1.1.
+    assert_true(mean_dist > .9 and mean_dist < 1.1)
+
+
+def test_gap_statistic():
+    # 90 samples generated by make_blobs around three fixed 2-D centers;
+    # both configurations below are expected to find 3 clusters.
+    centers = np.array([[-2, -2], [2, 0], [-2, 2]])
+    X, _ = make_blobs(90, centers=centers, random_state=0)
+    cluster_estimator = KMeans()
+
+    # Gaussian reference data, default distortion.
+    k_normal = gap_statistic(X, cluster_estimator, k_max=6, nb_draw=10,
+                             random_state=0, draw_model='normal')
+    assert_equal(k_normal, 3)
+
+    # Default (uniform) reference data, cityblock distortion.
+    k_cityblock = gap_statistic(X, cluster_estimator, k_max=6, nb_draw=10,
+                                random_state=0, distortion_meth='cityblock')
+    assert_equal(k_cityblock, 3)