From bf4c3e23978d827c454fd94bf0e17d793c3af5f0 Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Fri, 25 Nov 2016 17:27:35 +0100
Subject: [PATCH 01/12] Adding davies_bouldin_index calculation

1. sklearn/metrics/cluster/unsupervised.py - calculation itself
2. sklearn/metrics/cluster/tests/test_unsupervised.py - tests
3. sklearn/metrics/cluster/__init__.py - exposing the function
---
 sklearn/metrics/cluster/__init__.py           |  4 +-
 .../cluster/tests/test_unsupervised.py        | 30 ++++++++++
 sklearn/metrics/cluster/unsupervised.py       | 60 +++++++++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py
index 4cda1108ece32..3f8edd65a5758 100644
--- a/sklearn/metrics/cluster/__init__.py
+++ b/sklearn/metrics/cluster/__init__.py
@@ -20,6 +20,7 @@
 from .unsupervised import silhouette_samples
 from .unsupervised import silhouette_score
 from .unsupervised import calinski_harabaz_score
+from .unsupervised import davies_bouldin_index
 from .bicluster import consensus_score
 
 __all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
@@ -27,4 +28,5 @@
            "expected_mutual_information", "homogeneity_completeness_v_measure",
            "homogeneity_score", "mutual_info_score", "v_measure_score",
            "fowlkes_mallows_score", "entropy", "silhouette_samples",
-           "silhouette_score", "calinski_harabaz_score", "consensus_score"]
+           "silhouette_score", "calinski_harabaz_score",
+           "davies_bouldin_index", "consensus_score"]
diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py
index 55715d0c35cdd..2d3b10bc7763e 100644
--- a/sklearn/metrics/cluster/tests/test_unsupervised.py
+++ b/sklearn/metrics/cluster/tests/test_unsupervised.py
@@ -14,6 +14,7 @@
 from sklearn.metrics.cluster import silhouette_samples
 from sklearn.metrics import pairwise_distances
 from sklearn.metrics.cluster import calinski_harabaz_score
+from sklearn.metrics.cluster import davies_bouldin_index
 
 
 def test_silhouette():
@@ -146,3 +147,32 @@ def test_calinski_harabaz_score():
     labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
     assert_almost_equal(calinski_harabaz_score(X, labels),
                         45 * (40 - 4) / (5 * (4 - 1)))
+
+
+def test_davies_bouldin_index():
+    rng = np.random.RandomState(seed=0)
+
+    # Assert message when there is only one label
+    assert_raise_message(ValueError, "Number of labels is",
+                         davies_bouldin_index,
+                         rng.rand(10, 2), np.zeros(10))
+
+    # Assert message when all points are in different clusters
+    assert_raise_message(ValueError, "Number of labels is",
+                         davies_bouldin_index,
+                         rng.rand(10, 2), np.arange(10))
+
+    # Assert the value is 0. when all samples are equal
+    assert_equal(0., davies_bouldin_index(np.ones((10, 2)),
+                                          [0] * 5 + [1] * 5))
+
+    # Assert the value is 0. when all the cluster means are equal
+    assert_equal(0., davies_bouldin_index([[-1, -1], [1, 1]] * 10,
+                                          [0] * 10 + [1] * 10))
+
+    # General case (with non numpy arrays)
+    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
+         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
+    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
+    assert_almost_equal(davies_bouldin_index(X, labels),
+                        2*np.sqrt(0.5)/3)
diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 1aed8e72a654b..4350788b41f68 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -255,3 +255,63 @@ def calinski_harabaz_score(X, labels):
     return (1. if intra_disp == 0. else
             extra_disp * (n_samples - n_labels) /
             (intra_disp * (n_labels - 1.)))
+
+
+def davies_bouldin_index(X, labels):
+    """Compute the Davies Bouldin index.
+
+    The index is defiend as the ratio of within-cluster
+    and between-cluster distances.
+
+    Parameters
+    ----------
+    X : array-like, shape (``n_samples``, ``n_features``)
+        List of ``n_features``-dimensional data points. Each row corresponds
+        to a single data point.
+
+    labels : array-like, shape (``n_samples``,)
+        Predicted labels for each sample.
+
+    Returns
+    -------
+    score : float
+        The resulting Davies-Bouldin index.
+
+    References
+    ----------
+    .. [1] Davies, David L.; Bouldin, Donald W. (1979).
+       "A Cluster Separation Measure". IEEE Transactions on
+       Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227
+    """
+
+    X, labels = check_X_y(X, labels)
+    le = LabelEncoder()
+    labels = le.fit_transform(labels)
+    n_samples, _ = X.shape
+    n_labels = len(le.classes_)
+
+    check_number_of_labels(n_labels, n_samples)
+    clusters_data = {}
+    for k in range(n_labels):
+        cluster_k = X[labels == k]
+        mean_k = np.mean(cluster_k, axis=0)
+        d_k = np.average(pairwise_distances(cluster_k, mean_k))
+        clusters_data[k] = (mean_k, d_k)
+
+    score = 0
+    for i in range(n_labels):
+        max_score = 0
+        mean_i, d_i = clusters_data[i]
+        for j in range(n_labels):
+            if i == j:
+                continue
+            mean_j, d_j = clusters_data[j]
+            mean_distance = np.linalg.norm(mean_i - mean_j)
+
+            if mean_distance == 0:
+                curr_score = 0
+            else:
+                curr_score = (d_i + d_j)/mean_distance
+            max_score = max(curr_score, max_score)
+        score += max_score
+    return score/n_labels

From b9230d30412fd4aa5478a3a2275d85835a5ad57e Mon Sep 17 00:00:00 2001
From: Tom Ron <rontom@gmail.com>
Date: Sat, 26 Nov 2016 19:12:21 +0100
Subject: [PATCH 02/12] Expose function at metrics level

---
 sklearn/metrics/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 413831939fbbc..8625dae427bfc 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -43,6 +43,7 @@
 from .cluster import silhouette_samples
 from .cluster import silhouette_score
 from .cluster import calinski_harabaz_score
+from .cluster import davies_bouldin_index
 from .cluster import v_measure_score
 
 from .pairwise import euclidean_distances

From 06c1df06ede2230115cfbfd3a205d049a24f8a51 Mon Sep 17 00:00:00 2001
From: Tom Ron <rontom@gmail.com>
Date: Sat, 26 Nov 2016 19:13:15 +0100
Subject: [PATCH 03/12] Adding documentation and usage example

---
 doc/modules/clustering.rst | 74 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 34bb3b678a12f..10a2759705c28 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1569,3 +1569,77 @@ Drawbacks
  *  Caliński, T., & Harabasz, J. (1974). "A dendrite method for cluster
     analysis". Communications in Statistics-theory and Methods 3: 1-27.
     `doi:10.1080/03610926.2011.560741 <http://dx.doi.org/10.1080/03610926.2011.560741>`_.
+
+.. _davies–bouldin_index:
+
+Davies–Bouldin Index
+----------------------
+
+If the ground truth labels are not known, the Davies–Bouldin index
+(:func:`sklearn.metrics.davies_bouldin_index`) can be used to evaluate the
+model, where a lower Davies–Bouldin Index relates to a model with better
+separation between clusters.
+
+For :math:`k` clusters, the Davies–Bouldin index :math:`DB` is given as the
+ratio of within cluster-mean distance to the between means distance.
+
+.. math::
+  DB(k) = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} D_{ij}
+
+Where :math:`D_ij` is the ratio between the within distances in clusters
+:math:`i` and :math:`j` and the distance between the means of cluster
+:math:`i` and :math:`j`.
+
+.. math::
+  D_ij = \frac{\bar{d_i}+\bar{d_j}}{d_ij}
+
+:math:`\bar{d_i}` is the average distance between each point cluster
+:math:`i` and the centroid of cluster :math:`i`.
+:math:`\bar{d_i}` is the diameter of cluster :math:`i`.
+
+:math:`\bar{d_j}` is the average distance between each point cluster
+:math:`j` and the centroid of cluster :math:`j`.
+:math:`\bar{d_j}` is the diameter of cluster :math:`j`.
+
+:math:`d_ij` is the Euclidean distance between the centroid of cluster
+:math:`i` and the centroid of cluster :math:`j`.
+
+
+  >>> from sklearn import metrics
+  >>> from sklearn.metrics import pairwise_distances
+  >>> from sklearn import datasets
+  >>> dataset = datasets.load_iris()
+  >>> X = dataset.data
+  >>> y = dataset.target
+
+In normal usage, the Davies-Bouldin index is applied to the results of a
+cluster analysis.
+
+  >>> import numpy as np
+  >>> from sklearn.cluster import KMeans
+  >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
+  >>> labels = kmeans_model.labels_
+  >>> metrics.davies_bouldin_index(X, labels)  # doctest: +ELLIPSIS
+  0.6623...
+
+
+Advantages
+~~~~~~~~~~
+
+- The computation of the Davies-Bouldin index is simpler than the computation
+  of the Silhouette index.
+
+Drawbacks
+~~~~~~~~~
+
+- The Davies-Bouldin index is generally higher for convex clusters than other
+  concepts of clusters, such as density based clusters like those obtained
+  through DBSCAN.
+
+.. topic:: References
+
+ *  Davies, David L.; Bouldin, Donald W. (1979).
+    "A Cluster Separation Measure"
+    IEEE Transactions on Pattern Analysis and Machine Intelligence.
+    PAMI-1 (2): 224–227. 
+    `doi:10.1109/TPAMI.1979.4766909 <http://dx.doi.org/10.1109/TPAMI.1979.4766909>`_.
\ No newline at end of file

From a9ccce28866a8beee0e5c0b21deb35640804270b Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Mon, 28 Nov 2016 09:39:44 +0100
Subject: [PATCH 04/12] Fix - adding davies_bouldin_index to all list

---
 sklearn/metrics/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 8625dae427bfc..7975c2571b508 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -74,6 +74,7 @@
     'confusion_matrix',
     'consensus_score',
     'coverage_error',
+    'davies_bouldin_index',
     'euclidean_distances',
     'explained_variance_score',
     'f1_score',

From 6870cbc960c2508923373a6f9ebae1e8cea83c8b Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Mon, 28 Nov 2016 11:38:25 +0100
Subject: [PATCH 05/12] Typo fix: defiend -> defined

---
 sklearn/metrics/cluster/unsupervised.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 4350788b41f68..3ca133f91c876 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -260,7 +260,7 @@ def calinski_harabaz_score(X, labels):
 def davies_bouldin_index(X, labels):
     """Compute the Davies Bouldin index.
 
-    The index is defiend as the ratio of within-cluster
+    The index is defined as the ratio of within-cluster
     and between-cluster distances.
 
     Parameters

From a55fafe13566b6668cbc2b2644535cbbb7443c3e Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Tue, 29 Nov 2016 10:58:28 +0100
Subject: [PATCH 06/12] Test cluster with one sample

Adding a test to validate that the code runs correctly when
there is one sample in a cluster (not in all clusters, this is
already validated).
---
 sklearn/metrics/cluster/tests/test_unsupervised.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py
index 2d3b10bc7763e..881ef34457581 100644
--- a/sklearn/metrics/cluster/tests/test_unsupervised.py
+++ b/sklearn/metrics/cluster/tests/test_unsupervised.py
@@ -176,3 +176,9 @@ def test_davies_bouldin_index():
     labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
     assert_almost_equal(davies_bouldin_index(X, labels),
                         2*np.sqrt(0.5)/3)
+
+    # General case - a cluster has only one sample
+    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
+    labels = [0, 0, 1, 2]
+    assert_almost_equal(davies_bouldin_index(X, labels),
+                        (5./4)/3)

From cdb0cb8e98d144baf7a7ed096804c820054a271a Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Tue, 29 Nov 2016 11:01:35 +0100
Subject: [PATCH 07/12] Fix mathematic notation

---
 doc/modules/clustering.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 10a2759705c28..4d73965c46281 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1586,22 +1586,22 @@ ratio of within cluster-mean distance to the between means distance.
 .. math::
   DB(k) = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} D_{ij}
 
-Where :math:`D_ij` is the ratio between the within distances in clusters
+Where :math:`D_{ij}` is the ratio between the within distances in clusters
 :math:`i` and :math:`j` and the distance between the means of cluster
 :math:`i` and :math:`j`.
 
 .. math::
-  D_ij = \frac{\bar{d_i}+\bar{d_j}}{d_ij}
+  D_{ij} = \frac{\bar{d_i}+\bar{d_j}}{d_{ij}}
 
-:math:`\bar{d_i}` is the average distance between each point cluster
+:math:`\bar{d_i}` is the average distance between each point in cluster
 :math:`i` and the centroid of cluster :math:`i`.
 :math:`\bar{d_i}` is the diameter of cluster :math:`i`.
 
-:math:`\bar{d_j}` is the average distance between each point cluster
+:math:`\bar{d_j}` is the average distance between each point in cluster
 :math:`j` and the centroid of cluster :math:`j`.
 :math:`\bar{d_j}` is the diameter of cluster :math:`j`.
 
-:math:`d_ij` is the Euclidean distance between the centroid of cluster
+:math:`d_{ij}` is the Euclidean distance between the centroid of cluster
 :math:`i` and the centroid of cluster :math:`j`.
 
 

From 89845784822f7fe409fe3a319ee19bb7d3a3084b Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Tue, 29 Nov 2016 11:02:02 +0100
Subject: [PATCH 08/12] Add drawback - euclidean distance only

---
 doc/modules/clustering.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 4d73965c46281..dbc3d6ff0350f 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1635,6 +1635,7 @@ Drawbacks
 - The Davies-Bouldin index is generally higher for convex clusters than other
   concepts of clusters, such as density based clusters like those obtained
   through DBSCAN.
+- The usage of centroid distance limits the distance metric to Euclidean space.
 
 .. topic:: References
 

From 6586a2b43307e11063d5805d9254d50e03b005a1 Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Tue, 29 Nov 2016 11:03:33 +0100
Subject: [PATCH 09/12] mean_k -> [mean_k]

---
 sklearn/metrics/cluster/unsupervised.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 3ca133f91c876..8bb14a9bce1c7 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -295,7 +295,7 @@ def davies_bouldin_index(X, labels):
     for k in range(n_labels):
         cluster_k = X[labels == k]
         mean_k = np.mean(cluster_k, axis=0)
-        d_k = np.average(pairwise_distances(cluster_k, mean_k))
+        d_k = np.average(pairwise_distances(cluster_k, [mean_k]))
         clusters_data[k] = (mean_k, d_k)
 
     score = 0

From 8a4bd265ce60e211dd20d63136ba3d29762b07a1 Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Wed, 30 Nov 2016 15:16:36 +0100
Subject: [PATCH 10/12] Efficiency and style fixes

1. average intracluster distances and centroids in separate numpy arrays
2. Adjust code for the case where a cluster has a single sample
---
 sklearn/metrics/cluster/unsupervised.py | 34 ++++++++++---------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 8bb14a9bce1c7..6420b8feeb4ad 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -292,26 +292,20 @@ def davies_bouldin_index(X, labels):
 
     check_number_of_labels(n_labels, n_samples)
     clusters_data = {}
+    intra_dists = np.zeros(n_labels)
+    centroids = np.zeros((n_labels, len(X[0])), np.float32)
     for k in range(n_labels):
         cluster_k = X[labels == k]
         mean_k = np.mean(cluster_k, axis=0)
-        d_k = np.average(pairwise_distances(cluster_k, [mean_k]))
-        clusters_data[k] = (mean_k, d_k)
-
-    score = 0
-    for i in range(n_labels):
-        max_score = 0
-        mean_i, d_i = clusters_data[i]
-        for j in range(n_labels):
-            if i == j:
-                continue
-            mean_j, d_j = clusters_data[j]
-            mean_distance = np.linalg.norm(mean_i - mean_j)
-
-            if mean_distance == 0:
-                curr_score = 0
-            else:
-                curr_score = (d_i + d_j)/mean_distance
-            max_score = max(curr_score, max_score)
-        score += max_score
-    return score/n_labels
+        centroids[k] = mean_k
+        intra_dists[k] = np.average(pairwise_distances(cluster_k, [mean_k]))
+    centroid_distances = pairwise_distances(centroids)
+    with np.errstate(divide='ignore', invalid='ignore'):
+        if np.all((intra_dists[:, None] + intra_dists)==0.0) or \
+           np.all(centroid_distances == 0.0):
+           return 0.0
+        scores = (intra_dists[:, None] + intra_dists)/centroid_distances
+        # remove inf values
+        scores[scores == np.inf] = np.nan
+        return np.mean(np.nanmax(scores, axis=1))
+

From 95614be9f8db5c8dcbeaa98d972bc09bfd3531f5 Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Wed, 30 Nov 2016 15:23:07 +0100
Subject: [PATCH 11/12] Remove unused variable

---
 doc/modules/clustering.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index dbc3d6ff0350f..66002c5770ac6 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1610,7 +1610,6 @@ Where :math:`D_{ij}` is the ratio between the within distances in clusters
   >>> from sklearn import datasets
   >>> dataset = datasets.load_iris()
   >>> X = dataset.data
-  >>> y = dataset.target
 
 In normal usage, the Davies-Bouldin index is applied to the results of a
 cluster analysis.

From 8b6c212aa83527c2a556c066401af6fb9a8ad2a2 Mon Sep 17 00:00:00 2001
From: Tom Ron <tom.ron@magicinternet.de>
Date: Wed, 30 Nov 2016 15:25:35 +0100
Subject: [PATCH 12/12] Flakes and pep8 fixes

---
 sklearn/metrics/cluster/unsupervised.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 6420b8feeb4ad..0eeb1acb7b54a 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -291,7 +291,6 @@ def davies_bouldin_index(X, labels):
     n_labels = len(le.classes_)
 
     check_number_of_labels(n_labels, n_samples)
-    clusters_data = {}
     intra_dists = np.zeros(n_labels)
     centroids = np.zeros((n_labels, len(X[0])), np.float32)
     for k in range(n_labels):
@@ -301,11 +300,10 @@ def davies_bouldin_index(X, labels):
         intra_dists[k] = np.average(pairwise_distances(cluster_k, [mean_k]))
     centroid_distances = pairwise_distances(centroids)
     with np.errstate(divide='ignore', invalid='ignore'):
-        if np.all((intra_dists[:, None] + intra_dists)==0.0) or \
+        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
            np.all(centroid_distances == 0.0):
-           return 0.0
+            return 0.0
         scores = (intra_dists[:, None] + intra_dists)/centroid_distances
         # remove inf values
         scores[scores == np.inf] = np.nan
         return np.mean(np.nanmax(scores, axis=1))
-