From bf4c3e23978d827c454fd94bf0e17d793c3af5f0 Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Fri, 25 Nov 2016 17:27:35 +0100 Subject: [PATCH 01/12] Adding davies_bouldin_index calculation 1. sklearn/metrics/cluster/unsupervised.py - calculation itself 2. sklearn/metrics/cluster/tests/test_unsupervised.py - tests 3. sklearn/metrics/cluster/__init__.py - exposing the function --- sklearn/metrics/cluster/__init__.py | 4 +- .../cluster/tests/test_unsupervised.py | 30 ++++++++++ sklearn/metrics/cluster/unsupervised.py | 60 +++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 4cda1108ece32..3f8edd65a5758 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -20,6 +20,7 @@ from .unsupervised import silhouette_samples from .unsupervised import silhouette_score from .unsupervised import calinski_harabaz_score +from .unsupervised import davies_bouldin_index from .bicluster import consensus_score __all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score", @@ -27,4 +28,5 @@ "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", "fowlkes_mallows_score", "entropy", "silhouette_samples", - "silhouette_score", "calinski_harabaz_score", "consensus_score"] + "silhouette_score", "calinski_harabaz_score", + "davies_bouldin_index", "consensus_score"] diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 55715d0c35cdd..2d3b10bc7763e 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -14,6 +14,7 @@ from sklearn.metrics.cluster import silhouette_samples from sklearn.metrics import pairwise_distances from sklearn.metrics.cluster import calinski_harabaz_score +from sklearn.metrics.cluster import davies_bouldin_index def 
test_silhouette(): @@ -146,3 +147,32 @@ def test_calinski_harabaz_score(): labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 assert_almost_equal(calinski_harabaz_score(X, labels), 45 * (40 - 4) / (5 * (4 - 1))) + + +def test_davies_bouldin_index(): + rng = np.random.RandomState(seed=0) + + # Assert message when there is only one label + assert_raise_message(ValueError, "Number of labels is", + davies_bouldin_index, + rng.rand(10, 2), np.zeros(10)) + + # Assert message when all point are in different clusters + assert_raise_message(ValueError, "Number of labels is", + davies_bouldin_index, + rng.rand(10, 2), np.arange(10)) + + # Assert the value is 0. when all samples are equals + assert_equal(0., davies_bouldin_index(np.ones((10, 2)), + [0] * 5 + [1] * 5)) + + # Assert the value is 0. when all the mean cluster are equal + assert_equal(0., davies_bouldin_index([[-1, -1], [1, 1]] * 10, + [0] * 10 + [1] * 10)) + + # General case (with non numpy arrays) + X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 + + [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5) + labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 + assert_almost_equal(davies_bouldin_index(X, labels), + 2*np.sqrt(0.5)/3) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 1aed8e72a654b..4350788b41f68 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -255,3 +255,63 @@ def calinski_harabaz_score(X, labels): return (1. if intra_disp == 0. else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.))) + + +def davies_bouldin_index(X, labels): + """Compute the Davies Bouldin index. + + The index is defiend as the ratio of within-cluster + and between-cluster distances. + + Parameters + ---------- + X : array-like, shape (``n_samples``, ``n_features``) + List of ``n_features``-dimensional data points. Each row corresponds + to a single data point. 
+ + labels : array-like, shape (``n_samples``,) + Predicted labels for each sample. + + Returns + ------- + score : float + The resulting Davies-Bouldin index. + + References + ---------- + .. [1] `Davies, David L.; Bouldin, Donald W. (1979). + "A Cluster Separation Measure". IEEE Transactions on + Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227`_ + """ + + X, labels = check_X_y(X, labels) + le = LabelEncoder() + labels = le.fit_transform(labels) + n_samples, _ = X.shape + n_labels = len(le.classes_) + + check_number_of_labels(n_labels, n_samples) + clusters_data = {} + for k in range(n_labels): + cluster_k = X[labels == k] + mean_k = np.mean(cluster_k, axis=0) + d_k = np.average(pairwise_distances(cluster_k, mean_k)) + clusters_data[k] = (mean_k, d_k) + + score = 0 + for i in range(n_labels): + max_score = 0 + mean_i, d_i = clusters_data[i] + for j in range(n_labels): + if i == j: + continue + mean_j, d_j = clusters_data[j] + mean_distance = np.linalg.norm(mean_i - mean_j) + + if mean_distance == 0: + curr_score = 0 + else: + curr_score = (d_i + d_j)/mean_distance + max_score = max(curr_score, max_score) + score += max_score + return score/n_labels From b9230d30412fd4aa5478a3a2275d85835a5ad57e Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Sat, 26 Nov 2016 19:12:21 +0100 Subject: [PATCH 02/12] Expose function at metrics level --- sklearn/metrics/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 413831939fbbc..8625dae427bfc 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -43,6 +43,7 @@ from .cluster import silhouette_samples from .cluster import silhouette_score from .cluster import calinski_harabaz_score +from .cluster import davies_bouldin_index from .cluster import v_measure_score from .pairwise import euclidean_distances From 06c1df06ede2230115cfbfd3a205d049a24f8a51 Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Sat, 26 Nov 2016 19:13:15 +0100 
Subject: [PATCH 03/12] Adding documentation and usage example --- doc/modules/clustering.rst | 74 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 34bb3b678a12f..10a2759705c28 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1569,3 +1569,77 @@ Drawbacks * Caliński, T., & Harabasz, J. (1974). "A dendrite method for cluster analysis". Communications in Statistics-theory and Methods 3: 1-27. `doi:10.1080/03610926.2011.560741 `_. + +.. _davies–bouldin_index: + +Davies–Bouldin Index +---------------------- + +If the ground truth labels are not known, the Davies–Bouldin index +(:func:`sklearn.metrics.davies_bouldin_index`) can be used to evaluate the +model, where a lower Davies–Bouldin Index relates to a model with better +separation between clusters. + +For :math:`k` clusters, the Davies–Bouldin index :math:`DB` is given as the +ratio of within cluster-mean distance to the between means distance. + +.. math:: + DB(k) = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} D_{ij} + +Where :math:`D_ij` is the ratio between the within distances in clusters +:math:`i` and :math:`j` and the distance between the means of cluster +:math:`i` and :math:`j`. + +.. math:: + D_ij = \frac{\bar{d_i}+\bar{d_j}}{d_ij} + +:math:`\bar{d_i}` is the average distance between each point cluster +:math:`i` and the centroid of cluster :math:`i`. +:math:`\bar{d_i}` is the diameter of cluster :math:`i`. + +:math:`\bar{d_j}` is the average distance between each point cluster +:math:`j` and the centroid of cluster :math:`j`. +:math:`\bar{d_j}` is the diameter of cluster :math:`j`. + +:math:`d_ij` is the Euclidean distance between the centroid of cluster +:math:`i` and the centroid of cluster :math:`j`. 
+ + + >>> from sklearn import metrics + >>> from sklearn.metrics import pairwise_distances + >>> from sklearn import datasets + >>> dataset = datasets.load_iris() + >>> X = dataset.data + >>> y = dataset.target + +In normal usage, the Davies-Bouldin index is applied to the results of a +cluster analysis. + + >>> import numpy as np + >>> from sklearn.cluster import KMeans + >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X) + >>> labels = kmeans_model.labels_ + >>> metrics.davies_bouldin_index(X, labels) # doctest: +ELLIPSIS + 0.6623... + + +Advantages +~~~~~~~~~~ + +- The computation of the Davies-Bouldin index is simpler than the computation + of the Silhouette index. + +Drawbacks +~~~~~~~~~ + +- The Davies-Bouldin index is generally higher for convex clusters than other + concepts of clusters, such as density based clusters like those obtained + through DBSCAN. + +.. topic:: References + + * Davies, David L.; Bouldin, Donald W. (1979). + "A Cluster Separation Measure" + IEEE Transactions on Pattern Analysis and Machine Intelligence. + PAMI-1 (2): 224–227. + `doi:10.1109/TPAMI.1979.4766909 `_. 
\ No newline at end of file From a9ccce28866a8beee0e5c0b21deb35640804270b Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Mon, 28 Nov 2016 09:39:44 +0100 Subject: [PATCH 04/12] Fix - adding davies_bouldin_index to all list --- sklearn/metrics/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 8625dae427bfc..7975c2571b508 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -74,6 +74,7 @@ 'confusion_matrix', 'consensus_score', 'coverage_error', + 'davies_bouldin_index', 'euclidean_distances', 'explained_variance_score', 'f1_score', From 6870cbc960c2508923373a6f9ebae1e8cea83c8b Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Mon, 28 Nov 2016 11:38:25 +0100 Subject: [PATCH 05/12] Typo fix: defiend -> defined --- sklearn/metrics/cluster/unsupervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 4350788b41f68..3ca133f91c876 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -260,7 +260,7 @@ def calinski_harabaz_score(X, labels): def davies_bouldin_index(X, labels): """Compute the Davies Bouldin index. - The index is defiend as the ratio of within-cluster + The index is defined as the ratio of within-cluster and between-cluster distances. Parameters From a55fafe13566b6668cbc2b2644535cbbb7443c3e Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Tue, 29 Nov 2016 10:58:28 +0100 Subject: [PATCH 06/12] Test cluster with one sample Adding a test to validate that the code run correctly when there is one sample in a cluster (not in all clusters, this is already validated). 
--- sklearn/metrics/cluster/tests/test_unsupervised.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 2d3b10bc7763e..881ef34457581 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -176,3 +176,9 @@ def test_davies_bouldin_index(): labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 assert_almost_equal(davies_bouldin_index(X, labels), 2*np.sqrt(0.5)/3) + + # General case - cluster have one sample + X = ([[0, 0], [2, 2], [3, 3], [5, 5]]) + labels = [0, 0, 1, 2] + assert_almost_equal(davies_bouldin_index(X, labels), + (5./4)/3) From cdb0cb8e98d144baf7a7ed096804c820054a271a Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Tue, 29 Nov 2016 11:01:35 +0100 Subject: [PATCH 07/12] Fix mathematic notation --- doc/modules/clustering.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 10a2759705c28..4d73965c46281 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1586,22 +1586,22 @@ ratio of within cluster-mean distance to the between means distance. .. math:: DB(k) = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} D_{ij} -Where :math:`D_ij` is the ratio between the within distances in clusters +Where :math:`D_{ij}` is the ratio between the within distances in clusters :math:`i` and :math:`j` and the distance between the means of cluster :math:`i` and :math:`j`. .. math:: - D_ij = \frac{\bar{d_i}+\bar{d_j}}{d_ij} + D_{ij} = \frac{\bar{d_i}+\bar{d_j}}{d_{ij}} -:math:`\bar{d_i}` is the average distance between each point cluster +:math:`\bar{d_i}` is the average distance between each point in cluster :math:`i` and the centroid of cluster :math:`i`. :math:`\bar{d_i}` is the diameter of cluster :math:`i`. 
-:math:`\bar{d_j}` is the average distance between each point cluster +:math:`\bar{d_j}` is the average distance between each point in cluster :math:`j` and the centroid of cluster :math:`j`. :math:`\bar{d_j}` is the diameter of cluster :math:`j`. -:math:`d_ij` is the Euclidean distance between the centroid of cluster +:math:`d_{ij}` is the Euclidean distance between the centroid of cluster :math:`i` and the centroid of cluster :math:`j`. From 89845784822f7fe409fe3a319ee19bb7d3a3084b Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Tue, 29 Nov 2016 11:02:02 +0100 Subject: [PATCH 08/12] Add drawback - euclidean distance only --- doc/modules/clustering.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 4d73965c46281..dbc3d6ff0350f 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1635,6 +1635,7 @@ Drawbacks - The Davies-Bouldin index is generally higher for convex clusters than other concepts of clusters, such as density based clusters like those obtained through DBSCAN. +- The usage of centroid distance limits the distance metric to Euclidean space. .. 
topic:: References From 6586a2b43307e11063d5805d9254d50e03b005a1 Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Tue, 29 Nov 2016 11:03:33 +0100 Subject: [PATCH 09/12] mean_k -> [mean_k] --- sklearn/metrics/cluster/unsupervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 3ca133f91c876..8bb14a9bce1c7 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -295,7 +295,7 @@ def davies_bouldin_index(X, labels): for k in range(n_labels): cluster_k = X[labels == k] mean_k = np.mean(cluster_k, axis=0) - d_k = np.average(pairwise_distances(cluster_k, mean_k)) + d_k = np.average(pairwise_distances(cluster_k, [mean_k])) clusters_data[k] = (mean_k, d_k) score = 0 From 8a4bd265ce60e211dd20d63136ba3d29762b07a1 Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Wed, 30 Nov 2016 15:16:36 +0100 Subject: [PATCH 10/12] Efficiency and style fixes 1. average intracluster distances and centroids in separate numpy arrays 2. 
Adjust code for the case where a cluster have single sample --- sklearn/metrics/cluster/unsupervised.py | 34 ++++++++++--------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 8bb14a9bce1c7..6420b8feeb4ad 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -292,26 +292,20 @@ def davies_bouldin_index(X, labels): check_number_of_labels(n_labels, n_samples) clusters_data = {} + intra_dists = np.zeros(n_labels) + centroids = np.zeros((n_labels, len(X[0])), np.float32) for k in range(n_labels): cluster_k = X[labels == k] mean_k = np.mean(cluster_k, axis=0) - d_k = np.average(pairwise_distances(cluster_k, [mean_k])) - clusters_data[k] = (mean_k, d_k) - - score = 0 - for i in range(n_labels): - max_score = 0 - mean_i, d_i = clusters_data[i] - for j in range(n_labels): - if i == j: - continue - mean_j, d_j = clusters_data[j] - mean_distance = np.linalg.norm(mean_i - mean_j) - - if mean_distance == 0: - curr_score = 0 - else: - curr_score = (d_i + d_j)/mean_distance - max_score = max(curr_score, max_score) - score += max_score - return score/n_labels + centroids[k] = mean_k + intra_dists[k] = np.average(pairwise_distances(cluster_k, [mean_k])) + centroid_distances = pairwise_distances(centroids) + with np.errstate(divide='ignore', invalid='ignore'): + if np.all((intra_dists[:, None] + intra_dists)==0.0) or \ + np.all(centroid_distances == 0.0): + return 0.0 + scores = (intra_dists[:, None] + intra_dists)/centroid_distances + # remove inf values + scores[scores == np.inf] = np.nan + return np.mean(np.nanmax(scores, axis=1)) + From 95614be9f8db5c8dcbeaa98d972bc09bfd3531f5 Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Wed, 30 Nov 2016 15:23:07 +0100 Subject: [PATCH 11/12] Remove unused variable --- doc/modules/clustering.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/modules/clustering.rst 
b/doc/modules/clustering.rst index dbc3d6ff0350f..66002c5770ac6 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1610,7 +1610,6 @@ Where :math:`D_{ij}` is the ratio between the within distances in clusters >>> from sklearn import datasets >>> dataset = datasets.load_iris() >>> X = dataset.data - >>> y = dataset.target In normal usage, the Davies-Bouldin index is applied to the results of a cluster analysis. From 8b6c212aa83527c2a556c066401af6fb9a8ad2a2 Mon Sep 17 00:00:00 2001 From: Tom Ron Date: Wed, 30 Nov 2016 15:25:35 +0100 Subject: [PATCH 12/12] Flakes and pep8 fixes --- sklearn/metrics/cluster/unsupervised.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 6420b8feeb4ad..0eeb1acb7b54a 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -291,7 +291,6 @@ def davies_bouldin_index(X, labels): n_labels = len(le.classes_) check_number_of_labels(n_labels, n_samples) - clusters_data = {} intra_dists = np.zeros(n_labels) centroids = np.zeros((n_labels, len(X[0])), np.float32) for k in range(n_labels): @@ -301,11 +300,10 @@ def davies_bouldin_index(X, labels): intra_dists[k] = np.average(pairwise_distances(cluster_k, [mean_k])) centroid_distances = pairwise_distances(centroids) with np.errstate(divide='ignore', invalid='ignore'): - if np.all((intra_dists[:, None] + intra_dists)==0.0) or \ + if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \ np.all(centroid_distances == 0.0): - return 0.0 + return 0.0 scores = (intra_dists[:, None] + intra_dists)/centroid_distances # remove inf values scores[scores == np.inf] = np.nan return np.mean(np.nanmax(scores, axis=1)) -