diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 34bb3b678a12f..66002c5770ac6 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1569,3 +1569,77 @@ Drawbacks
  * Caliński, T., & Harabasz, J. (1974). "A dendrite method for cluster
    analysis". Communications in Statistics-theory and Methods 3: 1-27.
    `doi:10.1080/03610926.2011.560741 <https://doi.org/10.1080/03610926.2011.560741>`_.
+
+.. _davies-bouldin_index:
+
+Davies-Bouldin Index
+--------------------
+
+If the ground truth labels are not known, the Davies-Bouldin index
+(:func:`sklearn.metrics.davies_bouldin_index`) can be used to evaluate the
+model, where a lower Davies-Bouldin index relates to a model with better
+separation between the clusters.
+
+For :math:`k` clusters, the Davies-Bouldin index :math:`DB` is the average,
+over all clusters, of the largest ratio of within-cluster distances to
+between-cluster distances:
+
+.. math::
+   DB(k) = \frac{1}{k} \sum_{i=1}^k \max_{j \neq i} D_{ij}
+
+where :math:`D_{ij}` is the ratio between the within-cluster distances of
+clusters :math:`i` and :math:`j` and the distance between their centroids:
+
+.. math::
+   D_{ij} = \frac{\bar{d_i} + \bar{d_j}}{d_{ij}}
+
+Here :math:`\bar{d_i}` is the average distance between each point of
+cluster :math:`i` and the centroid of cluster :math:`i`, also called the
+diameter of cluster :math:`i` (and likewise :math:`\bar{d_j}` for cluster
+:math:`j`), and :math:`d_{ij}` is the Euclidean distance between the
+centroids of clusters :math:`i` and :math:`j`.
+
+  >>> from sklearn import metrics
+  >>> from sklearn import datasets
+  >>> dataset = datasets.load_iris()
+  >>> X = dataset.data
+
+In normal usage, the Davies-Bouldin index is applied to the results of a
+cluster analysis.
+
+  >>> from sklearn.cluster import KMeans
+  >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
+  >>> labels = kmeans_model.labels_
+  >>> metrics.davies_bouldin_index(X, labels)  # doctest: +ELLIPSIS
+  0.6623...
+
+
+Advantages
+~~~~~~~~~~
+
+- The computation of the Davies-Bouldin index is simpler than that of the
+  Silhouette index.
+
+Drawbacks
+~~~~~~~~~
+
+- The Davies-Bouldin index is generally higher for convex clusters than for
+  other concepts of clusters, such as the density-based clusters obtained
+  through DBSCAN.
+- The usage of centroid distance limits the distance metric to Euclidean
+  space.
+
+.. topic:: References
+
+ * Davies, David L.; Bouldin, Donald W. (1979).
+   "A Cluster Separation Measure".
+   IEEE Transactions on Pattern Analysis and Machine Intelligence.
+   PAMI-1 (2): 224-227.
+   `doi:10.1109/TPAMI.1979.4766909 <https://doi.org/10.1109/TPAMI.1979.4766909>`_.
\ No newline at end of file
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 413831939fbbc..7975c2571b508 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -43,6 +43,7 @@
 from .cluster import silhouette_samples
 from .cluster import silhouette_score
 from .cluster import calinski_harabaz_score
+from .cluster import davies_bouldin_index
 from .cluster import v_measure_score

 from .pairwise import euclidean_distances
@@ -73,6 +74,7 @@
     'confusion_matrix',
     'consensus_score',
     'coverage_error',
+    'davies_bouldin_index',
     'euclidean_distances',
     'explained_variance_score',
     'f1_score',
diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py
index 4cda1108ece32..3f8edd65a5758 100644
--- a/sklearn/metrics/cluster/__init__.py
+++ b/sklearn/metrics/cluster/__init__.py
@@ -20,6 +20,7 @@
 from .unsupervised import silhouette_samples
 from .unsupervised import silhouette_score
 from .unsupervised import calinski_harabaz_score
+from .unsupervised import davies_bouldin_index
 from .bicluster import consensus_score

 __all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
@@ -27,4 +28,5 @@
            "expected_mutual_information", "homogeneity_completeness_v_measure",
            "homogeneity_score", "mutual_info_score", "v_measure_score",
            "fowlkes_mallows_score", "entropy", "silhouette_samples",
-           "silhouette_score", "calinski_harabaz_score", "consensus_score"]
+           "silhouette_score", "calinski_harabaz_score",
+           "davies_bouldin_index", "consensus_score"]
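With these exports in place, the metric becomes importable as
``sklearn.metrics.davies_bouldin_index``. A minimal usage sketch, separate
from the patch itself, assuming only the iris/KMeans setup already shown in
the documentation above and the documented fact that lower values are better:

    from sklearn import datasets, metrics
    from sklearn.cluster import KMeans

    # Compare clusterings with different numbers of clusters on the iris
    # data; per the documentation above, a lower Davies-Bouldin index
    # indicates better separation between clusters.
    X = datasets.load_iris().data
    for k in range(2, 6):
        labels = KMeans(n_clusters=k, random_state=1).fit_predict(X)
        print(k, metrics.davies_bouldin_index(X, labels))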
diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py
index 55715d0c35cdd..881ef34457581 100644
--- a/sklearn/metrics/cluster/tests/test_unsupervised.py
+++ b/sklearn/metrics/cluster/tests/test_unsupervised.py
@@ -14,6 +14,7 @@
 from sklearn.metrics.cluster import silhouette_samples
 from sklearn.metrics import pairwise_distances
 from sklearn.metrics.cluster import calinski_harabaz_score
+from sklearn.metrics.cluster import davies_bouldin_index


 def test_silhouette():
@@ -146,3 +147,38 @@ def test_calinski_harabaz_score():
     labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
     assert_almost_equal(calinski_harabaz_score(X, labels),
                         45 * (40 - 4) / (5 * (4 - 1)))
+
+
+def test_davies_bouldin_index():
+    rng = np.random.RandomState(seed=0)
+
+    # Assert message when there is only one label
+    assert_raise_message(ValueError, "Number of labels is",
+                         davies_bouldin_index,
+                         rng.rand(10, 2), np.zeros(10))
+
+    # Assert message when all points are in different clusters
+    assert_raise_message(ValueError, "Number of labels is",
+                         davies_bouldin_index,
+                         rng.rand(10, 2), np.arange(10))
+
+    # Assert the value is 0. when all samples are equal
+    assert_equal(0., davies_bouldin_index(np.ones((10, 2)),
+                                          [0] * 5 + [1] * 5))
+
+    # Assert the value is 0. when all cluster means are equal
+    assert_equal(0., davies_bouldin_index([[-1, -1], [1, 1]] * 10,
+                                          [0] * 10 + [1] * 10))
+
+    # General case (with non numpy arrays)
+    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
+         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
+    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
+    assert_almost_equal(davies_bouldin_index(X, labels),
+                        2 * np.sqrt(0.5) / 3)
+
+    # General case - some clusters have a single sample
+    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
+    labels = [0, 0, 1, 2]
+    assert_almost_equal(davies_bouldin_index(X, labels),
+                        (5. / 4) / 3)
diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py
index 1aed8e72a654b..0eeb1acb7b54a 100644
--- a/sklearn/metrics/cluster/unsupervised.py
+++ b/sklearn/metrics/cluster/unsupervised.py
@@ -255,3 +255,55 @@ def calinski_harabaz_score(X, labels):
     return (1. if intra_disp == 0. else
             extra_disp * (n_samples - n_labels) /
             (intra_disp * (n_labels - 1.)))
+
+
+def davies_bouldin_index(X, labels):
+    """Compute the Davies-Bouldin index.
+
+    The index is defined as the average, over all clusters, of the
+    largest ratio of within-cluster distances to between-cluster
+    distances. A lower value indicates better separation between the
+    clusters.
+
+    Parameters
+    ----------
+    X : array-like, shape (``n_samples``, ``n_features``)
+        List of ``n_features``-dimensional data points. Each row corresponds
+        to a single data point.
+
+    labels : array-like, shape (``n_samples``,)
+        Predicted labels for each sample.
+
+    Returns
+    -------
+    score : float
+        The resulting Davies-Bouldin index.
+
+    References
+    ----------
+    .. [1] `Davies, David L.; Bouldin, Donald W. (1979).
+       "A Cluster Separation Measure". IEEE Transactions on
+       Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227`_
+    """
+    X, labels = check_X_y(X, labels)
+    le = LabelEncoder()
+    labels = le.fit_transform(labels)
+    n_samples, n_features = X.shape
+    n_labels = len(le.classes_)
+
+    check_number_of_labels(n_labels, n_samples)
+    intra_dists = np.zeros(n_labels)
+    centroids = np.zeros((n_labels, n_features), dtype=np.float64)
+    for k in range(n_labels):
+        cluster_k = X[labels == k]
+        centroid = cluster_k.mean(axis=0)
+        centroids[k] = centroid
+        # Average distance between each point of the cluster and its centroid
+        intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid]))
+    centroid_distances = pairwise_distances(centroids)
+    with np.errstate(divide='ignore', invalid='ignore'):
+        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
+                np.all(centroid_distances == 0.0):
+            return 0.0
+        scores = (intra_dists[:, None] + intra_dists) / centroid_distances
+        # The diagonal (a cluster compared with itself) yields inf or nan;
+        # mask it out before taking the per-cluster maximum
+        scores[scores == np.inf] = np.nan
+        return np.mean(np.nanmax(scores, axis=1))
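As a cross-check on the test expectations, the general-case value
``2 * np.sqrt(0.5) / 3`` can be reproduced by applying the documented
definitions of :math:`\bar{d_i}`, :math:`d_{ij}` and :math:`D_{ij}`
directly. A standalone NumPy sketch, separate from the patch:

    import numpy as np

    # Four clusters of 10 points each, as in the "general case" test above
    X = np.array([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
                 [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5, dtype=float)
    labels = np.repeat([0, 1, 2, 3], 10)

    # d_bar[i]: average distance between the points of cluster i and its
    # centroid (the cluster diameter from the documentation)
    centroids = np.array([X[labels == k].mean(axis=0) for k in range(4)])
    d_bar = np.array([np.linalg.norm(X[labels == k] - centroids[k],
                                     axis=1).mean() for k in range(4)])

    # d[i, j]: Euclidean distance between the centroids of clusters i and j;
    # the diagonal is set to inf so a cluster is never compared with itself
    d = np.linalg.norm(centroids[:, None] - centroids[None], axis=-1)
    np.fill_diagonal(d, np.inf)

    D = (d_bar[:, None] + d_bar) / d      # D_ij from the documentation
    db = D.max(axis=1).mean()             # mean over i of max_j D_ij

    assert np.isclose(db, 2 * np.sqrt(0.5) / 3)  # matches the test value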