From a44d480f60f3c2873bcf58725066cbeed6ee4a06 Mon Sep 17 00:00:00 2001 From: Greg Stupp Date: Wed, 17 Aug 2016 18:24:05 -0700 Subject: [PATCH 1/4] use sparse contingency matrix for supervised cluster metrics Remove max_n_classes option --- sklearn/metrics/cluster/supervised.py | 230 +++++++++--------- .../metrics/cluster/tests/test_supervised.py | 66 ++--- 2 files changed, 152 insertions(+), 144 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 131c14b5078ca..6960707ac4a03 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -9,12 +9,14 @@ # Diego Molla # Arnaud Fouchet # Thierry Guillemot +# Gregory Stupp # License: BSD 3 clause from math import log from scipy.misc import comb -from scipy.sparse import coo_matrix +from scipy.sparse import coo_matrix, find +from scipy.sparse.data import _data_matrix import numpy as np from .expected_mutual_info_fast import expected_mutual_information @@ -46,7 +48,7 @@ def check_clusterings(labels_true, labels_pred): return labels_true, labels_pred -def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000): +def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): """Build a contingency matrix describing the relationship between labels. Parameters @@ -62,38 +64,36 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000): matrix. This helps to stop NaN propagation. If ``None``, nothing is adjusted. - max_n_classes : int, optional (default=5000) - Maximal number of classeses handled for contingency_matrix. - This help to avoid Memory error with regression target - for mutual_information. + sparse: boolean, optional. + If True, return a sparse continency matrix. If ``eps is not None``, + and ``sparse is True``, will throw ValueError. 
Returns ------- - contingency: array, shape=[n_classes_true, n_classes_pred] + contingency: {array-like, sparse matrix}, shape=[n_classes_true, n_classes_pred] Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in true class :math:`i` and in predicted class :math:`j`. If ``eps is None``, the dtype of this array will be integer. If ``eps`` is given, the dtype will be float. """ + + if eps is not None and sparse: + raise ValueError("Cannot set 'eps' and return a sparse matrix") + classes, class_idx = np.unique(labels_true, return_inverse=True) clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) n_classes = classes.shape[0] n_clusters = clusters.shape[0] - if n_classes > max_n_classes: - raise ValueError("Too many classes for a clustering metric. If you " - "want to increase the limit, pass parameter " - "max_n_classes to the scoring function") - if n_clusters > max_n_classes: - raise ValueError("Too many clusters for a clustering metric. If you " - "want to increase the limit, pass parameter " - "max_n_classes to the scoring function") + # Using coo_matrix to accelerate simple histogram calculation, # i.e. bins are consecutive integers # Currently, coo_matrix is faster than histogram2d for simple cases contingency = coo_matrix((np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), shape=(n_classes, n_clusters), - dtype=np.int).toarray() + dtype=np.int) + if not sparse: + contingency = contingency.toarray() if eps is not None: # don't use += as contingency is integer contingency = contingency + eps @@ -102,7 +102,7 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000): # clustering measures -def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): +def adjusted_rand_score(labels_true, labels_pred, contingency=None): """Rand index adjusted for chance. 
The Rand Index computes a similarity measure between two clusterings @@ -134,10 +134,10 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] Cluster labels to evaluate - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. Setting it too high can lead to MemoryError or OS - freeze + contingency: {None, sparse matrix}, shape = [n_classes_true, n_classes_pred] + A contingency matrix given by the :func:`contingency_matrix` function. + If value is ``None``, it will be computed, otherwise the given value is + used, with ``labels_true`` and ``labels_pred`` ignored. Returns ------- @@ -188,33 +188,49 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): adjusted_mutual_info_score: Adjusted Mutual Information """ - labels_true, labels_pred = check_clusterings(labels_true, labels_pred) - n_samples = labels_true.shape[0] - classes = np.unique(labels_true) - clusters = np.unique(labels_pred) + if contingency is None: + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + n_samples = labels_true.shape[0] + n_classes = np.unique(labels_true).shape[0] + n_clusters = np.unique(labels_pred).shape[0] + elif isinstance(contingency, _data_matrix): # scipy.sparse.data._data_matrix + n_samples = contingency.nnz + n_classes, n_clusters = contingency.shape + else: + raise ValueError("'contingency' must be a sparse matrix or None") + # Special limit cases: no clustering since the data is not split; # or trivial clustering where each document is assigned a unique cluster. # These are perfect matches hence return 1.0. 
- if (classes.shape[0] == clusters.shape[0] == 1 or - classes.shape[0] == clusters.shape[0] == 0 or - classes.shape[0] == clusters.shape[0] == len(labels_true)): + if (n_classes == n_clusters == 1 or + n_classes == n_clusters == 0 or + n_classes == n_clusters == n_samples): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred, - max_n_classes=max_n_classes) + # Compute contingency matrix if we weren't given it + if contingency is None: + contingency = contingency_matrix(labels_true, labels_pred) # Compute the ARI using the contingency data - sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1)) - sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0)) + if isinstance(contingency, np.ndarray): + # For an array + sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1)) + sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0)) + sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten()) + elif isinstance(contingency, _data_matrix): + # For a sparse matrix + sum_comb_c = sum(comb2(n_c) for n_c in np.array(contingency.sum(axis=1))) + sum_comb_k = sum(comb2(n_k) for n_k in np.array(contingency.sum(axis=0)).T) + sum_comb = sum(comb2(n_ij) for n_ij in find(contingency)[2]) + else: + raise ValueError("Unsupported type for 'contingency': " + str(type(contingency))) - sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten()) prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2)) mean_comb = (sum_comb_k + sum_comb_c) / 2. - return ((sum_comb - prod_comb) / (mean_comb - prod_comb)) + return float((sum_comb - prod_comb) / (mean_comb - prod_comb)) -def homogeneity_completeness_v_measure(labels_true, labels_pred, - max_n_classes=5000): +def homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=False): """Compute the homogeneity and completeness and V-Measure scores at once. 
Those metrics are based on normalized conditional entropy measures of @@ -248,10 +264,9 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. Setting it too high can lead to MemoryError or OS - freeze + sparse: boolean, optional. + If True, intermediate calculation of the contingency matrix + will calculate a sparse continency matrix. Returns ------- @@ -278,8 +293,11 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, entropy_C = entropy(labels_true) entropy_K = entropy(labels_pred) - MI = mutual_info_score(labels_true, labels_pred, - max_n_classes=max_n_classes) + if sparse: + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + MI = mutual_info_score(None, None, contingency=contingency) + else: + MI = mutual_info_score(labels_true, labels_pred) homogeneity = MI / (entropy_C) if entropy_C else 1.0 completeness = MI / (entropy_K) if entropy_K else 1.0 @@ -293,7 +311,7 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, return homogeneity, completeness, v_measure_score -def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): +def homogeneity_score(labels_true, labels_pred, sparse=False): """Homogeneity metric of a cluster labeling given a ground truth. A clustering result satisfies homogeneity if all of its clusters @@ -317,10 +335,9 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. Setting it too high can lead to MemoryError or OS - freeze + sparse: boolean, optional. + If True, intermediate calculation of the contingency matrix + will calculate a sparse continency matrix. 
Returns ------- @@ -369,11 +386,10 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, - max_n_classes)[0] + return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse)[0] -def completeness_score(labels_true, labels_pred, max_n_classes=5000): +def completeness_score(labels_true, labels_pred, sparse=False): """Completeness metric of a cluster labeling given a ground truth. A clustering result satisfies completeness if all the data points @@ -397,10 +413,9 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. Setting it too high can lead to MemoryError or OS - freeze + sparse: boolean, optional. + If True, intermediate calculation of the contingency matrix + will calculate a sparse continency matrix. Returns ------- @@ -445,11 +460,10 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000): 0.0 """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, - max_n_classes)[1] + return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse)[1] -def v_measure_score(labels_true, labels_pred, max_n_classes=5000): +def v_measure_score(labels_true, labels_pred, sparse=False): """V-measure cluster labeling given a ground truth. This score is identical to :func:`normalized_mutual_info_score`. @@ -477,10 +491,9 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. Setting it too high can lead to MemoryError or OS - freeze + sparse: boolean, optional. 
+ If True, intermediate calculation of the contingency matrix + will calculate a sparse continency matrix. Returns ------- @@ -546,12 +559,10 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, - max_n_classes)[2] + return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None, - max_n_classes=5000): +def mutual_info_score(labels_true, labels_pred, contingency=None): """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of @@ -586,16 +597,11 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. - contingency: None or array, shape = [n_classes_true, n_classes_pred] + contingency: {None, array, sparse matrix}, shape = [n_classes_true, n_classes_pred] A contingency matrix given by the :func:`contingency_matrix` function. If value is ``None``, it will be computed, otherwise the given value is used, with ``labels_true`` and ``labels_pred`` ignored. - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the mutual_info_score - metric. 
Setting it too high can lead to MemoryError or OS - freeze - Returns ------- mi: float @@ -608,27 +614,43 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, """ if contingency is None: labels_true, labels_pred = check_clusterings(labels_true, labels_pred) - contingency = contingency_matrix(labels_true, labels_pred, - max_n_classes=max_n_classes) - contingency = np.array(contingency, dtype='float') - contingency_sum = np.sum(contingency) - pi = np.sum(contingency, axis=1) - pj = np.sum(contingency, axis=0) - outer = np.outer(pi, pj) - nnz = contingency != 0.0 - # normalized contingency - contingency_nm = contingency[nnz] - log_contingency_nm = np.log(contingency_nm) - contingency_nm /= contingency_sum - # log(a / b) should be calculated as log(a) - log(b) for - # possible loss of precision - log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum()) - mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) + - contingency_nm * log_outer) - return mi.sum() + contingency = contingency_matrix(labels_true, labels_pred) + if isinstance(contingency, np.ndarray): + # For an array + contingency = np.array(contingency, dtype='float') + contingency_sum = np.sum(contingency) + pi = np.sum(contingency, axis=1) + pj = np.sum(contingency, axis=0) + outer = np.outer(pi, pj) + nnz = contingency != 0.0 + # normalized contingency + contingency_nm = contingency[nnz] + log_contingency_nm = np.log(contingency_nm) + contingency_nm /= contingency_sum + # log(a / b) should be calculated as log(a) - log(b) for + # possible loss of precision + log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum()) + mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) + + contingency_nm * log_outer) + return mi.sum() + elif isinstance(contingency, _data_matrix): + # For a sparse matrix + contingency_sum = contingency.sum() + pi = np.array(contingency.sum(axis=1)) + pj = np.array(contingency.sum(axis=0)).T + nnzx, nnzy, nnz_val = find(contingency) + 
log_contingency_nm = np.log(nnz_val) + contingency_nm = nnz_val * 1.0 / contingency_sum # python2 integer division... + # Don't need to calculate the full outer product. Just for the non-zero values + outer = np.array([pi[x] * pj[y] for x, y in zip(nnzx, nnzy)]).T + log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) + mi = contingency_nm * (log_contingency_nm - log(contingency_sum)) + contingency_nm * log_outer + return mi.sum() + else: + raise ValueError("Unsupported type for 'contingency': " + str(type(contingency))) -def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): +def adjusted_mutual_info_score(labels_true, labels_pred): """Adjusted Mutual Information between two clusterings. Adjusted Mutual Information (AMI) is an adjustment of the Mutual @@ -661,11 +683,6 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. 
Setting it too high can lead to MemoryError or OS - freeze - Returns ------- ami: float(upperlimited by 1.0) @@ -716,8 +733,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): if (classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred, - max_n_classes=max_n_classes) + contingency = contingency_matrix(labels_true, labels_pred) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -730,7 +746,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): return ami -def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): +def normalized_mutual_info_score(labels_true, labels_pred): """Normalized Mutual Information between two clusterings. Normalized Mutual Information (NMI) is an normalization of the Mutual @@ -760,11 +776,6 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. - max_n_classes: int, optional (default=5000) - Maximal number of classes handled by the adjusted_rand_score - metric. 
Setting it too high can lead to MemoryError or OS - freeze - Returns ------- nmi: float @@ -803,8 +814,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): if (classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred, - max_n_classes=max_n_classes) + contingency = contingency_matrix(labels_true, labels_pred) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -816,7 +826,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): return nmi -def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000): +def fowlkes_mallows_score(labels_true, labels_pred): """Measure the similarity of two clusterings of a set of points. The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of @@ -845,11 +855,6 @@ def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = (``n_samples``, ) A clustering of the data into disjoint subsets. - max_n_classes : int, optional (default=5000) - Maximal number of classes handled by the Fowlkes-Mallows - metric. 
Setting it too high can lead to MemoryError or OS - freeze - Returns ------- score : float @@ -886,8 +891,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000): labels_true, labels_pred = check_clusterings(labels_true, labels_pred,) n_samples, = labels_true.shape - c = contingency_matrix(labels_true, labels_pred, - max_n_classes=max_n_classes) + c = contingency_matrix(labels_true, labels_pred) tk = np.dot(c.ravel(), c.ravel()) - n_samples pk = np.sum(np.sum(c, axis=0) ** 2) - n_samples qk = np.sum(np.sum(c, axis=1) ** 2) - n_samples diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 828c2c544574c..f345ee5615b0b 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -55,6 +55,22 @@ def test_perfect_matches(): assert_equal(score_func([0., 1., 2.], [42., 7., 2.]), 1.0) assert_equal(score_func([0, 1, 2], [42, 7, 2]), 1.0) +def test_homogeneity_completeness_v_measure_sparse(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + h, c, v = homogeneity_completeness_v_measure(labels_a, labels_b) + h_s, c_s, v_s = homogeneity_completeness_v_measure(labels_a, labels_b, sparse = True) + assert_array_almost_equal([h, c, v],[h_s, c_s, v_s]) + +""" Takes too long... 
+def test_homogeneity_completeness_v_measure_large(): + # This will fail without sparse matrices with any reasonable amount of RAM (<~1TB) + from random import randrange + labels_a = [randrange(100000) for x in range(1000000)] + labels_b = [randrange(100000) for x in range(1000000)] + h_s, c_s, v_s = homogeneity_completeness_v_measure(labels_a, labels_b, sparse = True) + assert_raises(MemoryError, homogeneity_completeness_v_measure, labels_a, labels_b) +""" def test_homogeneous_but_not_complete_labeling(): # homogeneous but not complete clustering @@ -183,19 +199,30 @@ def test_contingency_matrix(): assert_array_almost_equal(C, C2 + .1) +def test_contingency_matrix_sparse(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + C = contingency_matrix(labels_a, labels_b) + C_sparse = contingency_matrix(labels_a, labels_b, sparse = True).toarray() + assert_array_almost_equal(C, C_sparse) + + +def test_adjusted_rand_score_sparse(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + C_sparse = contingency_matrix(labels_a, labels_b, sparse = True) + assert_almost_equal(adjusted_rand_score(labels_a,labels_b), adjusted_rand_score(None, None, C_sparse)) + + def test_exactly_zero_info_score(): # Check numerical stability when information is exactly zero for i in np.logspace(1, 4, 4).astype(np.int): labels_a, labels_b = np.ones(i, dtype=np.int),\ np.arange(i, dtype=np.int) - assert_equal(normalized_mutual_info_score(labels_a, labels_b, - max_n_classes=1e4), 0.0) - assert_equal(v_measure_score(labels_a, labels_b, - max_n_classes=1e4), 0.0) - assert_equal(adjusted_mutual_info_score(labels_a, labels_b, - max_n_classes=1e4), 0.0) - assert_equal(normalized_mutual_info_score(labels_a, labels_b, - max_n_classes=1e4), 0.0) + assert_equal(normalized_mutual_info_score(labels_a, labels_b), 
0.0) + assert_equal(v_measure_score(labels_a, labels_b), 0.0) + assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0) + assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0) def test_v_measure_and_mutual_information(seed=36): @@ -209,29 +236,6 @@ def test_v_measure_and_mutual_information(seed=36): (entropy(labels_a) + entropy(labels_b)), 0) -def test_max_n_classes(): - rng = np.random.RandomState(seed=0) - labels_true = rng.rand(53) - labels_pred = rng.rand(53) - labels_zero = np.zeros(53) - labels_true[:2] = 0 - labels_zero[:3] = 1 - labels_pred[:2] = 0 - for score_func in score_funcs: - expected = ("Too many classes for a clustering metric. If you " - "want to increase the limit, pass parameter " - "max_n_classes to the scoring function") - assert_raise_message(ValueError, expected, score_func, - labels_true, labels_pred, - max_n_classes=50) - expected = ("Too many clusters for a clustering metric. If you " - "want to increase the limit, pass parameter " - "max_n_classes to the scoring function") - assert_raise_message(ValueError, expected, score_func, - labels_zero, labels_pred, - max_n_classes=50) - - def test_fowlkes_mallows_score(): # General case score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], From cc50125b095ae8b374508fda0e3e4054cfe44531 Mon Sep 17 00:00:00 2001 From: Greg Stupp Date: Wed, 17 Aug 2016 19:11:14 -0700 Subject: [PATCH 2/4] merge sparse and max_n_classes functionality --- sklearn/metrics/cluster/supervised.py | 117 +++++++++++++----- .../metrics/cluster/tests/test_supervised.py | 78 ++++++++---- 2 files changed, 139 insertions(+), 56 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 6960707ac4a03..d8a3214ec3158 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -14,10 +14,10 @@ from math import log +import numpy as np from scipy.misc import comb from scipy.sparse import coo_matrix, find from scipy.sparse.data 
import _data_matrix -import numpy as np from .expected_mutual_info_fast import expected_mutual_information from ...utils.fixes import bincount @@ -48,7 +48,7 @@ def check_clusterings(labels_true, labels_pred): return labels_true, labels_pred -def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): +def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, sparse=False): """Build a contingency matrix describing the relationship between labels. Parameters @@ -59,11 +59,16 @@ def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): labels_pred : array, shape = [n_samples] Cluster labels to evaluate - eps: None or float + eps: None or float, optional. If a float, that value is added to all values in the contingency matrix. This helps to stop NaN propagation. If ``None``, nothing is adjusted. + max_n_classes : int, optional (default=5000) + Maximal number of classeses handled for contingency_matrix. + This help to avoid Memory error with regression target + for mutual_information. + sparse: boolean, optional. If True, return a sparse continency matrix. If ``eps is not None``, and ``sparse is True``, will throw ValueError. @@ -84,12 +89,18 @@ def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) n_classes = classes.shape[0] n_clusters = clusters.shape[0] - + if not sparse and (n_classes > max_n_classes): + raise ValueError("Too many classes for a clustering metric. If you " + "want to increase the limit, pass parameter " + "max_n_classes to the scoring function") + if not sparse and (n_clusters > max_n_classes): + raise ValueError("Too many clusters for a clustering metric. If you " + "want to increase the limit, pass parameter " + "max_n_classes to the scoring function") # Using coo_matrix to accelerate simple histogram calculation, # i.e. 
bins are consecutive integers # Currently, coo_matrix is faster than histogram2d for simple cases - contingency = coo_matrix((np.ones(class_idx.shape[0]), - (class_idx, cluster_idx)), + contingency = coo_matrix((np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), shape=(n_classes, n_clusters), dtype=np.int) if not sparse: @@ -102,7 +113,7 @@ def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): # clustering measures -def adjusted_rand_score(labels_true, labels_pred, contingency=None): +def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000, contingency=None): """Rand index adjusted for chance. The Rand Index computes a similarity measure between two clusterings @@ -134,6 +145,11 @@ def adjusted_rand_score(labels_true, labels_pred, contingency=None): labels_pred : array, shape = [n_samples] Cluster labels to evaluate + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score + metric. Setting it too high can lead to MemoryError or OS + freeze + contingency: {None, sparse matrix}, shape = [n_classes_true, n_classes_pred] A contingency matrix given by the :func:`contingency_matrix` function. If value is ``None``, it will be computed, otherwise the given value is @@ -203,13 +219,13 @@ def adjusted_rand_score(labels_true, labels_pred, contingency=None): # or trivial clustering where each document is assigned a unique cluster. # These are perfect matches hence return 1.0. 
if (n_classes == n_clusters == 1 or - n_classes == n_clusters == 0 or - n_classes == n_clusters == n_samples): + n_classes == n_clusters == 0 or + n_classes == n_clusters == n_samples): return 1.0 # Compute contingency matrix if we weren't given it if contingency is None: - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) # Compute the ARI using the contingency data if isinstance(contingency, np.ndarray): @@ -230,7 +246,7 @@ def adjusted_rand_score(labels_true, labels_pred, contingency=None): return float((sum_comb - prod_comb) / (mean_comb - prod_comb)) -def homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=False): +def homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=5000, sparse=False): """Compute the homogeneity and completeness and V-Measure scores at once. Those metrics are based on normalized conditional entropy measures of @@ -264,6 +280,11 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=False): labels_pred : array, shape = [n_samples] cluster labels to evaluate + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score + metric. Setting it too high can lead to MemoryError or OS + freeze + sparse: boolean, optional. If True, intermediate calculation of the contingency matrix will calculate a sparse continency matrix. 
@@ -297,7 +318,7 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=False): contingency = contingency_matrix(labels_true, labels_pred, sparse=True) MI = mutual_info_score(None, None, contingency=contingency) else: - MI = mutual_info_score(labels_true, labels_pred) + MI = mutual_info_score(labels_true, labels_pred, max_n_classes=max_n_classes) homogeneity = MI / (entropy_C) if entropy_C else 1.0 completeness = MI / (entropy_K) if entropy_K else 1.0 @@ -311,7 +332,7 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=False): return homogeneity, completeness, v_measure_score -def homogeneity_score(labels_true, labels_pred, sparse=False): +def homogeneity_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): """Homogeneity metric of a cluster labeling given a ground truth. A clustering result satisfies homogeneity if all of its clusters @@ -339,6 +360,11 @@ def homogeneity_score(labels_true, labels_pred, sparse=False): If True, intermediate calculation of the contingency matrix will calculate a sparse continency matrix. + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by this + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- homogeneity: float @@ -386,10 +412,11 @@ def homogeneity_score(labels_true, labels_pred, sparse=False): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse)[0] + return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse, + max_n_classes=max_n_classes)[0] -def completeness_score(labels_true, labels_pred, sparse=False): +def completeness_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): """Completeness metric of a cluster labeling given a ground truth.
A clustering result satisfies completeness if all the data points @@ -417,6 +444,11 @@ def completeness_score(labels_true, labels_pred, sparse=False): If True, intermediate calculation of the contingency matrix will calculate a sparse continency matrix. + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by this + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- completeness: float @@ -460,10 +492,11 @@ def completeness_score(labels_true, labels_pred, sparse=False): 0.0 """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse)[1] + return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse, + max_n_classes=max_n_classes)[1] -def v_measure_score(labels_true, labels_pred, sparse=False): +def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): """V-measure cluster labeling given a ground truth. This score is identical to :func:`normalized_mutual_info_score`. @@ -495,6 +528,11 @@ def v_measure_score(labels_true, labels_pred, sparse=False): If True, intermediate calculation of the contingency matrix will calculate a sparse continency matrix. + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by this + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- v_measure: float @@ -559,10 +597,11 @@ def v_measure_score(labels_true, labels_pred, sparse=False): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse)[2] + return homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=max_n_classes, + sparse=sparse)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None): +def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes=5000): """Mutual Information between two clusterings.
The Mutual Information is a measure of the similarity between two labels of @@ -602,6 +641,11 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): If value is ``None``, it will be computed, otherwise the given value is used, with ``labels_true`` and ``labels_pred`` ignored. + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the mutual_info_score + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- mi: float @@ -614,7 +658,7 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): """ if contingency is None: labels_true, labels_pred = check_clusterings(labels_true, labels_pred) - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) if isinstance(contingency, np.ndarray): # For an array contingency = np.array(contingency, dtype='float') @@ -650,7 +694,7 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): raise ValueError("Unsupported type for 'contingency': " + str(type(contingency))) -def adjusted_mutual_info_score(labels_true, labels_pred): +def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): """Adjusted Mutual Information between two clusterings. Adjusted Mutual Information (AMI) is an adjustment of the Mutual @@ -683,6 +727,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by this + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- ami: float(upperlimited by 1.0) @@ -731,9 +780,9 @@ def adjusted_mutual_info_score(labels_true, labels_pred): # Special limit cases: no clustering since the data is not split. # This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or - classes.shape[0] == clusters.shape[0] == 0): + classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -746,7 +795,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred): return ami -def normalized_mutual_info_score(labels_true, labels_pred): +def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): """Normalized Mutual Information between two clusterings. Normalized Mutual Information (NMI) is an normalization of the Mutual @@ -776,6 +825,11 @@ def normalized_mutual_info_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by this + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- nmi: float @@ -812,9 +866,9 @@ def normalized_mutual_info_score(labels_true, labels_pred): # Special limit cases: no clustering since the data is not split. # This is a perfect match hence return 1.0.
if (classes.shape[0] == clusters.shape[0] == 1 or - classes.shape[0] == clusters.shape[0] == 0): + classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -826,7 +880,7 @@ def normalized_mutual_info_score(labels_true, labels_pred): return nmi -def fowlkes_mallows_score(labels_true, labels_pred): +def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000): """Measure the similarity of two clusterings of a set of points. The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of @@ -855,6 +909,11 @@ def fowlkes_mallows_score(labels_true, labels_pred): labels_pred : array, shape = (``n_samples``, ) A clustering of the data into disjoint subsets. + max_n_classes : int, optional (default=5000) + Maximal number of classes handled by the Fowlkes-Mallows + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- score : float @@ -888,10 +947,10 @@ def fowlkes_mallows_score(labels_true, labels_pred): .. 
[2] `Wikipedia entry for the Fowlkes-Mallows Index `_ """ - labels_true, labels_pred = check_clusterings(labels_true, labels_pred,) + labels_true, labels_pred = check_clusterings(labels_true, labels_pred, ) n_samples, = labels_true.shape - c = contingency_matrix(labels_true, labels_pred) + c = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) tk = np.dot(c.ravel(), c.ravel()) - n_samples pk = np.sum(np.sum(c, axis=0) ** 2) - n_samples qk = np.sum(np.sum(c, axis=1) ** 2) - n_samples diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index f345ee5615b0b..367b9ce020de6 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -1,23 +1,21 @@ import numpy as np +from nose.tools import assert_almost_equal +from nose.tools import assert_equal +from numpy.testing import assert_array_almost_equal +from sklearn.metrics.cluster import adjusted_mutual_info_score from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.metrics.cluster import homogeneity_score from sklearn.metrics.cluster import completeness_score -from sklearn.metrics.cluster import v_measure_score -from sklearn.metrics.cluster import homogeneity_completeness_v_measure -from sklearn.metrics.cluster import adjusted_mutual_info_score -from sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.metrics.cluster import mutual_info_score -from sklearn.metrics.cluster import expected_mutual_information from sklearn.metrics.cluster import contingency_matrix -from sklearn.metrics.cluster import fowlkes_mallows_score from sklearn.metrics.cluster import entropy - +from sklearn.metrics.cluster import expected_mutual_information +from sklearn.metrics.cluster import fowlkes_mallows_score +from sklearn.metrics.cluster import homogeneity_completeness_v_measure +from sklearn.metrics.cluster import homogeneity_score +from sklearn.metrics.cluster import 
mutual_info_score +from sklearn.metrics.cluster import normalized_mutual_info_score +from sklearn.metrics.cluster import v_measure_score from sklearn.utils.testing import assert_raise_message -from nose.tools import assert_almost_equal -from nose.tools import assert_equal -from numpy.testing import assert_array_almost_equal - score_funcs = [ adjusted_rand_score, @@ -55,12 +53,14 @@ def test_perfect_matches(): assert_equal(score_func([0., 1., 2.], [42., 7., 2.]), 1.0) assert_equal(score_func([0, 1, 2], [42, 7, 2]), 1.0) + def test_homogeneity_completeness_v_measure_sparse(): labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) h, c, v = homogeneity_completeness_v_measure(labels_a, labels_b) - h_s, c_s, v_s = homogeneity_completeness_v_measure(labels_a, labels_b, sparse = True) - assert_array_almost_equal([h, c, v],[h_s, c_s, v_s]) + h_s, c_s, v_s = homogeneity_completeness_v_measure(labels_a, labels_b, sparse=True) + assert_array_almost_equal([h, c, v], [h_s, c_s, v_s]) + """ Takes too long... 
def test_homogeneity_completeness_v_measure_large(): @@ -72,6 +72,7 @@ def test_homogeneity_completeness_v_measure_large(): assert_raises(MemoryError, homogeneity_completeness_v_measure, labels_a, labels_b) """ + def test_homogeneous_but_not_complete_labeling(): # homogeneous but not complete clustering h, c, v = homogeneity_completeness_v_measure( @@ -203,39 +204,62 @@ def test_contingency_matrix_sparse(): labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) C = contingency_matrix(labels_a, labels_b) - C_sparse = contingency_matrix(labels_a, labels_b, sparse = True).toarray() + C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray() assert_array_almost_equal(C, C_sparse) - + def test_adjusted_rand_score_sparse(): labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) - C_sparse = contingency_matrix(labels_a, labels_b, sparse = True) - assert_almost_equal(adjusted_rand_score(labels_a,labels_b), adjusted_rand_score(None, None, C_sparse)) + C_sparse = contingency_matrix(labels_a, labels_b, sparse=True) + assert_almost_equal(adjusted_rand_score(labels_a, labels_b), adjusted_rand_score(None, None, contingency=C_sparse)) def test_exactly_zero_info_score(): # Check numerical stability when information is exactly zero for i in np.logspace(1, 4, 4).astype(np.int): - labels_a, labels_b = np.ones(i, dtype=np.int),\ - np.arange(i, dtype=np.int) - assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0) - assert_equal(v_measure_score(labels_a, labels_b), 0.0) - assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0) - assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0) + labels_a, labels_b = np.ones(i, dtype=np.int), \ + np.arange(i, dtype=np.int) + assert_equal(normalized_mutual_info_score(labels_a, labels_b, max_n_classes=1e4), 0.0) + 
assert_equal(v_measure_score(labels_a, labels_b, max_n_classes=1e4), 0.0) + assert_equal(adjusted_mutual_info_score(labels_a, labels_b, max_n_classes=1e4), 0.0) + assert_equal(normalized_mutual_info_score(labels_a, labels_b, max_n_classes=1e4), 0.0) def test_v_measure_and_mutual_information(seed=36): # Check relation between v_measure, entropy and mutual information for i in np.logspace(1, 4, 4).astype(np.int): random_state = np.random.RandomState(seed) - labels_a, labels_b = random_state.randint(0, 10, i),\ - random_state.randint(0, 10, i) + labels_a, labels_b = random_state.randint(0, 10, i), \ + random_state.randint(0, 10, i) assert_almost_equal(v_measure_score(labels_a, labels_b), 2.0 * mutual_info_score(labels_a, labels_b) / (entropy(labels_a) + entropy(labels_b)), 0) +def test_max_n_classes(): + rng = np.random.RandomState(seed=0) + labels_true = rng.rand(53) + labels_pred = rng.rand(53) + labels_zero = np.zeros(53) + labels_true[:2] = 0 + labels_zero[:3] = 1 + labels_pred[:2] = 0 + for score_func in score_funcs: + expected = ("Too many classes for a clustering metric. If you " + "want to increase the limit, pass parameter " + "max_n_classes to the scoring function") + assert_raise_message(ValueError, expected, score_func, + labels_true, labels_pred, + max_n_classes=50) + expected = ("Too many clusters for a clustering metric. 
If you " + "want to increase the limit, pass parameter " + "max_n_classes to the scoring function") + assert_raise_message(ValueError, expected, score_func, + labels_zero, labels_pred, + max_n_classes=50) + + def test_fowlkes_mallows_score(): # General case score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], From 8ec4d112f4ed03ced7c59a9498d19b7a52ee53b4 Mon Sep 17 00:00:00 2001 From: Greg Stupp Date: Wed, 17 Aug 2016 19:13:56 -0700 Subject: [PATCH 3/4] clarify docs --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index d8a3214ec3158..34c729647870b 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -67,7 +67,7 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, s max_n_classes : int, optional (default=5000) Maximal number of classeses handled for contingency_matrix. This help to avoid Memory error with regression target - for mutual_information. + for mutual_information. If `sparse`, `max_n_classes` is ignored. sparse: boolean, optional. If True, return a sparse continency matrix. 
If ``eps is not None``, From 630bfa10e1227037a99939b5730bc3cb96bc9f1f Mon Sep 17 00:00:00 2001 From: Greg Stupp Date: Thu, 18 Aug 2016 13:39:01 -0700 Subject: [PATCH 4/4] pep8 --- sklearn/metrics/cluster/supervised.py | 83 +++++++++++++++++---------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 34c729647870b..0f5d14207262c 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -48,7 +48,8 @@ def check_clusterings(labels_true, labels_pred): return labels_true, labels_pred -def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, sparse=False): +def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, + sparse=False): """Build a contingency matrix describing the relationship between labels. Parameters @@ -67,7 +68,8 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, s max_n_classes : int, optional (default=5000) Maximal number of classeses handled for contingency_matrix. This help to avoid Memory error with regression target - for mutual_information. If `sparse`, `max_n_classes` is ignored. + for mutual_information. If ``sparse is True``, + `max_n_classes` is ignored. sparse: boolean, optional. If True, return a sparse continency matrix. If ``eps is not None``, @@ -75,11 +77,11 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, s Returns ------- - contingency: {array-like, sparse matrix}, shape=[n_classes_true, n_classes_pred] + contingency: {array-like, sparse}, shape=[n_classes_true, n_classes_pred] Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in true class :math:`i` and in predicted class :math:`j`. If ``eps is None``, the dtype of this array will be integer. If ``eps`` is - given, the dtype will be float. + given, the dtype will be float. 
Will be sparse if ``sparse is True`` """ if eps is not None and sparse: @@ -100,7 +102,8 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, s # Using coo_matrix to accelerate simple histogram calculation, # i.e. bins are consecutive integers # Currently, coo_matrix is faster than histogram2d for simple cases - contingency = coo_matrix((np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), + contingency = coo_matrix((np.ones(class_idx.shape[0]), + (class_idx, cluster_idx)), shape=(n_classes, n_clusters), dtype=np.int) if not sparse: @@ -113,7 +116,8 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000, s # clustering measures -def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000, contingency=None): +def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000, + contingency=None): """Rand index adjusted for chance. The Rand Index computes a similarity measure between two clusterings @@ -209,7 +213,7 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000, contingenc n_samples = labels_true.shape[0] n_classes = np.unique(labels_true).shape[0] n_clusters = np.unique(labels_pred).shape[0] - elif isinstance(contingency, _data_matrix): # scipy.sparse.data._data_matrix + elif isinstance(contingency, _data_matrix): n_samples = contingency.nnz n_classes, n_clusters = contingency.shape else: @@ -225,7 +229,8 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000, contingenc # Compute contingency matrix if we weren't given it if contingency is None: - contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) # Compute the ARI using the contingency data if isinstance(contingency, np.ndarray): @@ -235,18 +240,22 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000, contingenc sum_comb = sum(comb2(n_ij) for n_ij in 
contingency.flatten()) elif isinstance(contingency, _data_matrix): # For a sparse matrix - sum_comb_c = sum(comb2(n_c) for n_c in np.array(contingency.sum(axis=1))) - sum_comb_k = sum(comb2(n_k) for n_k in np.array(contingency.sum(axis=0)).T) + sum_comb_c = sum( + comb2(n_c) for n_c in np.array(contingency.sum(axis=1))) + sum_comb_k = sum( + comb2(n_k) for n_k in np.array(contingency.sum(axis=0)).T) sum_comb = sum(comb2(n_ij) for n_ij in find(contingency)[2]) else: - raise ValueError("Unsupported type for 'contingency': " + str(type(contingency))) + raise ValueError( + "Unsupported type for 'contingency': " + str(type(contingency))) prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2)) mean_comb = (sum_comb_k + sum_comb_c) / 2. return float((sum_comb - prod_comb) / (mean_comb - prod_comb)) -def homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=5000, sparse=False): +def homogeneity_completeness_v_measure(labels_true, labels_pred, + max_n_classes=5000, sparse=False): """Compute the homogeneity and completeness and V-Measure scores at once. 
Those metrics are based on normalized conditional entropy measures of @@ -318,7 +327,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=5 contingency = contingency_matrix(labels_true, labels_pred, sparse=True) MI = mutual_info_score(None, None, contingency=contingency) else: - MI = mutual_info_score(labels_true, labels_pred, max_n_classes=max_n_classes) + MI = mutual_info_score(labels_true, labels_pred, + max_n_classes=max_n_classes) homogeneity = MI / (entropy_C) if entropy_C else 1.0 completeness = MI / (entropy_K) if entropy_K else 1.0 @@ -332,7 +342,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=5 return homogeneity, completeness, v_measure_score -def homogeneity_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): +def homogeneity_score(labels_true, labels_pred, max_n_classes=5000, + sparse=False): """Homogeneity metric of a cluster labeling given a ground truth. A clustering result satisfies homogeneity if all of its clusters @@ -412,11 +423,13 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000, sparse=False 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse, - max_n_classes=max_n_classes)[0] + return \ + homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse, + max_n_classes=max_n_classes)[0] -def completeness_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): +def completeness_score(labels_true, labels_pred, max_n_classes=5000, + sparse=False): """Completeness metric of a cluster labeling given a ground truth. 
A clustering result satisfies completeness if all the data points @@ -492,8 +505,9 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000, sparse=Fals 0.0 """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse, - max_n_classes=max_n_classes)[1] + return \ + homogeneity_completeness_v_measure(labels_true, labels_pred, sparse=sparse, + max_n_classes=max_n_classes)[1] def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): @@ -597,11 +611,13 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000, sparse=False): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred, max_n_classes=max_n_classes, + return homogeneity_completeness_v_measure(labels_true, labels_pred, + max_n_classes=max_n_classes, sparse=sparse)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes=5000): +def mutual_info_score(labels_true, labels_pred, contingency=None, + max_n_classes=5000): """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of @@ -636,7 +652,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes= labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. - contingency: {None, array, sparse matrix}, shape = [n_classes_true, n_classes_pred] + contingency: {None, array, sparse matrix}, + shape = [n_classes_true, n_classes_pred] A contingency matrix given by the :func:`contingency_matrix` function. If value is ``None``, it will be computed, otherwise the given value is used, with ``labels_true`` and ``labels_pred`` ignored. 
@@ -658,7 +675,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes= """ if contingency is None: labels_true, labels_pred = check_clusterings(labels_true, labels_pred) - contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) if isinstance(contingency, np.ndarray): # For an array contingency = np.array(contingency, dtype='float') @@ -684,14 +702,16 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, max_n_classes= pj = np.array(contingency.sum(axis=0)).T nnzx, nnzy, nnz_val = find(contingency) log_contingency_nm = np.log(nnz_val) - contingency_nm = nnz_val * 1.0 / contingency_sum # python2 integer division... + contingency_nm = nnz_val * 1.0 / contingency_sum # Don't need to calculate the full outer product. Just for the non-zero values outer = np.array([pi[x] * pj[y] for x, y in zip(nnzx, nnzy)]).T log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) - mi = contingency_nm * (log_contingency_nm - log(contingency_sum)) + contingency_nm * log_outer + mi = contingency_nm * (log_contingency_nm - log(contingency_sum)) + \ + contingency_nm * log_outer return mi.sum() else: - raise ValueError("Unsupported type for 'contingency': " + str(type(contingency))) + raise ValueError( + "Unsupported type for 'contingency': " + str(type(contingency))) def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): @@ -780,9 +800,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): # Special limit cases: no clustering since the data is not split. # This is a perfect match hence return 1.0. 
if (classes.shape[0] == clusters.shape[0] == 1 or - classes.shape[0] == clusters.shape[0] == 0): + classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -868,7 +889,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): if (classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -950,7 +972,8 @@ def fowlkes_mallows_score(labels_true, labels_pred, max_n_classes=5000): labels_true, labels_pred = check_clusterings(labels_true, labels_pred, ) n_samples, = labels_true.shape - c = contingency_matrix(labels_true, labels_pred, max_n_classes=max_n_classes) + c = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) tk = np.dot(c.ravel(), c.ravel()) - n_samples pk = np.sum(np.sum(c, axis=0) ** 2) - n_samples qk = np.sum(np.sum(c, axis=1) ** 2) - n_samples