From fbd4b7904b9331bc53673b9cbdf8a5d76754dc29 Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Mon, 19 Oct 2015 11:51:50 +0200 Subject: [PATCH 1/3] - Add max_n_classes param to cluster.supervised metric - Add testing for the Value Error - Check that n_clusters,n_classes are not too high in contingency matrix --- sklearn/metrics/cluster/supervised.py | 97 +++++++++++++++---- .../metrics/cluster/tests/test_supervised.py | 34 ++++++- 2 files changed, 110 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 2d7d27c5a4c13..65b762f67d3e7 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -44,7 +44,7 @@ def check_clusterings(labels_true, labels_pred): return labels_true, labels_pred -def contingency_matrix(labels_true, labels_pred, eps=None): +def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000): """Build a contengency matrix describing the relationship between labels. Parameters @@ -60,6 +60,11 @@ def contingency_matrix(labels_true, labels_pred, eps=None): matrix. This helps to stop NaN propagation. If ``None``, nothing is adjusted. + max_n_classes: int + Maximal number of classes handled for contingency_matrix. + This help to avoid Memory error with regression target + for mutual_information. + Returns ------- contingency: array, shape=[n_classes_true, n_classes_pred] @@ -72,6 +77,14 @@ def contingency_matrix(labels_true, labels_pred, eps=None): clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) n_classes = classes.shape[0] n_clusters = clusters.shape[0] + if n_classes > max_n_classes: + raise ValueError("Too many classes for a clustering metric. If you " + "want to increase the limit, pass parameter " + "max_n_classes to the scoring function") + if n_clusters > max_n_classes: + raise ValueError("Too many clusters for a clustering metric. If you " + "want to increase the limit, pass parameter " + "max_n_classes to the scoring function") # Using coo_matrix to accelerate simple histogram calculation, # i.e. bins are consecutive integers # Currently, coo_matrix is faster than histogram2d for simple cases @@ -87,7 +100,7 @@ def contingency_matrix(labels_true, labels_pred, eps=None): # clustering measures -def adjusted_rand_score(labels_true, labels_pred): +def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000): """Rand index adjusted for chance The Rand Index computes a similarity measure between two clusterings @@ -119,6 +132,11 @@ def adjusted_rand_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] Cluster labels to evaluate + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. 
Setting it too high can lead to MemoryError or OS + freeze + Returns ------- ari : float @@ -180,7 +198,8 @@ def adjusted_rand_score(labels_true, labels_pred): or classes.shape[0] == clusters.shape[0] == len(labels_true)): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) # Compute the ARI using the contingency data sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1)) @@ -192,7 +211,8 @@ def adjusted_rand_score(labels_true, labels_pred): return ((sum_comb - prod_comb) / (mean_comb - prod_comb)) -def homogeneity_completeness_v_measure(labels_true, labels_pred): +def homogeneity_completeness_v_measure(labels_true, labels_pred, + max_n_classes=5000): """Compute the homogeneity and completeness and V-Measure scores at once Those metrics are based on normalized conditional entropy measures of @@ -226,6 +246,11 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred): labels_pred : array, shape = [n_samples] cluster labels to evaluate + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- homogeneity: float @@ -251,7 +276,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred): entropy_C = entropy(labels_true) entropy_K = entropy(labels_pred) - MI = mutual_info_score(labels_true, labels_pred) + MI = mutual_info_score(labels_true, labels_pred, + max_n_classes=max_n_classes) homogeneity = MI / (entropy_C) if entropy_C else 1.0 completeness = MI / (entropy_K) if entropy_K else 1.0 @@ -265,7 +291,7 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred): return homogeneity, completeness, v_measure_score -def homogeneity_score(labels_true, labels_pred): +def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): """Homogeneity metric of a cluster labeling given a ground truth A clustering result satisfies homogeneity if all of its clusters @@ -289,6 +315,11 @@ def homogeneity_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] cluster labels to evaluate + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- homogeneity: float @@ -336,10 +367,11 @@ def homogeneity_score(labels_true, labels_pred): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred)[0] + return homogeneity_completeness_v_measure(labels_true, labels_pred, + max_n_classes)[0] -def completeness_score(labels_true, labels_pred): +def completeness_score(labels_true, labels_pred, max_n_classes=5000): """Completeness metric of a cluster labeling given a ground truth A clustering result satisfies completeness if all the data points @@ -363,6 +395,11 @@ def completeness_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] cluster labels to evaluate + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. 
Setting it too high can lead to MemoryError or OS + freeze + Returns ------- completeness: float @@ -406,10 +443,11 @@ def completeness_score(labels_true, labels_pred): 0.0 """ - return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] + return homogeneity_completeness_v_measure(labels_true, labels_pred, + max_n_classes)[1] -def v_measure_score(labels_true, labels_pred): +def v_measure_score(labels_true, labels_pred, max_n_classes=5000): """V-measure cluster labeling given a ground truth. This score is identical to :func:`normalized_mutual_info_score`. @@ -437,6 +475,11 @@ def v_measure_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] cluster labels to evaluate + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- v_measure: float @@ -501,10 +544,12 @@ def v_measure_score(labels_true, labels_pred): 0.0... """ - return homogeneity_completeness_v_measure(labels_true, labels_pred)[2] + return homogeneity_completeness_v_measure(labels_true, labels_pred, + max_n_classes)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None): +def mutual_info_score(labels_true, labels_pred, contingency=None, + max_n_classes=5000): """Mutual Information between two clusterings The Mutual Information is a measure of the similarity between two labels of @@ -544,6 +589,11 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): If value is ``None``, it will be computed, otherwise the given value is used, with ``labels_true`` and ``labels_pred`` ignored. + max_n_classes : int + Maximal number of class handled by the mutual_info_score + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- mi: float @@ -556,7 +606,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): """ if contingency is None: labels_true, labels_pred = check_clusterings(labels_true, labels_pred) - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') contingency_sum = np.sum(contingency) pi = np.sum(contingency, axis=1) @@ -575,7 +626,7 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): return mi.sum() -def adjusted_mutual_info_score(labels_true, labels_pred): +def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): """Adjusted Mutual Information between two clusterings Adjusted Mutual Information (AMI) is an adjustment of the Mutual @@ -608,6 +659,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. 
Setting it too high can lead to MemoryError or OS + freeze + Returns ------- ami: float(upperlimited by 1.0) @@ -658,7 +714,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred): if (classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, @@ -671,7 +728,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred): return ami -def normalized_mutual_info_score(labels_true, labels_pred): +def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): """Normalized Mutual Information between two clusterings Normalized Mutual Information (NMI) is an normalization of the Mutual @@ -701,6 +758,11 @@ def normalized_mutual_info_score(labels_true, labels_pred): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. + max_n_classes : int + Maximal number of class handled by the adjusted_rand_score + metric. Setting it too high can lead to MemoryError or OS + freeze + Returns ------- nmi: float @@ -739,7 +801,8 @@ def normalized_mutual_info_score(labels_true, labels_pred): if (classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0): return 1.0 - contingency = contingency_matrix(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, + max_n_classes=max_n_classes) contingency = np.array(contingency, dtype='float') # Calculate the MI for the two clusterings mi = mutual_info_score(labels_true, labels_pred, diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 121f1f7d72518..ff2792981c2f1 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -181,10 +181,14 @@ def test_exactly_zero_info_score(): for i in np.logspace(1, 4, 4).astype(np.int): labels_a, labels_b = np.ones(i, dtype=np.int),\ np.arange(i, dtype=np.int) - assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0) - assert_equal(v_measure_score(labels_a, labels_b), 0.0) - assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0) - assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0) + assert_equal(normalized_mutual_info_score(labels_a, labels_b, + max_n_classes=1e4), 0.0) + assert_equal(v_measure_score(labels_a, labels_b, + max_n_classes=1e4), 0.0) + assert_equal(adjusted_mutual_info_score(labels_a, labels_b, + max_n_classes=1e4), 0.0) + assert_equal(normalized_mutual_info_score(labels_a, labels_b, + max_n_classes=1e4), 0.0) def test_v_measure_and_mutual_information(seed=36): @@ -196,3 +200,25 @@ def test_v_measure_and_mutual_information(seed=36): assert_almost_equal(v_measure_score(labels_a, labels_b), 2.0 * mutual_info_score(labels_a, labels_b) / (entropy(labels_a) + entropy(labels_b)), 0) + + +def test_max_n_classes(): + labels_true = np.random.random(5003) + labels_pred = np.random.random(5003) + labels_zero = np.zeros(5003) + labels_true[:2] = 0 + labels_zero[:3] = 1 + labels_pred[:2] = 0 + for score_func in score_funcs: + expected = ("Too many classes for a clustering metric. 
If you "
+                    "want to increase the limit, pass parameter "
+                    "max_n_classes to the scoring function")
+        assert_raise_message(ValueError, expected, score_func,
+                             labels_true, labels_pred,
+                             max_n_classes=500)
+        expected = ("Too many clusters for a clustering metric. If you "
+                    "want to increase the limit, pass parameter "
+                    "max_n_classes to the scoring function")
+        assert_raise_message(ValueError, expected, score_func,
+                             labels_zero, labels_pred,
+                             max_n_classes=500)

From e916008b8b6350ea38ac5895238bc294bf388ce0 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Mon, 19 Oct 2015 13:32:27 +0200
Subject: [PATCH 2/3] Reduce test time.

-Now the overall test time is below .2s for cluster metrics
---
 sklearn/metrics/cluster/tests/test_supervised.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index ff2792981c2f1..59d283f96b853 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -203,9 +203,9 @@ def test_v_measure_and_mutual_information(seed=36):
 
 
 def test_max_n_classes():
-    labels_true = np.random.random(5003)
-    labels_pred = np.random.random(5003)
-    labels_zero = np.zeros(5003)
+    labels_true = np.random.random(53)
+    labels_pred = np.random.random(53)
+    labels_zero = np.zeros(53)
     labels_true[:2] = 0
     labels_zero[:3] = 1
     labels_pred[:2] = 0
@@ -215,10 +215,10 @@ def test_max_n_classes():
                     "max_n_classes to the scoring function")
         assert_raise_message(ValueError, expected, score_func,
                              labels_true, labels_pred,
-                             max_n_classes=500)
+                             max_n_classes=50)
         expected = ("Too many clusters for a clustering metric. If you "
                     "want to increase the limit, pass parameter "
                     "max_n_classes to the scoring function")
         assert_raise_message(ValueError, expected, score_func,
                              labels_zero, labels_pred,
-                             max_n_classes=500)
+                             max_n_classes=50)

From 1dd55187a311a7e222a910904e70d296d7099941 Mon Sep 17 00:00:00 2001
From: Thomas Moreau
Date: Mon, 19 Oct 2015 15:53:48 +0200
Subject: [PATCH 3/3] Correct typo following Gael's comment

---
 sklearn/metrics/cluster/supervised.py            | 36 +++++++++----------
 .../metrics/cluster/tests/test_supervised.py     |  5 +--
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 65b762f67d3e7..b61a528fb3819 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -60,8 +60,8 @@ def contingency_matrix(labels_true, labels_pred, eps=None, max_n_classes=5000):
         matrix. This helps to stop NaN propagation.  If ``None``, nothing is
         adjusted.
 
-    max_n_classes: int
-        Maximal number of classes handled for contingency_matrix.
+    max_n_classes : int, optional (default=5000)
+        Maximal number of classes handled for contingency_matrix.
         This help to avoid Memory error with regression target
         for mutual_information.
 
@@ -132,8 +132,8 @@ def adjusted_rand_score(labels_true, labels_pred, max_n_classes=5000):
     labels_pred : array, shape = [n_samples]
         Cluster labels to evaluate
 
-    max_n_classes : int
-        Maximal number of class handled by the adjusted_rand_score
+    max_n_classes: int, optional (default=5000)
+        Maximal number of classes handled by the adjusted_rand_score
         metric. 
Setting it too high can lead to MemoryError or OS freeze @@ -246,8 +246,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes : int - Maximal number of class handled by the adjusted_rand_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score metric. Setting it too high can lead to MemoryError or OS freeze @@ -315,8 +315,8 @@ def homogeneity_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes : int - Maximal number of class handled by the adjusted_rand_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score metric. Setting it too high can lead to MemoryError or OS freeze @@ -395,8 +395,8 @@ def completeness_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes : int - Maximal number of class handled by the adjusted_rand_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score metric. Setting it too high can lead to MemoryError or OS freeze @@ -475,8 +475,8 @@ def v_measure_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] cluster labels to evaluate - max_n_classes : int - Maximal number of class handled by the adjusted_rand_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score metric. Setting it too high can lead to MemoryError or OS freeze @@ -589,8 +589,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None, If value is ``None``, it will be computed, otherwise the given value is used, with ``labels_true`` and ``labels_pred`` ignored. - max_n_classes : int - Maximal number of class handled by the mutual_info_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the mutual_info_score metric. Setting it too high can lead to MemoryError or OS freeze @@ -659,8 +659,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. - max_n_classes : int - Maximal number of class handled by the adjusted_rand_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score metric. Setting it too high can lead to MemoryError or OS freeze @@ -758,8 +758,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, max_n_classes=5000): labels_pred : array, shape = [n_samples] A clustering of the data into disjoint subsets. - max_n_classes : int - Maximal number of class handled by the adjusted_rand_score + max_n_classes: int, optional (default=5000) + Maximal number of classes handled by the adjusted_rand_score metric. 
Setting it too high can lead to MemoryError or OS freeze diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 59d283f96b853..dd3beb27e111c 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -203,8 +203,9 @@ def test_v_measure_and_mutual_information(seed=36): def test_max_n_classes(): - labels_true = np.random.random(53) - labels_pred = np.random.random(53) + rng = np.random.RandomState(seed=0) + labels_true = rng.rand(53) + labels_pred = rng.rand(53) labels_zero = np.zeros(53) labels_true[:2] = 0 labels_zero[:3] = 1
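Note (not part of the patches): a minimal usage sketch of the guard this series introduces, assuming a scikit-learn checkout with these three commits applied. The continuous labels, the sample count, and the limit values below are made up for illustration; only the ``max_n_classes`` keyword and the "Too many classes" ValueError come from the patch itself.

    # Illustrative only: requires the max_n_classes parameter added above.
    import numpy as np
    from sklearn.metrics.cluster import mutual_info_score

    rng = np.random.RandomState(0)
    # A continuous target yields one "class" per sample, which is the
    # situation the new check is meant to catch before the contingency
    # matrix exhausts memory.
    labels_true = rng.rand(100)
    labels_pred = rng.rand(100)

    try:
        mutual_info_score(labels_true, labels_pred, max_n_classes=50)
    except ValueError as exc:
        print(exc)  # Too many classes for a clustering metric. ...

    # Explicitly raising the limit lets the computation proceed.
    print(mutual_info_score(labels_true, labels_pred, max_n_classes=200))

The same keyword is accepted by the other supervised metrics touched in this series (adjusted_rand_score, homogeneity_score, completeness_score, v_measure_score, adjusted_mutual_info_score, normalized_mutual_info_score), which ultimately forward it down to contingency_matrix.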