From 5d1f17fdde4122125eed6277f449ca74ad335f00 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Sat, 16 Sep 2017 23:06:38 +0800 Subject: [PATCH 01/26] move roc_auc_score --- sklearn/metrics/base.py | 3 ++- sklearn/metrics/tests/test_common.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b8bbab30930b4..90b742ccd3988 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -67,6 +67,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, raise ValueError('average has to be one of {0}' ''.format(average_options)) + check_consistent_length(y_true, y_score, sample_weight) + y_type = type_of_target(y_true) if y_type not in ("binary", "multilabel-indicator"): raise ValueError("{0} format is not supported".format(y_type)) @@ -74,7 +76,6 @@ def _average_binary_score(binary_metric, y_true, y_score, average, if y_type == "binary": return binary_metric(y_true, y_score, sample_weight=sample_weight) - check_consistent_length(y_true, y_score, sample_weight) y_true = check_array(y_true) y_score = check_array(y_score) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 5f775aaf9ac8f..f96fa2cf9db19 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -198,12 +198,6 @@ "samples_recall_score", "coverage_error", - "roc_auc_score", - "micro_roc_auc", - "weighted_roc_auc", - "macro_roc_auc", - "samples_roc_auc", - "average_precision_score", "weighted_average_precision_score", "micro_average_precision_score", @@ -218,6 +212,11 @@ METRIC_UNDEFINED_MULTICLASS = [ "brier_score_loss", + "roc_auc_score", + "micro_roc_auc", + "weighted_roc_auc", + "macro_roc_auc", + "samples_roc_auc", # with default average='binary', multiclass is prohibited "precision_score", "recall_score", @@ -996,9 +995,10 @@ def check_sample_weight_invariance(name, metric, y1, y2): (weighted_score_zeroed, weighted_score_subset, name))) if not name.startswith('unnormalized'): - # check that the score is invariant under scaling of the weights by a - # common factor - for scaling in [2, 0.3]: + # Check that the score is invariant under scaling of the weights by a + # common factor. The scaling value is carefully chosen to reduce minor + # errors introduced by python when doing floating operations. 
+ for scaling in [5, 0.5]: assert_almost_equal( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), From 5ffad5eaebdaaccf735e8a3a3094520f3e376ac2 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Sun, 17 Sep 2017 08:33:27 +0800 Subject: [PATCH 02/26] minor improve --- sklearn/metrics/base.py | 4 ++-- sklearn/metrics/tests/test_common.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 90b742ccd3988..e02fd9f566a94 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -67,12 +67,12 @@ def _average_binary_score(binary_metric, y_true, y_score, average, raise ValueError('average has to be one of {0}' ''.format(average_options)) - check_consistent_length(y_true, y_score, sample_weight) - y_type = type_of_target(y_true) if y_type not in ("binary", "multilabel-indicator"): raise ValueError("{0} format is not supported".format(y_type)) + check_consistent_length(y_true, y_score, sample_weight) + if y_type == "binary": return binary_metric(y_true, y_score, sample_weight=sample_weight) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index f96fa2cf9db19..aa8f5fc96a21e 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -217,6 +217,7 @@ "weighted_roc_auc", "macro_roc_auc", "samples_roc_auc", + # with default average='binary', multiclass is prohibited "precision_score", "recall_score", From a8ebe416e1498c27e2772651f30091bfcdb277c0 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 18 Sep 2017 11:55:20 +0800 Subject: [PATCH 03/26] set decimal=2 --- sklearn/metrics/tests/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index aa8f5fc96a21e..c434709984490 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -997,12 +997,12 @@ def check_sample_weight_invariance(name, metric, y1, y2): if not name.startswith('unnormalized'): # Check that the score is invariant under scaling of the weights by a - # common factor. The scaling value is carefully chosen to reduce minor - # errors introduced by python when doing floating operations. 
- for scaling in [5, 0.5]: + # common factor + for scaling in [2, 0.3]: assert_almost_equal( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), + decimal=2, err_msg="%s sample_weight is not invariant " "under scaling" % name) From 5ef8af44ded44fecd59ca33c57c58ac53ffee8a9 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 18 Sep 2017 11:56:21 +0800 Subject: [PATCH 04/26] minor fix --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index c434709984490..58b6035ba9134 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -996,7 +996,7 @@ def check_sample_weight_invariance(name, metric, y1, y2): (weighted_score_zeroed, weighted_score_subset, name))) if not name.startswith('unnormalized'): - # Check that the score is invariant under scaling of the weights by a + # check that the score is invariant under scaling of the weights by a # common factor for scaling in [2, 0.3]: assert_almost_equal( From 578c6f0950b1d0a1806420b03ac9f3e99e8ed7a4 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 18 Sep 2017 12:59:52 +0800 Subject: [PATCH 05/26] test decimal=1 --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 58b6035ba9134..a9f070127335e 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1002,7 +1002,7 @@ def check_sample_weight_invariance(name, metric, y1, y2): assert_almost_equal( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), - decimal=2, + decimal=1, err_msg="%s sample_weight is not invariant " "under scaling" % name) From ab3ed4f66f4213e66ecd5e0e66127feed5b41450 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Mon, 18 Sep 2017 17:01:16 +0800 Subject: [PATCH 06/26] use assert_allclose --- sklearn/metrics/tests/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index a9f070127335e..c7c93fb0ad718 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -999,10 +999,10 @@ def check_sample_weight_invariance(name, metric, y1, y2): # check that the score is invariant under scaling of the weights by a # common factor for scaling in [2, 0.3]: - assert_almost_equal( + np.testing.assert_allclose( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), - decimal=1, + atol=1e-2, err_msg="%s sample_weight is not invariant " "under scaling" % name) From 6b2cf79a197a0c290577e6351b0bfc42cdd5d582 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 19 Sep 2017 15:06:57 +0800 Subject: [PATCH 07/26] try another way --- sklearn/metrics/tests/test_common.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index c7c93fb0ad718..19bb4306db859 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -943,7 +943,7 @@ def test_averaging_multilabel_all_ones(): @ignore_warnings def check_sample_weight_invariance(name, metric, y1, y2): - rng = np.random.RandomState(0) + rng = np.random.RandomState(10) sample_weight = rng.randint(1, 10, size=len(y1)) # check that unit weights gives the same score as no weight @@ -999,10 +999,9 @@ def 
check_sample_weight_invariance(name, metric, y1, y2): # check that the score is invariant under scaling of the weights by a # common factor for scaling in [2, 0.3]: - np.testing.assert_allclose( + assert_almost_equal( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), - atol=1e-2, err_msg="%s sample_weight is not invariant " "under scaling" % name) @@ -1027,7 +1026,7 @@ def test_sample_weight_invariance(n_samples=50): metric, y_true, y_pred # binary - random_state = check_random_state(0) + random_state = check_random_state(10) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples,)) From 4c6b4ba61f705fa5a486163c1f95d43c9d182035 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 19 Sep 2017 15:21:26 +0800 Subject: [PATCH 08/26] Revert "try another way" This reverts commit 6b2cf79a197a0c290577e6351b0bfc42cdd5d582. --- sklearn/metrics/tests/test_common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 19bb4306db859..c7c93fb0ad718 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -943,7 +943,7 @@ def test_averaging_multilabel_all_ones(): @ignore_warnings def check_sample_weight_invariance(name, metric, y1, y2): - rng = np.random.RandomState(10) + rng = np.random.RandomState(0) sample_weight = rng.randint(1, 10, size=len(y1)) # check that unit weights gives the same score as no weight @@ -999,9 +999,10 @@ def check_sample_weight_invariance(name, metric, y1, y2): # check that the score is invariant under scaling of the weights by a # common factor for scaling in [2, 0.3]: - assert_almost_equal( + np.testing.assert_allclose( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), + atol=1e-2, err_msg="%s sample_weight is not invariant " "under scaling" % name) @@ -1026,7 +1027,7 @@ def test_sample_weight_invariance(n_samples=50): metric, y_true, y_pred # binary - random_state = check_random_state(10) + random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples,)) From bfa46d6b4a40977c17540636d98d7aa8a9d05a78 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 19 Sep 2017 16:13:06 +0800 Subject: [PATCH 09/26] lesteve's idea --- sklearn/metrics/tests/test_common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index c7c93fb0ad718..622981f1fc123 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -998,13 +998,17 @@ def check_sample_weight_invariance(name, metric, y1, y2): if not name.startswith('unnormalized'): # check that the score is invariant under scaling of the weights by a # common factor + + # FIXME: roc_auc scores are more unstable than other scores + kwargs = {'atol': 1e-2} if 'roc_auc' in name else {} + for scaling in [2, 0.3]: np.testing.assert_allclose( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), - atol=1e-2, err_msg="%s sample_weight is not invariant " - "under scaling" % name) + "under scaling" % name, + **kwargs) # Check that if sample_weight.shape[0] != y_true.shape[0], it raised an # error From b58e61f14fa735e995d4dfcb85735e9a01cc0530 Mon Sep 17 00:00:00 2001 From: Hanmin 
Qin Date: Tue, 19 Sep 2017 17:28:22 +0800 Subject: [PATCH 10/26] have a try --- sklearn/metrics/tests/test_common.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 622981f1fc123..e5f6f3520b9f8 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1000,15 +1000,16 @@ def check_sample_weight_invariance(name, metric, y1, y2): # common factor # FIXME: roc_auc scores are more unstable than other scores - kwargs = {'atol': 1e-2} if 'roc_auc' in name else {} + if 'roc_auc' in name: + y2 = np.round(y2, 1) + weighted_score = metric(y1, y2, sample_weight=sample_weight) for scaling in [2, 0.3]: - np.testing.assert_allclose( + assert_almost_equal( weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), err_msg="%s sample_weight is not invariant " - "under scaling" % name, - **kwargs) + "under scaling" % name) # Check that if sample_weight.shape[0] != y_true.shape[0], it raised an # error @@ -1031,7 +1032,7 @@ def test_sample_weight_invariance(n_samples=50): metric, y_true, y_pred # binary - random_state = check_random_state(0) + random_state = check_random_state(10) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples,)) From b1ac80a4c53a6c6d19499b8eaa6cff98812ef72f Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 11:49:02 +0800 Subject: [PATCH 11/26] refer to previous commit for previous solution --- sklearn/metrics/ranking.py | 6 +++--- sklearn/metrics/tests/test_common.py | 8 +------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index aa2e5425976e9..419684d4589c6 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -88,8 +88,8 @@ def auc(x, y, reorder=False): x, y = x[order], y[order] else: dx = np.diff(x) - if np.any(dx < 0): - if np.all(dx <= 0): + if np.any(dx < -1e-10): + if np.all(dx <= 1e-10): direction = -1 else: raise ValueError("Reordering is not turned on, and " @@ -258,7 +258,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) - return auc(fpr, tpr, reorder=True) + return auc(fpr, tpr, reorder=False) return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index e5f6f3520b9f8..b935ccbe29910 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -998,12 +998,6 @@ def check_sample_weight_invariance(name, metric, y1, y2): if not name.startswith('unnormalized'): # check that the score is invariant under scaling of the weights by a # common factor - - # FIXME: roc_auc scores are more unstable than other scores - if 'roc_auc' in name: - y2 = np.round(y2, 1) - weighted_score = metric(y1, y2, sample_weight=sample_weight) - for scaling in [2, 0.3]: assert_almost_equal( weighted_score, @@ -1032,7 +1026,7 @@ def test_sample_weight_invariance(n_samples=50): metric, y_true, y_pred # binary - random_state = check_random_state(10) + random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples,)) From 
aa38bb3619a7fe04d5f02ad21a264cd0a597900f Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 13:56:43 +0800 Subject: [PATCH 12/26] update what's new --- doc/whats_new/v0.20.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 06bcc9a4e6cf8..f75f41e36a1cb 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -17,6 +17,7 @@ random sampling procedures. - :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) - :class:`isotonic.IsotonicRegression` (bug fix) +- :class:`metrics.roc_auc_score` (enhancement) Details are listed in the changelog below. @@ -54,12 +55,15 @@ Classifiers and regressors :class:`sklearn.ensemble.voting_classifier` to access fitted estimators. :issue:`9157` by :user:`Herilalaina Rakotoarison `. - Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. :issue:`9521` by :user:`Hanmin Qin `. +- Improve the efficiency and stability of :func:`metrics.roc_auc_score` + through removing unnecessary sorting process. + :issue:`9786` by :user:`Hanmin Qin `. + Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the From 60ae8bbdf33005b8bcdf2457bd7df4cf6dbbccab Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 14:14:16 +0800 Subject: [PATCH 13/26] empty commit --- doc/whats_new/v0.20.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 3eb5f14e446c8..8f4b596ae2227 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -68,6 +68,7 @@ Model evaluation and meta-estimators through removing unnecessary sorting process. :issue:`9786` by :user:`Hanmin Qin `. + Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the From 1ef462fb93c5e24eb3994734c3bad2e6a386fa13 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 14:15:25 +0800 Subject: [PATCH 14/26] empty commit (unstable travis ...) --- doc/whats_new/v0.20.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 8f4b596ae2227..3eb5f14e446c8 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -68,7 +68,6 @@ Model evaluation and meta-estimators through removing unnecessary sorting process. :issue:`9786` by :user:`Hanmin Qin `. 
- Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the From 4361ce7c7db3007aec3f0b98c646d4fe3e3168d0 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 17:19:33 +0800 Subject: [PATCH 15/26] conservative change according to lesteve --- sklearn/metrics/base.py | 3 +-- sklearn/metrics/ranking.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index e02fd9f566a94..b8bbab30930b4 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -71,11 +71,10 @@ def _average_binary_score(binary_metric, y_true, y_score, average, if y_type not in ("binary", "multilabel-indicator"): raise ValueError("{0} format is not supported".format(y_type)) - check_consistent_length(y_true, y_score, sample_weight) - if y_type == "binary": return binary_metric(y_true, y_score, sample_weight=sample_weight) + check_consistent_length(y_true, y_score, sample_weight) y_true = check_array(y_true) y_score = check_array(y_score) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 419684d4589c6..ce2012a1ab268 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -299,7 +299,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): thresholds : array, shape = [n_thresholds] Decreasing score values. """ - check_consistent_length(y_true, y_score) + check_consistent_length(y_true, y_score, sample_weight) y_true = column_or_1d(y_true) y_score = column_or_1d(y_score) assert_all_finite(y_true) From c947d7b0f569adbb2540bc6f6065ea347e2166f8 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 18:53:36 +0800 Subject: [PATCH 16/26] not use auc? --- sklearn/metrics/ranking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index ce2012a1ab268..22aa1e9cb0e88 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -88,8 +88,8 @@ def auc(x, y, reorder=False): x, y = x[order], y[order] else: dx = np.diff(x) - if np.any(dx < -1e-10): - if np.all(dx <= 1e-10): + if np.any(dx < 0): + if np.all(dx <= 0): direction = -1 else: raise ValueError("Reordering is not turned on, and " @@ -258,7 +258,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) - return auc(fpr, tpr, reorder=False) + return np.trapz(tpr, fpr) return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, From c09c26a192514280e2cbe83c74d98712821ac354 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 21:35:04 +0800 Subject: [PATCH 17/26] lesteve's great idea --- sklearn/metrics/ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 22aa1e9cb0e88..2e9f49b9e731b 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -341,7 +341,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): # accumulate the true positives with decreasing threshold tps = stable_cumsum(y_true * weight)[threshold_idxs] if sample_weight is not None: - fps = stable_cumsum(weight)[threshold_idxs] - tps + fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] else: fps = 1 + threshold_idxs - tps return fps, tps, y_score[threshold_idxs] From 2b346e8abd07d1f4afb02988d88d599a6a41952d Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 
Sep 2017 21:38:33 +0800 Subject: [PATCH 18/26] empty commit --- sklearn/metrics/ranking.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 2e9f49b9e731b..b2ad9d9be42b1 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -258,6 +258,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) + return np.trapz(tpr, fpr) return _average_binary_score( From 9be6181c06c8293abdb7feb94bebfa9f3dcd4f89 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 21:39:17 +0800 Subject: [PATCH 19/26] empty commit (CI so unstable) --- sklearn/metrics/ranking.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index b2ad9d9be42b1..2e9f49b9e731b 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -258,7 +258,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) - return np.trapz(tpr, fpr) return _average_binary_score( From cf6a08fbb3ae8dcabec8b20a687f66889cd13df6 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 20 Sep 2017 23:07:42 +0800 Subject: [PATCH 20/26] lesteve's idea --- sklearn/metrics/ranking.py | 2 +- sklearn/metrics/tests/test_ranking.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 2e9f49b9e731b..6a5b7368beaa4 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -258,7 +258,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) - return np.trapz(tpr, fpr) + return auc(fpr, tpr, reorder=False) return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index db80691663606..ec5822ec99b25 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -371,6 +371,20 @@ def test_roc_curve_drop_intermediate(): [1.0, 0.9, 0.7, 0.6, 0.]) +def test_roc_curve_fpr_tpr_increasing(): + # Ensure that fpr and tpr returned by roc_curve are increasing + # Regression test for issue #9786 + n_samples = 50 + rng = np.random.RandomState(0) + y_true = rng.randint(0, 2, size=(n_samples, )) + y_score = rng.random_sample(size=(n_samples,)) + sample_weight = rng.randint(1, 10, size=(n_samples, )) + fpr, tpr, _ = roc_curve(y_true, y_score, + sample_weight=sample_weight * 0.2) + assert_equal((np.diff(fpr) < 0).sum(), 0) + assert_equal((np.diff(tpr) < 0).sum(), 0) + + def test_auc(): # Test Area Under Curve (AUC) computation x = [0, 1] From b0f2efcad6a855def46b64ec8fd9e6db68c1677c Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Thu, 21 Sep 2017 09:07:10 +0800 Subject: [PATCH 21/26] update what's new --- doc/whats_new/v0.20.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 3eb5f14e446c8..0f002e89b3519 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -18,6 +18,7 @@ random sampling procedures. 
- :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) - :class:`isotonic.IsotonicRegression` (bug fix) - :class:`metrics.roc_auc_score` (enhancement) +- :class:`metrics.roc_curve` (enhancement) Details are listed in the changelog below. @@ -64,8 +65,8 @@ Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. :issue:`9521` by :user:`Hanmin Qin `. -- Improve the efficiency and stability of :func:`metrics.roc_auc_score` - through removing unnecessary sorting process. +- Improve the stability of :func:`metrics.roc_auc_score` + and :func:`metrics.roc_curve` in float calculations. :issue:`9786` by :user:`Hanmin Qin `. Linear, kernelized and related models From a15bc3c516ccacb5e0cd22357822831bc3dc34b2 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Tue, 26 Sep 2017 19:26:59 +0800 Subject: [PATCH 22/26] improve --- doc/whats_new/v0.20.rst | 12 ++++++------ sklearn/metrics/tests/test_ranking.py | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 0f002e89b3519..48ae8edb91671 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -17,8 +17,7 @@ random sampling procedures. - :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) - :class:`isotonic.IsotonicRegression` (bug fix) -- :class:`metrics.roc_auc_score` (enhancement) -- :class:`metrics.roc_curve` (enhancement) +- :class:`metrics.roc_auc_score` (bug fix) Details are listed in the changelog below. @@ -65,10 +64,6 @@ Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. :issue:`9521` by :user:`Hanmin Qin `. -- Improve the stability of :func:`metrics.roc_auc_score` - and :func:`metrics.roc_curve` in float calculations. - :issue:`9786` by :user:`Hanmin Qin `. - Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the @@ -112,6 +107,11 @@ Decomposition, manifold learning and clustering - Fixed a bug in :func:`datasets.fetch_kddcup99`, where data were not properly shuffled. :issue:`9731` by `Nicolas Goix`_. +Model evaluation and meta-estimators + +- Fixed a bug in :func:`metrics.roc_auc_score`, where float calculations sometimes + introduce significant error. :issue:`9786` by :user:`Hanmin Qin `. + API changes summary ------------------- diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index ec5822ec99b25..e1c0f8791c99c 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -373,12 +373,13 @@ def test_roc_curve_drop_intermediate(): def test_roc_curve_fpr_tpr_increasing(): # Ensure that fpr and tpr returned by roc_curve are increasing - # Regression test for issue #9786 n_samples = 50 rng = np.random.RandomState(0) y_true = rng.randint(0, 2, size=(n_samples, )) y_score = rng.random_sample(size=(n_samples,)) sample_weight = rng.randint(1, 10, size=(n_samples, )) + # Construct an edge case with float y_score and sample_weight + # when some adjacent values of fpr and tpr are the same. 
fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight * 0.2) assert_equal((np.diff(fpr) < 0).sum(), 0) From 5b054c2ff3e27c8103f3c91fb511b4dace2298a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 27 Sep 2017 15:47:29 +0200 Subject: [PATCH 23/26] Minor tweaks --- sklearn/metrics/ranking.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 6a5b7368beaa4..435b3b6502f42 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -258,7 +258,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) - return auc(fpr, tpr, reorder=False) + return auc(fpr, tpr) return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, @@ -341,6 +341,8 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): # accumulate the true positives with decreasing threshold tps = stable_cumsum(y_true * weight)[threshold_idxs] if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presense of floating point errors fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] else: fps = 1 + threshold_idxs - tps From 145d34e5210699f4713e5ab26a0b8796521fb769 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 27 Sep 2017 21:53:11 +0800 Subject: [PATCH 24/26] minor comment --- sklearn/metrics/tests/test_ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index e1c0f8791c99c..53126a090ac17 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -379,7 +379,7 @@ def test_roc_curve_fpr_tpr_increasing(): y_score = rng.random_sample(size=(n_samples,)) sample_weight = rng.randint(1, 10, size=(n_samples, )) # Construct an edge case with float y_score and sample_weight - # when some adjacent values of fpr and tpr are the same. + # when some adjacent values of fpr and tpr are actually the same. fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight * 0.2) assert_equal((np.diff(fpr) < 0).sum(), 0) From 0fdbe660b32e469a39eb28cb36ef09c4c68fd3fa Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 27 Sep 2017 22:19:07 +0800 Subject: [PATCH 25/26] try new test on the CI --- sklearn/metrics/tests/test_ranking.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 53126a090ac17..6b5dd0815accb 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -372,16 +372,13 @@ def test_roc_curve_drop_intermediate(): def test_roc_curve_fpr_tpr_increasing(): - # Ensure that fpr and tpr returned by roc_curve are increasing - n_samples = 50 - rng = np.random.RandomState(0) - y_true = rng.randint(0, 2, size=(n_samples, )) - y_score = rng.random_sample(size=(n_samples,)) - sample_weight = rng.randint(1, 10, size=(n_samples, )) + # Ensure that fpr and tpr returned by roc_curve are increasing. # Construct an edge case with float y_score and sample_weight - # when some adjacent values of fpr and tpr are actually the same. - fpr, tpr, _ = roc_curve(y_true, y_score, - sample_weight=sample_weight * 0.2) + # when some adjacent values of fpr and tpr are the same. 
+ y_true = [0, 0, 1, 1, 1] + y_score = [0.1, 0.7, 0.3, 0.4, 0.5] + sample_weight = np.repeat(0.2, 5) + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) assert_equal((np.diff(fpr) < 0).sum(), 0) assert_equal((np.diff(tpr) < 0).sum(), 0) From e7eb7f09f8013a0a7439673cd2601229880f693b Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 27 Sep 2017 22:41:39 +0800 Subject: [PATCH 26/26] improve --- doc/whats_new/v0.20.rst | 6 +++--- sklearn/metrics/tests/test_ranking.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 48ae8edb91671..6ccdc58b7b3b0 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -107,10 +107,10 @@ Decomposition, manifold learning and clustering - Fixed a bug in :func:`datasets.fetch_kddcup99`, where data were not properly shuffled. :issue:`9731` by `Nicolas Goix`_. -Model evaluation and meta-estimators +Metrics -- Fixed a bug in :func:`metrics.roc_auc_score`, where float calculations sometimes - introduce significant error. :issue:`9786` by :user:`Hanmin Qin `. +- Fixed a bug due to floating point error in :func:`metrics.roc_auc_score` with + non-integer sample weights. :issue:`9786` by :user:`Hanmin Qin `. API changes summary ------------------- diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 6b5dd0815accb..ab8a4684c0c65 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -374,7 +374,7 @@ def test_roc_curve_drop_intermediate(): def test_roc_curve_fpr_tpr_increasing(): # Ensure that fpr and tpr returned by roc_curve are increasing. # Construct an edge case with float y_score and sample_weight - # when some adjacent values of fpr and tpr are the same. + # when some adjacent values of fpr and tpr are actually the same. y_true = [0, 0, 1, 1, 1] y_score = [0.1, 0.7, 0.3, 0.4, 0.5] sample_weight = np.repeat(0.2, 5)
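
The central change in this series (patches 17 and 23) is to accumulate the
false positives directly as a cumulative sum instead of deriving them by
subtraction. The standalone sketch below is not part of the patches; it only
mimics the accumulation step of _binary_clf_curve on samples assumed to be
already sorted by decreasing score, and shows why the direct cumulative sum
is non-decreasing by construction while the subtraction-based form can pick
up tiny negative steps once the sample weights are non-integer:

    import numpy as np

    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=50)
    # non-integer sample weights, similar to the failing test case
    weight = rng.randint(1, 10, size=50) * 0.2

    # Assume the samples are already ordered by decreasing score.
    tps = np.cumsum(y_true * weight)

    # Old form: false positives obtained by subtracting two rounded
    # cumulative sums, so individual differences may come out very
    # slightly negative.
    fps_subtraction = np.cumsum(weight) - tps

    # New form: false positives accumulated directly from non-negative
    # terms, so the sequence cannot decrease.
    fps_direct = np.cumsum((1 - y_true) * weight)

    print("subtraction form non-decreasing:",
          np.all(np.diff(fps_subtraction) >= 0))
    print("direct form non-decreasing:",
          np.all(np.diff(fps_direct) >= 0))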
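Patches 16 and 20 switch the final area computation between np.trapz and
auc. When fpr is monotonically increasing the two agree, since auc reduces
to the trapezoidal rule in that case; a quick check of that equivalence,
assuming a scikit-learn build with these changes installed:

    import numpy as np
    from sklearn.metrics import auc, roc_curve

    y_true = [0, 0, 1, 1, 1]
    y_score = [0.1, 0.7, 0.3, 0.4, 0.5]
    fpr, tpr, _ = roc_curve(y_true, y_score)

    # With an increasing fpr, auc() is exactly the trapezoidal rule.
    np.testing.assert_allclose(auc(fpr, tpr), np.trapz(tpr, fpr))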
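Much of the earlier churn (patches 01 to 10) concerns the common test that a
weighted score must not change when every sample weight is multiplied by the
same factor. The sketch below restates that property through the public API;
it assumes a scikit-learn build that includes this fix, since before the fix
tiny floating-point differences in the weighted fpr could make the two calls
disagree:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=50)
    y_score = rng.random_sample(size=50)
    sample_weight = rng.randint(1, 10, size=50)

    weighted = roc_auc_score(y_true, y_score, sample_weight=sample_weight)

    # Rescaling all weights by a common factor should leave the score
    # unchanged (up to rounding noise).
    for scaling in [2, 0.3]:
        rescaled = roc_auc_score(y_true, y_score,
                                 sample_weight=sample_weight * scaling)
        np.testing.assert_allclose(rescaled, weighted)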