diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst
index ee7028f469b5f..a6c0f430c2638 100644
--- a/doc/modules/classification_threshold.rst
+++ b/doc/modules/classification_threshold.rst
@@ -1,6 +1,6 @@
 .. currentmodule:: sklearn.model_selection
 
-.. _TunedThresholdClassifierCV:
+.. _threshold_tuning:
 
 ==================================================
 Tuning the decision threshold for class prediction
@@ -63,7 +63,7 @@ Post-tuning the decision threshold
 
 One solution to address the problem stated in the introduction is to tune the decision
 threshold of the classifier once the model has been trained. The
-:class:`~sklearn.model_selection.TunedThresholdClassifierCV` tunes this threshold using
+:class:`TunedThresholdClassifierCV` tunes this threshold using
 an internal cross-validation. The optimum threshold is chosen to maximize a given
 metric.
 
@@ -80,6 +80,15 @@ a utility metric defined by the business (in this case an insurance company).
    :target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html
    :align: center
 
+Plotting a metric across thresholds
+-----------------------------------
+
+The final plot above shows the value of a utility metric of interest across a range
+of threshold values. This can be a useful visualization when tuning the decision
+threshold, especially if there is more than one metric of interest.
+:func:`~sklearn.metrics.decision_threshold_curve` generates the values required for
+each axis of such a plot: the score at each threshold and the threshold values.
+
 Options to tune the decision threshold
 --------------------------------------
@@ -120,7 +129,7 @@ a meaningful metric for their use case.
 Important notes regarding the internal cross-validation
 -------------------------------------------------------
 
-By default :class:`~sklearn.model_selection.TunedThresholdClassifierCV` uses a 5-fold
+By default :class:`TunedThresholdClassifierCV` uses a 5-fold
 stratified cross-validation to tune the decision threshold. The parameter `cv` allows to
 control the cross-validation strategy. It is possible to bypass cross-validation by
 setting `cv="prefit"` and providing a fitted classifier. In this case, the decision
@@ -143,7 +152,7 @@ Manually setting the decision threshold
 
 The previous sections discussed strategies to find an optimal decision threshold. It is
 also possible to manually set the decision threshold using the class
-:class:`~sklearn.model_selection.FixedThresholdClassifier`. In case that you don't want
+:class:`FixedThresholdClassifier`. In case that you don't want
 to refit the model when calling `fit`, wrap your sub-estimator with a
 :class:`~sklearn.frozen.FrozenEstimator` and do
 ``FixedThresholdClassifier(FrozenEstimator(estimator), ...)``.
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index cf168295a6024..cc15b4ecce185 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -63,7 +63,7 @@ The most common decisions are done on binary classification tasks, where the res
 probability of rain a decision is made on how to act (whether to take mitigating
 measures like an umbrella or not). For classifiers, this is what :term:`predict` returns.
-See also :ref:`TunedThresholdClassifierCV`.
+See also :ref:`threshold_tuning`.
 There are many scoring functions which measure different aspects of such a decision,
 most of them are covered with or derived from the :func:`metrics.confusion_matrix`.
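A minimal plotting sketch of the workflow described by the new user-guide paragraph,
assuming the `decision_threshold_curve` signature introduced in this PR; the dataset,
estimator, metrics and matplotlib usage below are illustrative only and not part of
the patch:

    import matplotlib.pyplot as plt

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score, decision_threshold_curve, f1_score

    X, y = make_classification(random_state=0)
    y_score = LogisticRegression().fit(X, y).predict_proba(X)[:, 1]

    for metric in (balanced_accuracy_score, f1_score):
        # The function returns the y values (scores) and x values (thresholds) of the plot.
        scores, thresholds = decision_threshold_curve(metric, y, y_score, thresholds=50)
        plt.plot(thresholds, scores, label=metric.__name__)

    plt.xlabel("decision threshold")
    plt.ylabel("score")
    plt.legend()
    plt.show()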
diff --git a/doc/whats_new/upcoming_changes/sklearn.metrics/31338.major-feature.rst b/doc/whats_new/upcoming_changes/sklearn.metrics/31338.major-feature.rst
new file mode 100644
index 0000000000000..52ebf021f0a7f
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.metrics/31338.major-feature.rst
@@ -0,0 +1,4 @@
+- :func:`metrics.decision_threshold_curve` has been added to assess performance
+  across a range of decision thresholds by computing a threshold-dependent
+  metric of interest at each threshold. By
+  :user:`Carlo Lemos ` and :user:`Lucy Liu `.
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index ce86525acc368..acf135c427c8c 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -26,6 +26,7 @@
     recall_score,
     zero_one_loss,
 )
+from ._decision_threshold import decision_threshold_curve
 from ._dist_metrics import DistanceMetric
 from ._plot.confusion_matrix import ConfusionMatrixDisplay
 from ._plot.det_curve import DetCurveDisplay
@@ -124,6 +125,7 @@
     "d2_tweedie_score",
     "davies_bouldin_score",
     "dcg_score",
+    "decision_threshold_curve",
     "det_curve",
     "euclidean_distances",
     "explained_variance_score",
diff --git a/sklearn/metrics/_decision_threshold.py b/sklearn/metrics/_decision_threshold.py
new file mode 100644
index 0000000000000..9dbadb22edc38
--- /dev/null
+++ b/sklearn/metrics/_decision_threshold.py
@@ -0,0 +1,117 @@
+"""Metric per threshold curve to assess binary classification performance.
+
+Compute a metric at each threshold, over a range of threshold values, to aid
+visualization of threshold-dependent metric behavior.
+
+Utilizes `_CurveScorer` methods to do all the computation.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+from numbers import Integral
+
+from ..utils._param_validation import Interval, validate_params
+
+
+@validate_params(
+    {
+        "score_func": [callable],
+        "y_true": ["array-like"],
+        "y_score": ["array-like"],
+        "thresholds": [
+            Interval(Integral, 2, None, closed="left"),
+            "array-like",
+        ],
+        "greater_is_better": ["boolean"],
+        "labels": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def decision_threshold_curve(
+    score_func,
+    y_true,
+    y_score,
+    # TODO: should the two parameters below have default values?
+    thresholds=20,
+    greater_is_better=True,
+    labels=None,
+    **kwargs,
+):
+    """Compute a threshold-dependent metric of interest at each threshold.
+
+    Note: this implementation is restricted to the binary classification task.
+
+    Read more in the :ref:`User Guide <threshold_tuning>`.
+
+    .. versionadded:: 1.8
+
+    Parameters
+    ----------
+    score_func : callable
+        The score function to use. It will be called as
+        `score_func(y_true, y_pred, **kwargs)`.
+
+    y_true : array-like of shape (n_samples,)
+        Ground truth (correct) target labels.
+
+    y_score : array-like of shape (n_samples,)
+        Continuous response scores.
+
+    thresholds : int or array-like, default=20
+        Specifies the number of decision thresholds to compute the score for. If an
+        integer, it will be used to generate `thresholds` thresholds uniformly
+        distributed between the minimum and maximum of `y_score`. If an array-like,
+        it will be used as the thresholds.
+
+    greater_is_better : bool, default=True
+        Whether `score_func` is a score function (default), meaning high is
+        good, or a loss function, meaning low is good. In the latter case, the
+        output of `score_func` will be sign-flipped.
+
+    labels : array-like, default=None
+        Class labels. If `None`, inferred from `y_true`.
+        TODO: `labels` is used instead of `classes` to be consistent with other
+        metrics.
+
+    **kwargs : dict
+        Parameters to pass to `score_func`.
+
+    Returns
+    -------
+    score_thresholds : ndarray of shape (n_thresholds,)
+        The scores associated with each threshold.
+
+    thresholds : ndarray of shape (n_thresholds,)
+        The thresholds used to compute the scores.
+
+    See Also
+    --------
+    precision_recall_curve : Compute precision-recall pairs for different
+        probability thresholds.
+    det_curve : Compute error rates for different probability thresholds.
+    roc_curve : Compute Receiver operating characteristic (ROC) curve.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import accuracy_score, decision_threshold_curve
+    >>> y_true = np.array([0, 0, 1, 1])
+    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
+    >>> score_thresholds, thresholds = decision_threshold_curve(
+    ...     accuracy_score, y_true, y_score, thresholds=4)
+    >>> thresholds
+    array([0.1, 0.33333333, 0.56666667, 0.8])
+    >>> score_thresholds
+    array([0.5, 0.75, 0.75, 0.75])
+    """
+    # Import here to prevent a circular import.
+    from ._scorer import _CurveScorer
+
+    sign = 1 if greater_is_better else -1
+    curve_scorer = _CurveScorer(score_func, sign, {}, thresholds)
+    return curve_scorer._scores_from_predictions(
+        y_true,
+        y_score,
+        labels,
+        **kwargs,
+    )
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index 08e5a20187de7..14391314356fa 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -30,6 +30,7 @@
 from ..utils import Bunch
 from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params
 from ..utils._response import _get_response_values
+from ..utils._unique import cached_unique
 from ..utils.metadata_routing import (
     MetadataRequest,
     MetadataRouter,
@@ -1071,23 +1072,24 @@ class _CurveScorer(_BaseScorer):
         `score_func(y_true, y_pred, **kwargs)`.
 
     sign : int
-        Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`.
-        Thus, `sign` defined if higher scores are better or worse.
+        Either 1 or -1. Score is returned as `sign * score_func(estimator, X, y)`.
+        Thus, `sign` defines whether higher scores are better or worse.
 
     kwargs : dict
         Additional parameters to pass to the score function.
 
     thresholds : int or array-like
-        Related to the number of decision thresholds for which we want to compute the
-        score. If an integer, it will be used to generate `thresholds` thresholds
-        uniformly distributed between the minimum and maximum predicted scores. If an
-        array-like, it will be used as the thresholds.
+        Specifies the number of decision thresholds to compute the score for. If an
+        integer, it will be used to generate `thresholds` thresholds uniformly
+        distributed between the minimum and maximum of `y_score`. If an array-like,
+        it will be used as the thresholds.
 
-    response_method : str
+    response_method : str, default=None
         The method to call on the estimator to get the response values.
+        If set to `None`, the `_score` method cannot be used.
     """
 
-    def __init__(self, score_func, sign, kwargs, thresholds, response_method):
+    def __init__(self, score_func, sign, kwargs, thresholds, response_method=None):
         super().__init__(
             score_func=score_func,
             sign=sign,
@@ -1110,8 +1112,75 @@ def from_scorer(cls, scorer, response_method, thresholds):
         instance._metadata_request = scorer._get_metadata_request()
         return instance
 
+    def _scores_from_predictions(
+        self,
+        y_true,
+        y_score,
+        classes=None,
+        **kwargs,
+    ):
+        """Compute scores per threshold, given continuous response and true labels.
+
+        Parameters
+        ----------
+        y_true : array-like of shape (n_samples,)
+            Ground truth (correct) target labels.
+
+        y_score : array-like of shape (n_samples,)
+            Continuous response scores.
+
+        classes : array-like, default=None
+            Class labels. If `None`, inferred from `y_true`.
+
+        **kwargs : dict
+            Parameters to pass to `self.score_func`.
+
+        Returns
+        -------
+        score_thresholds : ndarray of shape (thresholds,)
+            The scores associated with each threshold.
+
+        thresholds : ndarray of shape (thresholds,)
+            The thresholds used to compute the scores.
+        """
+        # Note: this validation could also be done in `decision_threshold_curve`;
+        # not sure which is better.
+        y_true_unique = cached_unique(y_true)
+        if classes is None:
+            classes = y_true_unique
+        # Not sure if this separate error message is needed; the check on
+        # `set(classes) != set(y_true_unique)` below could also fail simply
+        # because `y_true` contains only one class.
+        if len(y_true_unique) == 1:
+            raise ValueError("`y_true` only contains one class label.")
+        if set(classes) != set(y_true_unique):
+            raise ValueError(
+                f"`classes` ({classes}) is not equal to the unique values found in "
+                f"`y_true` ({y_true_unique})."
+            )
+
+        if isinstance(self._thresholds, Integral):
+            potential_thresholds = np.linspace(
+                np.min(y_score), np.max(y_score), self._thresholds
+            )
+        else:
+            potential_thresholds = np.asarray(self._thresholds)
+
+        score_thresholds = [
+            self._sign
+            * self._score_func(
+                y_true,
+                _threshold_scores_to_class_labels(
+                    y_score, th, classes, self._get_pos_label()
+                ),
+                **{**self._kwargs, **kwargs},
+            )
+            for th in potential_thresholds
+        ]
+        return np.array(score_thresholds), potential_thresholds
+
     def _score(self, method_caller, estimator, X, y_true, **kwargs):
-        """Evaluate predicted target values for X relative to y_true.
+        """Compute scores per threshold, given an estimator, X and true labels.
 
         Parameters
         ----------
@@ -1140,27 +1209,21 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs):
         potential_thresholds : ndarray of shape (thresholds,)
             The potential thresholds used to compute the scores.
         """
-        pos_label = self._get_pos_label()
+        if self._response_method is None:
+            raise ValueError(
+                "This method cannot be used when `_CurveScorer` is initialized "
+                "with `response_method=None`."
+            )
+
         y_score = method_caller(
-            estimator, self._response_method, X, pos_label=pos_label
+            estimator, self._response_method, X, pos_label=self._get_pos_label()
         )
-        scoring_kwargs = {**self._kwargs, **kwargs}
-        if isinstance(self._thresholds, Integral):
-            potential_thresholds = np.linspace(
-                np.min(y_score), np.max(y_score), self._thresholds
-            )
-        else:
-            potential_thresholds = np.asarray(self._thresholds)
-        score_thresholds = [
-            self._sign
-            * self._score_func(
-                y_true,
-                _threshold_scores_to_class_labels(
-                    y_score, th, estimator.classes_, pos_label
-                ),
-                **scoring_kwargs,
-            )
-            for th in potential_thresholds
-        ]
-        return np.array(score_thresholds), potential_thresholds
+        # why 'potential' ?
+        score_thresholds, potential_thresholds = self._scores_from_predictions(
+            y_true,
+            y_score,
+            estimator.classes_,
+            **kwargs,
+        )
+        return score_thresholds, potential_thresholds
diff --git a/sklearn/metrics/tests/test_decision_threshold.py b/sklearn/metrics/tests/test_decision_threshold.py
new file mode 100644
index 0000000000000..950ea9e28c916
--- /dev/null
+++ b/sklearn/metrics/tests/test_decision_threshold.py
@@ -0,0 +1,133 @@
+from functools import partial
+
+import pytest
+
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    fbeta_score,
+    precision_score,
+    recall_score,
+)
+
+
+# TODO(Carlo): Update tests.
+def test_grid_int_bigger_than_set_then_all():
+    # """When `thresholds` parameter is bigger than the number of unique
+    # `y_score` then `len(thresholds)` should be equal to `len(set(y_score))`.
+    # """
+
+    # X, y = make_classification()
+    # clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
+    # y_score = clf.predict_proba(X)[:, 1]
+
+    # _, thresholds_big_int = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=len(set(y_score)) + 1000
+    # )
+
+    # assert len(thresholds_big_int) == len(set(y_score))
+    assert True
+
+
+def test_binary_clf_curve_multiclass_error():
+    # rng = check_random_state(404)
+    # y_true = rng.randint(0, 3, size=10)
+    # y_pred = rng.rand(10)
+    # msg = "In a multiclass scenario, you must pass "
+    # with pytest.raises(ValueError, match=msg):
+    #     decision_threshold_curve(y_true, y_pred, accuracy_score)
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric",
+    [
+        # make_scorer(fbeta_score, beta=3),
+        # make_scorer(fbeta_score, beta=0.5),
+        f1_score,
+        precision_score,
+        recall_score,
+        accuracy_score,
+    ],
+)
+def test_decision_threshold_curve_end_points(metric):
+    # rng = check_random_state(0)
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # y_score = rng.normal(3, size=100)
+    # min_pred, max_score = min(y_score), max(y_score)
+
+    # metric_values, _ = decision_threshold_curve(y_true, y_score, metric)
+
+    # assert metric_values[0] == metric(y_true, (y_score > min_pred) * 1)
+    # assert metric_values[-1] == metric(y_true, (y_score > max_score) * 1)
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric",
+    [partial(fbeta_score, beta=3), precision_score, recall_score],
+)
+def test_zero_sample_weight_equals_excluding(metric):
+    # rng = check_random_state(0)
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # y_score = rng.normal(3, size=100)
+
+    # sample_weight = np.array([0] * 20 + [1] * 80)
+    # scoring_kwargs = {"sample_weight": sample_weight}
+    # metric_values_sw, _ = decision_threshold_curve(
+    #     y_true, y_score, metric, scoring_kwargs=scoring_kwargs
+    # )
+
+    # y_true_exclude = y_true[sample_weight != 0]
+    # y_score_exclude = y_score[sample_weight != 0]
+    # metric_values_exclude, _ = decision_threshold_curve(
+    #     y_true_exclude, y_score_exclude, metric
+    # )
+
+    # assert_allclose(metric_values_sw, metric_values_exclude)
+    assert True
+
+
+def test_len_of_threshold_when_passing_int():
+    # y = [0] * 500 + [1] * 500
+    # y_score = list(range(1000))
+    # _, thresholds = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=13
+    # )
+
+    # assert len(thresholds) == 13
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric, scoring_kwargs",
+    [
+        (f1_score, None),
+        (f1_score, {}),
+        (fbeta_score, {"beta": 4}),
+    ],
+)
+def test_scoring_kwargs(metric, scoring_kwargs):
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # decision_threshold_curve(y_true, y_true, metric, scoring_kwargs=scoring_kwargs)
+    assert True
+
+
+def test_passing_the_grid():
+    # y = [0] * 500 + [1] * 500
+    # y_score = list(range(1000))
+
+    # grid_sorted = np.array(list(range(200, 300)))
+    # _, thresholds_sorted = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=grid_sorted
+    # )
+
+    # assert_allclose(grid_sorted, thresholds_sorted)
+
+    # grid_not_sorted = grid_sorted[::-1]
+    # _, thresholds_not_sorted = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=grid_not_sorted
+    # )
+
+    # assert_allclose(grid_sorted, thresholds_not_sorted)
+    assert True
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 672ed8ae7eecc..8d4a8574f4a77 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -1653,6 +1653,24 @@ def test_curve_scorer_pos_label(global_random_seed):
     assert scores_pos_label_1.max() == pytest.approx(1.0)
 
 
+def test_curve_scorer_scores_from_predictions():
+    """Check behavior of `_CurveScorer._scores_from_predictions`."""
+    X, y = make_classification(random_state=0)
+    lr = LogisticRegression().fit(X, y)
+    y_score = lr.predict_proba(X)
+
+    curve_scorer = _CurveScorer(
+        balanced_accuracy_score, sign=1, kwargs={}, thresholds=10
+    )
+    score_thresholds, potential_thresholds = curve_scorer._scores_from_predictions(
+        y_true=y,
+        y_score=y_score[:, 1],
+    )
+    # One score per requested threshold.
+    assert score_thresholds.shape == (10,)
+    assert potential_thresholds.shape == (10,)
+
+
 # TODO(1.8): remove
 def test_make_scorer_reponse_method_default_warning():
     with pytest.warns(FutureWarning, match="response_method=None is deprecated"):
diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py
index c68ed38b8819d..707f28c3bc64e 100644
--- a/sklearn/model_selection/_classification_threshold.py
+++ b/sklearn/model_selection/_classification_threshold.py
@@ -510,7 +510,7 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier):
     into a class label. The tuning is done by optimizing a binary metric,
     potentially constrained by a another metric.
 
-    Read more in the :ref:`User Guide <TunedThresholdClassifierCV>`.
+    Read more in the :ref:`User Guide <threshold_tuning>`.
 
     .. versionadded:: 1.5
 
diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py
index 707aa37737c1b..1ddb6421e4834 100644
--- a/sklearn/tests/test_public_functions.py
+++ b/sklearn/tests/test_public_functions.py
@@ -239,6 +239,7 @@ def _check_function_param_validation(
     "sklearn.metrics.d2_tweedie_score",
     "sklearn.metrics.davies_bouldin_score",
     "sklearn.metrics.dcg_score",
+    "sklearn.metrics.decision_threshold_curve",
     "sklearn.metrics.det_curve",
     "sklearn.metrics.explained_variance_score",
     "sklearn.metrics.f1_score",
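A rough consistency sketch for the `_CurveScorer` refactor above, assuming this branch
is installed; `_CurveScorer` is private API and the estimator and metric choices are
arbitrary. The estimator-based `_score` path (invoked by calling the scorer) and the
new `_scores_from_predictions` path should produce the same curve:

    import numpy as np

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.metrics._scorer import _CurveScorer

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)

    # Estimator path: the scorer extracts the response values via `predict_proba`.
    scorer = _CurveScorer(
        balanced_accuracy_score, sign=1, kwargs={}, thresholds=10,
        response_method="predict_proba",
    )
    scores_est, thresholds_est = scorer(clf, X, y)

    # Prediction path: positive-class scores are passed in directly.
    scores_pred, thresholds_pred = _CurveScorer(
        balanced_accuracy_score, sign=1, kwargs={}, thresholds=10
    )._scores_from_predictions(y, clf.predict_proba(X)[:, 1])

    np.testing.assert_allclose(scores_est, scores_pred)
    np.testing.assert_allclose(thresholds_est, thresholds_pred)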