FIX TunedThresholdClassifierCV error or warn with informative message on invalid metrics by ogrisel · Pull Request #29082 · scikit-learn/scikit-learn · GitHub

FIX TunedThresholdClassifierCV error or warn with informative message on invalid metrics #29082

Open

wants to merge 22 commits into base: main

Changes from all commits (22 commits)
5817ee9
FIX TunedThresholdClassifierCV error or warn with informative message…
ogrisel May 22, 2024
0acb511
Update sklearn/model_selection/tests/test_classification_threshold.py
ogrisel May 23, 2024
a1b5518
Update sklearn/model_selection/tests/test_classification_threshold.py
ogrisel May 23, 2024
a0d62c5
Use 0.5 threshold in case of constant scores.
ogrisel May 23, 2024
a874855
Typo
ogrisel May 23, 2024
4f82762
Apply suggestions from code review
ogrisel May 23, 2024
505baff
Improve & fix tests
ogrisel May 23, 2024
b6b13d3
Improve test docstring.
ogrisel May 23, 2024
942d58a
Merge branch 'main' into fix-tuned-threshold-on-invalid-metrics
ogrisel May 23, 2024
964a079
Add changelog entry for 1.5.1
ogrisel May 23, 2024
c8e811d
Linter fix
ogrisel May 23, 2024
5a464e4
Add TODO comment to make it clear that the code would be clearer with…
ogrisel May 24, 2024
942221d
Add a dedicated warning to guide the user into crafting non-trivial s…
ogrisel May 24, 2024
397ee8d
Improve test
ogrisel May 24, 2024
553a1b8
Update changelog
ogrisel May 24, 2024
2b4927a
DOC improve the docstring for the scoring parameter of TunedThreshold…
ogrisel May 24, 2024
a37edcd
DOC more precise phrasing in scoring docstring
lorentzenchr May 24, 2024
aadae00
Merge branch 'main' into fix-tuned-threshold-on-invalid-metrics
ogrisel May 24, 2024
856d075
Grammar fix in comment
ogrisel May 28, 2024
f4c43e3
Trim last sentence of paragraph to make docstring a bit shorter.
ogrisel Jun 4, 2024
351a523
Avoid failing if the private `_response_method` attribute does not exist
ogrisel Jun 4, 2024
0af3d32
Merge branch 'main' into fix-tuned-threshold-on-invalid-metrics
ogrisel Jun 4, 2024
5 changes: 5 additions & 0 deletions doc/whats_new/v1.5.rst
@@ -38,6 +38,11 @@ Changelog
grids that have heterogeneous parameter values.
:pr:`29078` by :user:`Loïc Estève <lesteve>`.

- |Fix| Fix :class:`model_selection.TunedThresholdClassifierCV` to raise
`ValueError` when passed a `scoring` argument intended for unthresholded
predictions. It now also warns when the choice of `scoring` leads to a
degenerate tuned threshold.
:pr:`29082` by :user:`Olivier Grisel <ogrisel>`.

.. _changes_1_5:

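A minimal sketch of the new behavior described in the changelog entry above; the dataset and estimator below are arbitrary choices for illustration and are not part of this PR.

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TunedThresholdClassifierCV

X, y = make_classification(random_state=0)

# "roc_auc" evaluates unthresholded predictions: now rejected with a ValueError.
try:
    TunedThresholdClassifierCV(LogisticRegression(), scoring="roc_auc").fit(X, y)
except ValueError as exc:
    print(exc)

# "precision" carries no penalty for false negatives: fitting succeeds but a
# UserWarning explains that the tuned threshold yields a trivial classifier.
TunedThresholdClassifierCV(LogisticRegression(), scoring="precision").fit(X, y)
```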
62 changes: 59 additions & 3 deletions sklearn/model_selection/_classification_threshold.py
@@ -1,5 +1,6 @@
from collections.abc import MutableMapping
from numbers import Integral, Real
from warnings import warn

import numpy as np

@@ -656,7 +657,17 @@ class TunedThresholdClassifierCV(BaseThresholdClassifier):

* a string associated to a scoring function for binary classification
(see :ref:`scoring_parameter`);
* a scorer callable object created with :func:`~sklearn.metrics.make_scorer`;
* a scorer callable object created with :func:`~sklearn.metrics.make_scorer`.

Note that the scoring objective should introduce a trade-off between false
negatives and false positives, otherwise the tuned threshold would be
trivial and the resulting classifier would be equivalent to constantly
classifying all samples as one of the two possible classes. This would be
the case when passing scoring="precision" or scoring="recall", for instance.
Furthermore, the scoring objective should evaluate thresholded
classifier predictions: as a result, metrics such as ROC AUC, Average
Precision, log loss or the Brier score are not valid scoring metrics in
this context.
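For illustration (not part of the diff above), one way to build a scoring objective that does encode such a trade-off is with make_scorer; the 5x false-negative cost below is an assumption chosen for the example, not something prescribed by this PR.

```python
# Sketch: a cost-sensitive objective that penalizes false negatives more than
# false positives (the 5.0 / 1.0 costs are illustrative assumptions).
from sklearn.metrics import confusion_matrix, make_scorer

def fn_fp_tradeoff(y_true, y_pred, fn_cost=5.0, fp_cost=1.0):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return -(fn_cost * fn + fp_cost * fp)

# Evaluates thresholded predictions and varies with the decision threshold,
# so it is a valid `scoring` value for TunedThresholdClassifierCV.
tradeoff_scorer = make_scorer(fn_fp_tradeoff, response_method="predict")
```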

response_method : {"auto", "decision_function", "predict_proba"}, default="auto"
Methods by the classifier `estimator` corresponding to the
@@ -947,6 +958,35 @@ def _fit(self, X, y, **params):
best_idx = objective_scores.argmax()
self.best_score_ = objective_scores[best_idx]
self.best_threshold_ = decision_thresholds[best_idx]

if self.best_threshold_ == min_threshold:
trivial_kind = "positive"
elif self.best_threshold_ == max_threshold:
trivial_kind = "negative"
else:
trivial_kind = None

if (
objective_scores.max() - objective_scores.min()
<= np.finfo(objective_scores.dtype).eps
Comment on lines +970 to +971
Member:
I wonder if this is something we should be doing everywhere where we "search" over a few models using a scorer. Kind of arbitrary to have it here and not in other places.

Member Author (ogrisel):
I would also be in favor of raising a similar warning for *SearchCV meta-estimators when mean_test_score is constant for all hyper-parameters. Not sure how frequent this is, though.
But it's true that it's actually not that frequent for TunedThresholdClassifierCV either.
The original problem I encountered that triggered the selection of an extreme threshold is actually of a different nature. Let me update this PR accordingly to discuss that further (maybe tomorrow).

Member Author (ogrisel):
Done in #29082 (comment).

Member Author (ogrisel, May 24, 2024):
I think that checking for the constant case is not really important, since I do not expect this to happen often in practice. The other cases (extreme thresholds due to a lack of trade-off) are more useful to warn against (and they do easily happen in practice, as shown in the tests).
But I would rather keep the constant-score warning, both to make the warning message more precise for this particular edge case and to fall back to the neutral 0.5 threshold, keeping the estimator behavior symmetric.

Member:
Can we move this to a utility function which takes a bunch of scores and warns, like _warn_on_constant_metrics, and call it in a few places where it's relevant? (A single PR for this would be nice, which would include this use case.)

):
warn(
f"The objective metric {self.scoring!r} is constant at "
Member:
This is a good idea. Might help some users. Also I like that it is a warning, not an error.

Member Author (ogrisel, May 23, 2024):
I updated the PR in a0d62c5 to keep the 0.5 threshold in that case. Using an extreme near-zero threshold would introduce a very weird / unexpectedly biased behavior. Better to keep a more neutral behavior in such a pathological situation.

Comment on lines +973 to +974

Member:
I still like this warning.

f"{self.best_score_} across all thresholds. Falling back "
"to the default 0.5 threshold. Please instead pass a scoring "
"metric that varies with the decision threshold.",
UserWarning,
)
self.best_threshold_ = 0.5
elif trivial_kind is not None:
warn(
f"Tuning the decision threshold on {self.scoring} "
"leads to a trivial classifier that classifies all samples as "
f"the {trivial_kind} class. Consider revising the scoring parameter "
"to include a trade-off between false positives and false negatives.",
UserWarning,
)
Comment on lines +981 to +988
Member:
I think we have a similar warning when the found solution is using a parameter from the edges of the bounds. Could they all be in the same utility function? It would make the messages more consistent. Could also be a separate PR.

Member Author (ogrisel):
I agree we would need a similar feature for hparam search, but I think both the code and the messages deserve to be specialized.
For the case of hparam tuning, we need to take the bounds of parameter validation into account (e.g. some hparams are naturally bounded, such as alpha=0 or l1_ratio=1.0, so we should not raise a warning if we reach those bounds).
For the case of threshold tuning, I think it's important to mention the positive / negative classifications, as I do in the 2 custom warnings of this PR, to get a more explicit and actionable error message. The message for hparam tuning would be more generic.
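A rough sketch of the kind of shared helper floated in the comments above; the name `_warn_on_constant_metrics` comes from the earlier review suggestion, while the signature and message are assumptions rather than code from this PR.

```python
from warnings import warn

import numpy as np

def _warn_on_constant_metrics(scores, scoring_name):
    """Hypothetical helper: warn when a metric is constant across candidates."""
    scores = np.asarray(scores, dtype=np.float64)
    if scores.max() - scores.min() <= np.finfo(scores.dtype).eps:
        warn(
            f"The metric {scoring_name!r} is constant at {scores[0]} for all "
            "candidates searched over; the selected candidate is arbitrary.",
            UserWarning,
        )
```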


if self.store_cv_results:
self.cv_results_ = {
"thresholds": decision_thresholds,
@@ -1012,8 +1052,24 @@ def get_metadata_routing(self):

def _get_curve_scorer(self):
"""Get the curve scorer based on the objective metric used."""
scoring = check_scoring(self.estimator, scoring=self.scoring)
scorer = check_scoring(self.estimator, scoring=self.scoring)
# XXX: at the time of writing, there is no very explicit way to check
# if a scorer expects thresholded binary classification predictions.
# TODO: update this condition when a better way is available.
scorer_response_methods = getattr(scorer, "_response_method", "predict")
if isinstance(scorer_response_methods, str):
scorer_response_methods = {scorer_response_methods}
else:
scorer_response_methods = set(scorer_response_methods)

if scorer_response_methods.issubset({"predict_proba", "decision_function"}):
raise ValueError(
Member:
Here, I'm not so sure whether we overly constrain the possible scorers from the user. But I also do not 100% follow how the curve scorer works.

Member Author (ogrisel, May 23, 2024):
If we don't do that (as is the case in main) and the user passes a non-thresholded metric like ROC AUC / average precision / log loss / Brier, the metric is evaluated on the thresholded (binary) predictions, which is really misleading. You still get a "tuned" threshold but its meaning is really confusing.
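To make that concern concrete, a small sketch (not from the PR) of how ROC AUC computed on thresholded 0/1 predictions differs from the intended ROC AUC on probability scores:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X, y = make_classification(random_state=0)
clf = LogisticRegression().fit(X, y)

proba = clf.predict_proba(X)[:, 1]
hard = (proba >= 0.5).astype(int)

print(roc_auc_score(y, proba))  # intended usage: AUC over continuous scores
print(roc_auc_score(y, hard))   # degenerate two-point ROC curve on 0/1 labels
```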

Member:
I think that we can be conservative at first. If a real use case is reported, we can then rework and make sure that the API is right and not just working as a side effect.

Member:
I don't like that here we're accessing the private _response_method as a proxy to see if it's the right kind of scorer or not. Scorers should have a public API for this (cc @StefanieSenger).

Contributor (StefanieSenger, May 24, 2024):
It seems to me that since response_method="predict" is the default in _BaseScorer, this error might not reach the users most vulnerable to being confused by wrong outputs. Is there a more automated way to check whether the scorer fits the purpose that does not depend on user input?
Edit: That wasn't very helpful. I think I got something wrong.

Member Author (ogrisel):
Let me know if the following helps address those concerns:
https://github.com/scikit-learn/scikit-learn/pull/29082/files#r1613325997

Member:
This is still the most controversial part. Without it, this PR would be almost merged, right?

Member Author (ogrisel):
What do you think of #29082 (comment)?

f"{self.__class__.__name__} expects a scoring metric that evaluates "
f"the thresholded predictions of a binary classifier, got: "
f"{self.scoring!r} which expects unthresholded predictions computed by "
f"the {scorer._response_method!r} method(s) of the classifier."
)
curve_scorer = _CurveScorer.from_scorer(
scoring, self._get_response_method(), self.thresholds
scorer, self._get_response_method(), self.thresholds
)
return curve_scorer
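For reference, the check above keys on the private `_BaseScorer._response_method` attribute discussed in the review thread; the values below reflect the current internals and are not a stable public API.

```python
from sklearn.metrics import get_scorer

# Scorers for unthresholded predictions are rejected by the check above.
print(get_scorer("roc_auc")._response_method)       # ('decision_function', 'predict_proba')
print(get_scorer("neg_log_loss")._response_method)  # 'predict_proba'

# A scorer for thresholded predictions is accepted.
print(get_scorer("balanced_accuracy")._response_method)  # 'predict'
```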
93 changes: 93 additions & 0 deletions sklearn/model_selection/tests/test_classification_threshold.py
@@ -1,3 +1,5 @@
import re

import numpy as np
import pytest

@@ -14,6 +16,7 @@
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
balanced_accuracy_score,
confusion_matrix,
f1_score,
fbeta_score,
make_scorer,
@@ -684,6 +687,96 @@ def test_fixed_threshold_classifier_metadata_routing():
assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_)


@pytest.mark.parametrize(
"scoring_name, expected_method_names",
[
("roc_auc", "('decision_function', 'predict_proba')"),
("average_precision", "('decision_function', 'predict_proba')"),
("neg_log_loss", "'predict_proba'"),
],
)
def test_error_on_unthresholded_classification_metrics(
scoring_name, expected_method_names
):
"""Check error raised with metrics meant for unthresholded predictions."""
X, y = make_classification(random_state=0)
estimator = LogisticRegression()
err_msg = re.escape(
"TunedThresholdClassifierCV expects a scoring metric that evaluates the "
f"thresholded predictions of a binary classifier, got: '{scoring_name}' "
"which expects unthresholded predictions computed by the "
f"{expected_method_names} method(s) of the classifier."
)
with pytest.raises(ValueError, match=err_msg):
TunedThresholdClassifierCV(estimator, scoring=scoring_name).fit(X, y)


def test_warn_on_constant_scores():
"""Check that a warning is raised when the score is constant."""
X, y = make_classification(random_state=0)
estimator = LogisticRegression()

def constant_score_func(y_true, y_pred):
return 1.0

scorer = make_scorer(constant_score_func, response_method="predict")

warn_msg = re.escape(
"The objective metric make_scorer(constant_score_func, "
"response_method='predict') is constant at 1.0 across all thresholds. Falling "
"back to the default 0.5 threshold. Please instead pass a scoring metric that "
"varies with the decision threshold."
)
with pytest.warns(UserWarning, match=warn_msg):
tuned_clf = TunedThresholdClassifierCV(
estimator, scoring=scorer, store_cv_results=True
).fit(X, y)
assert_allclose(tuned_clf.cv_results_["scores"], np.ones(shape=100))
assert tuned_clf.best_threshold_ == pytest.approx(0.5)


def always_prefer_positive_class(y_observed, y_pred):
tn, fp, fn, tp = confusion_matrix(y_observed, y_pred, normalize="all").ravel()
return tp - 2 * fn


def always_prefer_negative_class(y_observed, y_pred):
tn, fp, fn, tp = confusion_matrix(y_observed, y_pred, normalize="all").ravel()
return tn - 2 * fp


@pytest.mark.parametrize(
"scoring, kind",
[
(make_scorer(always_prefer_positive_class), "positive"),
(make_scorer(always_prefer_negative_class), "negative"),
("precision", "negative"),
("recall", "positive"),
],
)
def test_warn_on_trivial_thresholds(scoring, kind):
"""Check that a warning is raised when the score is constant."""
X, y = make_classification(random_state=0)
estimator = LogisticRegression()

warn_msg = re.escape(
f"Tuning the decision threshold on {scoring} leads to a trivial classifier "
f"that classifies all samples as the {kind} class. Consider revising the "
"scoring parameter to include a trade-off between false positives and false "
"negatives."
)
with pytest.warns(UserWarning, match=warn_msg):
tuned_clf = TunedThresholdClassifierCV(
estimator, scoring=scoring, store_cv_results=True
).fit(X, y)

thresholds = tuned_clf.cv_results_["thresholds"]
if kind == "positive":
assert tuned_clf.best_threshold_ == thresholds[0] == thresholds.min()
else:
assert tuned_clf.best_threshold_ == thresholds[-1] == thresholds.max()
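For contrast, a sketch (not part of the test diff) of a scoring choice that trades off both error types and is therefore expected to tune a non-trivial threshold without triggering either warning.

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TunedThresholdClassifierCV

X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
tuned_clf = TunedThresholdClassifierCV(
    LogisticRegression(), scoring="balanced_accuracy", store_cv_results=True
).fit(X, y)

# With an imbalanced dataset, the tuned threshold is expected to lie strictly
# between the extreme candidate thresholds rather than at either end.
thresholds = tuned_clf.cv_results_["thresholds"]
assert thresholds.min() < tuned_clf.best_threshold_ < thresholds.max()
```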


class ClassifierLoggingFit(ClassifierMixin, BaseEstimator):
"""Classifier that logs the number of `fit` calls."""
