FIX f1_score with zero_division=1 on binary classes by OmarManzoor · Pull Request #27165 · scikit-learn/scikit-learn · GitHub

FIX f1_score with zero_division=1 on binary classes #27165

Closed
OmarManzoor wants to merge 9 commits
8 changes: 8 additions & 0 deletions doc/whats_new/v1.4.rst
@@ -336,6 +336,14 @@ Changelog
for CSR × CSR, Dense × CSR, and CSR × Dense datasets is now 1.5x faster.
:pr:`26765` by :user:`Meekail Zain <micky774>`

- |Fix| :func:`f1_score` now provides correct values when handling various
cases in which zero division occurs by using a formulation that does not
depend on the precision and recall values.
:pr:`27165` by :user:`Omar Salman <OmarManzoor>` and
:user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.preprocessing`
............................
- |Efficiency| Computing distances via :class:`metrics.DistanceMetric`
for CSR × CSR, Dense × CSR, and CSR × Dense now uses ~50% less memory,
and outputs distances in the same dtype as the provided data.
7 changes: 7 additions & 0 deletions fbeta.py
@@ -0,0 +1,7 @@
import numpy as np

from sklearn.metrics import fbeta_score

y_true = [0, 1, 2, 0, 1, 2]
y_pred_empty = [0, 0, 0, 0, 0, 0]
fbeta_score(y_true, y_pred_empty, average="macro", zero_division=np.nan, beta=0.5)
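As an editorial aside (not part of the PR diff): a rough sketch of what this reproduction script exercises under the confusion-matrix formulation adopted in _classification.py below. multilabel_confusion_matrix is used only to obtain per-class counts, and nanmean mimics how zero_division=np.nan drops ill-defined scores from the macro average.

import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

y_true = [0, 1, 2, 0, 1, 2]
y_pred_empty = [0, 0, 0, 0, 0, 0]
beta2 = 0.5 ** 2

# Per-class tp, fp, fn from the (n_classes, 2, 2) confusion matrices.
mcm = multilabel_confusion_matrix(y_true, y_pred_empty)
tp, fp, fn = mcm[:, 1, 1], mcm[:, 0, 1], mcm[:, 1, 0]

# F-beta written directly in terms of confusion-matrix entries.
denom = (1 + beta2) * tp + beta2 * fn + fp
safe = np.where(denom == 0, 1, denom)  # entries with denom == 0 become nan below
fbeta = np.where(denom == 0, np.nan, (1 + beta2) * tp / safe)

print(np.nanmean(fbeta))  # ~0.128, matching the updated docstring value 0.12... below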
36 changes: 18 additions & 18 deletions sklearn/metrics/_classification.py
@@ -1399,7 +1399,7 @@ def fbeta_score(
>>> y_pred_empty = [0, 0, 0, 0, 0, 0]
>>> fbeta_score(y_true, y_pred_empty,
... average="macro", zero_division=np.nan, beta=0.5)
0.38...
0.12...
"""

_, _, f, _ = precision_recall_fscore_support(
@@ -1451,12 +1451,13 @@ def _prf_divide(
# labels with no predicted samples. Use ``zero_division`` parameter to
# control this behavior."

if metric in warn_for and "f-score" in warn_for:
if metric in warn_for and "f-score" in warn_for and metric != "f-score":
msg_start = "{0} and F-score are".format(metric.title())
elif metric in warn_for:
msg_start = "{0} is".format(metric.title())
elif "f-score" in warn_for:
msg_start = "F-score is"
elif metric in warn_for:
msg_start = "{0} is".format(metric.title())

else:
return result
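Side note (illustration, not part of the diff): with the confusion-matrix formulation the F-score itself is now divided through _prf_divide, so the message-selection logic above has to handle metric == "f-score" directly rather than only "precision"/"recall"; hence the extra guard and the reordered branches. A minimal way to observe the F-score-only warning, assuming the default zero_division="warn":

import warnings
from sklearn.metrics import precision_recall_fscore_support

with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    # The positive class (1) is neither present in y_true nor predicted,
    # so the F-score is ill-defined for the binary average.
    precision_recall_fscore_support([0, 0], [0, 0], average="binary")

# Among the captured warnings is:
# "F-score is ill-defined and being set to 0.0 due to no true nor predicted
#  samples. Use `zero_division` parameter to control this behavior."
print([str(w.message) for w in record])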

@@ -1711,7 +1712,7 @@ def precision_recall_fscore_support(
array([0., 0., 1.]), array([0. , 0. , 0.8]),
array([2, 2, 2]))
"""
zero_division_value = _check_zero_division(zero_division)
_check_zero_division(zero_division)
labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)

# Calculate tp_sum, pred_sum, true_sum ###
@@ -1744,26 +1745,25 @@
tp_sum, true_sum, "recall", "true", average, warn_for, zero_division
)

# warn for f-score only if zero_division is warn, it is in warn_for
# and BOTH prec and rec are ill-defined
if zero_division == "warn" and ("f-score",) == warn_for:
if (pred_sum[true_sum == 0] == 0).any():
_warn_prf(average, "true nor predicted", "F-score is", len(true_sum))

if np.isposinf(beta):
f_score = recall
elif beta == 0:
f_score = precision
else:
# The score is defined as:
# score = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
# We set to `zero_division_value` if the denominator is 0 **or** if **both**
# precision and recall are ill-defined.
denom = beta2 * precision + recall
mask = np.isclose(denom, 0) | np.isclose(pred_sum + true_sum, 0)
denom[mask] = 1 # avoid division by 0
f_score = (1 + beta2) * precision * recall / denom
f_score[mask] = zero_division_value
# Therefore, we can express the score in terms of confusion matrix entries as:
# score = (1 + beta**2) * tp / ((1 + beta**2) * tp + beta**2 * fn + fp)
denom = beta2 * true_sum + pred_sum
f_score = _prf_divide(
(1 + beta2) * tp_sum,
denom,
"f-score",
"true nor predicted",
average,
warn_for,
zero_division,
)

# Average the results
if average == "weighted":
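Editorial note on the rewritten block above (a sketch, not part of the diff): substituting P = tp / (tp + fp) and R = tp / (tp + fn) into the precision-recall form shows the two comments describe the same quantity, which is why a single _prf_divide call suffices:

F_\beta = \frac{(1+\beta^2)\, P R}{\beta^2 P + R}
        = \frac{(1+\beta^2)\, tp^2 / [(tp+fp)(tp+fn)]}{\beta^2\, tp/(tp+fp) + tp/(tp+fn)}
        = \frac{(1+\beta^2)\, tp}{\beta^2 (tp+fn) + (tp+fp)}
        = \frac{(1+\beta^2)\, tp}{(1+\beta^2)\, tp + \beta^2\, fn + fp}

(cancelling a common factor of tp; when tp = 0 both forms are 0 as long as their denominators are non-zero). The denominator \beta^2 (tp+fn) + (tp+fp) is exactly beta2 * true_sum + pred_sum, the denom computed above, and it vanishes only when tp = fp = fn = 0, i.e. when there are neither true nor predicted samples for the class — the only situation in which zero_division should apply.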
35 changes: 30 additions & 5 deletions sklearn/metrics/tests/test_classification.py
@@ -1797,7 +1797,7 @@ def test_precision_recall_f1_score_with_an_empty_prediction(

assert_array_almost_equal(p, [zero_division_expected, 1.0, 1.0, 0.0], 2)
assert_array_almost_equal(r, [0.0, 0.5, 1.0, zero_division_expected], 2)
expected_f = 0 if not np.isnan(zero_division_expected) else np.nan
expected_f = 0
assert_array_almost_equal(f, [expected_f, 1 / 1.5, 1, expected_f], 2)
assert_array_almost_equal(s, [1, 2, 1, 0], 2)

@@ -1814,7 +1814,7 @@ def test_precision_recall_f1_score_with_an_empty_prediction(

assert_almost_equal(p, (2 + value_to_sum) / values_to_average)
assert_almost_equal(r, (1.5 + value_to_sum) / values_to_average)
expected_f = (2 / 3 + 1) / (4 if not np.isnan(zero_division_expected) else 2)
expected_f = (2 / 3 + 1) / 4
assert_almost_equal(f, expected_f)
assert s is None
assert_almost_equal(
@@ -1847,7 +1847,7 @@ def test_precision_recall_f1_score_with_an_empty_prediction(
)
assert_almost_equal(p, 3 / 4 if zero_division_expected == 0 else 1.0)
assert_almost_equal(r, 0.5)
values_to_average = 4 if not np.isnan(zero_division_expected) else 3
values_to_average = 4
assert_almost_equal(f, (2 * 2 / 3 + 1) / values_to_average)
assert s is None
assert_almost_equal(
@@ -1865,12 +1865,12 @@ def test_precision_recall_f1_score_with_an_empty_prediction(
assert_almost_equal(r, 1 / 3)
assert_almost_equal(f, 1 / 3)
assert s is None
expected_result = {1: 0.666, np.nan: 1.0}
expected_result = 0.333
assert_almost_equal(
fbeta_score(
y_true, y_pred, beta=2, average="samples", zero_division=zero_division
),
expected_result.get(zero_division, 0.333),
expected_result,
2,
)

@@ -2077,6 +2077,12 @@ def test_prf_warnings():
with warnings.catch_warnings(record=True) as record:
warnings.simplefilter("always")
precision_recall_fscore_support([0, 0], [0, 0], average="binary")
msg = (
"F-score is ill-defined and being set to 0.0 due to no true nor "
"predicted samples. Use `zero_division` parameter to control this"
" behavior."
)
assert str(record.pop().message) == msg
msg = (
"Recall and F-score are ill-defined and "
"being set to 0.0 due to no true samples."
@@ -2804,3 +2810,22 @@ def test_classification_metric_pos_label_types(metric, classes):
y_pred = y_true.copy()
result = metric(y_true, y_pred, pos_label=pos_label)
assert not np.any(np.isnan(result))


def test_f1_for_small_binary_inputs_with_zero_division():
"""Non-regression test for gh-26965"""
y_true = np.array([0, 1])
y_pred = np.array([1, 0])
assert f1_score(y_true, y_pred, zero_division=1) == 0.0

y_true = np.array([0, 1])
y_pred = np.array([0, 1])
assert f1_score(y_true, y_pred, zero_division=1) == 1.0

y_true = np.array([0, 1])
y_pred = np.array([0, 0])
assert f1_score(y_true, y_pred, zero_division=1) == 0.0

y_true = np.array([0, 0])
y_pred = np.array([0, 0])
assert f1_score(y_true, y_pred, zero_division=1) == 1.0
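For clarity, a rough standalone check of these binary cases (illustration only; f1_from_counts is a hypothetical helper, not scikit-learn API), using the same confusion-matrix formulation with pos_label=1 and beta=1 so that F1 = 2*tp / (2*tp + fn + fp):

import numpy as np

def f1_from_counts(y_true, y_pred, zero_division):
    # F1 via confusion-matrix entries of the positive class (label 1).
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true != 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred != 1))
    denom = 2 * tp + fn + fp
    return zero_division if denom == 0 else 2 * tp / denom

# Swapped labels: tp = 0 but fp = fn = 1, so F1 is a well-defined 0.0.  The old
# precision/recall formulation had P = R = 0 and fell back to zero_division (1.0).
assert f1_from_counts([0, 1], [1, 0], zero_division=1) == 0.0
# Positive class absent from both y_true and y_pred: genuinely ill-defined,
# so the zero_division value is returned.
assert f1_from_counts([0, 0], [0, 0], zero_division=1) == 1.0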