FIX divide by zero in line search of GradientBoostingClassifier (#28095) · scikit-learn/scikit-learn@f3b13e5 · GitHub
Commit f3b13e5
FIX divide by zero in line search of GradientBoostingClassifier (#28095)
1 parent 54de830 commit f3b13e5

File tree

3 files changed (+42, -7 lines)

doc/whats_new/v1.4.rst

Lines changed: 1 addition & 1 deletion
@@ -470,7 +470,7 @@ Changelog
 - |Efficiency| :class:`ensemble.GradientBoostingClassifier` is faster,
   for binary and in particular for multiclass problems thanks to the private loss
   function module.
-  :pr:`26278` by :user:`Christian Lorentzen <lorentzenchr>`.
+  :pr:`26278` and :pr:`28095` by :user:`Christian Lorentzen <lorentzenchr>`.

 - |Efficiency| Improves runtime and memory usage for
   :class:`ensemble.GradientBoostingClassifier` and

sklearn/ensemble/_gb.py

Lines changed: 11 additions & 4 deletions
@@ -65,16 +65,23 @@

 def _safe_divide(numerator, denominator):
     """Prevents overflow and division by zero."""
-    try:
+    # This is used for classifiers where the denominator might become zero exactly.
+    # For instance for log loss, HalfBinomialLoss, if proba=0 or proba=1 exactly, then
+    # denominator = hessian = 0, and we should set the node value in the line search to
+    # zero as there is no improvement of the loss possible.
+    # For numerical safety, we do this already for extremely tiny values.
+    if abs(denominator) < 1e-150:
+        return 0.0
+    else:
+        # Cast to Python float to trigger Python errors, e.g. ZeroDivisionError,
+        # without relying on `np.errstate` that is not supported by Pyodide.
+        result = float(numerator) / float(denominator)
         # Cast to Python float to trigger a ZeroDivisionError without relying
         # on `np.errstate` that is not supported by Pyodide.
         result = float(numerator) / float(denominator)
         if math.isinf(result):
             warnings.warn("overflow encountered in _safe_divide", RuntimeWarning)
         return result
-    except ZeroDivisionError:
-        warnings.warn("divide by zero encountered in _safe_divide", RuntimeWarning)
-        return 0.0


 def _init_raw_predictions(X, estimator, loss, use_predict_proba):
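
To make the behavior change concrete, here is a minimal, self-contained sketch of the patched helper and the three cases it distinguishes. It mirrors the diff above but is not the scikit-learn source; the name safe_divide_sketch is ours:

import math
import warnings


def safe_divide_sketch(numerator, denominator):
    """Mirror of the patched _safe_divide."""
    # A (near-)zero denominator means the hessian is zero and the line search
    # cannot improve the loss, so return a zero node value without warning.
    if abs(denominator) < 1e-150:
        return 0.0
    # Python float division returns inf on overflow rather than raising,
    # hence the explicit isinf check below.
    result = float(numerator) / float(denominator)
    if math.isinf(result):
        warnings.warn("overflow encountered in _safe_divide", RuntimeWarning)
    return result


print(safe_divide_sketch(1e300, 0.0))  # 0.0, silent (warned "divide by zero" before the fix)
print(safe_divide_sketch(0.0, 0.0))    # 0.0, silent
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(safe_divide_sketch(1e300, 1e-10))  # inf
print(len(caught))  # 1 -- the overflow RuntimeWarning

The key behavioral change is the first branch: the old try/except only caught an exact-zero denominator and still emitted a RuntimeWarning, while the new guard also covers extremely tiny denominators (below 1e-150) and stays silent, a zero step being the correct line-search result there.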

sklearn/ensemble/tests/test_gradient_boosting.py

Lines changed: 30 additions & 2 deletions
@@ -1452,9 +1452,9 @@ def test_huber_vs_mean_and_median():

 def test_safe_divide():
     """Test that _safe_divide handles division by zero."""
-    with pytest.warns(RuntimeWarning, match="divide"):
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         assert _safe_divide(np.float64(1e300), 0) == 0
-    with pytest.warns(RuntimeWarning, match="divide"):
         assert _safe_divide(np.float64(0.0), np.float64(0.0)) == 0
     with pytest.warns(RuntimeWarning, match="overflow"):
         # np.finfo(float).max = 1.7976931348623157e+308
@@ -1680,3 +1680,31 @@ def test_multinomial_error_exact_backward_compat():
         ]
     )
     assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8)
+
+
+def test_gb_denominator_zero(global_random_seed):
+    """Test _update_terminal_regions denominator is not zero.
+
+    For instance for log loss based binary classification, the line search step
+    might become nan/inf as denominator = hessian = prob * (1 - prob) and
+    prob = 0 or 1 can happen.
+    Here, we create a situation where this happens (at least with roughly 80%
+    probability) based on the random seed.
+    """
+    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=20)
+
+    params = {
+        "learning_rate": 1.0,
+        "subsample": 0.5,
+        "n_estimators": 100,
+        "max_leaf_nodes": 4,
+        "max_depth": None,
+        "random_state": global_random_seed,
+        "min_samples_leaf": 2,
+    }
+
+    clf = GradientBoostingClassifier(**params)
+    # Without the guard in _safe_divide, fitting would emit a RuntimeWarning.
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        clf.fit(X, y)
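
For context on what the new test provokes: as its docstring notes, for binary log loss the line-search denominator is the hessian prob * (1 - prob), which underflows to exactly zero once the predicted probability saturates in float64. A standalone numeric illustration (not scikit-learn internals):

import numpy as np

# Hessian of binary log loss w.r.t. the raw (log-odds) prediction: p * (1 - p).
# Once p rounds to exactly 1.0 in float64, the hessian is exactly 0.0, and the
# per-leaf Newton/line-search step would divide by zero.
raw = np.array([0.0, 10.0, 40.0])  # raw leaf predictions (log-odds)
p = 1.0 / (1.0 + np.exp(-raw))     # sigmoid
print(p)              # [0.5, ~0.99995, 1.0] -- the last entry saturates to exactly 1.0
print(p * (1.0 - p))  # [0.25, ~4.5e-05, 0.0] -- a saturated leaf yields a zero denominator

The test then relies on warnings.simplefilter("error") to turn any RuntimeWarning raised inside fit into a test failure; with the guard in _safe_divide, saturated leaves simply receive a zero update and fitting completes silently.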

0 commit comments