[WIP] PERF compute float32 grad and hess in HGBT by lorentzenchr · Pull Request #18659 · scikit-learn/scikit-learn

Closed · wants to merge 2 commits
36 changes: 20 additions & 16 deletions sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
@@ -10,10 +10,13 @@ from cython.parallel import prange
 import numpy as np
 cimport numpy as np

 from libc.math cimport exp, log
+#from libc.math cimport expf
+cdef extern from "math.h":
+    float expf(float x) nogil

 from .common cimport Y_DTYPE_C
 from .common cimport G_H_DTYPE_C
+from .common import G_H_DTYPE

 np.import_array()
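Side note (illustration only, not from the PR): the extern declaration above exposes the single-precision expf so that the exponential itself can be evaluated in float32. A rough NumPy sketch of the accuracy trade-off, with made-up variable names — np.exp on a float32 array stays in float32 and agrees with the float64 result to within a few float32 ulps, which is about all a float32 gradient/hessian array can represent anyway:

    import numpy as np

    # Illustrative only: compare single- vs double-precision exponentials
    # on raw predictions in a range typical for log-link losses.
    rng = np.random.default_rng(0)
    raw64 = rng.uniform(-5.0, 5.0, size=10_000)      # float64 raw predictions
    raw32 = raw64.astype(np.float32)                 # float32 copy

    exp64 = np.exp(raw64)                            # double-precision exp
    exp32 = np.exp(raw32)                            # single-precision exp
    assert exp32.dtype == np.float32                 # stays in float32

    rel_err = np.abs(exp32.astype(np.float64) - exp64) / exp64
    print(rel_err.max())                             # roughly a few times 1e-7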

@@ -68,7 +71,7 @@ def _update_gradients_hessians_least_absolute_deviation(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred) * sample_weight
+        # gradient = sign(raw_prediction - y_pred) * sample_weight
         gradients[i] = sample_weight[i] * (2 *
                        (y_true[i] - raw_predictions[i] < 0) - 1)
         hessians[i] = sample_weight[i]
@@ -85,7 +88,7 @@ def _update_gradients_least_absolute_deviation(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred)
+        # gradient = sign(raw_prediction - y_pred)
         gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1


@@ -99,21 +102,21 @@ def _update_gradients_hessians_poisson(
     cdef:
         int n_samples
         int i
-        Y_DTYPE_C y_pred
+        G_H_DTYPE_C y_pred

     n_samples = raw_predictions.shape[0]
     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i])
             hessians[i] = y_pred
     else:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i]) * sample_weight[i]
             hessians[i] = y_pred * sample_weight[i]
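For reference (sketch only, not the PR's code; the function name is made up): a NumPy version of what the Poisson branch above computes, with float32 intermediates to mirror the expf-based loop:

    import numpy as np

    def poisson_grad_hess_f32(raw_predictions, y_true, sample_weight=None):
        # Half Poisson deviance: gradient = exp(raw) - y, hessian = exp(raw),
        # optionally scaled by the sample weights; computed in float32 to
        # mirror the expf-based Cython loop above (sketch only, expects arrays).
        y_pred = np.exp(raw_predictions.astype(np.float32))
        gradients = y_pred - y_true.astype(np.float32)
        hessians = y_pred.copy()
        if sample_weight is not None:
            sw = sample_weight.astype(np.float32)
            gradients *= sw
            hessians *= sw
        return gradients, hessians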

@@ -126,7 +129,7 @@ def _update_gradients_hessians_binary_crossentropy(
         const Y_DTYPE_C [::1] sample_weight):  # IN
     cdef:
         int n_samples
-        Y_DTYPE_C p_i  # proba that ith sample belongs to positive class
+        G_H_DTYPE_C p_i  # proba that ith sample belongs to positive class
         int i

     n_samples = raw_predictions.shape[0]
@@ -153,11 +156,12 @@ def _update_gradients_hessians_categorical_crossentropy(
         int n_samples = raw_predictions.shape[1]
         int k  # class index
         int i  # sample index
-        Y_DTYPE_C sw
+        G_H_DTYPE_C sw
Member:

why change sw?

Member (Author):

Otherwise, in lines 187-188, gradients[k, i] = XXX * sw, the multiplication on the right-hand side would be a double, which is then down-cast to float32 for assignment to the left-hand side. Why do the multiplication in double precision when it's down-cast to float anyway?

Member:

> Why do the multiplication in double precision when it's down-cast to float anyway?

To avoid too many casts, since casting isn't free. Though I don't know enough of the details to understand how costly they are, and how/when exactly they happen.

Note that according to this logic, y_true in _update_gradients_hessians_categorical_crossentropy should be converted to float too, since we do gradients[i] = p_i - y_true[i] (I don't think we should, just noting a discrepancy).

Member (Author):

y_true is a memory view, i.e. double[:]. I guess there is no implicit type conversion (possible). According to this answer, the underlying type is a struct containing pointers. Providing a float32 array as argument throws an error instead of casting.
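To make the promotion argument concrete, a small NumPy analogue (illustration only; the Cython code operates on C scalars, but the dtype rule is the same): a float32 operand multiplied by a float64 operand is computed in double and only narrowed when it is stored into a float32 destination. With sw declared as G_H_DTYPE_C, the narrowing instead happens once, when sw is read from the float64 sample_weight view.

    import numpy as np

    p_i_k = np.float32(0.25)     # float32 intermediate, like G_H_DTYPE_C
    sw = np.float64(2.0)         # float64 weight, like the old Y_DTYPE_C sw

    product = p_i_k * (np.float32(1.0) - p_i_k) * sw
    print(product.dtype)         # float64: the float32 factors were promoted

    hessians = np.zeros(1, dtype=np.float32)
    hessians[0] = product        # narrowed to float32 only at the store
    print(hessians.dtype)        # float32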

         # p[i, k] is the probability that class(ith sample) == k.
         # It's the softmax of the raw predictions
-        Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
-        Y_DTYPE_C p_i_k
+        G_H_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim),
+                                          dtype=G_H_DTYPE)
+        G_H_DTYPE_C p_i_k

     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
@@ -184,14 +188,14 @@ def _update_gradients_hessians_categorical_crossentropy(
             hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw


-cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
+cdef inline void _compute_softmax(G_H_DTYPE_C [:, ::1] p, const int i) nogil:
     """Compute softmaxes of values in p[i, :]."""
     # i needs to be passed (and stays constant) because otherwise Cython does
     # not generate optimal code

     cdef:
-        Y_DTYPE_C max_value = p[i, 0]
-        Y_DTYPE_C sum_exps = 0.
+        G_H_DTYPE_C max_value = p[i, 0]
+        G_H_DTYPE_C sum_exps = 0.
         unsigned int k
         unsigned prediction_dim = p.shape[1]

@@ -201,13 +205,13 @@ cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
             max_value = p[i, k]

     for k in range(prediction_dim):
-        p[i, k] = exp(p[i, k] - max_value)
+        p[i, k] = expf(p[i, k] - max_value)
         sum_exps += p[i, k]

     for k in range(prediction_dim):
         p[i, k] /= sum_exps


-cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil:
+cdef inline G_H_DTYPE_C _cexpit(const G_H_DTYPE_C x) nogil:
Member:

This is now expecting a float, but what is passed to _cexpit is still a double.

Member (Author):

If I understand C correctly, in particular type casting, the argument of _cexpit will be implicitly converted to the type specified in the function definition, which is now float32.

"""Custom expit (logistic sigmoid function)"""
return 1. / (1. + exp(-x))
return 1. / (1. + expf(-x))
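As a side illustration of the two helpers above (a sketch, not the PR's code; function names are made up): a NumPy float32 version of the max-shifted softmax and of expit. Converting a float64 input to float32 before the exponential mirrors the implicit double-to-float argument conversion discussed for _cexpit.

    import numpy as np

    def softmax_rows_f32(raw):
        # Row-wise softmax in float32 with the usual max-shift for numerical
        # stability, mirroring _compute_softmax (sketch only).
        p = raw.astype(np.float32)
        p -= p.max(axis=1, keepdims=True)     # subtract the row maximum
        np.exp(p, out=p)                      # float32 exponential
        p /= p.sum(axis=1, keepdims=True)
        return p

    def expit_f32(x):
        # Logistic sigmoid evaluated in float32; a float64 input is narrowed
        # first, like the implicit C argument conversion in _cexpit.
        x32 = np.float32(x)
        return np.float32(1.0) / (np.float32(1.0) + np.exp(-x32))

    raw = np.random.default_rng(0).normal(size=(4, 3))    # float64 raw predictions
    p32 = softmax_rows_f32(raw)
    p64 = np.exp(raw - raw.max(axis=1, keepdims=True))
    p64 /= p64.sum(axis=1, keepdims=True)
    print(np.abs(p32 - p64).max())            # on the order of 1e-7 (float32 resolution)
    print(expit_f32(0.5), expit_f32(0.5).dtype)   # float32 result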
8 changes: 4 additions & 4 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
@@ -91,14 +91,14 @@ def fprime2(x: np.ndarray) -> np.ndarray:
         return get_hessians(y_true, x)

     optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
-                     maxiter=70, tol=2e-8)
+                     maxiter=70, tol=5e-7)
Member:

Interesting. I am not sure what we should conclude from the fact that more Newton iterations converge to the true optimum even with a lower-precision gradient computation...
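One way to read the relaxed tolerances: gradients and hessians are now computed in float32, whose machine epsilon is about 1.2e-7, so a Newton solve driven by them cannot reasonably be expected to resolve the optimum, or drive the gradient to zero, much below that level; 5e-7 is roughly four float32 ulps. A quick check (illustrative only):

    import numpy as np

    print(np.finfo(np.float32).eps)      # ~1.1920929e-07
    print(np.finfo(np.float64).eps)      # ~2.220446e-16

    # The old tolerances (tol=2e-8, atol=1e-7) sit at or below float32
    # resolution; the new ones (5e-7) are roughly 4 float32 ulps.
    print(5e-7 / np.finfo(np.float32).eps)   # ~4.2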


     # Need to ravel arrays because assert_allclose requires matching dimensions
     y_true = y_true.ravel()
     optimum = optimum.ravel()
-    assert_allclose(loss.inverse_link_function(optimum), y_true)
-    assert_allclose(func(optimum), 0, atol=1e-14)
-    assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
+    assert_allclose(loss.inverse_link_function(optimum), y_true, atol=5e-7)
+    assert_allclose(func(optimum), 0, atol=1e-13)
+    assert_allclose(get_gradients(y_true, optimum), 0, atol=5e-7)


 @pytest.mark.parametrize('loss, n_classes, prediction_dim', [