From 011961ef1f5423c65f5dc8b3d005d7ce2fb714fc Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Tue, 20 Oct 2020 22:39:12 +0200
Subject: [PATCH 1/2] ENH compute float32 grad and hess in HGBT

---
 .../_hist_gradient_boosting/_loss.pyx | 36 ++++++++++---------
 .../tests/test_loss.py                |  8 ++---
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
index 4114cd24aa8df..86507bb21d6e2 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
@@ -10,10 +10,13 @@ from cython.parallel import prange
 import numpy as np
 cimport numpy as np
-from libc.math cimport exp, log
+#from libc.math cimport expf
+cdef extern from "math.h":
+    float expf(float x) nogil
 
 from .common cimport Y_DTYPE_C
 from .common cimport G_H_DTYPE_C
+from .common import G_H_DTYPE
 
 np.import_array()
 
 
@@ -68,7 +71,7 @@ def _update_gradients_hessians_least_absolute_deviation(
 
     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred) * sample_weight
+        # gradient = sign(raw_prediction - y_pred) * sample_weight
         gradients[i] = sample_weight[i] * (2 *
                        (y_true[i] - raw_predictions[i] < 0) - 1)
         hessians[i] = sample_weight[i]
@@ -85,7 +88,7 @@ def _update_gradients_least_absolute_deviation(
 
     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred)
+        # gradient = sign(raw_prediction - y_pred)
         gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1
 
 
@@ -99,21 +102,21 @@ def _update_gradients_hessians_poisson(
     cdef:
         int n_samples
         int i
-        Y_DTYPE_C y_pred
+        G_H_DTYPE_C y_pred
 
     n_samples = raw_predictions.shape[0]
     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i])
             hessians[i] = y_pred
     else:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i]) * sample_weight[i]
             hessians[i] = y_pred * sample_weight[i]
 
@@ -126,7 +129,7 @@ def _update_gradients_hessians_binary_crossentropy(
         const Y_DTYPE_C [::1] sample_weight):  # IN
     cdef:
         int n_samples
-        Y_DTYPE_C p_i  # proba that ith sample belongs to positive class
+        G_H_DTYPE_C p_i  # proba that ith sample belongs to positive class
         int i
 
     n_samples = raw_predictions.shape[0]
@@ -153,11 +156,12 @@ def _update_gradients_hessians_categorical_crossentropy(
         int n_samples = raw_predictions.shape[1]
         int k  # class index
         int i  # sample index
-        Y_DTYPE_C sw
+        G_H_DTYPE_C sw
         # p[i, k] is the probability that class(ith sample) == k.
         # It's the softmax of the raw predictions
-        Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
-        Y_DTYPE_C p_i_k
+        G_H_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim),
+                                          dtype=G_H_DTYPE)
+        G_H_DTYPE_C p_i_k
 
     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
@@ -184,14 +188,14 @@ def _update_gradients_hessians_categorical_crossentropy(
                 hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw
 
 
-cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
+cdef inline void _compute_softmax(G_H_DTYPE_C [:, ::1] p, const int i) nogil:
     """Compute softmaxes of values in p[i, :]."""
    # i needs to be passed (and stays constant) because otherwise Cython does
     # not generate optimal code
 
     cdef:
-        Y_DTYPE_C max_value = p[i, 0]
-        Y_DTYPE_C sum_exps = 0.
+        G_H_DTYPE_C max_value = p[i, 0]
+        G_H_DTYPE_C sum_exps = 0.
         unsigned int k
         unsigned prediction_dim = p.shape[1]
 
@@ -201,13 +205,13 @@ cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
             max_value = p[i, k]
 
     for k in range(prediction_dim):
-        p[i, k] = exp(p[i, k] - max_value)
+        p[i, k] = expf(p[i, k] - max_value)
         sum_exps += p[i, k]
 
     for k in range(prediction_dim):
         p[i, k] /= sum_exps
 
 
-cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil:
+cdef inline G_H_DTYPE_C _cexpit(const G_H_DTYPE_C x) nogil:
     """Custom expit (logistic sigmoid function)"""
-    return 1. / (1. + exp(-x))
+    return 1. / (1. + expf(-x))
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
index 221b94183a7ff..0e68c6a951455 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
@@ -91,14 +91,14 @@ def fprime2(x: np.ndarray) -> np.ndarray:
         return get_hessians(y_true, x)
 
     optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
-                     maxiter=70, tol=2e-8)
+                     maxiter=70, tol=5e-6)
 
     # Need to ravel arrays because assert_allclose requires matching dimensions
     y_true = y_true.ravel()
     optimum = optimum.ravel()
-    assert_allclose(loss.inverse_link_function(optimum), y_true)
-    assert_allclose(func(optimum), 0, atol=1e-14)
-    assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
+    assert_allclose(loss.inverse_link_function(optimum), y_true, atol=2e-5)
+    assert_allclose(func(optimum), 0, atol=2e-11)
+    assert_allclose(get_gradients(y_true, optimum), 0, atol=2e-5)
 
 
 @pytest.mark.parametrize('loss, n_classes, prediction_dim', [

From 1c73fcc6e9329734c2db2291b746ef0901b51415 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Fri, 23 Oct 2020 13:10:59 +0200
Subject: [PATCH 2/2] TST higher test accuracy

---
 .../ensemble/_hist_gradient_boosting/tests/test_loss.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
index 0e68c6a951455..e27d2a7d8a394 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
@@ -91,14 +91,14 @@ def fprime2(x: np.ndarray) -> np.ndarray:
         return get_hessians(y_true, x)
 
     optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
-                     maxiter=70, tol=5e-6)
+                     maxiter=70, tol=5e-7)
 
     # Need to ravel arrays because assert_allclose requires matching dimensions
     y_true = y_true.ravel()
     optimum = optimum.ravel()
-    assert_allclose(loss.inverse_link_function(optimum), y_true, atol=2e-5)
-    assert_allclose(func(optimum), 0, atol=2e-11)
-    assert_allclose(get_gradients(y_true, optimum), 0, atol=2e-5)
+    assert_allclose(loss.inverse_link_function(optimum), y_true, atol=5e-7)
+    assert_allclose(func(optimum), 0, atol=1e-13)
+    assert_allclose(get_gradients(y_true, optimum), 0, atol=5e-7)
 
 
 @pytest.mark.parametrize('loss, n_classes, prediction_dim', [
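
Editorial note, not part of the patches above: the series replaces libc's float64 exp with the float32 expf and stores the per-sample scratch values (y_pred, p_i, sw, the softmax buffer p) as G_H_DTYPE_C (float32) instead of Y_DTYPE_C (float64), matching the dtype of the gradient and hessian arrays. Since float32 carries only about 7 decimal digits, the tolerances in test_loss.py are relaxed from float64-level values to float32-level ones (atol=5e-7 after the second patch). The short standalone Python sketch below, assuming only NumPy, illustrates that precision argument; it is not scikit-learn code.

import numpy as np

# Machine epsilon: float32 resolves ~7 decimal digits, float64 ~16.
print(np.finfo(np.float32).eps)  # ~1.19e-07
print(np.finfo(np.float64).eps)  # ~2.22e-16

# Evaluating exp in float32 (as expf does in the Cython loss code) agrees with
# the float64 result only up to float32 rounding, so asserting gradients are
# zero with a float64-level atol would be too strict once they are computed in
# float32, while 5e-7 (or 2e-5) leaves some headroom.
x = np.linspace(-2.0, 2.0, 11)
err = np.abs(np.exp(x.astype(np.float32)) - np.exp(x))
print(err.max())  # roughly 1e-7 to 1e-6: float32-level error, far above float64 eps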