[WIP] PERF compute float32 grad and hess in HGBT by lorentzenchr · Pull Request #18659 · scikit-learn/scikit-learn

Closed · wants to merge 2 commits
36 changes: 20 additions & 16 deletions sklearn/ensemble/_hist_gradient_boosting/_loss.pyx
@@ -10,10 +10,13 @@ from cython.parallel import prange
 import numpy as np
 cimport numpy as np

 from libc.math cimport exp, log
+#from libc.math cimport expf
+cdef extern from "math.h":
+    float expf(float x) nogil

 from .common cimport Y_DTYPE_C
 from .common cimport G_H_DTYPE_C
+from .common import G_H_DTYPE

 np.import_array()
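Side note (illustration only, not from the PR): the extern declaration above exposes the single-precision expf so that the exponential itself can be evaluated in float32. A rough NumPy sketch of the accuracy trade-off, with made-up variable names — np.exp on a float32 array stays in float32 and agrees with the float64 result to within a few float32 ulps, which is about all a float32 gradient/hessian array can represent anyway:

    import numpy as np

    # Illustrative only: compare single- vs double-precision exponentials
    # on raw predictions in a range typical for log-link losses.
    rng = np.random.default_rng(0)
    raw64 = rng.uniform(-5.0, 5.0, size=10_000)      # float64 raw predictions
    raw32 = raw64.astype(np.float32)                 # float32 copy

    exp64 = np.exp(raw64)                            # double-precision exp
    exp32 = np.exp(raw32)                            # single-precision exp
    assert exp32.dtype == np.float32                 # stays in float32

    rel_err = np.abs(exp32.astype(np.float64) - exp64) / exp64
    print(rel_err.max())                             # roughly a few times 1e-7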

@@ -68,7 +71,7 @@ def _update_gradients_hessians_least_absolute_deviation(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred) * sample_weight
+        # gradient = sign(raw_prediction - y_pred) * sample_weight
         gradients[i] = sample_weight[i] * (2 *
                        (y_true[i] - raw_predictions[i] < 0) - 1)
         hessians[i] = sample_weight[i]
@@ -85,7 +88,7 @@ def _update_gradients_least_absolute_deviation(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred)
+        # gradient = sign(raw_prediction - y_pred)
         gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1


@@ -99,21 +102,21 @@ def _update_gradients_hessians_poisson(
     cdef:
         int n_samples
         int i
-        Y_DTYPE_C y_pred
+        G_H_DTYPE_C y_pred

     n_samples = raw_predictions.shape[0]
     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i])
             hessians[i] = y_pred
     else:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i]) * sample_weight[i]
             hessians[i] = y_pred * sample_weight[i]
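For reference (sketch only, not the PR's code; the function name is made up): a NumPy version of what the Poisson branch above computes, with float32 intermediates to mirror the expf-based loop:

    import numpy as np

    def poisson_grad_hess_f32(raw_predictions, y_true, sample_weight=None):
        # Half Poisson deviance: gradient = exp(raw) - y, hessian = exp(raw),
        # optionally scaled by the sample weights; computed in float32 to
        # mirror the expf-based Cython loop above (sketch only, expects arrays).
        y_pred = np.exp(raw_predictions.astype(np.float32))
        gradients = y_pred - y_true.astype(np.float32)
        hessians = y_pred.copy()
        if sample_weight is not None:
            sw = sample_weight.astype(np.float32)
            gradients *= sw
            hessians *= sw
        return gradients, hessians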

@@ -126,7 +129,7 @@ def _update_gradients_hessians_binary_crossentropy(
         const Y_DTYPE_C [::1] sample_weight):  # IN
     cdef:
         int n_samples
-        Y_DTYPE_C p_i  # proba that ith sample belongs to positive class
+        G_H_DTYPE_C p_i  # proba that ith sample belongs to positive class
         int i

     n_samples = raw_predictions.shape[0]
@@ -153,11 +156,12 @@ def _update_gradients_hessians_categorical_crossentropy(
         int n_samples = raw_predictions.shape[1]
         int k  # class index
         int i  # sample index
-        Y_DTYPE_C sw
+        G_H_DTYPE_C sw
Member:

why change sw?

Member (Author):

Otherwise, in lines 187-188, gradients[k, i] = XXX * sw, the multiplication on the right-hand side would be a double, which is then down-cast to float32 for assignment to the left-hand side. Why do the multiplication in double precision when it's down-cast to float anyway?

Member:

> Why do the multiplication in double precision when it's down-cast to float anyway?

To avoid too many casts, since casting isn't free. Though I don't know enough of the details to understand how costly they are, and how/when exactly they happen.

Note that according to this logic, y_true in _update_gradients_hessians_categorical_crossentropy should be converted to float too, since we do gradients[i] = p_i - y_true[i] (I don't think we should, just noting a discrepancy).

Member (Author):

y_true is a memory view, i.e. double[:]. I guess there is no implicit type conversion (possible). According to this answer, the underlying type is a struct containing pointers. Providing a float32 array as argument throws an error instead of casting.
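To make the promotion argument concrete, a small NumPy analogue (illustration only; the Cython code operates on C scalars, but the dtype rule is the same): a float32 operand multiplied by a float64 operand is computed in double and only narrowed when it is stored into a float32 destination. With sw declared as G_H_DTYPE_C, the narrowing instead happens once, when sw is read from the float64 sample_weight view.

    import numpy as np

    p_i_k = np.float32(0.25)     # float32 intermediate, like G_H_DTYPE_C
    sw = np.float64(2.0)         # float64 weight, like the old Y_DTYPE_C sw

    product = p_i_k * (np.float32(1.0) - p_i_k) * sw
    print(product.dtype)         # float64: the float32 factors were promoted

    hessians = np.zeros(1, dtype=np.float32)
    hessians[0] = product        # narrowed to float32 only at the store
    print(hessians.dtype)        # float32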

         # p[i, k] is the probability that class(ith sample) == k.
         # It's the softmax of the raw predictions
-        Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
-        Y_DTYPE_C p_i_k
+        G_H_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim),
+                                          dtype=G_H_DTYPE)
+        G_H_DTYPE_C p_i_k

     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
@@ -184,14 +188,14 @@ def _update_gradients_hessians_categorical_crossentropy(
             hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw


-cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
+cdef inline void _compute_softmax(G_H_DTYPE_C [:, ::1] p, const int i) nogil:
     """Compute softmaxes of values in p[i, :]."""
     # i needs to be passed (and stays constant) because otherwise Cython does
     # not generate optimal code

     cdef:
-        Y_DTYPE_C max_value = p[i, 0]
-        Y_DTYPE_C sum_exps = 0.
+        G_H_DTYPE_C max_value = p[i, 0]
+        G_H_DTYPE_C sum_exps = 0.
         unsigned int k
         unsigned prediction_dim = p.shape[1]

@@ -201,13 +205,13 @@ cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
             max_value = p[i, k]

     for k in range(prediction_dim):
-        p[i, k] = exp(p[i, k] - max_value)
+        p[i, k] = expf(p[i, k] - max_value)
         sum_exps += p[i, k]

     for k in range(prediction_dim):
         p[i, k] /= sum_exps


-cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil:
+cdef inline G_H_DTYPE_C _cexpit(const G_H_DTYPE_C x) nogil:
Member:

This is now expecting a float, but what is passed to _cexpit is still a double.

Member (Author):

If I understand C correctly, in particular type casting, the argument of _cexpit will be implicitly converted to the type specified in the function definition, which is now float32.

"""Custom expit (logistic sigmoid function)"""
return 1. / (1. + exp(-x))
return 1. / (1. + expf(-x))
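As a side illustration of the two helpers above (a sketch, not the PR's code; function names are made up): a NumPy float32 version of the max-shifted softmax and of expit. Converting a float64 input to float32 before the exponential mirrors the implicit double-to-float argument conversion discussed for _cexpit.

    import numpy as np

    def softmax_rows_f32(raw):
        # Row-wise softmax in float32 with the usual max-shift for numerical
        # stability, mirroring _compute_softmax (sketch only).
        p = raw.astype(np.float32)
        p -= p.max(axis=1, keepdims=True)     # subtract the row maximum
        np.exp(p, out=p)                      # float32 exponential
        p /= p.sum(axis=1, keepdims=True)
        return p

    def expit_f32(x):
        # Logistic sigmoid evaluated in float32; a float64 input is narrowed
        # first, like the implicit C argument conversion in _cexpit.
        x32 = np.float32(x)
        return np.float32(1.0) / (np.float32(1.0) + np.exp(-x32))

    raw = np.random.default_rng(0).normal(size=(4, 3))    # float64 raw predictions
    p32 = softmax_rows_f32(raw)
    p64 = np.exp(raw - raw.max(axis=1, keepdims=True))
    p64 /= p64.sum(axis=1, keepdims=True)
    print(np.abs(p32 - p64).max())            # on the order of 1e-7 (float32 resolution)
    print(expit_f32(0.5), expit_f32(0.5).dtype)   # float32 result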
8 changes: 4 additions & 4 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py
@@ -91,14 +91,14 @@ def fprime2(x: np.ndarray) -> np.ndarray:
         return get_hessians(y_true, x)

     optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
-                     maxiter=70, tol=2e-8)
+                     maxiter=70, tol=5e-7)
Member:

Interesting. I am not sure what we should conclude from the fact that more Newton iterations converge to the true optimum even with a lower-precision gradient computation...
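One way to read the relaxed tolerances: gradients and hessians are now computed in float32, whose machine epsilon is about 1.2e-7, so a Newton solve driven by them cannot reasonably be expected to resolve the optimum, or drive the gradient to zero, much below that level; 5e-7 is roughly four float32 ulps. A quick check (illustrative only):

    import numpy as np

    print(np.finfo(np.float32).eps)      # ~1.1920929e-07
    print(np.finfo(np.float64).eps)      # ~2.220446e-16

    # The old tolerances (tol=2e-8, atol=1e-7) sit at or below float32
    # resolution; the new ones (5e-7) are roughly 4 float32 ulps.
    print(5e-7 / np.finfo(np.float32).eps)   # ~4.2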


     # Need to ravel arrays because assert_allclose requires matching dimensions
     y_true = y_true.ravel()
     optimum = optimum.ravel()
-    assert_allclose(loss.inverse_link_function(optimum), y_true)
-    assert_allclose(func(optimum), 0, atol=1e-14)
-    assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
+    assert_allclose(loss.inverse_link_function(optimum), y_true, atol=5e-7)
+    assert_allclose(func(optimum), 0, atol=1e-13)
+    assert_allclose(get_gradients(y_true, optimum), 0, atol=5e-7)


 @pytest.mark.parametrize('loss, n_classes, prediction_dim', [