[WIP] PERF compute float32 grad and hess in HGBT #18659
@@ -10,10 +10,13 @@ from cython.parallel import prange
 import numpy as np
 cimport numpy as np

 from libc.math cimport exp, log
+#from libc.math cimport expf
+cdef extern from "math.h":
+    float expf(float x) nogil

 from .common cimport Y_DTYPE_C
 from .common cimport G_H_DTYPE_C
 from .common import G_H_DTYPE

 np.import_array()
@@ -68,7 +71,7 @@ def _update_gradients_hessians_least_absolute_deviation(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred) * sample_weight
+        # gradient = sign(raw_prediction - y_pred) * sample_weight
         gradients[i] = sample_weight[i] * (2 *
                        (y_true[i] - raw_predictions[i] < 0) - 1)
         hessians[i] = sample_weight[i]
@@ -85,7 +88,7 @@ def _update_gradients_least_absolute_deviation(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # gradient = sign(raw_predicition - y_pred)
+        # gradient = sign(raw_prediction - y_pred)
        gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1
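As an aside on the unchanged line above, the branchless expression `2 * (y_true[i] - raw_predictions[i] < 0) - 1` is what the comment calls `sign(raw_prediction - y_pred)`; a minimal NumPy sketch (illustrative only, not part of the diff):

```python
import numpy as np

y_true = np.array([1.0, 2.0, 3.0])
raw_predictions = np.array([0.5, 2.5, 3.0])

# Branchless form used in the Cython loop: the comparison is 0 or 1 in C.
branchless = 2 * (y_true - raw_predictions < 0).astype(int) - 1
print(branchless)                          # [-1  1 -1]
print(np.sign(raw_predictions - y_true))   # [-1.  1.  0.]  (ties map to -1 above)
```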
@@ -99,21 +102,21 @@ def _update_gradients_hessians_poisson(
     cdef:
         int n_samples
         int i
-        Y_DTYPE_C y_pred
+        G_H_DTYPE_C y_pred

     n_samples = raw_predictions.shape[0]
     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i])
             hessians[i] = y_pred
     else:
         for i in prange(n_samples, schedule='static', nogil=True):
             # Note: We use only half of the deviance loss. Therefore, there is
             # no factor of 2.
-            y_pred = exp(raw_predictions[i])
+            y_pred = expf(raw_predictions[i])
             gradients[i] = (y_pred - y_true[i]) * sample_weight[i]
             hessians[i] = y_pred * sample_weight[i]
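For context on this hunk: with half the Poisson deviance, the per-sample loss as a function of the raw (log-link) prediction is exp(raw) - y * raw up to a constant, so the gradient is exp(raw) - y and the hessian is exp(raw), which is exactly what the loop computes. A quick finite-difference check (illustrative sketch, not from the PR):

```python
import numpy as np

y, raw, eps = 3.0, 0.7, 1e-5

# Half Poisson deviance as a function of the raw prediction,
# dropping the terms that do not depend on raw.
loss = lambda r: np.exp(r) - y * r

num_grad = (loss(raw + eps) - loss(raw - eps)) / (2 * eps)
num_hess = (loss(raw + eps) - 2 * loss(raw) + loss(raw - eps)) / eps ** 2

print(num_grad, np.exp(raw) - y)   # both ~ -0.9862
print(num_hess, np.exp(raw))       # both ~  2.0138
```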
@@ -126,7 +129,7 @@ def _update_gradients_hessians_binary_crossentropy(
         const Y_DTYPE_C [::1] sample_weight):  # IN
     cdef:
         int n_samples
-        Y_DTYPE_C p_i  # proba that ith sample belongs to positive class
+        G_H_DTYPE_C p_i  # proba that ith sample belongs to positive class
         int i

     n_samples = raw_predictions.shape[0]
@@ -153,11 +156,12 @@ def _update_gradients_hessians_categorical_crossentropy(
         int n_samples = raw_predictions.shape[1]
         int k  # class index
         int i  # sample index
-        Y_DTYPE_C sw
+        G_H_DTYPE_C sw
         # p[i, k] is the probability that class(ith sample) == k.
         # It's the softmax of the raw predictions
-        Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
-        Y_DTYPE_C p_i_k
+        G_H_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim),
+                                          dtype=G_H_DTYPE)
+        G_H_DTYPE_C p_i_k

     if sample_weight is None:
         for i in prange(n_samples, schedule='static', nogil=True):
@@ -184,14 +188,14 @@ def _update_gradients_hessians_categorical_crossentropy(
                 hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw


-cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
+cdef inline void _compute_softmax(G_H_DTYPE_C [:, ::1] p, const int i) nogil:
     """Compute softmaxes of values in p[i, :]."""
     # i needs to be passed (and stays constant) because otherwise Cython does
     # not generate optimal code

     cdef:
-        Y_DTYPE_C max_value = p[i, 0]
-        Y_DTYPE_C sum_exps = 0.
+        G_H_DTYPE_C max_value = p[i, 0]
+        G_H_DTYPE_C sum_exps = 0.
         unsigned int k
         unsigned prediction_dim = p.shape[1]
@@ -201,13 +205,13 @@ cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
             max_value = p[i, k]

     for k in range(prediction_dim):
-        p[i, k] = exp(p[i, k] - max_value)
+        p[i, k] = expf(p[i, k] - max_value)
         sum_exps += p[i, k]

     for k in range(prediction_dim):
         p[i, k] /= sum_exps

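The max-subtraction in `_compute_softmax` matters even more in single precision, since expf overflows for arguments around 89. A minimal NumPy sketch of that trick (illustrative only, not part of the diff):

```python
import numpy as np

raw = np.array([85.0, 88.0, 90.0], dtype=np.float32)

naive = np.exp(raw) / np.exp(raw).sum()   # exp(90) overflows float32 -> inf, nan
shifted = np.exp(raw - raw.max())         # exponents are now <= 0, no overflow
stable = shifted / shifted.sum()

print(naive)    # contains nan (overflow warning)
print(stable)   # ~[0.0059 0.1185 0.8756]
```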
-cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil:
+cdef inline G_H_DTYPE_C _cexpit(const G_H_DTYPE_C x) nogil:
Comment: this is now expecting a float but what is passed to _cexpit is still a double.

Reply: If I understand C correctly, in particular type casting, the argument of …
     """Custom expit (logistic sigmoid function)"""
-    return 1. / (1. + exp(-x))
+    return 1. / (1. + expf(-x))
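A minimal NumPy sketch of the narrowing conversion discussed in the thread above (illustrative only; the helper names are made up): when a double raw prediction is passed to a parameter declared float, C converts it implicitly, and the single-precision sigmoid agrees with the double-precision one to roughly seven significant digits, which is all that float32 gradients can represent anyway.

```python
import numpy as np

def expit_double(x):
    # double-precision version, analogous to 1. / (1. + exp(-x))
    x = np.float64(x)
    return 1.0 / (1.0 + np.exp(-x))

def expit_single(x):
    # single-precision version, analogous to 1. / (1. + expf(-x));
    # np.float32(x) plays the role of the implicit double -> float cast
    x = np.float32(x)
    return np.float32(1.0) / (np.float32(1.0) + np.exp(-x))

raw_prediction = 0.123456789  # raw predictions stay Y_DTYPE (float64)
print(expit_double(raw_prediction))   # ~0.53082...
print(expit_single(raw_prediction))   # same value to ~7 significant digits
print(abs(expit_double(raw_prediction) - expit_single(raw_prediction)))  # ~1e-8
```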
@@ -91,14 +91,14 @@ def fprime2(x: np.ndarray) -> np.ndarray:
         return get_hessians(y_true, x)

     optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
-                     maxiter=70, tol=2e-8)
+                     maxiter=70, tol=5e-7)
Comment: Interesting. I am not sure what we should conclude from the fact that more Newton iterations converge to the true optimum even with a lower-precision gradient computation...

     # Need to ravel arrays because assert_allclose requires matching dimensions
     y_true = y_true.ravel()
     optimum = optimum.ravel()
-    assert_allclose(loss.inverse_link_function(optimum), y_true)
-    assert_allclose(func(optimum), 0, atol=1e-14)
-    assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-7)
+    assert_allclose(loss.inverse_link_function(optimum), y_true, atol=5e-7)
+    assert_allclose(func(optimum), 0, atol=1e-13)
+    assert_allclose(get_gradients(y_true, optimum), 0, atol=5e-7)


 @pytest.mark.parametrize('loss, n_classes, prediction_dim', [
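The loosened tolerances track single-precision machine epsilon: gradients and hessians are now float32, whose resolution is about 1.2e-7, so expecting the gradient at the optimum to vanish below 1e-7 (or the loss residual below 1e-14) is no longer realistic. A one-line check of the scales involved (illustrative):

```python
import numpy as np

print(np.finfo(np.float32).eps)   # 1.1920929e-07  -> atol=5e-7 is about 4 ulps at 1.0
print(np.finfo(np.float64).eps)   # 2.220446049250313e-16
```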
Comment: why change sw?
Reply: Otherwise, in line 187-188, gradients[k, i] = XXX * sw, the multiplication on the right-hand side would be a double, which is then down-cast to float32 for assignment to the left-hand side. Why do the multiplication in double precision when it's down-cast to float anyway?
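A small NumPy sketch of the promotion and down-cast being discussed (illustrative only; NumPy's scalar promotion mirrors the C rule that float times double is evaluated in double):

```python
import numpy as np

p_i_k = np.float32(0.25)            # float32, like G_H_DTYPE_C
sw = np.float64(2.0)                # float64, like a Y_DTYPE_C sample weight
gradients = np.zeros(3, dtype=np.float32)

prod = p_i_k * sw                   # promoted: the multiply happens in double
gradients[0] = prod                 # then down-cast to float32 on assignment
print(prod.dtype, gradients.dtype)  # float64 float32

# Declaring sw as float32 keeps the whole expression in single precision.
print((p_i_k * np.float32(2.0)).dtype)   # float32
```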
Reply: To avoid too many casts since casting isn't free. Though I don't know enough of the details to understand how costly they are, and how/when exactly they happen.

Note that according to this logic, y_true in _update_gradients_hessians_categorical_crossentropy should be converted to float too, since we do gradients[i] = p_i - y_true[i]. (I don't think we should, just noting a discrepancy.)
Reply: y_true is a memory view, i.e. double[:]. I guess there is no implicit type conversion (possible). According to this answer, the underlying type is a struct containing pointers. Providing a float32 array as argument throws an error instead of casting.