FIX more precise log loss gradient and hessian (#28048) · scikit-learn/scikit-learn@5ad8e45 · GitHub
Commit 5ad8e45

FIX more precise log loss gradient and hessian (#28048)

1 parent 71deeb8 · commit 5ad8e45

File tree: 3 files changed (+172 lines, -41 lines)

doc/whats_new/v1.4.rst

Lines changed: 11 additions & 0 deletions

@@ -220,6 +220,17 @@ See :ref:`array_api` for more details.
 - :class:`preprocessing.MinMaxScaler` in :pr:`26243` by `Tim Head`_;
 - :class:`preprocessing.Normalizer` in :pr:`27558` by :user:`Edoardo Abati <EdAbati>`.
 
+Private Loss Function Module
+----------------------------
+
+- |FIX| The gradient computation of the binomial log loss is now numerically
+  more stable for raw predictions that are very large in absolute value. Before,
+  it could result in `np.nan`. Among the models that benefit from this change are
+  :class:`ensemble.GradientBoostingClassifier`,
+  :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`linear_model.LogisticRegression`.
+  :pr:`28048` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 Changelog
 ---------
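The failure mode described in this entry is easy to reproduce with NumPy. The following minimal sketch contrasts the pre-fix gradient formula with the branched computation introduced by this commit; naive_gradient and stable_gradient are illustrative names, not scikit-learn functions, and the scalar Python code only stands in for the actual Cython implementation.

import numpy as np

def naive_gradient(y_true, raw_prediction):
    # Pre-fix formula: exp(-raw_prediction) overflows to inf for very negative
    # raw predictions, and the resulting inf / inf division yields nan.
    exp_tmp = np.exp(-raw_prediction)
    return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)

def stable_gradient(y_true, raw_prediction):
    # Branch as in the patched Cython code: for raw_prediction <= -37,
    # expit(raw_prediction) is effectively exp(raw_prediction) in double
    # precision, so the overflow-prone division can be skipped.
    if raw_prediction > -37:
        exp_tmp = np.exp(-raw_prediction)
        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
    return np.exp(raw_prediction) - y_true

print(float(naive_gradient(1.0, -1000.0)))   # nan (with an overflow RuntimeWarning)
print(float(stable_gradient(1.0, -1000.0)))  # -1.0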

sklearn/_loss/_loss.pyx.tp

Lines changed: 40 additions & 22 deletions

@@ -695,9 +695,8 @@ cdef inline double cgradient_half_binomial(
     double y_true,
     double raw_prediction
 ) noexcept nogil:
-    # y_pred - y_true = expit(raw_prediction) - y_true
-    # Numerically more stable, see
-    # http://fa.bianp.net/blog/2019/evaluate_logistic/
+    # gradient = y_pred - y_true = expit(raw_prediction) - y_true
+    # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
     # if raw_prediction < 0:
     #     exp_tmp = exp(raw_prediction)
     #     return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp)
@@ -708,34 +707,47 @@ cdef inline double cgradient_half_binomial(
     #     return expit(raw_prediction) - y_true
     # i.e. no "if else" and an own inline implementation of expit instead of
     # from scipy.special.cython_special cimport expit
-    # The case distinction raw_prediction < 0 in the stable implementation
-    # does not provide significant better precision. Therefore we go without
-    # it.
+    # The case distinction raw_prediction < 0 in the stable implementation does not
+    # provide significant better precision apart from protecting overflow of exp(..).
+    # The branch (if else), however, can incur runtime costs of up to 30%.
+    # Instead, we help branch prediction by almost always ending in the first if clause
+    # and making the second branch (else) a bit simpler. This has the exact same
+    # precision but is faster than the stable implementation.
+    # As branching criteria, we use the same cutoff as in log1pexp. Note that the
+    # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731
+    # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
     cdef double exp_tmp
-    exp_tmp = exp(-raw_prediction)
-    return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
+    if raw_prediction > -37:
+        exp_tmp = exp(-raw_prediction)
+        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
+    else:
+        # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37
+        return exp(raw_prediction) - y_true
 
 
 cdef inline double_pair closs_grad_half_binomial(
     double y_true,
     double raw_prediction
 ) noexcept nogil:
     cdef double_pair lg
-    if raw_prediction <= 0:
+    # Same if else conditions as in log1pexp.
+    if raw_prediction <= -37:
         lg.val2 = exp(raw_prediction)  # used as temporary
-        if raw_prediction <= -37:
-            lg.val1 = lg.val2 - y_true * raw_prediction              # loss
-        else:
-            lg.val1 = log1p(lg.val2) - y_true * raw_prediction       # loss
+        lg.val1 = lg.val2 - y_true * raw_prediction                  # loss
+        lg.val2 -= y_true                                            # gradient
+    elif raw_prediction <= -2:
+        lg.val2 = exp(raw_prediction)  # used as temporary
+        lg.val1 = log1p(lg.val2) - y_true * raw_prediction           # loss
         lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2)  # gradient
+    elif raw_prediction <= 18:
+        lg.val2 = exp(-raw_prediction)  # used as temporary
+        # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
+        lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction     # loss
+        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
     else:
         lg.val2 = exp(-raw_prediction)  # used as temporary
-        if raw_prediction <= 18:
-            # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
-            lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction  # loss
-        else:
-            lg.val1 = lg.val2 + (1 - y_true) * raw_prediction        # loss
-        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
+        lg.val1 = lg.val2 + (1 - y_true) * raw_prediction            # loss
+        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
     return lg
 
 
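For readers less familiar with Cython, here is a pure-Python sketch of the four-regime branch layout that the patched closs_grad_half_binomial uses. It is illustrative only: the real function is Cython, returns a double_pair struct, and the cutoffs -37, -2 and 18 mirror the ones used in log1pexp.

import numpy as np

def loss_and_gradient(y_true, raw):
    # Same four regimes as closs_grad_half_binomial in the hunk above.
    if raw <= -37:
        # log1p(exp(raw)) ~= exp(raw) and expit(raw) ~= exp(raw) in float64
        tmp = np.exp(raw)
        return tmp - y_true * raw, tmp - y_true
    elif raw <= -2:
        # exp(raw) is small, so log1p and the division stay accurate
        tmp = np.exp(raw)
        return np.log1p(tmp) - y_true * raw, ((1 - y_true) * tmp - y_true) / (1 + tmp)
    elif raw <= 18:
        # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
        tmp = np.exp(-raw)
        return np.log1p(tmp) + (1 - y_true) * raw, ((1 - y_true) - y_true * tmp) / (1 + tmp)
    else:
        # log1p(exp(-x)) ~= exp(-x) for large x
        tmp = np.exp(-raw)
        return tmp + (1 - y_true) * raw, ((1 - y_true) - y_true * tmp) / (1 + tmp)

loss_val, grad_val = loss_and_gradient(1.0, -1000.0)
print(float(loss_val), float(grad_val))  # 1000.0 -1.0, finite even in the extreme regime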

@@ -747,9 +759,15 @@ cdef inline double_pair cgrad_hess_half_binomial(
     # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
     #         = exp(-raw) / (1 + exp(-raw))**2
     cdef double_pair gh
-    gh.val2 = exp(-raw_prediction)  # used as temporary
-    gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
-    gh.val2 = gh.val2 / (1 + gh.val2)**2  # hessian
+    # See comment in cgradient_half_binomial.
+    if raw_prediction > -37:
+        gh.val2 = exp(-raw_prediction)  # used as temporary
+        gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
+        gh.val2 = gh.val2 / (1 + gh.val2)**2  # hessian
+    else:
+        gh.val2 = exp(raw_prediction)
+        gh.val1 = gh.val2 - y_true
+        gh.val2 *= (1 - gh.val2)
     return gh
 
 
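The same cutoff protects the hessian. As a sketch of the idea behind this hunk (illustrative Python, not the Cython code): for raw_prediction <= -37 the predicted probability p = expit(raw_prediction) is effectively exp(raw_prediction), so both the gradient p - y_true and the hessian p * (1 - p) can be formed without the overflow-prone exp(-raw_prediction) terms.

import numpy as np

def gradient_and_hessian(y_true, raw):
    if raw > -37:
        tmp = np.exp(-raw)                                # finite in this regime
        grad = ((1 - y_true) - y_true * tmp) / (1 + tmp)  # expit(raw) - y_true
        hess = tmp / (1 + tmp) ** 2                       # p * (1 - p)
    else:
        p = np.exp(raw)                                   # ~= expit(raw) for raw <= -37
        grad = p - y_true
        hess = p * (1 - p)
    return grad, hess

grad, hess = gradient_and_hessian(1.0, -1000.0)
print(float(grad), float(hess))  # -1.0 0.0, where the old formula produced nan nan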

sklearn/_loss/tests/test_loss.py

Lines changed: 121 additions & 19 deletions

@@ -224,48 +224,150 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail):
 
 
 @pytest.mark.parametrize(
-    "loss, y_true, raw_prediction, loss_true",
+    "loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true",
     [
-        (HalfSquaredError(), 1.0, 5.0, 8),
-        (AbsoluteError(), 1.0, 5.0, 4),
-        (PinballLoss(quantile=0.5), 1.0, 5.0, 2),
-        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25)),
-        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25),
-        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2)),
-        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2),
-        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)),
-        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4),
-        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2),
-        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2)),
-        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2),
-        (HalfTweedieLossIdentity(power=3), 2.0, 4.0, -1 / 4 + 1 / 4**2 + 1 / 2 / 2),
-        (HalfBinomialLoss(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)),
+        (HalfSquaredError(), 1.0, 5.0, 8, 4, 1),
+        (AbsoluteError(), 1.0, 5.0, 4.0, 1.0, None),
+        (PinballLoss(quantile=0.5), 1.0, 5.0, 2, 0.5, None),
+        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25), 1 - 0.25, None),
+        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25, -0.25, None),
+        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2), None, None),
+        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2, None, None),
+        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4), 4 - 2, 4),
+        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4, 1 - 2 / 4, 2 / 4),
+        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2, None, None),
+        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2), None, None),
+        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2, None, None),
+        (
+            HalfTweedieLossIdentity(power=3),
+            2.0,
+            4.0,
+            -1 / 4 + 1 / 4**2 + 1 / 2 / 2,
+            None,
+            None,
+        ),
+        (
+            HalfBinomialLoss(),
+            0.25,
+            np.log(4),
+            np.log1p(4) - 0.25 * np.log(4),
+            None,
+            None,
+        ),
+        # Extreme log loss cases, checked with mpmath:
+        # import mpmath as mp
+        #
+        # # Stolen from scipy
+        # def mpf2float(x):
+        #     return float(mp.nstr(x, 17, min_fixed=0, max_fixed=0))
+        #
+        # def mp_logloss(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         out = mp.log1p(mp.exp(raw)) - y_true * raw
+        #     return mpf2float(out)
+        #
+        # def mp_gradient(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         out = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) - y_true
+        #     return mpf2float(out)
+        #
+        # def mp_hessian(y_true, raw):
+        #     with mp.workdps(100):
+        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
+        #         p = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw))
+        #         out = p * (mp.mpf(1) - p)
+        #     return mpf2float(out)
+        #
+        # y, raw = 0.0, 37.
+        # mp_logloss(y, raw), mp_gradient(y, raw), mp_hessian(y, raw)
+        (HalfBinomialLoss(), 0.0, -1e20, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, -1e20, 1e20, -1, 0),
+        (HalfBinomialLoss(), 0.0, -1e3, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, -1e3, 1e3, -1, 0),
+        (HalfBinomialLoss(), 1.0, -37.5, 37.5, -1, 0),
+        (HalfBinomialLoss(), 1.0, -37.0, 37, 1e-16 - 1, 8.533047625744065e-17),
+        (HalfBinomialLoss(), 0.0, -37.0, *[8.533047625744065e-17] * 3),
+        (HalfBinomialLoss(), 1.0, -36.9, 36.9, 1e-16 - 1, 9.430476078526806e-17),
+        (HalfBinomialLoss(), 0.0, -36.9, *[9.430476078526806e-17] * 3),
+        (HalfBinomialLoss(), 0.0, 37.0, 37, 1 - 1e-16, 8.533047625744065e-17),
+        (HalfBinomialLoss(), 1.0, 37.0, *[8.533047625744066e-17] * 3),
+        (HalfBinomialLoss(), 0.0, 37.5, 37.5, 1, 5.175555005801868e-17),
+        (HalfBinomialLoss(), 0.0, 232.8, 232.8, 1, 1.4287342391028437e-101),
+        (HalfBinomialLoss(), 1.0, 1e20, 0, 0, 0),
+        (HalfBinomialLoss(), 0.0, 1e20, 1e20, 1, 0),
+        (
+            HalfBinomialLoss(),
+            1.0,
+            232.8,
+            0,
+            -1.4287342391028437e-101,
+            1.4287342391028437e-101,
+        ),
+        (HalfBinomialLoss(), 1.0, 232.9, 0, 0, 0),
+        (HalfBinomialLoss(), 1.0, 1e3, 0, 0, 0),
+        (HalfBinomialLoss(), 0.0, 1e3, 1e3, 1, 0),
         (
             HalfMultinomialLoss(n_classes=3),
             0.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.2,
+            None,
+            None,
         ),
         (
             HalfMultinomialLoss(n_classes=3),
             1.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.5,
+            None,
+            None,
         ),
         (
             HalfMultinomialLoss(n_classes=3),
             2.0,
             [0.2, 0.5, 0.3],
             logsumexp([0.2, 0.5, 0.3]) - 0.3,
+            None,
+            None,
+        ),
+        (
+            HalfMultinomialLoss(n_classes=3),
+            2.0,
+            [1e4, 0, 7e-7],
+            logsumexp([1e4, 0, 7e-7]) - (7e-7),
+            None,
+            None,
         ),
     ],
     ids=loss_instance_name,
 )
-def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true):
-    """Test losses at specific values."""
-    assert loss(
+def test_loss_on_specific_values(
+    loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true
+):
+    """Test losses, gradients and hessians at specific values."""
+    loss1 = loss(y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]))
+    grad1 = loss.gradient(
+        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
+    )
+    loss2, grad2 = loss.loss_gradient(
+        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
+    )
+    grad3, hess = loss.gradient_hessian(
         y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
-    ) == approx(loss_true, rel=1e-11, abs=1e-12)
+    )
+
+    assert loss1 == approx(loss_true, rel=1e-15, abs=1e-15)
+    assert loss2 == approx(loss_true, rel=1e-15, abs=1e-15)
+
+    if gradient_true is not None:
+        assert grad1 == approx(gradient_true, rel=1e-15, abs=1e-15)
+        assert grad2 == approx(gradient_true, rel=1e-15, abs=1e-15)
+        assert grad3 == approx(gradient_true, rel=1e-15, abs=1e-15)
+
+    if hessian_true is not None:
+        assert hess == approx(hessian_true, rel=1e-15, abs=1e-15)
 
 
 @pytest.mark.parametrize("loss", ALL_LOSSES)
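The new extreme-value cases can also be checked interactively. A short usage sketch, assuming the same private import path that this test module uses (sklearn._loss is not public API and may change without deprecation):

import numpy as np
from sklearn._loss.loss import HalfBinomialLoss  # private module, as in this test file

loss = HalfBinomialLoss()
y_true = np.array([1.0])
raw = np.array([-1e3])  # one of the newly tested extreme raw predictions

print(loss.gradient(y_true=y_true, raw_prediction=raw))  # [-1.], finite after the fix
grad, hess = loss.gradient_hessian(y_true=y_true, raw_prediction=raw)
print(grad, hess)  # [-1.] [0.]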
