ENH Poisson loss for HistGradientBoostingRegressor (#16692) · scikit-learn/scikit-learn@a93b15f · GitHub

Commit a93b15f

Author: Christian Lorentzen

ENH Poisson loss for HistGradientBoostingRegressor (#16692)
1 parent 946fdde · commit a93b15f

7 files changed: +184 −17 lines


doc/modules/ensemble.rst (3 additions, 2 deletions)

@@ -952,8 +952,9 @@ controls the number of iterations of the boosting process::
     >>> clf.score(X_test, y_test)
     0.8965

-Available losses for regression are 'least_squares' and
-'least_absolute_deviation', which is less sensitive to outliers. For
+Available losses for regression are 'least_squares',
+'least_absolute_deviation', which is less sensitive to outliers, and
+'poisson', which is well suited to model counts and frequencies. For
 classification, 'binary_crossentropy' is used for binary classification and
 'categorical_crossentropy' is used for multiclass classification. By default
 the loss is 'auto' and will select the appropriate loss depending on
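As a quick illustration of the documented option, here is a minimal sketch (not part of the commit; the synthetic data and parameter values are made up for illustration):

# Minimal sketch: fitting the new Poisson loss on non-negative count targets.
# The estimator is still experimental here, so the explicit import is needed.
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 5))
y = rng.poisson(lam=np.exp(X[:, 0] / 3))  # non-negative integer targets

model = HistGradientBoostingRegressor(loss='poisson', random_state=0)
model.fit(X, y)
print(model.predict(X[:5]))  # predictions are strictly positive (exp link)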

doc/whats_new/v0.23.rst (5 additions, 0 deletions)

@@ -215,6 +215,11 @@ Changelog
   to obtain the input to the meta estimator.
   :pr:`16539` by :user:`Bill DeRose <wderose>`.

+- |Feature| Added additional option `loss="poisson"` to
+  :class:`ensemble.HistGradientBoostingRegressor`, which adds Poisson deviance
+  with log-link useful for modeling count data.
+  :pr:`16692` by :user:`Christian Lorentzen <lorentzenchr>`
+
 :mod:`sklearn.feature_extraction`
 .................................

sklearn/ensemble/_hist_gradient_boosting/_loss.pyx (31 additions, 2 deletions)

@@ -10,7 +10,7 @@ from cython.parallel import prange
 import numpy as np
 cimport numpy as np

-from libc.math cimport exp
+from libc.math cimport exp, log

 from .common cimport Y_DTYPE_C
 from .common cimport G_H_DTYPE_C

@@ -27,7 +27,7 @@ def _update_gradients_least_squares(

     n_samples = raw_predictions.shape[0]
     for i in prange(n_samples, schedule='static', nogil=True):
-        # Note: a more correct exp is 2 * (raw_predictions - y_true)
+        # Note: a more correct expression is 2 * (raw_predictions - y_true)
         # but since we use 1 for the constant hessian value (and not 2) this
         # is strictly equivalent for the leaves values.
         gradients[i] = raw_predictions[i] - y_true[i]

@@ -87,6 +87,35 @@ def _update_gradients_least_absolute_deviation(
         gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1


+def _update_gradients_hessians_poisson(
+        G_H_DTYPE_C [::1] gradients,  # OUT
+        G_H_DTYPE_C [::1] hessians,  # OUT
+        const Y_DTYPE_C [::1] y_true,  # IN
+        const Y_DTYPE_C [::1] raw_predictions,  # IN
+        const Y_DTYPE_C [::1] sample_weight):  # IN
+
+    cdef:
+        int n_samples
+        int i
+        Y_DTYPE_C y_pred
+
+    n_samples = raw_predictions.shape[0]
+    if sample_weight is None:
+        for i in prange(n_samples, schedule='static', nogil=True):
+            # Note: We use only half of the deviance loss. Therefore, there is
+            # no factor of 2.
+            y_pred = exp(raw_predictions[i])
+            gradients[i] = (y_pred - y_true[i])
+            hessians[i] = y_pred
+    else:
+        for i in prange(n_samples, schedule='static', nogil=True):
+            # Note: We use only half of the deviance loss. Therefore, there is
+            # no factor of 2.
+            y_pred = exp(raw_predictions[i])
+            gradients[i] = (y_pred - y_true[i]) * sample_weight[i]
+            hessians[i] = y_pred * sample_weight[i]
+
+
 def _update_gradients_hessians_binary_crossentropy(
         G_H_DTYPE_C [::1] gradients,  # OUT
         G_H_DTYPE_C [::1] hessians,  # OUT
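For reference, a short sketch of the math behind the new kernel (not part of the commit): with the log-link, the per-sample half Poisson deviance and its derivatives with respect to the raw prediction r_i are

L_i(r_i) = y_i \log\frac{y_i}{\exp(r_i)} - y_i + \exp(r_i),
\qquad
\frac{\partial L_i}{\partial r_i} = \exp(r_i) - y_i,
\qquad
\frac{\partial^2 L_i}{\partial r_i^2} = \exp(r_i),

which is exactly what the loops above compute, multiplied by sample_weight[i] in the weighted branch.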

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py (16 additions, 7 deletions)

@@ -758,11 +758,13 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):

     Parameters
     ----------
-    loss : {'least_squares', 'least_absolute_deviation'}, \
+    loss : {'least_squares', 'least_absolute_deviation', 'poisson'}, \
             optional (default='least_squares')
         The loss function to use in the boosting process. Note that the
-        "least squares" loss actually implements an "half least squares loss"
-        to simplify the computation of the gradient.
+        "least squares" and "poisson" losses actually implement
+        "half least squares loss" and "half poisson deviance" to simplify the
+        computation of the gradient. Furthermore, "poisson" loss internally
+        uses a log-link and requires ``y >= 0``.
     learning_rate : float, optional (default=0.1)
         The learning rate, also known as *shrinkage*. This is used as a
         multiplicative factor for the leaves values. Use ``1`` for no

@@ -868,7 +870,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
     0.92...
     """

-    _VALID_LOSSES = ('least_squares', 'least_absolute_deviation')
+    _VALID_LOSSES = ('least_squares', 'least_absolute_deviation',
+                     'poisson')

     @_deprecate_positional_args
     def __init__(self, loss='least_squares', *, learning_rate=0.1,

@@ -902,14 +905,20 @@ def predict(self, X):
         y : ndarray, shape (n_samples,)
             The predicted values.
         """
-        # Return raw predictions after converting shape
-        # (n_samples, 1) to (n_samples,)
-        return self._raw_predict(X).ravel()
+        check_is_fitted(self)
+        # Return inverse link of raw predictions after converting
+        # shape (n_samples, 1) to (n_samples,)
+        return self.loss_.inverse_link_function(self._raw_predict(X).ravel())

     def _encode_y(self, y):
         # Just convert y to the expected dtype
         self.n_trees_per_iteration_ = 1
         y = y.astype(Y_DTYPE, copy=False)
+        if self.loss == 'poisson':
+            # Ensure y >= 0 and sum(y) > 0
+            if not (np.all(y >= 0) and np.sum(y) > 0):
+                raise ValueError("loss='poisson' requires non-negative y and "
+                                 "sum(y) > 0.")
         return y

     def _get_loss(self, sample_weight):
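A small sketch of the behaviour introduced above (illustrative only; the data and max_iter value are assumptions, not from the commit):

# Sketch: 'poisson' rejects negative targets at fit time and, because predict()
# applies the exp inverse link, always returns strictly positive values.
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = rng.poisson(lam=3.0, size=200).astype(float)

gbdt = HistGradientBoostingRegressor(loss='poisson', max_iter=20, random_state=0)
gbdt.fit(X, y)
assert np.all(gbdt.predict(X) > 0)   # exp inverse link => positive predictions

try:
    gbdt.fit(X, np.full_like(y, -1.0))   # negative targets are rejected
except ValueError as exc:
    print(exc)   # loss='poisson' requires non-negative y and sum(y) > 0.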

sklearn/ensemble/_hist_gradient_boosting/loss.py (52 additions, 2 deletions)

@@ -9,7 +9,7 @@
 from abc import ABC, abstractmethod

 import numpy as np
-from scipy.special import expit, logsumexp
+from scipy.special import expit, logsumexp, xlogy

 from .common import Y_DTYPE
 from .common import G_H_DTYPE

@@ -19,11 +19,13 @@
 from ._loss import _update_gradients_hessians_least_absolute_deviation
 from ._loss import _update_gradients_hessians_binary_crossentropy
 from ._loss import _update_gradients_hessians_categorical_crossentropy
+from ._loss import _update_gradients_hessians_poisson
 from ...utils.stats import _weighted_percentile


 class BaseLoss(ABC):
     """Base class for a loss."""
+
     def __init__(self, hessians_are_constant):
         self.hessians_are_constant = hessians_are_constant

@@ -153,6 +155,7 @@ class LeastSquares(BaseLoss):
     the computation of the gradients and get a unit hessian (and be consistent
     with what is done in LightGBM).
     """
+
     def __init__(self, sample_weight):
         # If sample weights are provided, the hessians and gradients
         # are multiplied by sample_weight, which means the hessians are

@@ -195,6 +198,7 @@ class LeastAbsoluteDeviation(BaseLoss):

         loss(x_i) = |y_true_i - raw_pred_i|
     """
+
     def __init__(self, sample_weight):
         # If sample weights are provided, the hessians and gradients
         # are multiplied by sample_weight, which means the hessians are

@@ -265,6 +269,51 @@ def update_leaves_values(self, grower, y_true, raw_predictions,
         # Note that the regularization is ignored here


+class Poisson(BaseLoss):
+    """Poisson deviance loss with log-link, for regression.
+
+    For a given sample x_i, Poisson deviance loss is defined as::
+
+        loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))
+                    - y_true_i + exp(raw_pred_i)
+
+    This actually computes half the Poisson deviance to simplify
+    the computation of the gradients.
+    """
+
+    def __init__(self, sample_weight):
+        super().__init__(hessians_are_constant=False)
+
+    inverse_link_function = staticmethod(np.exp)
+
+    def pointwise_loss(self, y_true, raw_predictions):
+        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
+        # return a view.
+        raw_predictions = raw_predictions.reshape(-1)
+        # TODO: For speed, we could remove the constant xlogy(y_true, y_true)
+        # Advantage of this form: minimum of zero at exp(raw_predictions) = y_true.
+        loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1)
+                + np.exp(raw_predictions))
+        return loss
+
+    def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):
+        y_pred = np.average(y_train, weights=sample_weight)
+        eps = np.finfo(y_train.dtype).eps
+        y_pred = np.clip(y_pred, eps, None)
+        return np.log(y_pred)
+
+    def update_gradients_and_hessians(self, gradients, hessians, y_true,
+                                      raw_predictions, sample_weight):
+        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
+        # return a view.
+        raw_predictions = raw_predictions.reshape(-1)
+        gradients = gradients.reshape(-1)
+        hessians = hessians.reshape(-1)
+        _update_gradients_hessians_poisson(gradients, hessians,
+                                           y_true, raw_predictions,
+                                           sample_weight)
+
+
 class BinaryCrossEntropy(BaseLoss):
     """Binary cross-entropy loss, for binary classification.

@@ -372,5 +421,6 @@ def predict_proba(self, raw_predictions):
     'least_squares': LeastSquares,
     'least_absolute_deviation': LeastAbsoluteDeviation,
     'binary_crossentropy': BinaryCrossEntropy,
-    'categorical_crossentropy': CategoricalCrossEntropy
+    'categorical_crossentropy': CategoricalCrossEntropy,
+    'poisson': Poisson,
 }
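To see why the gradient used by Poisson is exp(raw_predictions) - y_true, here is a short numerical check of the pointwise loss above (a sketch, not part of the commit):

# Sketch: finite-difference check that the gradient of the half Poisson
# deviance (the same form as Poisson.pointwise_loss) is exp(raw) - y_true.
import numpy as np
from scipy.special import xlogy

rng = np.random.RandomState(0)
y_true = rng.poisson(lam=2.0, size=5).astype(float)
raw = rng.normal(size=5)

def half_poisson_deviance(y, r):
    # Same form as Poisson.pointwise_loss: zero minimum at r = log(y)
    return xlogy(y, y) - y * (r + 1) + np.exp(r)

eps = 1e-7
num_grad = (half_poisson_deviance(y_true, raw + eps)
            - half_poisson_deviance(y_true, raw - eps)) / (2 * eps)
assert np.allclose(num_grad, np.exp(raw) - y_true, atol=1e-5)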

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py (42 additions, 0 deletions)

@@ -2,10 +2,13 @@
 import pytest
 from numpy.testing import assert_allclose, assert_array_equal
 from sklearn.datasets import make_classification, make_regression
+from sklearn.datasets import make_low_rank_matrix
 from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
 from sklearn.model_selection import train_test_split
 from sklearn.base import clone, BaseEstimator, TransformerMixin
 from sklearn.pipeline import make_pipeline
+from sklearn.metrics import mean_poisson_deviance
+from sklearn.dummy import DummyRegressor

 # To use this experimental feature, we need to explicitly ask for it:
 from sklearn.experimental import enable_hist_gradient_boosting  # noqa

@@ -194,6 +197,45 @@ def test_least_absolute_deviation():
     assert gbdt.score(X, y) > .9


+@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])])
+def test_poisson_y_positive(y):
+    # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
+    err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0."
+    gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0)
+    with pytest.raises(ValueError, match=err_msg):
+        gbdt.fit(np.zeros(shape=(len(y), 1)), y)
+
+
+def test_poisson():
+    # For Poisson distributed target, Poisson loss should give better results
+    # than least squares measured in Poisson deviance as metric.
+    rng = np.random.RandomState(42)
+    n_train, n_test, n_features = 500, 100, 100
+    X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features,
+                             random_state=rng)
+    # We create a log-linear Poisson model and downscale coef as it will get
+    # exponentiated.
+    coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
+    y = rng.poisson(lam=np.exp(X @ coef))
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
+                                                        random_state=rng)
+    gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng)
+    gbdt_ls = HistGradientBoostingRegressor(loss='least_squares',
+                                            random_state=rng)
+    gbdt_pois.fit(X_train, y_train)
+    gbdt_ls.fit(X_train, y_train)
+    dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
+
+    for X, y in [(X_train, y_train), (X_test, y_test)]:
+        metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
+        # least_squares might produce non-positive predictions => clip
+        metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15,
+                                                     None))
+        metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
+        assert metric_pois < metric_ls
+        assert metric_pois < metric_dummy
+
+
 def test_binning_train_validation_are_separated():
     # Make sure training and validation data are binned separately.
     # See issue 13926

sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py (35 additions, 4 deletions)

@@ -52,6 +52,9 @@ def get_hessians(y_true, raw_predictions):
     # ('binary_crossentropy', 0.3, 0),
     ('binary_crossentropy', -12, 1),
     ('binary_crossentropy', 30, 1),
+    ('poisson', 12., 1.),
+    ('poisson', 0., 2.),
+    ('poisson', -22., 10.),
 ])
 @pytest.mark.skipif(sp_version == (1, 2, 0),
                     reason='bug in scipy 1.2.0, see scipy issue #9608')

@@ -76,17 +79,19 @@ def fprime(x):
     def fprime2(x):
         return get_hessians(y_true, x)

-    optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2)
+    optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
+                     maxiter=70, tol=2e-8)
     assert np.allclose(loss.inverse_link_function(optimum), y_true)
     assert np.allclose(loss.pointwise_loss(y_true, optimum), 0)
-    assert np.allclose(get_gradients(y_true, optimum), 0)
+    assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7)


 @pytest.mark.parametrize('loss, n_classes, prediction_dim', [
     ('least_squares', 0, 1),
     ('least_absolute_deviation', 0, 1),
     ('binary_crossentropy', 2, 1),
     ('categorical_crossentropy', 3, 3),
+    ('poisson', 0, 1),
 ])
 @pytest.mark.skipif(Y_DTYPE != np.float64,
                     reason='Need 64 bits float precision for numerical checks')

@@ -100,6 +105,8 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0):
     n_samples = 100
     if loss in ('least_squares', 'least_absolute_deviation'):
         y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
+    elif loss in ('poisson'):
+        y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
     else:
         y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
     raw_predictions = rng.normal(

@@ -114,7 +121,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0):

     # Approximate gradients
     # For multiclass loss, we should only change the predictions of one tree
-    # (here the first), hence the use of offset[:, 0] += eps
+    # (here the first), hence the use of offset[0, :] += eps
     # As a softmax is computed, offsetting the whole array by a constant would
     # have no effect on the probabilities, and thus on the loss
     eps = 1e-9

@@ -164,6 +171,27 @@ def test_baseline_least_absolute_deviation():
     assert baseline_prediction == pytest.approx(np.median(y_train))


+def test_baseline_poisson():
+    rng = np.random.RandomState(0)
+
+    loss = _LOSSES['poisson'](sample_weight=None)
+    y_train = rng.poisson(size=100).astype(np.float64)
+    # Sanity check, make sure at least one sample is non-zero so we don't take
+    # log(0)
+    assert y_train.sum() > 0
+    baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
+    assert np.isscalar(baseline_prediction)
+    assert baseline_prediction.dtype == y_train.dtype
+    assert_all_finite(baseline_prediction)
+    # Make sure baseline prediction produces the log of the mean of all targets
+    assert_almost_equal(np.log(y_train.mean()), baseline_prediction)
+
+    # Test baseline for y_true = 0
+    y_train.fill(0.)
+    baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
+    assert_all_finite(baseline_prediction)
+
+
 def test_baseline_binary_crossentropy():
     rng = np.random.RandomState(0)

@@ -215,7 +243,8 @@ def test_baseline_categorical_crossentropy():
     ('least_squares', 'regression'),
     ('least_absolute_deviation', 'regression'),
     ('binary_crossentropy', 'classification'),
-    ('categorical_crossentropy', 'classification')
+    ('categorical_crossentropy', 'classification'),
+    ('poisson', 'poisson_regression'),
 ])
 @pytest.mark.parametrize('sample_weight', ['ones', 'random'])
 def test_sample_weight_multiplies_gradients(loss, problem, sample_weight):

@@ -232,6 +261,8 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight):

     if problem == 'regression':
         y_true = rng.normal(size=n_samples).astype(Y_DTYPE)
+    elif problem == 'poisson_regression':
+        y_true = rng.poisson(size=n_samples).astype(Y_DTYPE)
     else:
         y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE)
