FEA add quantile HGBT (#21800) · scikit-learn/scikit-learn@5ad3421 · GitHub

Commit 5ad3421

FEA add quantile HGBT (#21800)

1 parent c36f926 commit 5ad3421

File tree: 3 files changed, +64 −2 lines changed

doc/whats_new/v1.1.rst

Lines changed: 5 additions & 0 deletions

@@ -305,6 +305,11 @@ Changelog
 :mod:`sklearn.ensemble`
 .......................
 
+- |MajorFeature| Added additional option `loss="quantile"` to
+  :class:`ensemble.HistGradientBoostingRegressor` for modelling quantiles.
+  The quantile level can be specified with the new parameter `quantile`.
+  :pr:`21800` and :pr:`20567` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 - |Efficiency| :meth:`fit` of :class:`ensemble.BaseGradientBoosting` now
   calls :func:`check_array` with parameter `force_all_finite=False` for non
   initial warm-start runs as it has already been checked before.
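
The changelog entry above is the whole new API surface. As a quick illustration of how it is used (a minimal sketch; the toy data and all variable names are invented for this example, not taken from the commit):

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1_000, 1))
    # Heteroscedastic noise, so conditional quantiles differ from the mean.
    y = 3 * X[:, 0] + rng.normal(scale=0.5 + X[:, 0], size=1_000)

    # Fit the conditional 90th percentile instead of the conditional mean.
    reg = HistGradientBoostingRegressor(loss="quantile", quantile=0.9)
    reg.fit(X, y)

    # Roughly 90% of the training targets should fall below the predictions.
    print(np.mean(y <= reg.predict(X)))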

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 19 additions & 2 deletions

@@ -15,6 +15,7 @@
     HalfMultinomialLoss,
     HalfPoissonLoss,
     HalfSquaredError,
+    PinballLoss,
 )
 from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
 from ...utils import check_random_state, resample
@@ -42,6 +43,7 @@
     "least_squares": HalfSquaredError,
     "least_absolute_deviation": AbsoluteError,
     "poisson": HalfPoissonLoss,
+    "quantile": PinballLoss,
     "binary_crossentropy": HalfBinomialLoss,
     "categorical_crossentropy": HalfMultinomialLoss,
 }
@@ -1115,17 +1117,21 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
 
     Parameters
     ----------
-    loss : {'squared_error', 'absolute_error', 'poisson'}, \
+    loss : {'squared_error', 'absolute_error', 'poisson', 'quantile'}, \
             default='squared_error'
         The loss function to use in the boosting process. Note that the
         "squared error" and "poisson" losses actually implement
         "half least squares loss" and "half poisson deviance" to simplify the
         computation of the gradient. Furthermore, "poisson" loss internally
         uses a log-link and requires ``y >= 0``.
+        "quantile" uses the pinball loss.
 
         .. versionchanged:: 0.23
            Added option 'poisson'.
 
+        .. versionchanged:: 1.1
+           Added option 'quantile'.
+
         .. deprecated:: 1.0
             The loss 'least_squares' was deprecated in v1.0 and will be removed
             in version 1.2. Use `loss='squared_error'` which is equivalent.
@@ -1135,6 +1141,9 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
             be removed in version 1.2. Use `loss='absolute_error'` which is
             equivalent.
 
+    quantile : float, default=None
+        If loss is "quantile", this parameter specifies the quantile to be estimated
+        and must be between 0 and 1.
     learning_rate : float, default=0.1
         The learning rate, also known as *shrinkage*. This is used as a
         multiplicative factor for the leaves values. Use ``1`` for no
@@ -1294,12 +1303,14 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         "absolute_error",
         "least_absolute_deviation",
         "poisson",
+        "quantile",
     )
 
     def __init__(
         self,
         loss="squared_error",
         *,
+        quantile=None,
         learning_rate=0.1,
         max_iter=100,
         max_leaf_nodes=31,
@@ -1338,6 +1349,7 @@ def __init__(
             verbose=verbose,
             random_state=random_state,
         )
+        self.quantile = quantile
 
     def predict(self, X):
         """Predict values for X.
@@ -1409,7 +1421,12 @@ def _get_loss(self, sample_weight):
             )
             return _LOSSES["absolute_error"](sample_weight=sample_weight)
 
-        return _LOSSES[self.loss](sample_weight=sample_weight)
+        if self.loss == "quantile":
+            return _LOSSES[self.loss](
+                sample_weight=sample_weight, quantile=self.quantile
+            )
+        else:
+            return _LOSSES[self.loss](sample_weight=sample_weight)
 
 
 class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
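
In `_get_loss`, "quantile" is the only loss that needs an extra constructor argument, hence the special case forwarding `self.quantile` to `PinballLoss`. For reference, the pinball loss is the standard asymmetric absolute error; a minimal NumPy sketch of that textbook definition (not the internal `sklearn._loss` implementation):

    import numpy as np

    def pinball_loss(y_true, y_pred, quantile):
        # Residuals are penalized asymmetrically: under-predictions
        # (y_true > y_pred) are weighted by `quantile`, over-predictions
        # by `1 - quantile`. quantile=0.5 gives half the absolute error.
        diff = y_true - y_pred
        return np.mean(np.where(diff >= 0, quantile * diff, (quantile - 1) * diff))

Minimizing this expected loss over predictions yields the q-quantile of the target distribution, which is why fitting with `loss="quantile"` produces quantile estimates rather than mean estimates.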

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 40 additions & 0 deletions

@@ -7,6 +7,7 @@
     HalfMultinomialLoss,
     HalfPoissonLoss,
     HalfSquaredError,
+    PinballLoss,
 )
 from sklearn.datasets import make_classification, make_regression
 from sklearn.datasets import make_low_rank_matrix
@@ -35,6 +36,7 @@
     "squared_error": HalfSquaredError,
     "absolute_error": AbsoluteError,
     "poisson": HalfPoissonLoss,
+    "quantile": PinballLoss,
     "binary_crossentropy": HalfBinomialLoss,
     "categorical_crossentropy": HalfMultinomialLoss,
 }
@@ -249,6 +251,44 @@ def test_absolute_error_sample_weight():
     gbdt.fit(X, y, sample_weight=sample_weight)
 
 
+@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
+def test_asymmetric_error(quantile):
+    """Test quantile regression for asymmetric distributed targets."""
+    n_samples = 10_000
+    rng = np.random.RandomState(42)
+    # take care that X @ coef + intercept > 0
+    X = np.concatenate(
+        (
+            np.abs(rng.randn(n_samples)[:, None]),
+            -rng.randint(2, size=(n_samples, 1)),
+        ),
+        axis=1,
+    )
+    intercept = 1.23
+    coef = np.array([0.5, -2])
+    # For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
+    # the quantile at level q is:
+    #   quantile(q) = - log(1 - q) / lambda
+    #   scale = 1/lambda = -quantile(q) / log(1 - q)
+    y = rng.exponential(
+        scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples
+    )
+    model = HistGradientBoostingRegressor(
+        loss="quantile",
+        quantile=quantile,
+        max_iter=25,
+        random_state=0,
+        max_leaf_nodes=10,
+    ).fit(X, y)
+    assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=1e-2)
+
+    pinball_loss = PinballLoss(quantile=quantile)
+    loss_true_quantile = pinball_loss(y, X @ coef + intercept)
+    loss_pred_quantile = pinball_loss(y, model.predict(X))
+    # we are overfitting
+    assert loss_pred_quantile <= loss_true_quantile
+
+
 @pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 0.0, 0.0])])
 def test_poisson_y_positive(y):
     # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
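
The test's choice of `scale` comes from inverting the exponential CDF: with CDF(x) = 1 - exp(-lambda * x), solving CDF(x) = q gives quantile(q) = -log(1 - q) / lambda, so setting scale = 1/lambda = -(X @ coef + intercept) / log(1 - q) places the true conditional q-quantile exactly at X @ coef + intercept. A standalone sanity check of that algebra (not part of the test file; the values here are chosen arbitrarily):

    import numpy as np

    q, scale = 0.8, 1.5  # scale = 1 / lambda
    rng = np.random.RandomState(0)
    samples = rng.exponential(scale=scale, size=1_000_000)
    theoretical = -scale * np.log(1 - q)  # quantile(q) = -log(1 - q) / lambda
    empirical = np.quantile(samples, q)
    print(theoretical, empirical)  # agree to ~2 decimals at this sample size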

0 commit comments