ENH Adds Poisson criterion in RandomForestRegressor (#19836) · scikit-learn/scikit-learn@36915ae

Commit 36915ae

Authored by bsun94, lorentzenchr, azihna (Alihan Zihna), and cmarmo

ENH Adds Poisson criterion in RandomForestRegressor (#19836)

Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>
Co-authored-by: Alihan Zihna <alihanz@gmail.com>
Co-authored-by: Alihan Zihna <a.zihna@ckhgbdp.onmicrosoft.com>
Co-authored-by: Chiara Marmo <cmarmo@users.noreply.github.com>
Co-authored-by: Olivier Grisel <olivier.grisel@gmail.com>
Co-authored-by: naozin555 <37050583+naozin555@users.noreply.github.com>
Co-authored-by: Venkatachalam N <venky.yuvy@gmail.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>

1 parent a1a6b3a · commit 36915ae

File tree

3 files changed: +108 -3 lines changed

doc/whats_new/v1.0.rst
sklearn/ensemble/_forest.py
sklearn/ensemble/tests/test_forest.py

doc/whats_new/v1.0.rst

Lines changed: 4 additions & 0 deletions
@@ -270,6 +270,10 @@ Changelog
   :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`.
   :pr:`19564` by `Thomas Fan`_.

+- |Enhancement| Documented and tested support of the Poisson criterion for
+  :class:`ensemble.RandomForestRegressor`. :pr:`19836` by
+  :user:`Brian Sun <bsun94>`.
+
 - |Fix| Fixed the range of the argument max_samples to be (0.0, 1.0]
   in :class:`ensemble.RandomForestClassifier`,
   :class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is

sklearn/ensemble/_forest.py

Lines changed: 15 additions & 3 deletions
@@ -323,6 +323,14 @@ def fit(self, X, y, sample_weight=None):
         # [:, np.newaxis] that does not.
         y = np.reshape(y, (-1, 1))

+        if self.criterion == "poisson":
+            if np.any(y < 0):
+                raise ValueError("Some value(s) of y are negative which is "
+                                 "not allowed for Poisson regression.")
+            if np.sum(y) <= 0:
+                raise ValueError("Sum of y is not strictly positive which "
+                                 "is necessary for Poisson regression.")
+
         self.n_outputs_ = y.shape[1]

         y, expanded_class_weight = self._validate_y_class_weight(y)
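Note: a minimal sketch of how these new fit-time checks surface to a user. The criterion option and error messages come from the hunk above; the toy X and y are made up for illustration.

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    est = RandomForestRegressor(criterion="poisson")
    X = np.zeros((3, 3))

    try:
        est.fit(X, [-1, 1, 3])  # one negative target value
    except ValueError as exc:
        print(exc)  # Some value(s) of y are negative which is not allowed ...

    try:
        est.fit(X, [0, 0, 0])  # sum of y is not strictly positive
    except ValueError as exc:
        print(exc)  # Sum of y is not strictly positive which is necessary ...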
@@ -1324,16 +1332,20 @@ class RandomForestRegressor(ForestRegressor):
         The default value of ``n_estimators`` changed from 10 to 100
         in 0.22.

-    criterion : {"squared_error", "mse", "absolute_error", "mae"}, \
+    criterion : {"squared_error", "mse", "absolute_error", "poisson"}, \
             default="squared_error"
         The function to measure the quality of a split. Supported criteria
         are "squared_error" for the mean squared error, which is equal to
-        variance reduction as feature selection criterion, and "absolute_error"
-        for the mean absolute error.
+        variance reduction as feature selection criterion, "absolute_error"
+        for the mean absolute error, and "poisson" which uses reduction in
+        Poisson deviance to find splits.

         .. versionadded:: 0.18
            Mean Absolute Error (MAE) criterion.

+        .. versionadded:: 1.0
+           Poisson criterion.
+
         .. deprecated:: 1.0
             Criterion "mse" was deprecated in v1.0 and will be removed in
             version 1.2. Use `criterion="squared_error"` which is equivalent.
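Note: an end-to-end sketch of the documented option. Only criterion="poisson" and mean_poisson_deviance are taken from this commit; the synthetic data and hyper-parameters below are arbitrary illustrative choices.

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_poisson_deviance
    from sklearn.model_selection import train_test_split

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1000, 10))
    coef = rng.uniform(low=-1, high=1, size=10)
    y = rng.poisson(lam=np.exp(X @ coef))  # non-negative count targets

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = RandomForestRegressor(criterion="poisson", min_samples_leaf=10,
                                random_state=0).fit(X_train, y_train)
    # Leaf values are means over strictly positive-sum targets, so the
    # predictions stay positive and the deviance below is well defined.
    print(mean_poisson_deviance(y_test, reg.predict(X_test)))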

sklearn/ensemble/tests/test_forest.py

Lines changed: 89 additions & 0 deletions
@@ -27,6 +27,8 @@
 import joblib
 from numpy.testing import assert_allclose

+from sklearn.dummy import DummyRegressor
+from sklearn.metrics import mean_poisson_deviance
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_array_equal
@@ -185,6 +187,76 @@ def test_regression(name, criterion):
     check_regression_criterion(name, criterion)


+def test_poisson_vs_mse():
+    """Test that random forest with poisson criterion performs better than
+    mse for a poisson target."""
+    rng = np.random.RandomState(42)
+    n_train, n_test, n_features = 500, 500, 10
+    X = datasets.make_low_rank_matrix(n_samples=n_train + n_test,
+                                      n_features=n_features, random_state=rng)
+    X = np.abs(X)
+    X /= np.max(np.abs(X), axis=0)
+    # We create a log-linear Poisson model
+    coef = rng.uniform(low=-4, high=1, size=n_features)
+    y = rng.poisson(lam=np.exp(X @ coef))
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
+                                                        random_state=rng)
+
+    forest_poi = RandomForestRegressor(
+        criterion="poisson",
+        min_samples_leaf=10,
+        max_features="sqrt",
+        random_state=rng)
+    forest_mse = RandomForestRegressor(
+        criterion="squared_error",
+        min_samples_leaf=10,
+        max_features="sqrt",
+        random_state=rng)
+
+    forest_poi.fit(X_train, y_train)
+    forest_mse.fit(X_train, y_train)
+    dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
+
+    for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]:
+        metric_poi = mean_poisson_deviance(y, forest_poi.predict(X))
+        # squared_error forest might produce non-positive predictions => clip
+        # If y = 0 for those, the poisson deviance gets too good.
+        # If we drew more samples, we would eventually get y > 0 and the
+        # poisson deviance would explode, i.e. be undefined. Therefore, we do
+        # not clip to a tiny value like 1e-15, but to 1e-6. This acts like a
+        # mild penalty to the non-positive predictions.
+        metric_mse = mean_poisson_deviance(
+            y,
+            np.clip(forest_mse.predict(X), 1e-6, None))
+        metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
+        # As squared_error might correctly predict 0 in train set, its train
+        # score can be better than Poisson. This is no longer the case for the
+        # test set. But keep the above comment for clipping in mind.
+        if val == "test":
+            assert metric_poi < metric_mse
+        assert metric_poi < metric_dummy
+
+
+@pytest.mark.parametrize('criterion', ('poisson', 'squared_error'))
+def test_balance_property_random_forest(criterion):
+    """Test that sum(y_pred) == sum(y_true) on the training set."""
+    rng = np.random.RandomState(42)
+    n_train, n_test, n_features = 500, 500, 10
+    X = datasets.make_low_rank_matrix(n_samples=n_train + n_test,
+                                      n_features=n_features, random_state=rng)
+
+    coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
+    y = rng.poisson(lam=np.exp(X @ coef))
+
+    reg = RandomForestRegressor(criterion=criterion,
+                                n_estimators=10,
+                                bootstrap=False,
+                                random_state=rng)
+    reg.fit(X, y)
+
+    assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
+
+
 def check_regressor_attributes(name):
     # Regression models should not have a classes_ attribute.
     r = FOREST_REGRESSORS[name](random_state=0)
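Note on the balance property tested above: with bootstrap=False every tree is fit on the full training set, and a regression tree's leaf value is the mean of the targets in that leaf for both criteria, so each tree's training predictions sum to sum(y), and so does the forest average. A quick standalone check (toy data, not from the commit):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(200, 4))
    y = rng.poisson(lam=2.0, size=200)

    reg = RandomForestRegressor(criterion="poisson", n_estimators=10,
                                bootstrap=False, random_state=0).fit(X, y)
    # The two sums agree up to floating-point error on the training set.
    print(np.sum(reg.predict(X)), np.sum(y))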
@@ -1367,6 +1439,23 @@ def test_min_impurity_decrease():
     assert tree.min_impurity_decrease == 0.1


+def test_poisson_y_positive_check():
+    est = RandomForestRegressor(criterion="poisson")
+    X = np.zeros((3, 3))
+
+    y = [-1, 1, 3]
+    err_msg = (r"Some value\(s\) of y are negative which is "
+               r"not allowed for Poisson regression.")
+    with pytest.raises(ValueError, match=err_msg):
+        est.fit(X, y)
+
+    y = [0, 0, 0]
+    err_msg = (r"Sum of y is not strictly positive which "
+               r"is necessary for Poisson regression.")
+    with pytest.raises(ValueError, match=err_msg):
+        est.fit(X, y)
+
+
 # mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type
 class MyBackend(DEFAULT_JOBLIB_BACKEND):  # type: ignore
     def __init__(self, *args, **kwargs):
0 commit comments
