diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 6570a10a064c9..cf4b4858538dd 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -34,6 +34,8 @@ from ._utils cimport safe_realloc
 from ._utils cimport sizet_ptr_to_ndarray
 from ._utils cimport WeightedMedianCalculator
 
+# EPSILON is used in the Poisson criterion
+cdef double EPSILON = 10 * np.finfo('double').eps
 
 cdef class Criterion:
     """Interface for impurity criteria.
@@ -1384,10 +1386,13 @@ cdef class Poisson(RegressionCriterion):
         cdef double y_mean_right = 0.
 
         for k in range(self.n_outputs):
-            if (self.sum_left[k] <= 0) or (self.sum_right[k] <= 0):
+            if (self.sum_left[k] <= EPSILON) or (self.sum_right[k] <= EPSILON):
                 # Poisson loss does not allow non-positive predictions. We
                 # therefore forbid splits that have child nodes with
                 # sum(y_i) <= 0.
+                # Since sum_right = sum_total - sum_left, floating point
+                # rounding errors can keep an exact zero from being detected.
+                # We therefore relax the comparison to sum(y_i) <= EPSILON.
                 return -INFINITY
             else:
                 y_mean_left = self.sum_left[k] / self.weighted_n_left
@@ -1436,11 +1441,16 @@ cdef class Poisson(RegressionCriterion):
         cdef SIZE_t n_outputs = self.n_outputs
 
         for k in range(n_outputs):
-            y_mean = y_sum[k] / weight_sum
-
-            if y_mean <= 0:
+            if y_sum[k] <= EPSILON:
+                # y_sum may stem from the subtraction
+                # sum_right = sum_total - sum_left, which is subject to
+                # floating point rounding errors.
+                # We therefore relax the comparison y_sum <= 0 to
+                # y_sum <= EPSILON.
                 return INFINITY
 
+            y_mean = y_sum[k] / weight_sum
+
             for p in range(start, end):
                 i = self.samples[p]
 
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py
index d9a9d5c356c58..32480ed9bbf82 100644
--- a/sklearn/tree/tests/test_tree.py
+++ b/sklearn/tree/tests/test_tree.py
@@ -1987,18 +1987,19 @@ def test_balance_property(criterion, Tree):
     assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
 
 
-def test_poisson_zero_nodes():
+@pytest.mark.parametrize("seed", range(3))
+def test_poisson_zero_nodes(seed):
     # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes.
     X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]]
     y = [0, 0, 0, 0, 1, 2, 3, 4]
     # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can
     # easily learn that:
-    reg = DecisionTreeRegressor(criterion="mse", random_state=1)
+    reg = DecisionTreeRegressor(criterion="mse", random_state=seed)
     reg.fit(X, y)
     assert np.amin(reg.predict(X)) == 0
 
     # whereas Poisson must predict strictly positive numbers
-    reg = DecisionTreeRegressor(criterion="poisson", random_state=1)
+    reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
     reg.fit(X, y)
     assert np.all(reg.predict(X) > 0)
 
@@ -2009,12 +2010,13 @@ def test_poisson_zero_nodes():
         n_samples=1_000,
         n_features=n_features,
         n_informative=n_features * 2 // 3,
+        random_state=seed,
     )
     # some excess zeros
     y[(-1 < y) & (y < 0)] = 0
     # make sure the target is positive
     y = np.abs(y)
-    reg = DecisionTreeRegressor(criterion='poisson', random_state=42)
+    reg = DecisionTreeRegressor(criterion='poisson', random_state=seed)
     reg.fit(X, y)
     assert np.all(reg.predict(X) > 0)
 
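For reference, the rounding issue that motivates EPSILON can be reproduced outside the tree code. The snippet below is a minimal illustration and not part of the patch; the hand-picked literals merely stand in for the different accumulation orders the splitter uses when it derives sum_right from sum_total - sum_left.

import numpy as np

EPSILON = 10 * np.finfo('double').eps  # same constant as in the patch

# The right child truly holds no positive targets, so mathematically
# sum_right == 0. Computing it by subtraction leaves a rounding residue,
# because sum_total and sum_left were accumulated in different orders.
sum_total = 0.1 + 0.2   # -> 0.30000000000000004
sum_left = 0.3          # same mass, represented slightly differently
sum_right = sum_total - sum_left

print(sum_right)              # 5.551115123125783e-17, not 0.0
print(sum_right <= 0)         # False: the strict check misses this child
print(sum_right <= EPSILON)   # True: the relaxed check rejects the split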