FIX numerical rounding issue causing test_poisson_zero_nodes to fail at random by ogrisel · Pull Request #18737 · scikit-learn/scikit-learn · GitHub

Merged
18 changes: 14 additions & 4 deletions sklearn/tree/_criterion.pyx
@@ -34,6 +34,8 @@ from ._utils cimport safe_realloc
 from ._utils cimport sizet_ptr_to_ndarray
 from ._utils cimport WeightedMedianCalculator

+# EPSILON is used in the Poisson criterion
+cdef double EPSILON = 10 * np.finfo('double').eps

 cdef class Criterion:
     """Interface for impurity criteria.
@@ -1384,10 +1386,13 @@ cdef class Poisson(RegressionCriterion):
         cdef double y_mean_right = 0.

         for k in range(self.n_outputs):
-            if (self.sum_left[k] <= 0) or (self.sum_right[k] <= 0):
+            if (self.sum_left[k] <= EPSILON) or (self.sum_right[k] <= EPSILON):
                 # Poisson loss does not allow non-positive predictions. We
                 # therefore forbid splits that have child nodes with
                 # sum(y_i) <= 0.
+                # Since sum_right = sum_total - sum_left, rounding error can
+                # leave these sums slightly nonzero instead of exactly zero.
+                # Thus, we relax the above comparison to sum(y_i) <= EPSILON.
                 return -INFINITY
             else:
                 y_mean_left = self.sum_left[k] / self.weighted_n_left
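
Condensed into pure Python, the split guard amounts to the following sketch; split_allowed is a hypothetical helper for illustration, not part of the Cython class.

import numpy as np

EPSILON = 10 * np.finfo('double').eps  # mirrors the module-level constant above

def split_allowed(sum_total, sum_left):
    """Reject splits where a child has a (numerically) non-positive y sum."""
    sum_right = sum_total - sum_left  # derived by subtraction, hence noisy
    return sum_left > EPSILON and sum_right > EPSILON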
@@ -1436,11 +1441,16 @@
         cdef SIZE_t n_outputs = self.n_outputs

         for k in range(n_outputs):
-            y_mean = y_sum[k] / weight_sum
-
-            if y_mean <= 0:
+            if y_sum[k] <= EPSILON:
+                # y_sum could be computed from the subtraction
+                # sum_right = sum_total - sum_left leading to a potential
+                # floating point rounding error.
+                # Thus, we relax the comparison y_sum <= 0 to
+                # y_sum <= EPSILON.
                 return INFINITY

+            y_mean = y_sum[k] / weight_sum
+
             for p in range(start, end):
                 i = self.samples[p]

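For context on why non-positive means are forbidden here: the Poisson impurity is based on the Poisson deviance, which involves log(y / y_pred) and diverges as the predicted mean approaches zero. A rough NumPy sketch of that loss follows; the actual Cython implementation differs in detail.

import numpy as np

def poisson_half_deviance(y, mu):
    """Mean Poisson half deviance; the y == 0 terms are defined as 0."""
    y = np.asarray(y, dtype=float)
    ylogy = np.where(y > 0, y * np.log(np.where(y > 0, y, 1.0) / mu), 0.0)
    return np.mean(ylogy - y + mu)

print(poisson_half_deviance([0.0, 1.0, 2.0], mu=1.0))     # ~0.46, finite
print(poisson_half_deviance([0.0, 1.0, 2.0], mu=1e-300))  # ~690, diverges as mu -> 0+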
10 changes: 6 additions & 4 deletions sklearn/tree/tests/test_tree.py
@@ -1987,18 +1987,19 @@ def test_balance_property(criterion, Tree):
     assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))


-def test_poisson_zero_nodes():
+@pytest.mark.parametrize("seed", range(3))
+def test_poisson_zero_nodes(seed):
     # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes.
     X = [[0, 0], [0, 1], [0, 2], [0, 3],
          [1, 0], [1, 2], [1, 2], [1, 3]]
     y = [0, 0, 0, 0, 1, 2, 3, 4]
     # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can
     # easily learn that:
-    reg = DecisionTreeRegressor(criterion="mse", random_state=1)
+    reg = DecisionTreeRegressor(criterion="mse", random_state=seed)
     reg.fit(X, y)
     assert np.amin(reg.predict(X)) == 0
     # whereas Poisson must predict strictly positive numbers
-    reg = DecisionTreeRegressor(criterion="poisson", random_state=1)
+    reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
     reg.fit(X, y)
     assert np.all(reg.predict(X) > 0)

@@ -2009,12 +2010,13 @@ def test_poisson_zero_nodes():
         n_samples=1_000,
         n_features=n_features,
         n_informative=n_features * 2 // 3,
+        random_state=seed,
     )
     # some excess zeros
     y[(-1 < y) & (y < 0)] = 0
     # make sure the target is positive
     y = np.abs(y)
-    reg = DecisionTreeRegressor(criterion='poisson', random_state=42)
+    reg = DecisionTreeRegressor(criterion='poisson', random_state=seed)
     reg.fit(X, y)
     assert np.all(reg.predict(X) > 0)

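A self-contained repro in the spirit of the strengthened test (assumes a build where criterion="poisson" is available, i.e. scikit-learn >= 0.24): across several seeds, the Poisson tree must never predict a non-positive value, even with many exact zeros in the target.

import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor

for seed in range(3):
    X, y = datasets.make_regression(n_samples=1_000, n_features=10,
                                    n_informative=6, random_state=seed)
    y[(-1 < y) & (y < 0)] = 0   # inject excess zeros
    y = np.abs(y)               # Poisson requires non-negative targets
    reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
    reg.fit(X, y)
    assert np.all(reg.predict(X) > 0)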