From ccd8b5f396556f621c40db79e00821d8231c1dba Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 2 Nov 2020 21:48:59 +0100 Subject: [PATCH 1/8] Fix the seed of test data in test_poisson_zero_nodes --- sklearn/tree/tests/test_tree.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index d9a9d5c356c58..ad0df12e83ed5 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1988,17 +1988,18 @@ def test_balance_property(criterion, Tree): def test_poisson_zero_nodes(): + seed = 42 # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]] y = [0, 0, 0, 0, 1, 2, 3, 4] # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can # easily learn that: - reg = DecisionTreeRegressor(criterion="mse", random_state=1) + reg = DecisionTreeRegressor(criterion="mse", random_state=seed) reg.fit(X, y) assert np.amin(reg.predict(X)) == 0 # whereas Poisson must predict strictly positive numbers - reg = DecisionTreeRegressor(criterion="poisson", random_state=1) + reg = DecisionTreeRegressor(criterion="poisson", random_state=seed) reg.fit(X, y) assert np.all(reg.predict(X) > 0) @@ -2009,12 +2010,13 @@ def test_poisson_zero_nodes(): n_samples=1_000, n_features=n_features, n_informative=n_features * 2 // 3, + random_state=seed, ) # some excess zeros y[(-1 < y) & (y < 0)] = 0 # make sure the target is positive y = np.abs(y) - reg = DecisionTreeRegressor(criterion='poisson', random_state=42) + reg = DecisionTreeRegressor(criterion='poisson', random_state=seed) reg.fit(X, y) assert np.all(reg.predict(X) > 0) From a9c3d751e7fcd8fdac6387c52ae8a65dfebacb61 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 2 Nov 2020 21:55:39 +0100 Subject: [PATCH 2/8] Check the stability of the test on 100 different dataset seeds --- sklearn/tree/tests/test_tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index ad0df12e83ed5..eaf40c9d93526 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1987,8 +1987,8 @@ def test_balance_property(criterion, Tree): assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) -def test_poisson_zero_nodes(): - seed = 42 +@pytest.mark.parametrize("seed", range(100)) +def test_poisson_zero_nodes(seed): # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]] From b888364cc6764304f24e59e957d77d9d0e449728 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 4 Nov 2020 15:23:37 +0100 Subject: [PATCH 3/8] Use EPSILON instead of 0 --- sklearn/tree/_criterion.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 6570a10a064c9..2dbc0dd0488b6 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -34,6 +34,7 @@ from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray from ._utils cimport WeightedMedianCalculator +cdef double EPSILON = np.finfo('double').eps cdef class Criterion: """Interface for impurity criteria. @@ -1384,7 +1385,7 @@ cdef class Poisson(RegressionCriterion): cdef double y_mean_right = 0. 
for k in range(self.n_outputs): - if (self.sum_left[k] <= 0) or (self.sum_right[k] <= 0): + if (self.sum_left[k] <= EPSILON) or (self.sum_right[k] <= EPSILON): # Poisson loss does not allow non-positive predictions. We # therefore forbid splits that have child nodes with # sum(y_i) <= 0. @@ -1438,7 +1439,7 @@ cdef class Poisson(RegressionCriterion): for k in range(n_outputs): y_mean = y_sum[k] / weight_sum - if y_mean <= 0: + if y_mean <= EPSILON: return INFINITY for p in range(start, end): From 045ff4a0a71bf0b09a624d3a2c373e329b211401 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 4 Nov 2020 15:30:24 +0100 Subject: [PATCH 4/8] iter --- sklearn/tree/_criterion.pyx | 8 ++++++++ sklearn/tree/tests/test_tree.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2dbc0dd0488b6..768c09a405214 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1389,6 +1389,9 @@ cdef class Poisson(RegressionCriterion): # Poisson loss does not allow non-positive predictions. We # therefore forbid splits that have child nodes with # sum(y_i) <= 0. + # Since sum_right = sum_total - sum_left, it can lead to + # floating point rounding error and will not give zero. Thus, + # we relax the above comparison to sum(y_i) <= EPSILON. return -INFINITY else: y_mean_left = self.sum_left[k] / self.weighted_n_left @@ -1440,6 +1443,11 @@ cdef class Poisson(RegressionCriterion): y_mean = y_sum[k] / weight_sum if y_mean <= EPSILON: + # y_mean could be computed from the subtraction + # sum_right = sum_total - sum_left leading to a potential + # floating point rounding error. + # Thus, we relax the comparison y_mean <= 0 to + # y_mean <= EPSILON. return INFINITY for p in range(start, end): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index eaf40c9d93526..de32c663ccf0a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1987,7 +1987,7 @@ def test_balance_property(criterion, Tree): assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) -@pytest.mark.parametrize("seed", range(100)) +@pytest.mark.parametrize("seed", range(10000)) def test_poisson_zero_nodes(seed): # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3], From acde48f81ec6bf3a121b678123b96fbaa60ca45b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 4 Nov 2020 15:47:03 +0100 Subject: [PATCH 5/8] use y_sum --- sklearn/tree/_criterion.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 768c09a405214..b09138f0510aa 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1440,16 +1440,16 @@ cdef class Poisson(RegressionCriterion): cdef SIZE_t n_outputs = self.n_outputs for k in range(n_outputs): - y_mean = y_sum[k] / weight_sum - - if y_mean <= EPSILON: - # y_mean could be computed from the subtraction + if y_sum[k] <= EPSILON: + # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential # floating point rounding error. - # Thus, we relax the comparison y_mean <= 0 to - # y_mean <= EPSILON. + # Thus, we relax the comparison y_sum <= 0 to + # y_sum <= EPSILON. 
return INFINITY + y_mean = y_sum[k] / weight_sum + for p in range(start, end): i = self.samples[p] From 3b5401abb3aecc521e2482c52cbfb29c7684c3fb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 4 Nov 2020 16:19:00 +0100 Subject: [PATCH 6/8] reduce n_seeds --- sklearn/tree/tests/test_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index de32c663ccf0a..552637c13a704 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1987,7 +1987,7 @@ def test_balance_property(criterion, Tree): assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) -@pytest.mark.parametrize("seed", range(10000)) +@pytest.mark.parametrize("seed", range(1000)) def test_poisson_zero_nodes(seed): # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3], From 31d969e4c26ebec6312205b7d70f88baceb34578 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 4 Nov 2020 16:21:26 +0100 Subject: [PATCH 7/8] increase tolerance --- sklearn/tree/_criterion.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index b09138f0510aa..02bacc788c7bf 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1385,7 +1385,8 @@ cdef class Poisson(RegressionCriterion): cdef double y_mean_right = 0. for k in range(self.n_outputs): - if (self.sum_left[k] <= EPSILON) or (self.sum_right[k] <= EPSILON): + if ((self.sum_left[k] <= 10 * EPSILON) or + (self.sum_right[k] <= 10 * EPSILON)): # Poisson loss does not allow non-positive predictions. We # therefore forbid splits that have child nodes with # sum(y_i) <= 0. @@ -1440,7 +1441,7 @@ cdef class Poisson(RegressionCriterion): cdef SIZE_t n_outputs = self.n_outputs for k in range(n_outputs): - if y_sum[k] <= EPSILON: + if y_sum[k] <= 10 * EPSILON: # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential # floating point rounding error. From 3dd7a4e9d744c03c1ecaef4273ad4095dfbf7497 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 4 Nov 2020 19:47:35 +0100 Subject: [PATCH 8/8] cosmit --- sklearn/tree/_criterion.pyx | 8 ++++---- sklearn/tree/tests/test_tree.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 02bacc788c7bf..cf4b4858538dd 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -34,7 +34,8 @@ from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray from ._utils cimport WeightedMedianCalculator -cdef double EPSILON = np.finfo('double').eps +# EPSILON is used in the Poisson criterion +cdef double EPSILON = 10 * np.finfo('double').eps cdef class Criterion: """Interface for impurity criteria. @@ -1385,8 +1386,7 @@ cdef class Poisson(RegressionCriterion): cdef double y_mean_right = 0. for k in range(self.n_outputs): - if ((self.sum_left[k] <= 10 * EPSILON) or - (self.sum_right[k] <= 10 * EPSILON)): + if (self.sum_left[k] <= EPSILON) or (self.sum_right[k] <= EPSILON): # Poisson loss does not allow non-positive predictions. We # therefore forbid splits that have child nodes with # sum(y_i) <= 0. 
@@ -1441,7 +1441,7 @@ cdef class Poisson(RegressionCriterion): cdef SIZE_t n_outputs = self.n_outputs for k in range(n_outputs): - if y_sum[k] <= 10 * EPSILON: + if y_sum[k] <= EPSILON: # y_sum could be computed from the subtraction # sum_right = sum_total - sum_left leading to a potential # floating point rounding error. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 552637c13a704..32480ed9bbf82 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1987,7 +1987,7 @@ def test_balance_property(criterion, Tree): assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) -@pytest.mark.parametrize("seed", range(1000)) +@pytest.mark.parametrize("seed", range(3)) def test_poisson_zero_nodes(seed): # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3],