From 7860b7e90755c5cfc1f90e9c0f542a5d12145336 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 6 Oct 2018 14:43:59 +0200 Subject: [PATCH 1/9] fix the issue with max_depth and BestFirstTreeBuilder --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 911e63bbf6ed4..12df7e6265af8 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -450,7 +450,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): impurity = splitter.node_impurity() n_node_samples = end - start - is_leaf = (depth > self.max_depth or + is_leaf = (depth >= self.max_depth or n_node_samples < self.min_samples_split or n_node_samples < 2 * self.min_samples_leaf or weighted_n_node_samples < 2 * self.min_weight_leaf or From fe3b63c49db4ff64718b596265fa64b8ab09a094 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 6 Oct 2018 14:45:21 +0200 Subject: [PATCH 2/9] fix the test --- sklearn/tree/tests/test_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 37eb6582c7023..137c88dbcde7c 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1234,7 +1234,7 @@ def test_max_leaf_nodes_max_depth(): for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) tree = est.tree_ - assert_greater(tree.max_depth, 1) + assert_equal(tree.max_depth, 1) def test_arrays_persist(): From 1bfe726d3e1bb71303052a50312e0639f342deac Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 10 Oct 2018 16:06:32 +0200 Subject: [PATCH 3/9] fix max_depth overshoot in BFS expansion --- sklearn/tree/_tree.pyx | 11 +++++++++++ sklearn/tree/tests/test_tree.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ed259c98ac850..d9b40fcd38bef 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -24,6 +24,7 @@ from libc.string cimport memcpy from libc.string cimport memset import numpy as np +import warnings cimport numpy as np np.import_array() @@ -456,6 +457,16 @@ cdef class BestFirstTreeBuilder(TreeBuilder): weighted_n_node_samples < 2 * self.min_weight_leaf or impurity <= min_impurity_split) + if (depth >= self.max_depth and + not (n_node_samples < self.min_samples_split or + n_node_samples < 2 * self.min_samples_leaf or + weighted_n_node_samples < 2 * self.min_weight_leaf or + impurity <= min_impurity_split)): + with gil: + warnings.warn("Due to a bugfix in v0.21 the maximum depth of a" + " tree now does not pass the given max_depth!", + UserWarning) + if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) # If EPSILON=0 in the below comparison, float precision issues stop diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 8463ecdb20043..e3e3703ab27aa 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1210,7 +1210,6 @@ def test_class_weight_errors(name): def test_max_leaf_nodes(): # Test greedy trees with max_depth + 1 leafs. - from sklearn.tree._tree import TREE_LEAF X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) k = 4 for name, TreeEstimator in ALL_TREES.items(): @@ -1235,6 +1234,16 @@ def test_max_leaf_nodes_max_depth(): assert_equal(est.get_depth(), 1) +def test_bugfix_warning(): + # Test if the warning is raised when there's a behavior change. + # PR #xxxxx + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + with pytest.warns(UserWarning, + match="Due to a bugfix in v0.21 the maximum depth of a"): + for name, TreeEstimator in ALL_TREES.items(): + TreeEstimator(max_depth=1, max_leaf_nodes=1000).fit(X, y) + + def test_arrays_persist(): # Ensure property arrays' memory stays alive when tree disappears # non-regression for #2726 From 8719ff4bd1be68b49f0379a3cae9222ff90dd613 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 10 Oct 2018 18:11:41 +0200 Subject: [PATCH 4/9] fix forest tests --- sklearn/ensemble/tests/test_forest.py | 4 ++-- sklearn/ensemble/tests/test_gradient_boosting.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index d7586c2866571..03a0c8f9495a3 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -708,11 +708,11 @@ def check_max_leaf_nodes_max_depth(name): ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(max_depth=1, max_leaf_nodes=4, n_estimators=1, random_state=0).fit(X, y) - assert_greater(est.estimators_[0].tree_.max_depth, 1) + assert_equal(est.estimators_[0].get_depth(), 1) est = ForestEstimator(max_depth=1, n_estimators=1, random_state=0).fit(X, y) - assert_equal(est.estimators_[0].tree_.max_depth, 1) + assert_equal(est.estimators_[0].get_depth(), 1) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index e407ca8ef2554..a2312c50d721a 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1104,7 +1104,7 @@ def test_max_leaf_nodes_max_depth(GBEstimator): est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) tree = est.estimators_[0, 0].tree_ - assert_greater(tree.max_depth, 1) + assert_equal(tree.max_depth, 1) est = GBEstimator(max_depth=1).fit(X, y) tree = est.estimators_[0, 0].tree_ From 7566a3a2279f5f75f5dcc005da78a828aa38161c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 11 Oct 2018 09:34:46 +0200 Subject: [PATCH 5/9] remove the warning, add whats_new entry --- doc/whats_new/v0.21.rst | 9 +++++++++ sklearn/tree/_tree.pyx | 11 ----------- sklearn/tree/tests/test_tree.py | 10 ---------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index be200250a9ae5..ec84f0d5a6608 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -69,6 +69,15 @@ Support for Python 3.4 and below has been officially dropped. and :class:`tree.ExtraTreeRegressor`. :issue:`12300` by :user:`Adrin Jalali `. +- |Fix| Fixed an issue with :class:`tree.BaseDecisionTree` + and consequently all estimators based + on it, including :class:`tree.DecisionTreeClassifier`, + :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, + and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given + ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and + ``max_depth`` were both specified by the user. + + Multiple modules ................ diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d9b40fcd38bef..ed259c98ac850 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -24,7 +24,6 @@ from libc.string cimport memcpy from libc.string cimport memset import numpy as np -import warnings cimport numpy as np np.import_array() @@ -457,16 +456,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): weighted_n_node_samples < 2 * self.min_weight_leaf or impurity <= min_impurity_split) - if (depth >= self.max_depth and - not (n_node_samples < self.min_samples_split or - n_node_samples < 2 * self.min_samples_leaf or - weighted_n_node_samples < 2 * self.min_weight_leaf or - impurity <= min_impurity_split)): - with gil: - warnings.warn("Due to a bugfix in v0.21 the maximum depth of a" - " tree now does not pass the given max_depth!", - UserWarning) - if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) # If EPSILON=0 in the below comparison, float precision issues stop diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e3e3703ab27aa..163c7ec2665b2 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1234,16 +1234,6 @@ def test_max_leaf_nodes_max_depth(): assert_equal(est.get_depth(), 1) -def test_bugfix_warning(): - # Test if the warning is raised when there's a behavior change. - # PR #xxxxx - X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - with pytest.warns(UserWarning, - match="Due to a bugfix in v0.21 the maximum depth of a"): - for name, TreeEstimator in ALL_TREES.items(): - TreeEstimator(max_depth=1, max_leaf_nodes=1000).fit(X, y) - - def test_arrays_persist(): # Ensure property arrays' memory stays alive when tree disappears # non-regression for #2726 From ebabc2f6d6e56ade04a16c8475025dc9a6234f9a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 11 Oct 2018 09:40:09 +0200 Subject: [PATCH 6/9] remove extra line --- doc/whats_new/v0.21.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index e16a6c0649fae..29deef94eef9c 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -87,7 +87,6 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12232` by :user:`Krishna Sangeeth `. - Multiple modules ................ From e9c3cabc4069f15e48bed88c6a22cdfa2e930133 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 11 Oct 2018 23:23:18 +0200 Subject: [PATCH 7/9] add affected classes to changed classes --- doc/whats_new/v0.21.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 29deef94eef9c..25e3dfa1f07f3 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -18,6 +18,10 @@ occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. - please add class and reason here (see version 0.20 what's new) +- :class:`tree.DecisionTreeClassifier` (bug fix) +- :class:`tree.DecisionTreeRegressor` (bug fix) +- :class:`tree.ExtraTreeClassifier` (bug fix) +- :class:`tree.ExtraTreeRegressor` (bug fix) Details are listed in the changelog below. From d0aedd7ae88700de72295151f4dcb52f41e9384e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Oct 2018 14:46:01 +0200 Subject: [PATCH 8/9] add other affected estimators to the whats_new changed models --- doc/whats_new/v0.21.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 25e3dfa1f07f3..475f5c1f02870 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -18,6 +18,18 @@ occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. - please add class and reason here (see version 0.20 what's new) +- :class:`ensemble.AdaBoostClassifier` (bug fix) +- :class:`ensemble.AdaBoostRegressor` (bug fix) +- :class:`ensemble.BaggingClassifier` (bug fix) +- :class:`ensemble.BaggingRegressor` (bug fix) +- :class:`ensemble.ExtraTreesClassifier` (bug fix) +- :class:`ensemble.ExtraTreesRegressor` (bug fix) +- :class:`ensemble.GradientBoostingClassifier` (bug fix) +- :class:`ensemble.GradientBoostingRegressor` (bug fix) +- :class:`ensemble.IsolationForest` (bug fix) +- :class:`ensemble.RandomForestClassifier` (bug fix) +- :class:`ensemble.RandomForestRegressor` (bug fix) +- :class:`ensemble.RandomTreesEmbedding` (bug fix) - :class:`tree.DecisionTreeClassifier` (bug fix) - :class:`tree.DecisionTreeRegressor` (bug fix) - :class:`tree.ExtraTreeClassifier` (bug fix) @@ -79,7 +91,8 @@ Support for Python 3.4 and below has been officially dropped. :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and - ``max_depth`` were both specified by the user. + ``max_depth`` were both specified by the user. Please note that this also + affects all ensemble methods using decision trees. :pr:`12344` by :user:`Adrin Jalali `. From 3aa8009ab3dd93b13c972ba7f4c5ef0f306b1cc0 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 16 Oct 2018 11:03:51 +0200 Subject: [PATCH 9/9] shorten whats_new changed models entry --- doc/whats_new/v0.21.rst | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 475f5c1f02870..012e33d4c6ce4 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -18,22 +18,8 @@ occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. - please add class and reason here (see version 0.20 what's new) -- :class:`ensemble.AdaBoostClassifier` (bug fix) -- :class:`ensemble.AdaBoostRegressor` (bug fix) -- :class:`ensemble.BaggingClassifier` (bug fix) -- :class:`ensemble.BaggingRegressor` (bug fix) -- :class:`ensemble.ExtraTreesClassifier` (bug fix) -- :class:`ensemble.ExtraTreesRegressor` (bug fix) -- :class:`ensemble.GradientBoostingClassifier` (bug fix) -- :class:`ensemble.GradientBoostingRegressor` (bug fix) -- :class:`ensemble.IsolationForest` (bug fix) -- :class:`ensemble.RandomForestClassifier` (bug fix) -- :class:`ensemble.RandomForestRegressor` (bug fix) -- :class:`ensemble.RandomTreesEmbedding` (bug fix) -- :class:`tree.DecisionTreeClassifier` (bug fix) -- :class:`tree.DecisionTreeRegressor` (bug fix) -- :class:`tree.ExtraTreeClassifier` (bug fix) -- :class:`tree.ExtraTreeRegressor` (bug fix) +- Decision trees and derived ensembles when both `max_depth` and + `max_leaf_nodes` are set. (bug fix) Details are listed in the changelog below.