From 9585337545c7c275eb6cc7906e37220e6230aefa Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 06:40:45 -0700 Subject: [PATCH 01/16] feature: add beta-threshold early stopping for decision tree growth --- sklearn/tree/_tree.pxd | 2 ++ sklearn/tree/_tree.pyx | 17 +++++++++--- sklearn/tree/tests/test_tree.py | 48 +++++++++++++++++++++++++++++++++ sklearn/tree/tree.py | 24 +++++++++++++++-- 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 007b7a7860342..0db91ed2c87eb 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -4,6 +4,7 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -95,6 +96,7 @@ cdef class TreeBuilder: cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf cdef SIZE_t max_depth # Maximal tree depth + cdef double beta # Impurity threshold for early stopping cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=*, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..9f97938521fe1 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -12,6 +12,7 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -131,12 +132,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth): + SIZE_t max_depth, double beta): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth + self.beta = beta cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, @@ -166,6 +168,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf cdef SIZE_t min_samples_split = self.min_samples_split + cdef double beta = self.beta # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) @@ -223,7 +226,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): impurity = splitter.node_impurity() first = 0 - is_leaf = is_leaf or (impurity <= MIN_IMPURITY_SPLIT) + is_leaf = (is_leaf or + (impurity <= MIN_IMPURITY_SPLIT) or + (impurity < beta)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) @@ -289,13 +294,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes): + SIZE_t max_depth, SIZE_t max_leaf_nodes, + double beta): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes + self.beta = beta cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, @@ -421,6 +428,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 cdef double weighted_n_samples = splitter.weighted_n_samples + cdef double beta = self.beta cdef double weighted_n_node_samples cdef bint is_leaf cdef SIZE_t n_left, n_right @@ -436,7 +444,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): (n_node_samples < self.min_samples_split) or 
(n_node_samples < 2 * self.min_samples_leaf) or (weighted_n_node_samples < self.min_weight_leaf) or - (impurity <= MIN_IMPURITY_SPLIT)) + (impurity <= MIN_IMPURITY_SPLIT) or + (impurity < beta)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 78a35fe5becc1..3cf5808edad14 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -681,6 +681,54 @@ def test_min_weight_fraction_leaf(): yield check_min_weight_fraction_leaf, name, "multilabel", True +def test_beta(): + # Test if beta creates leaves with impurity [0, beta) when + # min_samples_leaf = 1 and min_samples_split = 2. + X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE)) + y = iris.target + + # test both DepthFirstTreeBuilder and BestFirstTreeBuilder + # by setting max_leaf_nodes + # we set max leaf nodes to a number greater than the total nodes + # possible, thus ensuring that the leaves generated have impurity + # of 0 when there is no beta stopping used. + for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): + TreeEstimator = ALL_TREES[name] + beta = .5 + + # verify leaf nodes without beta have impurity 0 + est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, + random_state=0) + est.fit(X, y) + for node in range(est.tree_.node_count): + if (est.tree_.children_left[node] == TREE_LEAF or + est.tree_.children_right[node] == TREE_LEAF): + assert_equal(est.tree_.impurity[node], 0., + "Failed with {0} " + "beta={1}".format( + est.tree_.impurity[node], + est.beta)) + + # verify leaf nodes have impurity [0,beta) when using beta + est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, + beta=beta, + random_state=0) + est.fit(X, y) + for node in range(est.tree_.node_count): + if (est.tree_.children_left[node] == TREE_LEAF or + est.tree_.children_right[node] == TREE_LEAF): + assert_greater_equal(est.tree_.impurity[node], 0, + "Failed with {0} " + "beta={1}".format( + est.tree_.impurity[node], + est.beta)) + assert_less(est.tree_.impurity[node], beta, + "Failed with {0} " + "beta={1}".format( + est.tree_.impurity[node], + est.beta)) + + def test_pickle(): for name, TreeEstimator in ALL_TREES.items(): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f004d845279bc..edb387ad25778 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -10,6 +10,7 @@ # Satrajit Gosh # Joly Arnaud # Fares Hedayati +# Nelson Liu # # License: BSD 3 clause @@ -89,6 +90,7 @@ def __init__(self, max_features, max_leaf_nodes, random_state, + beta, class_weight=None, presort=False): self.criterion = criterion @@ -100,6 +102,7 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes + self.beta = beta self.class_weight = class_weight self.presort = presort @@ -151,6 +154,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, """ random_state = check_random_state(self.random_state) + beta = self.beta if check_input: X = check_array(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) @@ -359,13 +363,13 @@ def fit(self, X, y, sample_weight=None, check_input=True, builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, - max_depth) + max_depth, beta) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, - max_leaf_nodes) + max_leaf_nodes, beta) builder.build(self.tree_, X, y, sample_weight, 
X_idx_sorted) @@ -608,6 +612,10 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None, the random number generator is the RandomState instance used by `np.random`. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large @@ -685,6 +693,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, + beta=0., class_weight=None, presort=False): super(DecisionTreeClassifier, self).__init__( @@ -698,6 +707,7 @@ def __init__(self, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, + beta=beta, presort=presort) def predict_proba(self, X, check_input=True): @@ -848,6 +858,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): If None, the random number generator is the RandomState instance used by `np.random`. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large @@ -917,6 +931,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, + beta=0., presort=False): super(DecisionTreeRegressor, self).__init__( criterion=criterion, @@ -928,6 +943,7 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state, + beta=beta, presort=presort) @@ -965,6 +981,7 @@ def __init__(self, max_features="auto", random_state=None, max_leaf_nodes=None, + beta=0., class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, @@ -976,6 +993,7 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, + beta=beta, random_state=random_state) @@ -1012,6 +1030,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, + beta=0., max_leaf_nodes=None): super(ExtraTreeRegressor, self).__init__( criterion=criterion, @@ -1022,4 +1041,5 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, + beta=beta, random_state=random_state) From 40164b84c3be5ab788537717b26c881eaafeb177 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 14:53:37 -0700 Subject: [PATCH 02/16] check if value of beta is greater than or equal to 0 --- sklearn/tree/tree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index edb387ad25778..c30f9d2e0d7f9 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -308,6 +308,9 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: min_weight_leaf = 0. + if not 0. <= beta: + raise ValueError("beta must be greater than 0") + presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, # otherwise it will be False. 
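To make the new stopping rule concrete, a minimal usage sketch follows. It assumes a build of this branch (released scikit-learn does not yet expose `beta`); everything else is the standard public API, using the stock iris loader:

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris()

    # with beta=0.5, any node whose impurity is already below 0.5 is
    # finalized as a leaf instead of being split further
    est = DecisionTreeClassifier(beta=0.5, random_state=0)
    est.fit(iris.data, iris.target)

    t = est.tree_
    leaves = t.children_left == -1        # TREE_LEAF is -1
    print(t.impurity[leaves].max())       # strictly below 0.5

With the default beta=0. the behaviour is unchanged, because the pre-existing MIN_IMPURITY_SPLIT = 1e-7 constant still short-circuits near-pure nodes.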
From d306dc3cb96cd6d6cabd93204bc5f70abe9ceca5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 14:56:53 -0700 Subject: [PATCH 03/16] test if default value of beta is 0 and edit input validation error message --- sklearn/tree/tests/test_tree.py | 3 +++ sklearn/tree/tree.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 3cf5808edad14..80d624c13ddb6 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -699,6 +699,9 @@ def test_beta(): # verify leaf nodes without beta have impurity 0 est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0) + assert_equal(est.beta, 0., + "Failed, beta = {0} != 0".format( + est.beta)) est.fit(X, y) for node in range(est.tree_.node_count): if (est.tree_.children_left[node] == TREE_LEAF or diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index c30f9d2e0d7f9..ef7b51b13ce3e 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -309,7 +309,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, min_weight_leaf = 0. if not 0. <= beta: - raise ValueError("beta must be greater than 0") + raise ValueError("beta must be greater or equal to 0") presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, From e0867b403131d29ee1bbbf4917857cad8ba82738 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 15:55:45 -0700 Subject: [PATCH 04/16] feature: separately validate beta for reg. and clf., and add tests for it --- sklearn/tree/tests/test_tree.py | 11 +++++++++++ sklearn/tree/tree.py | 10 ++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 80d624c13ddb6..07a833b13168e 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -441,6 +441,10 @@ def test_max_features(): clf.fit(iris.data, iris.target) assert_equal(clf.max_features_, 2) + # use values of beta that are invalid for classification + clf = TreeClassifier(beta=2.0) + assert_raises(ValueError, clf.fit, X, y) + for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(max_features="sqrt") est.fit(iris.data, iris.target) @@ -493,6 +497,13 @@ def test_max_features(): est = TreeEstimator(max_features="foobar") assert_raises(ValueError, est.fit, X, y) + # use values of beta that are invalid + clf = TreeClassifier(beta=-1.0) + assert_raises(ValueError, clf.fit, X, y) + + clf = TreeClassifier(beta="foobar") + assert_raises(ValueError, clf.fit, X, y) + def test_error(): # Test that it gives proper exception on deficient input. diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index ef7b51b13ce3e..e92786a0b2c95 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -308,8 +308,14 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: min_weight_leaf = 0. - if not 0. <= beta: - raise ValueError("beta must be greater or equal to 0") + if is_classification: + if not 0. <= beta <= 1: + raise ValueError("beta must be in range [0,1] " + "in classification") + else: + if not 0. 
<= beta: + raise ValueError("beta must be greater than or equal " + "to 0 in regression") presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, From 8205f83b2b2bd57614c63ddac090e46cff04638f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 18:26:37 -0700 Subject: [PATCH 05/16] feature: add beta to forest-based ensemble methods --- sklearn/ensemble/forest.py | 41 ++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index f76414066a92c..a096d39999967 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -805,6 +805,10 @@ class RandomForestClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -899,6 +903,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + beta=0., bootstrap=True, oob_score=False, n_jobs=1, @@ -911,7 +916,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "beta", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -928,6 +933,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.beta = beta class RandomForestRegressor(ForestRegressor): @@ -1001,6 +1007,10 @@ class RandomForestRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -1064,6 +1074,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + beta=0., bootstrap=True, oob_score=False, n_jobs=1, @@ -1075,7 +1086,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "beta", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1091,6 +1102,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.beta = beta class ExtraTreesClassifier(ForestClassifier): @@ -1160,6 +1172,10 @@ class ExtraTreesClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. 
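Listing "beta" in `estimator_params` above is what actually propagates the value to the individual trees: `BaseEnsemble._make_estimator` clones the template estimator and copies every name in that tuple across. Roughly, as a simplified sketch of the existing mechanism (not the verbatim implementation):

    from sklearn.base import clone

    def _make_estimator_sketch(ensemble):
        # clone the template tree, then mirror each forwarded
        # hyperparameter from the ensemble onto the clone
        est = clone(ensemble.base_estimator_)
        est.set_params(**{name: getattr(ensemble, name)
                          for name in ensemble.estimator_params})
        return est

Forgetting either the constructor attribute or the `estimator_params` entry would leave the forest silently ignoring the new parameter, which is why both appear in every hunk of this patch.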
@@ -1255,6 +1271,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + beta=0., bootstrap=False, oob_score=False, n_jobs=1, @@ -1267,7 +1284,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "beta", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1284,6 +1301,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.beta = beta class ExtraTreesRegressor(ForestRegressor): @@ -1355,6 +1373,10 @@ class ExtraTreesRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. @@ -1419,6 +1441,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + beta=0., bootstrap=False, oob_score=False, n_jobs=1, @@ -1430,7 +1453,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "beta", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1446,7 +1469,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes - + self.beta = beta class RandomTreesEmbedding(BaseForest): """An ensemble of totally random trees. @@ -1500,6 +1523,10 @@ class RandomTreesEmbedding(BaseForest): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + sparse_output : bool, optional (default=True) Whether or not to return a sparse CSR matrix, as default behavior, or to return a dense array compatible with dense pipeline operators. 
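At the ensemble level the parameter reads exactly as on a single tree. A quick usage sketch (branch build assumed; `make_classification` is the stock helper):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=500, n_features=20, random_state=0)

    # every tree stops splitting nodes whose impurity has already
    # dropped below 0.2, yielding a smaller forest overall
    forest = RandomForestClassifier(n_estimators=50, beta=0.2,
                                    random_state=0)
    forest.fit(X, y)
    print(sum(t.tree_.node_count for t in forest.estimators_))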
@@ -1544,6 +1571,7 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, + beta=0., sparse_output=True, n_jobs=1, random_state=None, @@ -1554,7 +1582,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "beta", "random_state"), bootstrap=False, oob_score=False, @@ -1570,6 +1598,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = 1 self.max_leaf_nodes = max_leaf_nodes + self.beta = beta self.sparse_output = sparse_output def _set_oob_score(self, X, y): From 796fa8a06abab5be17b31fc9c215e43cdd833c16 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 18:44:36 -0700 Subject: [PATCH 06/16] feature: add separate condition to determine that beta is float --- sklearn/tree/tests/test_tree.py | 17 ++++++----------- sklearn/tree/tree.py | 8 +++++--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 07a833b13168e..bffc97d7069a8 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -441,10 +441,6 @@ def test_max_features(): clf.fit(iris.data, iris.target) assert_equal(clf.max_features_, 2) - # use values of beta that are invalid for classification - clf = TreeClassifier(beta=2.0) - assert_raises(ValueError, clf.fit, X, y) - for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(max_features="sqrt") est.fit(iris.data, iris.target) @@ -497,13 +493,6 @@ def test_max_features(): est = TreeEstimator(max_features="foobar") assert_raises(ValueError, est.fit, X, y) - # use values of beta that are invalid - clf = TreeClassifier(beta=-1.0) - assert_raises(ValueError, clf.fit, X, y) - - clf = TreeClassifier(beta="foobar") - assert_raises(ValueError, clf.fit, X, y) - def test_error(): # Test that it gives proper exception on deficient input. @@ -516,6 +505,10 @@ def test_error(): X2 = [[-2, -1, 1]] # wrong feature shape for sample assert_raises(ValueError, est.predict_proba, X2) + # invalid type for beta parameter in classification + est = TreeEstimator(beta=2.0) + assert_raises(ValueError, est.fit, X, y) + for name, TreeEstimator in ALL_TREES.items(): # Invalid values for parameters assert_raises(ValueError, TreeEstimator(min_samples_leaf=-1).fit, X, y) @@ -535,6 +528,8 @@ def test_error(): X, y) assert_raises(ValueError, TreeEstimator(max_depth=-1).fit, X, y) assert_raises(ValueError, TreeEstimator(max_features=42).fit, X, y) + assert_raises(ValueError, TreeEstimator(beta=-1.0).fit, X, y) + assert_raises(ValueError, TreeEstimator(beta="foobar").fit, X, y) # Wrong dimensions est = TreeEstimator() diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index e92786a0b2c95..83d5dba95a9c1 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -308,14 +308,16 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: min_weight_leaf = 0. + if not isinstance(beta, float): + raise ValueError("beta must be a float") if is_classification: - if not 0. <= beta <= 1: - raise ValueError("beta must be in range [0,1] " + if not 0. <= beta <= 1.: + raise ValueError("beta must be in range [0., 1.] " "in classification") else: if not 0. <= beta: raise ValueError("beta must be greater than or equal " - "to 0 in regression") + "to 0. 
in regression") presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, From cdd8dfdbbcaeac25567d4edb548c9db88a8bd971 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 3 Jul 2016 19:04:07 -0700 Subject: [PATCH 07/16] feature: add beta to gradient boosting estimators --- sklearn/ensemble/gradient_boosting.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 1b0767d419168..19b5145dbf10d 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -722,7 +722,7 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble, @abstractmethod def __init__(self, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, - max_depth, init, subsample, max_features, + max_depth, beta, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -736,6 +736,7 @@ def __init__(self, loss, learning_rate, n_estimators, criterion, self.subsample = subsample self.max_features = max_features self.max_depth = max_depth + self.beta = beta self.init = init self.random_state = random_state self.alpha = alpha @@ -1358,6 +1359,10 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + init : BaseEstimator, None, optional (default=None) An estimator object that is used to compute the initial predictions. ``init`` has to provide ``fit`` and ``predict``. @@ -1437,7 +1442,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., - max_depth=3, init=None, random_state=None, + max_depth=3, beta=0.,init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -1450,7 +1455,9 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_depth=max_depth, init=init, subsample=subsample, max_features=max_features, random_state=random_state, verbose=verbose, - max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, + max_leaf_nodes=max_leaf_nodes, + beta=beta, + warm_start=warm_start, presort=presort) def _validate_y(self, y): @@ -1711,6 +1718,10 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. + beta : float, optional (default=0.) + Threshold for early stopping in tree growth. If the impurity + of a node is below the threshold, the node is a leaf. + alpha : float (default=0.9) The alpha-quantile of the huber loss function and the quantile loss function. Only if ``loss='huber'`` or ``loss='quantile'``. 
@@ -1791,7 +1802,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., - max_depth=3, init=None, random_state=None, + max_depth=3, beta=0., init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -1801,7 +1812,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, - max_features=max_features, + max_features=max_features, beta=beta, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, presort=presort) From 68f2d6c450f99d4da47a1d36ff0cac417ac1da7b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 4 Jul 2016 09:00:12 -0700 Subject: [PATCH 08/16] rename parameter to min_impurity_split, edit input validation and associated tests --- sklearn/ensemble/forest.py | 60 +++++++++++++-------------- sklearn/ensemble/gradient_boosting.py | 24 +++++------ sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 16 +++---- sklearn/tree/tests/test_tree.py | 41 ++++++++---------- sklearn/tree/tree.py | 46 +++++++++----------- 6 files changed, 88 insertions(+), 101 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index a096d39999967..3c030669356ed 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -805,9 +805,9 @@ class RandomForestClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -903,7 +903,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, - beta=0., + min_impurity_split=0., bootstrap=True, oob_score=False, n_jobs=1, @@ -916,7 +916,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", "beta", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -933,7 +933,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split class RandomForestRegressor(ForestRegressor): @@ -1007,9 +1007,9 @@ class RandomForestRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. 
bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -1074,7 +1074,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, - beta=0., + min_impurity_split=0., bootstrap=True, oob_score=False, n_jobs=1, @@ -1086,7 +1086,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", "beta", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1102,7 +1102,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split class ExtraTreesClassifier(ForestClassifier): @@ -1172,9 +1172,9 @@ class ExtraTreesClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. @@ -1271,7 +1271,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, - beta=0., + min_impurity_split=0., bootstrap=False, oob_score=False, n_jobs=1, @@ -1284,7 +1284,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", "beta", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1301,7 +1301,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split class ExtraTreesRegressor(ForestRegressor): @@ -1373,9 +1373,9 @@ class ExtraTreesRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. 
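With the rename in place the public spelling is `min_impurity_split` everywhere; a usage sketch against the renamed API (branch build assumed):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import ExtraTreesClassifier

    iris = load_iris()

    # identical semantics to the old `beta`: nodes whose impurity is
    # below the threshold become leaves; only the name changed
    clf = ExtraTreesClassifier(n_estimators=30, min_impurity_split=0.1,
                               random_state=0)
    clf.fit(iris.data, iris.target)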
@@ -1441,7 +1441,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, - beta=0., + min_impurity_split=0., bootstrap=False, oob_score=False, n_jobs=1, @@ -1453,7 +1453,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", "beta", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1469,7 +1469,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split class RandomTreesEmbedding(BaseForest): """An ensemble of totally random trees. @@ -1523,9 +1523,9 @@ class RandomTreesEmbedding(BaseForest): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. sparse_output : bool, optional (default=True) Whether or not to return a sparse CSR matrix, as default behavior, @@ -1571,7 +1571,7 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, - beta=0., + min_impurity_split=0., sparse_output=True, n_jobs=1, random_state=None, @@ -1582,7 +1582,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", "beta", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=False, oob_score=False, @@ -1598,7 +1598,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = 1 self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output def _set_oob_score(self, X, y): diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 19b5145dbf10d..a5ed725e05e10 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -722,7 +722,7 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble, @abstractmethod def __init__(self, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, - max_depth, beta, init, subsample, max_features, + max_depth, min_impurity_split, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -736,7 +736,7 @@ def __init__(self, loss, learning_rate, n_estimators, criterion, self.subsample = subsample self.max_features = max_features self.max_depth = max_depth - self.beta = beta + self.min_impurity_split = min_impurity_split self.init = init self.random_state = random_state self.alpha = alpha @@ -1359,9 +1359,9 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. 
If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. init : BaseEstimator, None, optional (default=None) An estimator object that is used to compute the initial @@ -1442,7 +1442,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., - max_depth=3, beta=0.,init=None, random_state=None, + max_depth=3, min_impurity_split=0.,init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -1456,7 +1456,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_features=max_features, random_state=random_state, verbose=verbose, max_leaf_nodes=max_leaf_nodes, - beta=beta, + min_impurity_split=min_impurity_split, warm_start=warm_start, presort=presort) @@ -1718,9 +1718,9 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. alpha : float (default=0.9) The alpha-quantile of the huber loss function and the quantile @@ -1802,7 +1802,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., - max_depth=3, beta=0., init=None, random_state=None, + max_depth=3, min_impurity_split=0., init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -1812,7 +1812,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, - max_features=max_features, beta=beta, + max_features=max_features, min_impurity_split=min_impurity_split, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, presort=presort) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 0db91ed2c87eb..dbf0545b1e1d5 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -96,7 +96,7 @@ cdef class TreeBuilder: cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf cdef SIZE_t max_depth # Maximal tree depth - cdef double beta # Impurity threshold for early stopping + cdef double min_impurity_split # Impurity threshold for early stopping cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=*, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9f97938521fe1..bf39bb928d2cf 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -132,13 +132,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): def __cinit__(self, 
Splitter splitter, SIZE_t min_samples_split, SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double beta): + SIZE_t max_depth, double min_impurity_split): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth - self.beta = beta + self.min_impurity_split = min_impurity_split cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, @@ -168,7 +168,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf cdef SIZE_t min_samples_split = self.min_samples_split - cdef double beta = self.beta + cdef double min_impurity_split = self.min_impurity_split # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) @@ -228,7 +228,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = (is_leaf or (impurity <= MIN_IMPURITY_SPLIT) or - (impurity < beta)) + (impurity < min_impurity_split)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) @@ -295,14 +295,14 @@ cdef class BestFirstTreeBuilder(TreeBuilder): def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, SIZE_t min_samples_leaf, min_weight_leaf, SIZE_t max_depth, SIZE_t max_leaf_nodes, - double beta): + double min_impurity_split): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, @@ -428,7 +428,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 cdef double weighted_n_samples = splitter.weighted_n_samples - cdef double beta = self.beta + cdef double min_impurity_split = self.min_impurity_split cdef double weighted_n_node_samples cdef bint is_leaf cdef SIZE_t n_left, n_right @@ -445,7 +445,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): (n_node_samples < 2 * self.min_samples_leaf) or (weighted_n_node_samples < self.min_weight_leaf) or (impurity <= MIN_IMPURITY_SPLIT) or - (impurity < beta)) + (impurity < min_impurity_split)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index bffc97d7069a8..02132290e54fc 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -505,10 +505,6 @@ def test_error(): X2 = [[-2, -1, 1]] # wrong feature shape for sample assert_raises(ValueError, est.predict_proba, X2) - # invalid type for beta parameter in classification - est = TreeEstimator(beta=2.0) - assert_raises(ValueError, est.fit, X, y) - for name, TreeEstimator in ALL_TREES.items(): # Invalid values for parameters assert_raises(ValueError, TreeEstimator(min_samples_leaf=-1).fit, X, y) @@ -528,8 +524,7 @@ def test_error(): X, y) assert_raises(ValueError, TreeEstimator(max_depth=-1).fit, X, y) assert_raises(ValueError, TreeEstimator(max_features=42).fit, X, y) - assert_raises(ValueError, TreeEstimator(beta=-1.0).fit, X, y) - assert_raises(ValueError, TreeEstimator(beta="foobar").fit, X, y) + assert_raises(ValueError, TreeEstimator(min_impurity_split=-1.0).fit, X, y) # Wrong dimensions est = TreeEstimator() @@ -687,8 
+682,8 @@ def test_min_weight_fraction_leaf(): yield check_min_weight_fraction_leaf, name, "multilabel", True -def test_beta(): - # Test if beta creates leaves with impurity [0, beta) when +def test_min_impurity_split(): + # Test if min_impurity_split creates leaves with impurity [0, min_impurity_split) when # min_samples_leaf = 1 and min_samples_split = 2. X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE)) y = iris.target @@ -697,30 +692,30 @@ def test_beta(): # by setting max_leaf_nodes # we set max leaf nodes to a number greater than the total nodes # possible, thus ensuring that the leaves generated have impurity - # of 0 when there is no beta stopping used. + # of 0 when there is no min_impurity_split stopping used. for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): TreeEstimator = ALL_TREES[name] - beta = .5 + min_impurity_split = .5 - # verify leaf nodes without beta have impurity 0 + # verify leaf nodes without min_impurity_split have impurity 0 est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0) - assert_equal(est.beta, 0., - "Failed, beta = {0} != 0".format( - est.beta)) + assert_equal(est.min_impurity_split, 0., + "Failed, min_impurity_split = {0} != 0".format( + est.min_impurity_split)) est.fit(X, y) for node in range(est.tree_.node_count): if (est.tree_.children_left[node] == TREE_LEAF or est.tree_.children_right[node] == TREE_LEAF): assert_equal(est.tree_.impurity[node], 0., "Failed with {0} " - "beta={1}".format( + "min_impurity_split={1}".format( est.tree_.impurity[node], - est.beta)) + est.min_impurity_split)) - # verify leaf nodes have impurity [0,beta) when using beta + # verify leaf nodes have impurity [0,min_impurity_split) when using min_impurity_split est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, - beta=beta, + min_impurity_split=min_impurity_split, random_state=0) est.fit(X, y) for node in range(est.tree_.node_count): @@ -728,14 +723,14 @@ def test_beta(): est.tree_.children_right[node] == TREE_LEAF): assert_greater_equal(est.tree_.impurity[node], 0, "Failed with {0} " - "beta={1}".format( + "min_impurity_split={1}".format( est.tree_.impurity[node], - est.beta)) - assert_less(est.tree_.impurity[node], beta, + est.min_impurity_split)) + assert_less(est.tree_.impurity[node], min_impurity_split, "Failed with {0} " - "beta={1}".format( + "min_impurity_split={1}".format( est.tree_.impurity[node], - est.beta)) + est.min_impurity_split)) def test_pickle(): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 83d5dba95a9c1..00d99421a82c7 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -90,7 +90,7 @@ def __init__(self, max_features, max_leaf_nodes, random_state, - beta, + min_impurity_split, class_weight=None, presort=False): self.criterion = criterion @@ -102,7 +102,7 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes - self.beta = beta + self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.presort = presort @@ -154,7 +154,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, """ random_state = check_random_state(self.random_state) - beta = self.beta if check_input: X = check_array(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) @@ -308,16 +307,9 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: min_weight_leaf = 0. - if not isinstance(beta, float): - raise ValueError("beta must be a float") - if is_classification: - if not 0. 
<= beta <= 1.: - raise ValueError("beta must be in range [0., 1.] " - "in classification") - else: - if not 0. <= beta: - raise ValueError("beta must be greater than or equal " - "to 0. in regression") + if not 0. <= self.min_impurity_split: + raise ValueError("min_impurity_split must be greater than or equal " + "to 0") presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, @@ -374,13 +366,13 @@ def fit(self, X, y, sample_weight=None, check_input=True, builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, - max_depth, beta) + max_depth, self.min_impurity_split) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, - max_leaf_nodes, beta) + max_leaf_nodes, self.min_impurity_split) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) @@ -623,9 +615,9 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None, the random number generator is the RandomState instance used by `np.random`. - beta : float, optional (default=0.) - Threshold for early stopping in tree growth. If the impurity - of a node is below the threshold, the node is a leaf. + min_impurity_split : float, optional (default=0.) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in @@ -704,7 +696,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, - beta=0., + min_impurity_split=0., class_weight=None, presort=False): super(DecisionTreeClassifier, self).__init__( @@ -718,7 +710,7 @@ def __init__(self, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, - beta=beta, + min_impurity_split=min_impurity_split, presort=presort) def predict_proba(self, X, check_input=True): @@ -869,7 +861,7 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): If None, the random number generator is the RandomState instance used by `np.random`. - beta : float, optional (default=0.) + min_impurity_split : float, optional (default=0.) Threshold for early stopping in tree growth. If the impurity of a node is below the threshold, the node is a leaf. 
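The accompanying test deliberately runs with and without `max_leaf_nodes`, because that flag switches between the depth-first and best-first builders; both must honour the threshold. The same check can be reproduced by hand (sketch, branch build assumed):

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris()

    for max_leaf_nodes in (None, 1000):   # depth-first vs. best-first
        est = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes,
                                     min_impurity_split=0.5,
                                     random_state=0)
        est.fit(iris.data, iris.target)
        leaves = est.tree_.children_left == -1
        assert est.tree_.impurity[leaves].max() < 0.5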
@@ -942,7 +934,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, - beta=0., + min_impurity_split=0., presort=False): super(DecisionTreeRegressor, self).__init__( criterion=criterion, @@ -954,7 +946,7 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state, - beta=beta, + min_impurity_split=min_impurity_split, presort=presort) @@ -992,7 +984,7 @@ def __init__(self, max_features="auto", random_state=None, max_leaf_nodes=None, - beta=0., + min_impurity_split=0., class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, @@ -1004,7 +996,7 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, - beta=beta, + min_impurity_split=min_impurity_split, random_state=random_state) @@ -1041,7 +1033,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, - beta=0., + min_impurity_split=0., max_leaf_nodes=None): super(ExtraTreeRegressor, self).__init__( criterion=criterion, @@ -1052,5 +1044,5 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, - beta=beta, + min_impurity_split=min_impurity_split, random_state=random_state) From 346eac932337bcc4751717fb94e17abf941798b8 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 4 Jul 2016 09:30:03 -0700 Subject: [PATCH 09/16] chore: fix spacing in forest and force recompilation of grad boosting extension --- sklearn/ensemble/_gradient_boosting.pyx | 2 +- sklearn/ensemble/forest.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 9e6e9f6d29c0e..d268856e32049 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -35,7 +35,7 @@ ctypedef np.npy_intp SIZE_t # constant to mark tree leafs cdef int LEAF = -1 - +# trivial comment to force recompilation cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, Node* root_node, double *value, diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 3c030669356ed..4208a4201838a 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -1471,6 +1471,7 @@ def __init__(self, self.max_leaf_nodes = max_leaf_nodes self.min_impurity_split = min_impurity_split + class RandomTreesEmbedding(BaseForest): """An ensemble of totally random trees. From 1a5fae5fd08465f7ba3d0e3b26ee89ad22026dd5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 4 Jul 2016 09:57:02 -0700 Subject: [PATCH 10/16] remove trivial comment in grad boost and add whats new --- doc/whats_new.rst | 3 +++ sklearn/ensemble/_gradient_boosting.pyx | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index c47f4274f4dd8..e5ef76d388bc0 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -125,6 +125,9 @@ New features `_) by `Nelson Liu`_. + - Added weighted impurity-based early stopping criterion for decision tree growth. + (`#6954 `_) by `Nelson Liu`_ + Enhancements ............ 
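The range check is also part of the public contract now; a negative threshold fails fast at fit time (sketch, with a tiny hypothetical dataset):

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 1.0, 2.0, 3.0])

    try:
        DecisionTreeRegressor(min_impurity_split=-1.0).fit(X, y)
    except ValueError as exc:
        print(exc)  # min_impurity_split must be greater than or equal to 0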
diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index d268856e32049..9e6e9f6d29c0e 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -35,7 +35,7 @@ ctypedef np.npy_intp SIZE_t # constant to mark tree leafs cdef int LEAF = -1 -# trivial comment to force recompilation + cdef void _predict_regression_tree_inplace_fast(DTYPE_t *X, Node* root_node, double *value, From f85e74a653cd3c1f86d48490581ff1529cc76eda Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 4 Jul 2016 12:42:36 -0700 Subject: [PATCH 11/16] edit wording in test comment / rebuild --- sklearn/tree/tests/test_tree.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 02132290e54fc..dc3aae1b87f74 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -683,16 +683,14 @@ def test_min_weight_fraction_leaf(): def test_min_impurity_split(): - # Test if min_impurity_split creates leaves with impurity [0, min_impurity_split) when - # min_samples_leaf = 1 and min_samples_split = 2. + # test if min_impurity_split creates leaves with impurity + # [0, min_impurity_split) when min_samples_leaf = 1 and + # min_samples_split = 2. X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE)) y = iris.target # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes - # we set max leaf nodes to a number greater than the total nodes - # possible, thus ensuring that the leaves generated have impurity - # of 0 when there is no min_impurity_split stopping used. for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): TreeEstimator = ALL_TREES[name] min_impurity_split = .5 From 15a2951fdc5a94ec434aa286029b360128ab5f57 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 18:52:14 -0500 Subject: [PATCH 12/16] rename constant with the same name as our parameter --- sklearn/tree/_tree.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index bf39bb928d2cf..7beebe88d3a6a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -64,7 +64,7 @@ TREE_UNDEFINED = -2 cdef SIZE_t _TREE_LEAF = TREE_LEAF cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED cdef SIZE_t INITIAL_STACK_SIZE = 10 -cdef DTYPE_t MIN_IMPURITY_SPLIT = 1e-7 +cdef DTYPE_t LEAF_MIN_IMPURITY = 1e-7 # Repeat struct definition for numpy NODE_DTYPE = np.dtype({ @@ -227,7 +227,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): first = 0 is_leaf = (is_leaf or - (impurity <= MIN_IMPURITY_SPLIT) or + (impurity <= LEAF_MIN_IMPURITY) or (impurity < min_impurity_split)) if not is_leaf: @@ -444,7 +444,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): (n_node_samples < self.min_samples_split) or (n_node_samples < 2 * self.min_samples_leaf) or (weighted_n_node_samples < self.min_weight_leaf) or - (impurity <= MIN_IMPURITY_SPLIT) or + (impurity <= LEAF_MIN_IMPURITY) or (impurity < min_impurity_split)) if not is_leaf: From 7ff2aaac1b7a5bbb6d050cf3e592ce95cf75af7f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 25 Jul 2016 11:32:16 -0700 Subject: [PATCH 13/16] edit line length for what's new --- doc/whats_new.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index e5ef76d388bc0..7b9c90f8afa1f 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -125,8 +125,10 @@ New features `_) by `Nelson 
Liu`_. - - Added weighted impurity-based early stopping criterion for decision tree growth. - (`#6954 `_) by `Nelson Liu`_ + - Added weighted impurity-based early stopping criterion for decision tree + growth. (`#6954 + `_) by `Nelson + Liu`_ Enhancements ............ From 838bad6809f0974e43d7bfec68eb6df2a58e5ed0 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 26 Jul 2016 09:15:27 -0700 Subject: [PATCH 14/16] remove constant and set min_impurity_split to 1e-7 by default --- sklearn/tree/_tree.pyx | 7 ++----- sklearn/tree/tests/test_tree.py | 21 +++++++++++---------- sklearn/tree/tree.py | 12 ++++++------ 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 7beebe88d3a6a..f3db4a197580a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -64,7 +64,6 @@ TREE_UNDEFINED = -2 cdef SIZE_t _TREE_LEAF = TREE_LEAF cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED cdef SIZE_t INITIAL_STACK_SIZE = 10 -cdef DTYPE_t LEAF_MIN_IMPURITY = 1e-7 # Repeat struct definition for numpy NODE_DTYPE = np.dtype({ @@ -227,8 +226,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): first = 0 is_leaf = (is_leaf or - (impurity <= LEAF_MIN_IMPURITY) or - (impurity < min_impurity_split)) + (impurity <= min_impurity_split)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) @@ -444,8 +442,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): (n_node_samples < self.min_samples_split) or (n_node_samples < 2 * self.min_samples_leaf) or (weighted_n_node_samples < self.min_weight_leaf) or - (impurity <= LEAF_MIN_IMPURITY) or - (impurity < min_impurity_split)) + (impurity <= min_impurity_split)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index dc3aae1b87f74..231d12d539e5a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -695,11 +695,12 @@ def test_min_impurity_split(): TreeEstimator = ALL_TREES[name] min_impurity_split = .5 - # verify leaf nodes without min_impurity_split have impurity 0 + # verify leaf nodes without min_impurity_split less than + # impurity 1e-7 est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0) - assert_equal(est.min_impurity_split, 0., - "Failed, min_impurity_split = {0} != 0".format( + assert_less_equal(est.min_impurity_split, 1e-7, + "Failed, min_impurity_split = {0} > 1e-7".format( est.min_impurity_split)) est.fit(X, y) for node in range(est.tree_.node_count): @@ -711,7 +712,7 @@ def test_min_impurity_split(): est.tree_.impurity[node], est.min_impurity_split)) - # verify leaf nodes have impurity [0,min_impurity_split) when using min_impurity_split + # verify leaf nodes have impurity [0,min_impurity_split] when using min_impurity_split est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split, random_state=0) @@ -720,15 +721,15 @@ def test_min_impurity_split(): if (est.tree_.children_left[node] == TREE_LEAF or est.tree_.children_right[node] == TREE_LEAF): assert_greater_equal(est.tree_.impurity[node], 0, - "Failed with {0} " + "Failed with {0}, " "min_impurity_split={1}".format( est.tree_.impurity[node], est.min_impurity_split)) - assert_less(est.tree_.impurity[node], min_impurity_split, - "Failed with {0} " - "min_impurity_split={1}".format( - est.tree_.impurity[node], - est.min_impurity_split)) + assert_less_equal(est.tree_.impurity[node], min_impurity_split, + "Failed with {0}, " + 
"min_impurity_split={1}".format( + est.tree_.impurity[node], + est.min_impurity_split)) def test_pickle(): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 00d99421a82c7..907d46dce3ce3 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -307,7 +307,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: min_weight_leaf = 0. - if not 0. <= self.min_impurity_split: + if self.min_impurity_split < 0.: raise ValueError("min_impurity_split must be greater than or equal " "to 0") @@ -696,7 +696,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, - min_impurity_split=0., + min_impurity_split=1e-7, class_weight=None, presort=False): super(DecisionTreeClassifier, self).__init__( @@ -861,7 +861,7 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): If None, the random number generator is the RandomState instance used by `np.random`. - min_impurity_split : float, optional (default=0.) + min_impurity_split : float, optional (default=1e-7) Threshold for early stopping in tree growth. If the impurity of a node is below the threshold, the node is a leaf. @@ -934,7 +934,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, - min_impurity_split=0., + min_impurity_split=1e-7, presort=False): super(DecisionTreeRegressor, self).__init__( criterion=criterion, @@ -984,7 +984,7 @@ def __init__(self, max_features="auto", random_state=None, max_leaf_nodes=None, - min_impurity_split=0., + min_impurity_split=1e-7, class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, @@ -1033,7 +1033,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, - min_impurity_split=0., + min_impurity_split=1e-7, max_leaf_nodes=None): super(ExtraTreeRegressor, self).__init__( criterion=criterion, From b15f10256ae341ce042ab0d00e259bcc3fa508ce Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 26 Jul 2016 09:23:37 -0700 Subject: [PATCH 15/16] fix docstrings for new default --- sklearn/ensemble/forest.py | 10 +++++----- sklearn/ensemble/gradient_boosting.py | 4 ++-- sklearn/tree/tree.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 4208a4201838a..71464e1ca1864 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -805,7 +805,7 @@ class RandomForestClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - min_impurity_split : float, optional (default=0.) + min_impurity_split : float, optional (default=1e-7) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -1007,7 +1007,7 @@ class RandomForestRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - min_impurity_split : float, optional (default=0.) + min_impurity_split : float, optional (default=1e-7) Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. @@ -1172,7 +1172,7 @@ class ExtraTreesClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - min_impurity_split : float, optional (default=0.) + min_impurity_split : float, optional (default=1e-7) Threshold for early stopping in tree growth. 
         A node will split
         if its impurity is above the threshold, otherwise it is a leaf.
 
@@ -1373,7 +1373,7 @@ class ExtraTreesRegressor(ForestRegressor):
         If None then unlimited number of leaf nodes.
         If not None then ``max_depth`` will be ignored.
 
-    min_impurity_split : float, optional (default=0.)
+    min_impurity_split : float, optional (default=1e-7)
         Threshold for early stopping in tree growth. A node will split
         if its impurity is above the threshold, otherwise it is a leaf.
 
@@ -1524,7 +1524,7 @@ class RandomTreesEmbedding(BaseForest):
         If None then unlimited number of leaf nodes.
         If not None then ``max_depth`` will be ignored.
 
-    min_impurity_split : float, optional (default=0.)
+    min_impurity_split : float, optional (default=1e-7)
         Threshold for early stopping in tree growth. A node will split
         if its impurity is above the threshold, otherwise it is a leaf.
 
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index a5ed725e05e10..a37f047088687 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -1359,7 +1359,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
         If None then unlimited number of leaf nodes.
         If not None then ``max_depth`` will be ignored.
 
-    min_impurity_split : float, optional (default=0.)
+    min_impurity_split : float, optional (default=1e-7)
         Threshold for early stopping in tree growth. A node will split
         if its impurity is above the threshold, otherwise it is a leaf.
 
@@ -1718,7 +1718,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.
 
-    min_impurity_split : float, optional (default=0.)
+    min_impurity_split : float, optional (default=1e-7)
         Threshold for early stopping in tree growth. A node will split
         if its impurity is above the threshold, otherwise it is a leaf.
 
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index 907d46dce3ce3..c1aa0b3ab2578 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -615,7 +615,7 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
-    min_impurity_split : float, optional (default=0.)
+    min_impurity_split : float, optional (default=1e-7)
         Threshold for early stopping in tree growth. A node will split
         if its impurity is above the threshold, otherwise it is a leaf.
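The net effect of patches 14 and 15 is that the old hard-coded 1e-7 impurity cutoff survives only as the default of a now user-tunable parameter. A minimal sketch of the behaviour the updated test asserts — not part of the patch series, and assuming a scikit-learn build that includes the patches above:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# With an explicit threshold, a node becomes a leaf once its impurity
# drops to min_impurity_split or below, so every leaf's impurity ends
# up in the closed interval [0, 0.5].
est = DecisionTreeClassifier(min_impurity_split=0.5, random_state=0)
est.fit(iris.data, iris.target)

leaves = est.tree_.children_left == -1  # TREE_LEAF is -1
assert np.all(est.tree_.impurity[leaves] >= 0.0)
assert np.all(est.tree_.impurity[leaves] <= 0.5)

Left at the 1e-7 default, trees grow until leaves are pure up to numerical noise, which reproduces the pre-patch behaviour.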
From 9fce4fc11b6dc91ddad16f159f26d321f9c3e6fb Mon Sep 17 00:00:00 2001
From: Nelson Liu
Date: Wed, 27 Jul 2016 08:33:28 -0700
Subject: [PATCH 16/16] fix defaults in gradientboosting and forest classes

---
 sklearn/ensemble/forest.py            | 10 +++++-----
 sklearn/ensemble/gradient_boosting.py |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index 71464e1ca1864..1002c5967834e 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -903,7 +903,7 @@ def __init__(self,
                  min_weight_fraction_leaf=0.,
                  max_features="auto",
                  max_leaf_nodes=None,
-                 min_impurity_split=0.,
+                 min_impurity_split=1e-7,
                  bootstrap=True,
                  oob_score=False,
                  n_jobs=1,
@@ -1074,7 +1074,7 @@ def __init__(self,
                  min_weight_fraction_leaf=0.,
                  max_features="auto",
                  max_leaf_nodes=None,
-                 min_impurity_split=0.,
+                 min_impurity_split=1e-7,
                  bootstrap=True,
                  oob_score=False,
                  n_jobs=1,
@@ -1271,7 +1271,7 @@ def __init__(self,
                  min_weight_fraction_leaf=0.,
                  max_features="auto",
                  max_leaf_nodes=None,
-                 min_impurity_split=0.,
+                 min_impurity_split=1e-7,
                  bootstrap=False,
                  oob_score=False,
                  n_jobs=1,
@@ -1441,7 +1441,7 @@ def __init__(self,
                  min_weight_fraction_leaf=0.,
                  max_features="auto",
                  max_leaf_nodes=None,
-                 min_impurity_split=0.,
+                 min_impurity_split=1e-7,
                  bootstrap=False,
                  oob_score=False,
                  n_jobs=1,
@@ -1572,7 +1572,7 @@ def __init__(self,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.,
                  max_leaf_nodes=None,
-                 min_impurity_split=0.,
+                 min_impurity_split=1e-7,
                  sparse_output=True,
                  n_jobs=1,
                  random_state=None,
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index a37f047088687..eca5d3697fbe0 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -1442,8 +1442,8 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
     def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                  subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
-                 max_depth=3, min_impurity_split=0.,init=None, random_state=None,
-                 max_features=None, verbose=0,
+                 max_depth=3, min_impurity_split=1e-7, init=None,
+                 random_state=None, max_features=None, verbose=0,
                  max_leaf_nodes=None, warm_start=False,
                  presort='auto'):
 
@@ -1802,7 +1802,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
     def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                  subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
-                 max_depth=3, min_impurity_split=0., init=None, random_state=None,
+                 max_depth=3, min_impurity_split=1e-7, init=None, random_state=None,
                  max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
                  warm_start=False, presort='auto'):
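Patches 15 and 16 only touch docstrings and constructor defaults: the forest and gradient boosting classes already forward min_impurity_split to the trees they build. A short sketch of that end-to-end effect — again an illustration rather than part of the series, with an arbitrary 0.3 threshold:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
forest = RandomForestClassifier(n_estimators=10, min_impurity_split=0.3,
                                random_state=0)
forest.fit(iris.data, iris.target)

# The threshold is passed through to every base estimator, so each
# tree in the ensemble stops splitting nodes whose impurity is <= 0.3.
for sub in forest.estimators_:
    leaves = sub.tree_.children_left == -1
    assert np.all(sub.tree_.impurity[leaves] <= 0.3)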