diff --git a/doc/whats_new.rst b/doc/whats_new.rst index c47f4274f4dd8..7b9c90f8afa1f 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -125,6 +125,11 @@ New features `_) by `Nelson Liu`_. + - Added weighted impurity-based early stopping criterion for decision tree + growth. (`#6954 + <https://github.com/scikit-learn/scikit-learn/pull/6954>`_) by `Nelson + Liu`_ + Enhancements ............ diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index f76414066a92c..1002c5967834e 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -805,6 +805,10 @@ class RandomForestClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -899,6 +903,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + min_impurity_split=1e-7, bootstrap=True, oob_score=False, n_jobs=1, @@ -911,7 +916,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -928,6 +933,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split class RandomForestRegressor(ForestRegressor): @@ -1001,6 +1007,10 @@ class RandomForestRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. 
+ min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + bootstrap : boolean, optional (default=True) Whether bootstrap samples are used when building trees. @@ -1064,6 +1074,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + min_impurity_split=1e-7, bootstrap=True, oob_score=False, n_jobs=1, @@ -1075,7 +1086,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1091,6 +1102,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split class ExtraTreesClassifier(ForestClassifier): @@ -1160,6 +1172,10 @@ class ExtraTreesClassifier(ForestClassifier): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. 
@@ -1255,6 +1271,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + min_impurity_split=1e-7, bootstrap=False, oob_score=False, n_jobs=1, @@ -1267,7 +1284,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1284,6 +1301,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split class ExtraTreesRegressor(ForestRegressor): @@ -1355,6 +1373,10 @@ class ExtraTreesRegressor(ForestRegressor): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + bootstrap : boolean, optional (default=False) Whether bootstrap samples are used when building trees. 
@@ -1419,6 +1441,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, + min_impurity_split=1e-7, bootstrap=False, oob_score=False, n_jobs=1, @@ -1430,7 +1453,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=bootstrap, oob_score=oob_score, @@ -1446,6 +1469,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split class RandomTreesEmbedding(BaseForest): @@ -1500,6 +1524,10 @@ class RandomTreesEmbedding(BaseForest): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + sparse_output : bool, optional (default=True) Whether or not to return a sparse CSR matrix, as default behavior, or to return a dense array compatible with dense pipeline operators. 
@@ -1544,6 +1572,7 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, + min_impurity_split=1e-7, sparse_output=True, n_jobs=1, random_state=None, @@ -1554,7 +1583,7 @@ def __init__(self, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", + "max_features", "max_leaf_nodes", "min_impurity_split", "random_state"), bootstrap=False, oob_score=False, @@ -1570,6 +1599,7 @@ def __init__(self, self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = 1 self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output def _set_oob_score(self, X, y): diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 1b0767d419168..eca5d3697fbe0 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -722,7 +722,7 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble, @abstractmethod def __init__(self, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, - max_depth, init, subsample, max_features, + max_depth, min_impurity_split, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -736,6 +736,7 @@ def __init__(self, loss, learning_rate, n_estimators, criterion, self.subsample = subsample self.max_features = max_features self.max_depth = max_depth + self.min_impurity_split = min_impurity_split self.init = init self.random_state = random_state self.alpha = alpha @@ -1358,6 +1359,10 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. 
+ min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + init : BaseEstimator, None, optional (default=None) An estimator object that is used to compute the initial predictions. ``init`` has to provide ``fit`` and ``predict``. @@ -1437,8 +1442,8 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., - max_depth=3, init=None, random_state=None, - max_features=None, verbose=0, + max_depth=3, min_impurity_split=1e-7, init=None, + random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -1450,7 +1455,9 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, max_depth=max_depth, init=init, subsample=subsample, max_features=max_features, random_state=random_state, verbose=verbose, - max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, + max_leaf_nodes=max_leaf_nodes, + min_impurity_split=min_impurity_split, + warm_start=warm_start, presort=presort) def _validate_y(self, y): @@ -1711,6 +1718,10 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + alpha : float (default=0.9) The alpha-quantile of the huber loss function and the quantile loss function. Only if ``loss='huber'`` or ``loss='quantile'``. 
@@ -1791,7 +1802,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., - max_depth=3, init=None, random_state=None, + max_depth=3, min_impurity_split=1e-7, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -1801,7 +1812,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, - max_features=max_features, + max_features=max_features, min_impurity_split=min_impurity_split, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, presort=presort) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 007b7a7860342..dbf0545b1e1d5 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -4,6 +4,7 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -95,6 +96,7 @@ cdef class TreeBuilder: cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf cdef SIZE_t max_depth # Maximal tree depth + cdef double min_impurity_split # Impurity threshold for early stopping cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=*, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..f3db4a197580a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -12,6 +12,7 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -63,7 +64,6 @@ TREE_UNDEFINED = -2 cdef SIZE_t _TREE_LEAF = TREE_LEAF cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED cdef SIZE_t INITIAL_STACK_SIZE = 10 -cdef DTYPE_t 
MIN_IMPURITY_SPLIT = 1e-7 # Repeat struct definition for numpy NODE_DTYPE = np.dtype({ @@ -131,12 +131,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth): + SIZE_t max_depth, double min_impurity_split): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth + self.min_impurity_split = min_impurity_split cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, @@ -166,6 +167,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf cdef SIZE_t min_samples_split = self.min_samples_split + cdef double min_impurity_split = self.min_impurity_split # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight_ptr, X_idx_sorted) @@ -223,7 +225,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): impurity = splitter.node_impurity() first = 0 - is_leaf = is_leaf or (impurity <= MIN_IMPURITY_SPLIT) + is_leaf = (is_leaf or + (impurity <= min_impurity_split)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) @@ -289,13 +292,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes): + SIZE_t max_depth, SIZE_t max_leaf_nodes, + double min_impurity_split): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split cpdef build(self, Tree tree, object X, np.ndarray y, np.ndarray sample_weight=None, @@ -421,6 +426,7 @@ cdef 
class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 cdef double weighted_n_samples = splitter.weighted_n_samples + cdef double min_impurity_split = self.min_impurity_split cdef double weighted_n_node_samples cdef bint is_leaf cdef SIZE_t n_left, n_right @@ -436,7 +442,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): (n_node_samples < self.min_samples_split) or (n_node_samples < 2 * self.min_samples_leaf) or (weighted_n_node_samples < self.min_weight_leaf) or - (impurity <= MIN_IMPURITY_SPLIT)) + (impurity <= min_impurity_split)) if not is_leaf: splitter.node_split(impurity, &split, &n_constant_features) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 78a35fe5becc1..231d12d539e5a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -524,6 +524,7 @@ def test_error(): X, y) assert_raises(ValueError, TreeEstimator(max_depth=-1).fit, X, y) assert_raises(ValueError, TreeEstimator(max_features=42).fit, X, y) + assert_raises(ValueError, TreeEstimator(min_impurity_split=-1.0).fit, X, y) # Wrong dimensions est = TreeEstimator() @@ -681,6 +682,56 @@ def test_min_weight_fraction_leaf(): yield check_min_weight_fraction_leaf, name, "multilabel", True +def test_min_impurity_split(): + # test if min_impurity_split creates leaves with impurity + # [0, min_impurity_split] when min_samples_leaf = 1 and + # min_samples_split = 2. 
+ X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE)) + y = iris.target + + # test both DepthFirstTreeBuilder and BestFirstTreeBuilder + # by setting max_leaf_nodes + for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): + TreeEstimator = ALL_TREES[name] + min_impurity_split = .5 + + # verify that with the default min_impurity_split (at most + # 1e-7) every leaf node is pure, i.e. has impurity 0 + est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, + random_state=0) + assert_less_equal(est.min_impurity_split, 1e-7, + "Failed, min_impurity_split = {0} > 1e-7".format( + est.min_impurity_split)) + est.fit(X, y) + for node in range(est.tree_.node_count): + if (est.tree_.children_left[node] == TREE_LEAF or + est.tree_.children_right[node] == TREE_LEAF): + assert_equal(est.tree_.impurity[node], 0., + "Failed with {0} " + "min_impurity_split={1}".format( + est.tree_.impurity[node], + est.min_impurity_split)) + + # verify leaf impurity is in [0, min_impurity_split] when it is set + est = TreeEstimator(max_leaf_nodes=max_leaf_nodes, + min_impurity_split=min_impurity_split, + random_state=0) + est.fit(X, y) + for node in range(est.tree_.node_count): + if (est.tree_.children_left[node] == TREE_LEAF or + est.tree_.children_right[node] == TREE_LEAF): + assert_greater_equal(est.tree_.impurity[node], 0, + "Failed with {0}, " + "min_impurity_split={1}".format( + est.tree_.impurity[node], + est.min_impurity_split)) + assert_less_equal(est.tree_.impurity[node], min_impurity_split, + "Failed with {0}, " + "min_impurity_split={1}".format( + est.tree_.impurity[node], + est.min_impurity_split)) + + def test_pickle(): for name, TreeEstimator in ALL_TREES.items(): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index f004d845279bc..c1aa0b3ab2578 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -10,6 +10,7 @@ # Satrajit Gosh # Joly Arnaud # Fares Hedayati +# Nelson Liu # # License: BSD 3 clause @@ -89,6 +90,7 @@ def __init__(self, max_features, max_leaf_nodes, 
random_state, + min_impurity_split, class_weight=None, presort=False): self.criterion = criterion @@ -100,6 +102,7 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.presort = presort @@ -304,6 +307,10 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: min_weight_leaf = 0. + if self.min_impurity_split < 0.: + raise ValueError("min_impurity_split must be greater than or equal " + "to 0") + presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, # otherwise it will be False. @@ -359,13 +366,13 @@ def fit(self, X, y, sample_weight=None, check_input=True, builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, - max_depth) + max_depth, self.min_impurity_split) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, - max_leaf_nodes) + max_leaf_nodes, self.min_impurity_split) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) @@ -608,6 +615,10 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None, the random number generator is the RandomState instance used by `np.random`. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in fitting. 
For the default settings of a decision tree on large @@ -685,6 +696,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, + min_impurity_split=1e-7, class_weight=None, presort=False): super(DecisionTreeClassifier, self).__init__( @@ -698,6 +710,7 @@ def __init__(self, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, + min_impurity_split=min_impurity_split, presort=presort) def predict_proba(self, X, check_input=True): @@ -848,6 +861,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): If None, the random number generator is the RandomState instance used by `np.random`. + min_impurity_split : float, optional (default=1e-7) + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large @@ -917,6 +934,7 @@ def __init__(self, max_features=None, random_state=None, max_leaf_nodes=None, + min_impurity_split=1e-7, presort=False): super(DecisionTreeRegressor, self).__init__( criterion=criterion, @@ -928,6 +946,7 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state, + min_impurity_split=min_impurity_split, presort=presort) @@ -965,6 +984,7 @@ def __init__(self, max_features="auto", random_state=None, max_leaf_nodes=None, + min_impurity_split=1e-7, class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, @@ -976,6 +996,7 @@ def __init__(self, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, + min_impurity_split=min_impurity_split, random_state=random_state) @@ -1012,6 +1033,7 @@ def __init__(self, min_weight_fraction_leaf=0., max_features="auto", random_state=None, + min_impurity_split=1e-7, max_leaf_nodes=None): super(ExtraTreeRegressor, self).__init__( 
criterion=criterion, @@ -1022,4 +1044,5 @@ def __init__(self, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, + min_impurity_split=min_impurity_split, random_state=random_state)