Merge pull request #8 from glouppe/gbm · glouppe/scikit-learn@f172759 · GitHub


Commit f172759

Merge pull request #8 from glouppe/gbm
Add max_leaf_nodes to forest + DOC
2 parents 2ff06a2 + c281adf commit f172759

File tree

3 files changed: 81 additions, 39 deletions

sklearn/ensemble/forest.py
sklearn/ensemble/gradient_boosting.py
sklearn/tree/tree.py


sklearn/ensemble/forest.py

Lines changed: 45 additions & 5 deletions
@@ -660,6 +660,12 @@ class RandomForestClassifier(ForestClassifier):
         ``min_samples_leaf`` samples.
         Note: this parameter is tree-specific.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+        Note: this parameter is tree-specific.
+
     bootstrap : boolean, optional (default=True)
         Whether bootstrap samples are used when building trees.

@@ -721,6 +727,7 @@ def __init__(self,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  max_features="auto",
+                 max_leaf_nodes=None,
                  bootstrap=True,
                  oob_score=False,
                  n_jobs=1,
@@ -733,7 +740,7 @@ def __init__(self,
             n_estimators=n_estimators,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "max_features",
-                              "random_state"),
+                              "max_leaf_nodes", "random_state"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
@@ -745,6 +752,7 @@ def __init__(self,
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes

         if min_density is not None:
             warn("The min_density parameter is deprecated as of version 0.14 "
@@ -804,6 +812,12 @@ class RandomForestRegressor(ForestRegressor):
         ``min_samples_leaf`` samples.
         Note: this parameter is tree-specific.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+        Note: this parameter is tree-specific.
+
     bootstrap : boolean, optional (default=True)
         Whether bootstrap samples are used when building trees.

@@ -854,6 +868,7 @@ def __init__(self,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  max_features="auto",
+                 max_leaf_nodes=None,
                  bootstrap=True,
                  oob_score=False,
                  n_jobs=1,
@@ -866,7 +881,7 @@ def __init__(self,
             n_estimators=n_estimators,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "max_features",
-                              "random_state"),
+                              "max_leaf_nodes", "random_state"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
@@ -878,6 +893,7 @@ def __init__(self,
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes

         if min_density is not None:
             warn("The min_density parameter is deprecated as of version 0.14 "
@@ -938,6 +954,12 @@ class ExtraTreesClassifier(ForestClassifier):
         ``min_samples_leaf`` samples.
         Note: this parameter is tree-specific.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+        Note: this parameter is tree-specific.
+
     bootstrap : boolean, optional (default=False)
         Whether bootstrap samples are used when building trees.

@@ -1002,6 +1024,7 @@ def __init__(self,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  max_features="auto",
+                 max_leaf_nodes=None,
                  bootstrap=False,
                  oob_score=False,
                  n_jobs=1,
@@ -1014,7 +1037,7 @@ def __init__(self,
             n_estimators=n_estimators,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "max_features",
-                              "random_state"),
+                              "max_leaf_nodes", "random_state"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
@@ -1026,6 +1049,7 @@ def __init__(self,
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes

         if min_density is not None:
             warn("The min_density parameter is deprecated as of version 0.14 "
@@ -1086,6 +1110,12 @@ class ExtraTreesRegressor(ForestRegressor):
         ``min_samples_leaf`` samples.
         Note: this parameter is tree-specific.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+        Note: this parameter is tree-specific.
+
     bootstrap : boolean, optional (default=False)
         Whether bootstrap samples are used when building trees.
         Note: this parameter is tree-specific.
@@ -1139,6 +1169,7 @@ def __init__(self,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  max_features="auto",
+                 max_leaf_nodes=None,
                  bootstrap=False,
                  oob_score=False,
                  n_jobs=1,
@@ -1151,7 +1182,7 @@ def __init__(self,
             n_estimators=n_estimators,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "max_features",
-                              "random_state"),
+                              "max_leaf_nodes", "random_state"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
@@ -1163,6 +1194,7 @@ def __init__(self,
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_features = 1
+        self.max_leaf_nodes = max_leaf_nodes

         if min_density is not None:
             warn("The min_density parameter is deprecated as of version 0.14 "
@@ -1205,6 +1237,12 @@ class RandomTreesEmbedding(BaseForest):
         ``min_samples_leaf`` samples.
         Note: this parameter is tree-specific.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+        Note: this parameter is tree-specific.
+
     n_jobs : integer, optional (default=1)
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
@@ -1238,6 +1276,7 @@ def __init__(self,
                  max_depth=5,
                  min_samples_split=2,
                  min_samples_leaf=1,
+                 max_leaf_nodes=None,
                  n_jobs=1,
                  random_state=None,
                  verbose=0,
@@ -1247,7 +1286,7 @@ def __init__(self,
             n_estimators=n_estimators,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "max_features",
-                              "random_state"),
+                              "max_leaf_nodes", "random_state"),
             bootstrap=False,
             oob_score=False,
             n_jobs=n_jobs,
@@ -1259,6 +1298,7 @@ def __init__(self,
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_features = 1
+        self.max_leaf_nodes = max_leaf_nodes

         if min_density is not None:
             warn("The min_density parameter is deprecated as of version 0.14 "

sklearn/ensemble/gradient_boosting.py

Lines changed: 13 additions & 13 deletions
@@ -519,7 +519,7 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble)):
     @abstractmethod
     def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
                  min_samples_leaf, max_depth, init, subsample, max_features,
-                 random_state, alpha=0.9, verbose=0, max_leaf_nodes=-1):
+                 random_state, alpha=0.9, verbose=0, max_leaf_nodes=None):

         self.n_estimators = n_estimators
         self.learning_rate = learning_rate
@@ -1012,16 +1012,16 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
         Choosing `max_features < n_features` leads to a reduction of variance
         and an increase in bias.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
     init : BaseEstimator, None, optional (default=None)
         An estimator object that is used to compute the initial
         predictions. ``init`` has to provide ``fit`` and ``predict``.
         If None it uses ``loss.init_estimator``.

-    max_leaf_nodes : bool (default=True)
-        Whether to grow a max_leaf_nodes binary tree (subject to stopping conditions
-        of max_depth, and minimum samples) or a greedy tree branch of
-        ``max_depth`` and with ``max_depth + 1`` leaf nodes.
-
     verbose : int, default: 0
         Enable verbose output. If 1 then it prints progress and performance
         once in a while (the more trees the lower the frequency).
@@ -1080,7 +1080,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                  subsample=1.0, min_samples_split=2, min_samples_leaf=1,
                  max_depth=3, init=None, random_state=None,
                  max_features=None, verbose=0,
-                 max_leaf_nodes=-1):
+                 max_leaf_nodes=None):

         super(GradientBoostingClassifier, self).__init__(
             loss, learning_rate, n_estimators, min_samples_split,
@@ -1298,6 +1298,11 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
         Choosing `max_features < n_features` leads to a reduction of variance
         and an increase in bias.

+    max_leaf_nodes : int or None, optional (default=None)
+        Grow trees with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
     alpha : float (default=0.9)
         The alpha-quantile of the huber loss function and the quantile
         loss function. Only if ``loss='huber'`` or ``loss='quantile'``.
@@ -1307,11 +1312,6 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
         predictions. ``init`` has to provide ``fit`` and ``predict``.
         If None it uses ``loss.init_estimator``.

-    max_leaf_nodes : bool (default=True)
-        Whether to grow a max_leaf_nodes binary tree (subject to stopping conditions
-        of max_depth, and minimum samples) or a greedy tree branch of
-        ``max_depth`` and with ``max_depth + 1`` leaf nodes.
-
     verbose : int, default: 0
         Enable verbose output. If 1 then it prints progress and performance
         once in a while (the more trees the lower the frequency).
@@ -1369,7 +1369,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
     def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                  subsample=1.0, min_samples_split=2, min_samples_leaf=1,
                  max_depth=3, init=None, random_state=None,
-                 max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=-1):
+                 max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None):

         super(GradientBoostingRegressor, self).__init__(
             loss, learning_rate, n_estimators, min_samples_split,
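
The user-facing effect of the gradient boosting changes is the default value only: max_leaf_nodes now defaults to None instead of the internal -1 sentinel. A rough sketch, with the dataset and hyperparameters chosen arbitrarily for illustration:

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=200, random_state=0)

# Passing an int grows each stage's regression tree best-first with at most
# that many leaves; leaving it at None keeps the classic depth-limited trees.
est = GradientBoostingRegressor(n_estimators=50, max_leaf_nodes=4, random_state=0)
est.fit(X, y)
print(est.predict(X[:2]))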

sklearn/tree/tree.py

Lines changed: 23 additions & 21 deletions
@@ -175,6 +175,7 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True,

         # Check parameters
         max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth
+        max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes

         if isinstance(self.max_features, six.string_types):
             if self.max_features == "auto":
@@ -210,7 +211,7 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True,
             raise ValueError("max_depth must be greater than zero. ")
         if not (0 < max_features <= self.n_features_):
             raise ValueError("max_features must be in (0, n_features]")
-        if not isinstance(self.max_leaf_nodes, (numbers.Integral, np.integer)):
+        if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
             raise ValueError("max_leaf_nodes must be integral number")
         if -1 < self.max_leaf_nodes < 2:
             raise ValueError("max_leaf_nodes must be either smaller than 0 or larger than 1")
@@ -252,14 +253,15 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True,
         self.tree_ = Tree(self.n_features_, self.n_classes_,
                           self.n_outputs_, splitter, max_depth,
                           min_samples_split, self.min_samples_leaf,
-                          self.max_leaf_nodes, random_state)
+                          max_leaf_nodes, random_state)

         # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
-        if self.max_leaf_nodes < 0:
-            tree_builder = DepthFirstTreeBuilder()
+        if max_leaf_nodes < 0:
+            builder = DepthFirstTreeBuilder()
         else:
-            tree_builder = BestFirstTreeBuilder()
-        tree_builder.build(self.tree_, X, y, sample_weight)
+            builder = BestFirstTreeBuilder()
+
+        builder.build(self.tree_, X, y, sample_weight)

         if self.n_outputs_ == 1:
             self.n_classes_ = self.n_classes_[0]
@@ -376,16 +378,16 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
         all leaves are pure or until all leaves contain less than
         min_samples_split samples.

-    min_samples_split : integer, optional (default=2)
+    min_samples_split : int, optional (default=2)
         The minimum number of samples required to split an internal node.

-    min_samples_leaf : integer, optional (default=1)
+    min_samples_leaf : int, optional (default=1)
         The minimum number of samples required to be at a leaf node.

-    max_leaf_nodes : int (default=-1)
+    max_leaf_nodes : int or None, optional (default=None)
         Grow a tree with ``max_leaf_nodes`` in best-first fashion.
-        Best nodes are defines as relative reduction in impurity.
-        If < 0 then unlimited number of leaf nodes.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.

     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
@@ -457,7 +459,7 @@ def __init__(self,
                  random_state=None,
                  min_density=None,
                  compute_importances=None,
-                 max_leaf_nodes=-1):
+                 max_leaf_nodes=None):
         super(DecisionTreeClassifier, self).__init__(criterion=criterion,
                                                      splitter=splitter,
                                                      max_depth=max_depth,
@@ -580,21 +582,21 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
         - If "log2", then `max_features=log2(n_features)`.
         - If None, then `max_features=n_features`.

-    max_depth : integer or None, optional (default=None)
+    max_depth : int or None, optional (default=None)
         The maximum depth of the tree. If None, then nodes are expanded until
         all leaves are pure or until all leaves contain less than
         min_samples_split samples.

-    min_samples_split : integer, optional (default=2)
+    min_samples_split : int, optional (default=2)
         The minimum number of samples required to split an internal node.

-    min_samples_leaf : integer, optional (default=1)
+    min_samples_leaf : int, optional (default=1)
         The minimum number of samples required to be at a leaf node.

-    max_leaf_nodes : int (default=-1)
+    max_leaf_nodes : int or None, optional (default=None)
         Grow a tree with ``max_leaf_nodes`` in best-first fashion.
-        Best nodes are defines as relative reduction in impurity.
-        If < 0 then unlimited number of leaf nodes.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.

     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
@@ -658,7 +660,7 @@ def __init__(self,
                  random_state=None,
                  min_density=None,
                  compute_importances=None,
-                 max_leaf_nodes=-1):
+                 max_leaf_nodes=None):
         super(DecisionTreeRegressor, self).__init__(criterion=criterion,
                                                     splitter=splitter,
                                                     max_depth=max_depth,
@@ -711,7 +713,7 @@ def __init__(self,
                  random_state=None,
                  min_density=None,
                  compute_importances=None,
-                 max_leaf_nodes=-1):
+                 max_leaf_nodes=None):
         super(ExtraTreeClassifier, self).__init__(criterion=criterion,
                                                   splitter=splitter,
                                                   max_depth=max_depth,
@@ -764,7 +766,7 @@ def __init__(self,
                  random_state=None,
                  min_density=None,
                  compute_importances=None,
-                 max_leaf_nodes=-1):
+                 max_leaf_nodes=None):
         super(ExtraTreeRegressor, self).__init__(criterion=criterion,
                                                  splitter=splitter,
                                                  max_depth=max_depth,
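
Taken together, the tree-level change makes None the public "no limit" value while the -1 sentinel stays internal, and a best-first builder is used only when an explicit cap is given. A small sketch on a toy dataset, with the cap of 4 chosen only for illustration:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Default (max_leaf_nodes=None): depth-first growth with no leaf cap.
unlimited = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)

# Explicit cap: the tree is grown best-first and stops at 4 leaves.
capped = DecisionTreeClassifier(max_leaf_nodes=4, random_state=0).fit(iris.data, iris.target)

# Leaves are the nodes whose children_left entry is -1 in the low-level tree arrays.
print(np.sum(capped.tree_.children_left == -1))     # at most 4 by construction
print(np.sum(unlimited.tree_.children_left == -1))  # typically more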

0 commit comments
