diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 3a5290dae81fb..9444f96f3fa3b 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -183,6 +183,18 @@ Changelog
   and :class:`ensemble.HistGradientBoostingRegressor` instead.
   :pr:`14907` by `Adrin Jalali`_.
 
+- |Enhancement| Addition of the ``max_samples`` argument allows limiting
+  the size of bootstrap samples to be less than the size of the dataset.
+  Added to :class:`ensemble.forest.ForestClassifier`,
+  :class:`ensemble.forest.ForestRegressor`,
+  :class:`ensemble.forest.RandomForestClassifier`,
+  :class:`ensemble.forest.RandomForestRegressor`,
+  :class:`ensemble.forest.ExtraTreesClassifier`,
+  :class:`ensemble.forest.ExtraTreesRegressor`,
+  :class:`ensemble.forest.RandomTreesEmbedding`. :pr:`14682` by
+  :user:`Matt Hancock <notmatthancock>` and
+  :pr:`5963` by :user:`Pablo Duboue <DrDub>`.
+
 :mod:`sklearn.feature_extraction`
 .................................
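For context, here is a minimal usage sketch of the new parameter (the dataset
and settings below are illustrative, not taken from the PR):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=1000, random_state=0)

    # With max_samples=0.5, each tree is fit on a bootstrap sample of
    # 500 rows (0.5 * 1000) instead of the full 1000.
    clf = RandomForestClassifier(n_estimators=50, max_samples=0.5,
                                 random_state=0)
    clf.fit(X, y)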
+ """ + if max_samples is None: + return n_samples + + if isinstance(max_samples, numbers.Integral): + if not (1 <= max_samples <= n_samples): + msg = "`max_samples` must be in range 1 to {} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) + return max_samples + + if isinstance(max_samples, numbers.Real): + if not (0 < max_samples < 1): + msg = "`max_samples` must be in range (0, 1) but got value {}" + raise ValueError(msg.format(max_samples)) + return int(round(n_samples * max_samples)) + + msg = "`max_samples` should be int or float, but got type '{}'" + raise TypeError(msg.format(type(max_samples))) + + +def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): """Private function used to _parallel_build_trees function.""" + random_instance = check_random_state(random_state) - sample_indices = random_instance.randint(0, n_samples, n_samples) + sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) return sample_indices -def _generate_unsampled_indices(random_state, n_samples): +def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): """Private function used to forest._set_oob_score function.""" - sample_indices = _generate_sample_indices(random_state, n_samples) + sample_indices = _generate_sample_indices(random_state, n_samples, + n_samples_bootstrap) sample_counts = np.bincount(sample_indices, minlength=n_samples) unsampled_mask = sample_counts == 0 indices_range = np.arange(n_samples) @@ -92,7 +133,8 @@ def _generate_unsampled_indices(random_state, n_samples): def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, - verbose=0, class_weight=None): + verbose=0, class_weight=None, + n_samples_bootstrap=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -104,7 +146,8 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, else: curr_sample_weight = sample_weight.copy() - indices = _generate_sample_indices(tree.random_state, n_samples) + indices = _generate_sample_indices(tree.random_state, n_samples, + n_samples_bootstrap) sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts @@ -140,7 +183,8 @@ def __init__(self, random_state=None, verbose=0, warm_start=False, - class_weight=None): + class_weight=None, + max_samples=None): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -153,6 +197,7 @@ def __init__(self, self.verbose = verbose self.warm_start = warm_start self.class_weight = class_weight + self.max_samples = max_samples def apply(self, X): """Apply trees in the forest to X, return leaf indices. 
@@ -92,7 +133,8 @@ def _generate_unsampled_indices(random_state, n_samples):
 
 
 def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
-                          verbose=0, class_weight=None):
+                          verbose=0, class_weight=None,
+                          n_samples_bootstrap=None):
     """Private function used to fit a single tree in parallel."""
     if verbose > 1:
         print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -104,7 +146,8 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         else:
             curr_sample_weight = sample_weight.copy()
 
-        indices = _generate_sample_indices(tree.random_state, n_samples)
+        indices = _generate_sample_indices(tree.random_state, n_samples,
+                                           n_samples_bootstrap)
         sample_counts = np.bincount(indices, minlength=n_samples)
         curr_sample_weight *= sample_counts
 
@@ -140,7 +183,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 max_samples=None):
         super().__init__(
             base_estimator=base_estimator,
             n_estimators=n_estimators,
@@ -153,6 +197,7 @@ def __init__(self,
         self.verbose = verbose
         self.warm_start = warm_start
         self.class_weight = class_weight
+        self.max_samples = max_samples
 
     def apply(self, X):
         """Apply trees in the forest to X, return leaf indices.
@@ -277,6 +322,12 @@ def fit(self, X, y, sample_weight=None):
         else:
             sample_weight = expanded_class_weight
 
+        # Get bootstrap sample size
+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples=X.shape[0],
+            max_samples=self.max_samples
+        )
+
         # Check parameters
         self._validate_estimator()
 
@@ -320,7 +371,8 @@ def fit(self, X, y, sample_weight=None):
                              **_joblib_parallel_args(prefer='threads'))(
                 delayed(_parallel_build_trees)(
                     t, self, X, y, sample_weight, i, len(trees),
-                    verbose=self.verbose, class_weight=self.class_weight)
+                    verbose=self.verbose, class_weight=self.class_weight,
+                    n_samples_bootstrap=n_samples_bootstrap)
                 for i, t in enumerate(trees))
 
             # Collect newly grown trees
@@ -410,7 +462,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 class_weight=None):
+                 class_weight=None,
+                 max_samples=None):
         super().__init__(
             base_estimator,
             n_estimators=n_estimators,
@@ -421,7 +474,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples)
 
     def _set_oob_score(self, X, y):
         """Compute out-of-bag score"""
@@ -435,9 +489,13 @@ def _set_oob_score(self, X, y):
         predictions = [np.zeros((n_samples, n_classes_[k]))
                        for k in range(self.n_outputs_)]
 
+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples, self.max_samples
+        )
+
         for estimator in self.estimators_:
             unsampled_indices = _generate_unsampled_indices(
-                estimator.random_state, n_samples)
+                estimator.random_state, n_samples, n_samples_bootstrap)
             p_estimator = estimator.predict_proba(X[unsampled_indices, :],
                                                   check_input=False)
 
@@ -650,7 +708,8 @@ def __init__(self,
                  n_jobs=None,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 max_samples=None):
         super().__init__(
             base_estimator,
             n_estimators=n_estimators,
@@ -660,7 +719,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
     def predict(self, X):
         """Predict regression target for X.
@@ -713,9 +773,13 @@ def _set_oob_score(self, X, y):
         predictions = np.zeros((n_samples, self.n_outputs_))
         n_predictions = np.zeros((n_samples, self.n_outputs_))
 
+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples, self.max_samples
+        )
+
         for estimator in self.estimators_:
             unsampled_indices = _generate_unsampled_indices(
-                estimator.random_state, n_samples)
+                estimator.random_state, n_samples, n_samples_bootstrap)
             p_estimator = estimator.predict(
                 X[unsampled_indices, :], check_input=False)
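The out-of-bag bookkeeping threaded through the ``_set_oob_score`` hunks above
works by counting bootstrap draws per row; rows never drawn are out-of-bag. A
small illustration with made-up sizes (not from the diff):

    import numpy as np

    n_samples, n_samples_bootstrap = 8, 4  # bootstrap smaller than dataset
    rng = np.random.RandomState(0)

    sample_indices = rng.randint(0, n_samples, n_samples_bootstrap)
    sample_counts = np.bincount(sample_indices, minlength=n_samples)
    unsampled_indices = np.arange(n_samples)[sample_counts == 0]

    # The smaller the bootstrap sample, the more rows end up out-of-bag,
    # so each tree is scored on more held-out rows.
    print(sample_indices, unsampled_indices)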
@@ -922,6 +986,16 @@ class RandomForestClassifier(ForestClassifier):
 
         .. versionadded:: 0.22
 
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1)`.
+
+        .. versionadded:: 0.22
+
     Attributes
     ----------
     base_estimator_ : DecisionTreeClassifier
@@ -1017,7 +1091,8 @@ def __init__(self,
                  verbose=0,
                  warm_start=False,
                  class_weight=None,
-                 ccp_alpha=0.0):
+                 ccp_alpha=0.0,
+                 max_samples=None):
         super().__init__(
             base_estimator=DecisionTreeClassifier(),
             n_estimators=n_estimators,
@@ -1032,7 +1107,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1198,6 +1274,16 @@ class RandomForestRegressor(ForestRegressor):
 
         .. versionadded:: 0.22
 
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1)`.
+
+        .. versionadded:: 0.22
+
     Attributes
     ----------
     base_estimator_ : DecisionTreeRegressor
@@ -1287,7 +1373,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 ccp_alpha=0.0):
+                 ccp_alpha=0.0,
+                 max_samples=None):
         super().__init__(
             base_estimator=DecisionTreeRegressor(),
             n_estimators=n_estimators,
@@ -1301,7 +1388,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1485,6 +1573,16 @@ class ExtraTreesClassifier(ForestClassifier):
 
         .. versionadded:: 0.22
 
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1)`.
+
+        .. versionadded:: 0.22
+
     Attributes
     ----------
     base_estimator_ : ExtraTreeClassifier
@@ -1560,7 +1658,8 @@ def __init__(self,
                  verbose=0,
                  warm_start=False,
                  class_weight=None,
-                 ccp_alpha=0.0):
+                 ccp_alpha=0.0,
+                 max_samples=None):
         super().__init__(
             base_estimator=ExtraTreeClassifier(),
             n_estimators=n_estimators,
@@ -1575,7 +1674,8 @@ def __init__(self,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
@@ -1738,6 +1838,16 @@ class ExtraTreesRegressor(ForestRegressor):
 
         .. versionadded:: 0.22
 
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1)`.
+
+        .. versionadded:: 0.22
+
     Attributes
     ----------
     base_estimator_ : ExtraTreeRegressor
@@ -1800,7 +1910,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 ccp_alpha=0.0):
+                 ccp_alpha=0.0,
+                 max_samples=None):
         super().__init__(
             base_estimator=ExtraTreeRegressor(),
             n_estimators=n_estimators,
@@ -1814,7 +1925,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
         self.criterion = criterion
         self.max_depth = max_depth
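As the docstring added to each estimator notes, ``max_samples`` only takes
effect when ``bootstrap`` is True. The extra-trees estimators default to
``bootstrap=False``, so bootstrapping has to be enabled explicitly for the
parameter to matter; a hedged sketch with illustrative values:

    from sklearn.datasets import make_regression
    from sklearn.ensemble import ExtraTreesRegressor

    X, y = make_regression(n_samples=400, random_state=0)

    # bootstrap=True is required here: ExtraTreesRegressor defaults to
    # bootstrap=False, in which case max_samples has no effect.
    est = ExtraTreesRegressor(n_estimators=20, bootstrap=True,
                              max_samples=100, random_state=0)
    est.fit(X, y)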
@@ -1954,6 +2066,16 @@ class RandomTreesEmbedding(BaseForest):
 
         .. versionadded:: 0.22
 
+    max_samples : int or float, default=None
+        If bootstrap is True, the number of samples to draw from X
+        to train each base estimator.
+        - If None (default), then draw `X.shape[0]` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples. Thus,
+          `max_samples` should be in the interval `(0, 1)`.
+
+        .. versionadded:: 0.22
+
     Attributes
     ----------
     estimators_ : list of DecisionTreeClassifier
@@ -1986,7 +2108,8 @@ def __init__(self,
                  random_state=None,
                  verbose=0,
                  warm_start=False,
-                 ccp_alpha=0.0):
+                 ccp_alpha=0.0,
+                 max_samples=None):
         super().__init__(
             base_estimator=ExtraTreeRegressor(),
             n_estimators=n_estimators,
@@ -2000,7 +2123,8 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)
 
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
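The tests below pin down the validation behavior added in
``_get_n_samples_bootstrap``: invalid values are rejected when ``fit`` is
called, not at construction time. A quick interactive check (toy data,
illustrative only):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    X = np.arange(12, dtype=float).reshape(6, 2)
    y = np.array([0, 0, 0, 1, 1, 1])

    try:
        # Floats must lie strictly inside (0, 1); 2.0 is rejected.
        RandomForestClassifier(n_estimators=5, max_samples=2.0).fit(X, y)
    except ValueError as exc:
        print(exc)  # `max_samples` must be in range (0, 1) but got value 2.0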
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
index 01102c9679053..b41d0e0e1ba13 100644
--- a/sklearn/ensemble/tests/test_forest.py
+++ b/sklearn/ensemble/tests/test_forest.py
@@ -1330,3 +1330,65 @@ def test_forest_degenerate_feature_importances():
     gbr = RandomForestRegressor(n_estimators=10).fit(X, y)
     assert_array_equal(gbr.feature_importances_,
                        np.zeros(10, dtype=np.float64))
+
+
+@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS)
+@pytest.mark.parametrize(
+    'max_samples, exc_type, exc_msg',
+    [(int(1e9), ValueError,
+      "`max_samples` must be in range 1 to 6 but got value 1000000000"),
+     (1.0, ValueError,
+      r"`max_samples` must be in range \(0, 1\) but got value 1.0"),
+     (2.0, ValueError,
+      r"`max_samples` must be in range \(0, 1\) but got value 2.0"),
+     (0.0, ValueError,
+      r"`max_samples` must be in range \(0, 1\) but got value 0.0"),
+     (np.nan, ValueError,
+      r"`max_samples` must be in range \(0, 1\) but got value nan"),
+     (np.inf, ValueError,
+      r"`max_samples` must be in range \(0, 1\) but got value inf"),
+     ('str max_samples?!', TypeError,
+      r"`max_samples` should be int or float, but got "
+      r"type '\<class \'str\'\>'"),
+     (np.ones(2), TypeError,
+      r"`max_samples` should be int or float, but got type "
+      r"'\<class \'numpy.ndarray\'\>'")]
+)
+def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg):
+    # Check invalid `max_samples` values
+    est = FOREST_CLASSIFIERS_REGRESSORS[name](max_samples=max_samples)
+    with pytest.raises(exc_type, match=exc_msg):
+        est.fit(X, y)
+
+
+@pytest.mark.parametrize(
+    'ForestClass', [RandomForestClassifier, RandomForestRegressor]
+)
+def test_little_tree_with_small_max_samples(ForestClass):
+    rng = np.random.RandomState(1)
+
+    X = rng.randn(10000, 2)
+    y = rng.randn(10000) > 0
+
+    # First fit with no restriction on max samples
+    est1 = ForestClass(
+        n_estimators=1,
+        random_state=rng,
+        max_samples=None,
+    )
+
+    # Second fit with max samples restricted to just 2
+    est2 = ForestClass(
+        n_estimators=1,
+        random_state=rng,
+        max_samples=2,
+    )
+
+    est1.fit(X, y)
+    est2.fit(X, y)
+
+    tree1 = est1.estimators_[0].tree_
+    tree2 = est2.estimators_[0].tree_
+
+    msg = "Tree without `max_samples` restriction should have more nodes"
+    assert tree1.node_count > tree2.node_count, msg
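The final test encodes a useful intuition: a bootstrap sample of two draws
contains at most two distinct rows, so the resulting tree can split at most
once and must be smaller than a tree grown on the full dataset. A standalone
version of that comparison (smaller data than the test, for speed;
illustrative only):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(1)
    X = rng.randn(1000, 2)
    y = rng.randn(1000) > 0

    full = RandomForestRegressor(n_estimators=1, random_state=1).fit(X, y)
    tiny = RandomForestRegressor(n_estimators=1, random_state=1,
                                 max_samples=2).fit(X, y)

    print(full.estimators_[0].tree_.node_count)  # many nodes
    print(tiny.estimators_[0].tree_.node_count)  # at most 3 (one split)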