diff --git a/examples/ensemble/plot_gradient_boosting_regularization.py b/examples/ensemble/plot_gradient_boosting_regularization.py index 218d69d5ac7d7..4293033432c72 100644 --- a/examples/ensemble/plot_gradient_boosting_regularization.py +++ b/examples/ensemble/plot_gradient_boosting_regularization.py @@ -4,9 +4,10 @@ ================================ Illustration of the effect of different regularization strategies -for Gradient Boosting. The example is taken from Hastie et al 2009 [1]_. +for Gradient Boosting. The example is taken from Chapter 10.12 of +Hastie et al 2009 [1]_. -The loss function used is binomial deviance. Regularization via +The loss function used is log loss, aka binomial deviance. Regularization via shrinkage (``learning_rate < 1.0``) improves performance considerably. In combination with shrinkage, stochastic gradient boosting (``subsample < 1.0``) can produce more accurate models by reducing the @@ -21,14 +22,13 @@ """ -# Author: Peter Prettenhofer -# # License: BSD 3 clause import matplotlib.pyplot as plt import numpy as np -from sklearn import datasets, ensemble +from sklearn import datasets +from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split @@ -44,48 +44,58 @@ "max_leaf_nodes": 4, "max_depth": None, "random_state": 2, - "min_samples_split": 5, + "min_samples_leaf": 2, } -plt.figure() - -for label, color, setting in [ - ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}), - ("learning_rate=0.2", "turquoise", {"learning_rate": 0.2, "subsample": 1.0}), - ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}), - ( - "learning_rate=0.2, subsample=0.5", - "gray", - {"learning_rate": 0.2, "subsample": 0.5}, - ), - ( - "learning_rate=0.2, max_features=2", - "magenta", - {"learning_rate": 0.2, "max_features": 2}, - ), -]: - params = dict(original_params) - params.update(setting) - - clf = ensemble.GradientBoostingClassifier(**params) - clf.fit(X_train, y_train) - - # compute test set deviance - test_deviance = np.zeros((params["n_estimators"],), dtype=np.float64) - - for i, y_proba in enumerate(clf.staged_predict_proba(X_test)): - test_deviance[i] = 2 * log_loss(y_test, y_proba[:, 1]) - - plt.plot( - (np.arange(test_deviance.shape[0]) + 1)[::5], - test_deviance[::5], - "-", - color=color, - label=label, - ) - -plt.legend(loc="upper right") -plt.xlabel("Boosting Iterations") -plt.ylabel("Test Set Deviance") - -plt.show() +fig, axes = plt.subplots(ncols=2, figsize=(10, 5), sharex=True, sharey=True) + +for j, model_class in enumerate( + [GradientBoostingClassifier, HistGradientBoostingClassifier] +): + for label, color, setting in [ + ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}), + ("learning_rate=0.2", "turquoise", {"learning_rate": 0.2, "subsample": 1.0}), + ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}), + ( + "learning_rate=0.2, subsample=0.5", + "gray", + {"learning_rate": 0.2, "subsample": 0.5}, + ), + ( + "learning_rate=0.2, max_features=2", + "magenta", + {"learning_rate": 0.2, "max_features": 2}, + ), + ]: + params = dict(original_params) + params.update(setting) + n_iter = params["n_estimators"] + if model_class == HistGradientBoostingClassifier: + params["max_iter"] = params.pop("n_estimators") + if "max_features" in params: + params["max_features"] = float( + params["max_features"] / X_train.shape[1] + ) + + clf = model_class(**params) + clf.fit(X_train, y_train) 
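+
+        # `staged_predict_proba` yields the predicted class probabilities after
+        # each boosting iteration, so the loop below can track how the test
+        # loss evolves for the current regularization setting.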
+ + # compute test set deviance + test_loss = np.zeros((n_iter,), dtype=np.float64) + + for i, y_proba in enumerate(clf.staged_predict_proba(X_test)): + test_loss[i] = 2 * log_loss(y_test, y_proba[:, 1]) + + axes[j].plot( + (np.arange(test_loss.shape[0]) + 1)[::5], + test_loss[::5], + "-", + color=color, + label=label, + ) + + axes[j].set_ylim(None, 2) + axes[j].legend(loc="upper right") + axes[j].set_xlabel("Boosting Iterations") + axes[j].set_ylabel("Test Set Log Loss") + axes[j].set_title(model_class.__name__) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index fe234958e631a..79f3a86d4c300 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -5,12 +5,14 @@ import numpy as np from .common import Y_DTYPE from .common cimport Y_DTYPE_C +from ...utils._typedefs cimport int64_t def _update_raw_predictions( Y_DTYPE_C [::1] raw_predictions, # OUT grower, n_threads, + sample_idx, ): """Update raw_predictions with the predictions of the newest tree. @@ -35,7 +37,7 @@ def _update_raw_predictions( values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) _update_raw_predictions_helper(raw_predictions, starts, stops, partition, - values, n_threads) + values, n_threads, sample_idx) cdef inline void _update_raw_predictions_helper( @@ -45,6 +47,7 @@ cdef inline void _update_raw_predictions_helper( const unsigned int [::1] partition, const Y_DTYPE_C [::1] values, int n_threads, + const int64_t [::1] sample_idx=None, ): cdef: @@ -52,7 +55,15 @@ cdef inline void _update_raw_predictions_helper( int leaf_idx int n_leaves = starts.shape[0] - for leaf_idx in prange(n_leaves, schedule='static', nogil=True, - num_threads=n_threads): - for position in range(starts[leaf_idx], stops[leaf_idx]): - raw_predictions[partition[position]] += values[leaf_idx] + if sample_idx is None: + for leaf_idx in prange( + n_leaves, schedule='static', nogil=True, num_threads=n_threads + ): + for position in range(starts[leaf_idx], stops[leaf_idx]): + raw_predictions[partition[position]] += values[leaf_idx] + else: + for leaf_idx in prange( + n_leaves, schedule='static', nogil=True, num_threads=n_threads + ): + for position in range(starts[leaf_idx], stops[leaf_idx]): + raw_predictions[sample_idx[partition[position]]] += values[leaf_idx] diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e7f69948e7cc2..13799e6d9e14d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -36,6 +36,7 @@ from ...utils import check_random_state, compute_sample_weight, is_scalar_nan, resample from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from ...utils.fixes import parse_version from ...utils.multiclass import check_classification_targets from ...utils.validation import ( _check_monotonic_cst, @@ -148,6 +149,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], "l2_regularization": [Interval(Real, 0, None, closed="left")], "max_features": [Interval(RealNotInt, 0, 1, closed="right")], + "subsample": [Interval(Real, 0.0, 1.0, closed="right")], "monotonic_cst": ["array-like", dict, None], "interaction_cst": [ 
             list,
@@ -188,6 +190,7 @@ def __init__(
         min_samples_leaf,
         l2_regularization,
         max_features,
+        subsample,
         max_bins,
         categorical_features,
         monotonic_cst,
@@ -209,6 +212,7 @@ def __init__(
         self.min_samples_leaf = min_samples_leaf
         self.l2_regularization = l2_regularization
         self.max_features = max_features
+        self.subsample = subsample
         self.max_bins = max_bins
         self.monotonic_cst = monotonic_cst
         self.interaction_cst = interaction_cst
@@ -579,6 +583,30 @@ def fit(self, X, y, sample_weight=None):
         self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed)
+        # TODO: Remove this condition, once numpy 1.25 is the minimum version.
+        if parse_version(np.__version__) >= parse_version("1.25"):
+            self._bagging_subsample_rng = self._feature_subsample_rng.spawn(1)[0]
+        else:
+            # See numpy Generator.spawn(self, int n_children) and
+            # numpy BitGenerator.spawn
+
+            def spawn_generator(generator, n_children):
+                return [
+                    type(generator)(g)
+                    for g in spawn_bit_generator(
+                        generator._bit_generator, n_children
+                    )
+                ]
+
+            def spawn_bit_generator(_bit_generator, n_children):
+                return [
+                    type(_bit_generator)(seed=s)
+                    for s in _bit_generator._seed_seq.spawn(n_children)
+                ]
+
+            self._bagging_subsample_rng = spawn_generator(
+                self._feature_subsample_rng, 1
+            )[0]
 
         self._validate_parameters()
         monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst)
@@ -838,6 +866,19 @@ def fit(self, X, y, sample_weight=None):
 
             begin_at_stage = self.n_iter_
 
+        # Out of bag settings
+        do_oob = self.subsample < 1.0
+        if do_oob:
+            # Note that setting sample_weight to zero for the corresponding samples
+            # would result in false "count" statistics of the histograms. Therefore,
+            # we take copies (fancy-indexed numpy arrays) for the subsampling.
+            n_inbag = max(1, int(self.subsample * n_samples))
+            sample_mask = np.zeros((n_samples,), dtype=bool)
+            sample_mask[:n_inbag] = True
+        else:
+            sample_mask = slice(None)
+        sample_mask_idx = None
+
         # initialize gradients and hessians (empty arrays).
         # shape = (n_samples, n_trees_per_iteration).
         gradient, hessian = self._loss.init_gradient_and_hessian(
@@ -884,12 +925,28 @@ def fit(self, X, y, sample_weight=None):
                 g_view = gradient
                 h_view = hessian
 
+            # Do out-of-bag subsampling if required
+            if do_oob:
+                self._bagging_subsample_rng.shuffle(sample_mask)
+                sample_mask_idx = np.flatnonzero(sample_mask)
+                X_binned_grow = np.asfortranarray(X_binned_train[sample_mask])
+                g_grow = np.asfortranarray(g_view[sample_mask])
+                if self._loss.constant_hessian:
+                    h_grow = h_view
+                else:
+                    h_grow = np.asfortranarray(h_view[sample_mask])
+
+            else:
+                X_binned_grow = X_binned_train
+                g_grow = g_view
+                h_grow = h_view
+
             # Build `n_trees_per_iteration` trees.
             for k in range(self.n_trees_per_iteration_):
                 grower = TreeGrower(
-                    X_binned=X_binned_train,
-                    gradients=g_view[:, k],
-                    hessians=h_view[:, k],
+                    X_binned=X_binned_grow,
+                    gradients=g_grow[:, k],
+                    hessians=h_grow[:, k],
                     n_bins=n_bins,
                     n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
                     has_missing_values=has_missing_values,
@@ -928,7 +985,12 @@ def fit(self, X, y, sample_weight=None):
                 # Update raw_predictions with the predictions of the newly
                 # created tree.
tic_pred = time() - _update_raw_predictions(raw_predictions[:, k], grower, n_threads) + _update_raw_predictions( + raw_predictions[:, k], + grower, + n_threads, + sample_idx=sample_mask_idx, + ) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -1490,6 +1552,14 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionadded:: 1.4 + subsample : float, default=1.0 + The fraction of randomly chosen samples to be used for fitting the individual + tree(s) in each boosting iteration. If smaller than 1.0 this results in + Stochastic Gradient Boosting or bagging. + Values must be in the range `(0.0, 1.0]`. + + .. versionadded:: 1.5 + max_bins : int, default=255 The maximum number of bins to use for non-missing values. Before training, each feature of the input array `X` is binned into @@ -1698,6 +1768,7 @@ def __init__( min_samples_leaf=20, l2_regularization=0.0, max_features=1.0, + subsample=1.0, max_bins=255, categorical_features="warn", monotonic_cst=None, @@ -1720,6 +1791,7 @@ def __init__( min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_features=max_features, + subsample=subsample, max_bins=max_bins, monotonic_cst=monotonic_cst, interaction_cst=interaction_cst, @@ -1866,6 +1938,14 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): .. versionadded:: 1.4 + subsample : float, default=1.0 + The fraction of randomly chosen samples to be used for fitting the individual + tree(s) in each boosting iteration. If smaller than 1.0 this results in + Stochastic Gradient Boosting or bagging. + Values must be in the range `(0.0, 1.0]`. + + .. versionadded:: 1.5 + max_bins : int, default=255 The maximum number of bins to use for non-missing values. Before training, each feature of the input array `X` is binned into @@ -2076,6 +2156,7 @@ def __init__( min_samples_leaf=20, l2_regularization=0.0, max_features=1.0, + subsample=1.0, max_bins=255, categorical_features="warn", monotonic_cst=None, @@ -2099,6 +2180,7 @@ def __init__( min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_features=max_features, + subsample=subsample, max_bins=max_bins, categorical_features=categorical_features, monotonic_cst=monotonic_cst, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index e14d786b5bc74..63d229e5948d7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1588,6 +1588,45 @@ def test_categorical_features_warn(): hist.fit(X, y) +@pytest.mark.parametrize( + "problem", ("regression", "binary_classification", "multiclass_classification") +) +def test_bagging(problem): + """Test subsample/bagging.""" + n_samples = 100 + n_features = 2 + params = dict(max_iter=10, early_stopping=False, validation_fraction=None) + if problem == "regression": + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + random_state=0, + ) + model = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == "binary_classification" else 3 + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, + random_state=0, + ) + model = HistGradientBoostingClassifier + + est = model(**params).fit(X, y) + est_bag_75 = model(subsample=0.75, 
**params).fit(X, y) + est_bag_50 = model(subsample=0.5, **params).fit(X, y) + est_bag_25 = model(subsample=0.25, **params).fit(X, y) + + assert np.all(est.train_score_ > est_bag_75.train_score_) + assert np.all(est_bag_75.train_score_ > est_bag_50.train_score_) + assert np.all(est_bag_50.train_score_ > est_bag_25.train_score_) + + def get_different_bitness_node_ndarray(node_ndarray): new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32
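
For reference, a minimal sketch of how the `subsample` parameter introduced by this diff would be exercised on the histogram-based estimator. It assumes a build of this branch, since `subsample` does not exist in released versions of HistGradientBoostingClassifier, and the dataset and hyperparameter values are illustrative only:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=2_000, n_features=10, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # With subsample < 1.0, each boosting iteration grows its tree on a fresh
    # random subset of the training rows (stochastic gradient boosting), drawn
    # via the spawned `_bagging_subsample_rng` added in this diff.
    clf = HistGradientBoostingClassifier(
        max_iter=100, learning_rate=0.2, subsample=0.5, random_state=0
    )
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))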