From 7ed785aa448ae9739817d831aff3e76d18bdb2c3 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Fri, 8 Dec 2023 16:40:16 +0100
Subject: [PATCH 1/8] ENH add subsample to HGBT

---
 .../gradient_boosting.py            | 41 +++++++++++++++++++
 .../tests/test_gradient_boosting.py | 39 ++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index a83b1dbd0f4b9..f48c349e5a463 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -148,6 +148,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC):
         "min_samples_leaf": [Interval(Integral, 1, None, closed="left")],
         "l2_regularization": [Interval(Real, 0, None, closed="left")],
         "max_features": [Interval(RealNotInt, 0, 1, closed="right")],
+        "subsample": [Interval(Real, 0.0, 1.0, closed="right")],
         "monotonic_cst": ["array-like", dict, None],
         "interaction_cst": [
             list,
@@ -188,6 +189,7 @@ def __init__(
         min_samples_leaf,
         l2_regularization,
         max_features,
+        subsample,
         max_bins,
         categorical_features,
         monotonic_cst,
@@ -209,6 +211,7 @@ def __init__(
         self.min_samples_leaf = min_samples_leaf
         self.l2_regularization = l2_regularization
         self.max_features = max_features
+        self.subsample = subsample
         self.max_bins = max_bins
         self.monotonic_cst = monotonic_cst
         self.interaction_cst = interaction_cst
@@ -577,6 +580,7 @@ def fit(self, X, y, sample_weight=None):
         self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed)
+        self._bagging_subsample_rng = self._feature_subsample_rng.spawn(1)[0]
 
         self._validate_parameters()
         monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst)
@@ -836,6 +840,18 @@ def fit(self, X, y, sample_weight=None):
 
             begin_at_stage = self.n_iter_
 
+        # Out of bag settings
+        do_oob = self.subsample < 1.0
+        if do_oob:
+            n_inbag = max(1, int(self.subsample * n_samples))
+            # Same dtype as sample_weight.
+            sample_mask = np.zeros((n_samples,), dtype=np.float64)
+            sample_mask[:n_inbag] = 1
+            if sample_weight_train is None:
+                sample_weight_train_original = np.ones(n_samples)
+            else:
+                sample_weight_train_original = sample_weight_train
+
         # initialize gradients and hessians (empty arrays).
         # shape = (n_samples, n_trees_per_iteration).
         gradient, hessian = self._loss.init_gradient_and_hessian(
@@ -849,6 +865,11 @@ def fit(self, X, y, sample_weight=None):
                     "[{}/{}] ".format(iteration + 1, self.max_iter), end="", flush=True
                 )
 
+            # Do out of bag if required
+            if do_oob:
+                self._bagging_subsample_rng.shuffle(sample_mask)
+                sample_weight_train = sample_weight_train_original * sample_mask
+
             # Update gradients and hessians, inplace
             # Note that self._loss expects shape (n_samples,) for
             # n_trees_per_iteration = 1 else shape (n_samples, n_trees_per_iteration).
@@ -1487,6 +1508,14 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
 
         .. versionadded:: 1.4
 
+    subsample : float, default=1.0
+        The fraction of randomly chosen samples used to fit the individual
+        tree(s) in each boosting iteration. If smaller than 1.0, this results
+        in stochastic gradient boosting, i.e. bagging of the training samples.
+        Values must be in the range `(0.0, 1.0]`.
+
+        .. versionadded:: 1.5
+
     max_bins : int, default=255
         The maximum number of bins to use for non-missing values.
         Before training, each feature of the input array `X` is binned into
@@ -1695,6 +1724,7 @@ def __init__(
         min_samples_leaf=20,
         l2_regularization=0.0,
         max_features=1.0,
+        subsample=1.0,
         max_bins=255,
         categorical_features="warn",
         monotonic_cst=None,
@@ -1717,6 +1747,7 @@ def __init__(
             min_samples_leaf=min_samples_leaf,
             l2_regularization=l2_regularization,
            max_features=max_features,
+            subsample=subsample,
             max_bins=max_bins,
             monotonic_cst=monotonic_cst,
             interaction_cst=interaction_cst,
@@ -1863,6 +1894,14 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
 
         .. versionadded:: 1.4
 
+    subsample : float, default=1.0
+        The fraction of randomly chosen samples used to fit the individual
+        tree(s) in each boosting iteration. If smaller than 1.0, this results
+        in stochastic gradient boosting, i.e. bagging of the training samples.
+        Values must be in the range `(0.0, 1.0]`.
+
+        .. versionadded:: 1.5
+
     max_bins : int, default=255
         The maximum number of bins to use for non-missing values.
         Before training, each feature of the input array `X` is binned into
@@ -2073,6 +2112,7 @@ def __init__(
         min_samples_leaf=20,
         l2_regularization=0.0,
         max_features=1.0,
+        subsample=1.0,
         max_bins=255,
         categorical_features="warn",
         monotonic_cst=None,
@@ -2096,6 +2136,7 @@ def __init__(
             min_samples_leaf=min_samples_leaf,
             l2_regularization=l2_regularization,
             max_features=max_features,
+            subsample=subsample,
             max_bins=max_bins,
             categorical_features=categorical_features,
             monotonic_cst=monotonic_cst,
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index 8adc0a19dc483..fc084fe37973a 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1580,3 +1580,42 @@ def test_categorical_features_warn():
     msg = "The categorical_features parameter will change to 'from_dtype' in v1.6"
     with pytest.warns(FutureWarning, match=msg):
         hist.fit(X, y)
+
+
+@pytest.mark.parametrize(
+    "problem", ("regression", "binary_classification", "multiclass_classification")
+)
+def test_bagging(problem):
+    """Test subsample/bagging."""
+    n_samples = 100
+    n_features = 2
+    params = dict(max_iter=10, early_stopping=False, validation_fraction=None)
+    if problem == "regression":
+        X, y = make_regression(
+            n_samples=n_samples,
+            n_features=n_features,
+            n_informative=n_features,
+            random_state=0,
+        )
+        model = HistGradientBoostingRegressor
+    else:
+        n_classes = 2 if problem == "binary_classification" else 3
+        X, y = make_classification(
+            n_samples=n_samples,
+            n_features=n_features,
+            n_informative=n_features,
+            n_redundant=0,
+            n_clusters_per_class=1,
+            n_classes=n_classes,
+            random_state=0,
+        )
+        model = HistGradientBoostingClassifier
+
+    est = model(**params).fit(X, y)
+    est_bag_75 = model(subsample=0.75, **params).fit(X, y)
+    est_bag_50 = model(subsample=0.5, **params).fit(X, y)
+    est_bag_25 = model(subsample=0.25, **params).fit(X, y)
+
+    assert np.all(est.train_score_ > est_bag_75.train_score_)
+    assert np.all(est_bag_75.train_score_ > est_bag_50.train_score_)
+    assert np.all(est_bag_50.train_score_ > est_bag_25.train_score_)
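A minimal usage sketch of the new parameter added by the patch above, assuming an
environment built from this branch; the dataset and the hyperparameter values are
illustrative only::

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor

    X, y = make_regression(n_samples=1_000, n_features=10, random_state=0)

    # Each boosting iteration fits its tree on a fresh random 50% of the rows.
    est = HistGradientBoostingRegressor(subsample=0.5, max_iter=100, random_state=0)
    est.fit(X, y)
    print(est.score(X, y))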
From a5970684c5f96f452f595d47326fe07b9ea31bb1 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Thu, 4 Jan 2024 18:52:10 +0100
Subject: [PATCH 2/8] FIX numpy.random.Generator.spawn for numpy<1.25

---
 .../_hist_gradient_boosting/gradient_boosting.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index f48c349e5a463..80624ee1a9571 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -36,6 +36,7 @@
 from ...utils import check_random_state, compute_sample_weight, is_scalar_nan, resample
 from ...utils._openmp_helpers import _openmp_effective_n_threads
 from ...utils._param_validation import Hidden, Interval, RealNotInt, StrOptions
+from ...utils.fixes import parse_version
 from ...utils.multiclass import check_classification_targets
 from ...utils.validation import (
     _check_monotonic_cst,
@@ -580,7 +581,17 @@ def fit(self, X, y, sample_weight=None):
         self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed)
-        self._bagging_subsample_rng = self._feature_subsample_rng.spawn(1)[0]
+        if parse_version(np.__version__) >= parse_version("1.25"):
+            self._bagging_subsample_rng = self._feature_subsample_rng.spawn(1)[0]
+        else:
+            # See numpy Generator.spawn(self, int n_children).
+            n_children = 1
+            self._bagging_subsample_rng = [
+                type(self._feature_subsample_rng)(g)
+                for g in self._feature_subsample_rng._bit_generator.spawn(
+                    n_children
+                )
+            ][0]
 
         self._validate_parameters()
         monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst)
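The fallback above (and its rework in the next patch) rebuilds a child Generator from
the parent's private attributes, because ``Generator.spawn`` only exists in NumPy >= 1.25.
For reference, a sketch of the same idea using only documented NumPy API, with an
arbitrary seed value::

    import numpy as np

    ss = np.random.SeedSequence(42)
    parent_seed, child_seed = ss.spawn(2)  # statistically independent child sequences
    parent = np.random.default_rng(parent_seed)
    child = np.random.default_rng(child_seed)
    print(parent.integers(0, 10, 3), child.integers(0, 10, 3))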
From 851c4f2b249c82d3ac6c0cd83c00d01b1af43b91 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Thu, 4 Jan 2024 20:14:59 +0100
Subject: [PATCH 3/8] FIX spawning older Generators, 2nd try

---
 .../gradient_boosting.py | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 80624ee1a9571..df74d16cf3e2e 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -581,17 +581,27 @@ def fit(self, X, y, sample_weight=None):
         self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
         self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed)
+        # TODO: Remove this condition, once numpy 1.25 is the minimum version.
         if parse_version(np.__version__) >= parse_version("1.25"):
             self._bagging_subsample_rng = self._feature_subsample_rng.spawn(1)[0]
         else:
-            # See numpy Generator.spawn(self, int n_children).
-            n_children = 1
-            self._bagging_subsample_rng = [
-                type(self._feature_subsample_rng)(g)
-                for g in self._feature_subsample_rng._bit_generator.spawn(
-                    n_children
-                )
-            ][0]
+            # See numpy Generator.spawn(self, int n_children) and
+            # numpy BitGenerator.spawn
+
+            def spawnGenerator(self, n_children):
+                return [
+                    type(self)(g)
+                    for g in spawnBitGenerator(self._bit_generator, n_children)
+                ]
+
+            def spawnBitGenerator(self, n_children):
+                return [
+                    type(self)(seed=s) for s in self._seed_seq.spawn(n_children)
+                ]
+
+            self._bagging_subsample_rng = spawnGenerator(
+                self._feature_subsample_rng, 1
+            )[0]
 
         self._validate_parameters()
         monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst)

From b873839ec759ebd3300f5699ba162b0c3c36d621 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Mon, 8 Jan 2024 22:44:54 +0100
Subject: [PATCH 4/8] CLN rename arg self

---
 .../_hist_gradient_boosting/gradient_boosting.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index df74d16cf3e2e..035157a43161f 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -588,18 +588,21 @@ def fit(self, X, y, sample_weight=None):
             # See numpy Generator.spawn(self, int n_children) and
             # numpy BitGenerator.spawn
 
-            def spawnGenerator(self, n_children):
+            def spawn_generator(generator, n_children):
                 return [
-                    type(self)(g)
-                    for g in spawnBitGenerator(self._bit_generator, n_children)
+                    type(generator)(g)
+                    for g in spawn_bit_generator(
+                        generator._bit_generator, n_children
+                    )
                 ]
 
-            def spawnBitGenerator(self, n_children):
+            def spawn_bit_generator(_bit_generator, n_children):
                 return [
-                    type(self)(seed=s) for s in self._seed_seq.spawn(n_children)
+                    type(_bit_generator)(seed=s)
+                    for s in _bit_generator._seed_seq.spawn(n_children)
                 ]
 
-            self._bagging_subsample_rng = spawnGenerator(
+            self._bagging_subsample_rng = spawn_generator(
                 self._feature_subsample_rng, 1
             )[0]
From 7d5e42639c028d5d2b6fa22ac4d56c92bb3cb2c9 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Wed, 10 Jan 2024 13:50:23 +0100
Subject: [PATCH 5/8] DOC gradient boosting regularization

---
 .../plot_gradient_boosting_regularization.py | 106 ++++++++++--------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/examples/ensemble/plot_gradient_boosting_regularization.py b/examples/ensemble/plot_gradient_boosting_regularization.py
index 218d69d5ac7d7..4293033432c72 100644
--- a/examples/ensemble/plot_gradient_boosting_regularization.py
+++ b/examples/ensemble/plot_gradient_boosting_regularization.py
@@ -4,9 +4,10 @@
 ================================
 
 Illustration of the effect of different regularization strategies
-for Gradient Boosting. The example is taken from Hastie et al 2009 [1]_.
+for Gradient Boosting. The example is taken from Chapter 10.12 of
+Hastie et al 2009 [1]_.
 
-The loss function used is binomial deviance. Regularization via
+The loss function used is log loss, aka binomial deviance. Regularization via
 shrinkage (``learning_rate < 1.0``) improves performance considerably.
 In combination with shrinkage, stochastic gradient boosting
 (``subsample < 1.0``) can produce more accurate models by reducing the
@@ -21,14 +22,13 @@
 """
 
-# Author: Peter Prettenhofer
-#
 # License: BSD 3 clause
 
 import matplotlib.pyplot as plt
 import numpy as np
 
-from sklearn import datasets, ensemble
+from sklearn import datasets
+from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
 from sklearn.metrics import log_loss
 from sklearn.model_selection import train_test_split
 
@@ -44,48 +44,58 @@
     "max_leaf_nodes": 4,
     "max_depth": None,
     "random_state": 2,
-    "min_samples_split": 5,
+    "min_samples_leaf": 2,
 }
 
-plt.figure()
-
-for label, color, setting in [
-    ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}),
-    ("learning_rate=0.2", "turquoise", {"learning_rate": 0.2, "subsample": 1.0}),
-    ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}),
-    (
-        "learning_rate=0.2, subsample=0.5",
-        "gray",
-        {"learning_rate": 0.2, "subsample": 0.5},
-    ),
-    (
-        "learning_rate=0.2, max_features=2",
-        "magenta",
-        {"learning_rate": 0.2, "max_features": 2},
-    ),
-]:
-    params = dict(original_params)
-    params.update(setting)
-
-    clf = ensemble.GradientBoostingClassifier(**params)
-    clf.fit(X_train, y_train)
-
-    # compute test set deviance
-    test_deviance = np.zeros((params["n_estimators"],), dtype=np.float64)
-
-    for i, y_proba in enumerate(clf.staged_predict_proba(X_test)):
-        test_deviance[i] = 2 * log_loss(y_test, y_proba[:, 1])
-
-    plt.plot(
-        (np.arange(test_deviance.shape[0]) + 1)[::5],
-        test_deviance[::5],
-        "-",
-        color=color,
-        label=label,
-    )
-
-plt.legend(loc="upper right")
-plt.xlabel("Boosting Iterations")
-plt.ylabel("Test Set Deviance")
-
-plt.show()
+fig, axes = plt.subplots(ncols=2, figsize=(10, 5), sharex=True, sharey=True)
+
+for j, model_class in enumerate(
+    [GradientBoostingClassifier, HistGradientBoostingClassifier]
+):
+    for label, color, setting in [
+        ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}),
+        ("learning_rate=0.2", "turquoise", {"learning_rate": 0.2, "subsample": 1.0}),
+        ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}),
+        (
+            "learning_rate=0.2, subsample=0.5",
+            "gray",
+            {"learning_rate": 0.2, "subsample": 0.5},
+        ),
+        (
+            "learning_rate=0.2, max_features=2",
+            "magenta",
+            {"learning_rate": 0.2, "max_features": 2},
+        ),
+    ]:
+        params = dict(original_params)
+        params.update(setting)
+        n_iter = params["n_estimators"]
+        if model_class == HistGradientBoostingClassifier:
+            params["max_iter"] = params.pop("n_estimators")
+            if "max_features" in params:
+                params["max_features"] = float(
+                    params["max_features"] / X_train.shape[1]
+                )
+
+        clf = model_class(**params)
+        clf.fit(X_train, y_train)
+
+        # compute test set log loss
+        test_loss = np.zeros((n_iter,), dtype=np.float64)
+
+        for i, y_proba in enumerate(clf.staged_predict_proba(X_test)):
+            test_loss[i] = 2 * log_loss(y_test, y_proba[:, 1])
+
+        axes[j].plot(
+            (np.arange(test_loss.shape[0]) + 1)[::5],
+            test_loss[::5],
+            "-",
+            color=color,
+            label=label,
+        )
+
+    axes[j].set_ylim(None, 2)
+    axes[j].legend(loc="upper right")
+    axes[j].set_xlabel("Boosting Iterations")
+    axes[j].set_ylabel("Test Set Log Loss")
+    axes[j].set_title(model_class.__name__)
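The quantity plotted as ``2 * log_loss`` above is the binomial deviance of the original
example; a tiny numeric check of that identity, with made-up probabilities::

    import numpy as np
    from sklearn.metrics import log_loss

    y_true = np.array([0, 1, 1, 0])
    proba = np.array([0.1, 0.8, 0.6, 0.3])
    # mean binomial deviance = -2 * mean(y * log(p) + (1 - y) * log(1 - p))
    deviance = -2 * np.mean(y_true * np.log(proba) + (1 - y_true) * np.log(1 - proba))
    print(np.isclose(deviance, 2 * log_loss(y_true, proba)))  # True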
From eab8e8253c87f49ed6fa969c65fd82a22a3ebc3f Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Fri, 16 Feb 2024 22:37:48 +0100
Subject: [PATCH 6/8] ENH use subsampled X, g, h instead of zero sample_weight

With zero sample_weight, the count statistics of the histograms are
wrong.
---
 .../_gradient_boosting.pyx | 21 +++++++++----
 .../gradient_boosting.py   | 30 +++++++++++--------
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx
index fe234958e631a..79f3a86d4c300 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx
@@ -5,12 +5,14 @@ import numpy as np
 
 from .common import Y_DTYPE
 from .common cimport Y_DTYPE_C
+from ...utils._typedefs cimport int64_t
 
 
 def _update_raw_predictions(
     Y_DTYPE_C [::1] raw_predictions,  # OUT
     grower,
     n_threads,
+    sample_idx,
 ):
     """Update raw_predictions with the predictions of the newest tree.
 
@@ -35,7 +37,7 @@ def _update_raw_predictions(
     values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE)
 
     _update_raw_predictions_helper(raw_predictions, starts, stops, partition,
-                                   values, n_threads)
+                                   values, n_threads, sample_idx)
 
 
 cdef inline void _update_raw_predictions_helper(
@@ -45,6 +47,7 @@ cdef inline void _update_raw_predictions_helper(
     const unsigned int [::1] partition,
     const Y_DTYPE_C [::1] values,
     int n_threads,
+    const int64_t [::1] sample_idx=None,
 ):
 
     cdef:
@@ -52,7 +55,15 @@ cdef inline void _update_raw_predictions_helper(
         int leaf_idx
         int n_leaves = starts.shape[0]
 
-    for leaf_idx in prange(n_leaves, schedule='static', nogil=True,
-                           num_threads=n_threads):
-        for position in range(starts[leaf_idx], stops[leaf_idx]):
-            raw_predictions[partition[position]] += values[leaf_idx]
+    if sample_idx is None:
+        for leaf_idx in prange(
+            n_leaves, schedule='static', nogil=True, num_threads=n_threads
+        ):
+            for position in range(starts[leaf_idx], stops[leaf_idx]):
+                raw_predictions[partition[position]] += values[leaf_idx]
+    else:
+        for leaf_idx in prange(
+            n_leaves, schedule='static', nogil=True, num_threads=n_threads
+        ):
+            for position in range(starts[leaf_idx], stops[leaf_idx]):
+                raw_predictions[sample_idx[partition[position]]] += values[leaf_idx]
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 035157a43161f..2ba16b040b55c 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -867,14 +867,15 @@ def spawn_bit_generator(_bit_generator, n_children):
         # Out of bag settings
         do_oob = self.subsample < 1.0
         if do_oob:
+            # Note that setting sample_weight to zero for the corresponding samples
+            # would result in false "count" statistics of the histograms. Therefore,
+            # we take copies (fancy-indexed numpy arrays) for the subsampling.
             n_inbag = max(1, int(self.subsample * n_samples))
-            # Same dtype as sample_weight.
-            sample_mask = np.zeros((n_samples,), dtype=np.float64)
-            sample_mask[:n_inbag] = 1
-            if sample_weight_train is None:
-                sample_weight_train_original = np.ones(n_samples)
-            else:
-                sample_weight_train_original = sample_weight_train
+            sample_mask = np.zeros((n_samples,), dtype=np.bool)
+            sample_mask[:n_inbag] = True
+        else:
+            sample_mask = slice(None)
+            sample_mask_idx = None
 
         # initialize gradients and hessians (empty arrays).
         # shape = (n_samples, n_trees_per_iteration).
@@ -892,7 +893,7 @@ def spawn_bit_generator(_bit_generator, n_children):
             # Do out of bag if required
             if do_oob:
                 self._bagging_subsample_rng.shuffle(sample_mask)
-                sample_weight_train = sample_weight_train_original * sample_mask
+                sample_mask_idx = np.flatnonzero(sample_mask)
 
             # Update gradients and hessians, inplace
             # Note that self._loss expects shape (n_samples,) for
@@ -930,9 +931,9 @@ def spawn_bit_generator(_bit_generator, n_children):
             # Build `n_trees_per_iteration` trees.
             for k in range(self.n_trees_per_iteration_):
                 grower = TreeGrower(
-                    X_binned=X_binned_train,
-                    gradients=g_view[:, k],
-                    hessians=h_view[:, k],
+                    X_binned=X_binned_train[sample_mask],
+                    gradients=g_view[sample_mask, k],
+                    hessians=h_view[sample_mask, k],
                     n_bins=n_bins,
                     n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
                     has_missing_values=has_missing_values,
@@ -971,7 +972,12 @@ def spawn_bit_generator(_bit_generator, n_children):
                 # Update raw_predictions with the predictions of the newly
                 # created tree.
                 tic_pred = time()
-                _update_raw_predictions(raw_predictions[:, k], grower, n_threads)
+                _update_raw_predictions(
+                    raw_predictions[:, k],
+                    grower,
+                    n_threads,
+                    sample_idx=sample_mask_idx,
+                )
                 toc_pred = time()
                 acc_prediction_time += toc_pred - tic_pred
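A toy illustration of the ``sample_idx`` mapping introduced above: the grower only sees
the in-bag rows, so the leaf partitions hold local positions ``0..n_inbag-1``, and
``sample_idx`` translates them back to row indices of the full training set. All names
and sizes below are made up for the illustration::

    import numpy as np

    n_samples, n_inbag = 10, 4
    sample_mask = np.zeros(n_samples, dtype=bool)
    sample_mask[:n_inbag] = True
    np.random.default_rng(0).shuffle(sample_mask)
    sample_idx = np.flatnonzero(sample_mask)  # local position -> global row index

    raw_predictions = np.zeros(n_samples)
    leaf_value = 0.25
    local_positions = np.array([0, 2])  # rows of the subsample falling into one leaf
    raw_predictions[sample_idx[local_positions]] += leaf_value
    print(raw_predictions)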
From a37bbe4cd366b7cd3d0d4972970019a0854e54f4 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Fri, 16 Feb 2024 22:50:27 +0100
Subject: [PATCH 7/8] FIX bool instead of np.bool and enforce f-contiguity

---
 .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 41f4fe2d7360d..b44a7612d828d 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -873,7 +873,7 @@ def spawn_bit_generator(_bit_generator, n_children):
             # would result in false "count" statistics of the histograms. Therefore,
             # we take copies (fancy-indexed numpy arrays) for the subsampling.
             n_inbag = max(1, int(self.subsample * n_samples))
-            sample_mask = np.zeros((n_samples,), dtype=np.bool)
+            sample_mask = np.zeros((n_samples,), dtype=bool)
             sample_mask[:n_inbag] = True
         else:
             sample_mask = slice(None)
             sample_mask_idx = None
@@ -933,9 +933,9 @@ def spawn_bit_generator(_bit_generator, n_children):
             # Build `n_trees_per_iteration` trees.
             for k in range(self.n_trees_per_iteration_):
                 grower = TreeGrower(
-                    X_binned=X_binned_train[sample_mask],
-                    gradients=g_view[sample_mask, k],
-                    hessians=h_view[sample_mask, k],
+                    X_binned=np.asfortranarray(X_binned_train[sample_mask]),
+                    gradients=np.asfortranarray(g_view[sample_mask, k]),
+                    hessians=np.asfortranarray(h_view[sample_mask, k]),
                     n_bins=n_bins,
                     n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
                     has_missing_values=has_missing_values,

From 7fb2f72130b3228efc4bf72502c6af9b71db4582 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Fri, 16 Feb 2024 23:11:41 +0100
Subject: [PATCH 8/8] FIX special case constant hessian for subsample indices

---
 .../gradient_boosting.py | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index b44a7612d828d..13799e6d9e14d 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -892,11 +892,6 @@ def spawn_bit_generator(_bit_generator, n_children):
                     "[{}/{}] ".format(iteration + 1, self.max_iter), end="", flush=True
                 )
 
-            # Do out of bag if required
-            if do_oob:
-                self._bagging_subsample_rng.shuffle(sample_mask)
-                sample_mask_idx = np.flatnonzero(sample_mask)
-
             # Update gradients and hessians, inplace
             # Note that self._loss expects shape (n_samples,) for
             # n_trees_per_iteration = 1 else shape (n_samples, n_trees_per_iteration).
@@ -930,12 +925,28 @@ def spawn_bit_generator(_bit_generator, n_children):
                 g_view = gradient
                 h_view = hessian
 
+            # Do out of bag if required
+            if do_oob:
+                self._bagging_subsample_rng.shuffle(sample_mask)
+                sample_mask_idx = np.flatnonzero(sample_mask)
+                X_binned_grow = np.asfortranarray(X_binned_train[sample_mask])
+                g_grow = np.asfortranarray(g_view[sample_mask])
+                if self._loss.constant_hessian:
+                    h_grow = h_view
+                else:
+                    h_grow = np.asfortranarray(h_view[sample_mask])
+
+            else:
+                X_binned_grow = X_binned_train
+                g_grow = g_view
+                h_grow = h_view
+
             # Build `n_trees_per_iteration` trees.
             for k in range(self.n_trees_per_iteration_):
                 grower = TreeGrower(
-                    X_binned=np.asfortranarray(X_binned_train[sample_mask]),
-                    gradients=np.asfortranarray(g_view[sample_mask, k]),
-                    hessians=np.asfortranarray(h_view[sample_mask, k]),
+                    X_binned=X_binned_grow,
+                    gradients=g_grow[:, k],
+                    hessians=h_grow[:, k],
                     n_bins=n_bins,
                     n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
                     has_missing_values=has_missing_values,
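The ``constant_hessian`` special case above assumes the hessian is stored once and
broadcast against the per-sample gradients rather than materialized per sample, so it
must not be fancy-indexed like the gradients. A small sketch of that shape logic, with
illustrative array sizes::

    import numpy as np

    g = np.random.default_rng(0).normal(size=8)  # per-sample gradients
    h = np.ones(1)                               # constant hessian, stored once
    sample_mask = np.zeros(8, dtype=bool)
    sample_mask[:4] = True

    g_grow = g[sample_mask]  # gradients must be subsampled
    h_grow = h               # a length-1 hessian broadcasts against any subsample size
    print((g_grow / h_grow).shape)  # (4,)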