From 23531ccb498f2f016857d508894009af5373de56 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Nov 2023 23:25:18 +0100 Subject: [PATCH 1/3] ENH reuse parent histograms as one of the child's histogram --- .../_hist_gradient_boosting/grower.py | 5 ++- .../_hist_gradient_boosting/histogram.pyx | 32 ++++++------------- .../tests/test_histogram.py | 8 ++--- 3 files changed, 16 insertions(+), 29 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 44392362fd60c..8ade111bace99 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -618,9 +618,8 @@ def split_next(self): if child.is_leaf: del child.histograms - # Release memory used by histograms as they are no longer needed for - # internal nodes once children histograms have been computed. - del node.histograms + # We do not release the memory of node.histograms as it is reused in one of the + # child nodes. return left_child_node, right_child_node diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 336ba372cb53a..5af972d85f0c4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -224,7 +224,7 @@ cdef class HistogramBuilder: def compute_histograms_subtraction( HistogramBuilder self, - hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] parent_histograms, # IN and OUT hist_struct [:, ::1] sibling_histograms, # IN const unsigned int [:] allowed_features=None, # IN ): @@ -252,16 +252,14 @@ cdef class HistogramBuilder: ------- histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins) The computed histograms of the current node. + We repurpose parent_histograms for this and don't need to allocate new + memory. 
""" cdef: int feature_idx int f_idx int n_allowed_features = self.n_features - hist_struct [:, ::1] histograms = np.empty( - shape=(self.n_features, self.n_bins), - dtype=HISTOGRAM_DTYPE - ) bint has_interaction_cst = allowed_features is not None int n_threads = self.n_threads @@ -281,9 +279,8 @@ cdef class HistogramBuilder: self.n_bins, parent_histograms, sibling_histograms, - histograms, ) - return histograms + return parent_histograms cpdef void _build_histogram_naive( @@ -313,25 +310,16 @@ cpdef void _build_histogram_naive( cpdef void _subtract_histograms( const int feature_idx, unsigned int n_bins, - hist_struct [:, ::1] hist_a, # IN + hist_struct [:, ::1] hist_a, # IN and OUT hist_struct [:, ::1] hist_b, # IN - hist_struct [:, ::1] out) noexcept nogil: # OUT - """compute (hist_a - hist_b) in out""" +) noexcept nogil: # OUT + """compute hist_a = hist_a - hist_b""" cdef: unsigned int i = 0 for i in range(n_bins): - out[feature_idx, i].sum_gradients = ( - hist_a[feature_idx, i].sum_gradients - - hist_b[feature_idx, i].sum_gradients - ) - out[feature_idx, i].sum_hessians = ( - hist_a[feature_idx, i].sum_hessians - - hist_b[feature_idx, i].sum_hessians - ) - out[feature_idx, i].count = ( - hist_a[feature_idx, i].count - - hist_b[feature_idx, i].count - ) + hist_a[feature_idx, i].sum_gradients -= hist_b[feature_idx, i].sum_gradients + hist_a[feature_idx, i].sum_hessians -= hist_b[feature_idx, i].sum_hessians + hist_a[feature_idx, i].count -= hist_b[feature_idx, i].count cpdef void _build_histogram( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index 99f74b0f542ee..22375c7d4ea2c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -229,10 +229,10 @@ def test_hist_subtraction(constant_hessian): hist_right, ) - hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - 
hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub) - _subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub) + hist_left_sub = np.copy(hist_parent) + hist_right_sub = np.copy(hist_parent) + _subtract_histograms(0, n_bins, hist_left_sub, hist_right) + _subtract_histograms(0, n_bins, hist_right_sub, hist_left) for key in ("count", "sum_hessians", "sum_gradients"): assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) From d242a6d922c869c938bdbd4020ef85fbb62c8de7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 28 Nov 2023 15:19:05 +0100 Subject: [PATCH 2/3] ENH break cyclic memory references again --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 8ade111bace99..8cf40d2a64539 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -603,6 +603,9 @@ def split_next(self): smallest_child.allowed_features, ) ) + # node.histograms is reused in largest_child.histograms. To break cyclic + # memory references and help garbage collection, we set it to None. + node.histograms = None self.total_compute_hist_time += time() - tic tic = time() @@ -618,8 +621,9 @@ def split_next(self): if child.is_leaf: del child.histograms - # We do not release the memory of node.histograms as it is reused in one of the - # child nodes. + # Release memory used by histograms as they are no longer needed for + # internal nodes once children histograms have been computed. 
+ del node.histograms return left_child_node, right_child_node From cc2f238f83f20062b735d56dab951779976305c7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 29 Nov 2023 17:53:24 +0100 Subject: [PATCH 3/3] DOC add whatsnew --- doc/whats_new/v1.4.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 48559d7f603ef..5e453d0b1541e 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -345,6 +345,12 @@ Changelog is a predefined metric listed in :func:`metrics.get_scorer_names` and early stopping is enabled. :pr:`26163` by `Thomas Fan`_. +- |Efficiency| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` are now a bit faster by reusing + the parent node's histogram as children node's histogram in the subtraction trick. + In effect, less memory has to be allocated and deallocated. + :pr:`27865` by :user:`Christian Lorentzen <lorentzenchr>`. + - |API| In :class:`ensemble.AdaBoostClassifier`, the `algorithm` argument `SAMME.R` was deprecated and will be removed in 1.6. :pr:`26830` by :user:`Stefanie Senger <StefanieSenger>`.