From 25f863124d5ddef03c916b5757754cdcd345432b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 15 Apr 2023 21:31:05 +0200 Subject: [PATCH 01/12] ENH reuse parent histogram --- .../_hist_gradient_boosting/grower.py | 11 ++- .../_hist_gradient_boosting/histogram.pyx | 68 ++++++++++++++++++- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 4ed6041ecaa30..8370dfc17e4ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -574,9 +574,13 @@ def split_next(self): if n_samples_left < n_samples_right: smallest_child = left_child_node largest_child = right_child_node + split_bin_start = 0 + split_bin_end = node.split_info.bin_idx + 1 else: smallest_child = right_child_node largest_child = left_child_node + split_bin_start = node.split_info.bin_idx + 1 + split_bin_end = self.histogram_builder.n_bins # We use the brute O(n_samples) method on the child that has the # smallest number of samples, and the subtraction trick O(n_bins) @@ -584,7 +588,12 @@ def split_next(self): # Note that both left and right child have the same allowed_features. tic = time() smallest_child.histograms = self.histogram_builder.compute_histograms_brute( - smallest_child.sample_indices, smallest_child.allowed_features + smallest_child.sample_indices, + smallest_child.allowed_features, + node.split_info.feature_idx, + split_bin_start, + split_bin_end, + node.histograms, ) largest_child.histograms = ( self.histogram_builder.compute_histograms_subtraction( diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 336ba372cb53a..6ee9305f6e669 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -104,6 +104,10 @@ cdef class HistogramBuilder: HistogramBuilder self, const unsigned int [::1] sample_indices, # IN const unsigned int [:] allowed_features=None, # IN + const int split_feature_idx=-1, # IN + const unsigned int split_bin_start=0, # IN + const unsigned int split_bin_end=0, # IN + const hist_struct [:, ::1] parent_histograms=None, # IN ): """Compute the histograms of the node by scanning through all the data. @@ -118,6 +122,18 @@ cdef class HistogramBuilder: Indices of the features that are allowed by interaction constraints to be split. + split_feature_idx : int + Feature index of the feature that the parent node was split on. + + split_bin_start : unsigned int + Start of the bin indices belonging to the feature that was split on. + + split_bin_end : unsigned int + End (+1) of the bin indices belonging to the feature that was split on. + + parent_histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) + The histograms of the parent. 
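The parent_histograms parameter introduced here enables the actual shortcut: for the feature the parent node was split on, the samples reaching a child occupy only that child's side of the split (leaving aside subtleties such as missing-value handling), so the child's histogram for that single feature is the parent's histogram restricted to the child's bin range, with all other bins zeroed. A minimal NumPy sketch of that bin-wise copy, with illustrative names and a stand-in for HISTOGRAM_DTYPE (not the Cython implementation itself):

    import numpy as np

    # Illustrative stand-in for sklearn's HISTOGRAM_DTYPE.
    HIST_DTYPE = np.dtype(
        [("sum_gradients", "f8"), ("sum_hessians", "f8"), ("count", "u4")]
    )

    def child_hist_from_parent(parent_hist, split_bin_start, split_bin_end):
        """Derive one feature's child histogram from the parent's.

        parent_hist has shape (n_bins,); bins in [split_bin_start,
        split_bin_end) belong to this child and are copied verbatim,
        every other bin stays at zero.
        """
        child = np.zeros(parent_hist.shape[0], dtype=HIST_DTYPE)
        child[split_bin_start:split_bin_end] = parent_hist[split_bin_start:split_bin_end]
        return child

    # Left child of a split at bin_idx:  split_bin_start=0, split_bin_end=bin_idx + 1
    # Right child:                       split_bin_start=bin_idx + 1, split_bin_end=n_bins

This is the same copy that _compute_histogram_of_split_feature performs bin by bin under nogil, only vectorized for readability.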
+ Returns ------- histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) @@ -141,6 +157,7 @@ cdef class HistogramBuilder: dtype=HISTOGRAM_DTYPE ) bint has_interaction_cst = allowed_features is not None + bint has_parent_hist = split_feature_idx >= 0 int n_threads = self.n_threads if has_interaction_cst: @@ -172,12 +189,57 @@ cdef class HistogramBuilder: else: feature_idx = f_idx - self._compute_histogram_brute_single_feature( - feature_idx, sample_indices, histograms - ) + if has_parent_hist and feature_idx == split_feature_idx: + self._compute_histogram_of_split_feature( + feature_idx, + sample_indices, + histograms, + split_bin_start, + split_bin_end, + parent_histograms, + ) + else: + self._compute_histogram_brute_single_feature( + feature_idx, sample_indices, histograms + ) return histograms + cdef void _compute_histogram_of_split_feature( + HistogramBuilder self, + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms, # OUT + const unsigned int split_bin_start, # IN + const unsigned int split_bin_end, # IN + const hist_struct [:, ::1] parent_histograms, # IN + ) noexcept nogil: # OUT + """Compute the histogram for the feature that was split on.""" + cdef: + unsigned int bin_idx = 0 + + if split_bin_start == 0: + for bin_idx in range(split_bin_end, self.n_bins): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + else: + for bin_idx in range(split_bin_start): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + + for bin_idx in range(split_bin_start, split_bin_end): + histograms[feature_idx, bin_idx].sum_gradients = ( + parent_histograms[feature_idx, bin_idx].sum_gradients + ) + histograms[feature_idx, bin_idx].sum_hessians = ( + parent_histograms[feature_idx, bin_idx].sum_hessians + ) + histograms[feature_idx, bin_idx].count = ( + parent_histograms[feature_idx, bin_idx].count + ) + cdef void _compute_histogram_brute_single_feature( HistogramBuilder self, const int feature_idx, From b1efb34516f4458894027638e2982a6bced8a297 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 6 Sep 2023 22:32:40 +0200 Subject: [PATCH 02/12] CLN address review comments --- .../_hist_gradient_boosting/histogram.pyx | 43 +++++++++---------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 6ee9305f6e669..924950a052392 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -190,13 +190,12 @@ cdef class HistogramBuilder: feature_idx = f_idx if has_parent_hist and feature_idx == split_feature_idx: - self._compute_histogram_of_split_feature( - feature_idx, - sample_indices, - histograms, - split_bin_start, - split_bin_end, - parent_histograms, + self._compute_histogram_single_feature_from_parent( + feature_idx=feature_idx, + split_bin_start=split_bin_start, + split_bin_end=split_bin_end, + histograms=histograms, + parent_histograms=parent_histograms, ) else: self._compute_histogram_brute_single_feature( @@ -205,29 +204,27 @@ cdef class HistogramBuilder: return histograms - cdef void _compute_histogram_of_split_feature( + cdef void _compute_histogram_single_feature_from_parent( HistogramBuilder self, const int feature_idx, - const 
unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms, # OUT - const unsigned int split_bin_start, # IN - const unsigned int split_bin_end, # IN + const unsigned int split_bin_start, # IN + const unsigned int split_bin_end, # IN const hist_struct [:, ::1] parent_histograms, # IN - ) noexcept nogil: # OUT + hist_struct [:, ::1] histograms, # OUT + ) noexcept nogil: """Compute the histogram for the feature that was split on.""" cdef: unsigned int bin_idx = 0 - if split_bin_start == 0: - for bin_idx in range(split_bin_end, self.n_bins): - histograms[feature_idx, bin_idx].sum_gradients = 0. - histograms[feature_idx, bin_idx].sum_hessians = 0. - histograms[feature_idx, bin_idx].count = 0 - else: - for bin_idx in range(split_bin_start): - histograms[feature_idx, bin_idx].sum_gradients = 0. - histograms[feature_idx, bin_idx].sum_hessians = 0. - histograms[feature_idx, bin_idx].count = 0 + for bin_idx in range(split_bin_start): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + + for bin_idx in range(split_bin_end, self.n_bins): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 for bin_idx in range(split_bin_start, split_bin_end): histograms[feature_idx, bin_idx].sum_gradients = ( From eec5d2b2ace757c765a20b274db252492a0676bc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 6 Sep 2023 22:45:38 +0200 Subject: [PATCH 03/12] CLN make linter happy --- sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 924950a052392..6c48ad1341ff4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -207,10 +207,10 @@ cdef class HistogramBuilder: cdef void _compute_histogram_single_feature_from_parent( HistogramBuilder self, const int feature_idx, - const unsigned int split_bin_start, # IN - const unsigned int split_bin_end, # IN - const hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] histograms, # OUT + const unsigned int split_bin_start, # IN + const unsigned int split_bin_end, # IN + const hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] histograms, # OUT ) noexcept nogil: """Compute the histogram for the feature that was split on.""" cdef: From f9d22e2cd742121ac821874bb31f951f79c11e41 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Nov 2023 21:34:10 +0100 Subject: [PATCH 04/12] MNT move split_info_struct to common.pxd --- .../_hist_gradient_boosting/common.pxd | 20 +++++++++++++++++ .../_hist_gradient_boosting/splitting.pyx | 22 ++----------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index 3e71f2dc56060..ccc4cbe53712c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -38,6 +38,26 @@ cdef packed struct node_struct: # Only used if is_categorical is True unsigned int bitset_idx + +cdef struct split_info_struct: + # Same as the SplitInfo class, but we need a C struct to use it in the + # nogil sections and to use in arrays. 
+ Y_DTYPE_C gain + int feature_idx + unsigned int bin_idx + unsigned char missing_go_to_left + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + unsigned int n_samples_left + unsigned int n_samples_right + Y_DTYPE_C value_left + Y_DTYPE_C value_right + unsigned char is_categorical + BITSET_DTYPE_C left_cat_bitset + + cpdef enum MonotonicConstraint: NO_CST = 0 POS = 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 17f5769dfaf14..ac1d2f4e56ba5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -16,34 +16,16 @@ from libc.string cimport memcpy from .common cimport X_BINNED_DTYPE_C from .common cimport Y_DTYPE_C -from .common cimport hist_struct from .common cimport BITSET_INNER_DTYPE_C from .common cimport BITSET_DTYPE_C from .common cimport MonotonicConstraint +from .common cimport hist_struct +from .common cimport split_info_struct from ._bitset cimport init_bitset from ._bitset cimport set_bitset from ._bitset cimport in_bitset -cdef struct split_info_struct: - # Same as the SplitInfo class, but we need a C struct to use it in the - # nogil sections and to use in arrays. - Y_DTYPE_C gain - int feature_idx - unsigned int bin_idx - unsigned char missing_go_to_left - Y_DTYPE_C sum_gradient_left - Y_DTYPE_C sum_gradient_right - Y_DTYPE_C sum_hessian_left - Y_DTYPE_C sum_hessian_right - unsigned int n_samples_left - unsigned int n_samples_right - Y_DTYPE_C value_left - Y_DTYPE_C value_right - unsigned char is_categorical - BITSET_DTYPE_C left_cat_bitset - - # used in categorical splits for sorting categories by increasing values of # sum_gradients / sum_hessians cdef struct categorical_info: From 65b852be4537063cc420a801eee96367b14b5c93 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Nov 2023 21:59:18 +0100 Subject: [PATCH 05/12] ENH support categorical parent feature histograms --- .../_hist_gradient_boosting/grower.py | 13 +- .../_hist_gradient_boosting/histogram.pyx | 128 ++++++++++++------ 2 files changed, 92 insertions(+), 49 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c0705842bd6e3..1d95b6061b1ba 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -574,13 +574,11 @@ def split_next(self): if n_samples_left < n_samples_right: smallest_child = left_child_node largest_child = right_child_node - split_bin_start = 0 - split_bin_end = node.split_info.bin_idx + 1 + is_left_child = True else: smallest_child = right_child_node largest_child = left_child_node - split_bin_start = node.split_info.bin_idx + 1 - split_bin_end = self.histogram_builder.n_bins + is_left_child = False # We use the brute O(n_samples) method on the child that has the # smallest number of samples, and the subtraction trick O(n_bins) @@ -590,10 +588,9 @@ def split_next(self): smallest_child.histograms = self.histogram_builder.compute_histograms_brute( smallest_child.sample_indices, smallest_child.allowed_features, - node.split_info.feature_idx, - split_bin_start, - split_bin_end, - node.histograms, + parent_split_info=node.split_info, + parent_histograms=node.histograms, + is_left_child=is_left_child, ) largest_child.histograms = ( self.histogram_builder.compute_histograms_subtraction( diff --git 
a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 6c48ad1341ff4..8de1ff0659ece 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -7,10 +7,12 @@ from cython.parallel import prange import numpy as np +from .common cimport BITSET_INNER_DTYPE_C from .common import HISTOGRAM_DTYPE -from .common cimport hist_struct from .common cimport X_BINNED_DTYPE_C from .common cimport G_H_DTYPE_C +from .common cimport hist_struct +from ._bitset cimport in_bitset # Notes: @@ -102,12 +104,11 @@ cdef class HistogramBuilder: def compute_histograms_brute( HistogramBuilder self, - const unsigned int [::1] sample_indices, # IN - const unsigned int [:] allowed_features=None, # IN - const int split_feature_idx=-1, # IN - const unsigned int split_bin_start=0, # IN - const unsigned int split_bin_end=0, # IN + const unsigned int [::1] sample_indices, # IN + const unsigned int [:] allowed_features=None, # IN + object parent_split_info=None, # IN const hist_struct [:, ::1] parent_histograms=None, # IN + const bint is_left_child=True, # IN ): """Compute the histograms of the node by scanning through all the data. @@ -122,18 +123,16 @@ cdef class HistogramBuilder: Indices of the features that are allowed by interaction constraints to be split. - split_feature_idx : int - Feature index of the feature that the parent node was split on. - - split_bin_start : unsigned int - Start of the bin indices belonging to the feature that was split on. - - split_bin_end : unsigned int - End (+1) of the bin indices belonging to the feature that was split on. + parent_split_info : split_info_struct + The split_info of the parent node. parent_histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) The histograms of the parent. + is_left_child : bool + True if the histogram of a left child is being computed, False for a right + child. + Returns ------- histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) @@ -157,9 +156,30 @@ cdef class HistogramBuilder: dtype=HISTOGRAM_DTYPE ) bint has_interaction_cst = allowed_features is not None - bint has_parent_hist = split_feature_idx >= 0 + # Feature index of the feature that the parent node was split on. + int split_feature_idx + # Start of the bin indices belonging to the feature that was split on. + unsigned int split_bin_start + # End (+1) of the bin indices belonging to the feature that was split on. 
+ unsigned int split_bin_end + unsigned char is_categorical + BITSET_INNER_DTYPE_C [:] left_cat_bitset + bint has_parent_hist = False int n_threads = self.n_threads + if parent_split_info is not None: + has_parent_hist = True + split_feature_idx = parent_split_info.feature_idx + is_categorical = parent_split_info.is_categorical + if is_left_child: + split_bin_start = 0 + split_bin_end = parent_split_info.bin_idx + 1 + else: + split_bin_start = parent_split_info.bin_idx + 1 + split_bin_end = self.n_bins + if is_categorical: + left_cat_bitset = parent_split_info.left_cat_bitset + if has_interaction_cst: n_allowed_features = allowed_features.shape[0] @@ -194,6 +214,9 @@ cdef class HistogramBuilder: feature_idx=feature_idx, split_bin_start=split_bin_start, split_bin_end=split_bin_end, + is_categorical=is_categorical, + left_cat_bitset=left_cat_bitset, + is_left_child=is_left_child, histograms=histograms, parent_histograms=parent_histograms, ) @@ -204,38 +227,61 @@ cdef class HistogramBuilder: return histograms - cdef void _compute_histogram_single_feature_from_parent( + cpdef void _compute_histogram_single_feature_from_parent( HistogramBuilder self, const int feature_idx, - const unsigned int split_bin_start, # IN - const unsigned int split_bin_end, # IN - const hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] histograms, # OUT + const unsigned int split_bin_start, # IN + const unsigned int split_bin_end, # IN + const unsigned char is_categorical, # IN + const BITSET_INNER_DTYPE_C [:] left_cat_bitset, # IN + const bint is_left_child, # IN + const hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] histograms, # OUT ) noexcept nogil: """Compute the histogram for the feature that was split on.""" cdef: unsigned int bin_idx = 0 - - for bin_idx in range(split_bin_start): - histograms[feature_idx, bin_idx].sum_gradients = 0. - histograms[feature_idx, bin_idx].sum_hessians = 0. - histograms[feature_idx, bin_idx].count = 0 - - for bin_idx in range(split_bin_end, self.n_bins): - histograms[feature_idx, bin_idx].sum_gradients = 0. - histograms[feature_idx, bin_idx].sum_hessians = 0. - histograms[feature_idx, bin_idx].count = 0 - - for bin_idx in range(split_bin_start, split_bin_end): - histograms[feature_idx, bin_idx].sum_gradients = ( - parent_histograms[feature_idx, bin_idx].sum_gradients - ) - histograms[feature_idx, bin_idx].sum_hessians = ( - parent_histograms[feature_idx, bin_idx].sum_hessians - ) - histograms[feature_idx, bin_idx].count = ( - parent_histograms[feature_idx, bin_idx].count - ) + unsigned char in_left_binset + BITSET_INNER_DTYPE_C* p_left_cat_bitset = &left_cat_bitset[0] + + if is_categorical: + for bin_idx in range(self.n_bins): + in_left_binset = in_bitset(p_left_cat_bitset, bin_idx) + if (is_left_child and in_left_binset) or (not is_left_child and not in_left_binset): + histograms[feature_idx, bin_idx].sum_gradients = ( + parent_histograms[feature_idx, bin_idx].sum_gradients + ) + histograms[feature_idx, bin_idx].sum_hessians = ( + parent_histograms[feature_idx, bin_idx].sum_hessians + ) + histograms[feature_idx, bin_idx].count = ( + parent_histograms[feature_idx, bin_idx].count + ) + else: + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + else: + for bin_idx in range(split_bin_start): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. 
+ histograms[feature_idx, bin_idx].count = 0 + + for bin_idx in range(split_bin_end, self.n_bins): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + + for bin_idx in range(split_bin_start, split_bin_end): + histograms[feature_idx, bin_idx].sum_gradients = ( + parent_histograms[feature_idx, bin_idx].sum_gradients + ) + histograms[feature_idx, bin_idx].sum_hessians = ( + parent_histograms[feature_idx, bin_idx].sum_hessians + ) + histograms[feature_idx, bin_idx].count = ( + parent_histograms[feature_idx, bin_idx].count + ) cdef void _compute_histogram_brute_single_feature( HistogramBuilder self, From b512c3ac2813d25ee15bb1b7e605cd161cc734e4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Nov 2023 22:00:39 +0100 Subject: [PATCH 06/12] TST add test for _compute_histogram_single_feature_from_parent --- .../tests/test_histogram.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index 99f74b0f542ee..86d25276995d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -2,12 +2,14 @@ import pytest from numpy.testing import assert_allclose, assert_array_equal +from sklearn.ensemble._hist_gradient_boosting._bitset import set_bitset_memoryview from sklearn.ensemble._hist_gradient_boosting.common import ( G_H_DTYPE, HISTOGRAM_DTYPE, X_BINNED_DTYPE, ) from sklearn.ensemble._hist_gradient_boosting.histogram import ( + HistogramBuilder, _build_histogram, _build_histogram_naive, _build_histogram_no_hessian, @@ -15,6 +17,7 @@ _build_histogram_root_no_hessian, _subtract_histograms, ) +from sklearn.ensemble._hist_gradient_boosting.splitting import SplitInfo @pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram]) @@ -237,3 +240,89 @@ def test_hist_subtraction(constant_hessian): for key in ("count", "sum_hessians", "sum_gradients"): assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6) + + +@pytest.mark.parametrize("is_categorical", [False, True]) +def test_compute_histogram_single_feature_from_parent(is_categorical): + """Test _compute_histogram_single_feature_from_parent.""" + n_bins = 4 + X_binned = np.array([0, 1, 2, 3, 0, 1, 2, 3], dtype=X_BINNED_DTYPE)[:, None] + gradients = np.array([-2, -1, 1, 2, -2, -1, 1, 2], dtype=G_H_DTYPE) + hessians = np.array([-4, -2, 1, 2, -4, -2, 1, 2], dtype=G_H_DTYPE) + # Only bins 0 and 1 go to (child) histogram. 
+ sample_indices = np.array([0, 1, 4, 5]).astype(np.uint32) + left_cat_bitset = np.zeros(shape=(8,), dtype=np.uint32) + set_bitset_memoryview(left_cat_bitset, 0) + set_bitset_memoryview(left_cat_bitset, 1) + assert left_cat_bitset[0] == 3 # 2**0 + 2**1 for bins 0 and 1 + + histogram_builder = HistogramBuilder( + X_binned, + n_bins, + gradients, + hessians, + hessians_are_constant=False, + n_threads=1, + ) + split_info = SplitInfo( + gain=1, # irrelevant for now + feature_idx=0, + bin_idx=1, + missing_go_to_left=True, # irrelevant for now + sum_gradient_left=0, # irrelevant for now + sum_hessian_left=0, # irrelevant for now + sum_gradient_right=0, # irrelevant for now + sum_hessian_right=0, # irrelevant for now + n_samples_left=0, # irrelevant for now + n_samples_right=0, # irrelevant for now + value_left=0, # irrelevant for now + value_right=0, # irrelevant for now + is_categorical=is_categorical, + left_cat_bitset=left_cat_bitset, + ) + hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_parent[0, :]["count"] = 2 + hist_parent[0, 0]["sum_gradients"] = -2 * 2 + hist_parent[0, 1]["sum_gradients"] = -1 * 2 + hist_parent[0, 2]["sum_gradients"] = 1 * 2 + hist_parent[0, 3]["sum_gradients"] = 2 * 2 + hist_parent[0, 0]["sum_hessians"] = -4 * 2 + hist_parent[0, 1]["sum_hessians"] = -2 * 2 + hist_parent[0, 2]["sum_hessians"] = 1 * 2 + hist_parent[0, 3]["sum_hessians"] = 2 * 2 + + hist1 = np.asarray( + histogram_builder.compute_histograms_brute( + sample_indices=sample_indices, + allowed_features=None, + parent_split_info=None, + parent_histograms=None, + is_left_child=True, + ) + ) + + hist2 = np.asanyarray( + histogram_builder.compute_histograms_brute( + sample_indices=sample_indices, + allowed_features=None, + parent_split_info=split_info, + parent_histograms=hist_parent, + is_left_child=True, + ) + ) + + hist3 = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + histogram_builder._compute_histogram_single_feature_from_parent( + feature_idx=0, + split_bin_start=0, + split_bin_end=1 + 1, + is_categorical=is_categorical, + left_cat_bitset=left_cat_bitset, + is_left_child=True, + histograms=hist3, + parent_histograms=hist_parent, + ) + + for key in ("count", "sum_hessians", "sum_gradients"): + assert_allclose(hist2[key], hist1[key], rtol=1e-6) + assert_allclose(hist3[key], hist1[key], rtol=1e-6) From a90bdc0b03aee3b51049c3c2358471ec4df6ac57 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Nov 2023 22:46:54 +0100 Subject: [PATCH 07/12] DOC add whatsnew --- doc/whats_new/v1.4.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 5a9df10e8c49f..91f976465d95d 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -303,6 +303,13 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. +- |Efficiency| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` are faster, roughly `1/n_features` + faster to before with a single thread. The estimators now reuse the parent's node + histogram for the single feature that was split on, i.e. just copy the parent's node + histogram values for the corresponding bins. + :pr:`26189` by :user:`Christian Lorentzen `. + - |Efficiency| :class:`ensemble.GradientBoostingClassifier` is faster, for binary and in particular for multiclass problems thanks to the private loss function module. 
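The efficiency entry above can be made concrete with a bit of arithmetic: per non-root node, only n_features - 1 of the n_features per-feature histograms still need an O(n_samples) scan, the remaining one being copied from the parent, so the histogram-building work per node shrinks by roughly a fraction 1/n_features (about 10% for 10 features); the end-to-end effect on fit time depends on how much of the fit is spent building histograms. A rough way to observe it is to time a single-threaded fit before and after this series (sketch only; it assumes OMP_NUM_THREADS=1 is exported to pin scikit-learn to one thread, and absolute numbers depend on machine and data):

    from time import perf_counter

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor

    X, y = make_regression(n_samples=100_000, n_features=10, random_state=0)
    est = HistGradientBoostingRegressor(max_iter=100, random_state=0)

    tic = perf_counter()
    est.fit(X, y)
    print(f"fit time: {perf_counter() - tic:.2f} s")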
From 8e4e1ce3c9427dde33377bb9abe7495a4a6b0ed2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Nov 2023 22:49:42 +0100 Subject: [PATCH 08/12] DOC correct formula for speed up --- doc/whats_new/v1.4.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 91f976465d95d..023d2f687fa9c 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -304,10 +304,10 @@ Changelog initiated by :user:`Patrick O'Reilly `. - |Efficiency| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` are faster, roughly `1/n_features` - faster to before with a single thread. The estimators now reuse the parent's node - histogram for the single feature that was split on, i.e. just copy the parent's node - histogram values for the corresponding bins. + :class:`ensemble.HistGradientBoostingRegressor` are faster, roughly + `1 - 1/n_features` faster to before with a single thread. The estimators now reuse + the parent's node histogram for the single feature that was split on, i.e. just copy + the parent's node histogram values for the corresponding bins. :pr:`26189` by :user:`Christian Lorentzen `. - |Efficiency| :class:`ensemble.GradientBoostingClassifier` is faster, From 5f13884d90c5fa1627cd1f5e243bb1d273feb880 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Dec 2023 18:46:23 +0100 Subject: [PATCH 09/12] FIX assign pointer to left_cat_bitset only if exist --- sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 37cf475a6ce82..f8976533403e0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -242,9 +242,10 @@ cdef class HistogramBuilder: cdef: unsigned int bin_idx = 0 unsigned char in_left_binset - BITSET_INNER_DTYPE_C* p_left_cat_bitset = &left_cat_bitset[0] + BITSET_INNER_DTYPE_C* p_left_cat_bitset if is_categorical: + p_left_cat_bitset = &left_cat_bitset[0] for bin_idx in range(self.n_bins): in_left_binset = in_bitset(p_left_cat_bitset, bin_idx) if (is_left_child and in_left_binset) or (not is_left_child and not in_left_binset): From 7be6bfea00f7c8e05f7c23ae70bd13c7d17a42cf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 22 Feb 2025 18:18:11 +0100 Subject: [PATCH 10/12] CLN address review comments --- .../_hist_gradient_boosting/grower.py | 15 +++++------ .../_hist_gradient_boosting/histogram.pyx | 27 +++++++++---------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index eebfd7909c651..ac291b616bbb5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -581,14 +581,13 @@ def split_next(self): # (using histogram subtraction). 
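The histogram subtraction mentioned in this comment is the complement of the parent-histogram copy above: every parent sample lands in exactly one child, so the sibling's histogram is the parent's minus the child's, an O(n_features * n_bins) operation independent of n_samples. A field-by-field sketch with illustrative names (not the library's _subtract_histograms routine):

    import numpy as np

    def subtract_histograms(parent_hist, child_hist):
        # Works on structured arrays of shape (n_features, n_bins) holding
        # sum_gradients, sum_hessians and count per bin.
        sibling = np.empty_like(parent_hist)
        for field in ("sum_gradients", "sum_hessians", "count"):
            sibling[field] = parent_hist[field] - child_hist[field]
        return sibling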
n_samples_left = left_child_node.sample_indices.shape[0] n_samples_right = right_child_node.sample_indices.shape[0] - if n_samples_left < n_samples_right: - smallest_child = left_child_node - largest_child = right_child_node - is_left_child = True - else: - smallest_child = right_child_node - largest_child = left_child_node - is_left_child = False + is_left_child = n_samples_left < n_samples_right + if is_left_child: + smallest_child = left_child_node + largest_child = right_child_node + else: + smallest_child = right_child_node + largest_child = left_child_node # We use the brute O(n_samples) method on the child that has the # smallest number of samples, and the subtraction trick O(n_bins) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index f8976533403e0..b72bea5034159 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -157,26 +157,25 @@ cdef class HistogramBuilder: ) bint has_interaction_cst = allowed_features is not None # Feature index of the feature that the parent node was split on. - int split_feature_idx + int parent_split_feature_idx # Start of the bin indices belonging to the feature that was split on. - unsigned int split_bin_start + unsigned int parent_split_bin_start # End (+1) of the bin indices belonging to the feature that was split on. - unsigned int split_bin_end + unsigned int parent_split_bin_end unsigned char is_categorical BITSET_INNER_DTYPE_C [:] left_cat_bitset - bint has_parent_hist = False + bint has_parent_hist = parent_split_info is not None int n_threads = self.n_threads - if parent_split_info is not None: - has_parent_hist = True - split_feature_idx = parent_split_info.feature_idx + if has_parent_hist: + parent_split_feature_idx = parent_split_info.feature_idx is_categorical = parent_split_info.is_categorical if is_left_child: - split_bin_start = 0 - split_bin_end = parent_split_info.bin_idx + 1 + parent_split_bin_start = 0 + parent_split_bin_end = parent_split_info.bin_idx + 1 else: - split_bin_start = parent_split_info.bin_idx + 1 - split_bin_end = self.n_bins + parent_split_bin_start = parent_split_info.bin_idx + 1 + parent_split_bin_end = self.n_bins if is_categorical: left_cat_bitset = parent_split_info.left_cat_bitset @@ -209,11 +208,11 @@ cdef class HistogramBuilder: else: feature_idx = f_idx - if has_parent_hist and feature_idx == split_feature_idx: + if has_parent_hist and feature_idx == parent_split_feature_idx: self._compute_histogram_single_feature_from_parent( feature_idx=feature_idx, - split_bin_start=split_bin_start, - split_bin_end=split_bin_end, + split_bin_start=parent_split_bin_start, + split_bin_end=parent_split_bin_end, is_categorical=is_categorical, left_cat_bitset=left_cat_bitset, is_left_child=is_left_child, From ec8ca5dc87d61bd55c3132df9a89cc3d68d3bd69 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 22 Feb 2025 18:29:36 +0100 Subject: [PATCH 11/12] DOC new whatsnew entry --- .../upcoming_changes/sklearn.ensemble/26189.efficiency.rst | 6 ++++++ doc/whats_new/v1.4.rst | 7 ------- 2 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst b/doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst new file mode 100644 index 0000000000000..3b83889e3e226 --- /dev/null +++ 
b/doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst @@ -0,0 +1,6 @@ +- :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` are faster, roughly + `1 - 1/n_features` faster to before with a single thread. The estimators now reuse + the parent's node histogram for the single feature that was split on, i.e. just copy + the parent's node histogram values for the corresponding bins. + :pr:`26189` by :user:`Christian Lorentzen `. diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index f8fca7e458330..29d4d87e68748 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -645,13 +645,6 @@ Changelog In effect, less memory has to be allocated and deallocated. :pr:`27865` by :user:`Christian Lorentzen `. -- |Efficiency| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` are faster, roughly - `1 - 1/n_features` faster to before with a single thread. The estimators now reuse - the parent's node histogram for the single feature that was split on, i.e. just copy - the parent's node histogram values for the corresponding bins. - :pr:`26189` by :user:`Christian Lorentzen `. - - |Efficiency| :class:`ensemble.GradientBoostingClassifier` is faster, for binary and in particular for multiclass problems thanks to the private loss function module. From 392d553798b4b683cf812ce699e1727a7cb446d3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 22 Feb 2025 18:35:20 +0100 Subject: [PATCH 12/12] FIX code indentation --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 8b5830030b16a..ca30342616ddd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -591,12 +591,12 @@ def split_next(self): n_samples_left = left_child_node.sample_indices.shape[0] n_samples_right = right_child_node.sample_indices.shape[0] is_left_child = n_samples_left < n_samples_right - if is_left_child: - smallest_child = left_child_node - largest_child = right_child_node - else: - smallest_child = right_child_node - largest_child = left_child_node + if is_left_child: + smallest_child = left_child_node + largest_child = right_child_node + else: + smallest_child = right_child_node + largest_child = left_child_node # We use the brute O(n_samples) method on the child that has the # smallest number of samples, and the subtraction trick O(n_bins)
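For the categorical branch added in this series, which bins of the split feature keep their parent statistics is decided by left_cat_bitset rather than by a contiguous bin range: a bin (category) is copied into the left child's histogram exactly when its bit is set, and into the right child's exactly when it is not. A plain-Python sketch of that final behavior, complementing the bin-range sketch given earlier for the numerical case (in_bitset here is a pure-Python stand-in for the Cython helper of the same name, and HIST_DTYPE mirrors HISTOGRAM_DTYPE):

    import numpy as np

    HIST_DTYPE = np.dtype(
        [("sum_gradients", "f8"), ("sum_hessians", "f8"), ("count", "u4")]
    )

    def in_bitset(bitset, value):
        # One bit per category, packed into 32-bit words; setting bits 0 and 1
        # gives bitset[0] == 3, as asserted in the new histogram test.
        return (bitset[value // 32] >> (value % 32)) & 1

    def child_hist_from_parent_categorical(parent_hist, left_cat_bitset, is_left_child):
        child = np.zeros(parent_hist.shape[0], dtype=HIST_DTYPE)
        for bin_idx in range(parent_hist.shape[0]):
            goes_left = bool(in_bitset(left_cat_bitset, bin_idx))
            # Copy the parent bin iff its category belongs to this child.
            if goes_left == is_left_child:
                child[bin_idx] = parent_hist[bin_idx]
        return child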