diff --git a/doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst b/doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst new file mode 100644 index 0000000000000..3b83889e3e226 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.ensemble/26189.efficiency.rst @@ -0,0 +1,6 @@ +- :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` are faster, roughly + `1 - 1/n_features` faster to before with a single thread. The estimators now reuse + the parent's node histogram for the single feature that was split on, i.e. just copy + the parent's node histogram values for the corresponding bins. + :pr:`26189` by :user:`Christian Lorentzen `. diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index 9ff9fc89800d7..3aa50180ee8ce 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -37,6 +37,25 @@ cdef packed struct node_struct: unsigned int bitset_idx +cdef struct split_info_struct: + # Same as the SplitInfo class, but we need a C struct to use it in the + # nogil sections and to use in arrays. + Y_DTYPE_C gain + int feature_idx + unsigned int bin_idx + uint8_t missing_go_to_left + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + unsigned int n_samples_left + unsigned int n_samples_right + Y_DTYPE_C value_left + Y_DTYPE_C value_right + uint8_t is_categorical + BITSET_DTYPE_C left_cat_bitset + + cpdef enum MonotonicConstraint: NO_CST = 0 POS = 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index a71e564056f8f..ca30342616ddd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -590,7 +590,8 @@ def split_next(self): # (using histogram subtraction). n_samples_left = left_child_node.sample_indices.shape[0] n_samples_right = right_child_node.sample_indices.shape[0] - if n_samples_left < n_samples_right: + is_left_child = n_samples_left < n_samples_right + if is_left_child: smallest_child = left_child_node largest_child = right_child_node else: @@ -603,7 +604,11 @@ def split_next(self): # Note that both left and right child have the same allowed_features. tic = time() smallest_child.histograms = self.histogram_builder.compute_histograms_brute( - smallest_child.sample_indices, smallest_child.allowed_features + smallest_child.sample_indices, + smallest_child.allowed_features, + parent_split_info=node.split_info, + parent_histograms=node.histograms, + is_left_child=is_left_child, ) largest_child.histograms = ( self.histogram_builder.compute_histograms_subtraction( diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index e204eec6b9785..087970012d660 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -9,10 +9,12 @@ from libc.string cimport memset import numpy as np +from .common cimport BITSET_INNER_DTYPE_C from .common import HISTOGRAM_DTYPE -from .common cimport hist_struct from .common cimport X_BINNED_DTYPE_C from .common cimport G_H_DTYPE_C +from .common cimport hist_struct +from ._bitset cimport in_bitset from ...utils._typedefs cimport uint8_t @@ -105,8 +107,11 @@ cdef class HistogramBuilder: def compute_histograms_brute( HistogramBuilder self, - const unsigned int [::1] sample_indices, # IN - const unsigned int [:] allowed_features=None, # IN + const unsigned int [::1] sample_indices, # IN + const unsigned int [:] allowed_features=None, # IN + object parent_split_info=None, # IN + const hist_struct [:, ::1] parent_histograms=None, # IN + const bint is_left_child=True, # IN ): """Compute the histograms of the node by scanning through all the data. @@ -121,6 +126,16 @@ cdef class HistogramBuilder: Indices of the features that are allowed by interaction constraints to be split. + parent_split_info : split_info_struct + The split_info of the parent node. + + parent_histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) + The histograms of the parent. + + is_left_child : bool + True if the histogram of a left child is being computed, False for a right + child. + Returns ------- histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) @@ -144,8 +159,29 @@ cdef class HistogramBuilder: dtype=HISTOGRAM_DTYPE ) bint has_interaction_cst = allowed_features is not None + # Feature index of the feature that the parent node was split on. + int parent_split_feature_idx + # Start of the bin indices belonging to the feature that was split on. + unsigned int parent_split_bin_start + # End (+1) of the bin indices belonging to the feature that was split on. + unsigned int parent_split_bin_end + unsigned char is_categorical + BITSET_INNER_DTYPE_C [:] left_cat_bitset + bint has_parent_hist = parent_split_info is not None int n_threads = self.n_threads + if has_parent_hist: + parent_split_feature_idx = parent_split_info.feature_idx + is_categorical = parent_split_info.is_categorical + if is_left_child: + parent_split_bin_start = 0 + parent_split_bin_end = parent_split_info.bin_idx + 1 + else: + parent_split_bin_start = parent_split_info.bin_idx + 1 + parent_split_bin_end = self.n_bins + if is_categorical: + left_cat_bitset = parent_split_info.left_cat_bitset + if has_interaction_cst: n_allowed_features = allowed_features.shape[0] @@ -175,12 +211,81 @@ cdef class HistogramBuilder: else: feature_idx = f_idx - self._compute_histogram_brute_single_feature( - feature_idx, sample_indices, histograms - ) + if has_parent_hist and feature_idx == parent_split_feature_idx: + self._compute_histogram_single_feature_from_parent( + feature_idx=feature_idx, + split_bin_start=parent_split_bin_start, + split_bin_end=parent_split_bin_end, + is_categorical=is_categorical, + left_cat_bitset=left_cat_bitset, + is_left_child=is_left_child, + histograms=histograms, + parent_histograms=parent_histograms, + ) + else: + self._compute_histogram_brute_single_feature( + feature_idx, sample_indices, histograms + ) return histograms + cpdef void _compute_histogram_single_feature_from_parent( + HistogramBuilder self, + const int feature_idx, + const unsigned int split_bin_start, # IN + const unsigned int split_bin_end, # IN + const unsigned char is_categorical, # IN + const BITSET_INNER_DTYPE_C [:] left_cat_bitset, # IN + const bint is_left_child, # IN + const hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] histograms, # OUT + ) noexcept nogil: + """Compute the histogram for the feature that was split on.""" + cdef: + unsigned int bin_idx = 0 + unsigned char in_left_binset + BITSET_INNER_DTYPE_C* p_left_cat_bitset + + if is_categorical: + p_left_cat_bitset = &left_cat_bitset[0] + for bin_idx in range(self.n_bins): + in_left_binset = in_bitset(p_left_cat_bitset, bin_idx) + if (is_left_child and in_left_binset) or (not is_left_child and not in_left_binset): + histograms[feature_idx, bin_idx].sum_gradients = ( + parent_histograms[feature_idx, bin_idx].sum_gradients + ) + histograms[feature_idx, bin_idx].sum_hessians = ( + parent_histograms[feature_idx, bin_idx].sum_hessians + ) + histograms[feature_idx, bin_idx].count = ( + parent_histograms[feature_idx, bin_idx].count + ) + else: + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + else: + for bin_idx in range(split_bin_start): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + + for bin_idx in range(split_bin_end, self.n_bins): + histograms[feature_idx, bin_idx].sum_gradients = 0. + histograms[feature_idx, bin_idx].sum_hessians = 0. + histograms[feature_idx, bin_idx].count = 0 + + for bin_idx in range(split_bin_start, split_bin_end): + histograms[feature_idx, bin_idx].sum_gradients = ( + parent_histograms[feature_idx, bin_idx].sum_gradients + ) + histograms[feature_idx, bin_idx].sum_hessians = ( + parent_histograms[feature_idx, bin_idx].sum_hessians + ) + histograms[feature_idx, bin_idx].count = ( + parent_histograms[feature_idx, bin_idx].count + ) + cdef void _compute_histogram_brute_single_feature( HistogramBuilder self, const int feature_idx, diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index de5b92f13c31a..0919d82345740 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -19,34 +19,16 @@ from libc.string cimport memcpy from ...utils._typedefs cimport uint8_t from .common cimport X_BINNED_DTYPE_C from .common cimport Y_DTYPE_C -from .common cimport hist_struct from .common cimport BITSET_INNER_DTYPE_C from .common cimport BITSET_DTYPE_C from .common cimport MonotonicConstraint +from .common cimport hist_struct +from .common cimport split_info_struct from ._bitset cimport init_bitset from ._bitset cimport set_bitset from ._bitset cimport in_bitset -cdef struct split_info_struct: - # Same as the SplitInfo class, but we need a C struct to use it in the - # nogil sections and to use in arrays. - Y_DTYPE_C gain - int feature_idx - unsigned int bin_idx - uint8_t missing_go_to_left - Y_DTYPE_C sum_gradient_left - Y_DTYPE_C sum_gradient_right - Y_DTYPE_C sum_hessian_left - Y_DTYPE_C sum_hessian_right - unsigned int n_samples_left - unsigned int n_samples_right - Y_DTYPE_C value_left - Y_DTYPE_C value_right - uint8_t is_categorical - BITSET_DTYPE_C left_cat_bitset - - # used in categorical splits for sorting categories by increasing values of # sum_gradients / sum_hessians cdef struct categorical_info: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index 22375c7d4ea2c..10d9f4d15d6c6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -2,12 +2,14 @@ import pytest from numpy.testing import assert_allclose, assert_array_equal +from sklearn.ensemble._hist_gradient_boosting._bitset import set_bitset_memoryview from sklearn.ensemble._hist_gradient_boosting.common import ( G_H_DTYPE, HISTOGRAM_DTYPE, X_BINNED_DTYPE, ) from sklearn.ensemble._hist_gradient_boosting.histogram import ( + HistogramBuilder, _build_histogram, _build_histogram_naive, _build_histogram_no_hessian, @@ -15,6 +17,7 @@ _build_histogram_root_no_hessian, _subtract_histograms, ) +from sklearn.ensemble._hist_gradient_boosting.splitting import SplitInfo @pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram]) @@ -237,3 +240,89 @@ def test_hist_subtraction(constant_hessian): for key in ("count", "sum_hessians", "sum_gradients"): assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6) + + +@pytest.mark.parametrize("is_categorical", [False, True]) +def test_compute_histogram_single_feature_from_parent(is_categorical): + """Test _compute_histogram_single_feature_from_parent.""" + n_bins = 4 + X_binned = np.array([0, 1, 2, 3, 0, 1, 2, 3], dtype=X_BINNED_DTYPE)[:, None] + gradients = np.array([-2, -1, 1, 2, -2, -1, 1, 2], dtype=G_H_DTYPE) + hessians = np.array([-4, -2, 1, 2, -4, -2, 1, 2], dtype=G_H_DTYPE) + # Only bins 0 and 1 go to (child) histogram. + sample_indices = np.array([0, 1, 4, 5]).astype(np.uint32) + left_cat_bitset = np.zeros(shape=(8,), dtype=np.uint32) + set_bitset_memoryview(left_cat_bitset, 0) + set_bitset_memoryview(left_cat_bitset, 1) + assert left_cat_bitset[0] == 3 # 2**0 + 2**1 for bins 0 and 1 + + histogram_builder = HistogramBuilder( + X_binned, + n_bins, + gradients, + hessians, + hessians_are_constant=False, + n_threads=1, + ) + split_info = SplitInfo( + gain=1, # irrelevant for now + feature_idx=0, + bin_idx=1, + missing_go_to_left=True, # irrelevant for now + sum_gradient_left=0, # irrelevant for now + sum_hessian_left=0, # irrelevant for now + sum_gradient_right=0, # irrelevant for now + sum_hessian_right=0, # irrelevant for now + n_samples_left=0, # irrelevant for now + n_samples_right=0, # irrelevant for now + value_left=0, # irrelevant for now + value_right=0, # irrelevant for now + is_categorical=is_categorical, + left_cat_bitset=left_cat_bitset, + ) + hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_parent[0, :]["count"] = 2 + hist_parent[0, 0]["sum_gradients"] = -2 * 2 + hist_parent[0, 1]["sum_gradients"] = -1 * 2 + hist_parent[0, 2]["sum_gradients"] = 1 * 2 + hist_parent[0, 3]["sum_gradients"] = 2 * 2 + hist_parent[0, 0]["sum_hessians"] = -4 * 2 + hist_parent[0, 1]["sum_hessians"] = -2 * 2 + hist_parent[0, 2]["sum_hessians"] = 1 * 2 + hist_parent[0, 3]["sum_hessians"] = 2 * 2 + + hist1 = np.asarray( + histogram_builder.compute_histograms_brute( + sample_indices=sample_indices, + allowed_features=None, + parent_split_info=None, + parent_histograms=None, + is_left_child=True, + ) + ) + + hist2 = np.asanyarray( + histogram_builder.compute_histograms_brute( + sample_indices=sample_indices, + allowed_features=None, + parent_split_info=split_info, + parent_histograms=hist_parent, + is_left_child=True, + ) + ) + + hist3 = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + histogram_builder._compute_histogram_single_feature_from_parent( + feature_idx=0, + split_bin_start=0, + split_bin_end=1 + 1, + is_categorical=is_categorical, + left_cat_bitset=left_cat_bitset, + is_left_child=True, + histograms=hist3, + parent_histograms=hist_parent, + ) + + for key in ("count", "sum_hessians", "sum_gradients"): + assert_allclose(hist2[key], hist1[key], rtol=1e-6) + assert_allclose(hist3[key], hist1[key], rtol=1e-6)