8000 ENH reuse parent histogram in HGBT by lorentzenchr · Pull Request #26189 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH reuse parent histogram in HGBT #26189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` are faster, roughly
`1 - 1/n_features` faster to before with a single thread. The estimators now reuse
the parent's node histogram for the single feature that was split on, i.e. just copy
the parent's node histogram values for the corresponding bins.
:pr:`26189` by :user:`Christian Lorentzen <lorentzenchr>`.
19 changes: 19 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/common.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,25 @@ cdef packed struct node_struct:
unsigned int bitset_idx


cdef struct split_info_struct:
# Same as the SplitInfo class, but we need a C struct to use it in the
# nogil sections and to use in arrays.
Y_DTYPE_C gain
int feature_idx
unsigned int bin_idx
uint8_t missing_go_to_left
Y_DTYPE_C sum_gradient_left
Y_DTYPE_C sum_gradient_right
Y_DTYPE_C sum_hessian_left
Y_DTYPE_C sum_hessian_right
unsigned int n_samples_left
unsigned int n_samples_right
Y_DTYPE_C value_left
Y_DTYPE_C value_right
uint8_t is_categorical
BITSET_DTYPE_C left_cat_bitset


cpdef enum MonotonicConstraint:
NO_CST = 0
POS = 1
Expand Down
9 changes: 7 additions & 2 deletions sklearn/ensemble/_hist_gradient_boosting/grower.py
Original file line number Diff line number Diff line change Expand Up @@ -590,7 +590,8 @@ def split_next(self): # (using histogram subtraction). n_samples_left = left_child_node.sample_indices.shape[0] n_samples_right = right_child_node.sample_indices.shape[0] if n_samples_left < n_samples_right: is_left_child = n_samples_left < n_samples_right if is_left_child: smallest_child = left_child_node largest_child = right_child_node else: Expand All @@ -603,7 +604,11 @@ def split_next(self): # Note that both left and right child have the same allowed_features. tic = time() smallest_child.histograms = self.histogram_builder.compute_histograms_brute( smallest_child.sample_indices, smallest_child.allowed_features smallest_child.sample_indices, smallest_child.allowed_features, parent_split_info=node.split_info, parent_histograms=node.histograms, is_left_child=is_left_child, ) largest_child.histograms = ( self.histogram_builder.compute_histograms_subtraction( Expand Down
117 changes: 111 additions & 6 deletions sklearn/ensemble/_hist_gradient_boosting/histogram.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ from libc.string cimport memset

import numpy as np

from .common cimport BITSET_INNER_DTYPE_C
from .common import HISTOGRAM_DTYPE
from .common cimport hist_struct
from .common cimport X_BINNED_DTYPE_C
from .common cimport G_H_DTYPE_C
from .common cimport hist_struct
from ._bitset cimport in_bitset
from ...utils._typedefs cimport uint8_t


Expand Down Expand Up @@ -105,8 +107,11 @@ cdef class HistogramBuilder:

def compute_histograms_brute(
HistogramBuilder self,
const unsigned int [::1] sample_indices, # IN
const unsigned int [:] allowed_features=None, # IN
const unsigned int [::1] sample_indices, # IN
const unsigned int [:] allowed_features=None, # IN
object parent_split_info=None, # IN
const hist_struct [:, ::1] parent_histograms=None, # IN
const bint is_left_child=True, # IN
):
"""Compute the histograms of the node by scanning through all the data.

Expand All @@ -121,6 +126,16 @@ cdef class HistogramBuilder:
Indices of the features that are allowed by interaction constraints to be
split.

parent_split_info : split_info_struct
The split_info of the parent node.

parent_histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins)
The histograms of the parent.

is_left_child : bool
True if the histogram of a left child is being computed, False for a right
child.

Returns
-------
histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins)
Expand All @@ -144,8 +159,29 @@ cdef class HistogramBuilder:
dtype=HISTOGRAM_DTYPE
)
bint has_interaction_cst = allowed_features is not None
# Feature index of the feature that the parent node was split on.
int parent_split_feature_idx
# Start of the bin indices belonging to the feature that was split on.
unsigned int parent_split_bin_start
# End (+1) of the bin indices belonging to the feature that was split on.
unsigned int parent_split_bin_end
unsigned char is_categorical
BITSET_INNER_DTYPE_C [:] left_cat_bitset
bint has_parent_hist = parent_split_info is not None
int n_threads = self.n_threads

if has_parent_hist:
parent_split_feature_idx = parent_split_info.feature_idx
is_categorical = parent_split_info.is_categorical
if is_left_child:
parent_split_bin_start = 0
parent_split_bin_end = parent_split_info.bin_idx + 1
else:
parent_split_bin_start = parent_split_info.bin_idx + 1
parent_split_bin_end = self.n_bins
if is_categorical:
left_cat_bitset = parent_split_info.left_cat_bitset

if has_interaction_cst:
n_allowed_features = allowed_features.shape[0]

Expand Down Expand Up @@ -175,12 +211,81 @@ cdef class HistogramBuilder:
else:
feature_idx = f_idx

self._compute_histogram_brute_single_feature(
feature_idx, sample_indices, histograms
)
if has_parent_hist and feature_idx == parent_split_feature_idx:
self._compute_histogram_single_feature_from_parent(
feature_idx=feature_idx,
split_bin_start=parent_split_bin_start,
split_bin_end=parent_split_bin_end,
is_categorical=is_categorical,
left_cat_bitset=left_cat_bitset,
is_left_child=is_left_child,
histograms=histograms,
parent_histograms=parent_histograms,
)
else:
self._compute_histogram_brute_single_feature(
feature_idx, sample_indices, histograms
)

return histograms

cpdef void _compute_histogram_single_feature_from_parent(
HistogramBuilder self,
const int feature_idx,
const unsigned int split_bin_start, # IN
const unsigned int split_bin_end, # IN
const unsigned char is_categorical, # IN
const BITSET_INNER_DTYPE_C [:] left_cat_bitset, # IN
const bint is_left_child, # IN
const hist_struct [:, ::1] parent_histograms, # IN
hist_struct [:, ::1] histograms, # OUT
) noexcept nogil:
"""Compute the histogram for the feature that was split on."""
cdef:
unsigned int bin_idx = 0
unsigned char in_left_binset
BITSET_INNER_DTYPE_C* p_left_cat_bitset

if is_categorical:
p_left_cat_bitset = &left_cat_bitset[0]
for bin_idx in range(self.n_bins):
in_left_binset = in_bitset(p_left_cat_bitset, bin_idx)
if (is_left_child and in_left_binset) or (not is_left_child and not in_left_binset):
histograms[feature_idx, bin_idx].sum_gradients = (
parent_histograms[feature_idx, bin_idx].sum_gradients
)
histograms[feature_idx, bin_idx].sum_hessians = (
parent_histograms[feature_idx, bin_idx].sum_hessians
)
histograms[feature_idx, bin_idx].count = (
parent_histograms[feature_idx, bin_idx].count
)
else:
histograms[feature_idx, bin_idx].sum_gradients = 0.
histograms[feature_idx, bin_idx].sum_hessians = 0.
histograms[feature_idx, bin_idx].count = 0
else:
for bin_idx in range(split_bin_start):
9E88 histograms[feature_idx, bin_idx].sum_gradients = 0.
histograms[feature_idx, bin_idx].sum_hessians = 0.
histograms[feature_idx, bin_idx].count = 0

for bin_idx in range(split_bin_end, self.n_bins):
histograms[feature_idx, bin_idx].sum_gradients = 0.
histograms[feature_idx, bin_idx].sum_hessians = 0.
histograms[feature_idx, bin_idx].count = 0

for bin_idx in range(split_bin_start, split_bin_end):
histograms[feature_idx, bin_idx].sum_gradients = (
parent_histograms[feature_idx, bin_idx].sum_gradients
)
histograms[feature_idx, bin_idx].sum_hessians = (
parent_histograms[feature_idx, bin_idx].sum_hessians
)
histograms[feature_idx, bin_idx].count = (
parent_histograms[feature_idx, bin_idx].count
)

cdef void _compute_histogram_brute_single_feature(
HistogramBuilder self,
const int feature_idx,
Expand Down
22 changes: 2 additions & 20 deletions sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,34 +19,16 @@ from libc.string cimport memcpy
from ...utils._typedefs cimport uint8_t
from .common cimport X_BINNED_DTYPE_C
from .common cimport Y_DTYPE_C
from .common cimport hist_struct
from .common cimport BITSET_INNER_DTYPE_C
from .common cimport BITSET_DTYPE_C
from .common cimport MonotonicConstraint
from .common cimport hist_struct
from .common cimport split_info_struct
from ._bitset cimport init_bitset
from ._bitset cimport set_bitset
from ._bitset cimport in_bitset


cdef struct split_info_struct:
# Same as the SplitInfo class, but we need a C struct to use it in the
# nogil sections and to use in arrays.
Y_DTYPE_C gain
int feature_idx
unsigned int bin_idx
uint8_t missing_go_to_left
Y_DTYPE_C sum_gradient_left
Y_DTYPE_C sum_gradient_right
Y_DTYPE_C sum_hessian_left
Y_DTYPE_C sum_hessian_right
unsigned int n_samples_left
unsigned int n_samples_right
Y_DTYPE_C value_left
Y_DTYPE_C value_right
uint8_t is_categorical
BITSET_DTYPE_C left_cat_bitset


# used in categorical splits for sorting categories by increasing values of
# sum_gradients / sum_hessians
cdef struct categorical_info:
Expand Down
89 changes: 89 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,22 @@
import pytest
from numpy.testing import assert_allclose, assert_array_equal

from sklearn.ensemble._hist_gradient_boosting._bitset import set_bitset_memoryview
from sklearn.ensemble._hist_gradient_boosting.common import (
G_H_DTYPE,
HISTOGRAM_DTYPE,
X_BINNED_DTYPE,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import (
HistogramBuilder,
_build_histogram,
_build_histogram_naive,
_build_histogram_no_hessian,
_build_histogram_root,
_build_histogram_root_no_hessian,
_subtract_histograms,
)
from sklearn.ensemble._hist_gradient_boosting.splitting import SplitInfo


@pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram])
Expand Down Expand Up @@ -237,3 +240,89 @@ def test_hist_subtraction(constant_hessian):
for key in ("count", "sum_hessians", "sum_gradients"):
assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)


@pytest.mark.parametrize("is_categorical", [False, True])
def test_compute_histogram_single_feature_from_parent(is_categorical):
"""Test _compute_histogram_single_feature_from_parent."""
n_bins = 4
X_binned = np.array([0, 1, 2, 3, 0, 1, 2, 3], dtype=X_BINNED_DTYPE)[:, None]
gradients = np.array([-2, -1, 1, 2, -2, -1, 1, 2], dtype=G_H_DTYPE)
hessians = np.array([-4, -2, 1, 2, -4, -2, 1, 2], dtype=G_H_DTYPE)
# Only bins 0 and 1 go to (child) histogram.
sample_indices = np.array([0, 1, 4, 5]).astype(np.uint32)
left_cat_bitset = np.zeros(shape=(8,), dtype=np.uint32)
set_bitset_memoryview(left_cat_bitset, 0)
set_bitset_memoryview(left_cat_bitset, 1)
assert left_cat_bitset[0] == 3 # 2**0 + 2**1 for bins 0 and 1

histogram_builder = HistogramBuilder(
X_binned,
n_bins,
gradients,
hessians,
hessians_are_constant=False,
n_threads=1,
)
split_info = SplitInfo(
gain=1, # irrelevant for now
feature_idx=0,
bin_idx=1,
missing_go_to_left=True, # irrelevant for now
sum_gradient_left=0, # irrelevant for now
sum_hessian_left=0, # irrelevant for now
sum_gradient_right=0, # irrelevant for now
sum_hessian_right=0, # irrelevant for now
n_samples_left=0, # irrelevant for now
n_samples_right=0, # irrelevant for now
value_left=0, # irrelevant for now
value_right=0, # irrelevant for now
is_categorical=is_categorical,
left_cat_bitset=left_cat_bitset,
)
hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
hist_parent[0, :]["count"] = 2
hist_parent[0, 0]["sum_gradients"] = -2 * 2
hist_parent[0, 1]["sum_gradients"] = -1 * 2
hist_parent[0, 2]["sum_gradients"] = 1 * 2
hist_parent[0, 3]["sum_gradients"] = 2 * 2
hist_parent[0, 0]["sum_hessians"] = -4 * 2
hist_parent[0, 1]["sum_hessians"] = -2 * 2
hist_parent[0, 2]["sum_hessians"] = 1 * 2
hist_parent[0, 3]["sum_hessians"] = 2 * 2

hist1 = np.asarray(
histogram_builder.compute_histograms_brute(
sample_indices=sample_indices,
allowed_features=None,
parent_split_info=None,
parent_histograms=None,
is_left_child=True,
)
)

hist2 = np.asanyarray(
histogram_builder.compute_histograms_brute(
sample_indices=sample_indices,
allowed_features=None,
parent_split_info=split_info,
parent_histograms=hist_parent,
is_left_child=True,
)
)

hist3 = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
histogram_builder._compute_histogram_single_feature_from_parent(
feature_idx=0,
split_bin_start=0,
split_bin_end=1 + 1,
is_categorical=is_categorical,
left_cat_bitset=left_cat_bitset,
is_left_child=True,
histograms=hist3,
parent_histograms=hist_parent,
)

for key in ("count", "sum_hessians", "sum_gradients"):
assert_allclose(hist2[key], hist1[key], rtol=1e-6)
assert_allclose(hist3[key], hist1[key], rtol=1e-6)
0