8000 Fix for making the initial binning in HGBT parallel by OmarManzoor · Pull Request #29386 · scikit-learn/scikit-learn · GitHub

Merged 2 commits on Jul 3, 2024.
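
The diff below replaces the concurrent.futures.ThreadPoolExecutor loop that computed per-feature binning thresholds with scikit-learn's Parallel and delayed utilities (built on top of joblib) using the threading backend, and then writes the per-feature results back to their original positions with a running non_cat_idx counter. The following sketch illustrates that pattern in isolation under stated assumptions: it calls joblib.Parallel directly rather than sklearn.utils.parallel.Parallel, and find_thresholds is a simplified, hypothetical stand-in for the private _find_binning_thresholds helper, not the real implementation.

import numpy as np
from joblib import Parallel, delayed


def find_thresholds(col, max_bins):
    # Hypothetical stand-in for _find_binning_thresholds: evenly spaced
    # percentiles of the column serve as bin thresholds for illustration.
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    return np.percentile(col, percentiles)


rng = np.random.default_rng(0)
X = rng.normal(size=(1_000, 4))
is_categorical = np.array([False, True, False, False])
n_features = X.shape[1]

# Thresholds are computed in threads for the non-categorical features only.
non_cat_thresholds = Parallel(n_jobs=2, backend="threading")(
    delayed(find_thresholds)(X[:, f_idx], max_bins=255)
    for f_idx in range(n_features)
    if not is_categorical[f_idx]
)

# Results come back in submission order, so a running index maps them back
# to their original feature positions, skipping categorical columns.
bin_thresholds = [None] * n_features
non_cat_idx = 0
for f_idx in range(n_features):
    if not is_categorical[f_idx]:
        bin_thresholds[f_idx] = non_cat_thresholds[non_cat_idx]
        non_cat_idx += 1

Note that both the old and the new code use threads rather than processes, so the change swaps the executor API rather than the parallelism model.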
sklearn/ensemble/_hist_gradient_boosting/binning.py: 24 changes (11 additions, 13 deletions)
@@ -7,14 +7,14 @@
"""

# Author: Nicolas Hug
import concurrent.futures

import numpy as np

from ...base import BaseEstimator, TransformerMixin
from ...utils import check_array, check_random_state
from ...utils._openmp_helpers import _openmp_effective_n_threads
from ...utils.fixes import percentile
from ...utils.parallel import Parallel, delayed
from ...utils.validation import check_is_fitted
from ._binning import _map_to_bins
from ._bitset import set_bitset_memoryview
@@ -230,19 +230,13 @@ def fit(self, X, y=None):
         self.bin_thresholds_ = [None] * n_features
         n_bins_non_missing = [None] * n_features

-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=self.n_threads
-        ) as executor:
-            future_to_f_idx = {
-                executor.submit(_find_binning_thresholds, X[:, f_idx], max_bins): f_idx
-                for f_idx in range(n_features)
-                if not self.is_categorical_[f_idx]
-            }
-            for future in concurrent.futures.as_completed(future_to_f_idx):
-                f_idx = future_to_f_idx[future]
-                self.bin_thresholds_[f_idx] = future.result()
-                n_bins_non_missing[f_idx] = self.bin_thresholds_[f_idx].shape[0] + 1
+        non_cat_thresholds = Parallel(n_jobs=self.n_threads, backend="threading")(
+            delayed(_find_binning_thresholds)(X[:, f_idx], max_bins)
+            for f_idx in range(n_features)
+            if not self.is_categorical_[f_idx]
+        )

+        non_cat_idx = 0
         for f_idx in range(n_features):
             if self.is_categorical_[f_idx]:
                 # Since categories are assumed to be encoded in
@@ -252,6 +246,10 @@
                 thresholds = known_categories[f_idx]
                 n_bins_non_missing[f_idx] = thresholds.shape[0]
                 self.bin_thresholds_[f_idx] = thresholds
+            else:
+                self.bin_thresholds_[f_idx] = non_cat_thresholds[non_cat_idx]
+                n_bins_non_missing[f_idx] = self.bin_thresholds_[f_idx].shape[0] + 1
+                non_cat_idx += 1

         self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)
         return self
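
For context, this code path runs during the initial binning step at the start of fit() of the histogram-based gradient boosting estimators, so a plain fit call exercises it. The snippet below is only an illustrative usage example (the data, max_iter, and random_state values are arbitrary choices, not part of the PR).

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 20))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# The initial binning of X into at most max_bins integer-coded bins happens
# at the start of fit(); with this PR the per-feature threshold computation
# goes through Parallel/delayed as shown in the diff above.
clf = HistGradientBoostingClassifier(max_iter=50, random_state=0)
clf.fit(X, y)
print(clf.score(X, y))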