sample with replacement before binning · adrinjalali/scikit-learn@76dc710

Commit 76dc710

sample with replacement before binning
1 parent b759142 commit 76dc710

3 files changed: +29 −12 lines changed

sklearn/ensemble/_hist_gradient_boosting/binning.py

Lines changed: 18 additions & 7 deletions
@@ -16,7 +16,8 @@
 from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
 
 
-def _find_binning_thresholds(data, max_bins, subsample, random_state):
+def _find_binning_thresholds(data, sample_weight, max_bins, subsample,
+                             random_state):
     """Extract feature-wise quantiles from numerical data.
 
     Missing values are ignored for finding the thresholds.
@@ -25,6 +26,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
     ----------
     data : array-like, shape (n_samples, n_features)
         The data to bin.
+    sample_weight : ndarray of shape (n_samples,), or None
+        Sample weights associated with the data.
     max_bins: int
         The maximum number of bins to use for non-missing values. If for a
         given feature the number of unique values is less than ``max_bins``,
@@ -46,9 +49,15 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
         n_features``.
     """
     rng = check_random_state(random_state)
-    if subsample is not None and data.shape[0] > subsample:
-        subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False)
-        data = data.take(subset, axis=0)
+    sample_size = min(subsample, data.shape[0])
+    if sample_weight is not None:
+        subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
+                            replace=True,
+                            p=sample_weight / sample_weight.sum())
+    else:
+        subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
+                            replace=True)
+    data = data.take(subset, axis=0)
 
     binning_thresholds = []
     for f_idx in range(data.shape[1]):
@@ -136,7 +145,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
         self.subsample = subsample
         self.random_state = random_state
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, sample_weight=None):
         """Fit data X by computing the binning thresholds.
 
         The last bin is reserved for missing values, whether missing values
@@ -146,8 +155,10 @@ def fit(self, X, y=None):
         ----------
         X : array-like, shape (n_samples, n_features)
             The data to bin.
-        y: None
+        y : None
             Ignored.
+        sample_weight : ndarray of shape (n_samples,), or None
+            Sample weights associated with the data.
 
         Returns
         -------
@@ -161,7 +172,7 @@ def fit(self, X, y=None):
         X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
         max_bins = self.n_bins - 1
         self.bin_thresholds_ = _find_binning_thresholds(
-            X, max_bins, subsample=self.subsample,
+            X, sample_weight, max_bins, subsample=self.subsample,
             random_state=self.random_state)
 
         self.n_bins_non_missing_ = np.array(
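
The substantive change is the subsampling block: where the old code drew at most subsample rows without replacement (and skipped sampling entirely for small inputs), the new code always resamples sample_size rows with replacement, biased by sample_weight when it is given, so that the plain percentile computation further down approximates weighted quantiles. A minimal standalone sketch of that logic; the toy data, sample_weight, and subsample values below are illustrative, not from the commit:

import numpy as np

# Sketch of the new subsampling step in _find_binning_thresholds.
rng = np.random.RandomState(42)
data = rng.normal(size=(1000, 3))
sample_weight = rng.uniform(size=1000)
subsample = 200

sample_size = min(subsample, data.shape[0])
if sample_weight is not None:
    # Heavier rows are drawn proportionally more often, so unweighted
    # percentiles of the resample approximate weighted percentiles.
    subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
                        replace=True,
                        p=sample_weight / sample_weight.sum())
else:
    subset = rng.choice(np.arange(data.shape[0]), size=sample_size,
                        replace=True)
data = data.take(subset, axis=0)
print(data.shape)  # (200, 3)

Two details worth noting, both visible in the diff: the unweighted branch also switches to replace=True, and the new code assumes subsample is an integer, since min(None, n) would raise a TypeError where the old guard tolerated subsample=None.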

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 10 additions & 4 deletions
@@ -184,9 +184,13 @@ def fit(self, X, y, sample_weight=None):
         n_bins = self.max_bins + 1  # + 1 for missing values
         self.bin_mapper_ = _BinMapper(n_bins=n_bins,
                                       random_state=self._random_seed)
-        X_binned_train = self._bin_data(X_train, is_training_data=True)
+        X_binned_train = self._bin_data(X_train,
+                                        sample_weight=sample_weight_train,
+                                        is_training_data=True)
         if X_val is not None:
-            X_binned_val = self._bin_data(X_val, is_training_data=False)
+            X_binned_val = self._bin_data(X_val,
+                                          sample_weight=sample_weight_val,
+                                          is_training_data=False)
         else:
             X_binned_val = None
 
@@ -554,7 +558,7 @@ def _should_stop(self, scores):
                                for score in recent_scores]
         return not any(recent_improvements)
 
-    def _bin_data(self, X, is_training_data):
+    def _bin_data(self, X, sample_weight, is_training_data):
         """Bin data X.
 
         If is_training_data, then set the bin_mapper_ attribute.
@@ -567,7 +571,9 @@ def _bin_data(self, X, is_training_data):
                X.nbytes / 1e9, description), end="", flush=True)
         tic = time()
         if is_training_data:
-            X_binned = self.bin_mapper_.fit_transform(X)  # F-aligned array
+            # F-aligned array
+            X_binned = self.bin_mapper_.fit_transform(
+                X, sample_weight=sample_weight)
         else:
             X_binned = self.bin_mapper_.transform(X)  # F-aligned array
         # We convert the array to C-contiguous since predicting is faster
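
The weights now travel fit -> _bin_data -> _BinMapper.fit_transform, so the training weights influence the bin thresholds themselves; the validation split still goes through transform(), which reuses the fitted thresholds, so sample_weight_val is accepted but does not affect the mapping. A hedged usage sketch against the private mapper (private API, arbitrary shapes and seed):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

# On this branch, _BinMapper.fit takes sample_weight, and the inherited
# fit_transform forwards it, which is the same call _bin_data makes above.
rng = np.random.RandomState(0)
X = rng.normal(size=(500, 2))          # float64, matching X_DTYPE
sample_weight = rng.uniform(size=500)

mapper = _BinMapper(n_bins=256, random_state=0)
X_binned = mapper.fit_transform(X, sample_weight=sample_weight)
print(X_binned.shape, X_binned.dtype)  # (500, 2) uint8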

sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
                              random_state=None):
     # Just a redef to avoid having to pass arguments all the time (as the
     # function is private we don't use default values for parameters)
-    return _find_binning_thresholds_orig(data, max_bins, subsample,
+    return _find_binning_thresholds_orig(data, None, max_bins, subsample,
                                          random_state)
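
With the test wrapper updated, one informal way to exercise the new weighted path (a sketch, not a test from this commit) is to check that integer weights behave roughly like row duplication, since the thresholds are percentiles of a weight-proportional resample:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import (
    _find_binning_thresholds as _find_binning_thresholds_orig)

# Hypothetical sanity check: weighting a row by an integer w should act
# roughly like repeating it w times. The comparison is statistical, so we
# only look at the largest deviation rather than asserting equality.
rng = np.random.RandomState(0)
data = rng.normal(size=(2000, 1))
weights = rng.randint(1, 4, size=2000).astype(np.float64)
repeated = np.repeat(data, weights.astype(int), axis=0)

weighted = _find_binning_thresholds_orig(
    data, weights, max_bins=255, subsample=int(2e5), random_state=0)
duplicated = _find_binning_thresholds_orig(
    repeated, None, max_bins=255, subsample=int(2e5), random_state=0)
print(np.abs(weighted[0] - duplicated[0]).max())  # expected to be small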
