16
16
from .common import X_DTYPE , X_BINNED_DTYPE , ALMOST_INF
17
17
18
18
19
- def _find_binning_thresholds (data , max_bins , subsample , random_state ):
19
+ def _find_binning_thresholds (data , sample_weight , max_bins , subsample ,
20
+ random_state ):
20
21
"""Extract feature-wise quantiles from numerical data.
21
22
22
23
Missing values are ignored for finding the thresholds.
@@ -25,6 +26,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
25
26
----------
26
27
data : array-like, shape (n_samples, n_features)
27
28
The data to bin.
29
+ sample_weight : ndarray of shape(n_samples,), or None
30
+ Sample weights associated with the data.
28
31
max_bins: int
29
32
The maximum number of bins to use for non-missing values. If for a
30
33
given feature the number of unique values is less than ``max_bins``,
@@ -46,9 +49,15 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
46
49
n_features``.
47
50
"""
48
51
rng = check_random_state (random_state )
49
- if subsample is not None and data .shape [0 ] > subsample :
50
- subset = rng .choice (np .arange (data .shape [0 ]), subsample , replace = False )
51
- data = data .take (subset , axis = 0 )
52
+ sample_size = min (subsample , data .shape [0 ])
53
+ if sample_weight is not None :
54
+ subset = rng .choice (np .arange (data .shape [0 ]), size = sample_size ,
55
+ replace = True ,
56
+ p = sample_weight / sample_weight .sum ())
57
+ else :
58
+ subset = rng .choice (np .arange (data .shape [0 ]), size = sample_size ,
59
+ replace = True )
60
+ data = data .take (subset , axis = 0 )
52
61
53
62
binning_thresholds = []
54
63
for f_idx in range (data .shape [1 ]):
@@ -136,7 +145,7 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None):
136
145
self .subsample = subsample
137
146
self .random_state = random_state
138
147
139
- def fit (self , X , y = None ):
148
+ def fit (self , X , y = None , sample_weight = None ):
140
149
"""Fit data X by computing the binning thresholds.
141
150
142
151
The last bin is reserved for missing values, whether missing values
@@ -146,8 +155,10 @@ def fit(self, X, y=None):
146
155
----------
147
156
X : array-like, shape (n_samples, n_features)
148
157
The data to bin.
149
- y: None
158
+ y : None
150
159
Ignored.
160
+ sample_weight : ndarray of shape(n_samples,), or None
161
+ Sample weights associated with the data.
151
162
152
163
Returns
153
164
-------
@@ -161,7 +172,7 @@ def fit(self, X, y=None):
161
172
X = check_array (X , dtype = [X_DTYPE ], force_all_finite = False )
162
173
max_bins = self .n_bins - 1
163
174
self .bin_thresholds_ = _find_binning_thresholds (
164
- X , max_bins , subsample = self .subsample ,
175
+ X , sample_weight , max_bins , subsample = self .subsample ,
165
176
random_state = self .random_state )
166
177
167
178
self .n_bins_non_missing_ = np .array (
0 commit comments