from ._gradient_boosting import _random_sample_mask


+def _weighted_percentile(arr, sample_weight, percentile=50):
+    """Compute the weighted ``percentile`` of ``arr`` with ``sample_weight``."""
+    sorted_idx = np.argsort(arr)
+
+    # Find the first index at which the cumulative weight reaches the
+    # requested fraction of the total weight.
+    weight_cdf = sample_weight[sorted_idx].cumsum()
+    percentile_or_above = weight_cdf >= (percentile / 100.0) * weight_cdf[-1]
+    percentile_idx = percentile_or_above.argmax()
+
+    return arr[sorted_idx[percentile_idx]]
+
+
class QuantileEstimator(BaseEstimator):
    """An estimator predicting the alpha-quantile of the training targets."""
    def __init__(self, alpha=0.9):
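
As a quick sanity check of the helper above (a standalone sketch; it assumes numpy is available as np and that _weighted_percentile from this diff is importable), uniform weights reproduce the plain median, and an integer weight acts like repeating that sample:

import numpy as np

arr = np.array([3.0, 1.0, 2.0, 5.0, 4.0])

# Uniform weights: the weighted 50th percentile equals np.median.
assert _weighted_percentile(arr, np.ones_like(arr), percentile=50) == np.median(arr)

# Weight 3 on the value 5.0 behaves like three copies of that sample:
# the median of [1, 2, 3, 4, 5, 5, 5] is 4.
weights = np.array([1.0, 1.0, 1.0, 3.0, 1.0])
assert _weighted_percentile(arr, weights, percentile=50) == 4.0
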
@@ -286,8 +298,9 @@ def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
                                residual, pred, sample_weight):
        """LAD updates terminal regions to median estimates."""
        terminal_region = np.where(terminal_regions == leaf)[0]
-        tree.value[leaf, 0, 0] = np.median(y.take(terminal_region, axis=0) -
-                                           pred.take(terminal_region, axis=0))
+        sample_weight = sample_weight.take(terminal_region, axis=0)
+        diff = y.take(terminal_region, axis=0) - pred.take(terminal_region, axis=0)
+        tree.value[leaf, 0, 0] = _weighted_percentile(diff, sample_weight, percentile=50)


class HuberLossFunction(RegressionLossFunction):
@@ -314,11 +327,20 @@ def __call__(self, y, pred, sample_weight=None):
        diff = y - pred
        gamma = self.gamma
        if gamma is None:
-            gamma = stats.scoreatpercentile(np.abs(diff), self.alpha * 100)
+            if sample_weight is None:
+                gamma = stats.scoreatpercentile(np.abs(diff), self.alpha * 100)
+            else:
+                gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100)
+
        gamma_mask = np.abs(diff) <= gamma
-        sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0)
-        lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0))
-        return (sq_loss + lin_loss) / y.shape[0]
+        if sample_weight is None:
+            sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0)
+            lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0))
+            return (sq_loss + lin_loss) / y.shape[0]
+        sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask] ** 2.0)
+        lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
+                          (np.abs(diff[~gamma_mask]) - gamma / 2.0))
+        return (sq_loss + lin_loss) / sample_weight.sum()

    def negative_gradient(self, y, pred, **kargs):
        pred = pred.ravel()
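
To make the weighted branch above concrete, here is a standalone numpy sketch (huber_loss is a hypothetical helper, not the class method, and gamma is fixed here rather than estimated from a percentile of the residuals); it illustrates that an integer sample weight behaves like repeating the corresponding residual:

import numpy as np

def huber_loss(diff, gamma, sample_weight):
    # Mirrors the weighted expression above: squared loss inside the gamma
    # band, linear loss outside, normalised by the total weight.
    gamma_mask = np.abs(diff) <= gamma
    sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask] ** 2.0)
    lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
                      (np.abs(diff[~gamma_mask]) - gamma / 2.0))
    return (sq_loss + lin_loss) / sample_weight.sum()

diff = np.array([0.5, -0.2, 3.0])
weights = np.array([1.0, 2.0, 1.0])

# Weight 2 on the second residual matches duplicating that residual.
repeated = np.array([0.5, -0.2, -0.2, 3.0])
assert np.isclose(huber_loss(diff, 1.0, weights),
                  huber_loss(repeated, 1.0, np.ones_like(repeated)))
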
@@ -334,13 +356,16 @@ def negative_gradient(self, y, pred, **kargs):
    def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
                                residual, pred, sample_weight):
        terminal_region = np.where(terminal_regions == leaf)[0]
+        sample_weight = sample_weight.take(terminal_region, axis=0)
        gamma = self.gamma
        diff = (y.take(terminal_region, axis=0)
                - pred.take(terminal_region, axis=0))
-        median = np.median(diff)
+        median = _weighted_percentile(diff, sample_weight, percentile=50)
        diff_minus_median = diff - median
        tree.value[leaf, 0] = median + np.mean(
            np.sign(diff_minus_median) *
+            sample_weight *
            np.minimum(np.abs(diff_minus_median), gamma))

@@ -366,8 +391,13 @@ def __call__(self, y, pred, sample_weight=None):
        alpha = self.alpha

        mask = y > pred
-        return (alpha * diff[mask].sum() +
-                (1.0 - alpha) * diff[~mask].sum()) / y.shape[0]
+        if sample_weight is None:
+            loss = (alpha * diff[mask].sum() +
+                    (1.0 - alpha) * diff[~mask].sum()) / y.shape[0]
+        else:
+            loss = (alpha * np.sum(sample_weight[mask] * diff[mask]) +
+                    (1.0 - alpha) * np.sum(sample_weight[~mask] *
+                                           diff[~mask])) / sample_weight.sum()
+        return loss

    def negative_gradient(self, y, pred, **kargs):
        alpha = self.alpha
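
The same repetition semantics hold for the weighted quantile loss above. A standalone numpy sketch (quantile_loss is a hypothetical helper mirroring the weighted expression in this hunk):

import numpy as np

def quantile_loss(y, pred, alpha, sample_weight):
    # Mirrors the weighted expression above, normalised by the total weight.
    diff = y - pred
    mask = y > pred
    return (alpha * np.sum(sample_weight[mask] * diff[mask]) +
            (1.0 - alpha) * np.sum(sample_weight[~mask] * diff[~mask])) / sample_weight.sum()

y = np.array([1.0, 2.0, 4.0])
pred = np.array([2.0, 2.0, 1.0])
weights = np.array([2.0, 1.0, 1.0])

# Weight 2 on the first sample matches duplicating that (y, pred) pair.
y_rep = np.array([1.0, 1.0, 2.0, 4.0])
pred_rep = np.array([2.0, 2.0, 2.0, 1.0])
assert np.isclose(quantile_loss(y, pred, 0.9, weights),
                  quantile_loss(y_rep, pred_rep, 0.9, np.ones_like(y_rep)))
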
@@ -380,7 +410,9 @@ def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
        terminal_region = np.where(terminal_regions == leaf)[0]
        diff = (y.take(terminal_region, axis=0)
                - pred.take(terminal_region, axis=0))
-        val = stats.scoreatpercentile(diff, self.percentile)
+        sample_weight = sample_weight.take(terminal_region, axis=0)
+
+        val = _weighted_percentile(diff, sample_weight, self.percentile)
        tree.value[leaf, 0] = val

@@ -903,9 +935,6 @@ def fit(self, X, y, sample_weight=None, monitor=None):
            sample_weight = np.ones(n_samples, dtype=np.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)
-            if self.loss in ('lad', 'huber', 'quantile'):
-                raise NotImplementedError('sample_weight not supported for loss=%r' %
-                                          self.loss)

        if y.shape[0] != n_samples:
            raise ValueError('Shape mismatch of X and y: %d != %d' %