sample weight support for robust regression via weighted percentile algo · pprett/scikit-learn@33052ba · GitHub
[go: up one dir, main page]

Skip to content

Commit 33052ba

Browse files
committed
sample weight support for robust regression via weighted percentile algo
1 parent 156fc88 commit 33052ba

File tree

2 files changed

+44
-24
lines changed

2 files changed

+44
-24
lines changed

sklearn/ensemble/gradient_boosting.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@
5050
from ._gradient_boosting import _random_sample_mask
5151

5252

53+
def _weighted_percentile(arr, sample_weight, percentile=50):
54+
"""Compute the weighted ``percentile`` of ``arr`` with ``sample_weight``. """
55+
sorted_idx = np.argsort(arr)
56+
57+
# Find index of median prediction for each sample
58+
weight_cdf = sample_weight[sorted_idx].cumsum()
59+
percentile_or_above = weight_cdf >= (percentile / 100.0) * weight_cdf[-1]
60+
percentile_idx = percentile_or_above.argmax()
61+
62+
return arr[sorted_idx[percentile_idx]]
63+
64+
5365
class QuantileEstimator(BaseEstimator):
5466
"""An estimator predicting the alpha-quantile of the training targets."""
5567
def __init__(self, alpha=0.9):
@@ -286,8 +298,9 @@ def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
286298
residual, pred, sample_weight):
287299
"""LAD updates terminal regions to median estimates. """
288300
terminal_region = np.where(terminal_regions == leaf)[0]
289-
tree.value[leaf, 0, 0] = np.median(y.take(terminal_region, axis=0) -
290-
pred.take(terminal_region, axis=0))
301+
sample_weight = sample_weight.take(terminal_region, axis=0)
302+
diff = y.take(terminal_region, axis=0) - pred.take(terminal_region, axis=0)
303+
tree.value[leaf, 0, 0] = _weighted_percentile(diff, sample_weight, percentile=50)
291304

292305

293306
class HuberLossFunction(RegressionLossFunction):
@@ -314,11 +327,20 @@ def __call__(self, y, pred, sample_weight=None):
314327
diff = y - pred
315328
gamma = self.gamma
316329
if gamma is None:
317-
gamma = stats.scoreatpercentile(np.abs(diff), self.alpha * 100)
330+
if sample_weight is None:
331+
gamma = stats.scoreatpercentile(np.abs(diff), self.alpha * 100)
332+
else:
333+
gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100)
334+
318335
gamma_mask = np.abs(diff) <= gamma
319-
sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0)
320-
lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0))
321-
return (sq_loss + lin_loss) / y.shape[0]
336+
if sample_weight is None:
337+
sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0)
338+
lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0))
339+
else:
340+
sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask] ** 2.0)
341+
lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
342+
(np.abs(diff[~gamma_mask]) - gamma / 2.0))
343+
return (sq_loss + lin_loss) / sample_weight.sum()
322344

323345
def negative_gradient(self, y, pred, **kargs):
324346
pred = pred.ravel()
@@ -334,13 +356,16 @@ def negative_gradient(self, y, pred, **kargs):
334356
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
335357
residual, pred, sample_weight):
336358
terminal_region = np.where(terminal_regions == leaf)[0]
359+
sample_weight = sample_weight.take(terminal_region, axis=0)
337360
gamma = self.gamma
338361
diff = (y.take(terminal_region, axis=0)
339362
- pred.take(terminal_region, axis=0))
340-
median = np.median(diff)
363+
median = _weighted_percentile(diff, sample_weight, percentile=50)
364+
#median = np.median(diff)
341365
diff_minus_median = diff - median
342366
tree.value[leaf, 0] = median + np.mean(
343367
np.sign(diff_minus_median) *
368+
sample_weight *
344369
np.minimum(np.abs(diff_minus_median), gamma))
345370

346371

@@ -366,8 +391,13 @@ def __call__(self, y, pred, sample_weight=None):
366391
alpha = self.alpha
367392

368393
mask = y > pred
369-
return (alpha * diff[mask].sum() +
370-
(1.0 - alpha) * diff[~mask].sum()) / y.shape[0]
394+
if sample_weight is None:
395+
loss = (alpha * diff[mask].sum() +
396+
(1.0 - alpha) * diff[~mask].sum()) / y.shape[0]
397+
else:
398+
loss = (alpha * np.sum(sample_weight[mask] * diff[mask]) +
399+
(1.0 - alpha) * np.sum(sample_weight[~mask] * diff[~mask])) / sample_weight.sum()
400+
return loss
371401

372402
def negative_gradient(self, y, pred, **kargs):
373403
alpha = self.alpha
@@ -380,7 +410,9 @@ def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
380410
terminal_region = np.where(terminal_regions == leaf)[0]
381411
diff = (y.take(terminal_region, axis=0)
382412
- pred.take(terminal_region, axis=0))
383-
val = stats.scoreatpercentile(diff, self.percentile)
413+
sample_weight = sample_weight.take(terminal_region, axis=0)
414+
415+
val = _weighted_percentile(diff, sample_weight, self.percentile)
384416
tree.value[leaf, 0] = val
385417

386418

@@ -903,9 +935,6 @@ def fit(self, X, y, sample_weight=None, monitor=None):
903935
sample_weight = np.ones(n_samples, dtype=np.float32)
904936
else:
905937
sample_weight = column_or_1d(sample_weight, warn=True)
906-
if self.loss in ('lad', 'huber', 'quantile'):
907-
raise NotImplementedError('sample_weight not supported for loss=%r' %
908-
self.loss)
909938

910939
if y.shape[0] != n_samples:
911940
raise ValueError('Shape mismatch of X and y: %d != %d' %

sklearn/ensemble/tests/test_gradient_boosting.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -955,17 +955,8 @@ def test_probability_exponential():
955955

956956
clf.fit(X, y)
957957
assert_array_equal(clf.predict(T), true_result)
958-
assert_raises(AttributeError, clf.predict_proba, T)
959-
assert_raises(AttributeError, lambda : next(clf.staged_predict_proba(T)))
960-
961-
962-
def test_sample_weight_robust():
963-
"""Test that robost regression loss raise ValueError. """
964-
sample_weight = np.ones(len(boston.target))
965-
for loss in ('lad', 'huber', 'quantile'):
966-
est = GradientBoostingRegressor(n_estimators=1, loss=loss)
967-
assert_raises(ValueError, est.fit, boston.data, boston.target,
968-
sample_weight=sample_weight)
958+
assert_raises(TypeError, clf.predict_proba, T)
959+
assert_raises(TypeError, lambda : next(clf.staged_predict_proba(T)))
969960

970961

971962
if __name__ == "__main__":

0 commit comments

Comments
 (0)
0