[MRG] implement least absolute deviation loss in GBDTs #13896
Changes from all commits
@@ -18,13 +18,24 @@
from .common import Y_DTYPE
from .common import G_H_DTYPE
from ._loss import _update_gradients_least_squares
from ._loss import _update_gradients_least_absolute_deviation
from ._loss import _update_gradients_hessians_binary_crossentropy
from ._loss import _update_gradients_hessians_categorical_crossentropy


class BaseLoss(ABC):
    """Base class for a loss."""

    # This variable indicates whether the loss requires the leaves values to
    # be updated once the tree has been trained. The trees are trained to
    # predict a Newton-Raphson step (see grower._finalize_leaf()). But for
    # some losses (e.g. least absolute deviation) we need to adjust the tree
    # values to account for the "line search" of the gradient descent
    # procedure. See the original paper Greedy Function Approximation: A
    # Gradient Boosting Machine by Friedman
    # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
    need_update_leaves_values = False

    def init_gradients_and_hessians(self, n_samples, prediction_dim):
        """Return initial gradients and hessians.
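The flag above couples a loss with the tree grower. As a purely hypothetical sketch (the training loop and grower live in other modules and are not part of this diff; the function name and grower API below are assumptions, not code from this PR), the flag could be consumed roughly like this:

# Hypothetical illustration only; `fit_one_tree` and the exact grower API
# are assumed for the sake of the example.
def fit_one_tree(loss, grower, y_train, raw_predictions):
    grower.grow()  # leaves receive Newton-Raphson values, see _finalize_leaf()
    if loss.need_update_leaves_values:
        # "Line search" correction: e.g. LAD replaces each leaf value with
        # the (shrunk) median of the residuals of the samples in that leaf.
        loss.update_leaves_values(grower, y_train, raw_predictions)
    return grower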
@@ -53,9 +64,10 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim):
         shape = (prediction_dim, n_samples)
         gradients = np.empty(shape=shape, dtype=G_H_DTYPE)
         if self.hessians_are_constant:
-            # if the hessians are constant, we consider they are equal to 1.
-            # this is correct as long as we adjust the gradients. See e.g. LS
-            # loss
+            # If the hessians are constant, we consider they are equal to 1.
+            # - This is correct for the half LS loss
+            # - For LAD loss, hessians are actually 0, but they are always
+            #   ignored anyway.
             hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)
         else:
             hessians = np.empty(shape=shape, dtype=G_H_DTYPE)

Review discussion attached to the new comment:

Review comment: if that's the case, doesn't it make sense to actually set them to …

Review comment: or even have a parameter in the loss class which is the function which gives you the constant hessians, i.e. for LAD it'd be …

Reply: IMO it's more maintainable to have the convention that constant hessians are always 1 rather than have custom constant hessians for each loss. Especially when these hessians are never used, like here.

Reply: Also, that's something that's always implicit and not explained in the papers, but hessians need to be treated as 1 (even though they're 0): in order for the leaf value computation to be an average instead of a sum.
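The last reply can be illustrated with numbers. A minimal sketch, assuming the leaf value is the (unshrunk, unregularized) Newton step -sum(gradients) / (sum(hessians) + l2) described in the grower._finalize_leaf() comment above; the exact grower formula may also include shrinkage and an epsilon term.

import numpy as np

# Gradients of the samples that end up in one leaf (any loss with
# hessians_are_constant=True).
gradients = np.array([-1.0, 1.0, -1.0, -1.0])
l2 = 0.0  # ignore regularization for the illustration

# Convention discussed above: constant hessians are stored as 1.
hessians = np.ones_like(gradients)
leaf_value = -gradients.sum() / (hessians.sum() + l2)
print(leaf_value)   # 0.5 -> an *average* of the negative gradients

# If the true LAD hessians (0) were used instead, the denominator would
# collapse and the leaf value would degenerate into a (scaled) *sum*:
bad_hessians = np.zeros_like(gradients)
eps = 1e-3
print(-gradients.sum() / (bad_hessians.sum() + eps))   # 2000.0 -> a sum

With unit hessians the Newton step is a mean of the gradients; for LAD this value is still only provisional, since update_leaves_values() (added below) overwrites it with the median of the residuals.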
@@ -141,6 +153,63 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true,
        _update_gradients_least_squares(gradients, y_true, raw_predictions)


class LeastAbsoluteDeviation(BaseLoss):
    """Least absolute deviation, for regression.

    For a given sample x_i, the loss is defined as::

        loss(x_i) = |y_true_i - raw_pred_i|
    """

    hessians_are_constant = True
    # This variable indicates whether the loss requires the leaves values to
    # be updated once the tree has been trained. The trees are trained to
    # predict a Newton-Raphson step (see grower._finalize_leaf()). But for
    # some losses (e.g. least absolute deviation) we need to adjust the tree
    # values to account for the "line search" of the gradient descent
    # procedure. See the original paper Greedy Function Approximation: A
    # Gradient Boosting Machine by Friedman
    # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
    need_update_leaves_values = True

    def __call__(self, y_true, raw_predictions, average=True):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
        # return a view.
        raw_predictions = raw_predictions.reshape(-1)
        loss = np.abs(y_true - raw_predictions)
        return loss.mean() if average else loss

    def get_baseline_prediction(self, y_train, prediction_dim):
        return np.median(y_train)

    @staticmethod
    def inverse_link_function(raw_predictions):
        return raw_predictions

    def update_gradients_and_hessians(self, gradients, hessians, y_true,
                                      raw_predictions):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to
        # return a view.
        raw_predictions = raw_predictions.reshape(-1)
        gradients = gradients.reshape(-1)
        _update_gradients_least_absolute_deviation(gradients, y_true,
                                                   raw_predictions)
    def update_leaves_values(self, grower, y_true, raw_predictions):
        # Update the values predicted by the tree with
        # median(y_true - raw_predictions).
        # See note about need_update_leaves_values in BaseLoss.

        # TODO: ideally this should be computed in parallel over the leaves
        # using something similar to _update_raw_predictions(), but this
        # requires a cython version of median()
        for leaf in grower.finalized_leaves:
            indices = leaf.sample_indices
            median_res = np.median(y_true[indices] - raw_predictions[indices])
            leaf.value = grower.shrinkage * median_res
            # Note that the regularization is ignored here


class BinaryCrossEntropy(BaseLoss):
    """Binary cross-entropy loss, for binary classification.
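The Cython helper _update_gradients_least_absolute_deviation is not shown in this diff. As a sketch, assuming it stores the plain subgradient of |y_true - raw_pred| with respect to the raw prediction, the toy example below contrasts the Newton leaf value (a mean of signs) with the median correction applied by update_leaves_values():

import numpy as np

# Assumed NumPy equivalent of the Cython helper (not shown in this diff):
# d/d(raw_pred) |y_true - raw_pred| = -sign(y_true - raw_pred).
def lad_gradients(y_true, raw_predictions):
    return -np.sign(y_true - raw_predictions)

y_true = np.array([1.0, 2.0, 10.0, 3.0])      # samples falling in one leaf
raw_predictions = np.zeros_like(y_true)       # e.g. early in boosting

gradients = lad_gradients(y_true, raw_predictions)
print(gradients)                              # [-1. -1. -1. -1.]

# With unit hessians, the Newton step is the mean of the negative gradients:
# a value in [-1, 1] that ignores the scale of the residuals.
newton_value = -gradients.mean()
print(newton_value)                           # 1.0

# update_leaves_values() instead uses the median of the residuals (the
# minimizer of the absolute loss), scaled by the learning rate (shrinkage).
shrinkage = 0.1
leaf_value = shrinkage * np.median(y_true - raw_predictions)
print(leaf_value)                             # 0.1 * 2.5 = 0.25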
@@ -242,6 +311,7 @@ def predict_proba(self, raw_predictions):

_LOSSES = {
    'least_squares': LeastSquares,
    'least_absolute_deviation': LeastAbsoluteDeviation,
    'binary_crossentropy': BinaryCrossEntropy,
    'categorical_crossentropy': CategoricalCrossEntropy
}
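To show where the new dictionary entry surfaces, here is a minimal usage sketch, assuming the 'least_absolute_deviation' string is also accepted by the loss parameter of the public HistGradientBoostingRegressor (which was still behind the experimental import at the time):

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 5))
# Heavy-tailed noise: an absolute-deviation objective is less sensitive to
# the outliers than least squares.
y = X[:, 0] + 0.1 * rng.standard_t(df=1, size=1000)

est = HistGradientBoostingRegressor(loss='least_absolute_deviation')
est.fit(X, y)
print(est.predict(X[:5]))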
A remaining review thread on the docstring wording:

Review comment: do we remove right away the "optional" and the parenthesis?

Reply: I'd rather keep the current docstring consistent with the other entries but I'm +1 in updating all of them in another PR.

Reply: OK fine with me