ENH Support sample weights in HGBT #14696
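This PR threads sample weights through the loss kernels of the histogram-based gradient boosting estimators. The reasoning behind the kernels below (summarized here, not quoted from the PR discussion): minimizing a weighted loss only multiplies each per-sample gradient and hessian by the weight w_i, so an integer weight behaves like repeating a sample. In LaTeX notation:

    L_w(f) = \sum_i w_i \, \ell(y_i, f(x_i)), \qquad
    g_i = w_i \, \frac{\partial \ell(y_i, z)}{\partial z} \bigg|_{z = f(x_i)}, \qquad
    h_i = w_i \, \frac{\partial^2 \ell(y_i, z)}{\partial z^2} \bigg|_{z = f(x_i)}

Each tree's leaf values are computed from sums of these g_i and h_i over the samples in the leaf, so the weights propagate through the whole fit.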
@@ -27,9 +27,51 @@ def _update_gradients_least_squares(

    n_samples = raw_predictions.shape[0]
    for i in prange(n_samples, schedule='static', nogil=True):
        # Note: a more correct expression is 2 * (raw_predictions - y_true)
        # but since we use 1 for the constant hessian value (and not 2) this
        # is strictly equivalent for the leaves values.
        gradients[i] = raw_predictions[i] - y_true[i]


+def _update_gradients_hessians_least_squares(
+        G_H_DTYPE_C [::1] gradients,  # OUT
+        G_H_DTYPE_C [::1] hessians,  # OUT
+        const Y_DTYPE_C [::1] y_true,  # IN
+        const Y_DTYPE_C [::1] raw_predictions,  # IN
+        const Y_DTYPE_C [::1] sample_weight):  # IN
+
+    cdef:
+        int n_samples
+        int i
+
+    n_samples = raw_predictions.shape[0]
+    for i in prange(n_samples, schedule='static', nogil=True):
+        # Note: a more correct expression is
+        # 2 * (raw_predictions - y_true) * sample_weight
+        # but since we use 1 for the constant hessian value (and not 2)
+        # this is strictly equivalent for the leaves values.
+        gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i]
+        hessians[i] = sample_weight[i]
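A leaf value in gradient boosting is (up to regularization) -sum(gradients) / sum(hessians) over the samples in the leaf, which is why dropping the factor 2 from both the gradient and the hessian is harmless, and why multiplying both by the sample weight is equivalent to repeating a sample. A minimal NumPy sketch of that equivalence (illustrative only, not the scikit-learn implementation; leaf_value is a hypothetical helper):

import numpy as np

def leaf_value(gradients, hessians):
    # Newton-style leaf value used by gradient boosted trees
    # (regularization omitted for simplicity).
    return -gradients.sum() / hessians.sum()

rng = np.random.default_rng(0)
y_true = rng.normal(size=5)
raw_predictions = rng.normal(size=5)
sample_weight = np.array([1.0, 2.0, 1.0, 3.0, 1.0])

# Weighted least-squares gradients/hessians, mirroring the kernel above:
# g_i = (raw_i - y_i) * w_i and h_i = w_i (the factor 2 is dropped on both).
g_w = (raw_predictions - y_true) * sample_weight
h_w = sample_weight.copy()

# The same samples, with the integer weights emulated by repetition.
repeats = sample_weight.astype(int)
g_rep = np.repeat(raw_predictions - y_true, repeats)
h_rep = np.ones_like(g_rep)

# Both formulations give the same leaf value, and rescaling both g and h
# by the same constant (e.g. the dropped factor 2) does not change it.
assert np.isclose(leaf_value(g_w, h_w), leaf_value(g_rep, h_rep))
assert np.isclose(leaf_value(2 * g_w, 2 * h_w), leaf_value(g_w, h_w))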
+def _update_gradients_hessians_least_absolute_deviation(
+        G_H_DTYPE_C [::1] gradients,  # OUT
+        G_H_DTYPE_C [::1] hessians,  # OUT
+        const Y_DTYPE_C [::1] y_true,  # IN
+        const Y_DTYPE_C [::1] raw_predictions,  # IN
+        const Y_DTYPE_C [::1] sample_weight):  # IN
+
+    cdef:
+        int n_samples
+        int i
+
+    n_samples = raw_predictions.shape[0]
+    for i in prange(n_samples, schedule='static', nogil=True):
+        # gradient = sign(raw_prediction - y_true) * sample_weight
+        gradients[i] = sample_weight[i] * (2 *
+                       (y_true[i] - raw_predictions[i] < 0) - 1)
+        hessians[i] = sample_weight[i]
Review discussion on this change:

- Does this work because …
- No, this works because: …
- I was thinking about the math. lightgbm does the same thing with the l1 loss. When I see the derivative of …
- Ah, I think I see what you did here: #13896 (comment)
- Yeah, also, if you look at the sklearn wrapper in lightgbm, all hessians and gradients are simply multiplied by SW.
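The exchange above concerns two points: the branchless sign computation and the fact that, as in lightgbm's sklearn wrapper, weighting just scales both gradient and hessian. A small NumPy check of the sign trick under those assumptions (not the Cython kernel; at an exact tie raw == y_true the expression yields -1 where np.sign would yield 0):

import numpy as np

y_true = np.array([0.0, 1.0, 2.0, 3.0])
raw_predictions = np.array([0.5, 0.5, 2.5, 2.5])
sample_weight = np.array([1.0, 2.0, 0.5, 3.0])

# Branchless sign, as in the kernel: +1 when raw > y_true, -1 otherwise.
sign_trick = 2 * (y_true - raw_predictions < 0) - 1
assert np.array_equal(sign_trick, np.sign(raw_predictions - y_true))  # no ties here

# Weighted L1 update: gradient and hessian are both scaled by the weight.
gradients = sample_weight * sign_trick
hessians = sample_weight.copy()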


def _update_gradients_least_absolute_deviation(
        G_H_DTYPE_C [::1] gradients,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN

@@ -49,44 +91,66 @@ def _update_gradients_hessians_binary_crossentropy(
        G_H_DTYPE_C [::1] gradients,  # OUT
        G_H_DTYPE_C [::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
-        const Y_DTYPE_C [::1] raw_predictions):  # IN
+        const Y_DTYPE_C [::1] raw_predictions,  # IN
+        const Y_DTYPE_C [::1] sample_weight):  # IN

    cdef:
        int n_samples
        Y_DTYPE_C p_i  # proba that ith sample belongs to positive class
        int i

    n_samples = raw_predictions.shape[0]
-    for i in prange(n_samples, schedule='static', nogil=True):
-        p_i = _cexpit(raw_predictions[i])
-        gradients[i] = p_i - y_true[i]
-        hessians[i] = p_i * (1. - p_i)
+    if sample_weight is None:
+        for i in prange(n_samples, schedule='static', nogil=True):
+            p_i = _cexpit(raw_predictions[i])
+            gradients[i] = p_i - y_true[i]
+            hessians[i] = p_i * (1. - p_i)
+    else:
+        for i in prange(n_samples, schedule='static', nogil=True):
+            p_i = _cexpit(raw_predictions[i])
+            gradients[i] = (p_i - y_true[i]) * sample_weight[i]
+            hessians[i] = p_i * (1. - p_i) * sample_weight[i]

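For the binary cross-entropy, p = sigmoid(raw_prediction), the gradient of the log loss with respect to the raw prediction is p - y, and the hessian is p * (1 - p); with sample weights both are simply multiplied by w_i, which is what the two branches above do. A rough NumPy counterpart with a finite-difference check of the weighted gradient (illustrative only; expit stands in for _cexpit):

import numpy as np

def expit(x):
    # Logistic sigmoid, a pure-Python stand-in for _cexpit.
    return 1.0 / (1.0 + np.exp(-x))

y_true = np.array([0.0, 1.0, 1.0, 0.0])
raw_predictions = np.array([-0.3, 0.2, 1.5, 0.7])
sample_weight = np.array([1.0, 2.0, 0.5, 1.5])

p = expit(raw_predictions)
gradients = (p - y_true) * sample_weight
hessians = p * (1.0 - p) * sample_weight

# Finite-difference check against the weighted log loss.
def weighted_log_loss(raw):
    q = expit(raw)
    return np.sum(sample_weight * -(y_true * np.log(q) + (1 - y_true) * np.log(1 - q)))

eps = 1e-6
numeric = np.array([
    (weighted_log_loss(raw_predictions + eps * e)
     - weighted_log_loss(raw_predictions - eps * e)) / (2 * eps)
    for e in np.eye(len(raw_predictions))
])
assert np.allclose(numeric, gradients, atol=1e-5)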


def _update_gradients_hessians_categorical_crossentropy(
        G_H_DTYPE_C [:, ::1] gradients,  # OUT
        G_H_DTYPE_C [:, ::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
-        const Y_DTYPE_C [:, ::1] raw_predictions):  # IN
+        const Y_DTYPE_C [:, ::1] raw_predictions,  # IN
+        const Y_DTYPE_C [::1] sample_weight):  # IN
    cdef:
        int prediction_dim = raw_predictions.shape[0]
        int n_samples = raw_predictions.shape[1]
        int k  # class index
        int i  # sample index
+        Y_DTYPE_C sw
        # p[i, k] is the probability that class(ith sample) == k.
        # It's the softmax of the raw predictions
        Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
        Y_DTYPE_C p_i_k

-    for i in prange(n_samples, schedule='static', nogil=True):
-        # first compute softmaxes of sample i for each class
-        for k in range(prediction_dim):
-            p[i, k] = raw_predictions[k, i]  # prepare softmax
-        _compute_softmax(p, i)
-        # then update gradients and hessians
-        for k in range(prediction_dim):
-            p_i_k = p[i, k]
-            gradients[k, i] = p_i_k - (y_true[i] == k)
-            hessians[k, i] = p_i_k * (1. - p_i_k)
+    if sample_weight is None:
+        for i in prange(n_samples, schedule='static', nogil=True):
+            # first compute softmaxes of sample i for each class
+            for k in range(prediction_dim):
+                p[i, k] = raw_predictions[k, i]  # prepare softmax
+            _compute_softmax(p, i)
+            # then update gradients and hessians
+            for k in range(prediction_dim):
+                p_i_k = p[i, k]
+                gradients[k, i] = p_i_k - (y_true[i] == k)
+                hessians[k, i] = p_i_k * (1. - p_i_k)
+    else:
+        for i in prange(n_samples, schedule='static', nogil=True):
+            # first compute softmaxes of sample i for each class
+            for k in range(prediction_dim):
+                p[i, k] = raw_predictions[k, i]  # prepare softmax
+            _compute_softmax(p, i)
+            # then update gradients and hessians
+            sw = sample_weight[i]
+            for k in range(prediction_dim):
+                p_i_k = p[i, k]
+                gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw
+                hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw

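The categorical cross-entropy branch does the same per class: with p = softmax over the raw predictions of sample i, the gradient for class k is p_k - 1{y_i == k} and the hessian is p_k * (1 - p_k), each scaled by the sample weight. A compact NumPy sketch of that update (arrays laid out as (n_samples, n_classes) for readability, unlike the (prediction_dim, n_samples) layout in the kernel):

import numpy as np

def softmax(raw):
    # Row-wise softmax, shifted by the row max for numerical stability
    # (the usual trick; _compute_softmax plays this role in the kernel).
    z = raw - raw.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

rng = np.random.default_rng(0)
n_samples, n_classes = 6, 3
raw_predictions = rng.normal(size=(n_samples, n_classes))
y_true = rng.integers(0, n_classes, size=n_samples)
sample_weight = rng.uniform(0.5, 2.0, size=n_samples)

p = softmax(raw_predictions)
one_hot = np.eye(n_classes)[y_true]

# Same quantities as gradients[k, i] and hessians[k, i] above,
# with the sample weight broadcast across classes.
gradients = (p - one_hot) * sample_weight[:, None]
hessians = p * (1.0 - p) * sample_weight[:, None]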
cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
Review comment: Would the probability show more comfort?