 class LinearLoss:
-    """General class for loss functions with raw_prediction = X @ coef.
+    """General class for loss functions with raw_prediction = X @ coef + intercept.
 
     The loss is the sum of per sample losses and includes an L2 term::
 
@@ -28,19 +28,30 @@ class LinearLoss:
             n_dof = n_features
 
     if loss.is_multiclass:
-        coef.shape = (n_classes * n_dof,)
+        coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
     else:
         coef.shape = (n_dof,)
 
     The intercept term is at the end of the coef array:
     if loss.is_multiclass:
-        intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
+        if coef.shape == (n_classes, n_dof):
+            intercept = coef[:, -1]
+        if coef.shape == (n_classes * n_dof,):
+            intercept = coef[n_features * n_classes:] = coef[(n_dof - 1) * n_classes:]
         intercept.shape = (n_classes,)
     else:
         intercept = coef[-1]
 
-    Note: If the average loss per sample is wanted instead of the sum of the
-    loss per sample, one can simply use a rescaled sample_weight such that
+    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as
+
+        coef.reshape((n_classes, -1), order="F")
+
+    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
+    coefficients without intercept, coef[:, :-1], contiguous and speeds up
+    matrix-vector computations.
+
+    Note: If the average loss per sample is wanted instead of the sum of the loss per
+    sample, one can simply use a rescaled sample_weight such that
     sum(sample_weight) = 1.
 
     Parameters
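To make the new F-ordered layout concrete, here is a minimal NumPy sketch of the round trip between the two accepted coef shapes (illustrative sizes only: n_classes=3, n_features=4, with an intercept, hence n_dof=5):

    import numpy as np

    n_classes, n_features = 3, 4
    n_dof = n_features + 1  # one extra degree of freedom for the intercept
    coef_2d = np.arange(n_classes * n_dof, dtype=float).reshape(n_classes, n_dof)

    # Ravelled layout: the classes of one feature are contiguous.
    coef_1d = coef_2d.ravel(order="F")

    # Round trip back to the 2d view, as described in the docstring.
    assert np.array_equal(coef_1d.reshape((n_classes, -1), order="F"), coef_2d)

    # The intercepts are the trailing n_classes entries of the ravelled array.
    assert np.array_equal(coef_1d[n_features * n_classes:], coef_2d[:, -1])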
@@ -58,8 +69,11 @@ def _w_intercept_raw(self, coef, X):
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data.
 
@@ -82,7 +96,10 @@ def _w_intercept_raw(self, coef, X):
             raw_prediction = X @ w + intercept
         else:
             # reshape to (n_classes, n_dof)
-            w = coef.reshape(self._loss.n_classes, -1)
+            if coef.ndim == 1:
+                w = coef.reshape((self._loss.n_classes, -1), order="F")
+            else:
+                w = coef
             if self.fit_intercept:
                 intercept = w[:, -1]
                 w = w[:, :-1]
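A quick check of the contiguity claim behind this branch: for a 1d coef, the F-ordered reshape is a view rather than a copy, and slicing off the intercept column keeps the weights F-contiguous (shapes are illustrative):

    import numpy as np

    n_classes, n_dof = 3, 5
    coef = np.arange(n_classes * n_dof, dtype=float)  # ravelled coef

    w = coef.reshape((n_classes, -1), order="F")
    assert w.base is coef                    # a view, no copy
    assert w.flags["F_CONTIGUOUS"]           # columns are contiguous
    assert w[:, :-1].flags["F_CONTIGUOUS"]   # still true after dropping the intercept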
@@ -97,8 +114,11 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -137,8 +157,11 @@ def loss_gradient(
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -155,8 +178,8 @@ def loss_gradient(
         loss : float
             Sum of losses per sample plus penalty.
 
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
         """
         n_features, n_classes = X.shape[1], self._loss.n_classes
         n_dof = n_features + self.fit_intercept
@@ -179,12 +202,15 @@ def loss_gradient(
             return loss, grad
         else:
             loss += 0.5 * l2_reg_strength * squared_norm(w)
-            grad = np.empty((n_classes, n_dof), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             # gradient.shape = (n_samples, n_classes)
             grad[:, :n_features] = gradient_per_sample.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient_per_sample.sum(axis=0)
-            return loss, grad.ravel()
+            if coef.ndim == 1:
+                return loss, grad.ravel(order="F")
+            else:
+                return loss, grad
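One payoff of allocating grad in F order (a NumPy property, sketched here for illustration): ravelling an F-contiguous array with order="F" returns a view, while ravelling a C-ordered one forces a copy:

    import numpy as np

    grad_f = np.empty((3, 5), order="F")
    grad_c = np.empty((3, 5))  # C order by default

    assert grad_f.ravel(order="F").base is grad_f  # view, no copy
    assert grad_c.ravel(order="F").base is None    # forced copy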
 
     def gradient(
         self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
@@ -193,8 +219,11 @@ def gradient(
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -208,8 +237,8 @@ def gradient(
 
         Returns
         -------
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
         """
         n_features, n_classes = X.shape[1], self._loss.n_classes
         n_dof = n_features + self.fit_intercept
@@ -229,12 +258,15 @@ def gradient(
                 grad[-1] = gradient_per_sample.sum()
             return grad
         else:
-            grad = np.empty((n_classes, n_dof), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             # gradient.shape = (n_samples, n_classes)
             grad[:, :n_features] = gradient_per_sample.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient_per_sample.sum(axis=0)
-            return grad.ravel()
+            if coef.ndim == 1:
+                return grad.ravel(order="F")
+            else:
+                return grad
 
     def gradient_hessp(
         self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
@@ -243,8 +275,11 @@ def gradient_hessp(
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -258,14 +293,15 @@ def gradient_hessp(
 
         Returns
         -------
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
 
         hessp : callable
             Function that takes in a vector input of shape of gradient and
             returns matrix-vector product with hessian.
         """
         (n_samples, n_features), n_classes = X.shape, self._loss.n_classes
+        n_dof = n_features + self.fit_intercept
         w, intercept, raw_prediction = self._w_intercept_raw(coef, X)
 
         if not self._loss.is_multiclass:
@@ -322,7 +358,7 @@ def hessp(s):
                 sample_weight=sample_weight,
                 n_threads=n_threads,
             )
-            grad = np.empty_like(coef.reshape(n_classes, -1), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             grad[:, :n_features] = gradient.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient.sum(axis=0)
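A plausible reason for swapping np.empty_like for an explicit np.empty here, besides avoiding the throwaway reshape of coef: empty_like mimics the memory layout of its prototype (order="K" by default), so it would produce a C-ordered buffer from a C-ordered prototype. A small illustration:

    import numpy as np

    proto = np.arange(15.0).reshape(3, 5)  # C-ordered prototype
    assert np.empty_like(proto).flags["C_CONTIGUOUS"]
    assert np.empty((3, 5), order="F").flags["F_CONTIGUOUS"]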
@@ -348,7 +384,7 @@ def hessp(s):
             #
             # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411  # noqa
             def hessp(s):
-                s = s.reshape(n_classes, -1)  # shape = (n_classes, n_dof)
+                s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
                 if self.fit_intercept:
                     s_intercept = s[:, -1]
                     s = s[:, :-1]
@@ -359,10 +395,16 @@ def hessp(s):
                 tmp *= proba  # * p_i_k
                 if sample_weight is not None:
                     tmp *= sample_weight[:, np.newaxis]
-                hess_prod = np.empty_like(grad)
+                hess_prod = np.empty_like(grad, order="F")
                 hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
                 if self.fit_intercept:
                     hess_prod[:, -1] = tmp.sum(axis=0)
-                return hess_prod.ravel()
+                if coef.ndim == 1:
+                    return hess_prod.ravel(order="F")
+                else:
+                    return hess_prod
 
-        return grad.ravel(), hessp
+        if coef.ndim == 1:
+            return grad.ravel(order="F"), hessp
+        else:
+            return grad, hessp
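For context on the coef.ndim == 1 branches (an assumption about the callers, not part of the diff): scipy.optimize solvers such as lbfgs and newton-cg work on flat 1d parameter vectors, so a 1d coef must get a 1d gradient and 1d Hessian-vector products back, hence ravel(order="F"). A self-contained toy sketch of that calling convention on a quadratic objective:

    import numpy as np
    from scipy.optimize import minimize

    n = 15  # e.g. n_classes * n_dof for a ravelled multiclass coef
    A = np.diag(np.arange(1.0, n + 1))  # toy SPD "Hessian"
    b = np.ones(n)

    def fun_grad(x):
        # The optimizer only ever sees and returns 1d vectors.
        return 0.5 * x @ A @ x - b @ x, A @ x - b

    res = minimize(fun_grad, np.zeros(n), jac=True, method="newton-cg",
                   hessp=lambda x, p: A @ p)
    assert res.success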