ENH respect F-contiguity · scikit-learn/scikit-learn@39ffe9d · GitHub

Commit 39ffe9d

ENH respect F-contiguity
1 parent 17c674d commit 39ffe9d

File tree: 3 files changed, +79 -32 lines

  sklearn/linear_model/_linear_loss.py
  sklearn/linear_model/_logistic.py
  sklearn/linear_model/tests/test_logistic.py

sklearn/linear_model/_linear_loss.py

Lines changed: 68 additions & 26 deletions
@@ -7,7 +7,7 @@


 class LinearLoss:
-    """General class for loss functions with raw_prediction = X @ coef.
+    """General class for loss functions with raw_prediction = X @ coef + intercept.

     The loss is the sum of per sample losses and includes an L2 term::

@@ -28,19 +28,30 @@ class LinearLoss:
         n_dof = n_features

         if loss.is_multiclass:
-            coef.shape = (n_classes * n_dof,)
+            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
         else:
             coef.shape = (n_dof,)

     The intercept term is at the end of the coef array:
         if loss.is_multiclass:
-            intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
+            if coef.shape (n_classes, n_dof):
+                intercept = coef[:, -1]
+            if coef.shape (n_classes * n_dof,)
+                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
             intercept.shape = (n_classes,)
         else:
             intercept = coef[-1]

-    Note: If the average loss per sample is wanted instead of the sum of the
-    loss per sample, one can simply use a rescaled sample_weight such that
+    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as
+
+        coef.reshape((n_classes, -1), order="F")
+
+    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
+    coefficients without intercept, coef[:, :-1], contiguous and speeds up
+    matrix-vector computations.
+
+    Note: If the average loss per sample is wanted instead of the sum of the loss per
+    sample, one can simply use a rescaled sample_weight such that
     sum(sample_weight) = 1.

     Parameters
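
To illustrate the docstring change above, here is a minimal NumPy-only sketch (illustrative shapes, not scikit-learn API): an order="F" reshape of the ravelled coef makes each column coef[:, i] contiguous and keeps the coefficient block without the intercept column F-contiguous.

    import numpy as np

    n_classes, n_features, fit_intercept = 3, 4, True
    n_dof = n_features + int(fit_intercept)

    coef_1d = np.arange(n_classes * n_dof, dtype=float)      # ravelled parameters
    W = coef_1d.reshape((n_classes, -1), order="F")           # (n_classes, n_dof), a view

    print(W.flags["F_CONTIGUOUS"])          # True
    print(W[:, 0].flags["C_CONTIGUOUS"])    # True: all classes of one feature are contiguous
    print(W[:, :-1].flags["F_CONTIGUOUS"])  # True: coefficients without the intercept column
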
@@ -58,8 +69,11 @@ def _w_intercept_raw(self, coef, X):

         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data.

@@ -82,7 +96,10 @@ def _w_intercept_raw(self, coef, X):
             raw_prediction = X @ w + intercept
         else:
             # reshape to (n_classes, n_dof)
-            w = coef.reshape(self._loss.n_classes, -1)
+            if coef.ndim == 1:
+                w = coef.reshape((self._loss.n_classes, -1), order="F")
+            else:
+                w = coef
             if self.fit_intercept:
                 intercept = w[:, -1]
                 w = w[:, :-1]
@@ -97,8 +114,11 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)

         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -137,8 +157,11 @@ def loss_gradient(

         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -155,8 +178,8 @@ def loss_gradient(
         loss : float
             Sum of losses per sample plus penalty.

-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
         """
         n_features, n_classes = X.shape[1], self._loss.n_classes
         n_dof = n_features + self.fit_intercept
@@ -179,12 +202,15 @@ def loss_gradient(
             return loss, grad
         else:
             loss += 0.5 * l2_reg_strength * squared_norm(w)
-            grad = np.empty((n_classes, n_dof), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             # gradient.shape = (n_samples, n_classes)
             grad[:, :n_features] = gradient_per_sample.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient_per_sample.sum(axis=0)
-            return loss, grad.ravel()
+            if coef.ndim == 1:
+                return loss, grad.ravel(order="F")
+            else:
+                return loss, grad

     def gradient(
         self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
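
A short, hedged aside on why the gradient is now allocated with order="F" (plain NumPy, illustrative shapes): ravelling an F-contiguous array with order="F" returns a view, so handing the 1d gradient back to the optimizer avoids a copy, whereas the same ravel on a C-ordered array would copy.

    import numpy as np

    n_classes, n_dof = 3, 5
    grad_f = np.empty((n_classes, n_dof), order="F")   # as allocated above
    grad_c = np.empty((n_classes, n_dof))              # default C order, for comparison

    print(np.shares_memory(grad_f, grad_f.ravel(order="F")))  # True: view, no copy
    print(np.shares_memory(grad_c, grad_c.ravel(order="F")))  # False: forces a copy
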
@@ -193,8 +219,11 @@ def gradient(

         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -208,8 +237,8 @@ def gradient(

         Returns
         -------
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
         """
         n_features, n_classes = X.shape[1], self._loss.n_classes
         n_dof = n_features + self.fit_intercept
@@ -229,12 +258,15 @@ def gradient(
             grad[-1] = gradient_per_sample.sum()
             return grad
         else:
-            grad = np.empty((n_classes, n_dof), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             # gradient.shape = (n_samples, n_classes)
             grad[:, :n_features] = gradient_per_sample.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient_per_sample.sum(axis=0)
-            return grad.ravel()
+            if coef.ndim == 1:
+                return grad.ravel(order="F")
+            else:
+                return grad

     def gradient_hessp(
         self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
@@ -243,8 +275,11 @@ def gradient_hessp(

         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -258,14 +293,15 @@ def gradient_hessp(

         Returns
         -------
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.

         hessp : callable
             Function that takes in a vector input of shape of gradient and
             and returns matrix-vector product with hessian.
         """
         (n_samples, n_features), n_classes = X.shape, self._loss.n_classes
+        n_dof = n_features + self.fit_intercept
         w, intercept, raw_prediction = self._w_intercept_raw(coef, X)

         if not self._loss.is_multiclass:
@@ -322,7 +358,7 @@ def hessp(s):
                 sample_weight=sample_weight,
                 n_threads=n_threads,
             )
-            grad = np.empty_like(coef.reshape(n_classes, -1), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             grad[:, :n_features] = gradient.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient.sum(axis=0)
@@ -348,7 +384,7 @@ def hessp(s):
             #
             # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411 # noqa
             def hessp(s):
-                s = s.reshape(n_classes, -1)  # shape = (n_classes, n_dof)
+                s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
                 if self.fit_intercept:
                     s_intercept = s[:, -1]
                     s = s[:, :-1]
@@ -359,10 +395,16 @@ def hessp(s):
                 tmp *= proba  # * p_i_k
                 if sample_weight is not None:
                     tmp *= sample_weight[:, np.newaxis]
-                hess_prod = np.empty_like(grad)
+                hess_prod = np.empty_like(grad, order="F")
                 hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
                 if self.fit_intercept:
                     hess_prod[:, -1] = tmp.sum(axis=0)
-                return hess_prod.ravel()
+                if coef.ndim == 1:
+                    return hess_prod.ravel(order="F")
+                else:
+                    return hess_prod

-            return grad.ravel(), hessp
+            if coef.ndim == 1:
+                return grad.ravel(order="F"), hessp
+            else:
+                return grad, hessp
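
Hedged sketch (plain NumPy, names illustrative) of the 1d/2d round trip that hessp relies on: the optimizer passes a ravelled vector s, the function views it as (n_classes, n_dof) with order="F", and the F-order ravel of the result maps back to the same layout without loss or copy.

    import numpy as np

    n_classes, n_dof = 3, 5
    rng = np.random.default_rng(0)
    s_1d = rng.standard_normal(n_classes * n_dof)   # what the optimizer passes

    S = s_1d.reshape((n_classes, -1), order="F")     # 2d view used inside hessp
    back = S.ravel(order="F")                        # what hessp hands back

    print(np.shares_memory(s_1d, S))                 # True: the reshape is a view
    print(np.array_equal(s_1d, back))                # True: lossless round trip
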

sklearn/linear_model/_logistic.py

Lines changed: 9 additions & 4 deletions
@@ -389,10 +389,12 @@ def _logistic_regression_path(
             w0[:, : coef.shape[1]] = coef

     if multi_class == "multinomial":
-        # scipy.optimize.minimize and newton-cg accepts only
-        # ravelled parameters.
         if solver in ["lbfgs", "newton-cg"]:
-            w0 = w0.ravel()
+            # scipy.optimize.minimize and newton-cg accept only ravelled parameters,
+            # i.e. 1d-arrays. LinearLoss expects classes to be contiguous and
+            # reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F").
+            # As w0 is F-contiguous, ravel(order="F") also avoids a copy.
+            w0 = w0.ravel(order="F")
         loss = LinearLoss(
             loss=HalfMultinomialLoss(n_classes=classes.size),
             fit_intercept=fit_intercept,
@@ -507,7 +509,10 @@ def _logistic_regression_path(

         if multi_class == "multinomial":
             n_classes = max(2, classes.size)
-            multi_w0 = np.reshape(w0, (n_classes, -1))
+            if solver in ["lbfgs", "newton-cg"]:
+                multi_w0 = np.reshape(w0, (n_classes, -1), order="F")
+            else:
+                multi_w0 = w0
             if n_classes == 2:
                 multi_w0 = multi_w0[1][np.newaxis, :]
             coefs.append(multi_w0.copy())
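
Toy example of the pattern used in the lbfgs/newton-cg branch above (NumPy and SciPy only; the objective and shapes are made up and not the scikit-learn internals): scipy.optimize.minimize works on 1d parameter vectors, so the (n_classes, n_dof) coefficients are passed as w0.ravel(order="F") and reconstructed afterwards with np.reshape(..., order="F").

    import numpy as np
    from scipy.optimize import minimize

    n_classes, n_dof = 3, 4
    target = np.arange(n_classes * n_dof, dtype=float).reshape((n_classes, n_dof))

    def fun(w_1d):
        # view the 1d parameters as a (n_classes, n_dof) matrix, F-order as in LinearLoss
        W = w_1d.reshape((n_classes, -1), order="F")
        return 0.5 * np.sum((W - target) ** 2)

    w0 = np.zeros((n_classes, n_dof), order="F")
    res = minimize(fun, w0.ravel(order="F"), method="L-BFGS-B")
    W_opt = np.reshape(res.x, (n_classes, -1), order="F")
    print(np.allclose(W_opt, target, atol=1e-3))  # True
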

sklearn/linear_model/tests/test_logistic.py

Lines changed: 2 additions & 2 deletions
@@ -1155,7 +1155,7 @@ def test_multinomial_grad_hess():
     X = rng.randn(n_samples, n_features)
     w = rng.rand(n_classes, n_features)
     y = np.argmax(np.dot(X, w.T), axis=1).astype(X.dtype)
-    w = w.ravel()
+    w = w.ravel(order="F")
     sample_weights = np.ones(X.shape[0])
     alpha = 1.0
     multinomial = LinearLoss(
@@ -1183,7 +1183,7 @@ def test_multinomial_grad_hess():
         ]
     )
     d_grad -= d_grad.mean(axis=0)
-    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
+    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel(order="F")
     assert_array_almost_equal(hess_col, approx_hess_col)


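For context, a hedged, self-contained sketch of the idea behind test_multinomial_grad_hess (plain NumPy, with a made-up quadratic in place of the multinomial loss): a Hessian-vector product can be checked against a centered finite difference of the gradient along the same vector.

    import numpy as np

    def loss_and_grad(w, A, b):
        # simple quadratic 0.5 * w @ A @ w - b @ w with gradient A @ w - b
        return 0.5 * w @ A @ w - b @ w, A @ w - b

    rng = np.random.default_rng(0)
    n = 6
    M = rng.standard_normal((n, n))
    A = M @ M.T + n * np.eye(n)        # symmetric positive definite "Hessian"
    b = rng.standard_normal(n)
    w = rng.standard_normal(n)
    v = rng.standard_normal(n)

    eps = 1e-6
    _, g_plus = loss_and_grad(w + eps * v, A, b)
    _, g_minus = loss_and_grad(w - eps * v, A, b)
    approx_hess_v = (g_plus - g_minus) / (2 * eps)   # finite-difference H @ v

    print(np.allclose(approx_hess_v, A @ v, rtol=1e-5, atol=1e-8))  # True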