 class LinearLoss:
-    """General class for loss functions with raw_prediction = X @ coef.
+    """General class for loss functions with raw_prediction = X @ coef + intercept.
 
     The loss is the sum of per sample losses and includes an L2 term::
 
@@ -28,19 +28,30 @@ class LinearLoss:
             n_dof = n_features
 
     if loss.is_multiclass:
-        coef.shape = (n_classes * n_dof,)
+        coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
     else:
         coef.shape = (n_dof,)
 
     The intercept term is at the end of the coef array:
     if loss.is_multiclass:
-        intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
+        if coef.shape == (n_classes, n_dof):
+            intercept = coef[:, -1]
+        if coef.shape == (n_classes * n_dof,):
+            intercept = coef[n_features * n_classes:] = coef[(n_dof - 1) * n_classes:]
         intercept.shape = (n_classes,)
     else:
         intercept = coef[-1]
 
-    Note: If the average loss per sample is wanted instead of the sum of the
-    loss per sample, one can simply use a rescaled sample_weight such that
+    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as
+
+        coef.reshape((n_classes, -1), order="F")
+
+    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
+    coefficients without intercept, coef[:, :-1], contiguous and speeds up
+    matrix-vector computations.
+
+    Note: If the average loss per sample is wanted instead of the sum of the loss per
+    sample, one can simply use a rescaled sample_weight such that
     sum(sample_weight) = 1.
 
     Parameters
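To make the new F-ordered layout concrete, here is a minimal NumPy sketch of the round trip between the two accepted coef shapes (illustrative sizes only: n_classes=3, n_features=4, with an intercept, hence n_dof=5):

    import numpy as np

    n_classes, n_features = 3, 4
    n_dof = n_features + 1  # one extra degree of freedom for the intercept
    coef_2d = np.arange(n_classes * n_dof, dtype=float).reshape(n_classes, n_dof)

    # Ravelled layout: the classes of one feature are contiguous.
    coef_1d = coef_2d.ravel(order="F")

    # Round trip back to the 2d view, as described in the docstring.
    assert np.array_equal(coef_1d.reshape((n_classes, -1), order="F"), coef_2d)

    # The intercepts are the trailing n_classes entries of the ravelled array.
    assert np.array_equal(coef_1d[n_features * n_classes:], coef_2d[:, -1])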
@@ -58,8 +69,11 @@ def _w_intercept_raw(self, coef, X):
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data.
 
@@ -82,7 +96,10 @@ def _w_intercept_raw(self, coef, X):
             raw_prediction = X @ w + intercept
         else:
             # reshape to (n_classes, n_dof)
-            w = coef.reshape(self._loss.n_classes, -1)
+            if coef.ndim == 1:
+                w = coef.reshape((self._loss.n_classes, -1), order="F")
+            else:
+                w = coef
             if self.fit_intercept:
                 intercept = w[:, -1]
                 w = w[:, :-1]
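A quick check of the contiguity claim behind this branch: for a 1d coef, the F-ordered reshape is a view rather than a copy, and slicing off the intercept column keeps the weights F-contiguous (shapes are illustrative):

    import numpy as np

    n_classes, n_dof = 3, 5
    coef = np.arange(n_classes * n_dof, dtype=float)  # ravelled coef

    w = coef.reshape((n_classes, -1), order="F")
    assert w.base is coef                    # a view, no copy
    assert w.flags["F_CONTIGUOUS"]           # columns are contiguous
    assert w[:, :-1].flags["F_CONTIGUOUS"]   # still true after dropping the intercept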
@@ -97,8 +114,11 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -137,8 +157,11 @@ def loss_gradient(
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -155,8 +178,8 @@ def loss_gradient(
         loss : float
             Sum of losses per sample plus penalty.
 
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
         """
         n_features, n_classes = X.shape[1], self._loss.n_classes
         n_dof = n_features + self.fit_intercept
@@ -179,12 +202,15 @@ def loss_gradient(
             return loss, grad
         else:
             loss += 0.5 * l2_reg_strength * squared_norm(w)
-            grad = np.empty((n_classes, n_dof), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             # gradient.shape = (n_samples, n_classes)
             grad[:, :n_features] = gradient_per_sample.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient_per_sample.sum(axis=0)
-            return loss, grad.ravel()
+            if coef.ndim == 1:
+                return loss, grad.ravel(order="F")
+            else:
+                return loss, grad
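One payoff of allocating grad in F order (a NumPy property, sketched here for illustration): ravelling an F-contiguous array with order="F" returns a view, while ravelling a C-ordered one forces a copy:

    import numpy as np

    grad_f = np.empty((3, 5), order="F")
    grad_c = np.empty((3, 5))  # C order by default

    assert grad_f.ravel(order="F").base is grad_f  # view, no copy
    assert grad_c.ravel(order="F").base is None    # forced copy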
 
     def gradient(
         self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
@@ -193,8 +219,11 @@ def gradient(
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -208,8 +237,8 @@ def gradient(
 
         Returns
         -------
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
         """
         n_features, n_classes = X.shape[1], self._loss.n_classes
         n_dof = n_features + self.fit_intercept
@@ -229,12 +258,15 @@ def gradient(
                 grad[-1] = gradient_per_sample.sum()
             return grad
         else:
-            grad = np.empty((n_classes, n_dof), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             # gradient.shape = (n_samples, n_classes)
             grad[:, :n_features] = gradient_per_sample.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient_per_sample.sum(axis=0)
-            return grad.ravel()
+            if coef.ndim == 1:
+                return grad.ravel(order="F")
+            else:
+                return grad
 
     def gradient_hessp(
         self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
@@ -243,8 +275,11 @@ def gradient_hessp(
 
         Parameters
         ----------
-        coef : ndarray of shape (n_dof,) or (n_classes * n_dof,)
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
             Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
         y : contiguous array of shape (n_samples,)
             Observed, true target values.
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
@@ -258,14 +293,15 @@ def gradient_hessp(
 
         Returns
         -------
-        gradient : ndarray of shape (n_dof,) or (n_classes * n_dof)
-            The gradient of the loss as ravelled array.
+        gradient : ndarray of shape coef.shape
+            The gradient of the loss.
 
         hessp : callable
             Function that takes in a vector input of shape of gradient and
             returns matrix-vector product with hessian.
         """
         (n_samples, n_features), n_classes = X.shape, self._loss.n_classes
+        n_dof = n_features + self.fit_intercept
         w, intercept, raw_prediction = self._w_intercept_raw(coef, X)
 
         if not self._loss.is_multiclass:
@@ -322,7 +358,7 @@ def hessp(s):
                 sample_weight=sample_weight,
                 n_threads=n_threads,
             )
-            grad = np.empty_like(coef.reshape(n_classes, -1), dtype=X.dtype)
+            grad = np.empty((n_classes, n_dof), dtype=X.dtype, order="F")
             grad[:, :n_features] = gradient.T @ X + l2_reg_strength * w
             if self.fit_intercept:
                 grad[:, -1] = gradient.sum(axis=0)
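A plausible reason for swapping np.empty_like for an explicit np.empty here, besides avoiding the throwaway reshape of coef: empty_like mimics the memory layout of its prototype (order="K" by default), so it would produce a C-ordered buffer from a C-ordered prototype. A small illustration:

    import numpy as np

    proto = np.arange(15.0).reshape(3, 5)  # C-ordered prototype
    assert np.empty_like(proto).flags["C_CONTIGUOUS"]
    assert np.empty((3, 5), order="F").flags["F_CONTIGUOUS"]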
@@ -348,7 +384,7 @@ def hessp(s):
             #
             # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411  # noqa
             def hessp(s):
-                s = s.reshape(n_classes, -1)  # shape = (n_classes, n_dof)
+                s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
                 if self.fit_intercept:
                     s_intercept = s[:, -1]
                     s = s[:, :-1]
@@ -359,10 +395,16 @@ def hessp(s):
                 tmp *= proba  # * p_i_k
                 if sample_weight is not None:
                     tmp *= sample_weight[:, np.newaxis]
-                hess_prod = np.empty_like(grad)
+                hess_prod = np.empty_like(grad, order="F")
                 hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
                 if self.fit_intercept:
                     hess_prod[:, -1] = tmp.sum(axis=0)
-                return hess_prod.ravel()
+                if coef.ndim == 1:
+                    return hess_prod.ravel(order="F")
+                else:
+                    return hess_prod
 
-        return grad.ravel(), hessp
+        if coef.ndim == 1:
+            return grad.ravel(order="F"), hessp
+        else:
+            return grad, hessp
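For context on the coef.ndim == 1 branches (an assumption about the callers, not part of the diff): scipy.optimize solvers such as lbfgs and newton-cg work on flat 1d parameter vectors, so a 1d coef must get a 1d gradient and 1d Hessian-vector products back, hence ravel(order="F"). A self-contained toy sketch of that calling convention on a quadratic objective:

    import numpy as np
    from scipy.optimize import minimize

    n = 15  # e.g. n_classes * n_dof for a ravelled multiclass coef
    A = np.diag(np.arange(1.0, n + 1))  # toy SPD "Hessian"
    b = np.ones(n)

    def fun_grad(x):
        # The optimizer only ever sees and returns 1d vectors.
        return 0.5 * x @ A @ x - b @ x, A @ x - b

    res = minimize(fun_grad, np.zeros(n), jac=True, method="newton-cg",
                   hessp=lambda x, p: A @ p)
    assert res.success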