ENH use more blas functions in cd solvers by lorentzenchr · Pull Request #22972 · scikit-learn/scikit-learn

ENH use more blas functions in cd solvers #22972


Closed
Changes from all commits
69 changes: 46 additions & 23 deletions sklearn/linear_model/_cd_fast.pyx
@@ -16,10 +16,23 @@ from cython cimport floating
import warnings
from ..exceptions import ConvergenceWarning

from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2,
_copy, _scal)
from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans

from ..utils._cython_blas cimport (
ColMajor,
NoTrans,
Trans,
# BLAS Level 1
_asum,
_axpy,
_copy,
_dot,
_nrm2,
_scal,
# BLAS Level 2
_ger,
_gemv,
# BLAS Level 3
_gemm,
)

from ..utils._random cimport our_rand_r
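For context, the new import block groups the `_cython_blas` helpers by BLAS level; `_gemm` (Level 3, matrix-matrix) is the newly used one. The snippet below is only a Python-level sketch of what the three levels do, using `scipy.linalg.blas` as a stand-in (the Cython wrappers themselves are not importable from Python):

```python
# Rough illustration of the three BLAS levels used in this PR, via SciPy's
# Python-level BLAS bindings (a stand-in for sklearn.utils._cython_blas).
import numpy as np
from scipy.linalg import blas

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 3))
B = rng.standard_normal((3, 4))
x = rng.standard_normal(3)
y = rng.standard_normal(5)

# Level 1: vector-vector, e.g. dot, axpy, nrm2
assert np.isclose(blas.ddot(x, x), x @ x)
assert np.allclose(blas.daxpy(x, np.zeros(3), a=2.0), 2.0 * x)  # y := a*x + y

# Level 2: matrix-vector, e.g. gemv (y := alpha*A@x + beta*y)
assert np.allclose(blas.dgemv(1.0, A, x), A @ x)

# Level 3: matrix-matrix, e.g. gemm (C := alpha*A@B + beta*C)
assert np.allclose(blas.dgemm(1.0, A, B), A @ B)
```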

@@ -134,7 +147,9 @@ def enet_coordinate_descent(
cdef unsigned int n_features = X.shape[1]

# compute norms of the columns of X
cdef floating[::1] norm_cols_X = np.square(X).sum(axis=0)
# norm_cols_X = np.square(X).sum(axis=0)
# the following avoids large intermediate memory allocation
cdef floating[::1] norm_cols_X = np.einsum("ij,ij->j", X, X)

# initial value of the residuals
cdef floating[::1] R = np.empty(n_samples, dtype=dtype)
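The `einsum` form computes the same per-column squared norms as `np.square(X).sum(axis=0)` but without materializing the elementwise square of `X`. A minimal NumPy check:

```python
# Both expressions give the per-column squared norms of X; the einsum form
# streams over X once and avoids an (n_samples, n_features) temporary.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 20))

norm_cols_naive = np.square(X).sum(axis=0)      # allocates a full temporary
norm_cols_einsum = np.einsum("ij,ij->j", X, X)  # no large intermediate

assert np.allclose(norm_cols_naive, norm_cols_einsum)
```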
@@ -700,9 +715,7 @@ def enet_coordinate_descent_gram(
dual_norm_XtA = abs_max(n_features, XtA_ptr)

# temp = np.sum(w * H)
tmp = 0.0
for ii in range(n_features):
tmp += w[ii] * H[ii]
tmp = _dot(n_features, &w[0], 1, &H[0], 1)
R_norm2 = y_norm2 + tmp - 2.0 * q_dot_w

# w_norm2 = np.dot(w, w)
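The replaced loop is just the dot product `w·H` inside the duality-gap computation, now done with a single `_dot` call. A small NumPy sketch of the identity behind `R_norm2`, assuming `Q = X.T @ X`, `q = X.T @ y` and `H = Q @ w` as in the Gram variant of the solver:

```python
# Sketch of the identity used for R_norm2 in the Gram-based solver:
# ||y - X w||^2 = y'y + w'Qw - 2 q'w, with Q = X'X and q = X'y.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 8))
y = rng.standard_normal(50)
w = rng.standard_normal(8)

Q = X.T @ X
q = X.T @ y
H = Q @ w

y_norm2 = y @ y
tmp = w @ H                                  # what the old loop computed; now one _dot call
R_norm2 = y_norm2 + tmp - 2.0 * (q @ w)

assert np.isclose(R_norm2, np.sum((y - X @ w) ** 2))
```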
@@ -755,6 +768,12 @@ def enet_coordinate_descent_multi_task(

0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2

Parameters
----------
W : F-contiguous ndarray of shape (n_tasks, n_features)
X : F-contiguous ndarray of shape (n_samples, n_features)
Y : F-contiguous ndarray of shape (n_samples, n_tasks)

Returns
-------
W : ndarray of shape (n_tasks, n_features)
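To make the documented shapes concrete, here is a plain-NumPy version of the objective above (the helper name is illustrative only, not part of the PR):

```python
# Hypothetical helper showing the multi-task elastic-net objective with the
# shapes documented above: W (n_tasks, n_features), X (n_samples, n_features),
# Y (n_samples, n_tasks).
import numpy as np

def multitask_enet_objective(W, X, Y, l1_reg, l2_reg):
    R = Y - X @ W.T                            # residuals, shape (n_samples, n_tasks)
    l21 = np.sum(np.linalg.norm(W, axis=0))    # ||W.T||_21: L2 over tasks, summed over features
    return 0.5 * np.sum(R ** 2) + l1_reg * l21 + 0.5 * l2_reg * np.sum(W ** 2)
```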
@@ -778,8 +797,8 @@ def enet_coordinate_descent_multi_task(
cdef unsigned int n_tasks = Y.shape[1]

# to store XtA
cdef floating[:, ::1] XtA = np.zeros((n_features, n_tasks), dtype=dtype)
cdef floating XtA_axis1norm
cdef floating[::1, :] XtA = np.zeros((n_tasks, n_features), dtype=dtype, order="F")
cdef floating XtA_axis0norm
cdef floating dual_norm_XtA

# initial value of the residuals
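`XtA` is now allocated F-contiguous with shape `(n_tasks, n_features)`, so each feature's column (one value per task) is a contiguous block. That is the layout the `_gemm` call below writes into and the per-feature `_nrm2` reads with stride 1. A tiny layout check in NumPy:

```python
# In an F-ordered (n_tasks, n_features) array, each column (all tasks of one
# feature) is contiguous, so a stride-1 BLAS nrm2 over it works directly.
import numpy as np

n_tasks, n_features = 3, 5
XtA = np.zeros((n_tasks, n_features), order="F")

assert XtA.flags.f_contiguous
assert XtA[:, 0].flags.c_contiguous                           # one column is a contiguous block
assert XtA.strides == (XtA.itemsize, XtA.itemsize * n_tasks)  # step 1 item down a column
```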
@@ -851,7 +870,7 @@ def enet_coordinate_descent_multi_task(
# &X[0, ii], 1,
# &w_ii[0], 1, &R[0, 0], n_tasks)
# Using Blas Level1 and for loop to avoid slower threads
# for such small vectors
# for such small vectors (of size n_tasks)
for jj in range(n_tasks):
if w_ii[jj] != 0:
_axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1,
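The loop above performs the rank-1 update `R += outer(X[:, ii], w_ii)` one task at a time with Level 1 `_axpy` calls rather than a single `_ger`, since threaded BLAS can be slower on vectors of length `n_tasks`. A NumPy sketch of the equivalence:

```python
# The per-task axpy loop and a single rank-1 (ger-style) update give the same R.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_tasks = 20, 3
R = rng.standard_normal((n_samples, n_tasks))
x_ii = rng.standard_normal(n_samples)        # column ii of X
w_ii = rng.standard_normal(n_tasks)          # current coefficients of feature ii

R_ger = R + np.outer(x_ii, w_ii)             # one Level 2 rank-1 update

R_axpy = R.copy()
for jj in range(n_tasks):                    # one Level 1 axpy per (small) task vector
    if w_ii[jj] != 0:
        R_axpy[:, jj] += w_ii[jj] * x_ii

assert np.allclose(R_ger, R_axpy)
```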
@@ -903,20 +922,24 @@ def enet_coordinate_descent_multi_task(
# the tolerance: check the duality gap as ultimate stopping
# criterion

# XtA = np.dot(X.T, R) - l2_reg * W.T
for ii in range(n_features):
for jj in range(n_tasks):
XtA[ii, jj] = _dot(
n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1
) - l2_reg * W[jj, ii]

# dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1)))
# Using numpy:
# XtA = np.dot(R.T, X) - l2_reg * W
# Using BLAS Level 3:
# XtA = np.dot(R.T, X)
_gemm(ColMajor, Trans, NoTrans, n_tasks, n_features, n_samples, 1.0,
&R[0, 0], n_samples, &X[0, 0], n_samples, 0.0, &XtA[0, 0],
n_tasks)
# XtA -= l2_reg * W
_axpy(n_features * n_tasks, -l2_reg, &W[0, 0], 1, &XtA[0, 0], 1)

# dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=0)))
dual_norm_XtA = 0.0
for ii in range(n_features):
# np.sqrt(np.sum(XtA ** 2, axis=1))
XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1)
if XtA_axis1norm > dual_norm_XtA:
dual_norm_XtA = XtA_axis1norm
# np.sqrt(np.sum(XtA ** 2, axis=0))
# sum is over tasks
XtA_axis0norm = _nrm2(n_tasks, &XtA[0, ii], 1)
if XtA_axis0norm > dual_norm_XtA:
dual_norm_XtA = XtA_axis0norm

# TODO: use squared L2 norm directly
# R_norm = linalg.norm(R, ord='fro')
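Net effect of this hunk: `XtA = R.T @ X - l2_reg * W` is now computed with one Level 3 `_gemm` plus one `_axpy` instead of a double loop of `_dot` calls, and the dual norm is the largest per-feature L2 norm over tasks. A NumPy sketch of the same quantities:

```python
# NumPy version of the new duality-gap quantities: one gemm-like product for
# R.T @ X, the ridge correction, then the max per-feature L2 norm over tasks.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features, n_tasks = 30, 6, 3
X = rng.standard_normal((n_samples, n_features))
R = rng.standard_normal((n_samples, n_tasks))
W = rng.standard_normal((n_tasks, n_features))
l2_reg = 0.1

XtA = R.T @ X - l2_reg * W                                    # shape (n_tasks, n_features)
dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=0)))     # max over features, norm over tasks

# Matches the per-feature _nrm2 loop above
assert np.isclose(dual_norm_XtA,
                  max(np.linalg.norm(XtA[:, ii]) for ii in range(n_features)))
```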