ENH use more blas functions in cd solvers by lorentzenchr · Pull Request #22972 · scikit-learn/scikit-learn

ENH use more blas functions in cd solvers #22972


Closed
Changes from all commits
69 changes: 46 additions & 23 deletions sklearn/linear_model/_cd_fast.pyx
@@ -16,10 +16,23 @@ from cython cimport floating
import warnings
from ..exceptions import ConvergenceWarning

from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2,
_copy, _scal)
from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans

from ..utils._cython_blas cimport (
ColMajor,
NoTrans,
Trans,
# BLAS Level 1
_asum,
_axpy,
_copy,
_dot,
_nrm2,
_scal,
# BLAS Level 2
_ger,
_gemv,
# BLAS Level 3
_gemm,
)

from ..utils._random cimport our_rand_r
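For context, the new import block groups the `_cython_blas` helpers by BLAS level; `_gemm` (Level 3, matrix-matrix) is the newly used one. The snippet below is only a Python-level sketch of what the three levels do, using `scipy.linalg.blas` as a stand-in (the Cython wrappers themselves are not importable from Python):

```python
# Rough illustration of the three BLAS levels used in this PR, via SciPy's
# Python-level BLAS bindings (a stand-in for sklearn.utils._cython_blas).
import numpy as np
from scipy.linalg import blas

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 3))
B = rng.standard_normal((3, 4))
x = rng.standard_normal(3)
y = rng.standard_normal(5)

# Level 1: vector-vector, e.g. dot, axpy, nrm2
assert np.isclose(blas.ddot(x, x), x @ x)
assert np.allclose(blas.daxpy(x, np.zeros(3), a=2.0), 2.0 * x)  # y := a*x + y

# Level 2: matrix-vector, e.g. gemv (y := alpha*A@x + beta*y)
assert np.allclose(blas.dgemv(1.0, A, x), A @ x)

# Level 3: matrix-matrix, e.g. gemm (C := alpha*A@B + beta*C)
assert np.allclose(blas.dgemm(1.0, A, B), A @ B)
```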

@@ -134,7 +147,9 @@ def enet_coordinate_descent(
cdef unsigned int n_features = X.shape[1]

# compute norms of the columns of X
cdef floating[::1] norm_cols_X = np.square(X).sum(axis=0)
# norm_cols_X = np.square(X).sum(axis=0)
# the following avoids large intermediate memory allocation
cdef floating[::1] norm_cols_X = np.einsum("ij,ij->j", X, X)

# initial value of the residuals
cdef floating[::1] R = np.empty(n_samples, dtype=dtype)
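The `einsum` form computes the same per-column squared norms as `np.square(X).sum(axis=0)` but without materializing the elementwise square of `X`. A minimal NumPy check:

```python
# Both expressions give the per-column squared norms of X; the einsum form
# streams over X once and avoids an (n_samples, n_features) temporary.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 20))

norm_cols_naive = np.square(X).sum(axis=0)      # allocates a full temporary
norm_cols_einsum = np.einsum("ij,ij->j", X, X)  # no large intermediate

assert np.allclose(norm_cols_naive, norm_cols_einsum)
```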
@@ -700,9 +715,7 @@ def enet_coordinate_descent_gram(
dual_norm_XtA = abs_max(n_features, XtA_ptr)

# temp = np.sum(w * H)
tmp = 0.0
for ii in range(n_features):
tmp += w[ii] * H[ii]
tmp = _dot(n_features, &w[0], 1, &H[0], 1)
R_norm2 = y_norm2 + tmp - 2.0 * q_dot_w

# w_norm2 = np.dot(w, w)
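The replaced loop is just the dot product `w·H` inside the duality-gap computation, now done with a single `_dot` call. A small NumPy sketch of the identity behind `R_norm2`, assuming `Q = X.T @ X`, `q = X.T @ y` and `H = Q @ w` as in the Gram variant of the solver:

```python
# Sketch of the identity used for R_norm2 in the Gram-based solver:
# ||y - X w||^2 = y'y + w'Qw - 2 q'w, with Q = X'X and q = X'y.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 8))
y = rng.standard_normal(50)
w = rng.standard_normal(8)

Q = X.T @ X
q = X.T @ y
H = Q @ w

y_norm2 = y @ y
tmp = w @ H                                  # what the old loop computed; now one _dot call
R_norm2 = y_norm2 + tmp - 2.0 * (q @ w)

assert np.isclose(R_norm2, np.sum((y - X @ w) ** 2))
```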
@@ -755,6 +768,12 @@ def enet_coordinate_descent_multi_task(

0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2

Parameters
----------
W : F-contiguous ndarray of shape (n_tasks, n_features)
X : F-contiguous ndarray of shape (n_samples, n_features)
Y : F-contiguous ndarray of shape (n_samples, n_tasks)

Returns
-------
W : ndarray of shape (n_tasks, n_features)
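To make the documented shapes concrete, here is a plain-NumPy version of the objective above (the helper name is illustrative only, not part of the PR):

```python
# Hypothetical helper showing the multi-task elastic-net objective with the
# shapes documented above: W (n_tasks, n_features), X (n_samples, n_features),
# Y (n_samples, n_tasks).
import numpy as np

def multitask_enet_objective(W, X, Y, l1_reg, l2_reg):
    R = Y - X @ W.T                            # residuals, shape (n_samples, n_tasks)
    l21 = np.sum(np.linalg.norm(W, axis=0))    # ||W.T||_21: L2 over tasks, summed over features
    return 0.5 * np.sum(R ** 2) + l1_reg * l21 + 0.5 * l2_reg * np.sum(W ** 2)
```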
@@ -778,8 +797,8 @@ def enet_coordinate_descent_multi_task(
cdef unsigned int n_tasks = Y.shape[1]

# to store XtA
cdef floating[:, ::1] XtA = np.zeros((n_features, n_tasks), dtype=dtype)
cdef floating XtA_axis1norm
cdef floating[::1, :] XtA = np.zeros((n_tasks, n_features), dtype=dtype, order="F")
cdef floating XtA_axis0norm
cdef floating dual_norm_XtA

# initial value of the residuals
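`XtA` is now allocated F-contiguous with shape `(n_tasks, n_features)`, so each feature's column (one value per task) is a contiguous block. That is the layout the `_gemm` call below writes into and the per-feature `_nrm2` reads with stride 1. A tiny layout check in NumPy:

```python
# In an F-ordered (n_tasks, n_features) array, each column (all tasks of one
# feature) is contiguous, so a stride-1 BLAS nrm2 over it works directly.
import numpy as np

n_tasks, n_features = 3, 5
XtA = np.zeros((n_tasks, n_features), order="F")

assert XtA.flags.f_contiguous
assert XtA[:, 0].flags.c_contiguous                           # one column is a contiguous block
assert XtA.strides == (XtA.itemsize, XtA.itemsize * n_tasks)  # step 1 item down a column
```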
@@ -851,7 +870,7 @@ def enet_coordinate_descent_multi_task(
# &X[0, ii], 1,
# &w_ii[0], 1, &R[0, 0], n_tasks)
# Using Blas Level1 and for loop to avoid slower threads
# for such small vectors
# for such small vectors (of size n_tasks)
for jj in range(n_tasks):
if w_ii[jj] != 0:
_axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1,
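The loop above performs the rank-1 update `R += outer(X[:, ii], w_ii)` one task at a time with Level 1 `_axpy` calls rather than a single `_ger`, since threaded BLAS can be slower on vectors of length `n_tasks`. A NumPy sketch of the equivalence:

```python
# The per-task axpy loop and a single rank-1 (ger-style) update give the same R.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_tasks = 20, 3
R = rng.standard_normal((n_samples, n_tasks))
x_ii = rng.standard_normal(n_samples)        # column ii of X
w_ii = rng.standard_normal(n_tasks)          # current coefficients of feature ii

R_ger = R + np.outer(x_ii, w_ii)             # one Level 2 rank-1 update

R_axpy = R.copy()
for jj in range(n_tasks):                    # one Level 1 axpy per (small) task vector
    if w_ii[jj] != 0:
        R_axpy[:, jj] += w_ii[jj] * x_ii

assert np.allclose(R_ger, R_axpy)
```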
@@ -903,20 +922,24 @@ def enet_coordinate_descent_multi_task(
# the tolerance: check the duality gap as ultimate stopping
# criterion

# XtA = np.dot(X.T, R) - l2_reg * W.T
for ii in range(n_features):
for jj in range(n_tasks):
XtA[ii, jj] = _dot(
n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1
) - l2_reg * W[jj, ii]

# dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1)))
# Using numpy:
# XtA = np.dot(R.T, X) - l2_reg * W
# Using BLAS Level 3:
# XtA = np.dot(R.T, X)
_gemm(ColMajor, Trans, NoTrans, n_tasks, n_features, n_samples, 1.0,
&R[0, 0], n_samples, &X[0, 0], n_samples, 0.0, &XtA[0, 0],
n_tasks)
# XtA -= l2_reg * W
_axpy(n_features * n_tasks, -l2_reg, &W[0, 0], 1, &XtA[0, 0], 1)

# dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=0)))
dual_norm_XtA = 0.0
for ii in range(n_features):
# np.sqrt(np.sum(XtA ** 2, axis=1))
XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1)
if XtA_axis1norm > dual_norm_XtA:
dual_norm_XtA = XtA_axis1norm
# np.sqrt(np.sum(XtA ** 2, axis=0))
# sum is over tasks
XtA_axis0norm = _nrm2(n_tasks, &XtA[0, ii], 1)
if XtA_axis0norm > dual_norm_XtA:
dual_norm_XtA = XtA_axis0norm

# TODO: use squared L2 norm directly
# R_norm = linalg.norm(R, ord='fro')
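Net effect of this hunk: `XtA = R.T @ X - l2_reg * W` is now computed with one Level 3 `_gemm` plus one `_axpy` instead of a double loop of `_dot` calls, and the dual norm is the largest per-feature L2 norm over tasks. A NumPy sketch of the same quantities:

```python
# NumPy version of the new duality-gap quantities: one gemm-like product for
# R.T @ X, the ridge correction, then the max per-feature L2 norm over tasks.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features, n_tasks = 30, 6, 3
X = rng.standard_normal((n_samples, n_features))
R = rng.standard_normal((n_samples, n_tasks))
W = rng.standard_normal((n_tasks, n_features))
l2_reg = 0.1

XtA = R.T @ X - l2_reg * W                                    # shape (n_tasks, n_features)
dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=0)))     # max over features, norm over tasks

# Matches the per-feature _nrm2 loop above
assert np.isclose(dual_norm_XtA,
                  max(np.linalg.norm(XtA[:, ii]) for ii in range(n_features)))
```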