From 78700804a8800c600953cd5402518cc98e152c00 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Mon, 12 Sep 2022 10:42:56 +0200 Subject: [PATCH 1/7] ENH csr_row_norms optimization --- sklearn/utils/sparsefuncs_fast.pyx | 38 ++++++++++++++++++------------ 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 895a41c23634b..c71594061cb9b 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -27,27 +27,35 @@ def csr_row_norms(X): """L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) - return _csr_row_norms(X.data, X.shape, X.indices, X.indptr) + return np.asarray(_csr_row_norms(X.data, X.indices, X.indptr)) -def _csr_row_norms(cnp.ndarray[floating, ndim=1, mode="c"] X_data, - shape, - cnp.ndarray[integral, ndim=1, mode="c"] X_indices, - cnp.ndarray[integral, ndim=1, mode="c"] X_indptr): +def _csr_row_norms( + floating[::1] X_data, + integral[::1] X_indices, + integral[::1] X_indptr, +): cdef: - unsigned long long n_samples = shape[0] - unsigned long long i + integral n_samples = X_indptr.shape[0] - 1 + integral i integral j - double sum_ + floating sum_ - norms = np.empty(n_samples, dtype=X_data.dtype) - cdef floating[::1] norms_view = norms + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 - for i in range(n_samples): - sum_ = 0.0 - for j in range(X_indptr[i], X_indptr[i + 1]): - sum_ += X_data[j] * X_data[j] - norms_view[i] = sum_ + cdef floating[::1] norms = ( + np.empty(n_samples, dtype=dtype) + ) + + with nogil: + for i in range(n_samples): + sum_ = 0.0 + for j in range(X_indptr[i], X_indptr[i + 1]): + sum_ += X_data[j] * X_data[j] + norms[i] = sum_ return norms From f4ff936d9f60411773560d313d14ea2ebdb25095 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 12 Sep 2022 12:05:00 +0200 Subject: [PATCH 2/7] Apply suggestions from code review Co-authored-by: Julien Jerphanion Co-authored-by: Guillaume Lemaitre --- sklearn/utils/sparsefuncs_fast.pyx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index c71594061cb9b..a3eda2dfd8d7c 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -41,14 +41,9 @@ def _csr_row_norms( integral j floating sum_ - if floating is float: - dtype = np.float32 - else: - dtype = np.float64 + dtype = np.float32 if floating is float else np.float64 - cdef floating[::1] norms = ( - np.empty(n_samples, dtype=dtype) - ) + cdef floating[::1] norms = np.empty(n_samples, dtype=dtype) with nogil: for i in range(n_samples): From 496c6cc2b149f292c6a452e75510d8f39d7950d6 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 12 Sep 2022 15:50:58 +0200 Subject: [PATCH 3/7] Update sklearn/utils/sparsefuncs_fast.pyx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/utils/sparsefuncs_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index a3eda2dfd8d7c..2eb33f02ed9b3 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -24,7 +24,7 @@ ctypedef cnp.float64_t DOUBLE def csr_row_norms(X): - """L2 norm of each row in CSR matrix X.""" + """Squared L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) return np.asarray(_csr_row_norms(X.data, X.indices, X.indptr)) From 5e84dad2519a5889daa409a7b13355881fe34972 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 19 Jan 2023 11:01:41 +0100 Subject: [PATCH 4/7] Apply suggestions from code review Co-authored-by: Julien Jerphanion Co-authored-by: Guillaume Lemaitre --- sklearn/utils/sparsefuncs_fast.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 2eb33f02ed9b3..bd60a02f5e802 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -27,18 +27,17 @@ def csr_row_norms(X): """Squared L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) - return np.asarray(_csr_row_norms(X.data, X.indices, X.indptr)) + return _csr_row_norms(X.data, X.indices, X.indptr).base def _csr_row_norms( - floating[::1] X_data, - integral[::1] X_indices, - integral[::1] X_indptr, + const floating[::1] X_data, + const integral[::1] X_indices, + const integral[::1] X_indptr, ): cdef: integral n_samples = X_indptr.shape[0] - 1 - integral i - integral j + integral i, j floating sum_ dtype = np.float32 if floating is float else np.float64 From a4fa7ec6a85d4bd023f8da93a418483c32550c3e Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 23 Jan 2023 16:58:18 +0100 Subject: [PATCH 5/7] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/utils/sparsefuncs_fast.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 2292bc1690111..8673f30eba972 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -42,14 +42,12 @@ def _csr_row_norms( dtype = np.float32 if floating is float else np.float64 - cdef floating[::1] norms = np.empty(n_samples, dtype=dtype) + cdef floating[::1] norms = np.zeros(n_samples, dtype=dtype) with nogil: for i in range(n_samples): - sum_ = 0.0 for j in range(X_indptr[i], X_indptr[i + 1]): - sum_ += X_data[j] * X_data[j] - norms[i] = sum_ + norms[i] += X_data[j] * X_data[j] return norms From 92316b3a9d39cdf90c48a883f3ce5f90ae4de667 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 26 Jan 2023 11:19:38 +0100 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/utils/sparsefuncs_fast.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 8673f30eba972..629da25a19c2b 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -27,7 +27,7 @@ def csr_row_norms(X): """Squared L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) - return _csr_row_norms(X.data, X.indices, X.indptr).base + return _csr_row_norms(X.data, X.indices, X.indptr) def _csr_row_norms( @@ -38,7 +38,7 @@ def _csr_row_norms( cdef: integral n_samples = X_indptr.shape[0] - 1 integral i, j - floating sum_ + double sum_ dtype = np.float32 if floating is float else np.float64 @@ -49,7 +49,7 @@ def _csr_row_norms( for j in range(X_indptr[i], X_indptr[i + 1]): norms[i] += X_data[j] * X_data[j] - return norms + return norms.base def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False): From 036a88715de61dd823418400f25503dcccf0d153 Mon Sep 17 00:00:00 2001 From: ArturoAmorQ Date: Fri, 27 Jan 2023 11:27:21 +0100 Subject: [PATCH 7/7] Apply suggestion from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/utils/sparsefuncs_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 629da25a19c2b..9cbfcc1f7a3f6 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -49,7 +49,7 @@ def _csr_row_norms( for j in range(X_indptr[i], X_indptr[i + 1]): norms[i] += X_data[j] * X_data[j] - return norms.base + return np.asarray(norms) def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):