From a19f0e23a46bf39a6aae87d553ab21125818940f Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 28 Dec 2022 15:11:17 +0100 Subject: [PATCH 1/4] remove warnings --- sklearn/utils/sparsefuncs_fast.pyx | 90 +++++++++++++++--------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 895a41c23634b..efd2c4521a024 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -106,9 +106,9 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: - cnp.npy_intp i - unsigned long long row_ind - integral col_ind + cnp.intp_t row_ind + unsigned long long k + integral i, col_ind cnp.float64_t diff # means[j] contains the mean of feature j cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features) @@ -142,8 +142,8 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # number of non nan elements of X[:, col_ind] counts[col_ind] -= 1 - for i in range(n_features): - means[i] /= sum_weights[i] + for k in range(n_features): + means[k] /= sum_weights[k] for row_ind in range(len(X_indptr) - 1): for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): @@ -156,15 +156,15 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] - for i in range(n_features): - if counts[i] != counts_nz[i]: - correction[i] -= (sum_weights[i] - sum_weights_nz[i]) * means[i] - correction[i] = correction[i]**2 / sum_weights[i] - if counts[i] != counts_nz[i]: + for k in range(n_features): + if counts[k] != counts_nz[k]: + correction[k] -= (sum_weights[k] - sum_weights_nz[k]) * means[k] + correction[k] = correction[k]**2 / sum_weights[k] + if counts[k] != counts_nz[k]: 
# only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. - variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 - variances[i] = (variances[i] - correction[i]) / sum_weights[i] + variances[k] += (sum_weights[k] - sum_weights_nz[k]) * means[k]**2 + variances[k] = (variances[k] - correction[k]) / sum_weights[k] if floating is float: return (np.array(means, dtype=np.float32), @@ -224,13 +224,13 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, unsigned long long n_features, cnp.ndarray[integral, ndim=1] X_indices, cnp.ndarray[integral, ndim=1] X_indptr, - cnp.ndarray[floating, ndim=1] weights): + cnp.ndarray[floating, ndim=1] weights, +): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: - cnp.npy_intp i - unsigned long long col_ind - integral row_ind + integral i, row_ind + unsigned long long k, col_ind cnp.float64_t diff # means[j] contains the mean of feature j cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features) @@ -264,8 +264,8 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # number of non nan elements of X[:, col_ind] counts[col_ind] -= 1 - for i in range(n_features): - means[i] /= sum_weights[i] + for k in range(n_features): + means[k] /= sum_weights[k] for col_ind in range(n_features): for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): @@ -278,15 +278,15 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] - for i in range(n_features): - if counts[i] != counts_nz[i]: - correction[i] -= (sum_weights[i] - sum_weights_nz[i]) * means[i] - correction[i] = correction[i]**2 / sum_weights[i] - if counts[i] != counts_nz[i]: + for k in range(n_features): + if counts[k] != counts_nz[k]: + correction[k] -= (sum_weights[k] - 
sum_weights_nz[k]) * means[k] + correction[k] = correction[k]**2 / sum_weights[k] + if counts[k] != counts_nz[k]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. - variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 - variances[i] = (variances[i] - correction[i]) / sum_weights[i] + variances[k] += (sum_weights[k] - sum_weights_nz[k]) * means[k]**2 + variances[k] = (variances[k] - correction[k]) / sum_weights[k] if floating is float: return (np.array(means, dtype=np.float32), @@ -383,13 +383,12 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data, # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: - cnp.npy_intp i + unsigned long long i - # last = stats until now - # new = the current increment - # updated = the aggregated stats - # when arrays, they are indexed by i per-feature - cdef: + # last = stats until now + # new = the current increment + # updated = the aggregated stats + # when arrays, they are indexed by i per-feature cnp.ndarray[floating, ndim=1] new_mean cnp.ndarray[floating, ndim=1] new_var cnp.ndarray[floating, ndim=1] updated_mean @@ -469,15 +468,17 @@ def _inplace_csr_row_normalize_l1(cnp.ndarray[floating, ndim=1] X_data, shape, cnp.ndarray[integral, ndim=1] X_indices, cnp.ndarray[integral, ndim=1] X_indptr): - cdef unsigned long long n_samples = shape[0] - cdef unsigned long long n_features = shape[1] + cdef: + unsigned long long n_samples = shape[0] + unsigned long long n_features = shape[1] - # the column indices for row i are stored in: - # indices[indptr[i]:indices[i+1]] - # and their corresponding values are stored in: - # data[indptr[i]:indptr[i+1]] - cdef cnp.npy_intp i, j - cdef double sum_ + # the column indices for row i are stored in: + # indices[indptr[i]:indices[i+1]] + # and their corresponding values are stored in: + # data[indptr[i]:indptr[i+1]] + unsigned long long 
i + integral j + double sum_ for i in range(n_samples): sum_ = 0.0 @@ -503,11 +504,12 @@ def _inplace_csr_row_normalize_l2(cnp.ndarray[floating, ndim=1] X_data, shape, cnp.ndarray[integral, ndim=1] X_indices, cnp.ndarray[integral, ndim=1] X_indptr): - cdef integral n_samples = shape[0] - cdef integral n_features = shape[1] - - cdef cnp.npy_intp i, j - cdef double sum_ + cdef: + unsigned long long n_samples = shape[0] + unsigned long long n_features = shape[1] + unsigned long long i + integral j + double sum_ for i in range(n_samples): sum_ = 0.0 From b059f8e4b8cf3aede57aa02b6bfffc674852a43e Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 28 Dec 2022 15:29:00 +0100 Subject: [PATCH 2/4] revert a spurious change --- sklearn/utils/sparsefuncs_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index efd2c4521a024..1c33b8e7787a1 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -224,7 +224,7 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, unsigned long long n_features, cnp.ndarray[integral, ndim=1] X_indices, cnp.ndarray[integral, ndim=1] X_indptr, - cnp.ndarray[floating, ndim=1] weights, + cnp.ndarray[floating, ndim=1] weights, ): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments From ac82aa311c9e606656c44fc392b0228cbccbd364 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 28 Dec 2022 15:29:55 +0100 Subject: [PATCH 3/4] revert a spurious change --- sklearn/utils/sparsefuncs_fast.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 1c33b8e7787a1..7d1f80bfb50a0 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -224,8 +224,7 @@ def 
_csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, unsigned long long n_features, cnp.ndarray[integral, ndim=1] X_indices, cnp.ndarray[integral, ndim=1] X_indptr, - cnp.ndarray[floating, ndim=1] weights, -): + cnp.ndarray[floating, ndim=1] weights): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: From 36fc10dfca2ac02a1cd3f36178a16c9a7eefd7ea Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sat, 31 Dec 2022 12:09:10 +0100 Subject: [PATCH 4/4] apply suggestions --- sklearn/utils/sparsefuncs_fast.pyx | 53 +++++++++++++++++++----------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 7d1f80bfb50a0..e0aaa2a5a060e 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -107,7 +107,7 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # cannot be declared directly and can only be passed as function arguments cdef: cnp.intp_t row_ind - unsigned long long k + unsigned long long feature_idx integral i, col_ind cnp.float64_t diff # means[j] contains the mean of feature j @@ -142,8 +142,8 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # number of non nan elements of X[:, col_ind] counts[col_ind] -= 1 - for k in range(n_features): - means[k] /= sum_weights[k] + for feature_idx in range(n_features): + means[feature_idx] /= sum_weights[feature_idx] for row_ind in range(len(X_indptr) - 1): for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): @@ -156,15 +156,22 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] - for k in range(n_features): - if counts[k] != counts_nz[k]: - correction[k] -= (sum_weights[k] - sum_weights_nz[k]) * means[k] - 
correction[k] = correction[k]**2 / sum_weights[k] - if counts[k] != counts_nz[k]: + for feature_idx in range(n_features): + if counts[feature_idx] != counts_nz[feature_idx]: + correction[feature_idx] -= ( + sum_weights[feature_idx] - sum_weights_nz[feature_idx] + ) * means[feature_idx] + correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx] + if counts[feature_idx] != counts_nz[feature_idx]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. - variances[k] += (sum_weights[k] - sum_weights_nz[k]) * means[k]**2 - variances[k] = (variances[k] - correction[k]) / sum_weights[k] + variances[feature_idx] += ( + sum_weights[feature_idx] - sum_weights_nz[feature_idx] + ) * means[feature_idx]**2 + variances[feature_idx] = ( + (variances[feature_idx] - correction[feature_idx]) / + sum_weights[feature_idx] + ) if floating is float: return (np.array(means, dtype=np.float32), @@ -229,7 +236,7 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # cannot be declared directly and can only be passed as function arguments cdef: integral i, row_ind - unsigned long long k, col_ind + unsigned long long feature_idx, col_ind cnp.float64_t diff # means[j] contains the mean of feature j cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features) @@ -263,8 +270,8 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, # number of non nan elements of X[:, col_ind] counts[col_ind] -= 1 - for k in range(n_features): - means[k] /= sum_weights[k] + for feature_idx in range(n_features): + means[feature_idx] /= sum_weights[feature_idx] for col_ind in range(n_features): for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): @@ -277,15 +284,21 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data, correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] - for k in range(n_features): - if counts[k] != 
counts_nz[k]: - correction[k] -= (sum_weights[k] - sum_weights_nz[k]) * means[k] - correction[k] = correction[k]**2 / sum_weights[k] - if counts[k] != counts_nz[k]: + for feature_idx in range(n_features): + if counts[feature_idx] != counts_nz[feature_idx]: + correction[feature_idx] -= ( + sum_weights[feature_idx] - sum_weights_nz[feature_idx] + ) * means[feature_idx] + correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx] + if counts[feature_idx] != counts_nz[feature_idx]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. - variances[k] += (sum_weights[k] - sum_weights_nz[k]) * means[k]**2 - variances[k] = (variances[k] - correction[k]) / sum_weights[k] + variances[feature_idx] += ( + sum_weights[feature_idx] - sum_weights_nz[feature_idx] + ) * means[feature_idx]**2 + variances[feature_idx] = ( + (variances[feature_idx] - correction[feature_idx]) + ) / sum_weights[feature_idx] if floating is float: return (np.array(means, dtype=np.float32),