MAINT replace cnp.ndarray with memory views in sparsefuncs_fast by OmarManzoor · Pull Request #25764 · scikit-learn/scikit-learn
Merged · 3 commits · Mar 6, 2023
1 change: 1 addition & 0 deletions setup.py
@@ -129,6 +129,7 @@
"sklearn.utils._weight_vector",
"sklearn.utils.arrayfuncs",
"sklearn.utils.murmurhash",
"sklearn.utils.sparsefuncs_fast",
)
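The single setup.py change above extends a tuple of extension module names. In scikit-learn's build scripts of this period this is, as far as one can tell from the surrounding entries, the list of Cython extensions that can be compiled against the non-deprecated NumPy C API, which becomes possible once a module no longer takes cnp.ndarray buffer arguments. A hypothetical illustration of the kind of build flag such a list typically gates (not scikit-learn's actual build code):

from setuptools import Extension

# Hypothetical sketch only: a module free of cnp.ndarray buffer arguments can
# hide the deprecated NumPy C API at compile time; a real build would still
# pass this Extension through Cython.Build.cythonize.
ext = Extension(
    "sklearn.utils.sparsefuncs_fast",
    sources=["sklearn/utils/sparsefuncs_fast.pyx"],
    define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
)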


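For readers outside this series of maintenance PRs: the sparsefuncs_fast.pyx diff below swaps Cython's legacy cnp.ndarray[dtype, ndim=1] buffer arguments, which depend on NumPy's deprecated buffer interface, for typed memoryviews (dtype[:], or dtype[::1] when C-contiguity is required). Memoryviews accept read-only inputs when declared const and can be indexed without the GIL, but they are not ndarrays, so they are converted back with np.asarray before being returned. A minimal sketch of the pattern, using hypothetical names (_scale, alpha, out) rather than code from this PR:

import numpy as np
from cython cimport floating  # fused type covering float and double

# Before (buffer syntax, deprecated NumPy buffer interface):
#     def _scale(cnp.ndarray[floating, ndim=1, mode="c"] data, double alpha): ...
# After (typed memoryview):
def _scale(const floating[::1] data, double alpha):
    cdef:
        Py_ssize_t i
        # fused-typed locals are allowed here because `floating` is already
        # bound by the `data` argument
        floating[::1] out = np.empty_like(np.asarray(data))
    for i in range(data.shape[0]):
        out[i] = <floating> (alpha * data[i])
    # a memoryview is not an ndarray; convert before returning
    return np.asarray(out)

The same three moves, const memoryview arguments, memoryview locals, and np.asarray on return, account for essentially every hunk that follows.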
189 changes: 106 additions & 83 deletions sklearn/utils/sparsefuncs_fast.pyx
@@ -100,12 +100,14 @@ def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):
return means, variances


def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
unsigned long long n_samples,
unsigned long long n_features,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr,
cnp.ndarray[floating, ndim=1] weights):
def _csr_mean_variance_axis0(
const floating[::1] X_data,
unsigned long long n_samples,
unsigned long long n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
@@ -114,21 +116,20 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
integral i, col_ind
cnp.float64_t diff
# means[j] contains the mean of feature j
cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features)
cnp.float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
cnp.ndarray[cnp.float64_t, ndim=1] variances = np.zeros(n_features)
cnp.float64_t[::1] variances = np.zeros(n_features)

cnp.ndarray[cnp.float64_t, ndim=1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] sum_weights_nz = np.zeros(
shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] correction = np.zeros(
shape=n_features)
cnp.float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
cnp.float64_t[::1] correction = np.zeros(shape=n_features)

cnp.ndarray[cnp.uint64_t, ndim=1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
cnp.ndarray[cnp.uint64_t, ndim=1] counts_nz = np.zeros(
shape=n_features, dtype=np.uint64)
cnp.uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

for row_ind in range(len(X_indptr) - 1):
for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
@@ -177,11 +178,15 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
)

if floating is float:
return (np.array(means, dtype=np.float32),
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32))
return (
np.array(means, dtype=np.float32),
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32),
)
else:
return means, variances, sum_weights
return (
np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
)


def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
@@ -229,34 +234,35 @@ def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
return means, variances


def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
unsigned long long n_samples,
unsigned long long n_features,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr,
cnp.ndarray[floating, ndim=1] weights):
def _csc_mean_variance_axis0(
const floating[::1] X_data,
unsigned long long n_samples,
unsigned long long n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
integral i, row_ind
unsigned long long feature_idx, col_ind
cnp.float64_t diff
# means[j] contains the mean of feature j
cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features)
cnp.float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
cnp.ndarray[cnp.float64_t, ndim=1] variances = np.zeros(n_features)
cnp.float64_t[::1] variances = np.zeros(n_features)

cnp.ndarray[cnp.float64_t, ndim=1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] sum_weights_nz = np.zeros(
shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] correction = np.zeros(
shape=n_features)
cnp.float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
cnp.float64_t[::1] correction = np.zeros(shape=n_features)

cnp.ndarray[cnp.uint64_t, ndim=1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
cnp.ndarray[cnp.uint64_t, ndim=1] counts_nz = np.zeros(
shape=n_features, dtype=np.uint64)
cnp.uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

for col_ind in range(n_features):
for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
@@ -308,7 +314,9 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32))
else:
return means, variances, sum_weights
return (
np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
)


def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
@@ -383,18 +391,20 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
weights.astype(X_dtype, copy=False))


def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
floating n_samples,
unsigned long long n_features,
cnp.ndarray[int, ndim=1] X_indices,
# X_indptr might be either int32 or int64
cnp.ndarray[integral, ndim=1] X_indptr,
str X_format,
cnp.ndarray[floating, ndim=1] last_mean,
cnp.ndarray[floating, ndim=1] last_var,
cnp.ndarray[floating, ndim=1] last_n,
# previous sum of the weights (ie float)
cnp.ndarray[floating, ndim=1] weights):
def _incr_mean_variance_axis0(
const floating[:] X_data,
floating n_samples,
unsigned long long n_features,
const int[:] X_indices,
# X_indptr might be either int32 or int64
const integral[:] X_indptr,
str X_format,
floating[:] last_mean,
floating[:] last_var,
floating[:] last_n,
# previous sum of the weights (ie float)
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
@@ -404,10 +414,10 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
# new = the current increment
# updated = the aggregated stats
# when arrays, they are indexed by i per-feature
cnp.ndarray[floating, ndim=1] new_mean
cnp.ndarray[floating, ndim=1] new_var
cnp.ndarray[floating, ndim=1] updated_mean
cnp.ndarray[floating, ndim=1] updated_var
floating[::1] new_mean
floating[::1] new_var
floating[::1] updated_mean
floating[::1] updated_var

if floating is float:
dtype = np.float32
@@ -420,9 +430,9 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
updated_var = np.zeros_like(new_mean, dtype=dtype)

cdef:
cnp.ndarray[floating, ndim=1] new_n
cnp.ndarray[floating, ndim=1] updated_n
cnp.ndarray[floating, ndim=1] last_over_new_n
floating[::1] new_n
floating[::1] updated_n
floating[::1] last_over_new_n

# Obtain new stats first
updated_n = np.zeros(shape=n_features, dtype=dtype)
@@ -444,7 +454,7 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
break

if is_first_pass:
return new_mean, new_var, new_n
return np.asarray(new_mean), np.asarray(new_var), np.asarray(new_n)

for i in range(n_features):
updated_n[i] = last_n[i] + new_n[i]
@@ -471,18 +481,24 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
updated_mean[i] = last_mean[i]
updated_n[i] = last_n[i]

return updated_mean, updated_var, updated_n
return (
np.asarray(updated_mean),
np.asarray(updated_var),
np.asarray(updated_n),
)


def inplace_csr_row_normalize_l1(X):
"""Inplace row normalize using the l1 norm"""
_inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr)


def _inplace_csr_row_normalize_l1(cnp.ndarray[floating, ndim=1] X_data,
shape,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr):
def _inplace_csr_row_normalize_l1(
floating[:] X_data,
shape,
const integral[:] X_indices,
const integral[:] X_indptr,
):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long n_features = shape[1]
@@ -515,10 +531,12 @@ def inplace_csr_row_normalize_l2(X):
_inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr)


def _inplace_csr_row_normalize_l2(cnp.ndarray[floating, ndim=1] X_data,
shape,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr):
def _inplace_csr_row_normalize_l2(
floating[:] X_data,
shape,
const integral[:] X_indices,
const integral[:] X_indptr,
):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long n_features = shape[1]
@@ -543,10 +561,12 @@ def _inplace_csr_row_normalize_l2(cnp.ndarray[floating, ndim=1] X_data,
X_data[j] /= sum_


def assign_rows_csr(X,
cnp.ndarray[cnp.npy_intp, ndim=1] X_rows,
cnp.ndarray[cnp.npy_intp, ndim=1] out_rows,
cnp.ndarray[floating, ndim=2, mode="c"] out):
def assign_rows_csr(
X,
const cnp.npy_intp[:] X_rows,
const cnp.npy_intp[:] out_rows,
floating[:, ::1] out,
):
"""Densify selected rows of a CSR matrix into a preallocated array.

Like out[out_rows] = X[X_rows].toarray() but without copying.
@@ -562,18 +582,21 @@ def assign_rows_csr(X,
cdef:
# npy_intp (np.intp in Python) is what np.where returns,
# but int is what scipy.sparse uses.
int i, ind, j
int i, ind, j, k
cnp.npy_intp rX
cnp.ndarray[floating, ndim=1] data = X.data
cnp.ndarray[int, ndim=1] indices = X.indices, indptr = X.indptr
const floating[:] data = X.data
const int[:] indices = X.indices, indptr = X.indptr

if X_rows.shape[0] != out_rows.shape[0]:
raise ValueError("cannot assign %d rows to %d"
% (X_rows.shape[0], out_rows.shape[0]))

out[out_rows] = 0.
for i in range(X_rows.shape[0]):
rX = X_rows[i]
for ind in range(indptr[rX], indptr[rX + 1]):
j = indices[ind]
out[out_rows[i], j] = data[ind]
with nogil:
for k in range(out_rows.shape[0]):
out[out_rows[k]] = 0.0

for i in range(X_rows.shape[0]):
rX = X_rows[i]
for ind in range(indptr[rX], indptr[rX + 1]):
j = indices[ind]
out[out_rows[i], j] = data[ind]
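
Only private Cython signatures change in this PR, so the public entry points behave as before; a quick sanity check of the sort a reviewer might run locally (not part of the PR):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs_fast import (
    csr_mean_variance_axis0,
    inplace_csr_row_normalize_l1,
)

# small random CSR matrix with float64 data
X = sp.random(5, 3, density=0.5, format="csr", dtype=np.float64, random_state=0)

# per-feature means and variances over axis 0
means, variances = csr_mean_variance_axis0(X)
assert means.shape == (3,) and variances.shape == (3,)

# in-place L1 row normalization: every non-empty row sums to 1 in absolute value
inplace_csr_row_normalize_l1(X)
row_sums = np.abs(X).sum(axis=1).A1
assert np.allclose(row_sums[row_sums > 0], 1.0)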