8000 MAINT replace cnp.ndarray with memory views in sparsefuncs_fast by OmarManzoor · Pull Request #25764 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@
"sklearn.utils._weight_vector",
"sklearn.utils.arrayfuncs",
"sklearn.utils.murmurhash",
"sklearn.utils.sparsefuncs_fast",
)


Expand Down
189 changes: 106 additions & 83 deletions sklearn/utils/sparsefuncs_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,14 @@ def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):
return means, variances


def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
unsigned long long n_samples,
unsigned long long n_features,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr,
cnp.ndarray[floating, ndim=1] weights):
def _csr_mean_variance_axis0(
const floating[::1] X_data,
unsigned long long n_samples,
unsigned long long n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
Expand All @@ -114,21 +116,20 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
integral i, col_ind
cnp.float64_t diff
# means[j] contains the mean of feature j
cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features)
cnp.float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
cnp.ndarray[cnp.float64_t, ndim=1] variances = np.zeros(n_features)
cnp.float64_t[::1] variances = np.zeros(n_features)

cnp.ndarray[cnp.float64_t, ndim=1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] sum_weights_nz = np.zeros(
shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] correction = np.zeros(
shape=n_features)
cnp.float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
cnp.float64_t[::1] correction = np.zeros(shape=n_features)

cnp.ndarray[cnp.uint64_t, ndim=1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
cnp.ndarray[cnp.uint64_t, ndim=1] counts_nz = np.zeros(
shape=n_features, dtype=np.uint64)
cnp.uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

for row_ind in range(len(X_indptr) - 1):
for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
Expand Down Expand Up @@ -177,11 +178,15 @@ def _csr_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
)

if floating is float:
return (np.array(means, dtype=np.float32),
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32))
return (
np.array(means, dtype=np.float32),
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32),
)
else:
return means, variances, sum_weights
return (
np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
)


def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
Expand Down Expand Up @@ -229,34 +234,35 @@ def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
return means, variances


def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
unsigned long long n_samples,
unsigned long long n_features,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr,
cnp.ndarray[floating, ndim=1] weights):
def _csc_mean_variance_axis0(
const floating[::1] X_data,
unsigned long long n_samples,
unsigned long long n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
integral i, row_ind
unsigned long long feature_idx, col_ind
cnp.float64_t diff
# means[j] contains the mean of feature j
cnp.ndarray[cnp.float64_t, ndim=1] means = np.zeros(n_features)
cnp.float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
cnp.ndarray[cnp.float64_t, ndim=1] variances = np.zeros(n_features)
cnp.float64_t[::1] variances = np.zeros(n_features)

cnp.ndarray[cnp.float64_t, ndim=1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] sum_weights_nz = np.zeros(
shape=n_features)
cnp.ndarray[cnp.float64_t, ndim=1] correction = np.zeros(
shape=n_features)
cnp.float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
cnp.float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
cnp.float64_t[::1] correction = np.zeros(shape=n_features)

cnp.ndarray[cnp.uint64_t, ndim=1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
cnp.ndarray[cnp.uint64_t, ndim=1] counts_nz = np.zeros(
shape=n_features, dtype=np.uint64)
cnp.uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
cnp.uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

for col_ind in range(n_features):
for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
Expand Down Expand Up @@ -308,7 +314,9 @@ def _csc_mean_variance_axis0(cnp.ndarray[floating, ndim=1, mode="c"] X_data,
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32))
else:
return means, variances, sum_weights
return (
np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
)


def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
Expand Down Expand Up @@ -383,18 +391,20 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
weights.astype(X_dtype, copy=False))


def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
floating n_samples,
unsigned long long n_features,
cnp.ndarray[int, ndim=1] X_indices,
# X_indptr might be either int32 or int64
cnp.ndarray[integral, ndim=1] X_indptr,
str X_format,
cnp.ndarray[floating, ndim=1] last_mean,
cnp.ndarray[floating, ndim=1] last_var,
cnp.ndarray[floating, ndim=1] last_n,
# previous sum of the weights (ie float)
cnp.ndarray[floating, ndim=1] weights):
def _incr_mean_variance_axis0(
const floating[:] X_data,
floating n_samples,
unsigned long long n_features,
const int[:] X_indices,
# X_indptr might be either int32 or int64
const integral[:] X_indptr,
str X_format,
floating[:] last_mean,
floating[:] last_var,
floating[:] last_n,
# previous sum of the weights (ie float)
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
Expand All @@ -404,10 +414,10 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
# new = the current increment
# updated = the aggregated stats
# when arrays, they are indexed by i per-feature
cnp.ndarray[floating, ndim=1] new_mean
cnp.ndarray[floating, ndim=1] new_var
cnp.ndarray[floating, ndim=1] updated_mean
cnp.ndarray[floating, ndim=1] updated_var
floating[::1] new_mean
floating[::1] new_var
floating[::1] updated_mean
floating[::1] updated_var

if floating is float:
dtype = np.float32
Expand All @@ -420,9 +430,9 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
updated_var = np.zeros_like(new_mean, dtype=dtype)

cdef:
cnp.ndarray[floating, ndim=1] new_n
cnp.ndarray[floating, ndim=1] updated_n
cnp.ndarray[floating, ndim=1] last_over_new_n
floating[::1] new_n
floating[::1] updated_n
floating[::1] last_over_new_n

# Obtain new stats first
updated_n = np.zeros(shape=n_features, dtype=dtype)
Expand All @@ -444,7 +454,7 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
break

if is_first_pass:
return new_mean, new_var, new_n
return np.asarray(new_mean), np.asarray(new_var), np.asarray(new_n)

for i in range(n_features):
updated_n[i] = last_n[i] + new_n[i]
Expand All @@ -471,18 +481,24 @@ def _incr_mean_variance_axis0(cnp.ndarray[floating, ndim=1] X_data,
updated_mean[i] = last_mean[i]
updated_n[i] = last_n[i]

return updated_mean, updated_var, updated_n
return (
np.asarray(updated_mean),
np.asarray(updated_var),
np.asarray(updated_n),
)


def inplace_csr_row_normalize_l1(X):
    """Normalize each row of the CSR matrix X in place to unit l1 norm.

    The actual work is done by the typed Cython helper, which receives
    the raw CSR buffers of X.
    """
    data, indices, indptr = X.data, X.indices, X.indptr
    _inplace_csr_row_normalize_l1(data, X.shape, indices, indptr)


def _inplace_csr_row_normalize_l1(cnp.ndarray[floating, ndim=1] X_data,
shape,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr):
def _inplace_csr_row_normalize_l1(
floating[:] X_data,
shape,
const integral[:] X_indices,
const integral[:] X_indptr,
):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long n_features = shape[1]
Expand Down Expand Up @@ -515,10 +531,12 @@ def inplace_csr_row_normalize_l2(X):
_inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr)


def _inplace_csr_row_normalize_l2(cnp.ndarray[floating, ndim=1] X_data,
shape,
cnp.ndarray[integral, ndim=1] X_indices,
cnp.ndarray[integral, ndim=1] X_indptr):
def _inplace_csr_row_normalize_l2(
floating[:] X_data,
shape,
const integral[:] X_indices,
const integral[:] X_indptr,
):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long n_features = shape[1]
Expand All @@ -543,10 +561,12 @@ def _inplace_csr_row_normalize_l2(cnp.ndarray[floating, ndim=1] X_data,
X_data[j] /= sum_


def assign_rows_csr(X,
cnp.ndarray[cnp.npy_intp, ndim=1] X_rows,
cnp.ndarray[cnp.npy_intp, ndim=1] out_rows,
cnp.ndarray[floating, ndim=2, mode="c"] out):
def assign_rows_csr(
X,
const cnp.npy_intp[:] X_rows,
const cnp.npy_intp[:] out_rows,
floating[:, ::1] out,
):
"""Densify selected rows of a CSR matrix into a preallocated array.

Like out[out_rows] = X[X_rows].toarray() but without copying.
Expand All @@ -562,18 +582,21 @@ def assign_rows_csr(X,
cdef:
# npy_intp (np.intp in Python) is what np.where returns,
# but int is what scipy.sparse uses.
int i, ind, j
int i, ind, j, k
cnp.npy_intp rX
cnp.ndarray[floating, ndim=1] data = X.data
cnp.ndarray[int, ndim=1] indices = X.indices, indptr = X.indptr
const floating[:] data = X.data
const int[:] indices = X.indices, indptr = X.indptr

if X_rows.shape[0] != out_rows.shape[0]:
raise ValueError("cannot assign %d rows to %d"
% (X_rows.shape[0], out_rows.shape[0]))

out[out_rows] = 0.
for i in range(X_rows.shape[0]):
rX = X_rows[i]
for ind in range(indptr[rX], indptr[rX + 1]):
j = indices[ind]
out[out_rows[i], j] = data[ind]
with nogil:
for k in range(out_rows.shape[0]):
out[out_rows[k]] = 0.0

for i in range(X_rows.shape[0]):
rX = X_rows[i]
for ind in range(indptr[rX], indptr[rX + 1]):
j = indices[ind]
out[out_rows[i], j] = data[ind]