8000 [WIP] FIX index overflow error in sparse matrix polynomial expansion … by niuk-a · Pull Request #20524 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] FIX index overflow error in sparse matrix polynomial expansion … #20524

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 37 additions & 57 deletions sklearn/preprocessing/_csr_polynomial_expansion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@ from numpy cimport ndarray
cimport numpy as np

np.import_array()
ctypedef np.int32_t INDEX_T
ctypedef np.int32_t INDEX_A
ctypedef fused INDEX_B:
np.int32_t
np.int64_t
ctypedef np.int8_t FLAG_T

ctypedef fused DATA_T:
np.float32_t
np.float64_t
np.int32_t
np.int64_t


cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
INDEX_T interaction_only) nogil:
cdef inline INDEX_B _deg2_column(INDEX_B d, INDEX_B i, INDEX_B j,
FLAG_T interaction_only) nogil:
"""Compute the index of the column for a degree 2 expansion

d is the dimensionality of the input data, i and j are the indices
Expand All @@ -31,8 +32,8 @@ cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
return d * i - (i**2 + i) / 2 + j


cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
INDEX_T interaction_only) nogil:
cdef inline INDEX_B _deg3_column(INDEX_B d, INDEX_B i, INDEX_B j, INDEX_B k,
FLAG_T interaction_only) nogil:
"""Compute the index of the column for a degree 3 expansion

d is the dimensionality of the input data, i, j and k are the indices
Expand All @@ -49,10 +50,14 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,


def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
ndarray[INDEX_T, ndim=1] indices,
ndarray[INDEX_T, ndim=1] indptr,
INDEX_T d, INDEX_T interaction_only,
INDEX_T degree):
ndarray[INDEX_A, ndim=1] indices,
ndarray[INDEX_A, ndim=1] indptr,
INDEX_A d,
ndarray[DATA_T, ndim=1] result_data,
ndarray[INDEX_B, ndim=1] result_indices,
ndarray[INDEX_B, ndim=1] result_indptr,
FLAG_T interaction_only,
FLAG_T degree):
"""
Perform a second-degree polynomial or interaction expansion on a scipy
compressed sparse row (CSR) matrix. The method used only takes products of
Expand All @@ -74,6 +79,15 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
d : int
The dimensionality of the input CSR matrix.

result_data : nd-array
The output CSR matrix's "data" attribute

result_indices : nd-array
< 8000 /td> The output CSR matrix's "indices" attribute

result_indptr : nd-array
The output CSR matrix's "indptr" attribute

interaction_only : int
0 for a polynomial expansion, 1 for an interaction expansion.

Expand All @@ -86,44 +100,13 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
"""

assert degree in (2, 3)

if degree == 2:
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
else:
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
- interaction_only*d**2)
if expanded_dimensionality == 0:
return None
assert expanded_dimensionality > 0

cdef INDEX_T total_nnz = 0, row_i, nnz

# Count how many nonzero elements the expanded matrix will contain.
for row_i in range(indptr.shape[0]-1):
# nnz is the number of nonzero elements in this row.
nnz = indptr[row_i + 1] - indptr[row_i]
if degree == 2:
total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
else:
total_nnz += ((nnz ** 3 + 3 * nnz ** 2 + 2 * nnz) / 6
- interaction_only * nnz ** 2)

# Make the arrays that will form the CSR matrix of the expansion.
cdef ndarray[DATA_T, ndim=1] expanded_data = ndarray(
shape=total_nnz, dtype=data.dtype)
cdef ndarray[INDEX_T, ndim=1] expanded_indices = ndarray(
shape=total_nnz, dtype=indices.dtype)
cdef INDEX_T num_rows = indptr.shape[0] - 1
cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
shape=num_rows + 1, dtype=indptr.dtype)

cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
i_ptr, j_ptr, k_ptr, num_cols_in_row, \
expanded_column
cdef INDEX_A row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr

cdef INDEX_B expanded_index=0, num_cols_in_row, col

with nogil:
expanded_indptr[0] = indptr[0]
result_indptr[0] = indptr[0]
for row_i in range(indptr.shape[0]-1):
row_starts = indptr[row_i]
row_ends = indptr[row_i + 1]
Expand All @@ -133,9 +116,9 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
for j_ptr in range(i_ptr + interaction_only, row_ends):
j = indices[j_ptr]
if degree == 2:
col = _deg2_column(d, i, j, interaction_only)
expanded_indices[expanded_index] = col
expanded_data[expanded_index] = (
col = _deg2_column[INDEX_B](d, i, j, interaction_only)
result_indices[expanded_index] = col
result_data[expanded_index] = (
data[i_ptr] * data[j_ptr])
expanded_index += 1
num_cols_in_row += 1
Expand All @@ -144,14 +127,11 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
for k_ptr in range(j_ptr + interaction_only,
row_ends):
k = indices[k_ptr]
col = _deg3_column(d, i, j, k, interaction_only)
expanded_indices[expanded_index] = col
expanded_data[expanded_index] = (
col = _deg3_column[INDEX_B](d, i, j, k, interaction_only)
result_indices[expanded_index] = col
result_data[expanded_index] = (
data[i_ptr] * data[j_ptr] * data[k_ptr])
expanded_index += 1
num_cols_in_row += 1

expanded_indptr[row_i+1] = expanded_indptr[row_i] + num_cols_in_row

return csr_matrix((expanded_data, expanded_indices, expanded_indptr),
shape=(num_rows, expanded_dimensionality))
result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row
48 changes: 43 additions & 5 deletions sklearn/preprocessing/_polynomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ def _num_combinations(

return combinations

@staticmethod
def _calc_expanded_dimensionality(d, interaction_only, degree):
if degree == 2:
return (d ** 2 + d) // 2 - interaction_only * d
else:
return (d ** 3 + 3 * d ** 2 + 2 * d) // 6 - interaction_only * d ** 2

@property
def powers_(self):
check_is_fitted(self)
Expand Down Expand Up @@ -334,17 +341,48 @@ def transform(self, X):
if self._min_degree <= 1:
to_stack.append(X)
for deg in range(max(2, self._min_degree), self._max_degree + 1):
Xp_next = _csr_polynomial_expansion(
X.data, X.indices, X.indptr, X.shape[1], self.interaction_only, deg
# Count how many nonzero elements the expanded matrix will contain.
total_nnz = sum(
self._calc_expanded_dimensionality(
X.indptr[row_i + 1] - X.indptr[row_i],
self.interaction_only,
deg,
)
for row_i in range(X.indptr.shape[0] - 1)
)
expanded_d = self._calc_expanded_dimensionality(
X.shape[1], self.interaction_only, deg
)
if Xp_next is None:
if expanded_d == 0:
break
to_stack.append(Xp_next)
assert expanded_d > 0
index_t = np.int64 if expanded_d > np.iinfo(np.int32).max else np.int32
expanded_data = np.ndarray(shape=total_nnz, dtype=X.data.dtype)
expanded_indices = np.ndarray(shape=total_nnz, dtype=index_t)
expanded_indptr = np.ndarray(shape=X.indptr.shape[0], dtype=index_t)
_csr_polynomial_expansion(
X.data,
X.indices,
X.indptr,
X.shape[1],
expanded_data,
expanded_indices,
expanded_indptr,
self.interaction_only,
deg,
)
to_stack.append(
sparse.csr_matrix(
(expanded_data, expanded_indices, expanded_indptr),
shape=(X.indptr.shape[0] - 1, expanded_d),
dtype=X.dtype,
)
)
if len(to_stack) == 0:
# edge case: deal with empty matrix
XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
else:
XP = sparse.hstack(to_stack, format="csr")
XP = sparse.hstack(to_stack, format="csr", dtype=X.dtype)
elif sparse.isspmatrix_csc(X) and self._max_degree < 4:
return self.transform(X.tocsr()).tocsc()
elif sparse.isspmatrix(X):
Expand Down
26 changes: 26 additions & 0 deletions sklearn/preprocessing/tests/test_polynomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,3 +855,29 @@ def test_polynomial_features_deprecated_n_input_features():

with pytest.warns(FutureWarning, match=depr_msg):
PolynomialFeatures().fit(X).n_input_features_


def test_csr_polynomial_expansion_index_overflow():
N = 12
M = 120000
dtype = np.float32
x = sparse.csr_matrix(
(
np.arange(1, 5, dtype=np.int64),
(np.array([N - 1, N - 1, N, N]), np.array([M - 1, M, M - 1, M])),
),
shape=(N + 1, M + 1),
dtype=dtype,
copy=False,
)
pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
xinter = pf.fit_transform(x)
n_index, m_index = xinter.nonzero()

assert xinter.dtype == dtype
assert xinter.shape == (13, 7200180001)
assert_array_almost_equal(xinter.data, np.array([1, 2, 2, 3, 4, 12], dtype=dtype))
assert_array_almost_equal(n_index, np.array([11, 11, 11, 12, 12, 12]))
assert_array_almost_equal(
m_index, np.array([119999, 120000, 7200180000, 119999, 120000, 7200180000])
)
0