10000 [WIP] FIX index overflow error in sparse matrix polynomial expansion … · niuk-a/scikit-learn@7eef7ad · GitHub
[go: up one dir, main page]

Skip to content

Commit 7eef7ad

Browse files
committed
[WIP] FIX index overflow error in sparse matrix polynomial expansion (bis)
Fixes scikit-learn#19676
1 parent 1bd007f commit 7eef7ad

File tree

3 files changed

+106
-62
lines changed

3 files changed

+106
-62
lines changed

sklearn/preprocessing/_csr_polynomial_expansion.pyx

Lines changed: 37 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,18 @@ from numpy cimport ndarray
99
cimport numpy as np
1010

1111
np.import_array()
12-
ctypedef np.int32_t INDEX_T
12+
ctypedef np.int32_t INDEX_A
13+
ctypedef fused INDEX_B:
14+
np.int32_t
15+
np.int64_t
16+
ctypedef np.int8_t FLAG_T
1317

1418
ctypedef fused DATA_T:
1519
np.float32_t
1620
np.float64_t
17-
np.int32_t
18-
np.int64_t
1921

20-
21-
cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
22-
INDEX_T interaction_only) nogil:
22+
cdef inline INDEX_B _deg2_column(INDEX_B d, INDEX_B i, INDEX_B j,
23+
FLAG_T interaction_only) nogil:
2324
"""Compute the index of the column for a degree 2 expansion
2425
2526
d is the dimensionality of the input data, i and j are the indices
@@ -31,8 +32,8 @@ cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
3132
return d * i - (i**2 + i) / 2 + j
3233

3334

34-
cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
35-
INDEX_T interaction_only) nogil:
35+
cdef inline INDEX_B _deg3_column(INDEX_B d, INDEX_B i, INDEX_B j, INDEX_B k,
36+
FLAG_T interaction_only) nogil:
3637
"""Compute the index of the column for a degree 3 expansion
3738
3839
d is the dimensionality of the input data, i, j and k are the indices
@@ -49,10 +50,14 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
4950

5051

5152
def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
52-
ndarray[INDEX_T, ndim=1] indices,
53-
ndarray[INDEX_T, ndim=1] indptr,
54-
INDEX_T d, INDEX_T interaction_only,
55-
INDEX_T degree):
53+
ndarray[INDEX_A, ndim=1] indices,
54+
ndarray[INDEX_A, ndim=1] indptr,
55+
INDEX_A d,
56+
ndarray[DATA_T, ndim=1] result_data,
57+
ndarray[INDEX_B, ndim=1] result_indices,
58+
ndarray[INDEX_B, ndim=1] result_indptr,
59+
FLAG_T interaction_only,
60+
FLAG_T degree):
5661
"""
5762
Perform a second-degree polynomial or interaction expansion on a scipy
5863
compressed sparse row (CSR) matrix. The method used only takes products of
@@ -74,6 +79,15 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
7479
d : int
7580
The dimensionality of the input CSR matrix.
7681
82+
result_data : nd-array
83+
The output CSR matrix's "data" attribute
84+
85+
result_indices : nd-array
86+
The output CSR matrix's "indices" attribute
87+
88+
result_indptr : nd-array
89+
The output CSR matrix's "indptr" attribute
90+
7791
interaction_only : int
7892
0 for a polynomial expansion, 1 for an interaction expansion.
7993
@@ -86,44 +100,13 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
86100
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
87101
"""
88102

89-
assert degree in (2, 3)
90-
91-
if degree == 2:
92-
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
93-
else:
94-
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
95-
- interaction_only*d**2)
96-
if expanded_dimensionality == 0:
97-
return None
98-
assert expanded_dimensionality > 0
99-
100-
cdef INDEX_T total_nnz = 0, row_i, nnz
101-
102-
# Count how many nonzero elements the expanded matrix will contain.
103-
for row_i in range(indptr.shape[0]-1):
104-
# nnz is the number of nonzero elements in this row.
105-
nnz = indptr[row_i + 1] - indptr[row_i]
106-
if degree == 2:
107-
total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
108-
else:
109-
total_nnz += ((nnz ** 3 + 3 * nnz ** 2 + 2 * nnz) / 6
110-
- interaction_only * nnz ** 2)
111-
112103
# Make the arrays that will form the CSR matrix of the expansion.
113-
cdef ndarray[DATA_T, ndim=1] expanded_data = ndarray(
114-
shape=total_nnz, dtype=data.dtype)
115-
cdef ndarray[INDEX_T, ndim=1] expanded_indices = ndarray(
116-
shape=total_nnz, dtype=indices.dtype)
117-
cdef INDEX_T num_rows = indptr.shape[0] - 1
118-
cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
119-
shape=num_rows + 1, dtype=indptr.dtype)
120-
121-
cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
122-
i_ptr, j_ptr, k_ptr, num_cols_in_row, \
123-
expanded_column
104+
cdef INDEX_A row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr
105+
106+
cdef INDEX_B expanded_index=0, num_cols_in_row, col
124107

125108
with nogil:
126-
expanded_indptr[0] = indptr[0]
109+
result_indptr[0] = indptr[0]
127110
for row_i in range(indptr.shape[0]-1):
128111
row_starts = indptr[row_i]
129112
row_ends = indptr[row_i + 1]
@@ -133,9 +116,9 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
133116
for j_ptr in range(i_ptr + interaction_only, row_ends):
134117
j = indices[j_ptr]
135118
if degree == 2:
136-
col = _deg2_column(d, i, j, interaction_only)
137-
expanded_indices[expanded_index] = col
138-
expanded_data[expanded_index] = (
119+
col = _deg2_column[INDEX_B](d, i, j, interaction_only)
120+
result_indices[expanded_index] = col
121+
result_data[expanded_index] = (
139122
data[i_ptr] * data[j_ptr])
140123
expanded_index += 1
141124
num_cols_in_row += 1
@@ -144,14 +127,11 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
144127
for k_ptr in range(j_ptr + interaction_only,
145128
row_ends):
146129
k = indices[k_ptr]
147-
col = _deg3_column(d, i, j, k, interaction_only)
148-
expanded_indices[expanded_index] = col
149-
expanded_data[expanded_index] = (
130+
col = _deg3_column[INDEX_B](d, i, j, k, interaction_only)
131+
result_indices[expanded_index] = col
132+
result_data[expanded_index] = (
150133
data[i_ptr] * data[j_ptr] * data[k_ptr])
151134
expanded_index += 1
152135
num_cols_in_row += 1
153136

154-
expanded_indptr[row_i+1] = expanded_indptr[row_i] + num_cols_in_row
155-
156-
return csr_matrix((expanded_data, expanded_indices, expanded_indptr),
157-
shape=(num_rows, expanded_dimensionality))
137+
result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row

sklearn/preprocessing/_polynomial.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,13 @@ def _num_combinations(
170170

171171
return combinations
172172

173+
@staticmethod
174+
def _calc_expanded_dimensionality(d, interaction_only, degree):
175+
if degree == 2:
176+
return (d ** 2 + d) // 2 - interaction_only * d
177+
else:
178+
return (d ** 3 + 3 * d ** 2 + 2 * d) // 6 - interaction_only * d ** 2
179+
173180
@property
174181
def powers_(self):
175182
check_is_fitted(self)
@@ -334,17 +341,48 @@ def transform(self, X):
334341
if self._min_degree <= 1:
335342
to_stack.append(X)
336343
for deg in range(max(2, self._min_degree), self._max_degree + 1):
337-
Xp_next = _csr_polynomial_expansion(
338-
X.data, X.indices, X.indptr, X.shape[1], self.interaction_only, deg
344+
# Count how many nonzero elements the expanded matrix will contain.
345+
total_nnz = sum(
346+
self._calc_expanded_dimensionality(
347+
X.indptr[row_i + 1] - X.indptr[row_i],
348+
self.interaction_only,
349+
deg,
350+
)
351+
for row_i in range(X.indptr.shape[0] - 1)
352+
)
353+
expanded_d = self._calc_expanded_dimensionality(
354+
X.shape[1], self.interaction_only, deg
339355
)
340-
if Xp_next is None:
356+
if expanded_d == 0:
341357
break
342-
to_stack.append(Xp_next)
358+
assert expanded_d > 0
359+
index_t = np.int64 if expanded_d > np.iinfo(np.int32).max else np.int32
360+
expanded_data = np.ndarray(shape=total_nnz, dtype=X.data.dtype)
361+
expanded_indices = np.ndarray(shape=total_nnz, dtype=index_t)
362+
expanded_indptr = np.ndarray(shape=X.indptr.shape[0], dtype=index_t)
363+
_csr_polynomial_expansion(
364+
X.data,
365+
X.indices,
366+
X.indptr,
367+
X.shape[1],
368+
expanded_data,
369+
expanded_indices,
370+
expanded_indptr,
371+
self.interaction_only,
372+
deg,
373+
)
374+
to_stack.append(
375+
sparse.csr_matrix(
376+
(expanded_data, expanded_indices, expanded_indptr),
377+
shape=(X.indptr.shape[0] - 1, expanded_d),
378+
dtype=X.dtype,
379+
)
380+
)
343381
if len(to_stack) == 0:
344382
# edge case: deal with empty matrix
345383
XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
346384
else:
347-
XP = sparse.hstack(to_stack, format="csr")
385+
XP = sparse.hstack(to_stack, format="csr", dtype=X.dtype)
348386
elif sparse.isspmatrix_csc(X) and self._max_degree < 4:
349387
return self.transform(X.tocsr()).tocsc()
350388
elif sparse.isspmatrix(X):

sklearn/preprocessing/tests/test_polynomial.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,3 +855,29 @@ def test_polynomial_features_deprecated_n_input_features():
855855

856856
with pytest.warns(FutureWarning, match=depr_msg):
857857
PolynomialFeatures().fit(X).n_input_features_
858+
859+
860+
def test_csr_polynomial_expansion_index_overflow():
861+
N = 12
862+
M = 120000
863+
dtype = np.float32
864+
x = sparse.csr_matrix(
865+
(
866+
np.arange(1, 5, dtype=np.int64),
867+
(np.array([N - 1, N - 1, N, N]), np.array([M - 1, M, M - 1, M])),
868+
),
869+
shape=(N + 1, M + 1),
870+
dtype=dtype,
871+
copy=False,
872+
)
873+
pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
874+
xinter = pf.fit_transform(x)
875+
n_index, m_index = xinter.nonzero()
876+
877+
assert xinter.dtype == dtype
878+
assert xinter.shape == (13, 7200180001)
879+
assert_array_almost_equal(xinter.data, np.array([1, 2, 2, 3, 4, 12], dtype=dtype))
880+
assert_array_almost_equal(n_index, np.array([11, 11, 11, 12, 12, 12]))
881+
assert_array_almost_equal(
882+
m_index, np.array([119999, 120000, 7200180000, 119999, 120000, 7200180000])
883+
)

0 commit comments

Comments
 (0)
0