scikit-learn · niuk-a · Jul 8, 2021
diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx
@@ -9,17 +9,18 @@ from numpy cimport ndarray
 cimport numpy as np
 
 np.import_array()
-ctypedef np.int32_t INDEX_T
+ctypedef np.int32_t INDEX_A
+ctypedef fused INDEX_B:
+    np.int32_t
+    np.int64_t
+ctypedef np.int8_t FLAG_T
 
 ctypedef fused DATA_T:
     np.float32_t
     np.float64_t
-    np.int32_t
-    np.int64_t
 
-
-cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
-                                 INDEX_T interaction_only) nogil:
+cdef inline INDEX_B _deg2_column(INDEX_B d, INDEX_B i, INDEX_B j,
+                                 FLAG_T interaction_only) nogil:
     """Compute the index of the column for a degree 2 expansion
 
     d is the dimensionality of the input data, i and j are the indices
@@ -31,8 +32,8 @@ cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
         return d * i - (i**2 + i) / 2 + j
 
 
-cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
-                                 INDEX_T interaction_only) nogil:
+cdef inline INDEX_B _deg3_column(INDEX_B d, INDEX_B i, INDEX_B j, INDEX_B k,
+                                 FLAG_T interaction_only) nogil:
     """Compute the index of the column for a degree 3 expansion
 
     d is the dimensionality of the input data, i, j and k are the indices
@@ -49,10 +50,14 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
 
 
 def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
-                              ndarray[INDEX_T, ndim=1] indices,
-                              ndarray[INDEX_T, ndim=1] indptr,
-                              INDEX_T d, INDEX_T interaction_only,
-                              INDEX_T degree):
+                              ndarray[INDEX_A, ndim=1] indices,
+                              ndarray[INDEX_A, ndim=1] indptr,
+                              INDEX_A d,
+                              ndarray[DATA_T, ndim=1] result_data,
+                              ndarray[INDEX_B, ndim=1] result_indices,
+                              ndarray[INDEX_B, ndim=1] result_indptr,
+                              FLAG_T interaction_only,
+                              FLAG_T degree):
     """
     Perform a second-degree polynomial or interaction expansion on a scipy
     compressed sparse row (CSR) matrix. The method used only takes products of
@@ -74,6 +79,15 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
     d : int
         The dimensionality of the input CSR matrix.
 
+    result_data : nd-array
+         The output CSR matrix's "data" attribute
+
+    result_indices : nd-array
+         The output CSR matrix's "indices" attribute
+
+    result_indptr : nd-array
+        The output CSR matrix's "indptr" attribute
+
     interaction_only : int
         0 for a polynomial expansion, 1 for an interaction expansion.
 
@@ -86,44 +100,13 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
     Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
     """
 
-    assert degree in (2, 3)
-
-    if degree == 2:
-        expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
-    else:
-        expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
-                                      - interaction_only*d**2)
-    if expanded_dimensionality == 0:
-        return None
-    assert expanded_dimensionality > 0
-
-    cdef INDEX_T total_nnz = 0, row_i, nnz
-
-    # Count how many nonzero elements the expanded matrix will contain.
-    for row_i in range(indptr.shape[0]-1):
-        # nnz is the number of nonzero elements in this row.
-        nnz = indptr[row_i + 1] - indptr[row_i]
-        if degree == 2:
-            total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
-        else:
-            total_nnz += ((nnz ** 3 + 3 * nnz ** 2 + 2 * nnz) / 6
-                          - interaction_only * nnz ** 2)
-
     # Make the arrays that will form the CSR matrix of the expansion.
-    cdef ndarray[DATA_T, ndim=1] expanded_data = ndarray(
-        shape=total_nnz, dtype=data.dtype)
-    cdef ndarray[INDEX_T, ndim=1] expanded_indices = ndarray(
-        shape=total_nnz, dtype=indices.dtype)
-    cdef INDEX_T num_rows = indptr.shape[0] - 1
-    cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
-        shape=num_rows + 1, dtype=indptr.dtype)
-
-    cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
-                 i_ptr, j_ptr, k_ptr, num_cols_in_row,  \
-                 expanded_column
+    cdef INDEX_A row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr
+
+    cdef INDEX_B expanded_index=0, num_cols_in_row, col
 
     with nogil:
-        expanded_indptr[0] = indptr[0]
+        result_indptr[0] = indptr[0]
         for row_i in range(indptr.shape[0]-1):
             row_starts = indptr[row_i]
             row_ends = indptr[row_i + 1]
@@ -133,9 +116,9 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
                 for j_ptr in range(i_ptr + interaction_only, row_ends):
                     j = indices[j_ptr]
                     if degree == 2:
-                        col = _deg2_column(d, i, j, interaction_only)
-                        expanded_indices[expanded_index] = col
-                        expanded_data[expanded_index] = (
+                        col = _deg2_column[INDEX_B](d, i, j, interaction_only)
+                        result_indices[expanded_index] = col
+                        result_data[expanded_index] = (
                             data[i_ptr] * data[j_ptr])
                         expanded_index += 1
                         num_cols_in_row += 1
@@ -144,14 +127,11 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
                         for k_ptr in range(j_ptr + interaction_only,
                                             row_ends):
                             k = indices[k_ptr]
-                            col = _deg3_column(d, i, j, k, interaction_only)
-                            expanded_indices[expanded_index] = col
-                            expanded_data[expanded_index] = (
+                            col = _deg3_column[INDEX_B](d, i, j, k, interaction_only)
+                            result_indices[expanded_index] = col
+                            result_data[expanded_index] = (
                                 data[i_ptr] * data[j_ptr] * data[k_ptr])
                             expanded_index += 1
                             num_cols_in_row += 1
 
-            expanded_indptr[row_i+1] = expanded_indptr[row_i] + num_cols_in_row
-
-    return csr_matrix((expanded_data, expanded_indices, expanded_indptr),
-                      shape=(num_rows, expanded_dimensionality))
+            result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py
@@ -170,6 +170,13 @@ def _num_combinations(
 
         return combinations
 
+    @staticmethod
+    def _calc_expanded_dimensionality(d, interaction_only, degree):
+        if degree == 2:
+            return (d ** 2 + d) // 2 - interaction_only * d
+        else:
+            return (d ** 3 + 3 * d ** 2 + 2 * d) // 6 - interaction_only * d ** 2
+
     @property
     def powers_(self):
         check_is_fitted(self)
@@ -334,17 +341,48 @@ def transform(self, X):
             if self._min_degree <= 1:
                 to_stack.append(X)
             for deg in range(max(2, self._min_degree), self._max_degree + 1):
-                Xp_next = _csr_polynomial_expansion(
-                    X.data, X.indices, X.indptr, X.shape[1], self.interaction_only, deg
+                # Count how many nonzero elements the expanded matrix will contain.
+                total_nnz = sum(
+                    self._calc_expanded_dimensionality(
+                        X.indptr[row_i + 1] - X.indptr[row_i],
+                        self.interaction_only,
+                        deg,
+                    )
+                    for row_i in range(X.indptr.shape[0] - 1)
+                )
+                expanded_d = self._calc_expanded_dimensionality(
+                    X.shape[1], self.interaction_only, deg
                 )
-                if Xp_next is None:
+                if expanded_d == 0:
                     break
-                to_stack.append(Xp_next)
+                assert expanded_d > 0
+                index_t = np.int64 if expanded_d > np.iinfo(np.int32).max else np.int32
+                expanded_data = np.ndarray(shape=total_nnz, dtype=X.data.dtype)
+                expanded_indices = np.ndarray(shape=total_nnz, dtype=index_t)
+                expanded_indptr = np.ndarray(shape=X.indptr.shape[0], dtype=index_t)
+                _csr_polynomial_expansion(
+                    X.data,
+                    X.indices,
+                    X.indptr,
+                    X.shape[1],
+                    expanded_data,
+                    expanded_indices,
+                    expanded_indptr,
+                    self.interaction_only,
+                    deg,
+                )
+                to_stack.append(
+                    sparse.csr_matrix(
+                        (expanded_data, expanded_indices, expanded_indptr),
+                        shape=(X.indptr.shape[0] - 1, expanded_d),
+                        dtype=X.dtype,
+                    )
+                )
             if len(to_stack) == 0:
                 # edge case: deal with empty matrix
                 XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
             else:
-                XP = sparse.hstack(to_stack, format="csr")
+                XP = sparse.hstack(to_stack, format="csr", dtype=X.dtype)
         elif sparse.isspmatrix_csc(X) and self._max_degree < 4:
             return self.transform(X.tocsr()).tocsc()
         elif sparse.isspmatrix(X):

diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py
@@ -855,3 +855,29 @@ def test_polynomial_features_deprecated_n_input_features():
 
     with pytest.warns(FutureWarning, match=depr_msg):
         PolynomialFeatures().fit(X).n_input_features_
+
+
+def test_csr_polynomial_expansion_index_overflow():
+    N = 12
+    M = 120000
+    dtype = np.float32
+    x = sparse.csr_matrix(
+        (
+            np.arange(1, 5, dtype=np.int64),
+            (np.array([N - 1, N - 1, N, N]), np.array([M - 1, M, M - 1, M])),
+        ),
+        shape=(N + 1, M + 1),
+        dtype=dtype,
+        copy=False,
+    )
+    pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
+    xinter = pf.fit_transform(x)
+    n_index, m_index = xinter.nonzero()
+
+    assert xinter.dtype == dtype
+    assert xinter.shape == (13, 7200180001)
+    assert_array_almost_equal(xinter.data, np.array([1, 2, 2, 3, 4, 12], dtype=dtype))
+    assert_array_almost_equal(n_index, np.array([11, 11, 11, 12, 12, 12]))
+    assert_array_almost_equal(
+        m_index, np.array([119999, 120000, 7200180000, 119999, 120000, 7200180000])
+    )