From a92b8a70e238b1c038dd8a73cca6096baa9f5bf4 Mon Sep 17 00:00:00 2001 From: Jianling Zhong Date: Thu, 2 Apr 2020 16:35:10 -0700 Subject: [PATCH] [MRG] FIX index overflow error in sparse matrix polynomial expansion --- sklearn/preprocessing/_csr_polynomial_expansion.pyx | 6 ++++-- sklearn/preprocessing/_data.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index dd36f8321410f..d00700f9d972f 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -8,7 +8,9 @@ from scipy.sparse import csr_matrix from numpy cimport ndarray cimport numpy as np -ctypedef np.int32_t INDEX_T +ctypedef fused INDEX_T: + np.int32_t + np.int64_t ctypedef fused DATA_T: np.float32_t @@ -119,7 +121,7 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data, cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \ i_ptr, j_ptr, k_ptr, num_cols_in_row, \ - expanded_column + expanded_column, col with nogil: expanded_indptr[0] = indptr[0] diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 33e2bac562489..848a073023099 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1560,8 +1560,12 @@ def transform(self, X): to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) to_stack.append(X) for deg in range(2, self.degree+1): - Xp_next = _csr_polynomial_expansion(X.data, X.indices, - X.indptr, X.shape[1], + # use np.int64 for index datatype to prevent overflow + # in case X has a large dimension + Xp_next = _csr_polynomial_expansion(X.data, + X.indices.astype(np.int64), + X.indptr.astype(np.int64), + np.int64(X.shape[1]), self.interaction_only, deg) if Xp_next is None: