11# Author: Lars Buitinck
22# License: BSD 3 clause
33
4+ import sys
45import array
56from cpython cimport array
67cimport cython
@@ -9,6 +10,7 @@ cimport numpy as np
910import numpy as np
1011
1112from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
13+ from sklearn.utils.fixes import sp_version
1214
1315np.import_array()
1416
@@ -33,12 +35,20 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
3335 cdef array.array indices
3436 cdef array.array indptr
3537 indices = array.array(" i" )
36- indptr = array.array(" i" , [0 ])
38+ if sys.version_info >= (3 , 3 ):
39+ indices_array_dtype = " q"
40+ indices_np_dtype = np.longlong
41+ else :
42+ # On Windows with Python 2.7, a C long (array typecode "l") is still only 32 bits wide.
43+ indices_array_dtype = " l"
44+ indices_np_dtype = np.int_
45+
46+ indptr = array.array(indices_array_dtype, [0 ])
3747
3848 # Since Python array does not understand Numpy dtypes, we grow the indices
3949 # and values arrays ourselves. Use a Py_ssize_t capacity for safety.
4050 cdef Py_ssize_t capacity = 8192 # arbitrary
41- cdef np.int32_t size = 0
51+ cdef np.int64_t size = 0
4252 cdef np.ndarray values = np.empty(capacity, dtype = dtype)
4353
4454 for x in raw_X:
@@ -79,4 +89,18 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
7989 indptr[len (indptr) - 1 ] = size
8090
8191 indices_a = np.frombuffer(indices, dtype = np.int32)
82- return (indices_a, np.frombuffer(indptr, dtype = np.int32), values[:size])
92+ indptr_a = np.frombuffer(indptr, dtype = indices_np_dtype)
93+
94+ if indptr[- 1 ] > 2147483648 : # = 2**31
95+ if sp_version < (0 , 14 ):
96+ raise ValueError ((' sparse CSR array has {} non-zero '
97+ ' elements and requires 64 bit indexing, '
98+ ' which is unsupported with scipy {}. '
99+ ' Please upgrade to scipy >=0.14' )
100+ .format(indptr[- 1 ], ' .' .join(sp_version)))
101+ # both indices and indptr have the same dtype in CSR arrays
102+ indices_a = indices_a.astype(np.int64)
103+ else :
104+ indptr_a = indptr_a.astype(np.int32)
105+
106+ return (indices_a, indptr_a, values[:size])
0 commit comments