1
1
# Author: Lars Buitinck
2
2
# License: BSD 3 clause
3
3
4
+ import sys
4
5
import array
5
6
from cpython cimport array
6
7
cimport cython
@@ -9,6 +10,7 @@ cimport numpy as np
9
10
import numpy as np
10
11
11
12
from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
13
+ from sklearn.utils.fixes import sp_version
12
14
13
15
np.import_array()
14
16
@@ -33,12 +35,20 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
33
35
cdef array.array indices
34
36
cdef array.array indptr
35
37
indices = array.array(" i" )
36
- indptr = array.array(" i" , [0 ])
38
+ if sys.version_info >= (3 , 3 ):
39
+ indices_array_dtype = " q"
40
+ indices_np_dtype = np.longlong
41
+ else :
42
+ # On Windows with Python 2.7, a C long (array typecode 'l') is still only 32 bits wide.
43
+ indices_array_dtype = " l"
44
+ indices_np_dtype = np.int_
45
+
46
+ indptr = array.array(indices_array_dtype, [0 ])
37
47
38
48
# Since Python array does not understand Numpy dtypes, we grow the indices
39
49
# and values arrays ourselves. Use a Py_ssize_t capacity for safety.
40
50
cdef Py_ssize_t capacity = 8192 # arbitrary
41
- cdef np.int32_t size = 0
51
+ cdef np.int64_t size = 0
42
52
cdef np.ndarray values = np.empty(capacity, dtype = dtype)
43
53
44
54
for x in raw_X:
@@ -79,4 +89,18 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
79
89
indptr[len (indptr) - 1 ] = size
80
90
81
91
indices_a = np.frombuffer(indices, dtype = np.int32)
82
- return (indices_a, np.frombuffer(indptr, dtype = np.int32), values[:size])
92
+ indptr_a = np.frombuffer(indptr, dtype = indices_np_dtype)
93
+
94
+ if indptr[- 1 ] > 2147483648 : # = 2**31
95
+ if sp_version < (0 , 14 ):
96
+ raise ValueError ((' sparse CSR array has {} non-zero '
97
+ ' elements and requires 64 bit indexing, '
98
+ ' which is unsupported with scipy {}. '
99
+ ' Please upgrade to scipy >=0.14' )
100
+ .format(indptr[- 1 ], ' .' .join(sp_version)))
101
+ # both indices and indptr have the same dtype in CSR arrays
102
+ indices_a = indices_a.astype(np.int64)
103
+ else :
104
+ indptr_a = indptr_a.astype(np.int32)
105
+
106
+ return (indices_a, indptr_a, values[:size])
0 commit comments