8000 Extend changes to the HashingVectorizer · scikit-learn/scikit-learn@52c77cd · GitHub
[go: up one dir, main page]

Skip to content

Commit 52c77cd

Browse files
committed
Extend changes to the HashingVectorizer
1 parent 558f3ee commit 52c77cd

File tree

2 files changed

+26
-6
lines changed

2 files changed

+26
-6
lines changed

sklearn/feature_extraction/_hashing.pyx

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ cimport numpy as np
99
import numpy as np
1010

1111
from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
12+
from sklearn.utils.fixes import sp_version
1213

1314
np.import_array()
1415

@@ -33,12 +34,12 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
3334
cdef array.array indices
3435
cdef array.array indptr
3536
indices = array.array("i")
36-
indptr = array.array("i", [0])
37+
indptr = array.array("l", [0])
3738

3839
# Since Python array does not understand Numpy dtypes, we grow the indices
3940
# and values arrays ourselves. Use a Py_ssize_t capacity for safety.
4041
cdef Py_ssize_t capacity = 8192 # arbitrary
41-
cdef np.int32_t size = 0
42+
cdef np.intp_t size = 0
4243
cdef np.ndarray values = np.empty(capacity, dtype=dtype)
4344

4445
for x in raw_X:
@@ -79,4 +80,10 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
7980
indptr[len(indptr) - 1] = size
8081

8182
indices_a = np.frombuffer(indices, dtype=np.int32)
82-
return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size])
83+
indptr_a = np.frombuffer(indptr, dtype=np.int64)
84+
85+
if indptr[-1] > 2147483648: # = 2**31
86+
indices_a = indices_a.astype(np.int64)
87+
else:
88+
indptr_a = indptr_a.astype(np.int32)
89+
return (indices_a, indptr_a, values[:size])

sklearn/feature_extraction/text.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from .hashing import FeatureHasher
3131
from .stop_words import ENGLISH_STOP_WORDS
3232
from ..utils.validation import check_is_fitted
33+
from ..utils.fixes import sp_version
3334

3435
__all__ = ['CountVectorizer',
3536
'ENGLISH_STOP_WORDS',
@@ -762,7 +763,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
762763

763764
analyze = self.build_analyzer()
764765
j_indices = []
765-
# indptr stores indices into j_indices, which can be large
766+
# indptr can overflow in 32 bit, always use 64 bit
766767
indptr = _make_int_array(dtype='l')
767768
values = _make_int_array()
768769
indptr.append(0)
@@ -790,8 +791,20 @@ def _count_vocab(self, raw_documents, fixed_vocab):
790791
raise ValueError("empty vocabulary; perhaps the documents only"
791792
" contain stop words")
792793

793-
j_indices = np.asarray(j_indices, dtype=np.intc)
794-
indptr = np.frombuffer(indptr, dtype=np.int_)
794+
if indptr[-1] > 2147483648: # = 2**31
795+
if sp_version >= (0, 14):
796+
indices_dtype = np.int_
797+
else:
798+
raise ValueError(('sparse CSR array has {} non-zero '
799+
'elements and require 64 bit indexing, '
800+
'which is unsupported with scipy {}. '
801+
'Please upgrade to scipy >=0.14')
802+
.format(indptr[-1], '.'.join(sp_version)))
803+
804+
else:
805+
indices_dtype = np.intc
806+
j_indices = np.asarray(j_indices, dtype=indices_dtype)
807+
indptr = np.frombuffer(indptr, dtype=np.int_).astype(indices_dtype)
795808
values = np.frombuffer(values, dtype=np.intc)
796809

797810
X = sp.csr_matrix((values, j_indices, indptr),

0 commit comments

Comments
 (0)
0