@@ -30,6 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted
+from ..utils.fixes import sp_version

 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',
@@ -762,7 +763,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):

         analyze = self.build_analyzer()
         j_indices = []
-        # indptr stores indices into j_indices, which can be large
+        # indptr can overflow in 32 bit, always use 64 bit
         indptr = _make_int_array(dtype='l')
         values = _make_int_array()
         indptr.append(0)
@@ -790,8 +791,20 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")

-        j_indices = np.asarray(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.int_)
+        if indptr[-1] > 2147483647:  # = 2**31 - 1
+            if sp_version >= (0, 14):
+                indices_dtype = np.int_
+            else:
+                raise ValueError(('sparse CSR array has {} non-zero '
+                                  'elements and requires 64 bit indexing, '
+                                  'which is unsupported with scipy {}. '
+                                  'Please upgrade to scipy >=0.14')
+                                 .format(indptr[-1],
+                                         '.'.join(map(str, sp_version))))
+        else:
+            indices_dtype = np.intc
+        j_indices = np.asarray(j_indices, dtype=indices_dtype)
+        indptr = np.frombuffer(indptr, dtype=np.int_).astype(indices_dtype)
         values = np.frombuffer(values, dtype=np.intc)

         X = sp.csr_matrix((values, j_indices, indptr),
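
For context, here is a minimal standalone sketch of the dtype selection the patch performs (not part of the diff; `choose_indices_dtype` is a hypothetical helper, not a scikit-learn function). CSR index arrays stored as 32-bit `np.intc` overflow once a matrix accumulates more than 2**31 - 1 stored elements, and SciPy only supports 64-bit index arrays from 0.14 onwards, hence the `sp_version` guard above.

```python
import numpy as np
import scipy.sparse as sp

def choose_indices_dtype(nnz):
    # Hypothetical helper mirroring the patch: 32-bit indices while they
    # fit, 64-bit (np.int_) once nnz exceeds 2**31 - 1 stored elements.
    return np.intc if nnz <= np.iinfo(np.intc).max else np.int_

# Tiny demo with 3 non-zeros (a real overflow needs > 2**31 - 1 of them).
dtype = choose_indices_dtype(3)
values = np.array([1, 2, 3], dtype=np.intc)
j_indices = np.array([0, 2, 1], dtype=dtype)
indptr = np.array([0, 2, 3], dtype=dtype)  # indptr[-1] == number of non-zeros
X = sp.csr_matrix((values, j_indices, indptr), shape=(2, 3))
print(X.toarray())  # [[1 0 2]
                    #  [0 3 0]]
```

Note that the patch accumulates `indptr` in `_make_int_array(dtype='l')` (a C `long`, 64-bit on most 64-bit Unix platforms) and only downcasts with `.astype(indices_dtype)` once the final count is known, so the overflow check itself reads an already-wide integer.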