[MRG] fixes an issue w/ large sparse matrix indices in CountVectorizer by gvacaliuc · Pull Request #11295 · scikit-learn/scikit-learn

Merged (9 commits, Jan 30, 2019)
8 changes: 8 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -51,6 +51,14 @@ Changelog
  combination with ``handle_unknown='ignore'``.
  :issue:`12881` by `Joris Van den Bossche`_.

:mod:`sklearn.feature_extraction.text`
......................................

- |Fix| Fixed a bug in :class:`feature_extraction.text.CountVectorizer` which
  would result in the sparse feature matrix having conflicting `indptr` and
  `indices` precisions under very large vocabularies. :issue:`11295` by
  :user:`Gabriel Vacaliuc <gvacaliuc>`.

.. _changes_0_20_2:

Version 0.20.2
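For context, the mismatch this changelog entry describes can be sketched on a toy matrix (an illustration only; the real trigger needs more than 2**31 non-zeros, and the snippet just mirrors what the private _sort_features helper used to do):

import numpy as np
import scipy.sparse as sparse

# a CSR matrix whose index arrays have been widened to int64, as
# _count_vocab does for very large vocabularies
X = sparse.csr_matrix((3, 3), dtype=np.int64)
X.indices = X.indices.astype(np.int64)
X.indptr = X.indptr.astype(np.int64)

# before the fix, the permutation map was always built as int32 ...
map_index = np.empty(3, dtype=np.int32)

# ... so mapping the column indices through it silently downcast
# X.indices to int32 while X.indptr stayed int64: the "conflicting
# precisions" named above
X.indices = map_index.take(X.indices, mode='clip')
print(X.indices.dtype, X.indptr.dtype)  # int32 int64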
32 changes: 31 additions & 1 deletion sklearn/feature_extraction/tests/test_text.py
@@ -36,7 +36,8 @@
     assert_warns_message, assert_raise_message,
     clean_warning_registry, ignore_warnings,
     SkipTest, assert_raises, assert_no_warnings,
-    fails_if_pypy, assert_allclose_dense_sparse)
+    fails_if_pypy, assert_allclose_dense_sparse,
+    skip_if_32bit)
 from collections import defaultdict
 from functools import partial
 import pickle
@@ -1144,6 +1145,35 @@ def test_vectorizer_stop_words_inconsistent():
['hello world'])


@skip_if_32bit
def test_countvectorizer_sort_features_64bit_sparse_indices():
    """
    Check that CountVectorizer._sort_features preserves the dtype of its
    sparse feature matrix.

    This test is skipped on 32-bit platforms; see
    https://github.com/scikit-learn/scikit-learn/pull/11295 for more details.
    """
    X = sparse.csr_matrix((5, 5), dtype=np.int64)

    # force indices and indptr to int64
    INDICES_DTYPE = np.int64
    X.indices = X.indices.astype(INDICES_DTYPE)
    X.indptr = X.indptr.astype(INDICES_DTYPE)

    vocabulary = {
        "scikit-learn": 0,
        "is": 1,
        "great!": 2
    }

    Xs = CountVectorizer()._sort_features(X, vocabulary)

    assert INDICES_DTYPE == Xs.indices.dtype


@fails_if_pypy
@pytest.mark.parametrize('Estimator',
                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
13 changes: 6 additions & 7 deletions sklearn/feature_extraction/text.py
@@ -31,6 +31,7 @@
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
 from ..utils.fixes import sp_version
+from ..utils import _IS_32BIT
 
 
 __all__ = ['HashingVectorizer',
@@ -871,7 +872,7 @@ def _sort_features(self, X, vocabulary):
         Returns a reordered matrix and modifies the vocabulary in place

@eric-wieser (Aug 10, 2018):
Unrelated, but: this seems to reorder X in place too.

@gvacaliuc (Contributor, Author):
Indeed it does -- I can update that when we decide what the desired behavior is.

(Member):
That's intentional, I think, to reduce memory usage; why would it be an issue?

@eric-wieser:
It's an issue only because the documentation doesn't tell me that's going to happen.

(Member):
> This seems to reorder X in place too.

Agreed, we should add a note about it, but in a separate PR.
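A toy check of the in-place behaviour discussed above (not part of the PR; it assumes _sort_features permutes X.indices through the map and returns the same object, which is how the method reads in this diff):

import numpy as np
import scipy.sparse as sparse
from sklearn.feature_extraction.text import CountVectorizer

X = sparse.csr_matrix(np.eye(2))
vocabulary = {'b': 0, 'a': 1}  # columns deliberately out of order

Xs = CountVectorizer()._sort_features(X, vocabulary)

print(Xs is X)     # True: the same matrix comes back, reordered in place
print(vocabulary)  # {'b': 1, 'a': 0}: the dict is also modified in place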

"""
sorted_features = sorted(vocabulary.items())
map_index = np.empty(len(sorted_features), dtype=np.int32)
map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)

@eric-wieser:
Indices should be of dtype np.intp.

@gvacaliuc (Contributor, Author):
Should we just add a check here that catches an improper conversion to int64 on 32-bit arch? Or should we never be mucking with the index dtype at all, and just throw when our np.intp would overflow?

    if indptr[-1] > 2147483648:  # = 2**31 - 1
        if sp_version >= (0, 14):
            indices_dtype = np.int64
        else:
            raise ValueError(('sparse CSR array has {} non-zero '
                              'elements and requires 64 bit indexing, '
                              ' which is unsupported with scipy {}. '
                              'Please upgrade to scipy >=0.14')
                             .format(indptr[-1], '.'.join(sp_version)))
    else:
        indices_dtype = np.int32

@eric-wieser:
I don't have enough background here -- could you link to the PR that added 64-bit indexing to scipy?

At any rate, it seems that indices_dtype should always be np.intp; if that's not possible, then you should pick between np.int32 (on old scipy) and np.intp (on fixed scipy).

@pv (Contributor, Aug 11, 2018):
The indices can be either int32 or int64; scipy.sparse doesn't care which (intp size does not matter). Both indices and indptr, however, need to be of the same dtype to avoid casts on each operation.

Of course, when you're constructing stuff manually, you'll also need to choose the dtype so that it can hold the items you are going to insert.
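A quick illustration of pv's point (a sketch, not from the PR): scipy.sparse tolerates either index width, but the two index arrays should agree, and the constructor may downcast small inputs on its own, which is why the test above sets the attributes directly:

import numpy as np
import scipy.sparse as sparse

X = sparse.csr_matrix((2, 3))
# widen both index arrays together so they share a dtype and no
# per-operation casting is needed
X.indices = X.indices.astype(np.int64)
X.indptr = X.indptr.astype(np.int64)
print(X.indices.dtype, X.indptr.dtype)  # int64 int64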

         for new_val, (term, old_val) in enumerate(sorted_features):
             vocabulary[term] = new_val
             map_index[old_val] = new_val
@@ -961,14 +962,12 @@ def _count_vocab(self, raw_documents, fixed_vocab):
" contain stop words")

if indptr[-1] > 2147483648: # = 2**31 - 1
if sp_version >= (0, 14):
indices_dtype = np.int64
else:
if _IS_32BIT:
raise ValueError(('sparse CSR array has {} non-zero '
'elements and requires 64 bit indexing, '
' which is unsupported with scipy {}. '
'Please upgrade to scipy >=0.14')
.format(indptr[-1], '.'.join(sp_version)))
'which is unsupported with 32 bit Python.')
.format(indptr[-1]))
indices_dtype = np.int64

else:
indices_dtype = np.int32
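Read in isolation, the new branch amounts to the following (a sketch: IS_32BIT approximates sklearn.utils._IS_32BIT with a sys.maxsize check, and pick_indices_dtype is a hypothetical helper, not scikit-learn API):

import sys
import numpy as np

# a 32-bit Python has sys.maxsize == 2**31 - 1, so this approximates
# the platform check imported above
IS_32BIT = sys.maxsize <= 2**31 - 1

def pick_indices_dtype(nnz):
    # more non-zeros than int32 can index -> need 64-bit indices
    if nnz > 2147483648:
        if IS_32BIT:
            raise ValueError('sparse CSR array has {} non-zero elements '
                             'and requires 64 bit indexing, which is '
                             'unsupported with 32 bit Python.'.format(nnz))
        return np.int64
    return np.int32

print(pick_indices_dtype(100))    # <class 'numpy.int32'>
print(pick_indices_dtype(2**32))  # int64 on 64-bit; raises on 32-bit Python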