scikit-learn
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
Lines changed: 25 additions & 0 deletions
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
Lines changed: 1 addition & 1 deletion
@@ -1140,3 +1140,28 @@ def test_vectorizer_stop_words_inconsistent():
     vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
     assert_warns_message(UserWarning, message, vec.fit_transform,
                          ['hello world'])
+
+
+@pytest.mark.parametrize("vec", [CountVectorizer()])
+def test_countvectorizer_sort_features_64bit_sparse_indices(vec):
+    """Check that ``_sort_features`` keeps 64-bit sparse index dtypes.
+
+    If a count vectorizer has to store >= 2**31 count values, the sparse
+    storage matrix has 64bit indices / indptrs.  This requires ~2*8*2**31
+    bytes of memory in practice, so we just test the method that would
+    hypothetically fail.
+    """
+    X = sparse.csr_matrix((5, 5), dtype=np.int64)
+
+    # force indices and indptr to int64.
+    INDEX_DTYPE = np.int64
+    X.indices = X.indices.astype(INDEX_DTYPE, copy=False)
+    X.indptr = X.indptr.astype(INDEX_DTYPE, copy=False)
+
+    vocabulary = {"scikit-learn": 0, "is": 1, "great!": 2}
+
+    # _sort_features is documented to return the reordered matrix, so check
+    # the returned object rather than relying on in-place mutation of X.
+    X_sorted = vec._sort_features(X, vocabulary)
+
+    assert_equal(INDEX_DTYPE, X_sorted.indices.dtype)
@@ -852,7 +852,7 @@ def _sort_features(self, X, vocabulary):
         Returns a reordered matrix and modifies the vocabulary in place
         """
         sorted_features = sorted(six.iteritems(vocabulary))
-        map_index = np.empty(len(sorted_features), dtype=np.int32)
+        map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)
         for new_val, (term, old_val) in enumerate(sorted_features):
             vocabulary[term] = new_val
             map_index[old_val] = new_val