Update the newly added test code per @rth's suggestions

gvacaliuc · gvacaliuc · commit 9c0b8139cd22 · 2018-06-17T16:04:58.000-04:00
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -1099,26 +1099,25 @@ def test_vectorizers_invalid_ngram_range(vec):
             ValueError, message, vec.transform, ["good news everyone"])
 
 
-@pytest.mark.parametrize("vec", [CountVectorizer()])
 def test_countvectorizer_sort_features_64bit_sparse_indices(vec):
-    # If a count vectorizer has to store >= 2**31 count values, the sparse
-    # storage matrix has 64bit indices / indptrs.  This requires ~2*8*2**31
-    # bytes of memory in practice, so we just test the method that would
-    # hypothetically fail.
+    """
+    Check that CountVectorizer._sort_features preserves the dtype of its sparse
+    feature matrix.
+    """
 
     X = sparse.csr_matrix((5, 5), dtype=np.int64)
 
     # force indices and indptr to int64.
-    INDEX_DTYPE = np.int64
-    X.indices = X.indices.astype(INDEX_DTYPE, copy=False)
-    X.indptr = X.indptr.astype(INDEX_DTYPE, copy=False)
+    INDICES_DTYPE = np.int64
+    X.indices = X.indices.astype(INDICES_DTYPE)
+    X.indptr = X.indptr.astype(INDICES_DTYPE)
 
     vocabulary = {
             "scikit-learn": 0,
             "is": 1,
             "great!": 2
             }
 
-    vec._sort_features(X, vocabulary)
+    Xs = CountVectorizer()._sort_features(X, vocabulary)
 
-    assert_equal(INDEX_DTYPE, X.indices.dtype)
+    assert INDICES_DTYPE == Xs.indices.dtype