Update the added test code per @rth's suggestions

gvacaliuc · gvacaliuc · commit 59d12b821598 · 2018-08-08T23:44:48.000-04:00
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -1119,6 +1119,7 @@ def test_vectorizers_invalid_ngram_range(vec):
             ValueError, message, vec.transform, ["good news everyone"])
 
 
+<<<<<<< 359b5c0b72afd4e15039a0e16e2be1702b0d62c9
 def test_vectorizer_stop_words_inconsistent():
     if PY2:
         lstr = "[u'and', u'll', u've']"
@@ -1142,26 +1143,25 @@ def test_vectorizer_stop_words_inconsistent():
                          ['hello world'])
 
 
-@pytest.mark.parametrize("vec", [CountVectorizer()])
-def test_countvectorizer_sort_features_64bit_sparse_indices(vec):
-    # If a count vectorizer has to store >= 2**31 count values, the sparse
-    # storage matrix has 64bit indices / indptrs.  This requires ~2*8*2**31
-    # bytes of memory in practice, so we just test the method that would
-    # hypothetically fail.
+def test_countvectorizer_sort_features_64bit_sparse_indices():
+    """
+    Check that CountVectorizer._sort_features preserves the dtype of its sparse
+    feature matrix.
+    """
 
     X = sparse.csr_matrix((5, 5), dtype=np.int64)
 
     # force indices and indptr to int64.
-    INDEX_DTYPE = np.int64
-    X.indices = X.indices.astype(INDEX_DTYPE, copy=False)
-    X.indptr = X.indptr.astype(INDEX_DTYPE, copy=False)
+    INDICES_DTYPE = np.int64
+    X.indices = X.indices.astype(INDICES_DTYPE)
+    X.indptr = X.indptr.astype(INDICES_DTYPE)
 
     vocabulary = {
             "scikit-learn": 0,
             "is": 1,
             "great!": 2
             }
 
-    vec._sort_features(X, vocabulary)
+    Xs = CountVectorizer()._sort_features(X, vocabulary)
 
-    assert_equal(INDEX_DTYPE, X.indices.dtype)
+    assert INDICES_DTYPE == Xs.indices.dtype