@@ -1119,6 +1119,7 @@ def test_vectorizers_invalid_ngram_range(vec):
1119
1119
ValueError , message , vec .transform , ["good news everyone" ])
1120
1120
1121
1121
1122
+ < << << << 359 b5c0b72afd4e15039a0e16e2be1702b0d62c9
1122
1123
def test_vectorizer_stop_words_inconsistent ():
1123
1124
if PY2 :
1124
1125
lstr = "[u'and', u'll', u've']"
@@ -1142,26 +1143,25 @@ def test_vectorizer_stop_words_inconsistent():
1142
1143
['hello world' ])
1143
1144
1144
1145
1145
- @pytest .mark .parametrize ("vec" , [CountVectorizer ()])
1146
- def test_countvectorizer_sort_features_64bit_sparse_indices (vec ):
1147
- # If a count vectorizer has to store >= 2**31 count values, the sparse
1148
- # storage matrix has 64bit indices / indptrs. This requires ~2*8*2**31
1149
- # bytes of memory in practice, so we just test the method that would
1150
- # hypothetically fail.
1146
+ def test_countvectorizer_sort_features_64bit_sparse_indices ():
1147
+ """
1148
+ Check that CountVectorizer._sort_features preserves the dtype of its sparse
1149
+ feature matrix.
1150
+ """
1151
1151
1152
1152
X = sparse .csr_matrix ((5 , 5 ), dtype = np .int64 )
1153
1153
1154
1154
# force indices and indptr to int64.
1155
- INDEX_DTYPE = np .int64
1156
- X .indices = X .indices .astype (INDEX_DTYPE , copy = False )
1157
- X .indptr = X .indptr .astype (INDEX_DTYPE , copy = False )
1155
+ INDICES_DTYPE = np .int64
1156
+ X .indices = X .indices .astype (INDICES_DTYPE )
1157
+ X .indptr = X .indptr .astype (INDICES_DTYPE )
1158
1158
1159
1159
vocabulary = {
1160
1160
"scikit-learn" : 0 ,
1161
1161
"is" : 1 ,
1162
1162
"great!" : 2
1163
1163
}
1164
1164
1165
- vec ._sort_features (X , vocabulary )
1165
+ Xs = CountVectorizer () ._sort_features (X , vocabulary )
1166
1166
1167
- assert_equal ( INDEX_DTYPE , X .indices .dtype )
1167
+ assert INDICES_DTYPE == Xs .indices .dtype
0 commit comments