 from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+
 from sklearn.grid_search import GridSearchCV
 from sklearn.pipeline import Pipeline
 from sklearn.svm import LinearSVC
@@ -172,8 +174,8 @@ def test_unicode_decode_error():
     ca = CountVectorizer(analyzer='char', ngram_range=(3, 6),
                          charset='ascii').build_analyzer()
     assert_raises(UnicodeDecodeError, ca, text_bytes)
-
-
+
+
 def test_char_ngram_analyzer():
     cnga = CountVectorizer(analyzer='char', strip_accents='unicode',
                            ngram_range=(3, 6)).build_analyzer()
@@ -242,6 +244,19 @@ def test_countvectorizer_custom_vocabulary_pipeline():
     assert_equal(set(pipe.named_steps['count'].vocabulary_),
                  set(what_we_like))
     assert_equal(X.shape[1], len(what_we_like))
+
+
+def test_countvectorizer_stop_words():
+    cv = CountVectorizer()
+    cv.set_params(stop_words='english')
+    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)
+    cv.set_params(stop_words='_bad_str_stop_')
+    assert_raises(ValueError, cv.get_stop_words)
+    cv.set_params(stop_words=u'_bad_unicode_stop_')
+    assert_raises(ValueError, cv.get_stop_words)
+    stoplist = ['some', 'other', 'words']
+    cv.set_params(stop_words=stoplist)
+    assert_equal(cv.get_stop_words(), stoplist)
 
 
 def test_countvectorizer_empty_vocabulary():
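
For context on what the new test pins down: get_stop_words() resolves stop_words='english' to the built-in ENGLISH_STOP_WORDS frozenset, passes a custom collection through unchanged, and rejects any other string with a ValueError. A minimal sketch of that behaviour (illustration only, not part of this diff):

    from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

    cv = CountVectorizer(stop_words='english')
    print(cv.get_stop_words() == ENGLISH_STOP_WORDS)   # True: resolved to the built-in frozenset

    cv.set_params(stop_words=['some', 'other', 'words'])
    print(cv.get_stop_words())                         # custom lists pass through unchanged

    cv.set_params(stop_words='_bad_str_stop_')
    try:
        cv.get_stop_words()                            # not a built-in stop list name
    except ValueError:
        print('unknown stop list name rejected')
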
@@ -393,6 +408,14 @@ def test_vectorizer():
     # test idf transform with unlearned idf vector
     t3 = TfidfTransformer(use_idf=True)
     assert_raises(ValueError, t3.transform, counts_train)
+
+    # test idf transform with incompatible n_features
+    X = [[1, 1, 5],
+         [1, 1, 0]]
+    t3.fit(X)
+    X_incompt = [[1, 3],
+                 [1, 3]]
+    assert_raises(ValueError, t3.transform, X_incompt)
 
     # L1-normalized term frequencies sum to one
     assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)
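
The idf vector learned in fit() has one weight per feature, so transform() must receive a matrix with the same number of columns. A short sketch mirroring the new assertion (illustration only, not part of the diff):

    from sklearn.feature_extraction.text import TfidfTransformer

    t = TfidfTransformer(use_idf=True)
    t.fit([[1, 1, 5], [1, 1, 0]])      # idf vector learned for 3 features
    try:
        t.transform([[1, 3], [1, 3]])  # only 2 features
    except ValueError:
        print('n_features mismatch rejected')
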
@@ -414,7 +437,32 @@ def test_vectorizer():
     # test transform on unfitted vectorizer with empty vocabulary
     v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
-
+
+    # ascii preprocessor?
+    v3.set_params(strip_accents='ascii', lowercase=False)
+    assert_equal(v3.build_preprocessor(), strip_accents_ascii)
+
+    # error on bad strip_accents param
+    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
+    assert_raises(ValueError, v3.build_preprocessor)
+
+    # error with bad analyzer type
+    v3.set_params(analyzer='_invalid_analyzer_type_')
+    assert_raises(ValueError, v3.build_analyzer)
+
+
+def test_tfidf_vectorizer_setters():
+    tv = TfidfVectorizer(norm='l2', use_idf=False,
+                         smooth_idf=False, sublinear_tf=False)
+    tv.norm = 'l1'
+    assert_equal(tv._tfidf.norm, 'l1')
+    tv.use_idf = True
+    assert_true(tv._tfidf.use_idf)
+    tv.smooth_idf = True
+    assert_true(tv._tfidf.smooth_idf)
+    tv.sublinear_tf = True
+    assert_true(tv._tfidf.sublinear_tf)
+
 
 def test_hashing_vectorizer():
     v = HashingVectorizer()
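
The setters exercised in test_tfidf_vectorizer_setters work because, per the behaviour this test asserts, TfidfVectorizer forwards these attributes to the TfidfTransformer it wraps (reachable through the private _tfidf attribute). For illustration only:

    from sklearn.feature_extraction.text import TfidfVectorizer

    tv = TfidfVectorizer(use_idf=False)
    tv.use_idf = True            # setter forwards to the wrapped transformer
    print(tv._tfidf.use_idf)     # True: reflected on the internal TfidfTransformer
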
@@ -456,8 +504,11 @@ def test_hashing_vectorizer():
 
 def test_feature_names():
     cv = CountVectorizer(max_df=0.5, min_df=1)
-    X = cv.fit_transform(ALL_FOOD_DOCS)
-
+
+    # test ValueError on unfitted/empty vocabulary
+    assert_raises(ValueError, cv.get_feature_names)
+
+    X = cv.fit_transform(ALL_FOOD_DOCS)
     n_samples, n_features = X.shape
     assert_equal(len(cv.vocabulary_), n_features)
 