@@ -18,6 +18,7 @@
 from sklearn.pipeline import Pipeline
 from sklearn.svm import LinearSVC
 
+from sklearn.base import clone
 
 import numpy as np
 from nose import SkipTest
@@ -283,7 +284,8 @@ def test_countvectorizer_stop_words():
 
 def test_countvectorizer_empty_vocabulary():
     try:
-        CountVectorizer(vocabulary=[])
+        vect = CountVectorizer(vocabulary=[])
+        vect.fit(["foo"])
         assert False, "we shouldn't get here"
     except ValueError as e:
         assert_in("empty vocabulary", str(e).lower())
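This hunk captures the behaviour the rest of the diff tests for: passing an invalid vocabulary to the constructor no longer raises; validation is deferred until fit. A minimal sketch of the behaviour the updated test expects (variable names are illustrative, not part of the patch):

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(vocabulary=[])   # no error at construction time any more
try:
    vect.fit(["foo"])                   # the ValueError is raised here instead
except ValueError as err:
    assert "empty vocabulary" in str(err).lower()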
@@ -379,7 +381,6 @@ def test_vectorizer():
 
     # build a vectorizer v1 with the same vocabulary as the one fitted by v1
     v2 = CountVectorizer(vocabulary=v1.vocabulary_)
-
     # compare that the two vectorizer give the same output on the test sample
     for v in (v1, v2):
         counts_test = v.transform(test_data)
@@ -405,7 +406,6 @@ def test_vectorizer():
         assert_equal(counts_test[0, vocabulary["burger"]], 0)
         assert_equal(counts_test[0, vocabulary["beer"]], 0)
         assert_equal(counts_test[0, vocabulary["pizza"]], 0)
-
     # test tf-idf
     t1 = TfidfTransformer(norm='l1')
     tfidf = t1.fit(counts_train).transform(counts_train).toarray()
@@ -440,10 +440,10 @@ def test_vectorizer():
     # (equivalent to term count vectorizer + tfidf transformer)
     train_data = iter(ALL_FOOD_DOCS[:-1])
     tv = TfidfVectorizer(norm='l1')
-    assert_false(tv.fixed_vocabulary)
 
     tv.max_df = v1.max_df
     tfidf2 = tv.fit_transform(train_data).toarray()
+    assert_false(tv.fixed_vocabulary)
     assert_array_almost_equal(tfidf, tfidf2)
 
     # test the direct tfidf vectorizer with new data
@@ -824,7 +824,6 @@ def test_tfidf_vectorizer_with_fixed_vocabulary():
     # non regression smoke test for inheritance issues
     vocabulary = ['pizza', 'celeri']
     vect = TfidfVectorizer(vocabulary=vocabulary)
-    assert_true(vect.fixed_vocabulary)
     X_1 = vect.fit_transform(ALL_FOOD_DOCS)
     X_2 = vect.transform(ALL_FOOD_DOCS)
     assert_array_almost_equal(X_1.toarray(), X_2.toarray())
@@ -870,7 +869,8 @@ def test_pickling_transformer():
 
 def test_non_unique_vocab():
     vocab = ['a', 'b', 'c', 'a', 'a']
-    assert_raises(ValueError, CountVectorizer, vocabulary=vocab)
+    vect = CountVectorizer(vocabulary=vocab)
+    assert_raises(ValueError, vect.fit, [])
 
 
 def test_hashingvectorizer_nan_in_docs():
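The same deferred validation applies to duplicated vocabularies, as the updated test_non_unique_vocab above shows: the ValueError now comes from fit(), not from the constructor. A small sketch under that assumption (the exact error message is not shown in the diff):

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(vocabulary=['a', 'b', 'c', 'a', 'a'])  # duplicates accepted here
try:
    vect.fit([])                        # vocabulary is validated at fit time
except ValueError as err:
    print(err)                          # expected to point at the duplicated entries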
@@ -901,3 +901,11 @@ def test_tfidfvectorizer_export_idf():
     vect = TfidfVectorizer(use_idf=True)
     vect.fit(JUNK_FOOD_DOCS)
     assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)
+
+
+def test_vectorizer_vocab_clone():
+    vect_vocab = TfidfVectorizer(vocabulary=["the"])
+    vect_vocab_clone = clone(vect_vocab)
+    vect_vocab.fit(ALL_FOOD_DOCS)
+    vect_vocab_clone.fit(ALL_FOOD_DOCS)
+    assert_equal(vect_vocab_clone.vocabulary_, vect_vocab.vocabulary_)
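The new test_vectorizer_vocab_clone is why the clone import was added at the top of the diff: clone() copies constructor parameters, so a vectorizer built with a fixed vocabulary and its clone should end up with identical vocabulary_ mappings after fitting. A short usage sketch of the same idea (the corpus and variable names are illustrative stand-ins, not taken from the test suite):

from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the pizza", "the beer"]            # hypothetical corpus
original = TfidfVectorizer(vocabulary=["the"])
cloned = clone(original)                    # unfitted copy with the same parameters
original.fit(docs)
cloned.fit(docs)
assert cloned.vocabulary_ == original.vocabulary_   # both map 'the' to index 0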