|
2 | 2 | from sklearn.feature_extraction.text import strip_tags
|
3 | 3 | from sklearn.feature_extraction.text import strip_accents_unicode
|
4 | 4 | from sklearn.feature_extraction.text import strip_accents_ascii
|
5 |
| -from sklearn.feature_extraction.text import _check_stop_list |
6 | 5 |
|
7 | 6 | from sklearn.feature_extraction.text import HashingVectorizer
|
8 | 7 | from sklearn.feature_extraction.text import CountVectorizer
|
@@ -175,14 +174,6 @@ def test_unicode_decode_error():
|
175 | 174 | ca = CountVectorizer(analyzer='char', ngram_range=(3, 6),
|
176 | 175 | charset='ascii').build_analyzer()
|
177 | 176 | assert_raises(UnicodeDecodeError, ca, text_bytes)
|
178 |
| - |
179 |
| - |
180 |
| -def test_check_stop_list(): |
181 |
| - assert_equal(_check_stop_list('english'), ENGLISH_STOP_WORDS) |
182 |
| - assert_raises(ValueError, _check_stop_list, 'bad_str_stop') |
183 |
| - assert_raises(ValueError, _check_stop_list, u'bad_unicode_stop') |
184 |
| - stoplist = ['some', 'other', 'words'] |
185 |
- assert_equal(_check_stop_list(stoplist), stoplist) |
186 | 177 |
|
187 | 178 |
|
188 | 179 | def test_char_ngram_analyzer():
|
@@ -253,6 +244,19 @@ def test_countvectorizer_custom_vocabulary_pipeline():
|
253 | 244 | assert_equal(set(pipe.named_steps['count'].vocabulary_),
|
254 | 245 | set(what_we_like))
|
255 | 246 | assert_equal(X.shape[1], len(what_we_like))
|
| 247 | + |
| 248 | + |
| 249 | +def test_countvectorizer_stop_words(): |
| 250 | + cv = CountVectorizer() |
| 251 | + cv.set_params(stop_words='english') |
| 252 | + assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS) |
| 253 | + cv.set_params(stop_words='_bad_str_stop_') |
| 254 | + assert_raises(ValueError, cv.get_stop_words) |
| 255 | + cv.set_params(stop_words=u'_bad_unicode_stop_') |
| 256 | + assert_raises(ValueError, cv.get_stop_words) |
| 257 | + stoplist = ['some', 'other', 'words'] |
| 258 | + cv.set_params(stop_words=stoplist) |
| 259 | + assert_equal(cv.get_stop_words(), stoplist) |
256 | 260 |
|
257 | 261 |
|
258 | 262 | def test_countvectorizer_empty_vocabulary():
|
|
0 commit comments