Commit 0e97a7f
Merge pull request scikit-learn#1660 from rlmv/fe_tests
test cases for feature_extraction.text
2 parents: a54809e + c8e8614

2 files changed: +63 −12 lines

sklearn/feature_extraction/tests/test_text.py

56 additions, 5 deletions

```diff
@@ -8,6 +8,8 @@
 from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+
 from sklearn.grid_search import GridSearchCV
 from sklearn.pipeline import Pipeline
 from sklearn.svm import LinearSVC
@@ -172,8 +174,8 @@ def test_unicode_decode_error():
     ca = CountVectorizer(analyzer='char', ngram_range=(3, 6),
                          charset='ascii').build_analyzer()
     assert_raises(UnicodeDecodeError, ca, text_bytes)
-
-
+
+
 def test_char_ngram_analyzer():
     cnga = CountVectorizer(analyzer='char', strip_accents='unicode',
                            ngram_range=(3, 6)).build_analyzer()
@@ -242,6 +244,19 @@ def test_countvectorizer_custom_vocabulary_pipeline():
     assert_equal(set(pipe.named_steps['count'].vocabulary_),
                  set(what_we_like))
     assert_equal(X.shape[1], len(what_we_like))
+
+
+def test_countvectorizer_stop_words():
+    cv = CountVectorizer()
+    cv.set_params(stop_words='english')
+    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)
+    cv.set_params(stop_words='_bad_str_stop_')
+    assert_raises(ValueError, cv.get_stop_words)
+    cv.set_params(stop_words=u'_bad_unicode_stop_')
+    assert_raises(ValueError, cv.get_stop_words)
+    stoplist = ['some', 'other', 'words']
+    cv.set_params(stop_words=stoplist)
+    assert_equal(cv.get_stop_words(), stoplist)
 
 
 def test_countvectorizer_empty_vocabulary():
```
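For context, a minimal sketch of the stop-word resolution this new test pins down, written against the API of this commit's era (per the docstrings below, a string is handed to `_check_stop_list`, which `get_stop_words` invokes lazily):

```python
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

cv = CountVectorizer(stop_words='english')
# The 'english' shortcut resolves to the frozenset exported by
# sklearn.feature_extraction.text.
assert cv.get_stop_words() == ENGLISH_STOP_WORDS

# Any other string is rejected only when the stop list is resolved,
# not at construction time; hence the test's lazy assert_raises calls.
cv.set_params(stop_words='_bad_str_stop_')
try:
    cv.get_stop_words()
except ValueError:
    pass  # unknown stop-list name

# A user-supplied list is passed through unchanged.
cv.set_params(stop_words=['some', 'other', 'words'])
assert cv.get_stop_words() == ['some', 'other', 'words']
```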
```diff
@@ -393,6 +408,14 @@ def test_vectorizer():
     # test idf transform with unlearned idf vector
     t3 = TfidfTransformer(use_idf=True)
     assert_raises(ValueError, t3.transform, counts_train)
+
+    # test idf transform with incompatible n_features
+    X = [[1, 1, 5],
+         [1, 1, 0]]
+    t3.fit(X)
+    X_incompt = [[1, 3],
+                 [1, 3]]
+    assert_raises(ValueError, t3.transform, X_incompt)
 
     # L1-normalized term frequencies sum to one
     assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)
```
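The shape check behind the new assertion: fitting learns one idf weight per input column, so a later transform with a different column count cannot be aligned with the stored idf vector. A standalone sketch (any small matrices work; these mirror the test's):

```python
from sklearn.feature_extraction.text import TfidfTransformer

t = TfidfTransformer(use_idf=True)
t.fit([[1, 1, 5],
       [1, 1, 0]])          # learns an idf vector for 3 features

try:
    t.transform([[1, 3],
                 [1, 3]])   # 2 features: the 3-feature idf cannot apply
except ValueError:
    print('transform rejected input with an incompatible n_features')
```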
```diff
@@ -414,7 +437,32 @@ def test_vectorizer():
     # test transform on unfitted vectorizer with empty vocabulary
     v3 = CountVectorizer(vocabulary=None)
     assert_raises(ValueError, v3.transform, train_data)
-
+
+    # ascii preprocessor?
+    v3.set_params(strip_accents='ascii', lowercase=False)
+    assert_equal(v3.build_preprocessor(), strip_accents_ascii)
+
+    # error on bad strip_accents param
+    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
+    assert_raises(ValueError, v3.build_preprocessor)
+
+    # error with bad analyzer type
+    v3.set_params = '_invalid_analyzer_type_'
+    assert_raises(ValueError, v3.build_analyzer)
+
+
+def test_tfidf_vectorizer_setters():
+    tv = TfidfVectorizer(norm='l2', use_idf=False,
+                         smooth_idf=False, sublinear_tf=False)
+    tv.norm = 'l1'
+    assert_equal(tv._tfidf.norm, 'l1')
+    tv.use_idf = True
+    assert_true(tv._tfidf.use_idf)
+    tv.smooth_idf = True
+    assert_true(tv._tfidf.smooth_idf)
+    tv.sublinear_tf = True
+    assert_true(tv._tfidf.sublinear_tf)
+
 
 def test_hashing_vectorizer():
     v = HashingVectorizer()
```
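(Note in passing that `v3.set_params = '_invalid_analyzer_type_'` assigns over the method rather than calling it; the following `build_analyzer` still raises, since `strip_accents` was left invalid by the previous step.)

Context for the setter test: at this point in scikit-learn's history, TfidfVectorizer wraps an internal TfidfTransformer stored as `_tfidf`, and exposes `norm`, `use_idf`, `smooth_idf` and `sublinear_tf` as properties whose setters write through to that inner object. Roughly (a sketch against the era's API, not the library source):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(norm='l2', use_idf=False,
                     smooth_idf=False, sublinear_tf=False)

# Assigning to the vectorizer's attribute updates the wrapped
# transformer as well, so the two objects cannot drift apart.
tv.norm = 'l1'
tv.use_idf = True
assert tv._tfidf.norm == 'l1'
assert tv._tfidf.use_idf is True
```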
```diff
@@ -456,8 +504,11 @@ def test_hashing_vectorizer():
 
 def test_feature_names():
     cv = CountVectorizer(max_df=0.5, min_df=1)
-    X = cv.fit_transform(ALL_FOOD_DOCS)
-
+
+    # test for Value error on unfitted/empty vocabulary
+    assert_raises(ValueError, cv.get_feature_names)
+
+    X = cv.fit_transform(ALL_FOOD_DOCS)
     n_samples, n_features = X.shape
     assert_equal(len(cv.vocabulary_), n_features)
 
```
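The behaviour pinned by this last hunk, in isolation (using the era's `get_feature_names`, later renamed `get_feature_names_out`):

```python
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
try:
    cv.get_feature_names()        # no vocabulary_ attribute yet
except ValueError:
    print('unfitted vectorizer raises ValueError, not AttributeError')

cv.fit(['the quick brown fox', 'the lazy dog'])
print(cv.get_feature_names())     # terms of the learned vocabulary
```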

sklearn/feature_extraction/text.py

7 additions, 7 deletions

```diff
@@ -168,7 +168,7 @@ def build_preprocessor(self):
 
         # unfortunately python functools package does not have an efficient
         # `compose` function that would have allowed us to chain a dynamic
-        # number of functions. However the however of a lambda call is a few
+        # number of functions. However the cost of a lambda call is a few
         # hundreds of nanoseconds which is negligible when compared to the
         # cost of tokenizing a string of 1000 chars for instance.
         noop = lambda x: x
```
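The comment being corrected refers to composing preprocessing steps out of small functions. A generic illustration of that kind of lambda chaining (not scikit-learn code, just the pattern the comment describes):

```python
from functools import reduce

def compose(*funcs):
    # Chain funcs left to right; each step adds one cheap lambda call.
    return reduce(lambda f, g: lambda x: g(f(x)), funcs, lambda x: x)

preprocess = compose(str.strip, str.lower)
assert preprocess('  Accented TEXT ') == 'accented text'
```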
```diff
@@ -319,8 +319,8 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
 
     stop_words: string {'english'}, list, or None (default)
         If a string, it is passed to _check_stop_list and the appropriate stop
-        list is returned is currently the only
-        supported string value.
+        list is returned. 'english' is currently the only supported string
+        value.
 
         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
@@ -500,8 +500,8 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
 
     stop_words : string {'english'}, list, or None (default)
         If a string, it is passed to _check_stop_list and the appropriate stop
-        list is returned is currently the only
-        supported string value.
+        list is returned. 'english' is currently the only supported string
+        value.
 
         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
@@ -994,8 +994,8 @@ class TfidfVectorizer(CountVectorizer):
 
     stop_words : string {'english'}, list, or None (default)
         If a string, it is passed to _check_stop_list and the appropriate stop
-        list is returned is currently the only
-        supported string value.
+        list is returned. 'english' is currently the only supported string
+        value.
 
         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
```
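To make the corrected wording concrete, here is how the two main accepted forms of `stop_words` behave in practice (a sketch using the era's `get_feature_names`; exact output depends on the built-in list's contents):

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = ['this is the first document', 'and this is the second one']

# 'english' is the only supported string shortcut.
cv = CountVectorizer(stop_words='english').fit(docs)
print(cv.get_feature_names())   # built-in English stop words are dropped

# A list supplies an explicit stop list; None disables filtering.
cv = CountVectorizer(stop_words=['first', 'second']).fit(docs)
print(cv.get_feature_names())   # only 'first' and 'second' are dropped
```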
