FIX CountVectorizer does not check for lowercase in vocabulary (#19401) · scikit-learn/scikit-learn@769da3d · GitHub
[go: up one dir, main page]

Skip to content

Commit 769da3d

Browse files
authored
FIX CountVectorizer does not check for lowercase in vocabulary (#19401)
1 parent abd1597 commit 769da3d

File tree

2 files changed

+21
-0
lines changed

2 files changed

+21
-0
lines changed

sklearn/feature_extraction/tests/test_text.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,18 @@ def test_countvectorizer_custom_token_pattern_with_several_group():
378378
vectorizer.fit(corpus)
379379

380380

381+
def test_countvectorizer_uppercase_in_vocab():
    """Check that upper-case vocabulary entries trigger a UserWarning.

    A ``CountVectorizer`` built with ``lowercase=True`` and a user-supplied
    vocabulary containing upper-case characters is expected to emit a
    ``UserWarning`` carrying the message below when ``fit_transform`` runs.
    """
    # Every entry needs its own comma: adjacent string literals are silently
    # concatenated by Python ('Case' 'Vocabulary' -> 'CaseVocabulary').
    vocabulary = ['Sample', 'Upper', 'Case', 'Vocabulary']
    message = ("Upper case characters found in"
               " vocabulary while 'lowercase'"
               " is True. These entries will not"
               " be matched with any documents")

    vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary)
    # The vocabulary list doubles as the corpus handed to fit_transform.
    assert_warns_message(UserWarning, message,
                         vectorizer.fit_transform, vocabulary)
391+
392+
381393
def test_tf_idf_smoothing():
382394
X = [[1, 1, 1],
383395
[1, 1, 0],

sklearn/feature_extraction/text.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,15 @@ def _count_vocab(self, raw_documents, fixed_vocab):
11071107
j_indices = []
11081108
indptr = []
11091109

1110+
if self.lowercase:
1111+
for vocab in vocabulary:
1112+
if any(map(str.isupper, vocab)):
1113+
warnings.warn("Upper case characters found in"
1114+
" vocabulary while 'lowercase'"
1115+
" is True. These entries will not"
1116+
" be matched with any documents")
1117+
break
1118+
11101119
values = _make_int_array()
11111120
indptr.append(0)
11121121
for doc in raw_documents:

0 commit comments

Comments
 (0)
0