FIX CountVectorizer does not check for lowercase in vocabulary (#19401)

zitorelova · web-flow · commit 769da3d51fee · 2021-02-12T10:03:39.000+01:00
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -378,6 +378,18 @@ def test_countvectorizer_custom_token_pattern_with_several_group():
         vectorizer.fit(corpus)
 
 
+def test_countvectorizer_uppercase_in_vocab():
+    # A capitalised vocabulary with lowercase=True must emit a UserWarning.
+    vocabulary = ['Sample', 'Upper', 'Case', 'Vocabulary']
+    message = ("Upper case characters found in vocabulary while"
+               " 'lowercase' is True. These entries will not be"
+               " matched with any documents")
+
+    vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary)
+    assert_warns_message(UserWarning, message,
+                         vectorizer.fit_transform, vocabulary)
+
+
 def test_tf_idf_smoothing():
     X = [[1, 1, 1],
          [1, 1, 0],
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -1107,6 +1107,15 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         j_indices = []
         indptr = []
 
+        if self.lowercase:
+            for vocab in vocabulary:
+                if any(map(str.isupper, vocab)):
+                    warnings.warn("Upper case characters found in"
+                                  " vocabulary while 'lowercase'"
+                                  " is True. These entries will not"
+                                  " be matched with any documents")
+                    break
+
         values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents: