diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index e3180c96546bc..ebe13cc0c240a 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -378,6 +378,18 @@ def test_countvectorizer_custom_token_pattern_with_several_group():
     vectorizer.fit(corpus)
 
 
+def test_countvectorizer_uppercase_in_vocab():
+    vocabulary = ['Sample', 'Upper', 'Case', 'Vocabulary']
+    message = ("Upper case characters found in"
+               " vocabulary while 'lowercase'"
+               " is True. These entries will not"
+               " be matched with any documents")
+
+    vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary)
+    assert_warns_message(UserWarning, message,
+                         vectorizer.fit_transform, vocabulary)
+
+
 def test_tf_idf_smoothing():
     X = [[1, 1, 1],
          [1, 1, 0],
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 7fd6303e52491..fad0e53ed31ca 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1107,6 +1107,15 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         j_indices = []
         indptr = []
 
+        if self.lowercase:
+            for vocab in vocabulary:
+                if any(map(str.isupper, vocab)):
+                    warnings.warn("Upper case characters found in"
+                                  " vocabulary while 'lowercase'"
+                                  " is True. These entries will not"
+                                  " be matched with any documents")
+                    break
+
         values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
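
A minimal usage sketch (not part of the patch) of how the new warning would surface once the diff above is applied; the corpus and vocabulary below are invented purely for illustration:

    import warnings
    from sklearn.feature_extraction.text import CountVectorizer

    # 'Brown' keeps an upper case character, so with lowercase=True the patched
    # _count_vocab emits the UserWarning and that entry never matches any document.
    vectorizer = CountVectorizer(lowercase=True, vocabulary=["the", "Brown", "fox"])

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        X = vectorizer.fit_transform(["the quick brown fox"])

    print([str(w.message) for w in caught])  # includes the "Upper case characters found in vocabulary..." warning
    print(X.toarray())                       # the 'Brown' column stays all zeros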