FIX stop words validation in text vectorizers with custom preprocessors / tokenizers · scikit-learn/scikit-learn@eb36c28 · GitHub

Commit eb36c28

rth authored and qinhanmin2014 committed

FIX stop words validation in text vectorizers with custom preprocessors / tokenizers (#12393)

1 parent 8622885 commit eb36c28

File tree

3 files changed: +75 -5 lines changed

doc/whats_new/v0.20.rst
sklearn/feature_extraction/tests/test_text.py
sklearn/feature_extraction/text.py

doc/whats_new/v0.20.rst

Lines changed: 8 additions & 0 deletions

@@ -85,6 +85,14 @@ Changelog
   where ``max_features`` was sometimes rounded down to zero.
   :issue:`12388` by :user:`Connor Tann <Connossor>`.
 
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |Fix| Fixed a regression in v0.20.0 where
+  :func:`feature_extraction.text.CountVectorizer` and other text vectorizers
+  could error during stop words validation with custom preprocessors
+  or tokenizers. :issue:`12393` by `Roman Yurchak`_.
+
 :mod:`sklearn.linear_model`
 ...........................
 
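
As a sketch of the failure mode this entry describes (not part of the diff; the dict-shaped corpus and the lambda preprocessor are illustrative, mirroring the new test below):

# Minimal sketch of the regression, assuming documents that are dicts
# rather than plain strings (illustrative only).
from sklearn.feature_extraction.text import CountVectorizer

docs = [{'text': 'some text and more text'}]
vec = CountVectorizer(preprocessor=lambda doc: doc['text'],
                      stop_words=['and'])

# In v0.20.0 fitting could raise here: stop word validation applied the
# custom preprocessor to each stop word, and 'and' is a str, not a dict.
# With this fix the failed check is caught and fitting proceeds.
X = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))  # ['more', 'some', 'text']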

sklearn/feature_extraction/tests/test_text.py

Lines changed: 42 additions & 0 deletions
@@ -1,4 +1,5 @@
 from __future__ import unicode_literals
+import re
 import warnings
 
 import pytest
@@ -1121,6 +1122,14 @@ def test_vectorizers_invalid_ngram_range(vec):
         ValueError, message, vec.transform, ["good news everyone"])
 
 
+def _check_stop_words_consistency(estimator):
+    stop_words = estimator.get_stop_words()
+    tokenize = estimator.build_tokenizer()
+    preprocess = estimator.build_preprocessor()
+    return estimator._check_stop_words_consistency(stop_words, preprocess,
+                                                   tokenize)
+
+
 @fails_if_pypy
 def test_vectorizer_stop_words_inconsistent():
     if PY2:
@@ -1135,11 +1144,44 @@ def test_vectorizer_stop_words_inconsistent():
     vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
     assert_warns_message(UserWarning, message, vec.fit_transform,
                          ['hello world'])
+    # reset stop word validation
+    del vec._stop_words_id
+    assert _check_stop_words_consistency(vec) is False
 
     # Only one warning per stop list
     assert_no_warnings(vec.fit_transform, ['hello world'])
+    assert _check_stop_words_consistency(vec) is None
 
     # Test caching of inconsistency assessment
     vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
     assert_warns_message(UserWarning, message, vec.fit_transform,
                          ['hello world'])
+
+
+@fails_if_pypy
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+def test_stop_word_validation_custom_preprocessor(Estimator):
+    data = [{'text': 'some text'}]
+
+    vec = Estimator()
+    assert _check_stop_words_consistency(vec) is True
+
+    vec = Estimator(preprocessor=lambda x: x['text'],
+                    stop_words=['and'])
+    assert _check_stop_words_consistency(vec) == 'error'
+    # checks are cached
+    assert _check_stop_words_consistency(vec) is None
+    vec.fit_transform(data)
+
+    class CustomEstimator(Estimator):
+        def build_preprocessor(self):
+            return lambda x: x['text']
+
+    vec = CustomEstimator(stop_words=['and'])
+    assert _check_stop_words_consistency(vec) == 'error'
+
+    vec = Estimator(tokenizer=lambda doc: re.compile(r'\w{1,}')
+                                            .findall(doc),
+                    stop_words=['and'])
+    assert _check_stop_words_consistency(vec) is True
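
A standalone sketch of the warning the first test exercises (not part of the diff; the stop list and corpus are illustrative). The default token pattern r"(?u)\b\w\w+\b" tokenizes "you've" into "you" and "ve", so "ve" is a generated token missing from the stop list:

# Sketch of the inconsistency warning and its per-stop-list caching.
import warnings
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(stop_words=["you've", "you"])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    vec.fit_transform(['hello world'])
print(caught[0].message)  # "Your stop_words may be inconsistent with ..."

# Fitting again emits nothing: the check is cached by id(stop_words),
# so each stop list is validated (and warned about) at most once.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    vec.fit_transform(['hello world'])
print(len(caught))  # 0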

sklearn/feature_extraction/text.py

Lines changed: 25 additions & 5 deletions
@@ -270,8 +270,22 @@ def get_stop_words(self):
         return _check_stop_list(self.stop_words)
 
     def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
+        """Check if stop words are consistent
+
+        Returns
+        -------
+        is_consistent : True if stop words are consistent with the preprocessor
+                        and tokenizer, False if they are not, None if the check
+                        was previously performed, "error" if it could not be
+                        performed (e.g. because of the use of a custom
+                        preprocessor / tokenizer)
+        """
+        if id(self.stop_words) == getattr(self, '_stop_words_id', None):
+            # Stop words were previously validated
+            return None
+
         # NB: stop_words is validated, unlike self.stop_words
-        if id(self.stop_words) != getattr(self, '_stop_words_id', None):
+        try:
             inconsistent = set()
             for w in stop_words or ():
                 tokens = list(tokenize(preprocess(w)))
@@ -281,10 +295,16 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
             self._stop_words_id = id(self.stop_words)
 
             if inconsistent:
-                warnings.warn('Your stop_words may be inconsistent with your '
-                              'preprocessing. Tokenizing the stop words '
-                              'generated tokens %r not in stop_words.' %
-                              sorted(inconsistent))
+                warnings.warn('Your stop_words may be inconsistent with '
+                              'your preprocessing. Tokenizing the stop '
+                              'words generated tokens %r not in '
+                              'stop_words.' % sorted(inconsistent))
+            return not inconsistent
+        except Exception:
+            # Failed to check stop words consistency (e.g. because a custom
+            # preprocessor or tokenizer was used)
+            self._stop_words_id = id(self.stop_words)
+            return 'error'
 
     def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""

0 commit comments