FIX unicode support in count vectorizer. Closes #1098. · seckcoder/scikit-learn@8b0219a · GitHub
[go: up one dir, main page]

Skip to content

Commit 8b0219a

Browse files
amueller authored and GaelVaroquaux committed
FIX unicode support in count vectorizer. Closes scikit-learn#1098.
1 parent d65fae7 commit 8b0219a

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

sklearn/feature_extraction/tests/test_text.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -580,6 +580,28 @@ def test_vectorizer_pipeline_grid_selection():
580580
assert_false(best_vectorizer.fixed_vocabulary)
581581

582582

583+
def test_count_vectorizer_unicode():
584+
# tests that the count vectorizer works with cyrillic.
585+
document = (u"\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
586+
u"\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
587+
u"\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
588+
u"\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
589+
u"\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
590+
u"\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
591+
u"\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
592+
u"\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
593+
u"\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
594+
u"\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
595+
u"\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
596+
u"\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
597+
u"\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
598+
u"\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
599+
u"\x8f.")
600+
vect = CountVectorizer(min_df=1)
601+
X = vect.fit_transform([document])
602+
assert_equal(X.shape, (1, 15))
603+
604+
583605
def test_tfidf_vectorizer_with_fixed_vocabulary():
584606
# non regression smoke test for inheritance issues
585607
vocabulary = ['pizza', 'celeri']

sklearn/feature_extraction/text.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -207,7 +207,7 @@ class CountVectorizer(BaseEstimator):
207207
def __init__(self, input='content', charset='utf-8',
208208
charset_error='strict', strip_accents=None,
209209
lowercase=True, preprocessor=None, tokenizer=None,
210-
stop_words=None, token_pattern=ur"\b\w\w+\b",
210+
stop_words=None, token_pattern=ur"(?u)\b\w\w+\b",
211211
ngram_range=(1, 1),
212212
min_n=None, max_n=None, analyzer='word',
213213
max_df=1.0, min_df=2, max_features=None,
@@ -830,7 +830,7 @@ class TfidfVectorizer(CountVectorizer):
830830
def __init__(self, input='content', charset='utf-8',
831831
charset_error='strict', strip_accents=None, lowercase=True,
832832
preprocessor=None, tokenizer=None, analyzer='word',
833-
stop_words=None, token_pattern=ur"\b\w\w+\b", min_n=None,
833+
stop_words=None, token_pattern=ur"(?u)\b\w\w+\b", min_n=None,
834834
max_n=None, ngram_range=(1, 1), max_df=1.0, min_df=2,
835835
max_features=None, vocabulary=None, binary=False, dtype=long,
836836
norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):

0 commit comments

Comments
 (0)
0