FIX a bug in feature_extraction.text.strip_accents_unicode (#15100) · NeuroDataDesign/scikit-learn@aada3ea · GitHub
[go: up one dir, main page]

Skip to content

Commit aada3ea

Browse files
DGrady authored and rth committed
FIX a bug in feature_extraction.text.strip_accents_unicode (scikit-learn#15100)
1 parent 220e146 commit aada3ea

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

doc/whats_new/v0.22.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,10 @@ Changelog
255255
removed in v0.24. :pr:`14520` by
256256
:user:`Guillem G. Subies <guillemgsubies>`.
257257

258+
- |Fix| :func:`feature_extraction.text.strip_accents_unicode` now correctly
259+
removes accents from strings that are in NFKD normalized form. :pr:`15100` by
260+
:user:`Daniel Grady <DGrady>`.
261+
258262
:mod:`sklearn.feature_selection`
259263
................................
260264

sklearn/feature_extraction/tests/test_text.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,21 @@ def test_strip_accents():
9797
expected = 'this is a test'
9898
assert strip_accents_unicode(a) == expected
9999

100+
# strings that are already decomposed
101+
a = "o\u0308" # o with diaeresis
102+
expected = "o"
103+
assert strip_accents_unicode(a) == expected
104+
105+
# combining marks by themselves
106+
a = "\u0300\u0301\u0302\u0303"
107+
expected = ""
108+
assert strip_accents_unicode(a) == expected
109+
110+
# Multiple combining marks on one character
111+
a = "o\u0308\u0304"
112+
expected = "o"
113+
assert strip_accents_unicode(a) == expected
114+
100115

101116
def test_to_ascii():
102117
# check some classical latin accentuated symbols

sklearn/feature_extraction/text.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,13 @@ def strip_accents_unicode(s):
129129
Remove accentuated char for any unicode symbol that has a direct
130130
ASCII equivalent.
131131
"""
132-
normalized = unicodedata.normalize('NFKD', s)
133-
if normalized == s:
132+
try:
133+
# If `s` is ASCII-compatible, then it does not contain any accented
134+
# characters and we can avoid an expensive list comprehension
135+
s.encode("ASCII", errors="strict")
134136
return s
135-
else:
137+
except UnicodeEncodeError:
138+
normalized = unicodedata.normalize('NFKD', s)
136139
return ''.join([c for c in normalized if not unicodedata.combining(c)])
137140

138141

0 commit comments

Comments
 (0)
0