File tree Expand file tree Collapse file tree 3 files changed +25
-3
lines changed Expand file tree Collapse file tree 3 files changed +25
-3
lines changed Original file line number Diff line number Diff line change @@ -255,6 +255,10 @@ Changelog
255255 removed in v0.24. :pr: `14520 ` by
256256 :user: `Guillem G. Subies <guillemgsubies> `.
257257
258+ - |Fix | :func: `feature_extraction.text.strip_accents_unicode ` now correctly
259+ removes accents from strings that are in NFKD normalized form. :pr: `15100 ` by
260+ :user: `Daniel Grady <DGrady> `.
261+
258262:mod: `sklearn.feature_selection `
259263................................
260264
Original file line number Diff line number Diff line change @@ -97,6 +97,21 @@ def test_strip_accents():
9797 expected = 'this is a test'
9898 assert strip_accents_unicode (a ) == expected
9999
100+ # strings that are already decomposed
101+ a = "o\u0308" # o with diaeresis
102+ expected = "o"
103+ assert strip_accents_unicode (a ) == expected
104+
105+ # combining marks by themselves
106+ a = "\u0300\u0301\u0302\u0303"
107+ expected = ""
108+ assert strip_accents_unicode (a ) == expected
109+
110+ # Multiple combining marks on one character
111+ a = "o\u0308\u0304"
112+ expected = "o"
113+ assert strip_accents_unicode (a ) == expected
114+
100115
101116def test_to_ascii ():
102117 # check some classical latin accentuated symbols
Original file line number Diff line number Diff line change @@ -129,10 +129,13 @@ def strip_accents_unicode(s):
129129 Remove accentuated char for any unicode symbol that has a direct
130130 ASCII equivalent.
131131 """
132- normalized = unicodedata .normalize ('NFKD' , s )
133- if normalized == s :
132+ try :
133+ # If `s` is ASCII-compatible, then it does not contain any accented
134+ # characters and we can avoid an expensive list comprehension
135+ s .encode ("ASCII" , errors = "strict" )
134136 return s
135- else :
137+ except UnicodeEncodeError :
138+ normalized = unicodedata .normalize ('NFKD' , s )
136139 return '' .join ([c for c in normalized if not unicodedata .combining (c )])
137140
138141
You can’t perform that action at this time.
0 commit comments