gh-129569: The function unicodedata.normalize() always returns built-in str · python/cpython@c359fcd · GitHub
[go: up one dir, main page]

Skip to content

Commit c359fcd

Browse files
Hizuru3 and vstinner authored
gh-129569: The function unicodedata.normalize() always returns built-in str (#129570)
Co-authored-by: Victor Stinner <vstinner@python.org>
1 parent 9bf73c0 commit c359fcd

File tree

3 files changed

+29
-5
lines changed

3 files changed

+29
-5
lines changed

Lib/test/test_unicodedata.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,29 @@ def test_bug_834676(self):
467467
# Check for bug 834676
468468
unicodedata.normalize('NFC', '\ud55c\uae00')
469469

470+
def test_normalize_return_type(self):
471+
# gh-129569: normalize() return type must always be str
472+
normalize = unicodedata.normalize
473+
474+
class MyStr(str):
475+
pass
476+
477+
normalization_forms = ("NFC", "NFKC", "NFD", "NFKD")
478+
input_strings = (
479+
# normalized strings
480+
"",
481+
"ascii",
482+
# unnormalized strings
483+
"\u1e0b\u0323",
484+
"\u0071\u0307\u0323",
485+
)
486+
487+
for form in normalization_forms:
488+
for input_str in input_strings:
489+
with self.subTest(form=form, input_str=input_str):
490+
self.assertIs(type(normalize(form, input_str)), str)
491+
self.assertIs(type(normalize(form, MyStr(input_str))), str)
492+
470493

471494
if __name__ == "__main__":
472495
unittest.main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.

Modules/unicodedata.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -933,34 +933,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
933933
if (PyUnicode_GET_LENGTH(input) == 0) {
934934
/* Special case empty input strings, since resizing
935935
them later would cause internal errors. */
936-
return Py_NewRef(input);
936+
return PyUnicode_FromObject(input);
937937
}
938938

939939
if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
940940
if (is_normalized_quickcheck(self, input,
941941
true, false, true) == YES) {
942-
return Py_NewRef(input);
942+
return PyUnicode_FromObject(input);
943943
}
944944
return nfc_nfkc(self, input, 0);
945945
}
946946
if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
947947
if (is_normalized_quickcheck(self, input,
948948
true, true, true) == YES) {
949-
return Py_NewRef(input);
949+
return PyUnicode_FromObject(input);
950950
}
951951
return nfc_nfkc(self, input, 1);
952952
}
953953
if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
954954
if (is_normalized_quickcheck(self, input,
955955
false, false, true) == YES) {
956-
return Py_NewRef(input);
956+
return PyUnicode_FromObject(input);
957957
}
958958
return nfd_nfkd(self, input, 0);
959959
}
960960
if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
961961
if (is_normalized_quickcheck(self, input,
962962
false, true, true) == YES) {
963-
return Py_NewRef(input);
963+
return PyUnicode_FromObject(input);
964964
}
965965
return nfd_nfkd(self, input, 1);
966966
}

0 commit comments

Comments
 (0)
0