8000 bpo-37966: Fully implement the UAX #15 quick-check algorithm. · gnprice/cpython@2a222da · GitHub
[go: up one dir, main page]

Skip to content

Commit 2a222da

Browse files
committed
bpo-37966: Fully implement the UAX python#15 quick-check algorithm.
The purpose of the `unicodedata.is_normalized` function is to answer the question `str == unicodedata.normalized(form, str)` more efficiently than writing just that, by using the "quick check" optimization described in the Unicode standard in UAX python#15. However, it turns out the code doesn't implement the full algorithm from the standard, and as a result we often miss the optimization and end up having to compute the whole normalized string after all. Implement the standard's algorithm. This greatly speeds up `unicodedata.is_normalized` in many cases where our partial variant of quick-check had been returning MAYBE and the standard algorithm returns NO. At a quick test on my desktop, the existing code takes about 4.4 ms/MB (so 4.4 ns per byte) when the partial quick-check returns MAYBE and it has to do the slow normalize-and-compare: $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 50 loops, best of 5: 4.39 msec per loop With this patch, it gets the answer instantly (58 ns) on the same 1 MB string: $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 5000000 loops, best of 5: 58.2 nsec per loop
1 parent 4025110 commit 2a222da

File tree

4 files changed

+47
-28
lines changed

4 files changed

+47
-28
lines changed

Doc/whatsnew/3.8.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,8 +1090,9 @@ unicodedata
10901090
<http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.
10911091

10921092
* New function :func:`~unicodedata.is_normalized` can be used to verify a string
1093-
is in a specific normal form. (Contributed by Max Belanger and David Euresti in
1094-
:issue:`32285`).
1093+
is in a specific normal form, often much faster than by actually normalizing
1094+
the string. (Contributed by Max Belanger, David Euresti, and Greg Price in
1095+
:issue:`32285` and :issue:`37966`).
10951096

10961097

10971098
unittest

Lib/test/test_unicodedata.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ def test_issue29456(self):
208208
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
209209
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
210210

211+
# For tests of unicodedata.is_normalized / self.db.is_normalized ,
212+
# see test_normalization.py .
211213

212214
def test_east_asian_width(self):
213215
eaw = self.db.east_asian_width
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The implementation of :func:`~unicodedata.is_normalized` has been greatly
2+
sped up on strings that aren't normalized, by implementing the full
3+
normalization-quick-check algorithm from the Unicode standard.

Modules/unicodedata.c

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -775,29 +775,37 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
775775
return result;
776776
}
777777

778-
typedef enum {YES, NO, MAYBE} NormalMode;
779-
780-
/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
781-
static NormalMode
782-
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
778+
// This needs to match the logic in makeunicodedata.py
779+
// which constructs the quickcheck data.
780+
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
781+
782+
/* Run the Unicode normalization "quickcheck" algorithm.
783+
*
784+
* Return YES or NO if quickcheck determines the input is certainly
785+
* normalized or certainly not, and MAYBE if quickcheck is unable to
786+
* tell. */
787+
static QuickcheckResult
788+
is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k)
783789
{
784-
Py_ssize_t i, len;
785-
int kind;
786-
void *data;
787-
unsigned char prev_combining = 0, quickcheck_mask;
790+
/* This is an implementation of the following algorithm:
791+
https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
792+
See there for background.
793+
*/
788794

789795
/* An older version of the database is requested, quickchecks must be
790796
disabled. */
791797
if (self && UCD_Check(self))
792798
return NO;
793799

794-
/* This is an implementation of the following algorithm:
795-
https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
796-
See there for background.
797-
*/
800+
Py_ssize_t i, len;
801+
int kind;
802+
void *data;
803+
unsigned char prev_combining = 0;
798804

799-
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No. */
800-
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
805+
/* The two quickcheck bits at this shift have type QuickcheckResult. */
806+
int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
807+
808+
QuickcheckResult result = YES; /* certainly normalized, unless we find something */
801809

802810
i = 0;
803811
kind = PyUnicode_KIND(input);
@@ -806,16 +814,21 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
806814
while (i < len) {
807815
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
808816
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
809-
unsigned char combining = record->combining;
810-
unsigned char quickcheck = record->normalization_quick_check;
811817

812-
if (quickcheck & quickcheck_mask)
813-
return MAYBE; /* this string might need normalization */
818+
unsigned char combining = record->combining;
814819
if (combining && prev_combining > combining)
815820
return NO; /* non-canonical sort order, not normalized */
816821
prev_combining = combining;
822+
823+
unsigned char quickcheck = record->normalization_quick_check;
824+
switch ((quickcheck >> quickcheck_shift) & 3) {
825+
case NO:
826+
return NO;
827+
case MAYBE:
828+
result = MAYBE; /* this string might need normalization */
829+
}
817830
}
818-
return YES; /* certainly normalized */
831+
return result;
819832
}
820833

821834
/*[clinic input]
@@ -848,7 +861,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
848861
PyObject *result;
849862
int nfc = 0;
850863
int k = 0;
851-
NormalMode m;
864+
QuickcheckResult m;
852865

853866
PyObject *cmp;
854867
int match = 0;
@@ -871,7 +884,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
871884
return NULL;
872885
}
873886

874-
m = is_normalized(self, input, nfc, k);
887+
m = is_normalized_quickcheck(self, input, nfc, k);
875888

876889
if (m == MAYBE) {
877890
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
@@ -917,28 +930,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
917930
}
918931

919932
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
920-
if (is_normalized(self, input, 1, 0) == YES) {
933+
if (is_normalized_quickcheck(self, input, 1, 0) == YES) {
921934
Py_INCREF(input);
922935
return input;
923936
}
924937
return nfc_nfkc(self, input, 0);
925938
}
926939
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
927-
if (is_normalized(self, input, 1, 1) == YES) {
940+
if (is_normalized_quickcheck(self, input, 1, 1) == YES) {
928941
Py_INCREF(input);
929942
return input;
930943
}
931944
return nfc_nfkc(self, input, 1);
932945
}
933946
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
934-
if (is_normalized(self, input, 0, 0) == YES) {
947+
if (is_normalized_quickcheck(self, input, 0, 0) == YES) {
935948
Py_INCREF(input);
936949
return input;
937950
}
938951
return nfd_nfkd(self, input, 0);
939952
}
940953
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
941-
if (is_normalized(self, input, 0, 1) == YES) {
954+
if (is_normalized_quickcheck(self, input, 0, 1) == YES) {
942955
Py_INCREF(input);
943956
return input;
944957
}

0 commit comments

Comments
 (0)
0