Merged revisions 72054 via svnmerge from · gnprice/cpython@7a0fedf · GitHub
[go: up one dir, main page]

Skip to content

Commit 7a0fedf

Browse files
committed
Merged revisions 72054 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines Issue #1734234: Massively speed up `unicodedata.normalize()` when the string is already in normalized form, by performing a quick check beforehand. Original patch by Rauli Ruohonen. ........
1 parent 57f3d93 commit 7a0fedf

File tree

5 files changed

+2056
-1746
lines changed

5 files changed

+2056
-1746
lines changed

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ Craig Rowland
616616
Paul Rubin
617617
Sam Ruby
618618
Audun S. Runde
619+
Rauli Ruohonen
619620
Jeff Rush
620621
Sam Rushing
621622
Mark Russell

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ Installation
9292
Library
9393
-------
9494

95+
- Issue #1734234: Massively speed up ``unicodedata.normalize()`` when the
96+
string is already in normalized form, by performing a quick check beforehand.
97+
Original patch by Rauli Ruohonen.
98+
9599
- Issue #5853: calling a function of the mimetypes module from several threads
96100
at once could hit the recursion limit if the mimetypes database hadn't been
97101
initialized before.

Modules/unicodedata.c

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ typedef struct {
2727
const unsigned char mirrored; /* true if mirrored in bidir mode */
2828
const unsigned char east_asian_width; /* index into
2929
_PyUnicode_EastAsianWidth */
30+
const unsigned char normalization_quick_check; /* see is_normalized() */
3031
} _PyUnicode_DatabaseRecord;
3132

3233
typedef struct change_record {
@@ -722,7 +723,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
722723
PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
723724
return result;
724725
}
725-
726+
727+
/* Quick pre-check used by unicodedata_normalize() to skip the full
   normalization pass when the input is already in the requested form.

   self  - module or UCD object; when it is non-NULL and UCD_Check(self)
           is true, no quick check is attempted and 0 is returned.
   input - the unicode object to inspect (it is only read).
   nfc   - nonzero for the composed forms (NFC/NFKC), zero for the
           decomposed forms (NFD/NFKD).
   k     - nonzero for the compatibility forms (NFKC/NFKD), zero for the
           canonical forms (NFC/NFD).

   Return 1 if the input is certainly normalized, 0 if it might not be.
   A result of 0 is not an error: it only means the caller has to run the
   real normalization. */
728+
static int
729+
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
730+
{
731+
Py_UNICODE *i, *end;
732+
unsigned char prev_combining = 0, quickcheck_mask;
733+
734+
/* When an older version of the database is requested, quickchecks must be
735+
disabled, so report "might not be normalized". */
736+
if (self && UCD_Check(self))
737+
return 0;
738+
739+
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
740+
as described in http://unicode.org/reports/tr15/#Annex8.
   The shift selects the two-bit field for the requested form:
   NFD = bits 0-1, NFKD = bits 2-3, NFC = bits 4-5, NFKC = bits 6-7. */
741+
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
742+
743+
i = PyUnicode_AS_UNICODE(input);
744+
end = i + PyUnicode_GET_SIZE(input);
745+
while (i < end) {
746+
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
747+
unsigned char combining = record->combining;
748+
unsigned char quickcheck = record->normalization_quick_check;
749+
750+
/* Any set bit in the selected field means Maybe or No. */
if (quickcheck & quickcheck_mask)
751+
return 0; /* this string might need normalization */
752+
/* A nonzero combining class lower than the previous character's
   combining class means the marks are out of order. */
if (combining && prev_combining > combining)
753+
return 0; /* non-canonical sort order, not normalized */
754+
prev_combining = combining;
755+
}
756+
return 1; /* certainly normalized */
757+
}
758+
726759
PyDoc_STRVAR(unicodedata_normalize__doc__,
727760
"normalize(form, unistr)\n\
728761
\n\
@@ -746,14 +779,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
746779
return input;
747780
}
748781

749-
if (strcmp(form, "NFC") == 0)
782+
if (strcmp(form, "NFC") == 0) {
783+
if (is_normalized(self, input, 1, 0)) {
784+
Py_INCREF(input);
785+
return input;
786+
}
750787
return nfc_nfkc(self, input, 0);
751-
if (strcmp(form, "NFKC") == 0)
788+
}
789+
if (strcmp(form, "NFKC") == 0) {
790+
if (is_normalized(self, input, 1, 1)) {
791+
Py_INCREF(input);
792+
return input;
793+
}
752794
return nfc_nfkc(self, input, 1);
753-
if (strcmp(form, "NFD") == 0)
795+
}
796+
if (strcmp(form, "NFD") == 0) {
797+
if (is_normalized(self, input, 0, 0)) {
798+
Py_INCREF(input);
799+
return input;
800+
}
754801
return nfd_nfkd(self, input, 0);
755-
if (strcmp(form, "NFKD") == 0)
802+
}
803+
if (strcmp(form, "NFKD") == 0) {
804+
if (is_normalized(self, input, 0, 1)) {
805+
Py_INCREF(input);
806+
return input;
807+
}
756808
return nfd_nfkd(self, input, 1);
809+
}
757810
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
758811
return NULL;
759812
}

0 commit comments

Comments
 (0)
0