@@ -27,6 +27,7 @@ typedef struct {
27
27
const unsigned char mirrored ; /* true if mirrored in bidir mode */
28
28
const unsigned char east_asian_width ; /* index into
29
29
_PyUnicode_EastAsianWidth */
30
+ const unsigned char normalization_quick_check ; /* see is_normalized() */
30
31
} _PyUnicode_DatabaseRecord ;
31
32
32
33
typedef struct change_record {
@@ -722,7 +723,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
722
723
PyUnicode_Resize (& result , o - PyUnicode_AS_UNICODE (result ));
723
724
return result ;
724
725
}
725
-
726
+
727
+ /* Return 1 if the input is certainly normalized, 0 if it might not be. */
728
+ static int
729
+ is_normalized (PyObject * self , PyObject * input , int nfc , int k )
730
+ {
731
+ Py_UNICODE * i , * end ;
732
+ unsigned char prev_combining = 0 , quickcheck_mask ;
733
+
734
+ /* An older version of the database is requested, quickchecks must be
735
+ disabled. */
736
+ if (self && UCD_Check (self ))
737
+ return 0 ;
738
+
739
+ /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
740
+ as described in http://unicode.org/reports/tr15/#Annex8. */
741
+ quickcheck_mask = 3 << ((nfc ? 4 : 0 ) + (k ? 2 : 0 ));
742
+
743
+ i = PyUnicode_AS_UNICODE (input );
744
+ end = i + PyUnicode_GET_SIZE (input );
745
+ while (i < end ) {
746
+ const _PyUnicode_DatabaseRecord * record = _getrecord_ex (* i ++ );
747
+ unsigned char combining = record -> combining ;
748
+ unsigned char quickcheck = record -> normalization_quick_check ;
749
+
750
+ if (quickcheck & quickcheck_mask )
751
+ return 0 ; /* this string might need normalization */
752
+ if (combining && prev_combining > combining )
753
+ return 0 ; /* non-canonical sort order, not normalized */
754
+ prev_combining = combining ;
755
+ }
756
+ return 1 ; /* certainly normalized */
757
+ }
758
+
726
759
PyDoc_STRVAR (unicodedata_normalize__doc__ ,
727
760
"normalize(form, unistr)\n\
728
761
\n\
@@ -746,14 +779,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
746
779
return input ;
747
780
}
748
781
749
- if (strcmp (form , "NFC" ) == 0 )
782
+ if (strcmp (form , "NFC" ) == 0 ) {
783
+ if (is_normalized (self , input , 1 , 0 )) {
784
+ Py_INCREF (input );
785
+ return input ;
786
+ }
750
787
return nfc_nfkc (self , input , 0 );
751
- if (strcmp (form , "NFKC" ) == 0 )
788
+ }
789
+ if (strcmp (form , "NFKC" ) == 0 ) {
790
+ if (is_normalized (self , input , 1 , 1 )) {
791
+ Py_INCREF (input );
792
+ return input ;
793
+ }
752
794
return nfc_nfkc (self , input , 1 );
753
- if (strcmp (form , "NFD" ) == 0 )
795
+ }
796
+ if (strcmp (form , "NFD" ) == 0 ) {
797
+ if (is_normalized (self , input , 0 , 0 )) {
798
+ Py_INCREF (input );
799
+ return input ;
800
+ }
754
801
return nfd_nfkd (self , input , 0 );
755
- if (strcmp (form , "NFKD" ) == 0 )
802
+ }
803
+ if (strcmp (form , "NFKD" ) == 0 ) {
804
+ if (is_normalized (self , input , 0 , 1 )) {
805
+ Py_INCREF (input );
806
+ return input ;
807
+ }
756
808
return nfd_nfkd (self , input , 1 );
809
+ }
757
810
PyErr_SetString (PyExc_ValueError , "invalid normalization form" );
758
811
return NULL ;
759
812
}
0 commit comments