gh-110289: C API: Add PyUnicode_EqualToUTF8() function by serhiy-storchaka · Pull Request #110297 · python/cpython · GitHub
Merged
Changes from 1 commit (of 19 commits).
Address some of review comments and test the UTF-8 cache.
serhiy-storchaka committed Oct 4, 2023
commit bdf2f1e27cdc42ec976a7a23b83f0aade13a56ad
6 changes: 3 additions & 3 deletions Doc/c-api/unicode.rst
@@ -1398,10 +1398,10 @@ They all return ``NULL`` or ``-1`` if an exception occurs.

.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string)

Compare a Unicode object with a UTF-8 encoded C string and return true
if they are equal and false otherwise.
Compare a Unicode object with a UTF-8 encoded C string and return true (``1``)
Member
Suggested change
Compare a Unicode object with a UTF-8 encoded C string and return true (``1``)
Compare a Unicode object with a UTF-8 encoded or ASCII encoding C string and return true (``1``)

Member Author

Maybe "ASCII encoded"?

if they are equal and false (``0``) otherwise.
If the Unicode object contains null or surrogate characters or
the C string is not encoded to UTF-8 return 0.
the C string is not encoded to UTF-8 return false.

This function does not raise exceptions.

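The documented semantics can be sketched as a rough pure-Python model (the function name and structure here are illustrative only; the real check is implemented in C and compares byte-by-byte without encoding the whole string):

```python
def equal_to_utf8_model(s: str, raw: bytes) -> int:
    # A NUL-terminated C string cannot contain an embedded null byte,
    # and lone surrogates have no strict UTF-8 encoding, so both
    # cases compare as "not equal" per the documentation above.
    if "\x00" in s:
        return 0
    try:
        encoded = s.encode("utf-8")  # strict mode rejects surrogates
    except UnicodeEncodeError:
        return 0
    return 1 if encoded == raw else 0

assert equal_to_utf8_model("abc", b"abc") == 1
assert equal_to_utf8_model("\u4f60\u597d", "\u4f60\u597d".encode()) == 1
assert equal_to_utf8_model("a\x00b", b"a\x00b") == 0  # embedded null
assert equal_to_utf8_model("\udcfe", b"\xfe") == 0    # lone surrogate
```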
28 changes: 18 additions & 10 deletions Lib/test/test_capi/test_unicode.py
@@ -1302,26 +1302,34 @@ def test_comparewithasciistring(self):
def test_equaltoutf8(self):
"""Test PyUnicode_EqualToUTF8()"""
from _testcapi import unicode_equaltoutf8 as equaltoutf8
from _testcapi import unicode_asutf8andsize as asutf8andsize

strings = [
'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
'\U0001f600\U0001f601\U0001f602'
]
for s in strings:
# Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
# encoded string cached in the Unicode object.
asutf8andsize(s, 0)
b = s.encode()
self.assertEqual(equaltoutf8(s, b), 1)
self.assertEqual(equaltoutf8(b.decode(), b), 1)
self.assertEqual(equaltoutf8(s, b), 1) # Use the UTF-8 cache.
s2 = b.decode() # New Unicode object without the UTF-8 cache.
self.assertEqual(equaltoutf8(s2, b), 1)
self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1)
self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0)
self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0)
self.assertEqual(equaltoutf8(s, b + b'x'), 0)
self.assertEqual(equaltoutf8(s, b[:-1]), 0)
self.assertEqual(equaltoutf8(s, b[:-1] + b'x'), 0)

# surrogateescape
self.assertEqual(equaltoutf8('\udcfe', b'\xfe'), 0)
# surrogatepass
self.assertEqual(equaltoutf8('\udcfe', b'\xed\xb3\xbe'), 0)
self.assertEqual(equaltoutf8(s2, b + b'x'), 0)
self.assertEqual(equaltoutf8(s2, b[:-1]), 0)
self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0)

# Surrogate characters are always treated as not equal
self.assertEqual(equaltoutf8('\udcfe',
'\udcfe'.encode("utf8", "surrogateescape")), 0)
self.assertEqual(equaltoutf8('\udcfe',
'\udcfe'.encode("utf8", "surrogatepass")), 0)
self.assertEqual(equaltoutf8('\ud801',
'\ud801'.encode("utf8", "surrogatepass")), 0)

# CRASHES equaltoutf8(b'abc', b'abc')
# CRASHES equaltoutf8([], b'abc')
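The surrogate cases above rely on how Python's codec error handlers serialize lone surrogates; a small standalone sketch (plain Python, independent of _testcapi) of the byte strings the test builds:

```python
# 'surrogateescape' maps U+DC80..U+DCFF back to the raw bytes
# 0x80..0xFF, while 'surrogatepass' emits the (invalid) three-byte
# UTF-8-style sequence for the surrogate code point.
assert "\udcfe".encode("utf8", "surrogateescape") == b"\xfe"
assert "\udcfe".encode("utf8", "surrogatepass") == b"\xed\xb3\xbe"

# Strict UTF-8 rejects lone surrogates outright, which is why
# PyUnicode_EqualToUTF8() treats them as never equal.
try:
    "\udcfe".encode("utf-8")
    raise AssertionError("expected UnicodeEncodeError")
except UnicodeEncodeError:
    pass
```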
31 changes: 18 additions & 13 deletions Objects/unicodeobject.c
@@ -10678,6 +10678,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
{
assert(_PyUnicode_CHECK(unicode));
assert(str);

if (PyUnicode_IS_ASCII(unicode)) {
size_t len = (size_t)PyUnicode_GET_LENGTH(unicode);
return strlen(str) == len &&
Member

I would prefer to test the length first, to make the code more readable.

Like:

if (strlen(str) != len) {
    return 0;
}
return memcmp(...) == 0;

Same below.

Member Author

It is the same in _PyUnicode_EqualToASCIIString().

How

if (!a) {
    return 0;
}
return b;

is more readable than a simple return a && b;? That is what the && operator is for.

Member

is more readable than a simple return a && b;?

For me, it's easier to reason about a single test per line when I review code.

Keep a && b if you prefer.

Contributor

The readability problem, as I see it, is that your && use has side effects; it is not a pure logic expression.

Member Author

For me, it's easier to reason about a single test per line when I review code.

Fortunately, every condition here is already on a separate line.

The readability problem, as I see it, is that your && use has side effects; it is not a pure logic expression.

It is how && works in C. There is a lot of code like arg != NULL and PyDict_Check(arg) && PyDict_GET_SIZE(arg) > count. I do not think rewriting it as three ifs with gotos would improve readability.

Expand All @@ -10689,49 +10690,53 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
}

Py_UCS4 ch;
const unsigned char *s = (const unsigned char *)str;
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
/* Compare Unicode string and UTF-8 string */
for (Py_ssize_t i = 0; i < len; i++) {
ch = PyUnicode_READ(kind, data, i);
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch == 0) {
return 0;
}
else if (ch < 0x80) {
if (ch != (unsigned char)*str++) {
if (s[0] != ch) {
return 0;
}
s += 1;
}
else if (ch < 0x800) {
if ((0xc0 | (ch >> 6)) != (unsigned char)*str++ ||
(0x80 | (ch & 0x3f)) != (unsigned char)*str++)
if (s[0] != (0xc0 | (ch >> 6)) ||
s[1] != (0x80 | (ch & 0x3f)))
{
return 0;
}
s += 2;
}
else if (ch < 0x10000) {
if (Py_UNICODE_IS_SURROGATE(ch) ||
(0xe0 | (ch >> 12)) != (unsigned char)*str++ ||
(0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ ||
(0x80 | (ch & 0x3f)) != (unsigned char)*str++)
s[0] != (0xe0 | (ch >> 12)) ||
s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
s[2] != (0x80 | (ch & 0x3f)))
{
return 0;
}
s += 3;
}
else {
assert(ch <= MAX_UNICODE);
if ((0xf0 | (ch >> 18)) != (unsigned char)*str++ ||
(0x80 | ((ch >> 12) & 0x3f)) != (unsigned char)*str++ ||
(0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ ||
(0x80 | (ch & 0x3f)) != (unsigned char)*str++)
if (s[0] != (0xf0 | (ch >> 18)) ||
s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
s[3] != (0x80 | (ch & 0x3f)))
{
return 0;
}
s += 4;
}
}
return *str == 0;
return *s == 0;
Contributor

I suppose that if we return true at this point then we know that str is the utf8 representation of unicode, does it make sense to copy the contents into unicode->utf8 so that future operations can fast-path without needing to encode again?

Member Author

It needs separate research and discussion. The disadvantage is that it increases memory consumption and takes some CPU time, so there is a benefit only if the UTF-8 cache is actually used later.

If the idea turns out to be good, it can simply be implemented in the future.

Contributor @davidhewitt (Oct 5, 2023)

Makes total sense. I guess this also sits in an awkward place: the user is probably best placed to know whether they want the UTF-8 cache populated, but the cache is also an implementation detail that we don't really want to expose. For now I'll just mark this comment as resolved. Edit: I can't; I probably lack the permissions.

Member

PyUnicode_EqualToUTF8() doesn't raise exceptions and cannot fail. Trying to allocate memory should not raise MemoryError, but it sounds like a non-trivial side effect.

Worst case: 1 GB string, you call PyUnicode_EqualToUTF8() and suddenly, Python allocates 1 GB more. I would be surprised by this behavior.

Member

Maybe it's worth it to add a comment explaining why we don't cache the UTF-8 encoded string.

}

int
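The per-code-point byte computation in the rewritten C loop can be mirrored in Python to see what each branch produces (a sketch for illustration; the helper name is made up, and the real C code compares bytes in place rather than building them):

```python
def utf8_bytes(ch: int) -> bytes:
    # Mirrors the branch structure of the C loop in
    # PyUnicode_EqualToUTF8(): 1 to 4 bytes per code point.
    if ch < 0x80:
        return bytes([ch])
    if ch < 0x800:
        return bytes([0xC0 | (ch >> 6),
                      0x80 | (ch & 0x3F)])
    if ch < 0x10000:
        return bytes([0xE0 | (ch >> 12),
                      0x80 | ((ch >> 6) & 0x3F),
                      0x80 | (ch & 0x3F)])
    return bytes([0xF0 | (ch >> 18),
                  0x80 | ((ch >> 12) & 0x3F),
                  0x80 | ((ch >> 6) & 0x3F),
                  0x80 | (ch & 0x3F)])

# Each branch agrees with Python's own UTF-8 encoder.
for c in "a\xa1\u4f60\U0001f600":
    assert utf8_bytes(ord(c)) == c.encode("utf-8")
```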