Merge remote-tracking branch 'upstream/main' into fix/c-api-unicode-error-get-start-123378
python · encukou · Dec 4, 2024 · Aug 26, 2024 · Aug 27, 2024 · Aug 27, 2024
commit 6caf5b66d045758211ad2c612a2a5e2fb73f9b65
@@ -3058,23 +3058,27 @@ UnicodeEncodeError_str(PyObject *self)
     PyObject *reason_str = NULL;
     PyObject *encoding_str = NULL;
 
-    if (!uself->object) {
+    if (exc->object == NULL) {
         /* Not properly initialized. */
-        return PyUnicode_FromString("");
+        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
     }
 
     /* Get reason and encoding as strings, which they might not be if
        they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
     if (reason_str == NULL) {
         goto done;
     }
-    encoding_str = PyObject_Str(uself->encoding);
+    encoding_str = PyObject_Str(exc->encoding);
     if (encoding_str == NULL) {
         goto done;
     }
-    if (unicode_error_is_single_bad_char(uself)) {
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
+    Py_ssize_t start = exc->start, end = exc->end;
+
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
         const char *fmt;
         if (badchar <= 0xff) {
             fmt = "'%U' codec can't encode character '\\x%02x' in position %zd: %U";
@@ -3086,12 +3090,19 @@ UnicodeEncodeError_str(PyObject *self)
             fmt = "'%U' codec can't encode character '\\U%08x' in position %zd: %U";
         }
         result = PyUnicode_FromFormat(
-            fmt, encoding_str, (int)badchar, uself->start, reason_str);
+            fmt,
+            encoding_str,
+            (int)badchar,
+            start,
+            reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "'%U' codec can't encode characters in position %zd-%zd: %U",
-            encoding_str, uself->start, uself->end - 1, reason_str);
+            encoding_str,
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);
@@ -3167,32 +3178,41 @@ UnicodeDecodeError_str(PyObject *self)
     PyObject *reason_str = NULL;
     PyObject *encoding_str = NULL;
 
-    if (!uself->object) {
+    if (exc->object == NULL) {
         /* Not properly initialized. */
-        return PyUnicode_FromString("");
+        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
     }
 
     /* Get reason and encoding as strings, which they might not be if
        they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
     if (reason_str == NULL) {
         goto done;
     }
-    encoding_str = PyObject_Str(uself->encoding);
+    encoding_str = PyObject_Str(exc->encoding);
     if (encoding_str == NULL) {
         goto done;
     }
 
-    if (unicode_error_is_single_bad_byte(uself)) {
-        int byte = (int)(PyBytes_AS_STRING(uself->object)[uself->start] & 0xff);
+    Py_ssize_t len = PyBytes_GET_SIZE(exc->object);
+    Py_ssize_t start = exc->start, end = exc->end;
+
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        int badbyte = (int)(PyBytes_AS_STRING(exc->object)[start] & 0xff);
         result = PyUnicode_FromFormat(
             "'%U' codec can't decode byte 0x%02x in position %zd: %U",
-            encoding_str, byte, uself->start, reason_str);
+            encoding_str,
+            badbyte,
+            start,
+            reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "'%U' codec can't decode bytes in position %zd-%zd: %U",
-            encoding_str, uself->start, uself->end - 1, reason_str);
+            encoding_str,
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);
@@ -3270,8 +3290,11 @@ UnicodeTranslateError_str(PyObject *self)
         goto done;
     }
 
-    if (unicode_error_is_single_bad_char(uself)) {
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
+    Py_ssize_t start = exc->start, end = exc->end;
+
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
         const char *fmt;
         if (badchar <= 0xff) {
             fmt = "can't translate character '\\x%02x' in position %zd: %U";
@@ -3282,12 +3305,18 @@ UnicodeTranslateError_str(PyObject *self)
         else {
             fmt = "can't translate character '\\U%08x' in position %zd: %U";
         }
-        result = PyUnicode_FromFormat(fmt, (int)badchar, uself->start, reason_str);
+        result = PyUnicode_FromFormat(
+            fmt,
+            (int)badchar,
+            start,
+            reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "can't translate characters in position %zd-%zd: %U",
-            uself->start, uself->end - 1, reason_str);
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);