From eb2237a441628da448bf23f3bc151fc6d3697739 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Sun, 9 Feb 2025 13:49:44 +0100
Subject: [PATCH 1/3] Use new helpers in the `replace` handler.

We also refactor that handler and extract the logic for each
exception being handled into a separate function.
---
 Python/codecs.c | 103 +++++++++++++++++++++++++++++-------------------
 1 file changed, 63 insertions(+), 40 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 6c9f8222079ec8..84913a201af598 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -774,50 +774,72 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc)
 }
 
 
-PyObject *PyCodec_ReplaceErrors(PyObject *exc)
+// --- handler: 'replace' -----------------------------------------------------
+
+static PyObject *
+_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
 {
     Py_ssize_t start, end, slen;
+    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
+                                  &start, &end, &slen, false) < 0) {
+        return NULL;
+    }
+    PyObject *res = PyUnicode_New(slen, '?');
+    if (res == NULL) {
+        return NULL;
+    }
+    assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
+    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
+    memset(outp, '?', sizeof(Py_UCS1) * slen);
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    return Py_BuildValue("(Nn)", res, end);
+}
 
-    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
-        if (_PyUnicodeError_GetParams(exc, NULL, NULL,
-                                      &start, &end, &slen, false) < 0) {
-            return NULL;
-        }
-        PyObject *res = PyUnicode_New(slen, '?');
-        if (res == NULL) {
-            return NULL;
-        }
-        assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
-        Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
-        memset(outp, '?', sizeof(Py_UCS1) * slen);
-        assert(_PyUnicode_CheckConsistency(res, 1));
-        return Py_BuildValue("(Nn)", res, end);
+
+static PyObject *
+_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
+{
+    Py_ssize_t end;
+    if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
+        return NULL;
     }
-    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
-        if (_PyUnicodeError_GetParams(exc, NULL, NULL,
-                                      NULL, &end, NULL, true) < 0) {
-            return NULL;
-        }
-        return Py_BuildValue("(Cn)",
-                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
-                             end);
+    // Note: Py_UNICODE_REPLACEMENT_CHARACTER < (2 ** 16) < INT_MAX
+    return Py_BuildValue("(Cn)", (int)Py_UNICODE_REPLACEMENT_CHARACTER, end);
+}
+
+
+static PyObject *
+_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
+{
+    Py_ssize_t start, end, slen;
+    if (_PyUnicodeError_GetParams(exc, NULL, NULL,
+                                  &start, &end, &slen, false) < 0) {
+        return NULL;
     }
-    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
-        if (_PyUnicodeError_GetParams(exc, NULL, NULL,
-                                      &start, &end, &slen, false) < 0) {
-            return NULL;
-        }
-        PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER);
-        if (res == NULL) {
-            return NULL;
-        }
-        assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
-        Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
-        for (Py_ssize_t i = 0; i < slen; ++i) {
-            outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
-        }
-        assert(_PyUnicode_CheckConsistency(res, 1));
-        return Py_BuildValue("(Nn)", res, end);
+    PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER);
+    if (res == NULL) {
+        return NULL;
+    }
+    assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
+    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
+    for (Py_ssize_t i = 0; i < slen; ++i) {
+        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
+    }
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    return Py_BuildValue("(Nn)", res, end);
+}
+
+
+PyObject *PyCodec_ReplaceErrors(PyObject *exc)
+{
+    if (_PyIsUnicodeEncodeError(exc)) {
+        return _PyCodec_ReplaceUnicodeEncodeError(exc);
+    }
+    else if (_PyIsUnicodeDecodeError(exc)) {
+        return _PyCodec_ReplaceUnicodeDecodeError(exc);
+    }
+    else if (_PyIsUnicodeTranslateError(exc)) {
+        return _PyCodec_ReplaceUnicodeTranslateError(exc);
     }
     else {
         wrong_exception_type(exc);
@@ -1413,7 +1435,8 @@ ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
 }
 
 
-static PyObject *replace_errors(PyObject *self, PyObject *exc)
+static inline PyObject *
+replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
 {
     return PyCodec_ReplaceErrors(exc);
 }

From aaa1e4a3bfce0337f7da192b49146fc9caf64ffb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Sun, 23 Feb 2025 11:21:37 +0100
Subject: [PATCH 2/3] PEP-7

---
 Python/codecs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 4d0941d48784d2..a8f3930c31291d 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -781,7 +781,8 @@ _PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
 {
     Py_ssize_t start, end, slen;
     if (_PyUnicodeError_GetParams(exc, NULL, NULL,
-                                  &start, &end, &slen, false) < 0) {
+                                  &start, &end, &slen, false) < 0)
+    {
         return NULL;
     }
     PyObject *res = PyUnicode_New(slen, '?');
@@ -813,7 +814,8 @@ _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
 {
     Py_ssize_t start, end, slen;
     if (_PyUnicodeError_GetParams(exc, NULL, NULL,
-                                  &start, &end, &slen, false) < 0) {
+                                  &start, &end, &slen, false) < 0)
+    {
         return NULL;
     }
     PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER);

From d78e395a0e61bec6f251211f5012498060057153 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Tue, 25 Feb 2025 11:31:08 +0100
Subject: [PATCH 3/3] create utility for creating a string with N copies of
 0xFFFD

---
 Python/codecs.c | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index a8f3930c31291d..b876b816f688a0 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -730,6 +730,27 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
 }
 
 
+/*
+ * Create a Unicode string containing 'count' copies of the official
+ * Unicode REPLACEMENT CHARACTER (0xFFFD).
+ */
+static PyObject *
+codec_handler_unicode_replacement_character(Py_ssize_t count)
+{
+    PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
+    if (res == NULL) {
+        return NULL;
+    }
+    assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
+    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
+    for (Py_ssize_t i = 0; i < count; ++i) {
+        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
+    }
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    return res;
+}
+
+
 // --- handler: 'strict' ------------------------------------------------------
 
 PyObject *PyCodec_StrictErrors(PyObject *exc)
@@ -804,8 +825,11 @@ _PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
     if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
         return NULL;
     }
-    // Note: Py_UNICODE_REPLACEMENT_CHARACTER < (2 ** 16) < INT_MAX
-    return Py_BuildValue("(Cn)", (int)Py_UNICODE_REPLACEMENT_CHARACTER, end);
+    PyObject *res = codec_handler_unicode_replacement_character(1);
+    if (res == NULL) {
+        return NULL;
+    }
+    return Py_BuildValue("(Nn)", res, end);
 }
 
 
@@ -818,16 +842,10 @@ _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
     {
         return NULL;
     }
-    PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER);
+    PyObject *res = codec_handler_unicode_replacement_character(slen);
     if (res == NULL) {
         return NULL;
     }
-    assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
-    Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
-    for (Py_ssize_t i = 0; i < slen; ++i) {
-        outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
-    }
-    assert(_PyUnicode_CheckConsistency(res, 1));
     return Py_BuildValue("(Nn)", res, end);
 }