From eb2237a441628da448bf23f3bc151fc6d3697739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 13:49:44 +0100 Subject: [PATCH 1/3] Use new helpers in the `replace` handler. We also refactor that handler and extract the logic for each exception being handled into separate functions. --- Python/codecs.c | 103 +++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 6c9f8222079ec8..84913a201af598 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -774,50 +774,72 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc) } -PyObject *PyCodec_ReplaceErrors(PyObject *exc) +// --- handler: 'replace' ----------------------------------------------------- + +static PyObject * +_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc) { Py_ssize_t start, end, slen; + if (_PyUnicodeError_GetParams(exc, NULL, NULL, + &start, &end, &slen, false) < 0) { + return NULL; + } + PyObject *res = PyUnicode_New(slen, '?'); + if (res == NULL) { + return NULL; + } + assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); + Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + memset(outp, '?', sizeof(Py_UCS1) * slen); + assert(_PyUnicode_CheckConsistency(res, 1)); + return Py_BuildValue("(Nn)", res, end); +} - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (_PyUnicodeError_GetParams(exc, NULL, NULL, - &start, &end, &slen, false) < 0) { - return NULL; - } - PyObject *res = PyUnicode_New(slen, '?'); - if (res == NULL) { - return NULL; - } - assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); - Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); - memset(outp, '?', sizeof(Py_UCS1) * slen); - assert(_PyUnicode_CheckConsistency(res, 1)); - return Py_BuildValue("(Nn)", res, end); + +static PyObject * +_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc) +{ + Py_ssize_t end; + if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) 
{ + return NULL; } - else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (_PyUnicodeError_GetParams(exc, NULL, NULL, - NULL, &end, NULL, true) < 0) { - return NULL; - } - return Py_BuildValue("(Cn)", - (int)Py_UNICODE_REPLACEMENT_CHARACTER, - end); + // Note: Py_UNICODE_REPLACEMENT_CHARACTER < (2 ** 16) < INT_MAX + return Py_BuildValue("(Cn)", (int)Py_UNICODE_REPLACEMENT_CHARACTER, end); +} + + +static PyObject * +_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) +{ + Py_ssize_t start, end, slen; + if (_PyUnicodeError_GetParams(exc, NULL, NULL, + &start, &end, &slen, false) < 0) { + return NULL; } - else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - if (_PyUnicodeError_GetParams(exc, NULL, NULL, - &start, &end, &slen, false) < 0) { - return NULL; - } - PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER); - if (res == NULL) { - return NULL; - } - assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); - Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); - for (Py_ssize_t i = 0; i < slen; ++i) { - outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; - } - assert(_PyUnicode_CheckConsistency(res, 1)); - return Py_BuildValue("(Nn)", res, end); + PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER); + if (res == NULL) { + return NULL; + } + assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); + Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); + for (Py_ssize_t i = 0; i < slen; ++i) { + outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; + } + assert(_PyUnicode_CheckConsistency(res, 1)); + return Py_BuildValue("(Nn)", res, end); +} + + +PyObject *PyCodec_ReplaceErrors(PyObject *exc) +{ + if (_PyIsUnicodeEncodeError(exc)) { + return _PyCodec_ReplaceUnicodeEncodeError(exc); + } + else if (_PyIsUnicodeDecodeError(exc)) { + return _PyCodec_ReplaceUnicodeDecodeError(exc); + } + else if (_PyIsUnicodeTranslateError(exc)) { + return _PyCodec_ReplaceUnicodeTranslateError(exc); } else { 
wrong_exception_type(exc); @@ -1413,7 +1435,8 @@ ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc) } -static PyObject *replace_errors(PyObject *self, PyObject *exc) +static inline PyObject * +replace_errors(PyObject *Py_UNUSED(self), PyObject *exc) { return PyCodec_ReplaceErrors(exc); } From aaa1e4a3bfce0337f7da192b49146fc9caf64ffb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 23 Feb 2025 11:21:37 +0100 Subject: [PATCH 2/3] PEP-7 --- Python/codecs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 4d0941d48784d2..a8f3930c31291d 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -781,7 +781,8 @@ _PyCodec_ReplaceUnicodeEncodeError(PyObject *exc) { Py_ssize_t start, end, slen; if (_PyUnicodeError_GetParams(exc, NULL, NULL, - &start, &end, &slen, false) < 0) { + &start, &end, &slen, false) < 0) + { return NULL; } PyObject *res = PyUnicode_New(slen, '?'); @@ -813,7 +814,8 @@ _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) { Py_ssize_t start, end, slen; if (_PyUnicodeError_GetParams(exc, NULL, NULL, - &start, &end, &slen, false) < 0) { + &start, &end, &slen, false) < 0) + { return NULL; } PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER); From d78e395a0e61bec6f251211f5012498060057153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 25 Feb 2025 11:31:08 +0100 Subject: [PATCH 3/3] create utility for creating a string with N copies of 0xFFFD --- Python/codecs.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index a8f3930c31291d..b876b816f688a0 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -730,6 +730,27 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) } +/* + * Create a Unicode string containing 'count' copies of the 
official + * Unicode REPLACEMENT CHARACTER (0xFFFD). + */ +static PyObject * +codec_handler_unicode_replacement_character(Py_ssize_t count) +{ + PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER); + if (res == NULL) { + return NULL; + } + assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); + Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); + for (Py_ssize_t i = 0; i < count; ++i) { + outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; + } + assert(_PyUnicode_CheckConsistency(res, 1)); + return res; +} + + // --- handler: 'strict' ------------------------------------------------------ PyObject *PyCodec_StrictErrors(PyObject *exc) @@ -804,8 +825,11 @@ _PyCodec_ReplaceUnicodeDecodeError(PyObject *exc) if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { return NULL; } - // Note: Py_UNICODE_REPLACEMENT_CHARACTER < (2 ** 16) < INT_MAX - return Py_BuildValue("(Cn)", (int)Py_UNICODE_REPLACEMENT_CHARACTER, end); + PyObject *res = codec_handler_unicode_replacement_character(1); + if (res == NULL) { + return NULL; + } + return Py_BuildValue("(Nn)", res, end); } @@ -818,16 +842,10 @@ _PyCodec_ReplaceUnicodeTranslateError(PyObject *exc) { return NULL; } - PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER); + PyObject *res = codec_handler_unicode_replacement_character(slen); if (res == NULL) { return NULL; } - assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); - Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res); - for (Py_ssize_t i = 0; i < slen; ++i) { - outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; - } - assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); }