From 3c4da2e470e51eb8dec58e21f4cabe8c1032cb93 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 14 May 2024 23:17:51 +0200 Subject: [PATCH 01/15] gh-119182: Add PyUnicodeWriter C API --- Include/cpython/unicodeobject.h | 41 +++++++- Modules/_testcapi/unicode.c | 165 +++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 167 +++++++++++++++++++++++++++++--- 3 files changed, 356 insertions(+), 17 deletions(-) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index d9b54bce83202d..99df888eb4b5ee 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -444,7 +444,44 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( Py_ssize_t size); -/* --- _PyUnicodeWriter API ----------------------------------------------- */ +/* --- Public PyUnicodeWriter API ----------------------------------------- */ + +typedef struct PyUnicodeWriter PyUnicodeWriter; + +PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(void); +PyAPI_FUNC(void) PyUnicodeWriter_Discard(PyUnicodeWriter *writer); +PyAPI_FUNC(PyObject*) PyUnicodeWriter_Finish(PyUnicodeWriter *writer); + +PyAPI_FUNC(void) PyUnicodeWriter_SetOverallocate( + PyUnicodeWriter *writer, + int overallocate); + +PyAPI_FUNC(int) PyUnicodeWriter_WriteChar( + PyUnicodeWriter *writer, + Py_UCS4 ch); +PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( + PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size); + +PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( + PyUnicodeWriter *writer, + PyObject *str); +PyAPI_FUNC(int) PyUnicodeWriter_WriteRepr( + PyUnicodeWriter *writer, + PyObject *obj); +PyAPI_FUNC(int) PyUnicodeWriter_WriteSubstring( + PyUnicodeWriter *writer, + PyObject *str, + Py_ssize_t start, + Py_ssize_t end); +PyAPI_FUNC(int) PyUnicodeWriter_Format( + PyUnicodeWriter *writer, + const char *format, + ...); + + +/* --- Private _PyUnicodeWriter API --------------------------------------- */ typedef struct { PyObject *buffer; @@ -466,7 +503,7 @@ typedef struct { /* If readonly is 1, buffer is a shared string (cannot be modified) and size is set to 0. */ unsigned char readonly; -} _PyUnicodeWriter ; +} _PyUnicodeWriter; // Initialize a Unicode writer. // diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 015db9017139d0..8a325d75ba5ff0 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,6 +221,167 @@ unicode_copycharacters(PyObject *self, PyObject *args) } +static PyObject * +test_unicodewriter(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + if (writer == NULL) { + return NULL; + } + + // test PyUnicodeWriter_SetOverallocate() + PyUnicodeWriter_SetOverallocate(writer, 1); + + // test PyUnicodeWriter_WriteUTF8() + if (PyUnicodeWriter_WriteUTF8(writer, "var", -1) < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteChar() + if (PyUnicodeWriter_WriteChar(writer, '=') < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteSubstring() + PyObject *str = PyUnicode_FromString("[long]"); + if (str == NULL) { + goto error; + } + int ret = PyUnicodeWriter_WriteSubstring(writer, str, 1, 5); + Py_CLEAR(str); + if (ret < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteStr() + str = PyUnicode_FromString(" value "); + if (str == NULL) { + goto error; + } + ret = PyUnicodeWriter_WriteStr(writer, str); + Py_CLEAR(str); + if (ret < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteRepr() + str = PyUnicode_FromString("repr"); + if (str == NULL) { + goto error; + } + ret = PyUnicodeWriter_WriteRepr(writer, str); + Py_CLEAR(str); + if (ret < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "var=long value 'repr'")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * +test_unicodewriter_utf8(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUTF8(writer, "ascii", -1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + if (PyUnicodeWriter_WriteUTF8(writer, "latin1=\xC3\xA9", -1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + if (PyUnicodeWriter_WriteUTF8(writer, "euro=\xE2\x82\xAC", -1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '.') < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, + "ascii-latin1=\xC3\xA9-euro=\xE2\x82\xAC.")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * +test_unicodewriter_invalid_utf8(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + if (writer == NULL) { + return NULL; + } + assert(PyUnicodeWriter_WriteUTF8(writer, "invalid=\xFF", -1) < 0); + PyUnicodeWriter_Discard(writer); + + assert(PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)); + PyErr_Clear(); + + Py_RETURN_NONE; +} + + +static PyObject * +test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + if (writer == NULL) { + return NULL; + } + + // test PyUnicodeWriter_Format() + if (PyUnicodeWriter_Format(writer, "%s %i", "Hello", 123) < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteChar() + if (PyUnicodeWriter_WriteChar(writer, '.') < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "Hello 123.")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + static PyMethodDef TestMethods[] = { {"unicode_new", unicode_new, METH_VARARGS}, {"unicode_fill", unicode_fill, METH_VARARGS}, @@ -229,6 +390,10 @@ static PyMethodDef TestMethods[] = { {"unicode_asucs4copy", unicode_asucs4copy, METH_VARARGS}, {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, + {"test_unicodewriter", test_unicodewriter, METH_NOARGS}, + {"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS}, + {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS}, + {"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..0f269f102c2095 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2872,23 +2872,21 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, return f; } -PyObject * -PyUnicode_FromFormatV(const char *format, va_list vargs) +static int +unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs) { + writer->min_length += strlen(format) + 100; + writer->overallocate = 1; + va_list vargs2; const char *f; - _PyUnicodeWriter writer; - - _PyUnicodeWriter_Init(&writer); - writer.min_length = strlen(format) + 100; - writer.overallocate = 1; // Copy varags to be able to pass a reference to a subfunction. va_copy(vargs2, vargs); for (f = format; *f; ) { if (*f == '%') { - f = unicode_fromformat_arg(&writer, f, &vargs2); + f = unicode_fromformat_arg(writer, f, &vargs2); if (f == NULL) goto fail; } @@ -2912,21 +2910,33 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) len = p - f; if (*p == '\0') - writer.overallocate = 0; + writer->overallocate = 0; - if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) + if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) goto fail; f = p; } } va_end(vargs2); - return _PyUnicodeWriter_Finish(&writer); + return 0; fail: va_end(vargs2); - _PyUnicodeWriter_Dealloc(&writer); - return NULL; + return -1; +} + +PyObject * +PyUnicode_FromFormatV(const char *format, va_list vargs) +{ + _PyUnicodeWriter writer; + _PyUnicodeWriter_Init(&writer); + + if (unicode_from_format(&writer, format, vargs) < 0) { + _PyUnicodeWriter_Dealloc(&writer); + return NULL; + } + return _PyUnicodeWriter_Finish(&writer); } PyObject * @@ -2941,6 +2951,18 @@ PyUnicode_FromFormat(const char *format, ...) return ret; } +int +PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) +{ + _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + + va_list vargs; + va_start(vargs, format); + int ret = unicode_from_format(_writer, format, vargs); + va_end(vargs); + return ret; +} + static Py_ssize_t unicode_get_widechar_size(PyObject *unicode) { @@ -4927,6 +4949,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, } +// Used by PyUnicodeWriter_WriteUTF8() implementation static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, @@ -13080,6 +13103,7 @@ unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start, return PyBool_FromLong(result); } + static inline void _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { @@ -13103,6 +13127,7 @@ _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) } } + void _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { @@ -13111,12 +13136,35 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) /* ASCII is the bare minimum */ writer->min_char = 127; - /* use a value smaller than PyUnicode_1BYTE_KIND() so + /* use a kind value smaller than PyUnicode_1BYTE_KIND so _PyUnicodeWriter_PrepareKind() will copy the buffer. */ - writer->kind = 0; + assert(writer->kind == 0); assert(writer->kind <= PyUnicode_1BYTE_KIND); } + +PyUnicodeWriter* +PyUnicodeWriter_Create(void) +{ + const size_t size = sizeof(_PyUnicodeWriter); + PyUnicodeWriter *writer = (PyUnicodeWriter *)PyMem_Malloc(size); + if (writer == NULL) { + PyErr_NoMemory(); + return NULL; + } + _PyUnicodeWriter_Init((_PyUnicodeWriter*)writer); + PyUnicodeWriter_SetOverallocate(writer, 1); + return writer; +} + + +void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) +{ + _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer); + PyMem_Free(writer); +} + + // Initialize _PyUnicodeWriter with initial buffer static inline void _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) @@ -13127,6 +13175,14 @@ _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) writer->min_length = writer->size; } + +void +PyUnicodeWriter_SetOverallocate(PyUnicodeWriter *writer, int overallocate) +{ + ((_PyUnicodeWriter*)writer)->overallocate = (unsigned char)overallocate; +} + + int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) @@ -13242,9 +13298,16 @@ _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) return _PyUnicodeWriter_WriteCharInline(writer, ch); } +int PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) +{ + return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); +} + int _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) { + assert(PyUnicode_Check(str)); + Py_UCS4 maxchar; Py_ssize_t len; @@ -13270,6 +13333,34 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) return 0; } +int +PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyObject *str = PyObject_Str(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + + +int +PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyObject *str = PyObject_Repr(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + + int _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, Py_ssize_t start, Py_ssize_t end) @@ -13302,6 +13393,29 @@ _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, return 0; } + +int +PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t start, Py_ssize_t end) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str, not %T", str); + return -1; + } + if (start < 0 || start > end) { + PyErr_Format(PyExc_ValueError, "invalid start argument"); + return -1; + } + if (end > PyUnicode_GET_LENGTH(str)) { + PyErr_Format(PyExc_ValueError, "invalid end argument"); + return -1; + } + + return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str, + start, end); +} + + int _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, const char *ascii, Py_ssize_t len) @@ -13362,6 +13476,18 @@ _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, return 0; } +int +PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size) +{ + if (size == -1) { + size = strlen(str); + } + return unicode_decode_utf8_writer((_PyUnicodeWriter*)writer, str, size, + _Py_ERROR_STRICT, NULL, NULL); +} + int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) @@ -13408,6 +13534,17 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) return unicode_result(str); } + +PyObject* +PyUnicodeWriter_Finish(PyUnicodeWriter *writer) +{ + PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer); + assert(((_PyUnicodeWriter*)writer)->buffer == NULL); + PyMem_Free(writer); + return str; +} + + void _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) { From b12f085fe11217bf645e468769bd1337f93500d1 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 5 Jun 2024 16:43:22 +0200 Subject: [PATCH 02/15] PyUnicodeWriter_Create() expects a length Remove PyUnicodeWriter_SetOverallocate(). --- Include/cpython/unicodeobject.h | 6 +----- Modules/_testcapi/unicode.c | 11 ++++------- Objects/unicodeobject.c | 28 ++++++++++++++-------------- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 99df888eb4b5ee..b160783a64214f 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -448,14 +448,10 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( typedef struct PyUnicodeWriter PyUnicodeWriter; -PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(void); +PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(Py_ssize_t length); PyAPI_FUNC(void) PyUnicodeWriter_Discard(PyUnicodeWriter *writer); PyAPI_FUNC(PyObject*) PyUnicodeWriter_Finish(PyUnicodeWriter *writer); -PyAPI_FUNC(void) PyUnicodeWriter_SetOverallocate( - PyUnicodeWriter *writer, - int overallocate); - PyAPI_FUNC(int) PyUnicodeWriter_WriteChar( PyUnicodeWriter *writer, Py_UCS4 ch); diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 8a325d75ba5ff0..8dea466c9d27b4 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -224,14 +224,11 @@ unicode_copycharacters(PyObject *self, PyObject *args) static PyObject * test_unicodewriter(PyObject *self, PyObject *Py_UNUSED(args)) { - PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(100); if (writer == NULL) { return NULL; } - // test PyUnicodeWriter_SetOverallocate() - PyUnicodeWriter_SetOverallocate(writer, 1); - // test PyUnicodeWriter_WriteUTF8() if (PyUnicodeWriter_WriteUTF8(writer, "var", -1) < 0) { goto error; @@ -293,7 +290,7 @@ test_unicodewriter(PyObject *self, PyObject *Py_UNUSED(args)) static PyObject * test_unicodewriter_utf8(PyObject *self, PyObject *Py_UNUSED(args)) { - PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); if (writer == NULL) { return NULL; } @@ -335,7 +332,7 @@ test_unicodewriter_utf8(PyObject *self, PyObject *Py_UNUSED(args)) static PyObject * test_unicodewriter_invalid_utf8(PyObject *self, PyObject *Py_UNUSED(args)) { - PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); if (writer == NULL) { return NULL; } @@ -352,7 +349,7 @@ test_unicodewriter_invalid_utf8(PyObject *self, PyObject *Py_UNUSED(args)) static PyObject * test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) { - PyUnicodeWriter *writer = PyUnicodeWriter_Create(); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); if (writer == NULL) { return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0f269f102c2095..7fb76fbd1f713e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13139,22 +13139,29 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) /* use a kind value smaller than PyUnicode_1BYTE_KIND so _PyUnicodeWriter_PrepareKind() will copy the buffer. */ assert(writer->kind == 0); - assert(writer->kind <= PyUnicode_1BYTE_KIND); + assert(writer->kind < PyUnicode_1BYTE_KIND); } PyUnicodeWriter* -PyUnicodeWriter_Create(void) +PyUnicodeWriter_Create(Py_ssize_t length) { const size_t size = sizeof(_PyUnicodeWriter); - PyUnicodeWriter *writer = (PyUnicodeWriter *)PyMem_Malloc(size); - if (writer == NULL) { + PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); + if (pub_writer == NULL) { PyErr_NoMemory(); return NULL; } - _PyUnicodeWriter_Init((_PyUnicodeWriter*)writer); - PyUnicodeWriter_SetOverallocate(writer, 1); - return writer; + _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; + + _PyUnicodeWriter_Init(writer); + if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) { + PyUnicodeWriter_Discard(pub_writer); + return NULL; + } + writer->overallocate = 1; + + return pub_writer; } @@ -13176,13 +13183,6 @@ _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) } -void -PyUnicodeWriter_SetOverallocate(PyUnicodeWriter *writer, int overallocate) -{ - ((_PyUnicodeWriter*)writer)->overallocate = (unsigned char)overallocate; -} - - int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) From 175c23924be80305cb6851ef4f66d4c55c24c39a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 7 Jun 2024 21:34:00 +0200 Subject: [PATCH 03/15] Rename str to repr --- Objects/unicodeobject.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7fb76fbd1f713e..8cfaa92738063e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13350,13 +13350,13 @@ PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) int PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) { - PyObject *str = PyObject_Repr(obj); - if (str == NULL) { + PyObject *repr = PyObject_Repr(obj); + if (repr == NULL) { return -1; } - int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); - Py_DECREF(str); + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr); + Py_DECREF(repr); return res; } From 99fa2cb4a146e7d9b7b58c8ad73dc0c87fca5f3a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 7 Jun 2024 22:11:50 +0200 Subject: [PATCH 04/15] Add documentation --- Doc/c-api/unicode.rst | 75 +++++++++++++++++++ Doc/whatsnew/3.14.rst | 15 ++++ ...-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst | 13 ++++ 3 files changed, 103 insertions(+) create mode 100644 Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 7320d035bab513..c86978ca79098e 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1502,3 +1502,78 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyUnicode_InternInPlace`, returning either a new Unicode string object that has been interned, or a new ("owned") reference to an earlier interned string object with the same value. + +PyUnicodeWriter +^^^^^^^^^^^^^^^ + +The :c:type:`PyUnicodeWriter` API can be used to create a Python :class:`str` +object. + +.. versionadded:: 3.14 + +.. c:type:: PyUnicodeWriter + + An Unicode writer instance. + +.. c:function:: PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) + + Create an Unicode writer instance. + + Set an exception and return ``NULL`` on error. + +.. c:function:: void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) + + Discard an Unicode writer instance: free its memory. + +.. c:function:: PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer) + + Get the final Python :class:`str` object and free the writer instance. + + Set an exception and return ``NULL`` on error. + +.. c:function:: int PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) + + Write a single Unicode character. + + Return ``0`` on success, or set an exception and return ``-1`` on error. + +.. c:function:: int PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, const char *str, Py_ssize_t size) + + Decode a string from UTF-8 in strict mode and write the output into the + writer. + + *size* is the string length in bytes. If *size* is equal to ``-1``, call + ``strlen(str)`` to get the string length. + + Return ``0`` on success, or set an exception and return ``-1`` on error. + +.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *str) + + Call :c:func:`PyObject_Str(obj) ` and write the output into + the writer. + + Return ``0`` on success, or set an exception and return ``-1`` on error. + +.. c:function:: int PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) + + Call :c:func:`PyObject_Repr(obj) ` and write the output into + the writer. + + Return ``0`` on success, or set an exception and return ``-1`` on error. + +.. c:function:: int PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, Py_ssize_t start, Py_ssize_t end) + + Write the substring ``str[start:end]`` into the writer. + + *str* must be Python :class:`str` object. *start* must be greater than or + equal to 0, and less than or equal to *end*. *end* must be less than or + equal to *str* length. + + Return ``0`` on success, or set an exception and return ``-1`` on error. + +.. c:function:: int PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) + + Similar to :c:func:`PyUnicode_FromFormat`, but write directly the output + into the writer. + + Return ``0`` on success, or set an exception and return ``-1`` on error. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b2dd80b64a691a..6053dacb21aa5f 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -258,6 +258,21 @@ New Features * Add :c:func:`PyLong_GetSign` function to get the sign of :class:`int` objects. (Contributed by Sergey B Kirpichev in :gh:`116560`.) +* Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` + object: + + * :c:func:`PyUnicodeWriter_Create` + * :c:func:`PyUnicodeWriter_Discard` + * :c:func:`PyUnicodeWriter_Finish` + * :c:func:`PyUnicodeWriter_WriteChar` + * :c:func:`PyUnicodeWriter_WriteUTF8` + * :c:func:`PyUnicodeWriter_WriteStr` + * :c:func:`PyUnicodeWriter_WriteRepr` + * :c:func:`PyUnicodeWriter_WriteSubstring` + * :c:func:`PyUnicodeWriter_Format` + + (Contributed by Victor Stinner in :gh:`119182`.) + Porting to Python 3.14 ---------------------- diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst new file mode 100644 index 00000000000000..073aea5b6b1b48 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst @@ -0,0 +1,13 @@ +Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object: + +* :c:func:`PyUnicodeWriter_Create` +* :c:func:`PyUnicodeWriter_Discard` +* :c:func:`PyUnicodeWriter_Finish` +* :c:func:`PyUnicodeWriter_WriteChar` +* :c:func:`PyUnicodeWriter_WriteUTF8` +* :c:func:`PyUnicodeWriter_WriteStr` +* :c:func:`PyUnicodeWriter_WriteRepr` +* :c:func:`PyUnicodeWriter_WriteSubstring` +* :c:func:`PyUnicodeWriter_Format` + +Patch by Victor Stinner. From e3e15f0d701a95d256ff2376e0adab497e1064e5 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 10:02:45 +0200 Subject: [PATCH 05/15] Apply suggestions from code review Co-authored-by: Erlend E. Aasland --- Objects/unicodeobject.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8cfaa92738063e..1c6eb2985f3054 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13149,8 +13149,7 @@ PyUnicodeWriter_Create(Py_ssize_t length) const size_t size = sizeof(_PyUnicodeWriter); PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); if (pub_writer == NULL) { - PyErr_NoMemory(); - return NULL; + return PyErr_NoMemory(); } _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; @@ -13481,7 +13480,7 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, const char *str, Py_ssize_t size) { - if (size == -1) { + if (size < 0) { size = strlen(str); } return unicode_decode_utf8_writer((_PyUnicodeWriter*)writer, str, size, From 1dbb5dfa70476e553bc1be00eec3048cb522d045 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 10:04:01 +0200 Subject: [PATCH 06/15] Apply suggestions from code review Co-authored-by: Erlend E. Aasland --- Doc/c-api/unicode.rst | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c86978ca79098e..a83fbac5214dfd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1523,24 +1523,23 @@ object. .. c:function:: void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) - Discard an Unicode writer instance: free its memory. + Discard an Unicode writer instance. .. c:function:: PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer) - Get the final Python :class:`str` object and free the writer instance. + Return the final Python :class:`str` object and free the writer instance. Set an exception and return ``NULL`` on error. .. c:function:: int PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) - Write a single Unicode character. + Write the single Unicode character *ch* into *writer*. Return ``0`` on success, or set an exception and return ``-1`` on error. .. c:function:: int PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, const char *str, Py_ssize_t size) - Decode a string from UTF-8 in strict mode and write the output into the - writer. + Decode the string *str* from UTF-8 in strict mode and write the output into *writer*. *size* is the string length in bytes. If *size* is equal to ``-1``, call ``strlen(str)`` to get the string length. @@ -1549,21 +1548,19 @@ object. .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *str) - Call :c:func:`PyObject_Str(obj) ` and write the output into - the writer. + Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. Return ``0`` on success, or set an exception and return ``-1`` on error. .. c:function:: int PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) - Call :c:func:`PyObject_Repr(obj) ` and write the output into - the writer. + Call :c:func:`PyObject_Repr` on *obj* and write the output into *writer*. Return ``0`` on success, or set an exception and return ``-1`` on error. .. c:function:: int PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, Py_ssize_t start, Py_ssize_t end) - Write the substring ``str[start:end]`` into the writer. + Write the substring ``str[start:end]`` into *writer*. *str* must be Python :class:`str` object. *start* must be greater than or equal to 0, and less than or equal to *end*. *end* must be less than or @@ -1573,7 +1570,6 @@ object. .. c:function:: int PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) - Similar to :c:func:`PyUnicode_FromFormat`, but write directly the output - into the writer. + Similar to :c:func:`PyUnicode_FromFormat`, but write the output directly into *writer*. Return ``0`` on success, or set an exception and return ``-1`` on error. From 8f02e3304487b00a02e28e5e43e6eb569215ec22 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 10:10:32 +0200 Subject: [PATCH 07/15] Update the doc --- Doc/c-api/unicode.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index a83fbac5214dfd..d1ee5f3c41e19e 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1515,22 +1515,25 @@ object. An Unicode writer instance. + The instance must be destroyed by :c:func:`PyUnicodeWriter_Finish` on + success, or :c:func:`PyUnicodeWriter_Discard` on error. + .. c:function:: PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) Create an Unicode writer instance. Set an exception and return ``NULL`` on error. -.. c:function:: void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) - - Discard an Unicode writer instance. - .. c:function:: PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer) - Return the final Python :class:`str` object and free the writer instance. + Return the final Python :class:`str` object and destroy the writer instance. Set an exception and return ``NULL`` on error. +.. c:function:: void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) + + Discard the internal Unicode buffer and destroy the writer instance. + .. c:function:: int PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) Write the single Unicode character *ch* into *writer*. From a1d0ab0b8b482e476e5114df36fd785ce4491ab2 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 10:12:59 +0200 Subject: [PATCH 08/15] Add dots in Changelog --- Doc/whatsnew/3.14.rst | 18 +++++++++--------- ...4-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 6053dacb21aa5f..c648a865098e79 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -261,15 +261,15 @@ New Features * Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object: - * :c:func:`PyUnicodeWriter_Create` - * :c:func:`PyUnicodeWriter_Discard` - * :c:func:`PyUnicodeWriter_Finish` - * :c:func:`PyUnicodeWriter_WriteChar` - * :c:func:`PyUnicodeWriter_WriteUTF8` - * :c:func:`PyUnicodeWriter_WriteStr` - * :c:func:`PyUnicodeWriter_WriteRepr` - * :c:func:`PyUnicodeWriter_WriteSubstring` - * :c:func:`PyUnicodeWriter_Format` + * :c:func:`PyUnicodeWriter_Create`. + * :c:func:`PyUnicodeWriter_Discard`. + * :c:func:`PyUnicodeWriter_Finish`. + * :c:func:`PyUnicodeWriter_WriteChar`. + * :c:func:`PyUnicodeWriter_WriteUTF8`. + * :c:func:`PyUnicodeWriter_WriteStr`. + * :c:func:`PyUnicodeWriter_WriteRepr`. + * :c:func:`PyUnicodeWriter_WriteSubstring`. + * :c:func:`PyUnicodeWriter_Format`. (Contributed by Victor Stinner in :gh:`119182`.) diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst index 073aea5b6b1b48..3d1384c9f3252f 100644 --- a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst @@ -1,13 +1,13 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object: -* :c:func:`PyUnicodeWriter_Create` -* :c:func:`PyUnicodeWriter_Discard` -* :c:func:`PyUnicodeWriter_Finish` -* :c:func:`PyUnicodeWriter_WriteChar` -* :c:func:`PyUnicodeWriter_WriteUTF8` -* :c:func:`PyUnicodeWriter_WriteStr` -* :c:func:`PyUnicodeWriter_WriteRepr` -* :c:func:`PyUnicodeWriter_WriteSubstring` -* :c:func:`PyUnicodeWriter_Format` +* :c:func:`PyUnicodeWriter_Create`. +* :c:func:`PyUnicodeWriter_Discard`. +* :c:func:`PyUnicodeWriter_Finish`. +* :c:func:`PyUnicodeWriter_WriteChar`. +* :c:func:`PyUnicodeWriter_WriteUTF8`. +* :c:func:`PyUnicodeWriter_WriteStr`. +* :c:func:`PyUnicodeWriter_WriteRepr`. +* :c:func:`PyUnicodeWriter_WriteSubstring`. +* :c:func:`PyUnicodeWriter_Format`. Patch by Victor Stinner. From e6195b767c21b3e21ee868d162f0718e810b7104 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 10:13:46 +0200 Subject: [PATCH 09/15] Update Objects/unicodeobject.c Co-authored-by: Erlend E. Aasland --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1c6eb2985f3054..1b84d235ebdec8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13149,7 +13149,7 @@ PyUnicodeWriter_Create(Py_ssize_t length) const size_t size = sizeof(_PyUnicodeWriter); PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); if (pub_writer == NULL) { - return PyErr_NoMemory(); + return (PyUnicodeWriter *)PyErr_NoMemory(); } _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; From 4865d434a02d601e3547df384380b42c9a30deba Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 10:20:08 +0200 Subject: [PATCH 10/15] Update Objects/unicodeobject.c Co-authored-by: Serhiy Storchaka --- Objects/unicodeobject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1b84d235ebdec8..731722df451bd8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13297,7 +13297,8 @@ _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) return _PyUnicodeWriter_WriteCharInline(writer, ch); } -int PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) +int +PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) { return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); } From 79b7c09b58ffa3d31a3b0348d8c041684acc1d7e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 11:12:09 +0200 Subject: [PATCH 11/15] Apply suggestions from code review Co-authored-by: Serhiy Storchaka --- Doc/c-api/unicode.rst | 2 +- Include/cpython/unicodeobject.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index d1ee5f3c41e19e..4ec945404b1778 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1549,7 +1549,7 @@ object. Return ``0`` on success, or set an exception and return ``-1`` on error. -.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *str) +.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index b160783a64214f..e5e1b6be118588 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -462,7 +462,7 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( PyUnicodeWriter *writer, - PyObject *str); + PyObject *obj); PyAPI_FUNC(int) PyUnicodeWriter_WriteRepr( PyUnicodeWriter *writer, PyObject *obj); From db02dae80b57804ec9142167ac7d2b9dc23256f9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 10 Jun 2024 17:58:23 +0200 Subject: [PATCH 12/15] Make the API atomic --- Doc/c-api/unicode.rst | 18 +++++++---- Modules/_testcapi/unicode.c | 59 +++++++++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 20 ++++++++++--- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 4ec945404b1778..ec38d9c26c0c72 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1538,7 +1538,8 @@ object. Write the single Unicode character *ch* into *writer*. - Return ``0`` on success, or set an exception and return ``-1`` on error. + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, const char *str, Py_ssize_t size) @@ -1547,19 +1548,22 @@ object. *size* is the string length in bytes. If *size* is equal to ``-1``, call ``strlen(str)`` to get the string length. - Return ``0`` on success, or set an exception and return ``-1`` on error. + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. - Return ``0`` on success, or set an exception and return ``-1`` on error. + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Repr` on *obj* and write the output into *writer*. - Return ``0`` on success, or set an exception and return ``-1`` on error. + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, Py_ssize_t start, Py_ssize_t end) @@ -1569,10 +1573,12 @@ object. equal to 0, and less than or equal to *end*. *end* must be less than or equal to *str* length. - Return ``0`` on success, or set an exception and return ``-1`` on error. + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) Similar to :c:func:`PyUnicode_FromFormat`, but write the output directly into *writer*. - Return ``0`` on success, or set an exception and return ``-1`` on error. + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 8dea466c9d27b4..97978c09118a68 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -346,6 +346,34 @@ test_unicodewriter_invalid_utf8(PyObject *self, PyObject *Py_UNUSED(args)) } +static PyObject * +test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // test recovering from PyUnicodeWriter_WriteUTF8() error + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + assert(PyUnicodeWriter_WriteUTF8(writer, "value=", -1) == 0); + + // write fails with an invalid string + assert(PyUnicodeWriter_WriteUTF8(writer, "invalid\xFF", -1) < 0); + PyErr_Clear(); + + // retry write with a valid string + assert(PyUnicodeWriter_WriteUTF8(writer, "valid", -1) == 0); + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "value=valid")); + Py_DECREF(result); + + Py_RETURN_NONE; +} + + static PyObject * test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) { @@ -379,6 +407,35 @@ test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) } +static PyObject * +test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // test recovering from PyUnicodeWriter_Format() error + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + + assert(PyUnicodeWriter_Format(writer, "%s", "Hello") == 0); + + // PyUnicodeWriter_Format() fails with an invalid format string + assert(PyUnicodeWriter_Format(writer, "%s\xff", "World") < 0); + PyErr_Clear(); + + // Retry PyUnicodeWriter_Format() with a valid format string + assert(PyUnicodeWriter_Format(writer, "%s.", "World") == 0); + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "Hello World.")); + Py_DECREF(result); + + Py_RETURN_NONE; +} + + static PyMethodDef TestMethods[] = { {"unicode_new", unicode_new, METH_VARARGS}, {"unicode_fill", unicode_fill, METH_VARARGS}, @@ -390,7 +447,9 @@ static PyMethodDef TestMethods[] = { {"test_unicodewriter", test_unicodewriter, METH_NOARGS}, {"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS}, {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS}, + {"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS}, {"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS}, + {"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 731722df451bd8..1f8c89dd12a528 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2955,12 +2955,17 @@ int PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) { _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + Py_ssize_t old_pos = _writer->pos; va_list vargs; va_start(vargs, format); - int ret = unicode_from_format(_writer, format, vargs); + int res = unicode_from_format(_writer, format, vargs); va_end(vargs); - return ret; + + if (res < 0) { + _writer->pos = old_pos; + } + return res; } static Py_ssize_t @@ -13484,8 +13489,15 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, if (size < 0) { size = strlen(str); } - return unicode_decode_utf8_writer((_PyUnicodeWriter*)writer, str, size, - _Py_ERROR_STRICT, NULL, NULL); + + _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + Py_ssize_t old_pos = _writer->pos; + int res = unicode_decode_utf8_writer(_writer, str, size, + _Py_ERROR_STRICT, NULL, NULL); + if (res < 0) { + _writer->pos = old_pos; + } + return res; } int From 10343b09051eab7e15ddf96e4852e33bfc4d12a9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 17:57:54 +0200 Subject: [PATCH 13/15] Fix typo --- Modules/_testcapi/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 97978c09118a68..79f99c404cd757 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -416,7 +416,7 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args return NULL; } - assert(PyUnicodeWriter_Format(writer, "%s", "Hello") == 0); + assert(PyUnicodeWriter_Format(writer, "%s ", "Hello") == 0); // PyUnicodeWriter_Format() fails with an invalid format string assert(PyUnicodeWriter_Format(writer, "%s\xff", "World") < 0); From 957dc05881fbd6971f76673af71cf9be12a06c21 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 13 Jun 2024 10:54:15 +0200 Subject: [PATCH 14/15] Mention PyUnicode_DecodeUTF8() in the doc --- Doc/c-api/unicode.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index ec38d9c26c0c72..1f16067654aec7 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1513,7 +1513,7 @@ object. .. c:type:: PyUnicodeWriter - An Unicode writer instance. + A Unicode writer instance. The instance must be destroyed by :c:func:`PyUnicodeWriter_Finish` on success, or :c:func:`PyUnicodeWriter_Discard` on error. @@ -1551,6 +1551,10 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. + To use a different error handler than ``strict``, + :c:func:`PyUnicode_DecodeUTF8` can be used with + :c:func:`PyUnicodeWriter_WriteStr`. + .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. From 52efb9413d7ed081262383e4a3dd865a83a87e4e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 13 Jun 2024 11:11:56 +0200 Subject: [PATCH 15/15] Fix typo --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 1f16067654aec7..02e696c303fa91 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1520,7 +1520,7 @@ object. .. c:function:: PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) - Create an Unicode writer instance. + Create a Unicode writer instance. Set an exception and return ``NULL`` on error.