From c84f31437490f6ccf9cbf12e06af96ceb5307648 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 27 May 2024 16:21:18 +0200 Subject: [PATCH 01/27] gh-119609: Add PyUnicode_Export() function Add PyUnicode_Export(), PyUnicode_GetBufferFormat() and PyUnicode_Import() functions to the limited C API. --- Doc/c-api/unicode.rst | 65 ++++++ Doc/data/stable_abi.dat | 3 + Doc/whatsnew/3.14.rst | 4 + Include/unicodeobject.h | 18 ++ Lib/test/test_capi/test_unicode.py | 183 ++++++++++++++- Lib/test/test_stable_abi_ctypes.py | 3 + ...-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst | 3 + Misc/stable_abi.toml | 16 ++ Modules/_testlimitedcapi/unicode.c | 70 ++++++ Objects/unicodeobject.c | 220 +++++++++++++++++- PC/python3dll.c | 3 + 11 files changed, 584 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 958fafd47ac81b..603905d21555e5 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,6 +341,71 @@ APIs: .. versionadded:: 3.3 +.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) + + Export the contents of the *unicode* string in one of the requested format + *requested_formats*. + + * On success, fill *view*, and return ``0``. + * On error, set an exception and return ``-1``. + + The export must be released by :c:func:`PyBuffer_Release`. + The contents of the buffer are valid until they are released. + + The buffer is read-only and must not be modified. + + *unicode* and *view* must not be NULL. + + Available formats: + + .. c:namespace:: NULL + + =================================== ======== =========================== + Constant Identifier Value Description + =================================== ======== =========================== + .. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``) + .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``) + .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``) + =================================== ======== =========================== + + *requested_formats* can be a single format or a bitwise combination of the + formats in the table above. + On success, *\*format* will be set to a single one of the requested flags. + + Note that future versions of Python may introduce additional formats. + + .. versionadded:: 3.14 + + +.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format) + + Get the format of the buffer *view*. + + * On success, set *\*result* to the corresponding `PyUnicode_FORMAT_*` value + and return ``0``. + * On error, set an exception and return ``-1``. + + *view* must be a buffer filled by :c:func:`PyUnicode_Export`. + + .. versionadded:: 3.14 + + +.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) + + Create a string object from a buffer in an “export format”. + + * Return a reference to a new string object on success. + * Set an exception and return ``NULL`` on error. + + *data* must not be NULL. *nbytes* must be positive or zero. + + See :c:func:`PyUnicode_Export` for the available formats. + + .. versionadded:: 3.14 + + .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \ Py_ssize_t size) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 7eeee270bb7f32..a6745986c2025e 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -784,6 +784,7 @@ func,PyUnicode_EncodeFSDefault,3.2,, func,PyUnicode_EncodeLocale,3.7,, func,PyUnicode_EqualToUTF8,3.13,, func,PyUnicode_EqualToUTF8AndSize,3.13,, +func,PyUnicode_Export,3.14,, func,PyUnicode_FSConverter,3.2,, func,PyUnicode_FSDecoder,3.2,, func,PyUnicode_Find,3.2,, @@ -797,8 +798,10 @@ func,PyUnicode_FromOrdinal,3.2,, func,PyUnicode_FromString,3.2,, func,PyUnicode_FromStringAndSize,3.2,, func,PyUnicode_FromWideChar,3.2,, +func,PyUnicode_GetBufferFormat,3.14,, func,PyUnicode_GetDefaultEncoding,3.2,, func,PyUnicode_GetLength,3.7,, +func,PyUnicode_Import,3.14,, func,PyUnicode_InternFromString,3.2,, func,PyUnicode_InternInPlace,3.2,, func,PyUnicode_IsIdentifier,3.2,, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index e1bd52370d776c..1d5e2a10b1b6dc 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -529,6 +529,10 @@ New Features (Contributed by Victor Stinner in :gh:`107954`.) +* Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, + and :c:func:`PyUnicode_Import` functions to export and import strings. + (Contributed by Victor Stinner in :gh:`119609`.) + Porting to Python 3.14 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index dee00715b3c51d..75d41a90ae65d7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,6 +248,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); +#define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string) +#define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1* +#define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2* +#define PyUnicode_FORMAT_UCS4 0x08 // Py_UCS4* +#define PyUnicode_FORMAT_UTF8 0x10 // char* + +PyAPI_FUNC(int) PyUnicode_Export( + PyObject *unicode, + uint32_t requested_formats, + Py_buffer *view); +PyAPI_FUNC(int) PyUnicode_GetBufferFormat( + const Py_buffer *view, + uint32_t *format); +PyAPI_FUNC(PyObject*) PyUnicode_Import( + const void *data, + Py_ssize_t nbytes, + uint32_t format); + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e6f85427214958..6f026d6dd87225 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1,5 +1,6 @@ -import unittest +import struct import sys +import unittest from test import support from test.support import import_helper @@ -28,6 +29,14 @@ class Str(str): pass +PyUnicode_FORMAT_ASCII = 0x01 +PyUnicode_FORMAT_UCS1 = 0x02 +PyUnicode_FORMAT_UCS2 = 0x04 +PyUnicode_FORMAT_UCS4 = 0x08 +PyUnicode_FORMAT_UTF8 = 0x10 +# Invalid native format +PyUnicode_FORMAT_INVALID = 0x20 + class CAPITest(unittest.TestCase): @support.cpython_only @@ -1721,6 +1730,142 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) + def test_unicode_export(self): + # Test PyUnicode_Export() and PyUnicode_FreeExport() + unicode_export = _testlimitedcapi.unicode_export + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + # export to the native format + formats = (PyUnicode_FORMAT_ASCII + | PyUnicode_FORMAT_UCS1 + | PyUnicode_FORMAT_UCS2 + | PyUnicode_FORMAT_UCS4) + BUFFER_UCS1 = 'B' + BUFFER_UCS2 = 'H' + if struct.calcsize('I') == 4: + BUFFER_UCS4 = 'I' + elif struct.calcsize('L') == 4: + BUFFER_UCS4 = 'L' + else: + self.fail("unable to get BUFFER_UCS4 ") + + def check_ucs1(text, formats): + if formats == PyUnicode_FORMAT_UCS1: + export_format = PyUnicode_FORMAT_UCS1 + elif text.isascii(): + export_format = PyUnicode_FORMAT_ASCII + else: + export_format = PyUnicode_FORMAT_UCS1 + self.assertEqual(unicode_export(text, formats), + (text.encode('latin1'), export_format, 1, BUFFER_UCS1)) + + def check_ucs2(text, formats): + self.assertEqual(unicode_export(text, formats), + (text.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2)) + + def check_ucs4(text, formats): + self.assertEqual(unicode_export(text, formats), + (text.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4)) + + def check_utf8(text): + self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8), + (text.encode('utf8'), + PyUnicode_FORMAT_UTF8, 1, 'B')) + + check_ucs1("abc", formats) + check_ucs1("latin1:\xe9", formats) + check_ucs2('ucs2:\u20ac', formats) + check_ucs4('ucs4:\U0010ffff', formats) + + # export ASCII as UCS1 + check_ucs1("abc", PyUnicode_FORMAT_UCS1) + + # export ASCII and UCS1 to UCS2 + check_ucs2("abc", PyUnicode_FORMAT_UCS2) + check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2) + + # always export to UCS4 + check_ucs4("abc", PyUnicode_FORMAT_UCS4) + check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4) + check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4) + check_ucs4('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4) + + # always export to UTF8 + check_utf8("abc") + check_utf8("latin1:\xe9") + check_utf8('ucs2:\u20ac') + check_utf8('ucs4:\U0010ffff') + + # No supported format or invalid format + for formats in (0, PyUnicode_FORMAT_INVALID): + err_msg = "unable to find a matching export format" + with self.subTest(formats=formats): + with self.assertRaisesRegex(ValueError, err_msg): + unicode_export('abc', formats) + + def test_unicode_import(self): + # Test PyUnicode_Import() + unicode_import = _testlimitedcapi.unicode_import + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII), + "abc") + self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1), + "latin1:\xe9") + + self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2), + 'ucs2:\u20ac') + + self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4), + 'ucs4:\U0010ffff') + + text = "abc\xe9\U0010ffff" + self.assertEqual(unicode_import(text.encode('utf8'), + PyUnicode_FORMAT_UTF8), + text) + + # Empty string + for native_format in ( + PyUnicode_FORMAT_ASCII, + PyUnicode_FORMAT_UCS1, + PyUnicode_FORMAT_UCS2, + PyUnicode_FORMAT_UCS4, + PyUnicode_FORMAT_UTF8, + ): + with self.subTest(native_format=native_format): + self.assertEqual(unicode_import(b'', native_format), + '') + + # Invalid format + with self.assertRaises(ValueError): + unicode_import(b'', PyUnicode_FORMAT_INVALID) + + # Invalid size + ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc) + with self.assertRaises(ValueError): + unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2) + ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4) + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): @@ -1903,6 +2048,38 @@ def test_recover_error(self): self.assertEqual(writer.finish(), 'Hello World.') - -if __name__ == "__main__": + def test_unicode_export_import_roundtrip(self): + unicode_export = _testlimitedcapi.unicode_export + unicode_import = _testlimitedcapi.unicode_import + + ASCII = PyUnicode_FORMAT_ASCII + UCS1 = PyUnicode_FORMAT_UCS1 + UCS2 = PyUnicode_FORMAT_UCS2 + UCS4 = PyUnicode_FORMAT_UCS4 + UTF8 = PyUnicode_FORMAT_UTF8 + ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8) + + def roundtrip(string, formats): + buf, buf_fmt, item_size, view_fmt = unicode_export(string, formats) + self.assertEqual(unicode_import(buf, buf_fmt), string) + + for string, allowed_formats in ( + ('', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}), + ('ucs2:\u20ac', {UCS2, UCS4, UTF8}), + ('ucs4:\U0001f638', {UCS4, UTF8}), + ): + for formats in ASCII, UCS1, UCS2, UCS4, UTF8: + with self.subTest(string=string, formats=formats): + if formats not in allowed_formats: + with self.assertRaises(ValueError): + unicode_export(string, formats) + else: + roundtrip(string, formats) + + roundtrip(string, ALL) + + +if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 4bca33b7451f80..b496b43d4ef6cd 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -806,6 +806,7 @@ def test_windows_feature_macros(self): "PyUnicode_EncodeLocale", "PyUnicode_EqualToUTF8", "PyUnicode_EqualToUTF8AndSize", + "PyUnicode_Export", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", @@ -819,9 +820,11 @@ def test_windows_feature_macros(self): "PyUnicode_FromString", "PyUnicode_FromStringAndSize", "PyUnicode_FromWideChar", + "PyUnicode_GetBufferFormat", "PyUnicode_GetDefaultEncoding", "PyUnicode_GetLength", "PyUnicode_GetSize", + "PyUnicode_Import", "PyUnicode_InternFromString", "PyUnicode_InternImmortal", "PyUnicode_InternInPlace", diff --git a/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst new file mode 100644 index 00000000000000..6d75f0c192bc85 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst @@ -0,0 +1,3 @@ +Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, and +:c:func:`PyUnicode_Import` functions to export and import strings. Patch by +Victor Stinner. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 8bf638c473c712..7fb8971326a064 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2526,3 +2526,19 @@ added = '3.14' [function.PyLong_AsUInt64] added = '3.14' +[const.PyUnicode_FORMAT_ASCII] + added = '3.14' +[const.PyUnicode_FORMAT_UCS1] + added = '3.14' +[const.PyUnicode_FORMAT_UCS2] + added = '3.14' +[const.PyUnicode_FORMAT_UCS4] + added = '3.14' +[const.PyUnicode_FORMAT_UTF8] + added = '3.14' +[function.PyUnicode_Export] + added = '3.14' +[function.PyUnicode_GetBufferFormat] + added = '3.14' +[function.PyUnicode_Import] + added = '3.14' diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 2b70d09108a333..c64935920ff0b3 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1837,6 +1837,74 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) #undef CHECK_FORMAT_0 } + +// Test PyUnicode_Export() +static PyObject* +unicode_export(PyObject *self, PyObject *args) +{ + PyObject *obj; + unsigned int requested_formats; + if (!PyArg_ParseTuple(args, "OI", &obj, &requested_formats)) { + return NULL; + } + + Py_buffer view; + if (PyUnicode_Export(obj, requested_formats, &view) < 0) { + return NULL; + } + uint32_t format; + if (PyUnicode_GetBufferFormat(&view, &format) < 0) { + return NULL; + } + + // Make sure that the exported string ends with a NUL character + char *data = view.buf; + Py_ssize_t nbytes = view.len * view.itemsize; + switch (format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + assert(data[nbytes] == 0); + break; + case PyUnicode_FORMAT_UCS2: + assert(data[nbytes] == 0); + assert(data[nbytes + 1] == 0); + break; + case PyUnicode_FORMAT_UCS4: + assert(data[nbytes] == 0); + assert(data[nbytes + 1] == 0); + assert(data[nbytes + 2] == 0); + assert(data[nbytes + 3] == 0); + break; + case PyUnicode_FORMAT_UTF8: + assert(data[nbytes] == 0); + break; + } + + assert(view.format != NULL); + PyObject *res = Py_BuildValue("y#Iis", + view.buf, view.len * view.itemsize, + (unsigned int)format, + (int)view.itemsize, view.format); + PyBuffer_Release(&view); + return res; +} + + +// Test PyUnicode_Import() +static PyObject* +unicode_import(PyObject *self, PyObject *args) +{ + const void *data; + Py_ssize_t nbytes; + unsigned int format; + if (!PyArg_ParseTuple(args, "y#I", &data, &nbytes, &format)) { + return NULL; + } + return PyUnicode_Import(data, nbytes, format); +} + + static PyMethodDef TestMethods[] = { {"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS}, {"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS}, @@ -1924,6 +1992,8 @@ static PyMethodDef TestMethods[] = { {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_isidentifier", unicode_isidentifier, METH_O}, + {"unicode_export", unicode_export, METH_VARARGS}, + {"unicode_import", unicode_import, METH_VARARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2494c989544ca0..8766b448a63d7a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2332,6 +2332,220 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, } +static int +unicode_export(PyObject *unicode, Py_buffer *view, + Py_ssize_t len, const void *buf, + int itemsize, const char *format, uint32_t internal_format) +{ + if (PyBuffer_FillInfo(view, unicode, (void*)buf, len, + 1, PyBUF_SIMPLE) < 0) { + return -1; + } + view->itemsize = itemsize; + view->format = (char*)format; + view->internal = (void*)(uintptr_t)internal_format; + return 0; +} + + +int +PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) +{ +#if SIZEOF_INT == 4 +# define BUFFER_UCS4 "I" +#elif SIZEOF_LONG == 4 +# define BUFFER_UCS4 "L" +#else +# error "unable to find BUFFER_UCS4" +#endif + + if (!PyUnicode_Check(unicode)) { + PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); + return -1; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + + // Native ASCII + if (PyUnicode_IS_ASCII(unicode) + && (requested_formats & PyUnicode_FORMAT_ASCII)) + { + return unicode_export(unicode, view, + len, PyUnicode_1BYTE_DATA(unicode), + 1, "B", PyUnicode_FORMAT_ASCII); + } + + // Native UCS1 + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS1)) + { + return unicode_export(unicode, view, + len, PyUnicode_1BYTE_DATA(unicode), + 1, "B", PyUnicode_FORMAT_UCS1); + } + + // Native UCS2 + if (kind == PyUnicode_2BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS2)) + { + return unicode_export(unicode, view, + len, PyUnicode_2BYTE_DATA(unicode), + 2, "H", PyUnicode_FORMAT_UCS2); + } + + // Convert ASCII or UCS1 to UCS2 + if (kind == PyUnicode_1BYTE_KIND + && requested_formats & PyUnicode_FORMAT_UCS2) + { + Py_UCS2 *ucs2 = PyMem_Malloc((len + 1) * sizeof(Py_UCS2)); + if (!ucs2) { + PyErr_NoMemory(); + return -1; + } + + _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, + PyUnicode_1BYTE_DATA(unicode), + PyUnicode_1BYTE_DATA(unicode) + len, + ucs2); + ucs2[len] = 0; + + return unicode_export(unicode, view, + len, ucs2, + 2, "H", PyUnicode_FORMAT_UCS2); + } + + // Native UCS4 + if (kind == PyUnicode_4BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS4)) + { + return unicode_export(unicode, view, + len, PyUnicode_4BYTE_DATA(unicode), + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + } + + // Convert ASCII, UCS1 or UCS2 to UCS4 + if (requested_formats & PyUnicode_FORMAT_UCS4) { + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); + if (ucs4 == NULL) { + return -1; + } + return unicode_export(unicode, view, + len, ucs4, + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + } + + // Encode UCS1, UCS2 or UCS4 to UTF-8 + if (requested_formats & PyUnicode_FORMAT_UTF8) { + Py_ssize_t nbytes; + const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes); + if (utf8 == NULL) { + return -1; + } + return unicode_export(unicode, view, + nbytes, utf8, + 1, "B", PyUnicode_FORMAT_UTF8); + } + + PyErr_Format(PyExc_ValueError, "unable to find a matching export format"); + return -1; + +#undef BUFFER_UCS4 +} + + +int +PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format) +{ + if (view->obj == NULL || !PyUnicode_Check(view->obj)) { + PyErr_SetString(PyExc_ValueError, "not a str export"); + return -1; + } + + uintptr_t internal_format = (uintptr_t)view->internal; + switch (internal_format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + case PyUnicode_FORMAT_UCS2: + case PyUnicode_FORMAT_UCS4: + case PyUnicode_FORMAT_UTF8: + break; + default: + PyErr_SetString(PyExc_ValueError, "invalid format"); + return -1; + } + + *format = (uint32_t)internal_format; + return 0; +} + + +static void +unicode_releasebuffer(PyObject *unicode, Py_buffer *view) +{ + uintptr_t format = (uintptr_t)view->internal; + switch (format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + case PyUnicode_FORMAT_UCS2: + case PyUnicode_FORMAT_UTF8: + // nothing to release + break; + case PyUnicode_FORMAT_UCS4: + if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND) { + PyMem_Free(view->buf); + } + break; + default: + // ignore silently an unknown format + break; + } +} + +PyObject* +PyUnicode_Import(const void *data, Py_ssize_t nbytes, + uint32_t format) +{ + if (nbytes < 0) { + PyErr_SetString(PyExc_ValueError, "Negative nbytes"); + return NULL; + } + + switch (format) + { + case PyUnicode_FORMAT_ASCII: + return PyUnicode_DecodeASCII((const char*)data, nbytes, NULL); + + case PyUnicode_FORMAT_UCS1: + return _PyUnicode_FromUCS1(data, nbytes); + + case PyUnicode_FORMAT_UCS2: + if (nbytes % 2) { + PyErr_Format(PyExc_ValueError, "nbytes must be a multiple of 2: %zd", + nbytes); + return NULL; + } + return _PyUnicode_FromUCS2(data, nbytes / 2); + + case PyUnicode_FORMAT_UCS4: + if (nbytes % 4) { + PyErr_Format(PyExc_ValueError, "nbytes must be a multiple of 4: %zd", + nbytes); + return NULL; + } + return _PyUnicode_FromUCS4(data, nbytes / 4); + + case PyUnicode_FORMAT_UTF8: + return PyUnicode_DecodeUTF8((const char*)data, nbytes, NULL); + + default: + PyErr_Format(PyExc_ValueError, "unknown format: %i", format); + return NULL; + } +} + + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { @@ -15248,6 +15462,10 @@ errors defaults to 'strict'."); static PyObject *unicode_iter(PyObject *seq); +static PyBufferProcs unicode_as_buffer = { + .bf_releasebuffer = unicode_releasebuffer, +}; + PyTypeObject PyUnicode_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) "str", /* tp_name */ @@ -15268,7 +15486,7 @@ PyTypeObject PyUnicode_Type = { (reprfunc) unicode_str, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ - 0, /* tp_as_buffer */ + &unicode_as_buffer, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS | _Py_TPFLAGS_MATCH_SELF, /* tp_flags */ diff --git a/PC/python3dll.c b/PC/python3dll.c index 1845334b244d8c..1bfa238eb7054d 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -717,6 +717,7 @@ EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EqualToUTF8) EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) +EXPORT_FUNC(PyUnicode_Export) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) @@ -730,9 +731,11 @@ EXPORT_FUNC(PyUnicode_FromStringAndSize) EXPORT_FUNC(PyUnicode_FromWideChar) EXPORT_FUNC(PyUnicode_FSConverter) EXPORT_FUNC(PyUnicode_FSDecoder) +EXPORT_FUNC(PyUnicode_GetBufferFormat) EXPORT_FUNC(PyUnicode_GetDefaultEncoding) EXPORT_FUNC(PyUnicode_GetLength) EXPORT_FUNC(PyUnicode_GetSize) +EXPORT_FUNC(PyUnicode_Import) EXPORT_FUNC(PyUnicode_InternFromString) EXPORT_FUNC(PyUnicode_InternImmortal) EXPORT_FUNC(PyUnicode_InternInPlace) From d0cdbd1e46e4cb9cdd02a35779038b5fef06dabc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 5 Sep 2024 18:51:45 +0200 Subject: [PATCH 02/27] Address reviews --- Doc/c-api/unicode.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 603905d21555e5..9010f19cfb1e1b 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -349,7 +349,7 @@ APIs: * On success, fill *view*, and return ``0``. * On error, set an exception and return ``-1``. - The export must be released by :c:func:`PyBuffer_Release`. + The *view* buffer must be released by :c:func:`PyBuffer_Release`. The contents of the buffer are valid until they are released. The buffer is read-only and must not be modified. @@ -372,7 +372,8 @@ APIs: *requested_formats* can be a single format or a bitwise combination of the formats in the table above. - On success, *\*format* will be set to a single one of the requested flags. + To determine the format that was selected for output, call + :c:func:`PyUnicode_GetBufferFormat`. Note that future versions of Python may introduce additional formats. @@ -383,7 +384,7 @@ APIs: Get the format of the buffer *view*. - * On success, set *\*result* to the corresponding `PyUnicode_FORMAT_*` value + * On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*`` value and return ``0``. * On error, set an exception and return ``-1``. @@ -394,7 +395,7 @@ APIs: .. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) - Create a string object from a buffer in an “export format”. + Create a Unicode string object from a buffer in a supported format. * Return a reference to a new string object on success. * Set an exception and return ``NULL`` on error. From 9b33dca5a08776c6542cfbd6285fc9cd0ab1d8fb Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 5 Sep 2024 18:54:13 +0200 Subject: [PATCH 03/27] Exclude from limited C API 3.13 and older --- Include/unicodeobject.h | 2 ++ Modules/_testlimitedcapi/unicode.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 75d41a90ae65d7..b359ba780a538e 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,6 +248,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030e0000 #define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string) #define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1* #define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2* @@ -265,6 +266,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, Py_ssize_t nbytes, uint32_t format); +#endif /* --- wchar_t support for platforms which support it --------------------- */ diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index c64935920ff0b3..ada61eda37ce6c 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1,7 +1,7 @@ #include "pyconfig.h" // Py_GIL_DISABLED #ifndef Py_GIL_DISABLED - // Need limited C API 3.13 to test PyUnicode_EqualToUTF8() -# define Py_LIMITED_API 0x030d0000 + // Need limited C API 3.14 to test PyUnicode_Export() +# define Py_LIMITED_API 0x030e0000 #endif #include "parts.h" From cf1f74a3e2d7ced582cb3a6fc740df8d8ad992f8 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 5 Sep 2024 19:30:04 +0200 Subject: [PATCH 04/27] Replace PyErr_Format() with PyErr_SetString() --- Objects/unicodeobject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8766b448a63d7a..081a4de8c3d16e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2446,7 +2446,8 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) 1, "B", PyUnicode_FORMAT_UTF8); } - PyErr_Format(PyExc_ValueError, "unable to find a matching export format"); + PyErr_SetString(PyExc_ValueError, + "unable to find a matching export format"); return -1; #undef BUFFER_UCS4 From 93d4470988dc71003fef075156a752c0adabb6d3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 5 Sep 2024 20:34:00 +0200 Subject: [PATCH 05/27] Fix test_collections: implement UserString.__release_buffer__() --- Lib/collections/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/collections/__init__.py b/Lib/collections/__init__.py index b47e728484c8ac..1f4af677da2484 100644 --- a/Lib/collections/__init__.py +++ b/Lib/collections/__init__.py @@ -1595,3 +1595,6 @@ def upper(self): def zfill(self, width): return self.__class__(self.data.zfill(width)) + + def __release_buffer__(self, view): + raise NotImplementedError From 17ad7b9c88156adfa413fb9922991e1fb85aaa77 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 9 Sep 2024 21:46:21 +0200 Subject: [PATCH 06/27] Add format parameter to PyUnicode_Export() --- Doc/c-api/unicode.rst | 20 ++----------- Include/unicodeobject.h | 4 +-- Modules/_testlimitedcapi/unicode.c | 5 +--- Objects/unicodeobject.c | 47 +++++++----------------------- 4 files changed, 16 insertions(+), 60 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 9010f19cfb1e1b..c822fd2c15855c 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,12 +341,12 @@ APIs: .. versionadded:: 3.3 -.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) +.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view, uint32_t *format) Export the contents of the *unicode* string in one of the requested format *requested_formats*. - * On success, fill *view*, and return ``0``. + * On success, fill *view* and set *\*format*, and return ``0``. * On error, set an exception and return ``-1``. The *view* buffer must be released by :c:func:`PyBuffer_Release`. @@ -372,27 +372,13 @@ APIs: *requested_formats* can be a single format or a bitwise combination of the formats in the table above. - To determine the format that was selected for output, call - :c:func:`PyUnicode_GetBufferFormat`. + On success, *\*format* will be set to a single one of the requested flags. Note that future versions of Python may introduce additional formats. .. versionadded:: 3.14 -.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format) - - Get the format of the buffer *view*. - - * On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*`` value - and return ``0``. - * On error, set an exception and return ``-1``. - - *view* must be a buffer filled by :c:func:`PyUnicode_Export`. - - .. versionadded:: 3.14 - - .. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) Create a Unicode string object from a buffer in a supported format. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index b359ba780a538e..219f6a00fffb7c 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -258,9 +258,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( PyAPI_FUNC(int) PyUnicode_Export( PyObject *unicode, uint32_t requested_formats, - Py_buffer *view); -PyAPI_FUNC(int) PyUnicode_GetBufferFormat( - const Py_buffer *view, + Py_buffer *view, uint32_t *format); PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index ada61eda37ce6c..a7dddec8ce9fcc 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1849,11 +1849,8 @@ unicode_export(PyObject *self, PyObject *args) } Py_buffer view; - if (PyUnicode_Export(obj, requested_formats, &view) < 0) { - return NULL; - } uint32_t format; - if (PyUnicode_GetBufferFormat(&view, &format) < 0) { + if (PyUnicode_Export(obj, requested_formats, &view, &format) < 0) { return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 081a4de8c3d16e..da599063632190 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2333,7 +2333,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, static int -unicode_export(PyObject *unicode, Py_buffer *view, +unicode_export(PyObject *unicode, Py_buffer *view, uint32_t *pformat, Py_ssize_t len, const void *buf, int itemsize, const char *format, uint32_t internal_format) { @@ -2344,12 +2344,14 @@ unicode_export(PyObject *unicode, Py_buffer *view, view->itemsize = itemsize; view->format = (char*)format; view->internal = (void*)(uintptr_t)internal_format; + *pformat = internal_format; return 0; } int -PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) +PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, + Py_buffer *view, uint32_t *format) { #if SIZEOF_INT == 4 # define BUFFER_UCS4 "I" @@ -2369,7 +2371,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) if (PyUnicode_IS_ASCII(unicode) && (requested_formats & PyUnicode_FORMAT_ASCII)) { - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, len, PyUnicode_1BYTE_DATA(unicode), 1, "B", PyUnicode_FORMAT_ASCII); } @@ -2379,7 +2381,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) if (kind == PyUnicode_1BYTE_KIND && (requested_formats & PyUnicode_FORMAT_UCS1)) { - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, len, PyUnicode_1BYTE_DATA(unicode), 1, "B", PyUnicode_FORMAT_UCS1); } @@ -2388,7 +2390,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) if (kind == PyUnicode_2BYTE_KIND && (requested_formats & PyUnicode_FORMAT_UCS2)) { - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, len, PyUnicode_2BYTE_DATA(unicode), 2, "H", PyUnicode_FORMAT_UCS2); } @@ -2409,7 +2411,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) ucs2); ucs2[len] = 0; - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, len, ucs2, 2, "H", PyUnicode_FORMAT_UCS2); } @@ -2418,7 +2420,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) if (kind == PyUnicode_4BYTE_KIND && (requested_formats & PyUnicode_FORMAT_UCS4)) { - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, len, PyUnicode_4BYTE_DATA(unicode), 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); } @@ -2429,7 +2431,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) if (ucs4 == NULL) { return -1; } - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, len, ucs4, 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); } @@ -2441,7 +2443,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) if (utf8 == NULL) { return -1; } - return unicode_export(unicode, view, + return unicode_export(unicode, view, format, nbytes, utf8, 1, "B", PyUnicode_FORMAT_UTF8); } @@ -2454,33 +2456,6 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) } -int -PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format) -{ - if (view->obj == NULL || !PyUnicode_Check(view->obj)) { - PyErr_SetString(PyExc_ValueError, "not a str export"); - return -1; - } - - uintptr_t internal_format = (uintptr_t)view->internal; - switch (internal_format) - { - case PyUnicode_FORMAT_ASCII: - case PyUnicode_FORMAT_UCS1: - case PyUnicode_FORMAT_UCS2: - case PyUnicode_FORMAT_UCS4: - case PyUnicode_FORMAT_UTF8: - break; - default: - PyErr_SetString(PyExc_ValueError, "invalid format"); - return -1; - } - - *format = (uint32_t)internal_format; - return 0; -} - - static void unicode_releasebuffer(PyObject *unicode, Py_buffer *view) { From d683d0a1bbf866c63df31e9df7d6b234ddbe5ae9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 9 Sep 2024 21:51:23 +0200 Subject: [PATCH 07/27] format must not be NULL --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c822fd2c15855c..a9e8b1de431312 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -354,7 +354,7 @@ APIs: The buffer is read-only and must not be modified. - *unicode* and *view* must not be NULL. + *unicode*, *view* and *format* must not be NULL. Available formats: From 78a70faeca2ab7cccc693e729b853b1410ffbbe3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 10 Sep 2024 08:40:46 +0200 Subject: [PATCH 08/27] Fix memory leak in unicode_releasebuffer() UCS2 can also copy the buffer. --- Objects/unicodeobject.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index da599063632190..71e16019286f56 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2464,10 +2464,14 @@ unicode_releasebuffer(PyObject *unicode, Py_buffer *view) { case PyUnicode_FORMAT_ASCII: case PyUnicode_FORMAT_UCS1: - case PyUnicode_FORMAT_UCS2: case PyUnicode_FORMAT_UTF8: // nothing to release break; + case PyUnicode_FORMAT_UCS2: + if (PyUnicode_KIND(unicode) != PyUnicode_2BYTE_KIND) { + PyMem_Free(view->buf); + } + break; case PyUnicode_FORMAT_UCS4: if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND) { PyMem_Free(view->buf); From 79207f59f3ac2d89309467d7959f59ac49f1451c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 10 Sep 2024 08:55:43 +0200 Subject: [PATCH 09/27] Remove PyUnicode_GetBufferFormat() documentation --- Doc/whatsnew/3.14.rst | 4 ++-- .../C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 1d5e2a10b1b6dc..9571621855522f 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -529,8 +529,8 @@ New Features (Contributed by Victor Stinner in :gh:`107954`.) -* Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, - and :c:func:`PyUnicode_Import` functions to export and import strings. +* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions + to export and import strings. (Contributed by Victor Stinner in :gh:`119609`.) diff --git a/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst index 6d75f0c192bc85..3eae4543f087d0 100644 --- a/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst +++ b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst @@ -1,3 +1,2 @@ -Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, and -:c:func:`PyUnicode_Import` functions to export and import strings. Patch by -Victor Stinner. +Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to +export and import strings. Patch by Victor Stinner. From bc0fb69bac273a86e03f8596717bd985eb7cc99d Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 10 Sep 2024 15:42:04 +0200 Subject: [PATCH 10/27] Apply suggestions from code review Co-authored-by: Petr Viktorin --- Doc/c-api/unicode.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index a9e8b1de431312..b763d59a2e20e4 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -343,13 +343,14 @@ APIs: .. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view, uint32_t *format) - Export the contents of the *unicode* string in one of the requested format - *requested_formats*. + Export the contents of the *unicode* string in one of the *requested_formats*. - * On success, fill *view* and set *\*format*, and return ``0``. - * On error, set an exception and return ``-1``. + * On success, fill *view*, set *\*format*, and return ``0``. + * On error, set an exception, set *\*format* to 0, and return ``-1``. + *view* is left unchanged. - The *view* buffer must be released by :c:func:`PyBuffer_Release`. + After a successful call to :c:func:`PyUnicode_Export`, + the *view* buffer must be released by :c:func:`PyBuffer_Release`. The contents of the buffer are valid until they are released. The buffer is read-only and must not be modified. @@ -369,6 +370,8 @@ APIs: .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``) .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``) =================================== ======== =========================== + + UCS-2 and UCS-4 use the native byte order. *requested_formats* can be a single format or a bitwise combination of the formats in the table above. From 2cdbc27d36c1079914e06fb02b4b90f72f6edee9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 10 Sep 2024 15:45:22 +0200 Subject: [PATCH 11/27] Set format to 0 on error --- Modules/_testlimitedcapi/unicode.c | 3 ++- Objects/unicodeobject.c | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index a7dddec8ce9fcc..b37aca149e818d 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1849,8 +1849,9 @@ unicode_export(PyObject *self, PyObject *args) } Py_buffer view; - uint32_t format; + uint32_t format = (uint32_t)UNINITIALIZED_INT; if (PyUnicode_Export(obj, requested_formats, &view, &format) < 0) { + assert(format == 0); return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 71e16019286f56..5c160a053cbcb7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2339,6 +2339,7 @@ unicode_export(PyObject *unicode, Py_buffer *view, uint32_t *pformat, { if (PyBuffer_FillInfo(view, unicode, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) { + *pformat = 0; return -1; } view->itemsize = itemsize; @@ -2363,7 +2364,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); - return -1; + goto error; } Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); @@ -2402,7 +2403,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_UCS2 *ucs2 = PyMem_Malloc((len + 1) * sizeof(Py_UCS2)); if (!ucs2) { PyErr_NoMemory(); - return -1; + goto error; } _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, @@ -2429,7 +2430,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (requested_formats & PyUnicode_FORMAT_UCS4) { Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); if (ucs4 == NULL) { - return -1; + goto error; } return unicode_export(unicode, view, format, len, ucs4, @@ -2441,7 +2442,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_ssize_t nbytes; const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes); if (utf8 == NULL) { - return -1; + goto error; } return unicode_export(unicode, view, format, nbytes, utf8, @@ -2450,6 +2451,10 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, PyErr_SetString(PyExc_ValueError, "unable to find a matching export format"); + goto error; + +error: + *format = 0; return -1; #undef BUFFER_UCS4 From b5be22dab39b654b7f1135d4b9b49da324c30b30 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 10 Sep 2024 15:49:08 +0200 Subject: [PATCH 12/27] Remove trailing space --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index b763d59a2e20e4..a6f261225c8ad2 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -370,7 +370,7 @@ APIs: .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``) .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``) =================================== ======== =========================== - + UCS-2 and UCS-4 use the native byte order. *requested_formats* can be a single format or a bitwise combination of the From 2960b25ecd7c8ac72bd017e054f89543d4c728cc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 10 Sep 2024 16:38:45 +0200 Subject: [PATCH 13/27] Change constant values --- Include/unicodeobject.h | 10 +++++----- Lib/test/test_capi/test_unicode.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 219f6a00fffb7c..3c482fd606be8b 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -249,11 +249,11 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( ); #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030e0000 -#define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string) -#define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1* -#define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2* -#define PyUnicode_FORMAT_UCS4 0x08 // Py_UCS4* -#define PyUnicode_FORMAT_UTF8 0x10 // char* +#define PyUnicode_FORMAT_UCS1 0x01 // Py_UCS1* +#define PyUnicode_FORMAT_UCS2 0x02 // Py_UCS2* +#define PyUnicode_FORMAT_UCS4 0x04 // Py_UCS4* +#define PyUnicode_FORMAT_UTF8 0x08 // char* +#define PyUnicode_FORMAT_ASCII 0x10 // char* (ASCII string) PyAPI_FUNC(int) PyUnicode_Export( PyObject *unicode, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6f026d6dd87225..6d34b95714f186 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -29,11 +29,11 @@ class Str(str): pass -PyUnicode_FORMAT_ASCII = 0x01 -PyUnicode_FORMAT_UCS1 = 0x02 -PyUnicode_FORMAT_UCS2 = 0x04 -PyUnicode_FORMAT_UCS4 = 0x08 -PyUnicode_FORMAT_UTF8 = 0x10 +PyUnicode_FORMAT_UCS1 = 0x01 +PyUnicode_FORMAT_UCS2 = 0x02 +PyUnicode_FORMAT_UCS4 = 0x04 +PyUnicode_FORMAT_UTF8 = 0x08 +PyUnicode_FORMAT_ASCII = 0x10 # Invalid native format PyUnicode_FORMAT_INVALID = 0x20 From bcb41f3f56e8781680713861bd7426739285aaca Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 11 Sep 2024 12:03:29 +0200 Subject: [PATCH 14/27] Update constants value in the doc --- Doc/c-api/unicode.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index a6f261225c8ad2..22e7668991dcdd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -364,11 +364,11 @@ APIs: =================================== ======== =========================== Constant Identifier Value Description =================================== ======== =========================== - .. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``) - .. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``) - .. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``) - .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``) - .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``) + .. c:macro:: PyUnicode_FORMAT_UCS1 ``0x01`` UCS-1 string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS2 ``0x02`` UCS-2 string (``Py_UCS2*``) + .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x04`` UCS-4 string (``Py_UCS4*``) + .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x08`` UTF-8 string (``char*``) + .. c:macro:: PyUnicode_FORMAT_ASCII ``0x10`` ASCII string (``Py_UCS1*``) =================================== ======== =========================== UCS-2 and UCS-4 use the native byte order. From 44cb702253e8c845d9112c40122ef67403d9000a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 11:48:52 +0200 Subject: [PATCH 15/27] Remove unicode_releasebuffer(); use bytes instead --- Lib/collections/__init__.py | 3 -- Objects/unicodeobject.c | 67 +++++++++++++------------------------ 2 files changed, 24 insertions(+), 46 deletions(-) diff --git a/Lib/collections/__init__.py b/Lib/collections/__init__.py index 1f4af677da2484..b47e728484c8ac 100644 --- a/Lib/collections/__init__.py +++ b/Lib/collections/__init__.py @@ -1595,6 +1595,3 @@ def upper(self): def zfill(self, width): return self.__class__(self.data.zfill(width)) - - def __release_buffer__(self, view): - raise NotImplementedError diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c160a053cbcb7..8c0fae933e8037 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2333,11 +2333,11 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, static int -unicode_export(PyObject *unicode, Py_buffer *view, uint32_t *pformat, +unicode_export(PyObject *obj, Py_buffer *view, uint32_t *pformat, Py_ssize_t len, const void *buf, int itemsize, const char *format, uint32_t internal_format) { - if (PyBuffer_FillInfo(view, unicode, (void*)buf, len, + if (PyBuffer_FillInfo(view, obj, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) { *pformat = 0; return -1; @@ -2400,11 +2400,11 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (kind == PyUnicode_1BYTE_KIND && requested_formats & PyUnicode_FORMAT_UCS2) { - Py_UCS2 *ucs2 = PyMem_Malloc((len + 1) * sizeof(Py_UCS2)); - if (!ucs2) { - PyErr_NoMemory(); + PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 2); + if (!bytes) { goto error; } + Py_UCS2 *ucs2 = (Py_UCS2*)PyBytes_AS_STRING(bytes); _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, PyUnicode_1BYTE_DATA(unicode), @@ -2412,9 +2412,11 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, ucs2); ucs2[len] = 0; - return unicode_export(unicode, view, format, - len, ucs2, - 2, "H", PyUnicode_FORMAT_UCS2); + int res = unicode_export(bytes, view, format, + len, ucs2, + 2, "H", PyUnicode_FORMAT_UCS2); + Py_DECREF(bytes); + return res; } // Native UCS4 @@ -2432,9 +2434,19 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (ucs4 == NULL) { goto error; } - return unicode_export(unicode, view, format, - len, ucs4, - 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + + PyObject *bytes = PyBytes_FromStringAndSize((char*)ucs4, (len + 1) * 4); + PyMem_Free(ucs4); + if (bytes == NULL) { + goto error; + } + ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); + + int res = unicode_export(bytes, view, format, + len, ucs4, + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + Py_DECREF(bytes); + return res; } // Encode UCS1, UCS2 or UCS4 to UTF-8 @@ -2461,33 +2473,6 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, } -static void -unicode_releasebuffer(PyObject *unicode, Py_buffer *view) -{ - uintptr_t format = (uintptr_t)view->internal; - switch (format) - { - case PyUnicode_FORMAT_ASCII: - case PyUnicode_FORMAT_UCS1: - case PyUnicode_FORMAT_UTF8: - // nothing to release - break; - case PyUnicode_FORMAT_UCS2: - if (PyUnicode_KIND(unicode) != PyUnicode_2BYTE_KIND) { - PyMem_Free(view->buf); - } - break; - case PyUnicode_FORMAT_UCS4: - if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND) { - PyMem_Free(view->buf); - } - break; - default: - // ignore silently an unknown format - break; - } -} - PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) @@ -15447,10 +15432,6 @@ errors defaults to 'strict'."); static PyObject *unicode_iter(PyObject *seq); -static PyBufferProcs unicode_as_buffer = { - .bf_releasebuffer = unicode_releasebuffer, -}; - PyTypeObject PyUnicode_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) "str", /* tp_name */ @@ -15471,7 +15452,7 @@ PyTypeObject PyUnicode_Type = { (reprfunc) unicode_str, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ - &unicode_as_buffer, /* tp_as_buffer */ + 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS | _Py_TPFLAGS_MATCH_SELF, /* tp_flags */ From 1809d8d1eecc3cb6f2035ebc50a4640d04cb36b7 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 11:59:11 +0200 Subject: [PATCH 16/27] PyUnicode_Export() returns the format Use signed int32_t for the format. --- Doc/c-api/unicode.rst | 11 +++--- Include/unicodeobject.h | 9 +++-- Modules/_testlimitedcapi/unicode.c | 7 ++-- Objects/unicodeobject.c | 54 +++++++++++++----------------- 4 files changed, 37 insertions(+), 44 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 22e7668991dcdd..9a0e217cea654e 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,12 +341,12 @@ APIs: .. versionadded:: 3.3 -.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view, uint32_t *format) +.. c:function:: int PyUnicode_Export(PyObject *unicode, int32_t requested_formats, Py_buffer *view) Export the contents of the *unicode* string in one of the *requested_formats*. - * On success, fill *view*, set *\*format*, and return ``0``. - * On error, set an exception, set *\*format* to 0, and return ``-1``. + * On success, fill *view*, and return a format (greater than ``0``). + * On error, set an exception, and return ``-1``. *view* is left unchanged. After a successful call to :c:func:`PyUnicode_Export`, @@ -375,14 +375,15 @@ APIs: *requested_formats* can be a single format or a bitwise combination of the formats in the table above. - On success, *\*format* will be set to a single one of the requested flags. + On success, the returned format will be set to a single one of the requested + flags. Note that future versions of Python may introduce additional formats. .. versionadded:: 3.14 -.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) +.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, int32_t format) Create a Unicode string object from a buffer in a supported format. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 3c482fd606be8b..878f28b8a61acb 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -255,15 +255,14 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( #define PyUnicode_FORMAT_UTF8 0x08 // char* #define PyUnicode_FORMAT_ASCII 0x10 // char* (ASCII string) -PyAPI_FUNC(int) PyUnicode_Export( +PyAPI_FUNC(int32_t) PyUnicode_Export( PyObject *unicode, - uint32_t requested_formats, - Py_buffer *view, - uint32_t *format); + int32_t requested_formats, + Py_buffer *view); PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, Py_ssize_t nbytes, - uint32_t format); + int32_t format); #endif /* --- wchar_t support for platforms which support it --------------------- */ diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index b37aca149e818d..c1676fd4c375d4 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1849,9 +1849,8 @@ unicode_export(PyObject *self, PyObject *args) } Py_buffer view; - uint32_t format = (uint32_t)UNINITIALIZED_INT; - if (PyUnicode_Export(obj, requested_formats, &view, &format) < 0) { - assert(format == 0); + int32_t format = PyUnicode_Export(obj, requested_formats, &view); + if (format < 0) { return NULL; } @@ -1899,7 +1898,7 @@ unicode_import(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "y#I", &data, &nbytes, &format)) { return NULL; } - return PyUnicode_Import(data, nbytes, format); + return PyUnicode_Import(data, nbytes, (int32_t)format); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8c0fae933e8037..d8d017e2c38693 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2332,27 +2332,25 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, } -static int -unicode_export(PyObject *obj, Py_buffer *view, uint32_t *pformat, +static int32_t +unicode_export(PyObject *obj, Py_buffer *view, Py_ssize_t len, const void *buf, - int itemsize, const char *format, uint32_t internal_format) + int itemsize, const char *format, int32_t internal_format) { if (PyBuffer_FillInfo(view, obj, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) { - *pformat = 0; return -1; } view->itemsize = itemsize; view->format = (char*)format; view->internal = (void*)(uintptr_t)internal_format; - *pformat = internal_format; - return 0; + return internal_format; } -int -PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, - Py_buffer *view, uint32_t *format) +int32_t +PyUnicode_Export(PyObject *unicode, int32_t requested_formats, + Py_buffer *view) { #if SIZEOF_INT == 4 # define BUFFER_UCS4 "I" @@ -2364,7 +2362,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); - goto error; + return -1; } Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); @@ -2372,7 +2370,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (PyUnicode_IS_ASCII(unicode) && (requested_formats & PyUnicode_FORMAT_ASCII)) { - return unicode_export(unicode, view, format, + return unicode_export(unicode, view, len, PyUnicode_1BYTE_DATA(unicode), 1, "B", PyUnicode_FORMAT_ASCII); } @@ -2382,7 +2380,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (kind == PyUnicode_1BYTE_KIND && (requested_formats & PyUnicode_FORMAT_UCS1)) { - return unicode_export(unicode, view, format, + return unicode_export(unicode, view, len, PyUnicode_1BYTE_DATA(unicode), 1, "B", PyUnicode_FORMAT_UCS1); } @@ -2391,7 +2389,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (kind == PyUnicode_2BYTE_KIND && (requested_formats & PyUnicode_FORMAT_UCS2)) { - return unicode_export(unicode, view, format, + return unicode_export(unicode, view, len, PyUnicode_2BYTE_DATA(unicode), 2, "H", PyUnicode_FORMAT_UCS2); } @@ -2402,7 +2400,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, { PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 2); if (!bytes) { - goto error; + return -1; } Py_UCS2 *ucs2 = (Py_UCS2*)PyBytes_AS_STRING(bytes); @@ -2412,9 +2410,9 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, ucs2); ucs2[len] = 0; - int res = unicode_export(bytes, view, format, - len, ucs2, - 2, "H", PyUnicode_FORMAT_UCS2); + int32_t res = unicode_export(bytes, view, + len, ucs2, + 2, "H", PyUnicode_FORMAT_UCS2); Py_DECREF(bytes); return res; } @@ -2423,7 +2421,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (kind == PyUnicode_4BYTE_KIND && (requested_formats & PyUnicode_FORMAT_UCS4)) { - return unicode_export(unicode, view, format, + return unicode_export(unicode, view, len, PyUnicode_4BYTE_DATA(unicode), 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); } @@ -2432,19 +2430,19 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, if (requested_formats & PyUnicode_FORMAT_UCS4) { Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); if (ucs4 == NULL) { - goto error; + return -1; } PyObject *bytes = PyBytes_FromStringAndSize((char*)ucs4, (len + 1) * 4); PyMem_Free(ucs4); if (bytes == NULL) { - goto error; + return -1; } ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); - int res = unicode_export(bytes, view, format, - len, ucs4, - 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + int32_t res = unicode_export(bytes, view, + len, ucs4, + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); Py_DECREF(bytes); return res; } @@ -2454,19 +2452,15 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_ssize_t nbytes; const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes); if (utf8 == NULL) { - goto error; + return -1; } - return unicode_export(unicode, view, format, + return unicode_export(unicode, view, nbytes, utf8, 1, "B", PyUnicode_FORMAT_UTF8); } PyErr_SetString(PyExc_ValueError, "unable to find a matching export format"); - goto error; - -error: - *format = 0; return -1; #undef BUFFER_UCS4 @@ -2475,7 +2469,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, - uint32_t format) + int32_t format) { if (nbytes < 0) { PyErr_SetString(PyExc_ValueError, "Negative nbytes"); From 6707ef497ee135a5c0dd43d1902bc81ab3c07ea4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 12:34:58 +0200 Subject: [PATCH 17/27] Fix PyUnicode_Export() signature in doc --- Doc/c-api/unicode.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 9a0e217cea654e..b521f48b3dd58b 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,7 +341,7 @@ APIs: .. versionadded:: 3.3 -.. c:function:: int PyUnicode_Export(PyObject *unicode, int32_t requested_formats, Py_buffer *view) +.. c:function:: int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, Py_buffer *view) Export the contents of the *unicode* string in one of the *requested_formats*. @@ -355,7 +355,7 @@ APIs: The buffer is read-only and must not be modified. - *unicode*, *view* and *format* must not be NULL. + *unicode* and *view* must not be NULL. Available formats: From abf5c5836be7dfb8d09bae76284128fce00d8d0e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 12:35:54 +0200 Subject: [PATCH 18/27] Use _PyUnicode_EncodeUTF16() and _PyUnicode_EncodeUTF32() --- Modules/_testlimitedcapi/unicode.c | 24 --------------------- Objects/unicodeobject.c | 34 ++++++++++-------------------- 2 files changed, 11 insertions(+), 47 deletions(-) diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index c1676fd4c375d4..1646f5f111eecb 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1854,30 +1854,6 @@ unicode_export(PyObject *self, PyObject *args) return NULL; } - // Make sure that the exported string ends with a NUL character - char *data = view.buf; - Py_ssize_t nbytes = view.len * view.itemsize; - switch (format) - { - case PyUnicode_FORMAT_ASCII: - case PyUnicode_FORMAT_UCS1: - assert(data[nbytes] == 0); - break; - case PyUnicode_FORMAT_UCS2: - assert(data[nbytes] == 0); - assert(data[nbytes + 1] == 0); - break; - case PyUnicode_FORMAT_UCS4: - assert(data[nbytes] == 0); - assert(data[nbytes + 1] == 0); - assert(data[nbytes + 2] == 0); - assert(data[nbytes + 3] == 0); - break; - case PyUnicode_FORMAT_UTF8: - assert(data[nbytes] == 0); - break; - } - assert(view.format != NULL); PyObject *res = Py_BuildValue("y#Iis", view.buf, view.len * view.itemsize, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d8d017e2c38693..d7b7b2e8d50a23 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2335,7 +2335,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, static int32_t unicode_export(PyObject *obj, Py_buffer *view, Py_ssize_t len, const void *buf, - int itemsize, const char *format, int32_t internal_format) + int itemsize, const char *format, int32_t export_format) { if (PyBuffer_FillInfo(view, obj, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) { @@ -2343,8 +2343,7 @@ unicode_export(PyObject *obj, Py_buffer *view, } view->itemsize = itemsize; view->format = (char*)format; - view->internal = (void*)(uintptr_t)internal_format; - return internal_format; + return export_format; } @@ -2398,20 +2397,15 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, if (kind == PyUnicode_1BYTE_KIND && requested_formats & PyUnicode_FORMAT_UCS2) { - PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 2); + const int byteorder = (PY_BIG_ENDIAN == 1) ? 1 : -1; + PyObject *bytes = _PyUnicode_EncodeUTF16(unicode, NULL, byteorder); if (!bytes) { return -1; } - Py_UCS2 *ucs2 = (Py_UCS2*)PyBytes_AS_STRING(bytes); - - _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, - PyUnicode_1BYTE_DATA(unicode), - PyUnicode_1BYTE_DATA(unicode) + len, - ucs2); - ucs2[len] = 0; + void *data = PyBytes_AS_STRING(bytes); int32_t res = unicode_export(bytes, view, - len, ucs2, + len, data, 2, "H", PyUnicode_FORMAT_UCS2); Py_DECREF(bytes); return res; @@ -2428,20 +2422,14 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, // Convert ASCII, UCS1 or UCS2 to UCS4 if (requested_formats & PyUnicode_FORMAT_UCS4) { - Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); - if (ucs4 == NULL) { - return -1; - } - - PyObject *bytes = PyBytes_FromStringAndSize((char*)ucs4, (len + 1) * 4); - PyMem_Free(ucs4); - if (bytes == NULL) { + const int byteorder = (PY_BIG_ENDIAN == 1) ? 1 : -1; + PyObject *bytes = _PyUnicode_EncodeUTF32(unicode, NULL, byteorder); + if (!bytes) { return -1; } - ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); - + void *data = PyBytes_AS_STRING(bytes); int32_t res = unicode_export(bytes, view, - len, ucs4, + len, data, 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); Py_DECREF(bytes); return res; From 033fc07105ba47f2d15f321acb75bd22a0154075 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 12:43:17 +0200 Subject: [PATCH 19/27] Use signed int in C tests --- Modules/_testlimitedcapi/unicode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 1646f5f111eecb..b20b60dd40f196 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1855,9 +1855,9 @@ unicode_export(PyObject *self, PyObject *args) } assert(view.format != NULL); - PyObject *res = Py_BuildValue("y#Iis", + PyObject *res = Py_BuildValue("y#iis", view.buf, view.len * view.itemsize, - (unsigned int)format, + (int)format, (int)view.itemsize, view.format); PyBuffer_Release(&view); return res; @@ -1870,8 +1870,8 @@ unicode_import(PyObject *self, PyObject *args) { const void *data; Py_ssize_t nbytes; - unsigned int format; - if (!PyArg_ParseTuple(args, "y#I", &data, &nbytes, &format)) { + int format; + if (!PyArg_ParseTuple(args, "y#i", &data, &nbytes, &format)) { return NULL; } return PyUnicode_Import(data, nbytes, (int32_t)format); From 078dfcfd5f7a234455ae10b70bfdc089cc6ff92f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 15:41:50 +0200 Subject: [PATCH 20/27] Update stable_abi: remove PyUnicode_GetBufferFormat() --- Doc/data/stable_abi.dat | 1 - Lib/test/test_stable_abi_ctypes.py | 1 - Misc/stable_abi.toml | 2 -- PC/python3dll.c | 1 - 4 files changed, 5 deletions(-) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index a6745986c2025e..e60d809e969c06 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -798,7 +798,6 @@ func,PyUnicode_FromOrdinal,3.2,, func,PyUnicode_FromString,3.2,, func,PyUnicode_FromStringAndSize,3.2,, func,PyUnicode_FromWideChar,3.2,, -func,PyUnicode_GetBufferFormat,3.14,, func,PyUnicode_GetDefaultEncoding,3.2,, func,PyUnicode_GetLength,3.7,, func,PyUnicode_Import,3.14,, diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index b496b43d4ef6cd..483f42c8d14ec3 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -820,7 +820,6 @@ def test_windows_feature_macros(self): "PyUnicode_FromString", "PyUnicode_FromStringAndSize", "PyUnicode_FromWideChar", - "PyUnicode_GetBufferFormat", "PyUnicode_GetDefaultEncoding", "PyUnicode_GetLength", "PyUnicode_GetSize", diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 7fb8971326a064..e21506a9ca5c63 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2538,7 +2538,5 @@ added = '3.14' [function.PyUnicode_Export] added = '3.14' -[function.PyUnicode_GetBufferFormat] - added = '3.14' [function.PyUnicode_Import] added = '3.14' diff --git a/PC/python3dll.c b/PC/python3dll.c index 1bfa238eb7054d..02206b14abcf82 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -731,7 +731,6 @@ EXPORT_FUNC(PyUnicode_FromStringAndSize) EXPORT_FUNC(PyUnicode_FromWideChar) EXPORT_FUNC(PyUnicode_FSConverter) EXPORT_FUNC(PyUnicode_FSDecoder) -EXPORT_FUNC(PyUnicode_GetBufferFormat) EXPORT_FUNC(PyUnicode_GetDefaultEncoding) EXPORT_FUNC(PyUnicode_GetLength) EXPORT_FUNC(PyUnicode_GetSize) From 79c6d01a3fb031e653927bb98b132dd50a483609 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 15:43:45 +0200 Subject: [PATCH 21/27] Revert "Use _PyUnicode_EncodeUTF16() and _PyUnicode_EncodeUTF32()" This reverts commit abf5c5836be7dfb8d09bae76284128fce00d8d0e. --- Modules/_testlimitedcapi/unicode.c | 24 +++++++++++++++++++++ Objects/unicodeobject.c | 34 ++++++++++++++++++++---------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index b20b60dd40f196..adb8db59b08883 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1854,6 +1854,30 @@ unicode_export(PyObject *self, PyObject *args) return NULL; } + // Make sure that the exported string ends with a NUL character + char *data = view.buf; + Py_ssize_t nbytes = view.len * view.itemsize; + switch (format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + assert(data[nbytes] == 0); + break; + case PyUnicode_FORMAT_UCS2: + assert(data[nbytes] == 0); + assert(data[nbytes + 1] == 0); + break; + case PyUnicode_FORMAT_UCS4: + assert(data[nbytes] == 0); + assert(data[nbytes + 1] == 0); + assert(data[nbytes + 2] == 0); + assert(data[nbytes + 3] == 0); + break; + case PyUnicode_FORMAT_UTF8: + assert(data[nbytes] == 0); + break; + } + assert(view.format != NULL); PyObject *res = Py_BuildValue("y#iis", view.buf, view.len * view.itemsize, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d7b7b2e8d50a23..d8d017e2c38693 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2335,7 +2335,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, static int32_t unicode_export(PyObject *obj, Py_buffer *view, Py_ssize_t len, const void *buf, - int itemsize, const char *format, int32_t export_format) + int itemsize, const char *format, int32_t internal_format) { if (PyBuffer_FillInfo(view, obj, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) { @@ -2343,7 +2343,8 @@ unicode_export(PyObject *obj, Py_buffer *view, } view->itemsize = itemsize; view->format = (char*)format; - return export_format; + view->internal = (void*)(uintptr_t)internal_format; + return internal_format; } @@ -2397,15 +2398,20 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, if (kind == PyUnicode_1BYTE_KIND && requested_formats & PyUnicode_FORMAT_UCS2) { - const int byteorder = (PY_BIG_ENDIAN == 1) ? 1 : -1; - PyObject *bytes = _PyUnicode_EncodeUTF16(unicode, NULL, byteorder); + PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 2); if (!bytes) { return -1; } - void *data = PyBytes_AS_STRING(bytes); + Py_UCS2 *ucs2 = (Py_UCS2*)PyBytes_AS_STRING(bytes); + + _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, + PyUnicode_1BYTE_DATA(unicode), + PyUnicode_1BYTE_DATA(unicode) + len, + ucs2); + ucs2[len] = 0; int32_t res = unicode_export(bytes, view, - len, data, + len, ucs2, 2, "H", PyUnicode_FORMAT_UCS2); Py_DECREF(bytes); return res; @@ -2422,14 +2428,20 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, // Convert ASCII, UCS1 or UCS2 to UCS4 if (requested_formats & PyUnicode_FORMAT_UCS4) { - const int byteorder = (PY_BIG_ENDIAN == 1) ? 1 : -1; - PyObject *bytes = _PyUnicode_EncodeUTF32(unicode, NULL, byteorder); - if (!bytes) { + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); + if (ucs4 == NULL) { + return -1; + } + + PyObject *bytes = PyBytes_FromStringAndSize((char*)ucs4, (len + 1) * 4); + PyMem_Free(ucs4); + if (bytes == NULL) { return -1; } - void *data = PyBytes_AS_STRING(bytes); + ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); + int32_t res = unicode_export(bytes, view, - len, data, + len, ucs4, 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); Py_DECREF(bytes); return res; From 5479ab217d98bd7e0a7098234db8f71c6bd308d6 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 12 Sep 2024 16:17:13 +0200 Subject: [PATCH 22/27] Allow surrogate characters in UTF-8 --- Lib/test/test_capi/test_unicode.py | 18 ++++--- Objects/unicodeobject.c | 77 ++++++++++++++++++++++-------- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6d34b95714f186..eb544f9c444a48 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1766,28 +1766,29 @@ def check_ucs1(text, formats): def check_ucs2(text, formats): self.assertEqual(unicode_export(text, formats), - (text.encode(ucs2_enc), + (text.encode(ucs2_enc, 'surrogatepass'), PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2)) def check_ucs4(text, formats): self.assertEqual(unicode_export(text, formats), - (text.encode(ucs4_enc), + (text.encode(ucs4_enc, 'surrogatepass'), PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4)) def check_utf8(text): self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8), - (text.encode('utf8'), + (text.encode('utf8', 'surrogatepass'), PyUnicode_FORMAT_UTF8, 1, 'B')) + # export as native format check_ucs1("abc", formats) check_ucs1("latin1:\xe9", formats) check_ucs2('ucs2:\u20ac', formats) check_ucs4('ucs4:\U0010ffff', formats) - # export ASCII as UCS1 + # convert ASCII to UCS1 check_ucs1("abc", PyUnicode_FORMAT_UCS1) - # export ASCII and UCS1 to UCS2 + # convert ASCII and UCS1 to UCS2 check_ucs2("abc", PyUnicode_FORMAT_UCS2) check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2) @@ -1797,12 +1798,17 @@ def check_utf8(text): check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4) check_ucs4('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4) - # always export to UTF8 + # always encode to UTF8 check_utf8("abc") check_utf8("latin1:\xe9") check_utf8('ucs2:\u20ac') check_utf8('ucs4:\U0010ffff') + # surrogates + check_ucs2('\udc80', PyUnicode_FORMAT_UCS2) + check_ucs4('\udc80', PyUnicode_FORMAT_UCS4) + check_utf8('\udc80') + # No supported format or invalid format for formats in (0, PyUnicode_FORMAT_INVALID): err_msg = "unable to find a matching export format" diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d8d017e2c38693..f71d7214e44916 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2335,7 +2335,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, static int32_t unicode_export(PyObject *obj, Py_buffer *view, Py_ssize_t len, const void *buf, - int itemsize, const char *format, int32_t internal_format) + int itemsize, const char *format, int32_t export_format) { if (PyBuffer_FillInfo(view, obj, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) { @@ -2343,8 +2343,31 @@ unicode_export(PyObject *obj, Py_buffer *view, } view->itemsize = itemsize; view->format = (char*)format; - view->internal = (void*)(uintptr_t)internal_format; - return internal_format; + return export_format; +} + + +static int32_t +unicode_export_bytes(PyObject *bytes, Py_buffer *view, + int itemsize, const char *format, int32_t export_format) +{ + const void *buf = PyBytes_AS_STRING(bytes); + assert((PyBytes_GET_SIZE(bytes) % itemsize) == 0); + Py_ssize_t len = PyBytes_GET_SIZE(bytes) / itemsize; + assert(len >= 1); + len--; // ignore the trailing NULL character + + if (PyBuffer_FillInfo(view, bytes, (void*)buf, len, + 1, PyBUF_SIMPLE) < 0) + { + Py_DECREF(bytes); + return -1; + } + Py_DECREF(bytes); + + view->itemsize = itemsize; + view->format = (char*)format; + return export_format; } @@ -2410,11 +2433,8 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, ucs2); ucs2[len] = 0; - int32_t res = unicode_export(bytes, view, - len, ucs2, - 2, "H", PyUnicode_FORMAT_UCS2); - Py_DECREF(bytes); - return res; + return unicode_export_bytes(bytes, view, + 2, "H", PyUnicode_FORMAT_UCS2); } // Native UCS4 @@ -2438,25 +2458,44 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, if (bytes == NULL) { return -1; } - ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); - int32_t res = unicode_export(bytes, view, - len, ucs4, - 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); - Py_DECREF(bytes); - return res; + return unicode_export_bytes(bytes, view, + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); } // Encode UCS1, UCS2 or UCS4 to UTF-8 if (requested_formats & PyUnicode_FORMAT_UTF8) { Py_ssize_t nbytes; const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes); - if (utf8 == NULL) { - return -1; + if (utf8 != NULL) { + return unicode_export(unicode, view, + nbytes, utf8, + 1, "B", PyUnicode_FORMAT_UTF8); } - return unicode_export(unicode, view, - nbytes, utf8, - 1, "B", PyUnicode_FORMAT_UTF8); + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { + PyErr_Clear(); + PyObject *bytes = _PyUnicode_AsUTF8String(unicode, "surrogatepass"); + if (bytes == NULL) { + return -1; + } + len = PyBytes_GET_SIZE(bytes); + + // Copy to add a NULL character + PyObject *bytes2 = PyBytes_FromStringAndSize(NULL, len + 1); + if (bytes2 == NULL) { + Py_DECREF(bytes); + return -1; + } + + char *str = PyBytes_AS_STRING(bytes2); + memcpy(str, PyBytes_AS_STRING(bytes), len); + str[len] = '\0'; + Py_DECREF(bytes); + + return unicode_export_bytes(bytes2, view, + 1, "B", PyUnicode_FORMAT_UTF8); + } + return -1; } PyErr_SetString(PyExc_ValueError, From f71f2307ff36d430a762b2647a7b34415b964ad3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 14 Sep 2024 00:12:41 +0200 Subject: [PATCH 23/27] Avoid a second copy in the UTF-8 export --- Objects/unicodeobject.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f71d7214e44916..13c5e340abd003 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2348,14 +2348,10 @@ unicode_export(PyObject *obj, Py_buffer *view, static int32_t -unicode_export_bytes(PyObject *bytes, Py_buffer *view, +unicode_export_bytes(PyObject *bytes, Py_buffer *view, Py_ssize_t len, int itemsize, const char *format, int32_t export_format) { const void *buf = PyBytes_AS_STRING(bytes); - assert((PyBytes_GET_SIZE(bytes) % itemsize) == 0); - Py_ssize_t len = PyBytes_GET_SIZE(bytes) / itemsize; - assert(len >= 1); - len--; // ignore the trailing NULL character if (PyBuffer_FillInfo(view, bytes, (void*)buf, len, 1, PyBUF_SIMPLE) < 0) @@ -2433,7 +2429,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, ucs2); ucs2[len] = 0; - return unicode_export_bytes(bytes, view, + return unicode_export_bytes(bytes, view, len, 2, "H", PyUnicode_FORMAT_UCS2); } @@ -2459,7 +2455,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, return -1; } - return unicode_export_bytes(bytes, view, + return unicode_export_bytes(bytes, view, len, 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); } @@ -2480,19 +2476,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, } len = PyBytes_GET_SIZE(bytes); - // Copy to add a NULL character - PyObject *bytes2 = PyBytes_FromStringAndSize(NULL, len + 1); - if (bytes2 == NULL) { - Py_DECREF(bytes); - return -1; - } - - char *str = PyBytes_AS_STRING(bytes2); - memcpy(str, PyBytes_AS_STRING(bytes), len); - str[len] = '\0'; - Py_DECREF(bytes); - - return unicode_export_bytes(bytes2, view, + return unicode_export_bytes(bytes, view, PyBytes_GET_SIZE(bytes), 1, "B", PyUnicode_FORMAT_UTF8); } return -1; From 492f10a1c120008fa9cea81d7c1c8c5e67ac46bf Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 14 Sep 2024 00:20:34 +0200 Subject: [PATCH 24/27] UCS-4 export: remove one memory copy --- Objects/unicodeobject.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 13c5e340abd003..a6f011d88a7ee0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -208,6 +208,9 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); #endif +static Py_UCS4* +as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, + int copy_null); // Return a reference to the immortal empty string singleton. @@ -2444,16 +2447,13 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, // Convert ASCII, UCS1 or UCS2 to UCS4 if (requested_formats & PyUnicode_FORMAT_UCS4) { - Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); - if (ucs4 == NULL) { - return -1; - } - - PyObject *bytes = PyBytes_FromStringAndSize((char*)ucs4, (len + 1) * 4); - PyMem_Free(ucs4); + PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 4); if (bytes == NULL) { return -1; } + Py_UCS4 *ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); + + (void)as_ucs4(unicode, ucs4, len + 1, 1); return unicode_export_bytes(bytes, view, len, 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); @@ -2709,15 +2709,14 @@ static Py_UCS4* as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, int copy_null) { - int kind; - const void *data; - Py_ssize_t len, targetlen; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); - len = PyUnicode_GET_LENGTH(string); - targetlen = len; - if (copy_null) + int kind = PyUnicode_KIND(string); + const void *data = PyUnicode_DATA(string); + Py_ssize_t len = PyUnicode_GET_LENGTH(string); + Py_ssize_t targetlen = len; + if (copy_null) { targetlen++; + } + if (!target) { target = PyMem_New(Py_UCS4, targetlen); if (!target) { @@ -2729,11 +2728,13 @@ as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, if (targetsize < targetlen) { PyErr_Format(PyExc_SystemError, "string is longer than the buffer"); - if (copy_null && 0 < targetsize) + if (copy_null && 0 < targetsize) { target[0] = 0; + } return NULL; } } + if (kind == PyUnicode_1BYTE_KIND) { const Py_UCS1 *start = (const Py_UCS1 *) data; _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); @@ -2748,8 +2749,10 @@ as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, else { Py_UNREACHABLE(); } - if (copy_null) + if (copy_null) { target[len] = 0; + } + return target; } From b031163710e9e16cca0390b9816b7438a4a45e96 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 16 Sep 2024 14:46:27 +0200 Subject: [PATCH 25/27] Update Py_buffer format Use "=H" and "=I" formats. --- Lib/test/test_capi/test_unicode.py | 9 ++------- Objects/unicodeobject.c | 18 ++++-------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index eb544f9c444a48..8dcb2fc02b9422 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1746,13 +1746,8 @@ def test_unicode_export(self): | PyUnicode_FORMAT_UCS2 | PyUnicode_FORMAT_UCS4) BUFFER_UCS1 = 'B' - BUFFER_UCS2 = 'H' - if struct.calcsize('I') == 4: - BUFFER_UCS4 = 'I' - elif struct.calcsize('L') == 4: - BUFFER_UCS4 = 'L' - else: - self.fail("unable to get BUFFER_UCS4 ") + BUFFER_UCS2 = '=H' + BUFFER_UCS4 = '=I' def check_ucs1(text, formats): if formats == PyUnicode_FORMAT_UCS1: diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a6f011d88a7ee0..213fce11cc1f9c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2374,14 +2374,6 @@ int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, Py_buffer *view) { -#if SIZEOF_INT == 4 -# define BUFFER_UCS4 "I" -#elif SIZEOF_LONG == 4 -# define BUFFER_UCS4 "L" -#else -# error "unable to find BUFFER_UCS4" -#endif - if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); return -1; @@ -2413,7 +2405,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, { return unicode_export(unicode, view, len, PyUnicode_2BYTE_DATA(unicode), - 2, "H", PyUnicode_FORMAT_UCS2); + 2, "=H", PyUnicode_FORMAT_UCS2); } // Convert ASCII or UCS1 to UCS2 @@ -2433,7 +2425,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, ucs2[len] = 0; return unicode_export_bytes(bytes, view, len, - 2, "H", PyUnicode_FORMAT_UCS2); + 2, "=H", PyUnicode_FORMAT_UCS2); } // Native UCS4 @@ -2442,7 +2434,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, { return unicode_export(unicode, view, len, PyUnicode_4BYTE_DATA(unicode), - 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + 4, "=I", PyUnicode_FORMAT_UCS4); } // Convert ASCII, UCS1 or UCS2 to UCS4 @@ -2456,7 +2448,7 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, (void)as_ucs4(unicode, ucs4, len + 1, 1); return unicode_export_bytes(bytes, view, len, - 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + 4, "=I", PyUnicode_FORMAT_UCS4); } // Encode UCS1, UCS2 or UCS4 to UTF-8 @@ -2485,8 +2477,6 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, PyErr_SetString(PyExc_ValueError, "unable to find a matching export format"); return -1; - -#undef BUFFER_UCS4 } From 21e60125b654cc949e0560f8d490d817ad74fc54 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 23 Sep 2024 17:50:53 +0200 Subject: [PATCH 26/27] Add PyUnicode_EXPORT_COPY flag --- Doc/c-api/unicode.rst | 22 +++++- Include/unicodeobject.h | 4 + Lib/test/test_capi/test_unicode.py | 117 ++++++++++++++++------------- Modules/_testlimitedcapi/unicode.c | 6 +- Objects/unicodeobject.c | 17 +++-- 5 files changed, 103 insertions(+), 63 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index b521f48b3dd58b..2c216a5dd0ed20 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,7 +341,7 @@ APIs: .. versionadded:: 3.3 -.. c:function:: int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, Py_buffer *view) +.. c:function:: int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, uint32_t flags, Py_buffer *view) Export the contents of the *unicode* string in one of the *requested_formats*. @@ -380,6 +380,26 @@ APIs: Note that future versions of Python may introduce additional formats. + By default, if the :c:macro:`PyUnicode_EXPORT_COPY` flag is not set in + *flags*, no memory is copied and no conversion is done. + + If the :c:macro:`PyUnicode_EXPORT_COPY` flag is set in *flags*, the function + can copy memory to provide the requested format and convert from a format + to another. + + The :c:macro:`PyUnicode_EXPORT_COPY` flag is needed to export to + :c:macro:`PyUnicode_FORMAT_UTF8` a string containing surrogate characters. + + Available flags: + + .. c:namespace:: NULL + + ================================== ======== =================== + Flag Value Description + ================================== ======== =================== + .. c:macro:: PyUnicode_EXPORT_COPY ``0x01`` Allow memory copies + ================================== ======== =================== + .. versionadded:: 3.14 diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 878f28b8a61acb..5b1eb15f2703e4 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -255,9 +255,13 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( #define PyUnicode_FORMAT_UTF8 0x08 // char* #define PyUnicode_FORMAT_ASCII 0x10 // char* (ASCII string) +#define PyUnicode_EXPORT_COPY 0x01 + + PyAPI_FUNC(int32_t) PyUnicode_Export( PyObject *unicode, int32_t requested_formats, + uint32_t flags, Py_buffer *view); PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 8dcb2fc02b9422..b6ecc2a5a6b811 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -37,6 +37,9 @@ class Str(str): # Invalid native format PyUnicode_FORMAT_INVALID = 0x20 +PyUnicode_EXPORT_COPY = 0x01 + + class CAPITest(unittest.TestCase): @support.cpython_only @@ -1749,31 +1752,36 @@ def test_unicode_export(self): BUFFER_UCS2 = '=H' BUFFER_UCS4 = '=I' - def check_ucs1(text, formats): + def check_ucs1(text, formats, flags=0): if formats == PyUnicode_FORMAT_UCS1: export_format = PyUnicode_FORMAT_UCS1 elif text.isascii(): export_format = PyUnicode_FORMAT_ASCII else: export_format = PyUnicode_FORMAT_UCS1 - self.assertEqual(unicode_export(text, formats), + self.assertEqual(unicode_export(text, formats, flags), (text.encode('latin1'), export_format, 1, BUFFER_UCS1)) - def check_ucs2(text, formats): - self.assertEqual(unicode_export(text, formats), + def check_ucs2(text, formats, flags=0): + self.assertEqual(unicode_export(text, formats, flags), (text.encode(ucs2_enc, 'surrogatepass'), PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2)) - def check_ucs4(text, formats): - self.assertEqual(unicode_export(text, formats), + def check_ucs4(text, formats, flags=0): + self.assertEqual(unicode_export(text, formats, flags), (text.encode(ucs4_enc, 'surrogatepass'), PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4)) - def check_utf8(text): - self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8), + def check_utf8(text, flags=0): + self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8, flags), (text.encode('utf8', 'surrogatepass'), PyUnicode_FORMAT_UTF8, 1, 'B')) + def check_no_matching_format(text, formats, flags=0): + err_msg = "unable to find a matching export format" + with self.assertRaisesRegex(ValueError, err_msg): + unicode_export('abc', formats, flags) + # export as native format check_ucs1("abc", formats) check_ucs1("latin1:\xe9", formats) @@ -1783,15 +1791,19 @@ def check_utf8(text): # convert ASCII to UCS1 check_ucs1("abc", PyUnicode_FORMAT_UCS1) - # convert ASCII and UCS1 to UCS2 - check_ucs2("abc", PyUnicode_FORMAT_UCS2) - check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2) + # convert to UCS2 (need PyUnicode_EXPORT_COPY) + check_no_matching_format("abc", PyUnicode_FORMAT_UCS2) + check_no_matching_format("latin1:\xe9", PyUnicode_FORMAT_UCS2) + check_ucs2("abc", PyUnicode_FORMAT_UCS2, PyUnicode_EXPORT_COPY) + check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2, PyUnicode_EXPORT_COPY) - # always export to UCS4 - check_ucs4("abc", PyUnicode_FORMAT_UCS4) - check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4) - check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4) - check_ucs4('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4) + # convert to UCS4 (need PyUnicode_EXPORT_COPY) + check_no_matching_format("abc", PyUnicode_FORMAT_UCS4) + check_no_matching_format("latin1:\xe9", PyUnicode_FORMAT_UCS4) + check_no_matching_format('ucs2:\u20ac', PyUnicode_FORMAT_UCS4) + check_ucs4("abc", PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) # always encode to UTF8 check_utf8("abc") @@ -1801,15 +1813,13 @@ def check_utf8(text): # surrogates check_ucs2('\udc80', PyUnicode_FORMAT_UCS2) - check_ucs4('\udc80', PyUnicode_FORMAT_UCS4) - check_utf8('\udc80') + check_ucs4('\udc80', PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + check_utf8('\udc80', PyUnicode_EXPORT_COPY) # No supported format or invalid format for formats in (0, PyUnicode_FORMAT_INVALID): - err_msg = "unable to find a matching export format" with self.subTest(formats=formats): - with self.assertRaisesRegex(ValueError, err_msg): - unicode_export('abc', formats) + check_no_matching_format('abc', formats) def test_unicode_import(self): # Test PyUnicode_Import() @@ -1867,6 +1877,39 @@ def test_unicode_import(self): with self.assertRaises(ValueError): unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4) + def test_unicode_export_import_roundtrip(self): + unicode_export = _testlimitedcapi.unicode_export + unicode_import = _testlimitedcapi.unicode_import + + ASCII = PyUnicode_FORMAT_ASCII + UCS1 = PyUnicode_FORMAT_UCS1 + UCS2 = PyUnicode_FORMAT_UCS2 + UCS4 = PyUnicode_FORMAT_UCS4 + UTF8 = PyUnicode_FORMAT_UTF8 + ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8) + + def roundtrip(string, formats): + export = unicode_export(string, formats, PyUnicode_EXPORT_COPY) + buf, buf_fmt, item_size, view_fmt = export + self.assertEqual(unicode_import(buf, buf_fmt), string) + + for string, allowed_formats in ( + ('', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}), + ('ucs2:\u20ac', {UCS2, UCS4, UTF8}), + ('ucs4:\U0001f638', {UCS4, UTF8}), + ): + for formats in ASCII, UCS1, UCS2, UCS4, UTF8: + with self.subTest(string=string, formats=formats): + if formats not in allowed_formats: + with self.assertRaises(ValueError): + unicode_export(string, formats, PyUnicode_EXPORT_COPY) + else: + roundtrip(string, formats) + + roundtrip(string, ALL) + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): @@ -2049,38 +2092,6 @@ def test_recover_error(self): self.assertEqual(writer.finish(), 'Hello World.') - def test_unicode_export_import_roundtrip(self): - unicode_export = _testlimitedcapi.unicode_export - unicode_import = _testlimitedcapi.unicode_import - - ASCII = PyUnicode_FORMAT_ASCII - UCS1 = PyUnicode_FORMAT_UCS1 - UCS2 = PyUnicode_FORMAT_UCS2 - UCS4 = PyUnicode_FORMAT_UCS4 - UTF8 = PyUnicode_FORMAT_UTF8 - ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8) - - def roundtrip(string, formats): - buf, buf_fmt, item_size, view_fmt = unicode_export(string, formats) - self.assertEqual(unicode_import(buf, buf_fmt), string) - - for string, allowed_formats in ( - ('', {ASCII, UCS1, UCS2, UCS4, UTF8}), - ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}), - ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}), - ('ucs2:\u20ac', {UCS2, UCS4, UTF8}), - ('ucs4:\U0001f638', {UCS4, UTF8}), - ): - for formats in ASCII, UCS1, UCS2, UCS4, UTF8: - with self.subTest(string=string, formats=formats): - if formats not in allowed_formats: - with self.assertRaises(ValueError): - unicode_export(string, formats) - else: - roundtrip(string, formats) - - roundtrip(string, ALL) - if __name__ == '__main__': unittest.main() diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index adb8db59b08883..9b6c0ee9a9d38f 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1843,13 +1843,13 @@ static PyObject* unicode_export(PyObject *self, PyObject *args) { PyObject *obj; - unsigned int requested_formats; - if (!PyArg_ParseTuple(args, "OI", &obj, &requested_formats)) { + unsigned int requested_formats, flags; + if (!PyArg_ParseTuple(args, "OII", &obj, &requested_formats, &flags)) { return NULL; } Py_buffer view; - int32_t format = PyUnicode_Export(obj, requested_formats, &view); + int32_t format = PyUnicode_Export(obj, requested_formats, flags, &view); if (format < 0) { return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 213fce11cc1f9c..2f907e2558d534 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2372,7 +2372,7 @@ unicode_export_bytes(PyObject *bytes, Py_buffer *view, Py_ssize_t len, int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, - Py_buffer *view) + uint32_t flags, Py_buffer *view) { if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); @@ -2408,8 +2408,9 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, 2, "=H", PyUnicode_FORMAT_UCS2); } - // Convert ASCII or UCS1 to UCS2 - if (kind == PyUnicode_1BYTE_KIND + // Convert ASCII or UCS1 to UCS2 (need PyUnicode_EXPORT_COPY) + if (flags & PyUnicode_EXPORT_COPY + && kind == PyUnicode_1BYTE_KIND && requested_formats & PyUnicode_FORMAT_UCS2) { PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 2); @@ -2437,8 +2438,10 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, 4, "=I", PyUnicode_FORMAT_UCS4); } - // Convert ASCII, UCS1 or UCS2 to UCS4 - if (requested_formats & PyUnicode_FORMAT_UCS4) { + // Convert ASCII, UCS1 or UCS2 to UCS4 (need PyUnicode_EXPORT_COPY) + if (flags & PyUnicode_EXPORT_COPY + && requested_formats & PyUnicode_FORMAT_UCS4) + { PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 4); if (bytes == NULL) { return -1; @@ -2460,7 +2463,9 @@ PyUnicode_Export(PyObject *unicode, int32_t requested_formats, nbytes, utf8, 1, "B", PyUnicode_FORMAT_UTF8); } - if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { + if (flags & PyUnicode_EXPORT_COPY + && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + { PyErr_Clear(); PyObject *bytes = _PyUnicode_AsUTF8String(unicode, "surrogatepass"); if (bytes == NULL) { From 3267ce69776bc9ccddaf64405b11f75bd20c326b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 23 Sep 2024 17:55:20 +0200 Subject: [PATCH 27/27] doc --- Doc/c-api/unicode.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 2c216a5dd0ed20..4182d87472d546 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -394,11 +394,11 @@ APIs: .. c:namespace:: NULL - ================================== ======== =================== + ================================== ======== =================================== Flag Value Description - ================================== ======== =================== - .. c:macro:: PyUnicode_EXPORT_COPY ``0x01`` Allow memory copies - ================================== ======== =================== + ================================== ======== =================================== + .. c:macro:: PyUnicode_EXPORT_COPY ``0x01`` Allow memory copies and conversions + ================================== ======== =================================== .. versionadded:: 3.14