gh-119609: Add PyUnicode_Export() function · python/cpython@c84f314 · GitHub
[go: up one dir, main page]

Skip to content

Commit c84f314

Browse files
committed
gh-119609: Add PyUnicode_Export() function
Add PyUnicode_Export(), PyUnicode_GetBufferFormat() and PyUnicode_Import() functions to the limited C API.
1 parent 092abc4 commit c84f314

File tree

11 files changed

+584
-4
lines changed

11 files changed

+584
-4
lines changed

Doc/c-api/unicode.rst

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,71 @@ APIs:
341341
.. versionadded:: 3.3
342342
343343
344+
.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view)
345+
346+
Export the contents of the *unicode* string in one of the formats requested by
347+
*requested_formats*.
348+
349+
* On success, fill *view*, and return ``0``.
350+
* On error, set an exception and return ``-1``.
351+
352+
The export must be released by :c:func:`PyBuffer_Release`.
353+
The contents of the buffer are valid until the export is released.
354+
355+
The buffer is read-only and must not be modified.
356+
357+
*unicode* and *view* must not be NULL.
358+
359+
Available formats:
360+
361+
.. c:namespace:: NULL
362+
363+
=================================== ======== ===========================
364+
Constant Identifier Value Description
365+
=================================== ======== ===========================
366+
.. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``)
367+
.. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``)
368+
.. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``)
369+
.. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``)
370+
.. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``)
371+
=================================== ======== ===========================
372+
373+
*requested_formats* can be a single format or a bitwise combination of the
374+
formats in the table above.
375+
On success, *view* is filled using a single one of the requested formats; call :c:func:`PyUnicode_GetBufferFormat` to find out which one was used.
376+
377+
Note that future versions of Python may introduce additional formats.
378+
379+
.. versionadded:: 3.14
380+
381+
382+
.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format)
383+
384+
Get the format of the buffer *view*.
385+
386+
* On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*`` value
387+
and return ``0``.
388+
* On error, set an exception and return ``-1``.
389+
390+
*view* must be a buffer filled by :c:func:`PyUnicode_Export`.
391+
392+
.. versionadded:: 3.14
393+
394+
395+
.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format)
396+
397+
Create a string object from a buffer in an “export format”.
398+
399+
* Return a reference to a new string object on success.
400+
* Set an exception and return ``NULL`` on error.
401+
402+
*data* must not be NULL. *nbytes* must be positive or zero.
403+
404+
See :c:func:`PyUnicode_Export` for the available formats.
405+
406+
.. versionadded:: 3.14
407+
408+
344409
.. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
345410
Py_ssize_t size)
346411

Doc/data/stable_abi.dat

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Doc/whatsnew/3.14.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,10 @@ New Features
529529

530530
(Contributed by Victor Stinner in :gh:`107954`.)
531531

532+
* Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`,
533+
and :c:func:`PyUnicode_Import` functions to export and import strings.
534+
(Contributed by Victor Stinner in :gh:`119609`.)
535+
532536

533537
Porting to Python 3.14
534538
----------------------

Include/unicodeobject.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
248248
const char *u /* UTF-8 encoded string */
249249
);
250250

251+
#define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string)
252+
#define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1*
253+
#define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2*
254+
#define PyUnicode_FORMAT_UCS4 0x08 // Py_UCS4*
255+
#define PyUnicode_FORMAT_UTF8 0x10 // char*
256+
257+
PyAPI_FUNC(int) PyUnicode_Export(
258+
PyObject *unicode,
259+
uint32_t requested_formats,
260+
Py_buffer *view);
261+
PyAPI_FUNC(int) PyUnicode_GetBufferFormat(
262+
const Py_buffer *view,
263+
uint32_t *format);
264+
PyAPI_FUNC(PyObject*) PyUnicode_Import(
265+
const void *data,
266+
Py_ssize_t nbytes,
267+
uint32_t format);
268+
251269
/* --- wchar_t support for platforms which support it --------------------- */
252270

253271
#ifdef HAVE_WCHAR_H

Lib/test/test_capi/test_unicode.py

Lines changed: 180 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
import unittest
1+
import struct
22
import sys
3+
import unittest
34
from test import support
45
from test.support import import_helper
56

@@ -28,6 +29,14 @@ class Str(str):
2829
pass
2930

3031

32+
PyUnicode_FORMAT_ASCII = 0x01
33+
PyUnicode_FORMAT_UCS1 = 0x02
34+
PyUnicode_FORMAT_UCS2 = 0x04
35+
PyUnicode_FORMAT_UCS4 = 0x08
36+
PyUnicode_FORMAT_UTF8 = 0x10
37+
# Invalid native format
38+
PyUnicode_FORMAT_INVALID = 0x20
39+
3140
class CAPITest(unittest.TestCase):
3241

3342
@support.cpython_only
@@ -1721,6 +1730,142 @@ def test_pep393_utf8_caching_bug(self):
17211730
# Check that the second call returns the same result
17221731
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
17231732

1733+
def test_unicode_export(self):
1734+
# Test PyUnicode_Export()
1735+
unicode_export = _testlimitedcapi.unicode_export
1736+
if sys.byteorder == 'little':
1737+
ucs2_enc = 'utf-16le'
1738+
ucs4_enc = 'utf-32le'
1739+
else:
1740+
ucs2_enc = 'utf-16be'
1741+
ucs4_enc = 'utf-32be'
1742+
1743+
# export to the native format
1744+
formats = (PyUnicode_FORMAT_ASCII
1745+
| PyUnicode_FORMAT_UCS1
1746+
| PyUnicode_FORMAT_UCS2
1747+
| PyUnicode_FORMAT_UCS4)
1748+
BUFFER_UCS1 = 'B'
1749+
BUFFER_UCS2 = 'H'
1750+
if struct.calcsize('I') == 4:
1751+
BUFFER_UCS4 = 'I'
1752+
elif struct.calcsize('L') == 4:
1753+
BUFFER_UCS4 = 'L'
1754+
else:
1755+
self.fail("unable to get BUFFER_UCS4 ")
1756+
1757+
def check_ucs1(text, formats):
1758+
if formats == PyUnicode_FORMAT_UCS1:
1759+
export_format = PyUnicode_FORMAT_UCS1
1760+
elif text.isascii():
1761+
export_format = PyUnicode_FORMAT_ASCII
1762+
else:
1763+
export_format = PyUnicode_FORMAT_UCS1
1764+
self.assertEqual(unicode_export(text, formats),
1765+
(text.encode('latin1'), export_format, 1, BUFFER_UCS1))
1766+
1767+
def check_ucs2(text, formats):
1768+
self.assertEqual(unicode_export(text, formats),
1769+
(text.encode(ucs2_enc),
1770+
PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
1771+
1772+
def check_ucs4(text, formats):
1773+
self.assertEqual(unicode_export(text, formats),
1774+
(text.encode(ucs4_enc),
1775+
PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
1776+
1777+
def check_utf8(text):
1778+
self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8),
1779+
(text.encode('utf8'),
1780+
PyUnicode_FORMAT_UTF8, 1, 'B'))
1781+
1782+
check_ucs1("abc", formats)
1783+
check_ucs1("latin1:\xe9", formats)
1784+
check_ucs2('ucs2:\u20ac', formats)
1785+
check_ucs4('ucs4:\U0010ffff', formats)
1786+
1787+
# export ASCII as UCS1
1788+
check_ucs1("abc", PyUnicode_FORMAT_UCS1)
1789+
1790+
# export ASCII and UCS1 to UCS2
1791+
check_ucs2("abc", PyUnicode_FORMAT_UCS2)
1792+
check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2)
1793+
1794+
# always export to UCS4
1795+
check_ucs4("abc", PyUnicode_FORMAT_UCS4)
1796+
check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4)
1797+
check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4)
1798+
check_ucs4('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4)
1799+
1800+
# always export to UTF8
1801+
check_utf8("abc")
1802+
check_utf8("latin1:\xe9")
1803+
check_utf8('ucs2:\u20ac')
1804+
check_utf8('ucs4:\U0010ffff')
1805+
1806+
# No supported format or invalid format
1807+
for formats in (0, PyUnicode_FORMAT_INVALID):
1808+
err_msg = "unable to find a matching export format"
1809+
with self.subTest(formats=formats):
1810+
with self.assertRaisesRegex(ValueError, err_msg):
1811+
unicode_export('abc', formats)
1812+
1813+
def test_unicode_import(self):
1814+
# Test PyUnicode_Import()
1815+
unicode_import = _testlimitedcapi.unicode_import
1816+
if sys.byteorder == 'little':
1817+
ucs2_enc = 'utf-16le'
1818+
ucs4_enc = 'utf-32le'
1819+
else:
1820+
ucs2_enc = 'utf-16be'
1821+
ucs4_enc = 'utf-32be'
1822+
1823+
self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
1824+
"abc")
1825+
self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
1826+
"latin1:\xe9")
1827+
1828+
self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
1829+
PyUnicode_FORMAT_UCS2),
1830+
'ucs2:\u20ac')
1831+
1832+
self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
1833+
PyUnicode_FORMAT_UCS4),
1834+
'ucs4:\U0010ffff')
1835+
1836+
text = "abc\xe9\U0010ffff"
1837+
self.assertEqual(unicode_import(text.encode('utf8'),
1838+
PyUnicode_FORMAT_UTF8),
1839+
text)
1840+
1841+
# Empty string
1842+
for native_format in (
1843+
PyUnicode_FORMAT_ASCII,
1844+
PyUnicode_FORMAT_UCS1,
1845+
PyUnicode_FORMAT_UCS2,
1846+
PyUnicode_FORMAT_UCS4,
1847+
PyUnicode_FORMAT_UTF8,
1848+
):
1849+
with self.subTest(native_format=native_format):
1850+
self.assertEqual(unicode_import(b'', native_format),
1851+
'')
1852+
1853+
# Invalid format
1854+
with self.assertRaises(ValueError):
1855+
unicode_import(b'', PyUnicode_FORMAT_INVALID)
1856+
1857+
# Invalid size
1858+
ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
1859+
with self.assertRaises(ValueError):
1860+
unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
1861+
ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
1862+
with self.assertRaises(ValueError):
1863+
unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
1864+
with self.assertRaises(ValueError):
1865+
unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
1866+
with self.assertRaises(ValueError):
1867+
unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
1868+
17241869

17251870
class PyUnicodeWriterTest(unittest.TestCase):
17261871
def create_writer(self, size):
@@ -1903,6 +2048,38 @@ def test_recover_error(self):
19032048

19042049
self.assertEqual(writer.finish(), 'Hello World.')
19052050

1906-
1907-
if __name__ == "__main__":
2051+
def test_unicode_export_import_roundtrip(self):
2052+
unicode_export = _testlimitedcapi.unicode_export
2053+
unicode_import = _testlimitedcapi.unicode_import
2054+
2055+
ASCII = PyUnicode_FORMAT_ASCII
2056+
UCS1 = PyUnicode_FORMAT_UCS1
2057+
UCS2 = PyUnicode_FORMAT_UCS2
2058+
UCS4 = PyUnicode_FORMAT_UCS4
2059+
UTF8 = PyUnicode_FORMAT_UTF8
2060+
ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8)
2061+
2062+
def roundtrip(string, formats):
2063+
buf, buf_fmt, item_size, view_fmt = unicode_export(string, formats)
2064+
self.assertEqual(unicode_import(buf, buf_fmt), string)
2065+
2066+
for string, allowed_formats in (
2067+
('', {ASCII, UCS1, UCS2, UCS4, UTF8}),
2068+
('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}),
2069+
('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
2070+
('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
2071+
('ucs4:\U0001f638', {UCS4, UTF8}),
2072+
):
2073+
for formats in ASCII, UCS1, UCS2, UCS4, UTF8:
2074+
with self.subTest(string=string, formats=formats):
2075+
if formats not in allowed_formats:
2076+
with self.assertRaises(ValueError):
2077+
unicode_export(string, formats)
2078+
else:
2079+
roundtrip(string, formats)
2080+
2081+
roundtrip(string, ALL)
2082+
2083+
2084+
if __name__ == '__main__':
19082085
unittest.main()

Lib/test/test_stable_abi_ctypes.py

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, and
2+
:c:func:`PyUnicode_Import` functions to export and import strings. Patch by
3+
Victor Stinner.

Misc/stable_abi.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2526,3 +2526,19 @@
25262526
added = '3.14'
25272527
[function.PyLong_AsUInt64]
25282528
added = '3.14'
2529+
[const.PyUnicode_FORMAT_ASCII]
2530+
added = '3.14'
2531+
[const.PyUnicode_FORMAT_UCS1]
2532+
added = '3.14'
2533+
[const.PyUnicode_FORMAT_UCS2]
2534+
added = '3.14'
2535+
[const.PyUnicode_FORMAT_UCS4]
2536+
added = '3.14'
2537+
[const.PyUnicode_FORMAT_UTF8]
2538+
added = '3.14'
2539+
[function.PyUnicode_Export]
2540+
added = '3.14'
2541+
[function.PyUnicode_GetBufferFormat]
2542+
added = '3.14'
2543+
[function.PyUnicode_Import]
2544+
added = '3.14'

0 commit comments

Comments
 (0)
0