gh-119609: Add PyUnicode_Export() and PyUnicode_Import() functions by vstinner · Pull Request #119610 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-119609: Add PyUnicode_Export() and PyUnicode_Import() functions #119610

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
vstinner wants to merge 13 commits into python/cpython
Closed
Prev Previous commit
Next Next commit
Export UCS1 as UCS2
Co-Authored-By: Petr Viktorin <encukou@gmail.com>
  • Loading branch information
vstinner and encukou committed Jun 21, 2024
commit 1310b6d80ede2eb3aa3f15bdd299bdfde411889b
6 changes: 6 additions & 0 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,12 @@ def test_unicode_export(self):
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1),
(b'abc', PyUnicode_FORMAT_UCS1))

# export ASCII and UCS1 to UCS2
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS2),
('abc'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2))
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS2),
('latin1:\xe9'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2))

# always export to UCS4
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
Expand Down
30 changes: 27 additions & 3 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2105,9 +2105,9 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,
PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode);
goto error;
}

Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

// Native ASCII
if (PyUnicode_IS_ASCII(unicode)
&& (requested_formats & PyUnicode_FORMAT_ASCII))
{
Expand All @@ -2116,6 +2116,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,
return PyUnicode_1BYTE_DATA(unicode);
}

// Native UCS1
int kind = PyUnicode_KIND(unicode);
if (kind == PyUnicode_1BYTE_KIND
&& (requested_formats & PyUnicode_FORMAT_UCS1))
Expand All @@ -2125,6 +2126,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,
return PyUnicode_1BYTE_DATA(unicode);
}

// Native UCS2
if (kind == PyUnicode_2BYTE_KIND
&& (requested_formats & PyUnicode_FORMAT_UCS2))
{
Expand All @@ -2133,6 +2135,28 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,
return PyUnicode_2BYTE_DATA(unicode);
}

// Convert ASCII or UCS1 to UCS2
if (kind == PyUnicode_1BYTE_KIND
&& requested_formats & PyUnicode_FORMAT_UCS2)
{
Py_UCS2 *ucs2 = PyMem_Malloc((len + 1) * sizeof(Py_UCS2));
if (!ucs2) {
PyErr_NoMemory();
goto error;
}

_PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2,
PyUnicode_1BYTE_DATA(unicode),
PyUnicode_1BYTE_DATA(unicode) + len,
ucs2);
ucs2[len] = 0;

*format = PyUnicode_FORMAT_UCS2;
*size = len * 2;
return ucs2;
}

// Native UCS4
if (kind == PyUnicode_4BYTE_KIND
&& (requested_formats & PyUnicode_FORMAT_UCS4))
{
Expand All @@ -2141,8 +2165,8 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,
return PyUnicode_4BYTE_DATA(unicode);
}

// Convert ASCII, UCS1 or UCS2 to UCS4
if (requested_formats & PyUnicode_FORMAT_UCS4) {
// Convert UCS1 or UCS2 to UCS4
Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode);
if (ucs4 == NULL) {
goto error;
Expand All @@ -2152,6 +2176,7 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,
return ucs4;
}

// Convert to UTF-8
if (requested_formats & PyUnicode_FORMAT_UTF8) {
// Encode UCS1, UCS2 or UCS4 to UTF-8
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, size);
Expand All @@ -2164,7 +2189,6 @@ PyUnicode_Export(PyObject *unicode, uint32_t requested_formats,

PyErr_Format(PyExc_ValueError, "unable to find a matching export format");


error:
*size = 0;
*format = 0;
Expand Down
0