gh-119182: Add PyUnicodeWriter C API by vstinner · Pull Request #119184 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-119182: Add PyUnicodeWriter C API #119184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jun 17, 2024
Merged
Next Next commit
gh-119182: Add PyUnicodeWriter C API
  • Loading branch information
vstinner committed Jun 7, 2024
commit 3c4da2e470e51eb8dec58e21f4cabe8c1032cb93
41 changes: 39 additions & 2 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,44 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
Py_ssize_t size);


/* --- _PyUnicodeWriter API ----------------------------------------------- */
/* --- Public PyUnicodeWriter API ----------------------------------------- */

// Writer handle used to build a Python str piece by piece.
// Only the type name is declared here; the structure layout is not exposed
// by these declarations, so callers hold it by pointer only.
typedef struct PyUnicodeWriter PyUnicodeWriter;

// Create a new writer. Callers check the result against NULL to detect
// failure.
PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(void);
// Abandon a writer without producing a string; this is what callers invoke
// on their error paths.
// NOTE(review): presumably releases the writer and its buffer -- confirm
// against the implementation.
PyAPI_FUNC(void) PyUnicodeWriter_Discard(PyUnicodeWriter *writer);
// Produce the final string object from everything written so far. Callers
// check the result against NULL to detect failure.
// NOTE(review): callers do not call PyUnicodeWriter_Discard() after this
// function, even when it returns NULL, which suggests it always consumes the
// writer -- confirm against the implementation.
PyAPI_FUNC(PyObject*) PyUnicodeWriter_Finish(PyUnicodeWriter *writer);

// Set the writer's overallocation flag.
// NOTE(review): presumably non-zero enables overallocation of the internal
// buffer to speed up repeated writes -- confirm against the implementation.
PyAPI_FUNC(void) PyUnicodeWriter_SetOverallocate(
PyUnicodeWriter *writer,
int overallocate);

// Append the single code point `ch`. A negative return value signals an
// error.
PyAPI_FUNC(int) PyUnicodeWriter_WriteChar(
PyUnicodeWriter *writer,
Py_UCS4 ch);
// Append text decoded from the UTF-8 bytes at `str`. A negative return value
// signals an error; invalid UTF-8 input is reported as UnicodeDecodeError.
// NOTE(review): callers pass size == -1 together with NUL-terminated
// literals, so -1 presumably means "use strlen(str)" -- confirm.
PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
PyUnicodeWriter *writer,
const char *str,
Py_ssize_t size);

// Append the string object `str`. A negative return value signals an error.
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
PyUnicodeWriter *writer,
PyObject *str);
// Append the repr of `obj` (for the str "repr" this writes 'repr', quotes
// included). A negative return value signals an error.
PyAPI_FUNC(int) PyUnicodeWriter_WriteRepr(
PyUnicodeWriter *writer,
PyObject *obj);
// Append the characters of `str` from index `start` (inclusive) up to index
// `end` (exclusive): writing "[long]" with start=1, end=5 appends "long".
// A negative return value signals an error.
PyAPI_FUNC(int) PyUnicodeWriter_WriteSubstring(
PyUnicodeWriter *writer,
PyObject *str,
Py_ssize_t start,
Py_ssize_t end);
// Append text built from a printf-style `format` string and the variadic
// arguments that follow it (for example "%s %i" with a C string and an int).
// A negative return value signals an error.
// NOTE(review): the full set of supported conversions is not visible here;
// presumably the same as PyUnicode_FromFormat() -- confirm.
PyAPI_FUNC(int) PyUnicodeWriter_Format(
PyUnicodeWriter *writer,
const char *format,
...);


/* --- Private _PyUnicodeWriter API --------------------------------------- */

typedef struct {
PyObject *buffer;
Expand All @@ -466,7 +503,7 @@ typedef struct {
/* If readonly is 1, buffer is a shared string (cannot be modified)
and size is set to 0. */
unsigned char readonly;
} _PyUnicodeWriter ;
} _PyUnicodeWriter;

// Initialize a Unicode writer.
//
Expand Down
165 changes: 165 additions & 0 deletions Modules/_testcapi/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,167 @@ unicode_copycharacters(PyObject *self, PyObject *args)
}


// Exercise the basic PyUnicodeWriter calls (SetOverallocate, WriteUTF8,
// WriteChar, WriteSubstring, WriteStr, WriteRepr and Finish) and check that
// the assembled string equals "var=long value 'repr'".
//
// Returns None on success, or NULL as soon as any step fails.
static PyObject *
test_unicodewriter(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyObject *piece = NULL;
    PyObject *built = NULL;
    int status = 0;

    PyUnicodeWriter *w = PyUnicodeWriter_Create();
    if (!w) {
        return NULL;
    }

    // PyUnicodeWriter_SetOverallocate()
    PyUnicodeWriter_SetOverallocate(w, 1);

    // PyUnicodeWriter_WriteUTF8() followed by PyUnicodeWriter_WriteChar();
    // || short-circuits, so WriteChar is skipped when WriteUTF8 fails.
    if (PyUnicodeWriter_WriteUTF8(w, "var", -1) < 0
        || PyUnicodeWriter_WriteChar(w, '=') < 0)
    {
        goto fail;
    }

    // PyUnicodeWriter_WriteSubstring(): keeps "long" out of "[long]"
    piece = PyUnicode_FromString("[long]");
    if (!piece) {
        goto fail;
    }
    status = PyUnicodeWriter_WriteSubstring(w, piece, 1, 5);
    Py_CLEAR(piece);
    if (status < 0) {
        goto fail;
    }

    // PyUnicodeWriter_WriteStr()
    piece = PyUnicode_FromString(" value ");
    if (!piece) {
        goto fail;
    }
    status = PyUnicodeWriter_WriteStr(w, piece);
    Py_CLEAR(piece);
    if (status < 0) {
        goto fail;
    }

    // PyUnicodeWriter_WriteRepr()
    piece = PyUnicode_FromString("repr");
    if (!piece) {
        goto fail;
    }
    status = PyUnicodeWriter_WriteRepr(w, piece);
    Py_CLEAR(piece);
    if (status < 0) {
        goto fail;
    }

    // No Discard after Finish, matching the other writer tests in this file.
    built = PyUnicodeWriter_Finish(w);
    if (!built) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(built, "var=long value 'repr'"));
    Py_DECREF(built);

    Py_RETURN_NONE;

fail:
    PyUnicodeWriter_Discard(w);
    return NULL;
}


// Write ASCII, 2-byte and 3-byte UTF-8 sequences through
// PyUnicodeWriter_WriteUTF8(), each followed by one separator written with
// PyUnicodeWriter_WriteChar(), then check the finished string.
//
// Returns None on success, or NULL as soon as any step fails.
static PyObject *
test_unicodewriter_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    // Each UTF-8 chunk is immediately followed by its separator character.
    static const struct {
        const char *utf8;
        Py_UCS4 sep;
    } chunks[] = {
        {"ascii", '-'},
        {"latin1=\xC3\xA9", '-'},
        {"euro=\xE2\x82\xAC", '.'},
    };
    const size_t nchunks = sizeof(chunks) / sizeof(chunks[0]);
    PyObject *built = NULL;

    PyUnicodeWriter *w = PyUnicodeWriter_Create();
    if (!w) {
        return NULL;
    }

    for (size_t i = 0; i < nchunks; i++) {
        if (PyUnicodeWriter_WriteUTF8(w, chunks[i].utf8, -1) < 0) {
            goto fail;
        }
        if (PyUnicodeWriter_WriteChar(w, chunks[i].sep) < 0) {
            goto fail;
        }
    }

    built = PyUnicodeWriter_Finish(w);
    if (!built) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(built,
                                 "ascii-latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(built);

    Py_RETURN_NONE;

fail:
    PyUnicodeWriter_Discard(w);
    return NULL;
}


// Check that PyUnicodeWriter_WriteUTF8() rejects malformed UTF-8 input.
//
// Writes "invalid=\xFF" (0xFF can never appear in valid UTF-8) and expects a
// negative return value with UnicodeDecodeError set. The writer is then
// discarded and the exception cleared.
//
// Returns None, or NULL if the writer could not be created.
static PyObject *
test_unicodewriter_invalid_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create();
    if (writer == NULL) {
        return NULL;
    }

    // Make the call outside assert(): the argument of assert() is not
    // evaluated when NDEBUG is defined, which would silently turn this test
    // into a no-op.
    int ret = PyUnicodeWriter_WriteUTF8(writer, "invalid=\xFF", -1);
    (void)ret;  // only read by assert(); avoids an unused warning under NDEBUG
    assert(ret < 0);
    PyUnicodeWriter_Discard(writer);

    assert(PyErr_ExceptionMatches(PyExc_UnicodeDecodeError));
    PyErr_Clear();

    Py_RETURN_NONE;
}


// Check PyUnicodeWriter_Format() with a "%s %i" format, append a trailing
// '.' with PyUnicodeWriter_WriteChar(), and verify the finished string is
// "Hello 123.".
//
// Returns None on success, or NULL as soon as any step fails.
static PyObject *
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *w = PyUnicodeWriter_Create();
    if (!w) {
        return NULL;
    }

    // PyUnicodeWriter_Format() first, then PyUnicodeWriter_WriteChar();
    // || short-circuits, so WriteChar is skipped when Format fails.
    const int failed =
        (PyUnicodeWriter_Format(w, "%s %i", "Hello", 123) < 0
         || PyUnicodeWriter_WriteChar(w, '.') < 0);
    if (failed) {
        PyUnicodeWriter_Discard(w);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(w);
    if (!text) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(text, "Hello 123."));
    Py_DECREF(text);

    Py_RETURN_NONE;
}


static PyMethodDef TestMethods[] = {
{"unicode_new", unicode_new, METH_VARARGS},
{"unicode_fill", unicode_fill, METH_VARARGS},
Expand All @@ -229,6 +390,10 @@ static PyMethodDef TestMethods[] = {
{"unicode_asucs4copy", unicode_asucs4copy, METH_VARARGS},
{"unicode_asutf8", unicode_asutf8, METH_VARARGS},
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS},
{"test_unicodewriter", test_unicodewriter, METH_NOARGS},
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
{NULL},
};

Expand Down
Loading
0