bpo-36299: array('u') uses Py_UCS4 instead of Py_UNICODE by methane · Pull Request #12497 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-36299: array('u') uses Py_UCS4 instead of Py_UNICODE #12497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
bpo-36299: array('u') uses Py_UCS4 instead of Py_UNICODE
  • Loading branch information
methane committed Mar 22, 2019
commit 3a4ac0535ecc339a23b917080458ebbe482adc34
8 changes: 2 additions & 6 deletions Lib/test/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,12 +1123,8 @@ def test_unicode(self):

def test_issue17223(self):
# this used to crash
if sizeof_wchar == 4:
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'
else:
# PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
self.skipTest("specific to 32-bit wchar_t")
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'
a = array.array('u', invalid_str)
self.assertRaises(ValueError, a.tounicode)
self.assertRaises(ValueError, str, a)
Expand Down
94 changes: 46 additions & 48 deletions Modules/arraymodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#endif /* HAVE_SYS_TYPES_H */
#endif /* !STDC_HEADERS */

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/*[clinic input]
module array
[clinic start generated code]*/
Expand Down Expand Up @@ -237,24 +240,26 @@ BB_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
static PyObject *
u_getitem(arrayobject *ap, Py_ssize_t i)
{
return PyUnicode_FromOrdinal(((Py_UNICODE *) ap->ob_item)[i]);
return PyUnicode_FromOrdinal(((Py_UCS4 *) ap->ob_item)[i]);
}

static int
u_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
{
Py_UNICODE *p;
Py_ssize_t len;

if (!PyArg_Parse(v, "u#;array item must be unicode character", &p, &len))
if (!PyUnicode_Check(v)) {
PyErr_SetString(PyExc_TypeError,
"array item must be unicode character");
return -1;
if (len != 1) {
}
if (PyUnicode_GetLength(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"array item must be unicode character");
return -1;
}
if (i >= 0)
((Py_UNICODE *)ap->ob_item)[i] = p[0];

if (i >= 0) {
((Py_UCS4 *)ap->ob_item)[i] = PyUnicode_ReadChar(v, 0);
}
return 0;
}

Expand Down Expand Up @@ -532,7 +537,7 @@ d_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)

DEFINE_COMPAREITEMS(b, signed char)
DEFINE_COMPAREITEMS(BB, unsigned char)
DEFINE_COMPAREITEMS(u, Py_UNICODE)
DEFINE_COMPAREITEMS(u, Py_UCS4)
DEFINE_COMPAREITEMS(h, short)
DEFINE_COMPAREITEMS(HH, unsigned short)
DEFINE_COMPAREITEMS(i, int)
Expand All @@ -550,7 +555,7 @@ DEFINE_COMPAREITEMS(QQ, unsigned long long)
static const struct arraydescr descriptors[] = {
{'b', 1, b_getitem, b_setitem, b_compareitems, "b", 1, 1},
{'B', 1, BB_getitem, BB_setitem, BB_compareitems, "B", 1, 0},
{'u', sizeof(Py_UNICODE), u_getitem, u_setitem, u_compareitems, "u", 0, 0},
{'u', sizeof(Py_UCS4), u_getitem, u_setitem, u_compareitems, "u", 0, 0},
{'h', sizeof(short), h_getitem, h_setitem, h_compareitems, "h", 1, 1},
{'H', sizeof(short), HH_getitem, HH_setitem, HH_compareitems, "H", 1, 0},
{'i', sizeof(int), i_getitem, i_setitem, i_compareitems, "i", 1, 1},
Expand Down Expand Up @@ -1701,7 +1706,7 @@ array_array_tostring_impl(arrayobject *self)
/*[clinic input]
array.array.fromunicode

ustr: Py_UNICODE(zeroes=True)
ustr: unicode
/

Extends this array with data from the unicode string ustr.
Expand All @@ -1712,25 +1717,25 @@ some other type.
[clinic start generated code]*/

static PyObject *
array_array_fromunicode_impl(arrayobject *self, const Py_UNICODE *ustr,
Py_ssize_clean_t ustr_length)
/*[clinic end generated code: output=cf2f662908e2befc input=150f00566ffbca6e]*/
array_array_fromunicode_impl(arrayobject *self, PyObject *ustr)
/*[clinic end generated code: output=24359f5e001a7f2b input=025db1fdade7a4ce]*/
{
char typecode;

typecode = self->ob_descr->typecode;
if (typecode != 'u') {
if (self->ob_descr->typecode != 'u') {
PyErr_SetString(PyExc_ValueError,
"fromunicode() may only be called on "
"unicode type arrays");
return NULL;
}

Py_ssize_t ustr_length = PyUnicode_GetLength(ustr);
if (ustr_length > 0) {
Py_ssize_t old_size = Py_SIZE(self);
if (array_resize(self, old_size + ustr_length) == -1)
return NULL;
memcpy(self->ob_item + old_size * sizeof(Py_UNICODE),
ustr, ustr_length * sizeof(Py_UNICODE));
if (PyUnicode_AsUCS4(ustr, ((Py_UCS4*)self->ob_item) + old_size,
ustr_length, 0) == NULL) {
return NULL;
}
}

Py_RETURN_NONE;
Expand All @@ -1750,14 +1755,21 @@ static PyObject *
array_array_tounicode_impl(arrayobject *self)
/*[clinic end generated code: output=08e442378336e1ef input=127242eebe70b66d]*/
{
char typecode;
typecode = self->ob_descr->typecode;
if (typecode != 'u') {
if (self->ob_descr->typecode != 'u') {
PyErr_SetString(PyExc_ValueError,
"tounicode() may only be called on unicode type arrays");
return NULL;
}
return PyUnicode_FromWideChar((Py_UNICODE *) self->ob_item, Py_SIZE(self));
Py_UCS4 *item = (Py_UCS4*)self->ob_item;
for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
if (item[i] > MAX_UNICODE) {
PyErr_SetString(PyExc_ValueError,
"code point not in range(0x110000)");
return NULL;
}
}
return PyUnicode_FromKindAndData(
PyUnicode_4BYTE_KIND, self->ob_item, Py_SIZE(self));
}

/*[clinic input]
Expand Down Expand Up @@ -1828,13 +1840,7 @@ typecode_to_mformat_code(char typecode)
return UNSIGNED_INT8;

case 'u':
if (sizeof(Py_UNICODE) == 2) {
return UTF16_LE + is_big_endian;
}
if (sizeof(Py_UNICODE) == 4) {
return UTF32_LE + is_big_endian;
}
return UNKNOWN_FORMAT;
return UTF32_LE + is_big_endian;

case 'f':
if (sizeof(float) == 4) {
Expand Down Expand Up @@ -2585,11 +2591,9 @@ array_buffer_getbuf(arrayobject *self, Py_buffer *view, int flags)
view->internal = NULL;
if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
view->format = (char *)self->ob_descr->formats;
#ifdef Py_UNICODE_WIDE
if (self->ob_descr->typecode == 'u') {
view->format = "w";
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know why this is "w".

}
#endif
}

self->ob_exports++;
Expand Down Expand Up @@ -2711,30 +2715,24 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
Py_DECREF(v);
}
else if (initial != NULL && PyUnicode_Check(initial)) {
Py_UNICODE *ustr;
Py_ssize_t n;

ustr = PyUnicode_AsUnicode(initial);
if (ustr == NULL) {
PyErr_NoMemory();
Py_DECREF(a);
return NULL;
}

n = PyUnicode_GET_DATA_SIZE(initial);
Py_ssize_t n = PyUnicode_GetLength(initial);
if (n > 0) {
arrayobject *self = (arrayobject *)a;
char *item = self->ob_item;
item = (char *)PyMem_Realloc(item, n);
item = (char *)PyMem_Realloc(item, n * sizeof(Py_UCS4));
if (item == NULL) {
PyErr_NoMemory();
Py_DECREF(a);
return NULL;
}
self->ob_item = item;
Py_SIZE(self) = n / sizeof(Py_UNICODE);
memcpy(item, ustr, n);
self->allocated = Py_SIZE(self);
self->allocated = n;

if (PyUnicode_AsUCS4(initial, (Py_UCS4*)item, n, 0) == NULL) {
Py_DECREF(a);
return NULL;
}
Py_SIZE(self) = n;
}
}
else if (initial != NULL && array_Check(initial) && len > 0) {
Expand Down
17 changes: 10 additions & 7 deletions Modules/clinic/arraymodule.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0