BUG, MAINT: Stop using the error-prone deprecated Py_UNICODE apis by eric-wieser · Pull Request #15385 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

BUG, MAINT: Stop using the error-prone deprecated Py_UNICODE apis #15385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/release/upcoming_changes/15385.new_feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
``np.str_`` scalars now support the buffer protocol
---------------------------------------------------
``np.str_`` arrays are always stored as UCS4, so the corresponding scalars
now expose this through the buffer interface, meaning
``memoryview(np.str_('test'))`` now works.
19 changes: 0 additions & 19 deletions numpy/core/defchararray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2679,25 +2679,6 @@ class adds the following functionality:
itemsize = len(obj)
shape = len(obj) // itemsize

if unicode:
if sys.maxunicode == 0xffff:
# On a narrow Python build, the buffer for Unicode
# strings is UCS2, which doesn't match the buffer for
# NumPy Unicode types, which is ALWAYS UCS4.
# Therefore, we need to convert the buffer. On Python
# 2.6 and later, we can use the utf_32 codec. Earlier
# versions don't have that codec, so we convert to a
# numerical array that matches the input buffer, and
# then use NumPy to convert it to UCS4. All of this
# should happen in native endianness.
obj = obj.encode('utf_32')
else:
obj = str(obj)
else:
# Let the default Unicode -> string encoding (if any) take
# precedence.
obj = bytes(obj)

return chararray(shape, itemsize=itemsize, unicode=unicode,
buffer=obj, order=order)
Comment on lines -2682 to 2683
Copy link
Member Author
@eric-wieser eric-wieser Feb 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code never made sense in the first place, as chararray.__new__ has an identity crisis over whether it's trying to be np.ndarray.__new__ or np.array, and accepts str objects in place of the buffer.

Edit: Perhaps it was a workaround for the original bug.


Expand Down
8 changes: 7 additions & 1 deletion numpy/core/include/numpy/arrayscalars.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,13 @@ typedef struct {
} PyScalarObject;

#define PyStringScalarObject PyStringObject
#define PyUnicodeScalarObject PyUnicodeObject
#define PyStringScalarObject PyStringObject
/*
 * Object layout for the unicode scalar type: an embedded PyUnicodeObject
 * followed by a pointer to Py_UCS4 data.
 *
 * `base` must remain the first member so that a pointer to this struct can
 * also be used as a pointer to the embedded PyUnicodeObject.
 *
 * NOTE(review): `obval` is presumably the UCS4 copy of the string that is
 * exposed through the buffer protocol; where it is allocated and freed is
 * not visible in this header -- confirm against the scalar type
 * implementation.
 */
typedef struct {
/* note that the PyObject_HEAD macro lives right here */
PyUnicodeObject base;
Py_UCS4 *obval;
} PyUnicodeScalarObject;


typedef struct {
PyObject_VAR_HEAD
Expand Down
116 changes: 8 additions & 108 deletions numpy/core/src/common/ucsnarrow.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,76 +16,12 @@
#include "ctors.h"

/*
* Functions only needed on narrow builds of Python for converting back and
* forth between the NumPy Unicode data-type (always 4-bytes) and the
* Python Unicode scalar (2-bytes on a narrow build).
*/

/*
* The ucs2 buffer must be large enough to hold 2*ucs4length characters
* due to the use of surrogate pairs.
* This file originally contained functions only needed on narrow builds of
* Python for converting back and forth between the NumPy Unicode data-type
* (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
*
* The return value is the number of ucs2 bytes used-up which
* is ucs4length + number of surrogate pairs found.
*
* Values above 0xffff are converted to surrogate pairs.
* This "narrow" interface is now deprecated in python and unused in NumPy.
*/
/*
 * Encode `ucs4length` UCS4 code points from `ucs4` into `ucs2`.
 *
 * Code points above 0xffff are written as a surrogate pair (two output
 * units); everything else is copied as a single unit. The caller must
 * provide room for up to 2 * ucs4length output units.
 *
 * Returns the number of units written to `ucs2`, i.e. ucs4length plus the
 * number of surrogate pairs produced.
 */
NPY_NO_EXPORT int
PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length)
{
    int idx;
    int written = 0;

    for (idx = 0; idx < ucs4length; idx++) {
        npy_ucs4 code = ucs4[idx];

        if (code <= 0xffff) {
            /* fits in a single unit */
            ucs2[written++] = (Py_UNICODE) code;
            continue;
        }

        /* split into high / low surrogate */
        code -= 0x10000L;
        ucs2[written++] = 0xD800 + (Py_UNICODE) (code >> 10);
        ucs2[written++] = 0xDC00 + (Py_UNICODE) (code & 0x03FF);
    }
    return written;
}


/*
 * Convert a UCS2 (UTF-16 code unit) buffer to a UCS4 buffer.
 *
 * Reads at most `ucs2len` units from `ucs2` and writes at most `ucs4len`
 * code points to `ucs4`, stopping at whichever limit is reached first.
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed, within the input
 * bounds, by a low surrogate (0xDC00-0xDFFF) is combined into a single code
 * point >= 0x10000. An unpaired surrogate -- including a high surrogate in
 * the last input position -- is copied through unchanged, so the function
 * never reads past ucs2 + ucs2len.
 *
 * Returns the number of code points written to `ucs4`, which is less than
 * the number of input units consumed when surrogate pairs are present.
 */
NPY_NO_EXPORT int
PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len)
{
    int i;
    int numchars = 0;

    for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) {
        npy_ucs4 chr = (npy_ucs4) ucs2[i];

        /* only look ahead when a following unit actually exists */
        if (chr >= 0xd800 && chr <= 0xdbff && (i + 1 < ucs2len)) {
            npy_ucs4 low = (npy_ucs4) ucs2[i + 1];

            if (low >= 0xdc00 && low <= 0xdfff) {
                /* well-formed surrogate pair: consume both units */
                chr = 0x10000 + ((chr - 0xd800) << 10) + (low - 0xdc00);
                i++;
            }
        }
        ucs4[numchars++] = chr;
    }
    return numchars;
}

/*
* Returns a PyUnicodeObject initialized from a buffer containing
Expand All @@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
npy_ucs4 const *src = (npy_ucs4 const *)src_char;
npy_ucs4 *buf = NULL;
PyUnicodeObject *ret;

/* swap and align if needed */
if (swap || align) {
buf = (npy_ucs4 *)malloc(size);
if (buf == NULL) {
PyErr_NoMemory();
goto fail;
return NULL;
}
memcpy(buf, src, size);
if (swap) {
Expand All @@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
while (ucs4len > 0 && src[ucs4len - 1] == 0) {
ucs4len--;
}

/* produce PyUnicode object */
#ifdef Py_UNICODE_WIDE
{
ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src,
(Py_ssize_t) ucs4len);
if (ret == NULL) {
goto fail;
}
}
#else
{
Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len;
Py_ssize_t ucs2len;
Py_UNICODE *tmp;

if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) {
PyErr_NoMemory();
goto fail;
}
ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len);
ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len);
free(tmp);
if (ret == NULL) {
goto fail;
}
}
#endif

if (buf) {
free(buf);
}
PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
PyUnicode_4BYTE_KIND, src, ucs4len);
free(buf);
return ret;

fail:
if (buf) {
free(buf);
}
return NULL;
}
6 changes: 0 additions & 6 deletions numpy/core/src/common/ucsnarrow.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
#ifndef _NPY_UCSNARROW_H_
#define _NPY_UCSNARROW_H_

NPY_NO_EXPORT int
PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length);

NPY_NO_EXPORT int
PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len);

NPY_NO_EXPORT PyUnicodeObject *
PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);

Expand Down
58 changes: 30 additions & 28 deletions numpy/core/src/multiarray/arraytypes.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -450,12 +450,6 @@ static int
UNICODE_setitem(PyObject *op, void *ov, void *vap)
{
PyArrayObject *ap = vap;
PyObject *temp;
Py_UNICODE *ptr;
int datalen;
#ifndef Py_UNICODE_WIDE
char *buffer;
#endif

if (PyArray_IsZeroDim(op)) {
return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
Expand All @@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
"setting an array element with a sequence");
return -1;
}

PyObject *temp;
if (PyBytes_Check(op)) {
/* Try to decode from ASCII */
temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
Expand All @@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
else if ((temp=PyObject_Str(op)) == NULL) {
return -1;
}
ptr = PyUnicode_AS_UNICODE(temp);
if ((ptr == NULL) || (PyErr_Occurred())) {

/* truncate if needed */
Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
Py_ssize_t actual_len = PyUnicode_GetLength(temp);
if (actual_len < 0) {
Py_DECREF(temp);
return -1;
}
datalen = PyUnicode_GET_DATA_SIZE(temp);
if (actual_len > max_len) {
Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
if (temp == NULL) {
return -1;
}
actual_len = max_len;
}

#ifdef Py_UNICODE_WIDE
memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
#else
Py_ssize_t num_bytes = actual_len * 4;

char *buffer;
if (!PyArray_ISALIGNED(ap)) {
buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
buffer = PyArray_malloc(num_bytes);
if (buffer == NULL) {
Py_DECREF(temp);
PyErr_NoMemory();
Expand All @@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
else {
buffer = ov;
}
datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
datalen <<= 2;
if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
PyArray_free(buffer);
Py_DECREF(temp);
return -1;
}

if (!PyArray_ISALIGNED(ap)) {
memcpy(ov, buffer, datalen);
memcpy(ov, buffer, num_bytes);
PyArray_free(buffer);
}
#endif

/* Fill in the rest of the space with 0 */
if (PyArray_DESCR(ap)->elsize > datalen) {
memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
if (PyArray_DESCR(ap)->elsize > num_bytes) {
memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
}
if (PyArray_ISBYTESWAPPED(ap)) {
byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
byte_swap_vector(ov, actual_len, 4);
}
Py_DECREF(temp);
return 0;
Expand Down Expand Up @@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
return nonz;
}

#ifdef Py_UNICODE_WIDE
#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
#else
#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
#endif
Comment on lines -2653 to -2657
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty sure this was a bug, but constructing a failing string is non-trivial


static npy_bool
UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
{
Expand All @@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
if (*ip == '\0') {
seen_null = NPY_TRUE;
}
else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
nonz = NPY_TRUE;
break;
}
Expand Down
5 changes: 0 additions & 5 deletions numpy/core/src/multiarray/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags)
descr = PyArray_DescrFromScalar(self);
view->buf = (void *)scalar_value(self, descr);
elsize = descr->elsize;
#ifndef Py_UNICODE_WIDE
if (descr->type_num == NPY_UNICODE) {
elsize >>= 1;
}
#endif
view->len = elsize;
if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */
Expand Down
32 changes: 20 additions & 12 deletions numpy/core/src/multiarray/common.c
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery(
PyObject *obj, PyArray_Descr *last_dtype, int string_type)
{
int itemsize;
PyObject *temp;

if (string_type == NPY_STRING) {
if ((temp = PyObject_Str(obj)) == NULL) {
PyObject *temp = PyObject_Str(obj);
if (temp == NULL) {
return NULL;
}
/* assume that when we do the encoding elsewhere we'll use ASCII */
itemsize = PyUnicode_GetLength(temp);
Py_DECREF(temp);
if (itemsize < 0) {
return NULL;
}
}
else if (string_type == NPY_UNICODE) {
if ((temp = PyObject_Str(obj)) == NULL) {
PyObject *temp = PyObject_Str(obj);
if (temp == NULL) {
return NULL;
}
itemsize = PyUnicode_GET_DATA_SIZE(temp);
#ifndef Py_UNICODE_WIDE
itemsize <<= 1;
#endif
itemsize = PyUnicode_GetLength(temp);
Py_DECREF(temp);
if (itemsize < 0) {
return NULL;
}
itemsize *= 4; /* convert UCS4 codepoints to bytes */
}
else {
return NULL;
}
Py_DECREF(temp);
if (last_dtype != NULL &&
last_dtype->type_num == string_type &&
last_dtype->elsize >= itemsize) {
Expand Down Expand Up @@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,

/* Check if it's a Unicode string */
if (PyUnicode_Check(obj)) {
int itemsize = PyUnicode_GET_DATA_SIZE(obj);
#ifndef Py_UNICODE_WIDE
itemsize <<= 1;
#endif
int itemsize = PyUnicode_GetLength(obj);
if (itemsize < 0) {
goto fail;
}
itemsize *= 4;

/*
* If it's already a big enough unicode object,
Expand Down
Loading
0