Merge pull request #15385 from eric-wieser/fix-unicode-ucs2 · numpy/numpy@1f9ab28 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1f9ab28

Browse files
authored
Merge pull request #15385 from eric-wieser/fix-unicode-ucs2
BUG, MAINT: Stop using the error-prone deprecated Py_UNICODE apis
2 parents 491f41a + d0b7b66 commit 1f9ab28

File tree

12 files changed

+180
-268
lines changed

12 files changed

+180
-268
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
``np.str_`` scalars now support the buffer protocol
2+
---------------------------------------------------
3+
``np.str_`` arrays are always stored as UCS4, so the corresponding scalars
4+
now expose this through the buffer interface, meaning
5+
``memoryview(np.str_('test'))`` now works.

numpy/core/defchararray.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2679,25 +2679,6 @@ class adds the following functionality:
26792679
itemsize = len(obj)
26802680
shape = len(obj) // itemsize
26812681

2682-
if unicode:
2683-
if sys.maxunicode == 0xffff:
2684-
# On a narrow Python build, the buffer for Unicode
2685-
# strings is UCS2, which doesn't match the buffer for
2686-
# NumPy Unicode types, which is ALWAYS UCS4.
2687-
# Therefore, we need to convert the buffer. On Python
2688-
# 2.6 and later, we can use the utf_32 codec. Earlier
2689-
# versions don't have that codec, so we convert to a
2690-
# numerical array that matches the input buffer, and
2691-
# then use NumPy to convert it to UCS4. All of this
2692-
# should happen in native endianness.
2693-
obj = obj.encode('utf_32')
2694-
else:
2695-
obj = str(obj)
2696-
else:
2697-
# Let the default Unicode -> string encoding (if any) take
2698-
# precedence.
2699-
obj = bytes(obj)
2700-
27012682
return chararray(shape, itemsize=itemsize, unicode=unicode,
27022683
buffer=obj, order=order)
27032684

numpy/core/include/numpy/arrayscalars.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,13 @@ typedef struct {
135135
} PyScalarObject;
136136

137137
#define PyStringScalarObject PyStringObject
138-
#define PyUnicodeScalarObject PyUnicodeObject
138+
#define PyStringScalarObject PyStringObject
139+
typedef struct {
140+
/* note that the PyObject_HEAD macro lives right here */
141+
PyUnicodeObject base;
142+
Py_UCS4 *obval;
143+
} PyUnicodeScalarObject;
144+
139145

140146
typedef struct {
141147
PyObject_VAR_HEAD

numpy/core/src/common/ucsnarrow.c

Lines changed: 8 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -16,76 +16,12 @@
1616
#include "ctors.h"
1717

1818
/*
19-
* Functions only needed on narrow builds of Python for converting back and
20-
* forth between the NumPy Unicode data-type (always 4-bytes) and the
21-
* Python Unicode scalar (2-bytes on a narrow build).
22-
*/
23-
24-
/*
25-
* The ucs2 buffer must be large enough to hold 2*ucs4length characters
26-
* due to the use of surrogate pairs.
19+
* This file originally contained functions only needed on narrow builds of
20+
* Python for converting back and forth between the NumPy Unicode data-type
21+
* (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
2722
*
28-
* The return value is the number of ucs2 bytes used-up which
29-
* is ucs4length + number of surrogate pairs found.
30-
*
31-
* Values above 0xffff are converted to surrogate pairs.
23+
* This "narrow" interface is now deprecated in Python and unused in NumPy.
3224
*/
33-
NPY_NO_EXPORT int
34-
PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length)
35-
{
36-
int i;
37-
int numucs2 = 0;
38-
npy_ucs4 chr;
39-
for (i = 0; i < ucs4length; i++) {
40-
chr = *ucs4++;
41-
if (chr > 0xffff) {
42-
numucs2++;
43-
chr -= 0x10000L;
44-
*ucs2++ = 0xD800 + (Py_UNICODE) (chr >> 10);
45-
*ucs2++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
46-
}
47-
else {
48-
*ucs2++ = (Py_UNICODE) chr;
49-
}
50-
numucs2++;
51-
}
52-
return numucs2;
53-
}
54-
55-
56-
/*
57-
* This converts a UCS2 buffer of the given length to UCS4 buffer.
58-
* It converts up to ucs4len characters of UCS2
59-
*
60-
* It returns the number of characters converted which can
61-
* be less than ucs2len if there are surrogate pairs in ucs2.
62-
*
63-
* The return value is the actual size of the used part of the ucs4 buffer.
64-
*/
65-
NPY_NO_EXPORT int
66-
PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len)
67-
{
68-
int i;
69-
npy_ucs4 chr;
70-
Py_UNICODE ch;
71-
int numchars=0;
72-
73-
for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) {
74-
ch = *ucs2++;
75-
if (ch >= 0xd800 && ch <= 0xdfff) {
76-
/* surrogate pair */
77-
chr = ((npy_ucs4)(ch-0xd800)) << 10;
78-
chr += *ucs2++ + 0x2400; /* -0xdc00 + 0x10000 */
79-
i++;
80-
}
81-
else {
82-
chr = (npy_ucs4) ch;
83-
}
84-
*ucs4++ = chr;
85-
numchars++;
86-
}
87-
return numchars;
88-
}
8925

9026
/*
9127
* Returns a PyUnicodeObject initialized from a buffer containing
@@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
11248
Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
11349
npy_ucs4 const *src = (npy_ucs4 const *)src_char;
11450
npy_ucs4 *buf = NULL;
115-
PyUnicodeObject *ret;
11651

11752
/* swap and align if needed */
11853
if (swap || align) {
11954
buf = (npy_ucs4 *)malloc(size);
12055
if (buf == NULL) {
12156
PyErr_NoMemory();
122-
goto fail;
57+
return NULL;
12358
}
12459
memcpy(buf, src, size);
12560
if (swap) {
@@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
13267
while (ucs4len > 0 && src[ucs4len - 1] == 0) {
13368
ucs4len--;
13469
}
135-
136-
/* produce PyUnicode object */
137-
#ifdef Py_UNICODE_WIDE
138-
{
139-
ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src,
140-
(Py_ssize_t) ucs4len);
141-
if (ret == NULL) {
142-
goto fail;
143-
}
144-
}
145-
#else
146-
{
147-
Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len;
148-
Py_ssize_t ucs2len;
149-
Py_UNICODE *tmp;
150-
151-
if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) {
152-
PyErr_NoMemory();
153-
goto fail;
154-
}
155-
ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len);
156-
ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len);
157-
free(tmp);
158-
if (ret == NULL) {
159-
goto fail;
160-
}
161-
}
162-
#endif
163-
164-
if (buf) {
165-
free(buf);
166-
}
70+
PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
71+
PyUnicode_4BYTE_KIND, src, ucs4len);
72+
free(buf);
16773
return ret;
168-
169-
fail:
170-
if (buf) {
171-
free(buf);
172-
}
173-
return NULL;
17474
}

numpy/core/src/common/ucsnarrow.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
#ifndef _NPY_UCSNARROW_H_
22
#define _NPY_UCSNARROW_H_
33

4-
NPY_NO_EXPORT int
5-
PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length);
6-
7-
NPY_NO_EXPORT int
8-
PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len);
9-
104
NPY_NO_EXPORT PyUnicodeObject *
115
PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
126

numpy/core/src/multiarray/arraytypes.c.src

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -450,12 +450,6 @@ static int
450450
UNICODE_setitem(PyObject *op, void *ov, void *vap)
451451
{
452452
PyArrayObject *ap = vap;
453-
PyObject *temp;
454-
Py_UNICODE *ptr;
455-
int datalen;
456-
#ifndef Py_UNICODE_WIDE
457-
char *buffer;
458-
#endif
459453

460454
if (PyArray_IsZeroDim(op)) {
461455
return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
@@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
466460
"setting an array element with a sequence");
467461
return -1;
468462
}
463+
464+
PyObject *temp;
469465
if (PyBytes_Check(op)) {
470466
/* Try to decode from ASCII */
471467
temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
@@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
476472
else if ((temp=PyObject_Str(op)) == NULL) {
477473
return -1;
478474
}
479-
ptr = PyUnicode_AS_UNICODE(temp);
480-
if ((ptr == NULL) || (PyErr_Occurred())) {
475+
476+
/* truncate if needed */
477+
Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
478+
Py_ssize_t actual_len = PyUnicode_GetLength(temp);
479+
if (actual_len < 0) {
481480
Py_DECREF(temp);
482481
return -1;
483482
}
484-
datalen = PyUnicode_GET_DATA_SIZE(temp);
483+
if (actual_len > max_len) {
484+
Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
485+
if (temp == NULL) {
486+
return -1;
487+
}
488+
actual_len = max_len;
489+
}
485490

486-
#ifdef Py_UNICODE_WIDE
487-
memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
488-
#else
491+
Py_ssize_t num_bytes = actual_len * 4;
492+
493+
char *buffer;
489494
if (!PyArray_ISALIGNED(ap)) {
490-
buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
495+
buffer = PyArray_malloc(num_bytes);
491496
if (buffer == NULL) {
492497
Py_DECREF(temp);
493498
PyErr_NoMemory();
@@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
497502
else {
498503
buffer = ov;
499504
}
500-
datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
501-
datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
502-
datalen <<= 2;
505+
if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
506+
PyArray_free(buffer);
507+
Py_DECREF(temp);
508+
return -1;
509+
}
510+
503511
if (!PyArray_ISALIGNED(ap)) {
504-
memcpy(ov, buffer, datalen);
512+
memcpy(ov, buffer, num_bytes);
505513
PyArray_free(buffer);
506514
}
507-
#endif
515+
508516
/* Fill in the rest of the space with 0 */
509-
if (PyArray_DESCR(ap)->elsize > datalen) {
510-
memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
517+
if (PyArray_DESCR(ap)->elsize > num_bytes) {
518+
memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
511519
}
512520
if (PyArray_ISBYTESWAPPED(ap)) {
513-
byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
521+
byte_swap_vector(ov, actual_len, 4);
514522
}
515523
Py_DECREF(temp);
516524
return 0;
@@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
26502658
return nonz;
26512659
}
26522660

2653-
#ifdef Py_UNICODE_WIDE
2654-
#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
2655-
#else
2656-
#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
2657-
#endif
2658-
26592661
static npy_bool
26602662
UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
26612663
{
@@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
26812683
if (*ip == '\0') {
26822684
seen_null = NPY_TRUE;
26832685
}
2684-
else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
2686+
else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
26852687
nonz = NPY_TRUE;
26862688
break;
26872689
}

numpy/core/src/multiarray/buffer.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags)
832832
descr = PyArray_DescrFromScalar(self);
833833
view->buf = (void *)scalar_value(self, descr);
834834
elsize = descr->elsize;
835-
#ifndef Py_UNICODE_WIDE
836-
if (descr->type_num == NPY_UNICODE) {
837-
elsize >>= 1;
838-
}
839-
#endif
840835
view->len = elsize;
841836
if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
842837
elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */

numpy/core/src/multiarray/common.c

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery(
130130
PyObject *obj, PyArray_Descr *last_dtype, int string_type)
131131
{
132132
int itemsize;
133-
PyObject *temp;
134133

135134
if (string_type == NPY_STRING) {
136-
if ((temp = PyObject_Str(obj)) == NULL) {
135+
PyObject *temp = PyObject_Str(obj);
136+
if (temp == NULL) {
137137
return NULL;
138138
}
139+
/* assume that when we do the encoding elsewhere we'll use ASCII */
139140
itemsize = PyUnicode_GetLength(temp);
141+
Py_DECREF(temp);
142+
if (itemsize < 0) {
143+
return NULL;
144+
}
140145
}
141146
else if (string_type == NPY_UNICODE) {
142-
if ((temp = PyObject_Str(obj)) == NULL) {
147+
PyObject *temp = PyObject_Str(obj);
148+
if (temp == NULL) {
143149
return NULL;
144150
}
145-
itemsize = PyUnicode_GET_DATA_SIZE(temp);
146-
#ifndef Py_UNICODE_WIDE
147-
itemsize <<= 1;
148-
#endif
151+
itemsize = PyUnicode_GetLength(temp);
152+
Py_DECREF(temp);
153+
if (itemsize < 0) {
154+
return NULL;
155+
}
156+
itemsize *= 4; /* convert UCS4 codepoints to bytes */
149157
}
150158
else {
151159
return NULL;
152160
}
153-
Py_DECREF(temp);
154161
if (last_dtype != NULL &&
155162
last_dtype->type_num == string_type &&
156163
last_dtype->elsize >= itemsize) {
@@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
258265

259266
/* Check if it's a Unicode string */
260267
if (PyUnicode_Check(obj)) {
261-
int itemsize = PyUnicode_GET_DATA_SIZE(obj);
262-
#ifndef Py_UNICODE_WIDE
263-
itemsize <<= 1;
264-
#endif
268+
int itemsize = PyUnicode_GetLength(obj);
269+
if (itemsize < 0) {
270+
goto fail;
271+
}
272+
itemsize *= 4;
265273

266274
/*
267275
* If it's already a big enough unicode object,

0 commit comments

Comments
 (0)
0