packer: Use PyUnicode_AsUTF8AndSize() for utf-8 (#272) · Martiniele/msgpack-python@60ef387 · GitHub
[go: up one dir, main page]

Skip to content

Commit 60ef387

Browse files
authored
packer: Use PyUnicode_AsUTF8AndSize() for utf-8 (msgpack#272)
1 parent 5534d0c commit 60ef387

File tree

3 files changed

+71
-14
lines changed

3 files changed

+71
-14
lines changed

docker/runtests.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ for V in cp36-cp36m cp35-cp35m cp27-cp27m cp27-cp27mu; do
99
pushd test # prevent importing msgpack package in current directory.
1010
$PYBIN/python -c 'import sys; print(hex(sys.maxsize))'
1111
$PYBIN/python -c 'from msgpack import _packer, _unpacker'
12-
$PYBIN/py.test -v
12+
$PYBIN/pytest -v .
1313
popd
1414
done

msgpack/_packer.pyx

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ cdef extern from "Python.h":
1313
int PyMemoryView_Check(object obj)
1414
int PyByteArray_Check(object obj)
1515
int PyByteArray_CheckExact(object obj)
16+
char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t *l) except NULL
1617

1718

1819
cdef extern from "pack.h":
@@ -37,6 +38,7 @@ cdef extern from "pack.h":
3738
int msgpack_pack_bin(msgpack_packer* pk, size_t l)
3839
int msgpack_pack_raw_body(msgpack_packer* pk, char* body, size_t l)
3940
int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l)
41+
int msgpack_pack_unicode(msgpack_packer* pk, object o, long long limit)
4042

4143
cdef int DEFAULT_RECURSE_LIMIT=511
4244
cdef long long ITEM_LIMIT = (2**32)-1
@@ -126,8 +128,12 @@ cdef class Packer(object):
126128
raise TypeError("default must be a callable.")
127129
self._default = default
128130
if encoding is None:
129-
self.encoding = 'utf_8'
130-
self.unicode_errors = NULL
131+
if unicode_errors is None:
132+
self.encoding = NULL
133+
self.unicode_errors = NULL
134+
else:
135+
self.encoding = "utf_8"
136+
self.unicode_errors = unicode_errors
131137
else:
132138
if isinstance(encoding, unicode):
133139
self._bencoding = encoding.encode('ascii')
@@ -140,6 +146,8 @@ cdef class Packer(object):
140146
self._berrors = unicode_errors
141147
if self._berrors is not None:
142148
self.unicode_errors = PyBytes_AsString(self._berrors)
149+
else:
150+
self.unicode_errors = NULL
143151

144152
def __dealloc__(self):
145153
PyMem_Free(self.pk.buf)
@@ -206,17 +214,19 @@ cdef class Packer(object):
206214
if ret == 0:
207215
ret = msgpack_pack_raw_body(&self.pk, rawval, L)
208216
elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o):
209-
if not self.encoding:
210-
raise TypeError("Can't encode unicode string: no encoding is specified")
211-
#TODO: Use faster API for UTF-8
212-
o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
213-
L = len(o)
214-
if L > ITEM_LIMIT:
215-
raise PackValueError("unicode string is too large")
216-
rawval = o
217-
ret = msgpack_pack_raw(&self.pk, L)
218-
if ret == 0:
219-
ret = msgpack_pack_raw_body(&self.pk, rawval, L)
217+
if self.encoding == NULL:
218+
ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
219+
if ret == -2:
220+
raise PackValueError("unicode string is too large")
221+
else:
222+
o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
223+
L = len(o)
224+
if L > ITEM_LIMIT:
225+
raise PackValueError("unicode string is too large")
226+
ret = msgpack_pack_raw(&self.pk, L)
227+
if ret == 0:
228+
rawval = o
229+
ret = msgpack_pack_raw_body(&self.pk, rawval, L)
220230
elif PyDict_CheckExact(o):
221231
d = <dict>o
222232
L = len(d)

msgpack/pack.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,53 @@ static inline int msgpack_pack_write(msgpack_packer* pk, const char *data, size_
6767

6868
#include "pack_template.h"
6969

70+
// Pack the unicode object `o` as UTF-8 using msgpack_pack_raw() for the
// header and msgpack_pack_raw_body() for the payload.
//
// Return values:
//   -1  the UTF-8 conversion failed (Python 2: also when writing the header fails)
//   -2  the UTF-8 encoded length of `o` exceeds `limit` (o is too long)
//   otherwise the result of the underlying pack calls.
71+
static inline int
72+
msgpack_pack_unicode(msgpack_packer *pk, PyObject *o, long long limit)
73+
{
74+
#if PY_MAJOR_VERSION >= 3
75+
assert(PyUnicode_Check(o));
76+
77+
Py_ssize_t len;
78+
// No temporary bytes object is created on this path, so the early returns
// below have nothing to release.
const char* buf = PyUnicode_AsUTF8AndSize(o, &len);
79+
if (buf == NULL)
80+
return -1;
81+
82+
// Length is checked before anything is written to the packer.
if (len > limit) {
83+
return -2;
84+
}
85+
86+
int ret = msgpack_pack_raw(pk, len);
87+
// A header failure is propagated unchanged to the caller.
if (ret) return ret;
88+
89+
return msgpack_pack_raw_body(pk, buf, len);
90+
#else
91+
PyObject *bytes;
92+
Py_ssize_t len;
93+
int ret;
94+
95+
// Python 2: encode into a temporary object; it is released with Py_DECREF
// on every exit path after a successful conversion.
96+
bytes = PyUnicode_AsUTF8String(o);
97+
if (bytes == NULL)
98+
return -1;
99+
100+
len = PyString_GET_SIZE(bytes);
101+
if (len > limit) {
102+
Py_DECREF(bytes);
103+
return -2;
104+
}
105+
106+
ret = msgpack_pack_raw(pk, len);
107+
if (ret) {
108+
Py_DECREF(bytes);
109+
// NOTE(review): reports -1 here instead of propagating `ret` as the
// Python 3 branch does.
return -1;
110+
}
111+
ret = msgpack_pack_raw_body(pk, PyString_AS_STRING(bytes), len);
112+
Py_DECREF(bytes);
113+
return ret;
114+
#endif
115+
}
116+
70117
#ifdef __cplusplus
71118
}
72119
#endif

0 commit comments

Comments
 (0)
0