Implemented encoding for strings · urso/msgpack-python@752e3d1 · GitHub
[go: up one dir, main page]

Skip to content

Commit 752e3d1

Browse files
author
tailhook
committed
Implemented encoding for strings
* Packer uses `utf-8` encoding by default * Unpacker uses `None` by default, so no decoding is done * Both pack and unpack have `encoding` and `unicode_errors` arguments; if `encoding` is `None`, no encoding/decoding is done, otherwise it is the name of a Python codec. `unicode_errors` is supplied as the `errors` parameter to the codec
1 parent af7113b commit 752e3d1

File tree

5 files changed

+169
-25
lines changed

5 files changed

+169
-25
lines changed

msgpack/_msgpack.pyx

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ cdef int DEFAULT_RECURSE_LIMIT=511
3636

3737
cdef class Packer(object):
3838
"""MessagePack Packer
39-
39+
4040
usage:
4141
4242
packer = Packer()
@@ -45,6 +45,10 @@ cdef class Packer(object):
4545
"""
4646
cdef msgpack_packer pk
4747
cdef object _default
48+
cdef object _bencoding
49+
cdef object _berrors
50+
cdef char *encoding
51+
cdef char *unicode_errors
4852

4953
def __cinit__(self):
5054
cdef int buf_size = 1024*1024
@@ -54,11 +58,25 @@ cdef class Packer(object):
5458
self.pk.buf_size = buf_size
5559
self.pk.length = 0
5660

57-
def __init__(self, default=None):
61+
def __init__(self, default=None, encoding='utf-8', unicode_errors='strict'):
5862
if default is not None:
5963
if not PyCallable_Check(default):
6064
raise TypeError("default must be a callable.")
6165
self._default = default
66+
if encoding is None:
67+
self.encoding = NULL
68+
self.unicode_errors = NULL
69+
else:
70+
if isinstance(encoding, unicode):
71+
self._bencoding = encoding.encode('ascii')
72+
else:
73+
self._bencoding = encoding
74+
self.encoding = PyBytes_AsString(self._bencoding)
75+
if isinstance(unicode_errors, unicode):
76+
self._berrors = unicode_errors.encode('ascii')
77+
else:
78+
self._berrors = unicode_errors
79+
self.unicode_errors = PyBytes_AsString(self._berrors)
6280

6381
def __dealloc__(self):
6482
free(self.pk.buf);
@@ -68,7 +86,7 @@ cdef class Packer(object):
6886
cdef unsigned long long ullval
6987
cdef long longval
7088
cdef double fval
71-
cdef char* rawval
89+
cdef char* rawval
7290
cdef int ret
7391
cdef dict d
7492

@@ -101,7 +119,9 @@ cdef class Packer(object):
101119
if ret == 0:
102120
ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
103121
elif PyUnicode_Check(o):
104-
o = PyUnicode_AsUTF8String(o)
122+
if not self.encoding:
123+
raise TypeError("Can't encode utf-8 no encoding is specified")
124+
o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
105125
rawval = o
106126
ret = msgpack_pack_raw(&self.pk, len(o))
107127
if ret == 0:
@@ -138,14 +158,14 @@ cdef class Packer(object):
138158
return buf
139159

140160

141-
def pack(object o, object stream, default=None):
161+
def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'):
142162
"""pack an object `o` and write it to stream)."""
143-
packer = Packer(default=default)
163+
packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
144164
stream.write(packer.pack(o))
145165

146-
def packb(object o, default=None):
166+
def packb(object o, default=None, encoding='utf-8', unicode_errors='strict'):
147167
"""pack o and return packed bytes."""
148-
packer = Packer(default=default)
168+
packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
149169
return packer.pack(o)
150170

151171
dumps = packs = packb
@@ -155,6 +175,8 @@ cdef extern from "unpack.h":
155175
int use_list
156176
PyObject* object_hook
157177
PyObject* list_hook
178+
char *encoding
179+
char *unicode_errors
158180

159181
ctypedef struct template_context:
160182
msgpack_user user
@@ -164,12 +186,12 @@ cdef extern from "unpack.h":
164186
PyObject* key
165187

166188
int template_execute(template_context* ctx, const_char_ptr data,
167-
size_t len, size_t* off)
189+
size_t len, size_t* off) except -1
168190
void template_init(template_context* ctx)
169191
object template_data(template_context* ctx)
170192

171193

172-
def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0):
194+
def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
173195
"""Unpack packed_bytes to object. Returns an unpacked object."""
174196
cdef template_context ctx
175197
cdef size_t off = 0
@@ -179,9 +201,25 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
179201
cdef Py_ssize_t buf_len
180202
PyObject_AsReadBuffer(packed, <const_void_ptr*>&buf, &buf_len)
181203

204+
if encoding is None:
205+
enc = NULL
206+
else:
207+
if isinstance(encoding, unicode):
208+
bencoding = encoding.encode('ascii')
209+
else:
210+
bencoding = encoding
211+
if isinstance(unicode_errors, unicode):
212+
berrors = unicode_errors.encode('ascii')
213+
else:
214+
berrors = unicode_errors
215+
enc = PyBytes_AsString(bencoding)
216+
err = PyBytes_AsString(berrors)
217+
182218
template_init(&ctx)
183219
ctx.user.use_list = use_list
184220
ctx.user.object_hook = ctx.user.list_hook = NULL
221+
ctx.user.encoding = enc
222+
ctx.user.unicode_errors = err
185223
if object_hook is not None:
186224
if not PyCallable_Check(object_hook):
187225
raise TypeError("object_hook must be a callable.")
@@ -191,19 +229,21 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
191229
raise TypeError("list_hook must be a callable.")
192230
ctx.user.list_hook = <PyObject*>list_hook
193231
_gc_disable()
194-
ret = template_execute(&ctx, buf, buf_len, &off)
195-
_gc_enable()
232+
try:
233+
ret = template_execute(&ctx, buf, buf_len, &off)
234+
finally:
235+
_gc_enable()
196236
if ret == 1:
197237
return template_data(&ctx)
198238
else:
199239
return None
200240

201241
loads = unpacks = unpackb
202242

203-
def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0):
243+
def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
204244
"""unpack an object from stream."""
205245
return unpackb(stream.read(), use_list=use_list,
206-
object_hook=object_hook, list_hook=list_hook)
246+
object_hook=object_hook, list_hook=list_hook, encoding=encoding, unicode_errors=unicode_errors)
207247

208248
cdef class Unpacker(object):
209249
"""Unpacker(read_size=1024*1024)
@@ -236,7 +276,7 @@ cdef class Unpacker(object):
236276
self.buf = NULL;
237277

238278
def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=0,
239-
object object_hook=None, object list_hook=None):
279+
object object_hook=None, object list_hook=None, encoding=None, unicode_errors=None):
240280
if read_size == 0:
241281
read_size = 1024*1024
242282
self.use_list = use_list
@@ -292,7 +332,7 @@ cdef class Unpacker(object):
292332
new_size = tail + _buf_len
293333
if new_size < buf_size*2:
294334
new_size = buf_size*2
295-
buf = <char*>realloc(buf, new_size)
335+
buf = <char*>realloc(buf, new_size)
296336
if buf == NULL:
297337
# self.buf still holds old buffer and will be freed during
298338
# obj destruction

msgpack/unpack.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ typedef struct unpack_user {
2323
int use_list;
2424
PyObject *object_hook;
2525
PyObject *list_hook;
26+
const char *encoding;
27+
const char *unicode_errors;
2628
} unpack_user;
2729

2830

@@ -197,7 +199,11 @@ static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_objec
197199
static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o)
198200
{
199201
PyObject *py;
200-
py = PyBytes_FromStringAndSize(p, l);
202+
if(u->encoding) {
203+
py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors);
204+
} else {
205+
py = PyBytes_FromStringAndSize(p, l);
206+
}
201207
if (!py)
202208
return -1;
203209
*o = py;

test/test_pack.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,63 @@ def testPack():
1515
0, 1, 127, 128, 255, 256, 65535, 65536,
1616
-1, -32, -33, -128, -129, -32768, -32769,
1717
1.0,
18-
"", "a", "a"*31, "a"*32,
18+
b"", b"a", b"a"*31, b"a"*32,
1919
None, True, False,
20-
(), ((),), ((), None,),
21-
{None: 0},
22-
(1<<23),
20+
(), ((),), ((), None,),
21+
{None: 0},
22+
(1<<23),
2323
]
2424
for td in test_data:
2525
check(td)
2626

27+
def testPackUnicode():
28+
test_data = [
29+
u"", u"abcd", (u"defgh",), u"Русский текст",
30+
]
31+
for td in test_data:
32+
re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
33+
assert_equal(re, td)
34+
35+
def testPackUTF32():
36+
test_data = [
37+
u"", u"abcd", (u"defgh",), u"Русский текст",
38+
]
39+
for td in test_data:
40+
print(packs(td, encoding='utf-32'))
41+
re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
42+
assert_equal(re, td)
43+
44+
def testPackBytes():
45+
test_data = [
46+
b"", b"abcd", (b"defgh",),
47+
]
48+
for td in test_data:
49+
check(td)
50+
51+
def testIgnoreUnicodeErrors():
52+
re = unpacks(packs(b'abc\xeddef'),
53+
encoding='utf-8', unicode_errors='ignore')
54+
assert_equal(re, "abcdef")
55+
56+
@raises(UnicodeDecodeError)
57+
def testStrictUnicodeUnpack():
58+
unpacks(packs(b'abc\xeddef'), encoding='utf-8')
59+
60+
@raises(UnicodeEncodeError)
61+
def testStrictUnicodePack():
62+
packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict')
63+
64+
def testIgnoreErrorsPack():
65+
re = unpacks(packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
66+
assert_equal(re, u"abcdef")
67+
68+
@raises(TypeError)
69+
def testNoEncoding():
70+
packs(u"abc", encoding=None)
71+
72+
def testDecodeBinary():
73+
re = unpacks(packs(u"abc"), encoding=None)
74+
assert_equal(re, b"abc")
75+
2776
if __name__ == '__main__':
2877
main()

test3/test_obj.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def test_decode_hook():
2626
unpacked = unpacks(packed, object_hook=_decode_complex)
2727
eq_(unpacked[1], 1+2j)
2828

29-
@raises(TypeError)
29+
@raises(ValueError)
3030
def test_bad_hook():
3131
packed = packs([3, 1+2j], default=lambda o: o)
3232
unpacked = unpacks(packed)

test3/test_pack.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,61 @@ def testPack():
1717
1.0,
1818
b"", b"a", b"a"*31, b"a"*32,
1919
None, True, False,
20-
(), ((),), ((), None,),
21-
{None: 0},
22-
(1<<23),
20+
(), ((),), ((), None,),
21+
{None: 0},
22+
(1<<23),
2323
]
2424
for td in test_data:
2525
check(td)
2626

27+
def testPackUnicode():
28+
test_data = [
29+
"", "abcd", ("defgh",), "Русский текст",
30+
]
31+
for td in test_data:
32+
re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
33+
assert_equal(re, td)
34+
35+
def testPackUTF32():
36+
test_data = [
37+
"", "abcd", ("defgh",), "Русский текст",
38+
]
39+
for td in test_data:
40+
print(packs(td, encoding='utf-32'))
41+
re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
42+
assert_equal(re, td)
43+
44+
def testPackBytes():
45+
test_data = [
46+
b"", b"abcd", (b"defgh",),
47+
]
48+
for td in test_data:
49+
check(td)
50+
51+
def testIgnoreUnicodeErrors():
52+
re = unpacks(packs(b'abc\xeddef'),
53+
encoding='utf-8', unicode_errors='ignore')
54+
assert_equal(re, "abcdef")
55+
56+
@raises(UnicodeDecodeError)
57+
def testStrictUnicodeUnpack():
58+
unpacks(packs(b'abc\xeddef'), encoding='utf-8')
59+
60+
@raises(UnicodeEncodeError)
61+
def testStrictUnicodePack():
62+
packs("abc\xeddef", encoding='ascii', unicode_errors='strict')
63+
64+
def testIgnoreErrorsPack():
65+
re = unpacks(packs("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
66+
assert_equal(re, "abcdef")
67+
68+
@raises(TypeError)
69+
def testNoEncoding():
70+
packs("abc", encoding=None)
71+
72+
def testDecodeBinary():
73+
re = unpacks(packs("abc"), encoding=None)
74+
assert_equal(re, b"abc")
75+
2776
if __name__ == '__main__':
2877
main()

0 commit comments

Comments
 (0)
0