From 511978280b01d952fcb5beb6b5eb04ff273d9108 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Mon, 8 Jan 2018 17:15:45 +0900 Subject: [PATCH 01/11] Add raw_as_bytes option to Unpacker. --- msgpack/_unpacker.pyx | 71 ++++++++++++++++++++++++++++++++----------- msgpack/fallback.py | 24 ++++++++++++++- msgpack/unpack.h | 5 +-- 3 files changed, 80 insertions(+), 20 deletions(-) diff --git a/msgpack/_unpacker.pyx b/msgpack/_unpacker.pyx index 564749e7..fbd1af6a 100644 --- a/msgpack/_unpacker.pyx +++ b/msgpack/_unpacker.pyx @@ -43,8 +43,9 @@ from msgpack import ExtType cdef extern from "unpack.h": ctypedef struct msgpack_user: bint use_list - PyObject* object_hook + bint raw_as_bytes bint has_pairs_hook # call object_hook with k-v pairs + PyObject* object_hook PyObject* list_hook PyObject* ext_hook char *encoding @@ -73,12 +74,14 @@ cdef extern from "unpack.h": cdef inline init_ctx(unpack_context *ctx, object object_hook, object object_pairs_hook, object list_hook, object ext_hook, - bint use_list, char* encoding, char* unicode_errors, + bint use_list, bint raw_as_bytes, + char* encoding, char* unicode_errors, Py_ssize_t max_str_len, Py_ssize_t max_bin_len, Py_ssize_t max_array_len, Py_ssize_t max_map_len, Py_ssize_t max_ext_len): unpack_init(ctx) ctx.user.use_list = use_list + ctx.user.raw_as_bytes = raw_as_bytes ctx.user.object_hook = ctx.user.list_hook = NULL ctx.user.max_str_len = max_str_len ctx.user.max_bin_len = max_bin_len @@ -155,7 +158,7 @@ cdef inline int get_data_from_buffer(object obj, return 1 def unpackb(object packed, object object_hook=None, object list_hook=None, - bint use_list=1, encoding=None, unicode_errors="strict", + bint use_list=1, raw_as_bytes=None, encoding=None, unicode_errors="strict", object_pairs_hook=None, ext_hook=ExtType, Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, @@ -179,22 +182,38 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, cdef char* cenc = NULL cdef char* cerr = 
NULL cdef int new_protocol = 0 + cdef bint _raw_as_bytes + + if encoding is not None: + PyErr_WarnEx( + DeprecationWarning, + "encoding is deprecated, Use raw_as_bytes=False instead.", + 1) + if isinstance(encoding, unicode): + encoding = encoding.encode('ascii') + cenc = PyBytes_AsString(encoding) + + if unicode_errors is not None: + if isinstance(unicode_errors, unicode): + unicode_errors = unicode_errors.encode('ascii') + cerr = PyBytes_AsString(unicode_errors) + + if raw_as_bytes is None: + PyErr_WarnEx( + FutureWarning, + "raw_as_bytes option is not specified. Default value of the option will be changed in future version.", + 1) + _raw_as_bytes = 1 + else: + if encoding is not None: + raise TypeError("raw_as_bytes and encoding are mutually exclusive") + _raw_as_bytes = raw_as_bytes + encoding = "utf_8" get_data_from_buffer(packed, &view, &buf, &buf_len, &new_protocol) - try: - if encoding is not None: - if isinstance(encoding, unicode): - encoding = encoding.encode('ascii') - cenc = PyBytes_AsString(encoding) - - if unicode_errors is not None: - if isinstance(unicode_errors, unicode): - unicode_errors = unicode_errors.encode('ascii') - cerr = PyBytes_AsString(unicode_errors) - init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook, - use_list, cenc, cerr, + use_list, _raw_as_bytes, cenc, cerr, max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) ret = unpack_construct(&ctx, buf, buf_len, &off) finally: @@ -252,6 +271,12 @@ cdef class Unpacker(object): If true, unpack msgpack array to Python list. Otherwise, unpack to Python tuple. (default: True) + :param bool raw_as_bytes: + If true, unpack msgpack raw to Python bytes. Otherwise, unpack to Python str + (or unicode on Python 2) by decoding with UTF-8 encoding. + Currently, the default is true, but it will be changed to false in near future. + So you must specify it explicitly. + :param callable object_hook: When specified, it should be callable. 
Unpacker calls it with a dict argument after unpacking msgpack map. @@ -263,6 +288,7 @@ cdef class Unpacker(object): (See also simplejson) :param str encoding: + (Deprecated, use raw_as_bytes instead) Encoding used for decoding msgpack raw. If it is None (default), msgpack raw is deserialized to Python bytes. @@ -324,7 +350,7 @@ cdef class Unpacker(object): PyMem_Free(self.buf) self.buf = NULL - def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, + def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, raw_as_bytes=None, object object_hook=None, object object_pairs_hook=None, object list_hook=None, encoding=None, unicode_errors='strict', int max_buffer_size=0, object ext_hook=ExtType, @@ -336,6 +362,17 @@ cdef class Unpacker(object): cdef char *cenc=NULL, cdef char *cerr=NULL + cdef bint _raw_as_bytes + + if raw_as_bytes is None: + PyErr_WarnEx( + FutureWarning, + "raw_as_bytes option is not specified. Default value of the option will be changed in future version.", + 1) + _raw_as_bytes = 1 + else: + _raw_as_bytes = raw_as_bytes + self.object_hook = object_hook self.object_pairs_hook = object_pairs_hook self.list_hook = list_hook @@ -381,7 +418,7 @@ cdef class Unpacker(object): cerr = PyBytes_AsString(self.unicode_errors) init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook, - ext_hook, use_list, cenc, cerr, + ext_hook, use_list, _raw_as_bytes, cenc, cerr, max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 5447b530..329155e7 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -145,6 +145,12 @@ class Unpacker(object): If true, unpack msgpack array to Python list. Otherwise, unpack to Python tuple. (default: True) + :param bool raw_as_bytes: + If true, unpack msgpack raw to Python bytes. Otherwise, unpack to Python str + (or unicode on Python 2) by decoding with UTF-8 encoding. 
+ Currently, the default is true, but it will be changed to false in near future. + So you must specify it explicitly. + :param callable object_hook: When specified, it should be callable. Unpacker calls it with a dict argument after unpacking msgpack map. @@ -199,7 +205,7 @@ class Unpacker(object): process(o) """ - def __init__(self, file_like=None, read_size=0, use_list=True, + def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=None, object_hook=None, object_pairs_hook=None, list_hook=None, encoding=None, unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType, @@ -208,6 +214,22 @@ def __init__(self, file_like=None, read_size=0, use_list=True, max_array_len=2147483647, max_map_len=2147483647, max_ext_len=2147483647): + if encoding is not None: + warnings.warn( + "encoding is deprecated, Use raw_as_bytes=False instead.", + DeprecationWarning) + + if raw_as_bytes is None: + warnings.warn( + "raw_as_bytes option is not specified. Default value of the option will be changed in future version.", + FutureWarning) + raw_as_bytes = True + else: + raw_as_bytes = bool(raw_as_bytes) + if encoding is not None: + raise TypeError("raw_as_bytes and encoding are mutually exclusive") + encoding = "utf_8" + if file_like is None: self._feeding = True else: diff --git a/msgpack/unpack.h b/msgpack/unpack.h index da2cfb6a..32bb19b2 100644 --- a/msgpack/unpack.h +++ b/msgpack/unpack.h @@ -20,9 +20,10 @@ #include "unpack_define.h" typedef struct unpack_user { - int use_list; - PyObject *object_hook; + bool use_list; + bool raw_as_bytes; bool has_pairs_hook; + PyObject *object_hook; PyObject *list_hook; PyObject *ext_hook; const char *encoding; From 2fe47afc4044cb1f0ff317b491b5f8ac4da89b63 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Wed, 10 Jan 2018 15:34:08 +0900 Subject: [PATCH 02/11] wip --- msgpack/_unpacker.pyx | 43 ++++++++++++++++++++----------------------- msgpack/unpack.h | 7 +++++-- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git 
a/msgpack/_unpacker.pyx b/msgpack/_unpacker.pyx index fbd1af6a..de339902 100644 --- a/msgpack/_unpacker.pyx +++ b/msgpack/_unpacker.pyx @@ -158,7 +158,7 @@ cdef inline int get_data_from_buffer(object obj, return 1 def unpackb(object packed, object object_hook=None, object list_hook=None, - bint use_list=1, raw_as_bytes=None, encoding=None, unicode_errors="strict", + bint use_list=True, int raw_as_bytes=-1, encoding=None, unicode_errors="strict", object_pairs_hook=None, ext_hook=ExtType, Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, @@ -184,32 +184,29 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, cdef int new_protocol = 0 cdef bint _raw_as_bytes + if raw_as_bytes == -1: + _raw_as_bytes = 1 + else: + if encoding is not None: + raise TypeError("raw_as_bytes and encoding are mutually exclusive") + _raw_as_bytes = raw_as_bytes + if encoding is not None: - PyErr_WarnEx( - DeprecationWarning, - "encoding is deprecated, Use raw_as_bytes=False instead.", - 1) + #PyErr_WarnEx( + # DeprecationWarning, + # "encoding is deprecated, Use raw_as_bytes=False instead.", + # 1) + _raw_as_bytes = 0 if isinstance(encoding, unicode): encoding = encoding.encode('ascii') cenc = PyBytes_AsString(encoding) if unicode_errors is not None: + #PyErr_WarnEx(DeprecationWarning, "unicode_errors is deprecated", 1) if isinstance(unicode_errors, unicode): unicode_errors = unicode_errors.encode('ascii') cerr = PyBytes_AsString(unicode_errors) - if raw_as_bytes is None: - PyErr_WarnEx( - FutureWarning, - "raw_as_bytes option is not specified. 
Default value of the option will be changed in future version.", - 1) - _raw_as_bytes = 1 - else: - if encoding is not None: - raise TypeError("raw_as_bytes and encoding are mutually exclusive") - _raw_as_bytes = raw_as_bytes - encoding = "utf_8" - get_data_from_buffer(packed, &view, &buf, &buf_len, &new_protocol) try: init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook, @@ -350,7 +347,7 @@ cdef class Unpacker(object): PyMem_Free(self.buf) self.buf = NULL - def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, raw_as_bytes=None, + def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, bint raw_as_bytes=-1, object object_hook=None, object object_pairs_hook=None, object list_hook=None, encoding=None, unicode_errors='strict', int max_buffer_size=0, object ext_hook=ExtType, @@ -364,11 +361,11 @@ cdef class Unpacker(object): cdef bint _raw_as_bytes - if raw_as_bytes is None: - PyErr_WarnEx( - FutureWarning, - "raw_as_bytes option is not specified. Default value of the option will be changed in future version.", - 1) + if raw_as_bytes < 0: + #PyErr_WarnEx( + # FutureWarning, + # "raw_as_bytes option is not specified. 
Default value of the option will be changed in future version.", + # 1) _raw_as_bytes = 1 else: _raw_as_bytes = raw_as_bytes diff --git a/msgpack/unpack.h b/msgpack/unpack.h index 32bb19b2..8c2fc467 100644 --- a/msgpack/unpack.h +++ b/msgpack/unpack.h @@ -226,10 +226,13 @@ static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* } PyObject *py; - if(u->encoding) { + + if (u->encoding) { py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors); - } else { + } else if (u->raw_as_bytes) { py = PyBytes_FromStringAndSize(p, l); + } else { + py = PyUnicode_DecodeUTF8(p, l, NULL); } if (!py) return -1; From e4b04234a5372b559f85285e7ca50afa18b4342c Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 12:13:30 +0900 Subject: [PATCH 03/11] wip --- msgpack/_unpacker.pyx | 81 +++++++++++++++++++------------------------ msgpack/fallback.py | 38 ++++++++++---------- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/msgpack/_unpacker.pyx b/msgpack/_unpacker.pyx index de339902..e3f2daab 100644 --- a/msgpack/_unpacker.pyx +++ b/msgpack/_unpacker.pyx @@ -158,7 +158,8 @@ cdef inline int get_data_from_buffer(object obj, return 1 def unpackb(object packed, object object_hook=None, object list_hook=None, - bint use_list=True, int raw_as_bytes=-1, encoding=None, unicode_errors="strict", + bint use_list=True, bint raw_as_bytes=True, + encoding=None, unicode_errors="strict", object_pairs_hook=None, ext_hook=ExtType, Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, @@ -182,35 +183,27 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, cdef char* cenc = NULL cdef char* cerr = NULL cdef int new_protocol = 0 - cdef bint _raw_as_bytes - - if raw_as_bytes == -1: - _raw_as_bytes = 1 - else: - if encoding is not None: - raise TypeError("raw_as_bytes and encoding are mutually exclusive") - _raw_as_bytes = raw_as_bytes if encoding is not None: - #PyErr_WarnEx( - # DeprecationWarning, - # 
"encoding is deprecated, Use raw_as_bytes=False instead.", - # 1) - _raw_as_bytes = 0 + PyErr_WarnEx(PendingDeprecationWarning, "encoding is deprecated, Use raw_as_bytes=False instead.", 1) if isinstance(encoding, unicode): encoding = encoding.encode('ascii') + elif not isinstance(encoding, bytes): + raise TypeError("encoding should be bytes or unicode") cenc = PyBytes_AsString(encoding) if unicode_errors is not None: - #PyErr_WarnEx(DeprecationWarning, "unicode_errors is deprecated", 1) + PyErr_WarnEx(PendingDeprecationWarning, "unicode_errors is deprecated", 1) if isinstance(unicode_errors, unicode): unicode_errors = unicode_errors.encode('ascii') + elif not isinstance(unicode_errors, bytes): + raise TypeError("unicode_errors should be bytes or unicode") cerr = PyBytes_AsString(unicode_errors) get_data_from_buffer(packed, &view, &buf, &buf_len, &new_protocol) try: init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook, - use_list, _raw_as_bytes, cenc, cerr, + use_list, raw_as_bytes, cenc, cerr, max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) ret = unpack_construct(&ctx, buf, buf_len, &off) finally: @@ -269,10 +262,14 @@ cdef class Unpacker(object): Otherwise, unpack to Python tuple. (default: True) :param bool raw_as_bytes: - If true, unpack msgpack raw to Python bytes. Otherwise, unpack to Python str - (or unicode on Python 2) by decoding with UTF-8 encoding. - Currently, the default is true, but it will be changed to false in near future. - So you must specify it explicitly. + If true, unpack msgpack raw to Python bytes (default). + Otherwise, unpack to Python str (or unicode on Python 2) by decoding + with UTF-8 encoding (recommended). + Currently, the default is true, but it will be changed to false in + near future. So you must specify it explicitly for keeping backward + compatibility. + + *encoding* option which is deprecated overrides this option. :param callable object_hook: When specified, it should be callable. 
@@ -284,15 +281,6 @@ cdef class Unpacker(object): Unpacker calls it with a list of key-value pairs after unpacking msgpack map. (See also simplejson) - :param str encoding: - (Deprecated, use raw_as_bytes instead) - Encoding used for decoding msgpack raw. - If it is None (default), msgpack raw is deserialized to Python bytes. - - :param str unicode_errors: - Used for decoding msgpack raw with *encoding*. - (default: `'strict'`) - :param int max_buffer_size: Limits size of data waiting unpacked. 0 means system's INT_MAX (default). Raises `BufferFull` exception when it is insufficient. @@ -310,16 +298,25 @@ cdef class Unpacker(object): :param int max_map_len: Limits max length of map. (default: 2**31-1) + :param str encoding: + Deprecated, use raw_as_bytes instead. + Encoding used for decoding msgpack raw. + If it is None (default), msgpack raw is deserialized to Python bytes. + + :param str unicode_errors: + Deprecated. Used for decoding msgpack raw with *encoding*. + (default: `'strict'`) + - example of streaming deserialize from file-like object:: + Example of streaming deserialize from file-like object:: - unpacker = Unpacker(file_like) + unpacker = Unpacker(file_like, raw_as_bytes=False) for o in unpacker: process(o) - example of streaming deserialize from socket:: + Example of streaming deserialize from socket:: - unpacker = Unpacker() + unpacker = Unpacker(raw_as_bytes=False) while True: buf = sock.recv(1024**2) if not buf: @@ -347,7 +344,8 @@ cdef class Unpacker(object): PyMem_Free(self.buf) self.buf = NULL - def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, bint raw_as_bytes=-1, + def __init__(self, file_like=None, Py_ssize_t read_size=0, + bint use_list=True, bint raw_as_bytes=False, object object_hook=None, object object_pairs_hook=None, object list_hook=None, encoding=None, unicode_errors='strict', int max_buffer_size=0, object ext_hook=ExtType, @@ -359,17 +357,6 @@ cdef class Unpacker(object): cdef char *cenc=NULL, cdef char 
*cerr=NULL - cdef bint _raw_as_bytes - - if raw_as_bytes < 0: - #PyErr_WarnEx( - # FutureWarning, - # "raw_as_bytes option is not specified. Default value of the option will be changed in future version.", - # 1) - _raw_as_bytes = 1 - else: - _raw_as_bytes = raw_as_bytes - self.object_hook = object_hook self.object_pairs_hook = object_pairs_hook self.list_hook = list_hook @@ -397,6 +384,7 @@ cdef class Unpacker(object): self.stream_offset = 0 if encoding is not None: + PyErr_WarnEx(PendingDeprecationWarning, "encoding is deprecated, Use raw_as_bytes=False instead.", 1) if isinstance(encoding, unicode): self.encoding = encoding.encode('ascii') elif isinstance(encoding, bytes): @@ -406,6 +394,7 @@ cdef class Unpacker(object): cenc = PyBytes_AsString(self.encoding) if unicode_errors is not None: + PyErr_WarnEx(PendingDeprecationWarning, "unicode_errors is deprecated", 1) if isinstance(unicode_errors, unicode): self.unicode_errors = unicode_errors.encode('ascii') elif isinstance(unicode_errors, bytes): @@ -415,7 +404,7 @@ cdef class Unpacker(object): cerr = PyBytes_AsString(self.unicode_errors) init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook, - ext_hook, use_list, _raw_as_bytes, cenc, cerr, + ext_hook, use_list, raw_as_bytes, cenc, cerr, max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 329155e7..afe5e65f 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -146,10 +146,14 @@ class Unpacker(object): Otherwise, unpack to Python tuple. (default: True) :param bool raw_as_bytes: - If true, unpack msgpack raw to Python bytes. Otherwise, unpack to Python str - (or unicode on Python 2) by decoding with UTF-8 encoding. - Currently, the default is true, but it will be changed to false in near future. - So you must specify it explicitly. + If true, unpack msgpack raw to Python bytes (default). 
+ Otherwise, unpack to Python str (or unicode on Python 2) by decoding + with UTF-8 encoding (recommended). + Currently, the default is true, but it will be changed to false in + near future. So you must specify it explicitly for keeping backward + compatibility. + + *encoding* option which is deprecated overrides this option. :param callable object_hook: When specified, it should be callable. @@ -189,13 +193,13 @@ class Unpacker(object): example of streaming deserialize from file-like object:: - unpacker = Unpacker(file_like) + unpacker = Unpacker(file_like, raw_as_bytes=False) for o in unpacker: process(o) example of streaming deserialize from socket:: - unpacker = Unpacker() + unpacker = Unpacker(raw_as_bytes=False) while True: buf = sock.recv(1024**2) if not buf: @@ -205,7 +209,7 @@ class Unpacker(object): process(o) """ - def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=None, + def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=False, object_hook=None, object_pairs_hook=None, list_hook=None, encoding=None, unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType, @@ -217,18 +221,11 @@ def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=None if encoding is not None: warnings.warn( "encoding is deprecated, Use raw_as_bytes=False instead.", - DeprecationWarning) - - if raw_as_bytes is None: + PendingDeprecationWarning) + if unicode_errors is not None: warnings.warn( - "raw_as_bytes option is not specified. 
Default value of the option will be changed in future version.", - FutureWarning) - raw_as_bytes = True - else: - raw_as_bytes = bool(raw_as_bytes) - if encoding is not None: - raise TypeError("raw_as_bytes and encoding are mutually exclusive") - encoding = "utf_8" + "unicode_errors is deprecated.", + PendingDeprecationWarning) if file_like is None: self._feeding = True @@ -256,6 +253,7 @@ def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=None if read_size > self._max_buffer_size: raise ValueError("read_size must be smaller than max_buffer_size") self._read_size = read_size or min(self._max_buffer_size, 16*1024) + self._raw_as_bytes = bool(raw_as_bytes) self._encoding = encoding self._unicode_errors = unicode_errors self._use_list = use_list @@ -604,8 +602,10 @@ def _unpack(self, execute=EX_CONSTRUCT): if typ == TYPE_RAW: if self._encoding is not None: obj = obj.decode(self._encoding, self._unicode_errors) - else: + elif self._raw_as_bytes: obj = bytes(obj) + else: + obj = obj.decode('utf_8') return obj if typ == TYPE_EXT: return self._ext_hook(n, bytes(obj)) From 69c76c60eedd2a835876e3ea2cb9d1cee84f5d42 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 12:25:19 +0900 Subject: [PATCH 04/11] fix --- Makefile | 3 ++- msgpack/_unpacker.pyx | 2 +- msgpack/fallback.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 84decd80..6a9906c6 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,8 @@ cython: .PHONY: test test: - py.test -v test + pytest -v test + MSGPACK_PUREPYTHON=1 pytest -v test .PHONY: serve-doc serve-doc: all diff --git a/msgpack/_unpacker.pyx b/msgpack/_unpacker.pyx index e3f2daab..b796d045 100644 --- a/msgpack/_unpacker.pyx +++ b/msgpack/_unpacker.pyx @@ -345,7 +345,7 @@ cdef class Unpacker(object): self.buf = NULL def __init__(self, file_like=None, Py_ssize_t read_size=0, - bint use_list=True, bint raw_as_bytes=False, + bint use_list=True, bint raw_as_bytes=True, object 
object_hook=None, object object_pairs_hook=None, object list_hook=None, encoding=None, unicode_errors='strict', int max_buffer_size=0, object ext_hook=ExtType, diff --git a/msgpack/fallback.py b/msgpack/fallback.py index afe5e65f..2555e929 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -209,7 +209,7 @@ class Unpacker(object): process(o) """ - def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=False, + def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=True, object_hook=None, object_pairs_hook=None, list_hook=None, encoding=None, unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType, From 1e23b1a939a7c13ad1354e74f96c37c301876a5c Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 12:39:32 +0900 Subject: [PATCH 05/11] Deprecate encoding and unicode_errors of Packer --- msgpack/_packer.pyx | 11 ++++++++--- msgpack/fallback.py | 14 +++++++++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 13a18f6c..994fef45 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -2,7 +2,7 @@ #cython: embedsignature=True from cpython cimport * -#from cpython.exc cimport PyErr_WarnEx +from cpython.exc cimport PyErr_WarnEx from msgpack.exceptions import PackValueError, PackOverflowError from msgpack import ExtType @@ -110,9 +110,13 @@ cdef class Packer(object): self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', + def __init__(self, default=None, encoding=None, unicode_errors=None, bint use_single_float=False, bint autoreset=True, bint use_bin_type=False, bint strict_types=False): + if encoding is not None: + PyErr_WarnEx(PendingDeprecationWarning, "encoding is deprecated.", 1) + if unicode_errors is not None: + PyErr_WarnEx(PendingDeprecationWarning, "unicode_errors is deprecated.", 1) self.use_float = use_single_float self.strict_types = strict_types 
self.autoreset = autoreset @@ -122,7 +126,7 @@ cdef class Packer(object): raise TypeError("default must be a callable.") self._default = default if encoding is None: - self.encoding = NULL + self.encoding = 'utf_8' self.unicode_errors = NULL else: if isinstance(encoding, unicode): @@ -203,6 +207,7 @@ cdef class Packer(object): elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): if not self.encoding: raise TypeError("Can't encode unicode string: no encoding is specified") + #TODO: Use faster API for UTF-8 o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) L = len(o) if L > ITEM_LIMIT: diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 2555e929..cdd7af17 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -704,9 +704,21 @@ class Packer(object): :param str unicode_errors: (deprecated) Error handler for encoding unicode. (default: 'strict') """ - def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', + def __init__(self, default=None, encoding=None, unicode_errors=None, use_single_float=False, autoreset=True, use_bin_type=False, strict_types=False): + if encoding is None: + encoding = 'utf_8' + else: + warnings.warn( + "encoding is deprecated, Use raw_as_bytes=False instead.", + PendingDeprecationWarning) + + if unicode_errors is not None: + warnings.warn( + "unicode_errors is deprecated.", + PendingDeprecationWarning) + self._strict_types = strict_types self._use_float = use_single_float self._autoreset = autoreset From 382c00e297af12b21bbb83fb357a2c96b4f3f4b5 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 12:45:25 +0900 Subject: [PATCH 06/11] Use raw_as_bytes in test --- test/test_limits.py | 4 ++-- test/test_pack.py | 18 +++++++++--------- test/test_stricttype.py | 8 ++++---- test/test_unpack.py | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/test/test_limits.py b/test/test_limits.py index 197ef461..3febc30c 100644 --- a/test/test_limits.py 
+++ b/test/test_limits.py @@ -39,11 +39,11 @@ def test_max_str_len(): d = 'x' * 3 packed = packb(d) - unpacker = Unpacker(max_str_len=3, encoding='utf-8') + unpacker = Unpacker(max_str_len=3, raw_as_bytes=False) unpacker.feed(packed) assert unpacker.unpack() == d - unpacker = Unpacker(max_str_len=2, encoding='utf-8') + unpacker = Unpacker(max_str_len=2, raw_as_bytes=False) with pytest.raises(UnpackValueError): unpacker.feed(packed) unpacker.unpack() diff --git a/test/test_pack.py b/test/test_pack.py index ac931038..f925db0b 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -31,14 +31,14 @@ def testPack(): def testPackUnicode(): test_data = ["", "abcd", ["defgh"], "Русский текст"] for td in test_data: - re = unpackb(packb(td, encoding='utf-8'), use_list=1, encoding='utf-8') + re = unpackb(packb(td), use_list=1, raw_as_bytes=False) assert re == td - packer = Packer(encoding='utf-8') + packer = Packer() data = packer.pack(td) - re = Unpacker(BytesIO(data), encoding=str('utf-8'), use_list=1).unpack() + re = Unpacker(BytesIO(data), raw_as_bytes=False, use_list=1).unpack() assert re == td -def testPackUTF32(): +def testPackUTF32(): # deprecated try: test_data = [ "", @@ -66,20 +66,20 @@ def testPackByteArrays(): for td in test_data: check(td) -def testIgnoreUnicodeErrors(): +def testIgnoreUnicodeErrors(): # deprecated re = unpackb(packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', use_list=1) assert re == "abcdef" def testStrictUnicodeUnpack(): with raises(UnicodeDecodeError): - unpackb(packb(b'abc\xeddef'), encoding='utf-8', use_list=1) + unpackb(packb(b'abc\xeddef'), raw_as_bytes=False, use_list=1) -def testStrictUnicodePack(): +def testStrictUnicodePack(): # deprecated with raises(UnicodeEncodeError): packb("abc\xeddef", encoding='ascii', unicode_errors='strict') -def testIgnoreErrorsPack(): - re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8', use_list=1) +def testIgnoreErrorsPack(): # deprecated + re = 
unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), raw_as_bytes=False, use_list=1) assert re == "abcdef" def testNoEncoding(): diff --git a/test/test_stricttype.py b/test/test_stricttype.py index 0f865c83..13239f1e 100644 --- a/test/test_stricttype.py +++ b/test/test_stricttype.py @@ -11,7 +11,7 @@ def default(o): return dict(o._asdict()) raise TypeError('Unsupported type %s' % (type(o),)) packed = packb(T(1, 42), strict_types=True, use_bin_type=True, default=default) - unpacked = unpackb(packed, encoding='utf-8') + unpacked = unpackb(packed, raw_as_bytes=False) assert unpacked == {'foo': 1, 'bar': 42} @@ -32,7 +32,7 @@ def convert(o): return o data = packb(t, strict_types=True, use_bin_type=True, default=default) - expected = unpackb(data, encoding='utf-8', object_hook=convert) + expected = unpackb(data, raw_as_bytes=False, object_hook=convert) assert expected == t @@ -53,10 +53,10 @@ def default(o): def convert(code, payload): if code == MSGPACK_EXT_TYPE_TUPLE: # Unpack and convert to tuple - return tuple(unpackb(payload, encoding='utf-8', ext_hook=convert)) + return tuple(unpackb(payload, raw_as_bytes=False, ext_hook=convert)) raise ValueError('Unknown Ext code {}'.format(code)) data = packb(t, strict_types=True, use_bin_type=True, default=default) - expected = unpackb(data, encoding='utf-8', ext_hook=convert) + expected = unpackb(data, raw_as_bytes=False, ext_hook=convert) assert expected == t diff --git a/test/test_unpack.py b/test/test_unpack.py index c0d711cd..143f9993 100644 --- a/test/test_unpack.py +++ b/test/test_unpack.py @@ -47,8 +47,8 @@ def test_unpacker_ext_hook(): class MyUnpacker(Unpacker): def __init__(self): - super(MyUnpacker, self).__init__(ext_hook=self._hook, - encoding='utf-8') + super(MyUnpacker, self).__init__( + ext_hook=self._hook, raw_as_bytes=False) def _hook(self, code, data): if code == 1: @@ -57,11 +57,11 @@ def _hook(self, code, data): return ExtType(code, data) unpacker = MyUnpacker() - 
unpacker.feed(packb({'a': 1}, encoding='utf-8')) + unpacker.feed(packb({'a': 1})) assert unpacker.unpack() == {'a': 1} - unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8')) + unpacker.feed(packb({'a': ExtType(1, b'123')})) assert unpacker.unpack() == {'a': 123} - unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8')) + unpacker.feed(packb({'a': ExtType(2, b'321')})) assert unpacker.unpack() == {'a': ExtType(2, b'321')} From 10331bb83f91f6c08b0ae139e48f5c36c4c16d41 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 13:03:15 +0900 Subject: [PATCH 07/11] Update README --- README.rst | 78 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index 01a8b2a4..a5038db5 100644 --- a/README.rst +++ b/README.rst @@ -10,8 +10,21 @@ MessagePack for Python :target: https://msgpack-python.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status -IMPORTANT: Upgrading from msgpack-0.4 --------------------------------------- + +What's this +----------- + +`MessagePack `_ is an efficient binary serialization format. +It lets you exchange data among multiple languages like JSON. +But it's faster and smaller. +This package provides CPython bindings for reading and writing MessagePack data. + + +Very important notes for existing users +--------------------------------------- + +PyPI package name +^^^^^^^^^^^^^^^^^ TL;DR: When upgrading from msgpack-0.4 or earlier, don't do `pip install -U msgpack-python`. Do `pip uninstall msgpack-python; pip install msgpack` instead. @@ -24,13 +37,37 @@ Sadly, this doesn't work for upgrade install. After `pip install -U msgpack-pyt msgpack is removed and `import msgpack` fail. -What's this ------------ +Deprecating encoding option +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +encoding and unicode_errors options are deprecated. + +In case of packer, use UTF-8 always. Storing other than UTF-8 is not recommended. 
+ +For backward compatibility, you can use ``use_bin_type=False`` and pack ``bytes`` +object into msgpack raw type. + +In case of unpacker, there is a new ``raw_as_bytes`` option. It is ``True`` by default +for backward compatibility, but it will be changed to ``False`` in the near future. +You can use ``raw_as_bytes=False`` instead of ``encoding='utf-8'``. + +Planned backward incompatible changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In msgpack 1.0, I am planning these breaking changes: + +* packer and unpacker: Remove ``encoding`` and ``unicode_errors`` options. +* packer: Change default of ``use_bin_type`` option from False to True. +* unpacker: Change default of ``raw_as_bytes`` option from True to False. +* unpacker: Reduce all ``max_xxx_len`` options for typical usage. +* unpacker: Remove ``write_bytes`` option from all methods. + +To prevent these breaking changes from breaking your application, please: + +* Don't use deprecated options. +* Pass ``use_bin_type`` and ``raw_as_bytes`` options explicitly. +* If your application handles large (>1MB) data, specify ``max_xxx_len`` options too. -`MessagePack `_ is an efficient binary serialization format. -It lets you exchange data among multiple languages like JSON. -But it's faster and smaller. -This package provides CPython bindings for reading and writing MessagePack data. Install ------- @@ -76,14 +113,14 @@ msgpack provides ``dumps`` and ``loads`` as an alias for compatibility with >>> import msgpack >>> msgpack.packb([1, 2, 3], use_bin_type=True) '\x93\x01\x02\x03' - >>> msgpack.unpackb(_) + >>> msgpack.unpackb(_, raw_as_bytes=False) [1, 2, 3] ``unpack`` unpacks msgpack's array to Python's list, but can also unpack to tuple: .. code-block:: pycon - >>> msgpack.unpackb(b'\x93\x01\x02\x03', use_list=False) + >>> msgpack.unpackb(b'\x93\x01\x02\x03', use_list=False, raw_as_bytes=False) (1, 2, 3) You should always specify the ``use_list`` keyword argument for backward compatibility. 
@@ -109,7 +146,7 @@ stream (or from bytes provided through its ``feed`` method). buf.seek(0) - unpacker = msgpack.Unpacker(buf) + unpacker = msgpack.Unpacker(buf, raw_as_bytes=False) for unpacked in unpacker: print(unpacked) @@ -142,7 +179,7 @@ It is also possible to pack/unpack custom data types. Here is an example for packed_dict = msgpack.packb(useful_dict, default=encode_datetime, use_bin_type=True) - this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime) + this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, raw_as_bytes=False) ``Unpacker``'s ``object_hook`` callback receives a dict; the ``object_pairs_hook`` callback may instead be used to receive a list of @@ -172,7 +209,7 @@ It is also possible to pack/unpack custom data types using the **ext** type. ... >>> data = array.array('d', [1.2, 3.4]) >>> packed = msgpack.packb(data, default=default, use_bin_type=True) - >>> unpacked = msgpack.unpackb(packed, ext_hook=ext_hook) + >>> unpacked = msgpack.unpackb(packed, ext_hook=ext_hook, raw_as_bytes=False) >>> data == unpacked True @@ -217,14 +254,10 @@ Early versions of msgpack didn't distinguish string and binary types (like Pytho The type for representing both string and binary types was named **raw**. For backward compatibility reasons, msgpack-python will still default all -strings to byte strings, unless you specify the `use_bin_type=True` option in +strings to byte strings, unless you specify the ``use_bin_type=True`` option in the packer. If you do so, it will use a non-standard type called **bin** to serialize byte arrays, and **raw** becomes to mean **str**. If you want to -distinguish **bin** and **raw** in the unpacker, specify `encoding='utf-8'`. - -**In future version, default value of ``use_bin_type`` will be changed to ``True``. 
-To avoid this change will break your code, you must specify it explicitly -even when you want to use old format.** +distinguish **bin** and **raw** in the unpacker, specify ``raw_as_bytes=False``. Note that Python 2 defaults to byte-arrays over Unicode strings: @@ -234,7 +267,7 @@ Note that Python 2 defaults to byte-arrays over Unicode strings: >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'])) ['spam', 'eggs'] >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'], use_bin_type=True), - encoding='utf-8') + raw_as_bytes=False) ['spam', u'eggs'] This is the same code in Python 3 (same behaviour, but Python 3 has a @@ -246,7 +279,7 @@ different default): >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'])) [b'spam', b'eggs'] >>> msgpack.unpackb(msgpack.packb([b'spam', u'eggs'], use_bin_type=True), - encoding='utf-8') + raw_as_bytes=False) [b'spam', 'eggs'] @@ -277,6 +310,7 @@ You can use ``gc.disable()`` when unpacking large message. use_list option ^^^^^^^^^^^^^^^ + List is the default sequence type of Python. But tuple is lighter than list. You can use ``use_list=False`` while unpacking when performance is important. @@ -295,7 +329,7 @@ Test MessagePack uses `pytest` for testing. Run test with following command: - $ pytest -v test + $ make test .. 
From 123a6249e0746b566fda4f2e8949287a08640f9c Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 14:58:20 +0900 Subject: [PATCH 08/11] fix --- msgpack/fallback.py | 10 ++++++++-- test/test_pack.py | 4 ---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index cdd7af17..d95f6218 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -211,21 +211,25 @@ class Unpacker(object): def __init__(self, file_like=None, read_size=0, use_list=True, raw_as_bytes=True, object_hook=None, object_pairs_hook=None, list_hook=None, - encoding=None, unicode_errors='strict', max_buffer_size=0, + encoding=None, unicode_errors=None, max_buffer_size=0, ext_hook=ExtType, max_str_len=2147483647, # 2**32-1 max_bin_len=2147483647, max_array_len=2147483647, max_map_len=2147483647, max_ext_len=2147483647): + if encoding is not None: warnings.warn( "encoding is deprecated, Use raw_as_bytes=False instead.", PendingDeprecationWarning) + if unicode_errors is not None: warnings.warn( "unicode_errors is deprecated.", PendingDeprecationWarning) + else: + unicode_errors = 'strict' if file_like is None: self._feeding = True @@ -714,7 +718,9 @@ def __init__(self, default=None, encoding=None, unicode_errors=None, "encoding is deprecated, Use raw_as_bytes=False instead.", PendingDeprecationWarning) - if unicode_errors is not None: + if unicode_errors is None: + unicode_errors = 'strict' + else: warnings.warn( "unicode_errors is deprecated.", PendingDeprecationWarning) diff --git a/test/test_pack.py b/test/test_pack.py index f925db0b..29f5887e 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -82,10 +82,6 @@ def testIgnoreErrorsPack(): # deprecated re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), raw_as_bytes=False, use_list=1) assert re == "abcdef" -def testNoEncoding(): - with raises(TypeError): - packb("abc", encoding=None) - def testDecodeBinary(): re = unpackb(packb(b"abc"), encoding=None, 
use_list=1) assert re == b"abc" From fefd055630c5e9b32bce1faa00140bdfba601e6a Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 15:13:22 +0900 Subject: [PATCH 09/11] Fix AppVeyor's test script --- ci/runtests.bat | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/runtests.bat b/ci/runtests.bat index 9efea00c..02404679 100644 --- a/ci/runtests.bat +++ b/ci/runtests.bat @@ -3,5 +3,7 @@ %PYTHON%\python.exe setup.py install %PYTHON%\python.exe -c "import sys; print(hex(sys.maxsize))" %PYTHON%\python.exe -c "from msgpack import _packer, _unpacker" -%PYTHON%\python.exe -m pytest -v test %PYTHON%\python.exe setup.py bdist_wheel +%PYTHON%\python.exe -m pytest -v test +SET EL=%ERRORLEVEL% +exit /b %EL% From 17b50b470d91c3a5ea102cd40b4f09083ca3ef01 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 16:48:54 +0900 Subject: [PATCH 10/11] fix --- msgpack/_packer.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 994fef45..3c8bce48 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -39,7 +39,7 @@ cdef extern from "pack.h": int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l) cdef int DEFAULT_RECURSE_LIMIT=511 -cdef size_t ITEM_LIMIT = (2**32)-1 +cdef Py_ssize_t ITEM_LIMIT = (2**32)-1 cdef inline int PyBytesLike_Check(object o): @@ -138,7 +138,8 @@ cdef class Packer(object): self._berrors = unicode_errors.encode('ascii') else: self._berrors = unicode_errors - self.unicode_errors = PyBytes_AsString(self._berrors) + if self._berrors is not None: + self.unicode_errors = PyBytes_AsString(self._berrors) def __dealloc__(self): PyMem_Free(self.pk.buf) @@ -153,7 +154,7 @@ cdef class Packer(object): cdef char* rawval cdef int ret cdef dict d - cdef size_t L + cdef Py_ssize_t L cdef int default_used = 0 cdef bint strict_types = self.strict_types cdef Py_buffer view From 8f158a479aeece2dc93ceefe67e2c13308ffc6f9 Mon Sep 17 00:00:00 2001 From: 
INADA Naoki Date: Thu, 11 Jan 2018 16:53:51 +0900 Subject: [PATCH 11/11] Fix for 32bit environment --- msgpack/_packer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 3c8bce48..39da91b5 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -39,7 +39,7 @@ cdef extern from "pack.h": int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l) cdef int DEFAULT_RECURSE_LIMIT=511 -cdef Py_ssize_t ITEM_LIMIT = (2**32)-1 +cdef long long ITEM_LIMIT = (2**32)-1 cdef inline int PyBytesLike_Check(object o):