From fd83111f750a9c4ebd801a50fe847f51e6eceabd Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:11:38 +0900 Subject: [PATCH 01/12] Revert "Remove encoding/unicode_errors options from Packer (#378)" This reverts commit e1ed0044bf31dc0d6ef951f6298de4f420170968. --- msgpack/_packer.pyx | 46 ++++++++++++++++++++++++++++++++++++++------- msgpack/fallback.py | 26 +++++++++++++++++++++++-- test/test_pack.py | 30 +++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 9 deletions(-) diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 2e698e16..e6209145 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -89,9 +89,19 @@ cdef class Packer(object): Additionally tuples will not be serialized as lists. This is useful when trying to implement accurate serialization for python types. + + :param str unicode_errors: + Error handler for encoding unicode. (default: 'strict') + + :param str encoding: + (deprecated) Convert unicode to bytes with this encoding. (default: 'utf-8') """ cdef msgpack_packer pk cdef object _default + cdef object _bencoding + cdef object _berrors + cdef const char *encoding + cdef const char *unicode_errors cdef bint strict_types cdef bool use_float cdef bint autoreset @@ -104,11 +114,11 @@ cdef class Packer(object): self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None, - bint use_single_float=False, - bint autoreset=True, - bint use_bin_type=False, + def __init__(self, default=None, encoding=None, unicode_errors=None, + bint use_single_float=False, bint autoreset=True, bint use_bin_type=False, bint strict_types=False): + if encoding is not None: + PyErr_WarnEx(DeprecationWarning, "encoding is deprecated.", 1) self.use_float = use_single_float self.strict_types = strict_types self.autoreset = autoreset @@ -118,6 +128,18 @@ cdef class Packer(object): raise TypeError("default must be a callable.") self._default = default + self._bencoding = encoding + if encoding is None: + self.encoding = 'utf-8' + else: + self.encoding = self._bencoding + + self._berrors = unicode_errors + if unicode_errors is None: + self.unicode_errors = NULL + else: + self.unicode_errors = self._berrors + def __dealloc__(self): PyMem_Free(self.pk.buf) self.pk.buf = NULL @@ -183,9 +205,19 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): - ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT); - if ret == -2: - raise ValueError("unicode string is too large") + if self.encoding == NULL and self.unicode_errors == NULL: + ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT); + if ret == -2: + raise ValueError("unicode string is too large") + else: + o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) + L = Py_SIZE(o) + if L > ITEM_LIMIT: + raise ValueError("unicode string is too large") + ret = msgpack_pack_raw(&self.pk, L) + if ret == 0: + rawval = o + ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyDict_CheckExact(o): d = o L = len(d) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 5dab9065..1ed6e773 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -752,14 +752,32 @@ class Packer(object): Additionally tuples will not be serialized as lists. This is useful when trying to implement accurate serialization for python types. + + :param str encoding: + (deprecated) Convert unicode to bytes with this encoding. (default: 'utf-8') + + :param str unicode_errors: + Error handler for encoding unicode. (default: 'strict') """ - def __init__(self, default=None, + def __init__(self, default=None, encoding=None, unicode_errors=None, use_single_float=False, autoreset=True, use_bin_type=False, strict_types=False): + if encoding is None: + encoding = 'utf_8' + else: + warnings.warn( + "encoding is deprecated, Use raw=False instead.", + DeprecationWarning, stacklevel=2) + + if unicode_errors is None: + unicode_errors = 'strict' + self._strict_types = strict_types self._use_float = use_single_float self._autoreset = autoreset self._use_bin_type = use_bin_type + self._encoding = encoding + self._unicode_errors = unicode_errors self._buffer = StringIO() if default is not None: if not callable(default): @@ -816,7 +834,11 @@ def _pack(self, obj, nest_limit=DEFAULT_RECURSE_LIMIT, self._pack_bin_header(n) return self._buffer.write(obj) if check(obj, unicode): - obj = obj.encode("utf-8") + if self._encoding is None: + raise TypeError( + "Can't encode unicode string: " + "no encoding is specified") + obj = obj.encode(self._encoding, self._unicode_errors) n = len(obj) if n >= 2**32: raise ValueError("String is too large") diff --git a/test/test_pack.py b/test/test_pack.py index 194b2c92..3658a977 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -40,6 +40,21 @@ def testPackUnicode(): re = Unpacker(BytesIO(data), raw=False, use_list=1).unpack() assert re == td +def testPackUTF32(): # deprecated + try: + test_data = [ + "", + "abcd", + ["defgh"], + "Русский текст", + ] + for td in test_data: + with pytest.deprecated_call(): + re = unpackb(packb(td, encoding='utf-32'), use_list=1, encoding='utf-32') + assert re == td + except LookupError as e: + xfail(e) + def testPackBytes(): test_data = [ b"", b"abcd", (b"defgh",), @@ -54,11 +69,26 @@ def testPackByteArrays(): for td in test_data: check(td) +def testIgnoreUnicodeErrors(): # deprecated + with pytest.deprecated_call(): + re = unpackb(packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', use_list=1) + assert re == "abcdef" + def testStrictUnicodeUnpack(): packed = packb(b'abc\xeddef') with pytest.raises(UnicodeDecodeError): unpackb(packed, raw=False, use_list=1) +def testStrictUnicodePack(): # deprecated + with raises(UnicodeEncodeError): + with pytest.deprecated_call(): + packb("abc\xeddef", encoding='ascii', unicode_errors='strict') + +def testIgnoreErrorsPack(): # deprecated + with pytest.deprecated_call(): + re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), raw=False, use_list=1) + assert re == "abcdef" + def testDecodeBinary(): re = unpackb(packb(b"abc"), encoding=None, use_list=1) assert re == b"abc" From 97da8761045e69445648507c0ef74f9518ceaa1b Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:29:23 +0900 Subject: [PATCH 02/12] Remove encoding option from the Packer encoding must be UTF-8. --- msgpack/_packer.pyx | 20 +++++--------------- msgpack/fallback.py | 22 ++++------------------ test/test_pack.py | 29 ++++------------------------- 3 files changed, 13 insertions(+), 58 deletions(-) diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index e6209145..76313990 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -91,10 +91,8 @@ cdef class Packer(object): for python types. :param str unicode_errors: - Error handler for encoding unicode. (default: 'strict') - - :param str encoding: - (deprecated) Convert unicode to bytes with this encoding. (default: 'utf-8') + The error handler for encoding unicode. (default: 'strict') + DO NOT USE THIS!! This option is kept for very specific usage. """ cdef msgpack_packer pk cdef object _default @@ -114,11 +112,9 @@ cdef class Packer(object): self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None, encoding=None, unicode_errors=None, + def __init__(self, *, default=None, unicode_errors=None, bint use_single_float=False, bint autoreset=True, bint use_bin_type=False, bint strict_types=False): - if encoding is not None: - PyErr_WarnEx(DeprecationWarning, "encoding is deprecated.", 1) self.use_float = use_single_float self.strict_types = strict_types self.autoreset = autoreset @@ -128,12 +124,6 @@ cdef class Packer(object): raise TypeError("default must be a callable.") self._default = default - self._bencoding = encoding - if encoding is None: - self.encoding = 'utf-8' - else: - self.encoding = self._bencoding - self._berrors = unicode_errors if unicode_errors is None: self.unicode_errors = NULL @@ -205,12 +195,12 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): - if self.encoding == NULL and self.unicode_errors == NULL: + if self.unicode_errors == NULL: ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT); if ret == -2: raise ValueError("unicode string is too large") else: - o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) + o = PyUnicode_AsEncodedString(o, NULL, self.unicode_errors) L = Py_SIZE(o) if L > ITEM_LIMIT: raise ValueError("unicode string is too large") diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 1ed6e773..844eca79 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -753,22 +753,13 @@ class Packer(object): This is useful when trying to implement accurate serialization for python types. - :param str encoding: - (deprecated) Convert unicode to bytes with this encoding. (default: 'utf-8') - :param str unicode_errors: - Error handler for encoding unicode. (default: 'strict') + The error handler for encoding unicode. (default: 'strict') + DO NOT USE THIS!! This option is kept for very specific usage. """ - def __init__(self, default=None, encoding=None, unicode_errors=None, + def __init__(self, default=None, unicode_errors=None, use_single_float=False, autoreset=True, use_bin_type=False, strict_types=False): - if encoding is None: - encoding = 'utf_8' - else: - warnings.warn( - "encoding is deprecated, Use raw=False instead.", - DeprecationWarning, stacklevel=2) - if unicode_errors is None: unicode_errors = 'strict' @@ -776,7 +767,6 @@ def __init__(self, default=None, encoding=None, unicode_errors=None, self._use_float = use_single_float self._autoreset = autoreset self._use_bin_type = use_bin_type - self._encoding = encoding self._unicode_errors = unicode_errors self._buffer = StringIO() if default is not None: @@ -834,11 +824,7 @@ def _pack(self, obj, nest_limit=DEFAULT_RECURSE_LIMIT, self._pack_bin_header(n) return self._buffer.write(obj) if check(obj, unicode): - if self._encoding is None: - raise TypeError( - "Can't encode unicode string: " - "no encoding is specified") - obj = obj.encode(self._encoding, self._unicode_errors) + obj = obj.encode("utf-8", self._unicode_errors) n = len(obj) if n >= 2**32: raise ValueError("String is too large") diff --git a/test/test_pack.py b/test/test_pack.py index 3658a977..5e001e6a 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -40,21 +40,6 @@ def testPackUnicode(): re = Unpacker(BytesIO(data), raw=False, use_list=1).unpack() assert re == td -def testPackUTF32(): # deprecated - try: - test_data = [ - "", - "abcd", - ["defgh"], - "Русский текст", - ] - for td in test_data: - with pytest.deprecated_call(): - re = unpackb(packb(td, encoding='utf-32'), use_list=1, encoding='utf-32') - assert re == td - except LookupError as e: - xfail(e) - def testPackBytes(): test_data = [ b"", b"abcd", (b"defgh",), @@ -69,28 +54,22 @@ def testPackByteArrays(): for td in test_data: check(td) -def testIgnoreUnicodeErrors(): # deprecated - with pytest.deprecated_call(): - re = unpackb(packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', use_list=1) +def testIgnoreUnicodeErrors(): + re = unpackb(packb(b'abc\xeddef', use_bin_type=False), unicode_errors='ignore', use_list=1) assert re == "abcdef" def testStrictUnicodeUnpack(): - packed = packb(b'abc\xeddef') + packed = packb(b'abc\xeddef', use_bin_type=False) with pytest.raises(UnicodeDecodeError): unpackb(packed, raw=False, use_list=1) -def testStrictUnicodePack(): # deprecated - with raises(UnicodeEncodeError): - with pytest.deprecated_call(): - packb("abc\xeddef", encoding='ascii', unicode_errors='strict') - def testIgnoreErrorsPack(): # deprecated with pytest.deprecated_call(): re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), raw=False, use_list=1) assert re == "abcdef" def testDecodeBinary(): - re = unpackb(packb(b"abc"), encoding=None, use_list=1) + re = unpackb(packb(b"abc"), use_list=1) assert re == b"abc" def testPackFloat(): From d25453455b5f1f259b7205ca6ea7e0567ffeafa0 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:33:34 +0900 Subject: [PATCH 03/12] fixup --- msgpack/_packer.pyx | 2 -- msgpack/fallback.py | 3 --- test/test_pack.py | 4 ++-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 76313990..8b1a392c 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -96,9 +96,7 @@ cdef class Packer(object): """ cdef msgpack_packer pk cdef object _default - cdef object _bencoding cdef object _berrors - cdef const char *encoding cdef const char *unicode_errors cdef bint strict_types cdef bool use_float diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 844eca79..cacfce11 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -760,9 +760,6 @@ class Packer(object): def __init__(self, default=None, unicode_errors=None, use_single_float=False, autoreset=True, use_bin_type=False, strict_types=False): - if unicode_errors is None: - unicode_errors = 'strict' - self._strict_types = strict_types self._use_float = use_single_float self._autoreset = autoreset diff --git a/test/test_pack.py b/test/test_pack.py index 5e001e6a..856b147e 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -63,9 +63,9 @@ def testStrictUnicodeUnpack(): with pytest.raises(UnicodeDecodeError): unpackb(packed, raw=False, use_list=1) -def testIgnoreErrorsPack(): # deprecated +def testIgnoreErrorsPack(): with pytest.deprecated_call(): - re = unpackb(packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), raw=False, use_list=1) + re = unpackb(packb("abcФФФdef", unicode_errors='ignore'), raw=False, use_list=1) assert re == "abcdef" def testDecodeBinary(): From 2b4ffcfaea33beb8f4789f96aa4c0ec80b9b2071 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:44:58 +0900 Subject: [PATCH 04/12] fix tests --- test/test_pack.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_pack.py b/test/test_pack.py index 856b147e..129bb6c2 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -55,7 +55,7 @@ def testPackByteArrays(): check(td) def testIgnoreUnicodeErrors(): - re = unpackb(packb(b'abc\xeddef', use_bin_type=False), unicode_errors='ignore', use_list=1) + re = unpackb(packb(b'abc\xeddef', use_bin_type=False), raw=False, unicode_errors='ignore', use_list=1) assert re == "abcdef" def testStrictUnicodeUnpack(): @@ -64,8 +64,7 @@ def testStrictUnicodeUnpack(): unpackb(packed, raw=False, use_list=1) def testIgnoreErrorsPack(): - with pytest.deprecated_call(): - re = unpackb(packb("abcФФФdef", unicode_errors='ignore'), raw=False, use_list=1) + re = unpackb(packb("abcФФФdef", unicode_errors='ignore'), raw=False, use_list=1) assert re == "abcdef" def testDecodeBinary(): From 2a4a98a3c118c723885b118765893694c315a1de Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:45:30 +0900 Subject: [PATCH 05/12] Update ChangeLog. --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 1352af83..1d784af7 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,7 +5,7 @@ Release Date: TBD * Remove Python 2 support from the ``msgpack/_cmsgpack``. ``msgpack/fallback`` still supports Python 2. -* Remove encoding and unicode_errors options from the Packer. +* Remove ``encoding`` option from the Packer. 0.6.2 From 3b582a8e9f03a2d67815b5951fc3477e8889267e Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:47:50 +0900 Subject: [PATCH 06/12] fix test --- test/test_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pack.py b/test/test_pack.py index 129bb6c2..2b7c5f92 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -64,7 +64,7 @@ def testStrictUnicodeUnpack(): unpackb(packed, raw=False, use_list=1) def testIgnoreErrorsPack(): - re = unpackb(packb("abcФФФdef", unicode_errors='ignore'), raw=False, use_list=1) + re = unpackb(packb("abcФФФdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1) assert re == "abcdef" def testDecodeBinary(): From a9a769d596ec52018e4cff5bde1758bb30fcffdf Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 19:54:06 +0900 Subject: [PATCH 07/12] Use surrogateescape-d input --- test/test_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pack.py b/test/test_pack.py index 2b7c5f92..ad190169 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -64,7 +64,7 @@ def testStrictUnicodeUnpack(): unpackb(packed, raw=False, use_list=1) def testIgnoreErrorsPack(): - re = unpackb(packb("abcФФФdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1) + re = unpackb(packb(u"abc\uDC80\uDCFFdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1) assert re == "abcdef" def testDecodeBinary(): From 28d737b2cc011b83c4eb22baee4913efbb82a3f9 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 20:00:01 +0900 Subject: [PATCH 08/12] fix fallback --- msgpack/fallback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index cacfce11..d0d9fe79 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -764,7 +764,7 @@ def __init__(self, default=None, unicode_errors=None, self._use_float = use_single_float self._autoreset = autoreset self._use_bin_type = use_bin_type - self._unicode_errors = unicode_errors + self._unicode_errors = unicode_errors or "strict" self._buffer = StringIO() if default is not None: if not callable(default): From b582067408cfd187fe4faf82c1d4cb9d6ef03416 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 20:06:57 +0900 Subject: [PATCH 09/12] fix fallback --- msgpack/fallback.py | 2 +- test/test_pack.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index d0d9fe79..0c0c101c 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -667,7 +667,7 @@ def _unpack(self, execute=EX_CONSTRUCT): elif self._raw: obj = bytes(obj) else: - obj = obj.decode('utf_8') + obj = obj.decode('utf_8', self._unicode_errors) return obj if typ == TYPE_EXT: return self._ext_hook(n, bytes(obj)) diff --git a/test/test_pack.py b/test/test_pack.py index ad190169..9d1012b0 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -55,7 +55,8 @@ def testPackByteArrays(): check(td) def testIgnoreUnicodeErrors(): - re = unpackb(packb(b'abc\xeddef', use_bin_type=False), raw=False, unicode_errors='ignore', use_list=1) + re = unpackb(packb(b'abc\xeddef', use_bin_type=False), + raw=False, unicode_errors='ignore') assert re == "abcdef" def testStrictUnicodeUnpack(): From 1963adf9f19eb3887d80ea3a107bce4ef79d04c0 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 20:40:28 +0900 Subject: [PATCH 10/12] Fix PY2 test --- test/test_pack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_pack.py b/test/test_pack.py index 9d1012b0..bacf23a5 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -5,6 +5,7 @@ from collections import OrderedDict from io import BytesIO import struct +import sys import pytest from pytest import raises, xfail @@ -54,6 +55,7 @@ def testPackByteArrays(): for td in test_data: check(td) +@pytest.mark.skipif(sys.version_info < (3,0), "Python 2 passes invalid surrogates") def testIgnoreUnicodeErrors(): re = unpackb(packb(b'abc\xeddef', use_bin_type=False), raw=False, unicode_errors='ignore') From 72272db0156a6865c58187380a0ea96eecb50370 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 20:47:05 +0900 Subject: [PATCH 11/12] fixup --- test/test_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pack.py b/test/test_pack.py index bacf23a5..a97c86d2 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -55,7 +55,7 @@ def testPackByteArrays(): for td in test_data: check(td) -@pytest.mark.skipif(sys.version_info < (3,0), "Python 2 passes invalid surrogates") +@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates") def testIgnoreUnicodeErrors(): re = unpackb(packb(b'abc\xeddef', use_bin_type=False), raw=False, unicode_errors='ignore') From ffcff9af5b919b62535a687ea90704c338de7244 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 3 Dec 2019 20:49:31 +0900 Subject: [PATCH 12/12] fix PY2 again --- test/test_pack.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_pack.py b/test/test_pack.py index a97c86d2..b6752e5a 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -66,6 +66,7 @@ def testStrictUnicodeUnpack(): with pytest.raises(UnicodeDecodeError): unpackb(packed, raw=False, use_list=1) +@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates") def testIgnoreErrorsPack(): re = unpackb(packb(u"abc\uDC80\uDCFFdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1) assert re == "abcdef"