Revert "Remove encoding/unicode_errors options from Packer (#378)"

This reverts commit e1ed004.
msgpack · methane · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019
commit fd83111f750a9c4ebd801a50fe847f51e6eceabd
diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx
@@ -89,9 +89,19 @@ cdef class Packer(object):
         Additionally tuples will not be serialized as lists.
         This is useful when trying to implement accurate serialization
         for python types.
+
+    :param str unicode_errors:
+        Error handler for encoding unicode. (default: 'strict')
+
+    :param str encoding:
+        (deprecated) Convert unicode to bytes with this encoding. (default: 'utf-8')
     """
     cdef msgpack_packer pk
     cdef object _default
+    cdef object _bencoding
+    cdef object _berrors
+    cdef const char *encoding
+    cdef const char *unicode_errors
     cdef bint strict_types
     cdef bool use_float
     cdef bint autoreset
@@ -104,11 +114,11 @@ cdef class Packer(object):
         self.pk.buf_size = buf_size
         self.pk.length = 0
 
-    def __init__(self, default=None,
-                 bint use_single_float=False,
-                 bint autoreset=True,
-                 bint use_bin_type=False,
+    def __init__(self, default=None, encoding=None, unicode_errors=None,
+                 bint use_single_float=False, bint autoreset=True, bint use_bin_type=False,
                  bint strict_types=False):
+        if encoding is not None:
+            PyErr_WarnEx(DeprecationWarning, "encoding is deprecated.", 1)
         self.use_float = use_single_float
         self.strict_types = strict_types
         self.autoreset = autoreset
@@ -118,6 +128,18 @@ cdef class Packer(object):
                 raise TypeError("default must be a callable.")
         self._default = default
 
+        self._bencoding = encoding
+        if encoding is None:
+            self.encoding = 'utf-8'
+        else:
+            self.encoding = self._bencoding
+
+        self._berrors = unicode_errors
+        if unicode_errors is None:
+            self.unicode_errors = NULL
+        else:
+            self.unicode_errors = self._berrors
+
     def __dealloc__(self):
         PyMem_Free(self.pk.buf)
         self.pk.buf = NULL
@@ -183,9 +205,19 @@ cdef class Packer(object):
                 if ret == 0:
                     ret = msgpack_pack_raw_body(&self.pk, rawval, L)
             elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o):
-                ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
-                if ret == -2:
-                    raise ValueError("unicode string is too large")
+                if self.encoding == NULL and self.unicode_errors == NULL:
+                    ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
+                    if ret == -2:
+                        raise ValueError("unicode string is too large")
+                else:
+                    o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
+                    L = Py_SIZE(o)
+                    if L > ITEM_LIMIT:
+                        raise ValueError("unicode string is too large")
+                    ret = msgpack_pack_raw(&self.pk, L)
+                    if ret == 0:
+                        rawval = o
+                        ret = msgpack_pack_raw_body(&self.pk, rawval, L)
             elif PyDict_CheckExact(o):
                 d = <dict>o
                 L = len(d)

diff --git a/msgpack/fallback.py b/msgpack/fallback.py
@@ -752,14 +752,32 @@ class Packer(object):
         Additionally tuples will not be serialized as lists.
         This is useful when trying to implement accurate serialization
         for python types.
+
+    :param str encoding:
+        (deprecated) Convert unicode to bytes with this encoding. (default: 'utf-8')
+
+    :param str unicode_errors:
+        Error handler for encoding unicode. (default: 'strict')
     """
-    def __init__(self, default=None,
+    def __init__(self, default=None, encoding=None, unicode_errors=None,
                  use_single_float=False, autoreset=True, use_bin_type=False,
                  strict_types=False):
+        if encoding is None:
+            encoding = 'utf_8'
+        else:
+            warnings.warn(
+                "encoding is deprecated, Use raw=False instead.",
+                DeprecationWarning, stacklevel=2)
+
+        if unicode_errors is None:
+            unicode_errors = 'strict'
+
         self._strict_types = strict_types
         self._use_float = use_single_float
         self._autoreset = autoreset
         self._use_bin_type = use_bin_type
+        self._encoding = encoding
+        self._unicode_errors = unicode_errors
         self._buffer = StringIO()
         if default is not None:
             if not callable(default):
@@ -816,7 +834,11 @@ def _pack(self, obj, nest_limit=DEFAULT_RECURSE_LIMIT,
                 self._pack_bin_header(n)
                 return self._buffer.write(obj)
             if check(obj, unicode):
-                obj = obj.encode("utf-8")
+                if self._encoding is None:
+                    raise TypeError(
+                        "Can't encode unicode string: "
+                        "no encoding is specified")
+                obj = obj.encode(self._encoding, self._unicode_errors)
                 n = len(obj)
                 if n >= 2**32:
                     raise ValueError("String is too large")

diff --git a/test/test_pack.py b/test/test_pack.py
@@ -40,6 +40,21 @@ def testPackUnicode():
         re = Unpacker(BytesIO(data), raw=False, use_list=1).unpack()
         assert re == td
 
+def testPackUTF32():  # deprecated
+    """Round-trip str data through the deprecated ``encoding='utf-32'`` option.
+
+    A missing codec (LookupError) is reported as an expected failure.
+    """
+    samples = ["", "abcd", ["defgh"], "Русский текст"]
+    try:
+        for td in samples:
+            with pytest.deprecated_call():
+                packed = packb(td, encoding='utf-32')
+                re = unpackb(packed, use_list=1, encoding='utf-32')
+            assert re == td
+    except LookupError as e:
+        xfail(str(e))  # xfail() takes a str reason, not an exception object
+
 def testPackBytes():
     test_data = [
         b"", b"abcd", (b"defgh",),
@@ -54,11 +69,26 @@ def testPackByteArrays():
     for td in test_data:
         check(td)
 
+def testIgnoreUnicodeErrors():  # deprecated
+    with pytest.deprecated_call():
+        packed = packb(b'abc\xeddef')
+        assert unpackb(packed, encoding='utf-8', unicode_errors='ignore', use_list=1) == "abcdef"
+
 def testStrictUnicodeUnpack():
     packed = packb(b'abc\xeddef')
     with pytest.raises(UnicodeDecodeError):
         unpackb(packed, raw=False, use_list=1)
 
+def testStrictUnicodePack():  # deprecated
+    # 'ascii' with 'strict' cannot encode U+00ED, so packing must raise.
+    with pytest.raises(UnicodeEncodeError), pytest.deprecated_call():
+        packb("abc\xeddef", encoding='ascii', unicode_errors='strict')
+
+def testIgnoreErrorsPack():  # deprecated
+    with pytest.deprecated_call():
+        packed = packb("abcФФФdef", encoding='ascii', unicode_errors='ignore')
+        assert unpackb(packed, raw=False, use_list=1) == "abcdef"
+
 def testDecodeBinary():
     re = unpackb(packb(b"abc"), encoding=None, use_list=1)
     assert re == b"abc"