Skip to content

Commit b161562

Browse files
committed
Issue python#17909: Accept binary input in json.loads
json.loads (and hence json.load) now support binary input encoded as UTF-8, UTF-16 or UTF-32. Patch by Serhiy Storchaka.
1 parent 457fc9a commit b161562

6 files changed

Lines changed: 70 additions & 16 deletions

File tree

Doc/library/json.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,9 @@ Basic Usage
268268

269269
.. function:: loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw)
270270

271-
Deserialize *s* (a :class:`str` instance containing a JSON document) to a
272-
Python object using this :ref:`conversion table <json-to-py-table>`.
271+
Deserialize *s* (a :class:`str`, :class:`bytes` or :class:`bytearray`
272+
instance containing a JSON document) to a Python object using this
273+
:ref:`conversion table <json-to-py-table>`.
273274

274275
The other arguments have the same meaning as in :func:`load`, except
275276
*encoding* which is ignored and deprecated.

Doc/whatsnew/3.6.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,14 @@ restriction that :class:`importlib.machinery.BuiltinImporter` and
680680
:term:`path-like object`.
681681

682682

683+
json
684+
----
685+
686+
:func:`json.load` and :func:`json.loads` now support binary input. Encoded
687+
JSON should be represented using either UTF-8, UTF-16, or UTF-32.
688+
(Contributed by Serhiy Storchaka in :issue:`17909`.)
689+
690+
683691
os
684692
--
685693

Lib/json/__init__.py

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@
105105

106106
from .decoder import JSONDecoder, JSONDecodeError
107107
from .encoder import JSONEncoder
108+
import codecs
108109

109110
_default_encoder = JSONEncoder(
110111
skipkeys=False,
@@ -240,6 +241,35 @@ def dumps(obj, *, skipkeys=False, ensure_ascii=True, check_circular=True,
240241
_default_decoder = JSONDecoder(object_hook=None, object_pairs_hook=None)
241242

242243

244+
def detect_encoding(b):
    """Guess which Unicode codec the binary JSON document *b* is encoded in.

    *b* is a bytes-like object supporting ``startswith``, ``len`` and
    integer indexing (``bytes`` or ``bytearray``).  The return value is a
    codec name suitable for ``b.decode()``.

    A leading byte order mark decides the answer when present.  Without
    one, the placement of zero bytes among the first bytes is used: a JSON
    text begins with ASCII characters, so the zero padding reveals the
    code-unit width and byte order.  Anything else is treated as UTF-8.
    """
    has_prefix = b.startswith
    # Order matters: BOM_UTF32_LE itself begins with BOM_UTF16_LE, so the
    # UTF-32 marks have to be tried before the UTF-16 ones.
    bom_table = (
        ((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE), 'utf-32'),
        ((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE), 'utf-16'),
        ((codecs.BOM_UTF8,), 'utf-8-sig'),
    )
    for marks, codec_name in bom_table:
        if has_prefix(marks):
            return codec_name

    size = len(b)
    if size >= 4:
        if not b[0]:
            # 00 00 -- --  ->  utf-32-be
            # 00 XX -- --  ->  utf-16-be
            if b[1]:
                return 'utf-16-be'
            return 'utf-32-be'
        if not b[1]:
            # XX 00 00 00  ->  utf-32-le
            # XX 00 and a non-zero third or fourth byte  ->  utf-16-le
            if b[2] or b[3]:
                return 'utf-16-le'
            return 'utf-32-le'
    elif size == 2:
        # Exactly one UTF-16 code unit: the zero byte gives the byte order.
        if not b[0]:
            return 'utf-16-be'
        if not b[1]:
            return 'utf-16-le'

    # No BOM and no telltale zero bytes: fall back to the default.
    return 'utf-8'
271+
272+
243273
def load(fp, *, cls=None, object_hook=None, parse_float=None,
244274
parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
245275
"""Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
@@ -270,8 +300,8 @@ def load(fp, *, cls=None, object_hook=None, parse_float=None,
270300

271301
def loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None,
272302
parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
273-
"""Deserialize ``s`` (a ``str`` instance containing a JSON
274-
document) to a Python object.
303+
"""Deserialize ``s`` (a ``str``, ``bytes`` or ``bytearray`` instance
304+
containing a JSON document) to a Python object.
275305
276306
``object_hook`` is an optional function that will be called with the
277307
result of any object literal decode (a ``dict``). The return value of
@@ -307,12 +337,16 @@ def loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None,
307337
The ``encoding`` argument is ignored and deprecated.
308338
309339
"""
310-
if not isinstance(s, str):
311-
raise TypeError('the JSON object must be str, not {!r}'.format(
312-
s.__class__.__name__))
313-
if s.startswith(u'\ufeff'):
314-
raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)",
315-
s, 0)
340+
if isinstance(s, str):
341+
if s.startswith('\ufeff'):
342+
raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)",
343+
s, 0)
344+
else:
345+
if not isinstance(s, (bytes, bytearray)):
346+
raise TypeError('the JSON object must be str, bytes or bytearray, '
347+
'not {!r}'.format(s.__class__.__name__))
348+
s = s.decode(detect_encoding(s), 'surrogatepass')
349+
316350
if (cls is None and object_hook is None and
317351
parse_int is None and parse_float is None and
318352
parse_constant is None and object_pairs_hook is None and not kw):

Lib/test/test_json/test_decode.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,8 @@ def test_invalid_escape(self):
7272

7373
def test_invalid_input_type(self):
7474
msg = 'the JSON object must be str'
75-
for value in [1, 3.14, b'bytes', b'\xff\x00', [], {}, None]:
75+
for value in [1, 3.14, [], {}, None]:
7676
self.assertRaisesRegex(TypeError, msg, self.loads, value)
77-
with self.assertRaisesRegex(TypeError, msg):
78-
self.json.load(BytesIO(b'[1,2,3]'))
7977

8078
def test_string_with_utf8_bom(self):
8179
# see #18958

Lib/test/test_json/test_unicode.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import codecs
12
from collections import OrderedDict
23
from test.test_json import PyTest, CTest
34

@@ -52,9 +53,18 @@ def test_bytes_encode(self):
5253
self.assertRaises(TypeError, self.dumps, [b"hi"])
5354

5455
def test_bytes_decode(self):
55-
self.assertRaises(TypeError, self.loads, b'"hi"')
56-
self.assertRaises(TypeError, self.loads, b'["hi"]')
57-
56+
for encoding, bom in [
57+
('utf-8', codecs.BOM_UTF8),
58+
('utf-16be', codecs.BOM_UTF16_BE),
59+
('utf-16le', codecs.BOM_UTF16_LE),
60+
('utf-32be', codecs.BOM_UTF32_BE),
61+
('utf-32le', codecs.BOM_UTF32_LE),
62+
]:
63+
data = ["a\xb5\u20ac\U0001d120"]
64+
encoded = self.dumps(data).encode(encoding)
65+
self.assertEqual(self.loads(bom + encoded), data)
66+
self.assertEqual(self.loads(encoded), data)
67+
self.assertRaises(UnicodeDecodeError, self.loads, b'["\x80"]')
5868

5969
def test_object_pairs_hook_with_unicode(self):
6070
s = '{"xkd":1, "kcw":2, "art":3, "hxm":4, "qrt":5, "pad":6, "hoy":7}'

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ Core and Builtins
135135
Library
136136
-------
137137

138+
- Issue #17909: ``json.load`` and ``json.loads`` now support binary input
139+
encoded as UTF-8, UTF-16 or UTF-32. Patch by Serhiy Storchaka.
140+
138141
- Issue #27137: the pure Python fallback implementation of ``functools.partial``
139142
now matches the behaviour of its accelerated C counterpart for subclassing,
140143
pickling and text representation purposes. Patch by Emanuel Barry and

0 commit comments

Comments
 (0)