Skip to content

Commit 4d9d256

Browse files
committed
#17445: difflib: add diff_bytes(), to compare bytes rather than str
Some applications (e.g. traditional Unix diff, version control systems) neither know nor care about the encodings of the files they are comparing. They are textual, but to the diff utility they are just bytes. This worked fine under Python 2, because all of the hardcoded strings in difflib.py are ASCII, so they could safely be combined with old-style u'' strings. But it stopped working in 3.x.

The solution is to use surrogate escapes for a lossless bytes->str->bytes roundtrip. That means {unified,context}_diff() can continue to just handle strings without worrying about bytes. Callers who have to deal with bytes will need to change to using diff_bytes().

Use case: Mercurial's test runner uses difflib to compare current hg output with known good output. But Mercurial's output is just bytes, since it can contain:

* file contents (arbitrary unknown encoding)
* filenames (arbitrary unknown encoding)
* usernames and commit messages (usually UTF-8, but not guaranteed because old versions of Mercurial did not enforce it)
* user messages (locale encoding)

Since the output of any given hg command can include text in multiple encodings, it is hopeless to try to treat it as decodable Unicode text. It's just bytes, all the way down.

This is an elaboration of a patch by Terry Reedy.
1 parent d19458a commit 4d9d256

File tree

5 files changed

+218
-2
lines changed

5 files changed

+218
-2
lines changed

Doc/library/difflib.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,21 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
315315

316316
See :ref:`difflib-interface` for a more detailed example.
317317

318+
.. function:: diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'', fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\\n')
319+
320+
Compare *a* and *b* (lists of bytes objects) using *dfunc*; yield a
321+
sequence of delta lines (also bytes) in the format returned by *dfunc*.
322+
*dfunc* must be a callable, typically either :func:`unified_diff` or
323+
:func:`context_diff`.
324+
325+
Allows you to compare data with unknown or inconsistent encoding. All
326+
inputs except *n* must be bytes objects, not str. Works by losslessly
327+
converting all inputs (except *n*) to str, and calling ``dfunc(a, b,
328+
fromfile, tofile, fromfiledate, tofiledate, n, lineterm)``. The output of
329+
*dfunc* is then converted back to bytes, so the delta lines that you
330+
receive have the same unknown/inconsistent encodings as *a* and *b*.
331+
332+
.. versionadded:: 3.5
318333

319334
.. function:: IS_LINE_JUNK(line)
320335

Doc/whatsnew/3.5.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,9 @@ difflib
302302
charset of HTML document changed from ``'ISO-8859-1'`` to ``'utf-8'``.
303303
(Contributed by Berker Peksag in :issue:`2052`.)
304304

305+
* It's now possible to compare lists of byte strings with
306+
:func:`difflib.diff_bytes` (fixes a regression from Python 2).
307+
305308
distutils
306309
---------
307310

Lib/difflib.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
3030
'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
31-
'unified_diff', 'HtmlDiff', 'Match']
31+
'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
3232

3333
from heapq import nlargest as _nlargest
3434
from collections import namedtuple as _namedtuple
@@ -1174,6 +1174,7 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
11741174
four
11751175
"""
11761176

1177+
_check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
11771178
started = False
11781179
for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
11791180
if not started:
@@ -1261,6 +1262,7 @@ def context_diff(a, b, fromfile='', tofile='',
12611262
four
12621263
"""
12631264

1265+
_check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
12641266
prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ')
12651267
started = False
12661268
for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
@@ -1292,6 +1294,53 @@ def context_diff(a, b, fromfile='', tofile='',
12921294
for line in b[j1:j2]:
12931295
yield prefix[tag] + line
12941296

1297+
def _check_types(a, b, *args):
    """Raise TypeError unless the diff inputs are str.

    Called at the top of unified_diff() and context_diff().

    a, b  -- the two sequences of lines being compared.  Only the first
             element of each non-empty sequence is inspected.
    args  -- the remaining textual arguments (file names, file dates,
             line terminator); every one of them must be str.
    """
    # Checking types is weird, but the alternative is garbled output when
    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
    # without this check, passing filenames as bytes results in output like
    #   --- b'oldfile.txt'
    #   +++ b'newfile.txt'
    # because of how str.format() incorporates bytes objects.
    for lines in (a, b):
        if lines and not isinstance(lines[0], str):
            first = lines[0]
            raise TypeError('lines to compare must be str, not %s (%r)' %
                            (type(first).__name__, first))
    for arg in args:
        if not isinstance(arg, str):
            raise TypeError('all arguments must be str, not: %r' % (arg,))
1313+
1314+
def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
    r"""
    Compare `a` and `b`, two sequences of lines represented as bytes rather
    than str, and generate the delta lines as bytes.

    This is a wrapper for `dfunc`, which is typically either
    unified_diff() or context_diff().  Inputs are losslessly converted to
    strings so that `dfunc` only has to worry about strings, and each line
    it produces is encoded back to bytes before being yielded.  This is
    necessary to compare files with unknown or inconsistent encoding.

    `dfunc` is called positionally as
    dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm).

    All inputs other than `dfunc` and `n` must be bytes rather than str;
    otherwise TypeError is raised.  Because this is a generator, that
    error surfaces when iteration starts, not when diff_bytes() is called.
    """
    def decode(s):
        # ASCII + surrogateescape maps every byte >= 0x80 to a lone
        # surrogate, so the encode() below restores the original bytes.
        try:
            return s.decode('ascii', 'surrogateescape')
        except AttributeError as err:
            # No .decode attribute (e.g. a str was passed): report it as
            # a type problem.
            msg = ('all arguments must be bytes, not %s (%r)' %
                   (type(s).__name__, s))
            raise TypeError(msg) from err

    a = list(map(decode, a))
    b = list(map(decode, b))
    fromfile = decode(fromfile)
    tofile = decode(tofile)
    fromfiledate = decode(fromfiledate)
    tofiledate = decode(tofiledate)
    lineterm = decode(lineterm)

    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
    for line in lines:
        yield line.encode('ascii', 'surrogateescape')
1343+
12951344
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
12961345
r"""
12971346
Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

Lib/test/test_difflib.py

Lines changed: 146 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,12 +322,157 @@ def test_range_format_context(self):
322322
self.assertEqual(fmt(0,0), '0')
323323

324324

325+
class TestBytes(unittest.TestCase):
    # Tests for difflib.diff_bytes() and for the str/bytes type checks in
    # unified_diff() and context_diff().

    # don't really care about the content of the output, just the fact
    # that it's bytes and we don't crash
    def check(self, diff):
        diff = list(diff)   # trigger exceptions first
        for line in diff:
            self.assertIsInstance(
                line, bytes,
                "all lines of diff should be bytes, but got: %r" % line)

    def test_byte_content(self):
        # if we receive byte strings, we return byte strings
        a = [b'hello', b'andr\xe9']     # iso-8859-1 bytes
        b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes

        unified = difflib.unified_diff
        context = difflib.context_diff

        check = self.check
        check(difflib.diff_bytes(unified, a, a))
        check(difflib.diff_bytes(unified, a, b))

        # now with filenames (content and filenames are all bytes!)
        check(difflib.diff_bytes(unified, a, a, b'a', b'a'))
        check(difflib.diff_bytes(unified, a, b, b'a', b'b'))

        # and with filenames and dates
        check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013'))
        check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013'))

        # same all over again, with context diff
        check(difflib.diff_bytes(context, a, a))
        check(difflib.diff_bytes(context, a, b))
        check(difflib.diff_bytes(context, a, a, b'a', b'a'))
        check(difflib.diff_bytes(context, a, b, b'a', b'b'))
        check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
        check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))

    def test_byte_filenames(self):
        # somebody renamed a file from ISO-8859-2 to UTF-8
        fna = b'\xb3odz.txt'    # "łodz.txt"
        fnb = b'\xc5\x82odz.txt'

        # they transcoded the content at the same time
        a = [b'\xa3odz is a city in Poland.']
        b = [b'\xc5\x81odz is a city in Poland.']

        check = self.check
        unified = difflib.unified_diff
        context = difflib.context_diff
        check(difflib.diff_bytes(unified, a, b, fna, fnb))
        check(difflib.diff_bytes(context, a, b, fna, fnb))

        def assertDiff(expect, actual):
            # do not compare expect and actual as lists, because unittest
            # uses difflib to report difference between lists
            actual = list(actual)
            self.assertEqual(len(expect), len(actual))
            for e, a in zip(expect, actual):
                self.assertEqual(e, a)

        expect = [
            b'--- \xb3odz.txt',
            b'+++ \xc5\x82odz.txt',
            b'@@ -1 +1 @@',
            b'-\xa3odz is a city in Poland.',
            b'+\xc5\x81odz is a city in Poland.',
        ]
        actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'')
        assertDiff(expect, actual)

        # with dates (plain ASCII)
        datea = b'2005-03-18'
        dateb = b'2005-03-19'
        check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb))
        check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb))

        expect = [
            # note the mixed encodings here: this is deeply wrong by every
            # tenet of Unicode, but it doesn't crash, it's parseable by
            # patch, and it's how UNIX(tm) diff behaves
            b'--- \xb3odz.txt\t2005-03-18',
            b'+++ \xc5\x82odz.txt\t2005-03-19',
            b'@@ -1 +1 @@',
            b'-\xa3odz is a city in Poland.',
            b'+\xc5\x81odz is a city in Poland.',
        ]
        actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb,
                                    lineterm=b'')
        assertDiff(expect, actual)

    def test_mixed_types_content(self):
        # type of input content must be consistent: all str or all bytes
        a = [b'hello']
        b = ['hello']

        unified = difflib.unified_diff
        context = difflib.context_diff

        expect = "lines to compare must be str, not bytes (b'hello')"
        self._assert_type_error(expect, unified, a, b)
        self._assert_type_error(expect, unified, b, a)
        self._assert_type_error(expect, context, a, b)
        self._assert_type_error(expect, context, b, a)

        expect = "all arguments must be bytes, not str ('hello')"
        self._assert_type_error(expect, difflib.diff_bytes, unified, a, b)
        self._assert_type_error(expect, difflib.diff_bytes, unified, b, a)
        self._assert_type_error(expect, difflib.diff_bytes, context, a, b)
        self._assert_type_error(expect, difflib.diff_bytes, context, b, a)

    def test_mixed_types_filenames(self):
        # cannot pass filenames as bytes if content is str (this may not be
        # the right behaviour, but at least the test demonstrates how
        # things work)
        a = ['hello\n']
        b = ['ohell\n']
        fna = b'ol\xe9.txt'     # filename transcoded from ISO-8859-1
        fnb = b'ol\xc3\xa9.txt' # to UTF-8
        self._assert_type_error(
            "all arguments must be str, not: b'ol\\xe9.txt'",
            difflib.unified_diff, a, b, fna, fnb)

    def test_mixed_types_dates(self):
        # type of dates must be consistent with type of contents
        a = [b'foo\n']
        b = [b'bar\n']
        datea = '1 fév'
        dateb = '3 fév'
        self._assert_type_error(
            "all arguments must be bytes, not str ('1 fév')",
            difflib.diff_bytes, difflib.unified_diff,
            a, b, b'a', b'b', datea, dateb)

        # if input is str, non-ASCII dates are fine
        a = ['foo\n']
        b = ['bar\n']
        list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb))

    def _assert_type_error(self, msg, generator, *args):
        # The diff functions are generators, so the call itself never
        # raises; list() forces iteration and with it the type check.
        with self.assertRaises(TypeError) as ctx:
            list(generator(*args))
        self.assertEqual(msg, str(ctx.exception))
468+
469+
325470
def test_main():
326471
difflib.HtmlDiff._default_prefix = 0
327472
Doctests = doctest.DocTestSuite(difflib)
328473
run_unittest(
329474
TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
330-
TestOutputFormat, Doctests)
475+
TestOutputFormat, TestBytes, Doctests)
331476

332477
if __name__ == '__main__':
333478
test_main()

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,10 @@ Library
195195
- Issue #23310: Fix MagicMock's initializer to work with __methods__, just
196196
like configure_mock(). Patch by Kasia Jachim.
197197

198+
- Issue #17445: add difflib.diff_bytes() to support comparison of
199+
byte strings (fixes a regression from Python 2).
200+
201+
198202
Build
199203
-----
200204

0 commit comments

Comments
 (0)