Skip to content

Commit d54cf8f

Browse files
ShaharNavehyouknowone
authored andcommitted
Update difflib.py from 3.14.3
1 parent 32b5778 commit d54cf8f

File tree

3 files changed

+150
-95
lines changed

3 files changed

+150
-95
lines changed

Lib/difflib.py

Lines changed: 96 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ class SequenceMatcher:
7878
sequences. As a rule of thumb, a .ratio() value over 0.6 means the
7979
sequences are close matches:
8080
81-
>>> print(round(s.ratio(), 3))
82-
0.866
81+
>>> print(round(s.ratio(), 2))
82+
0.87
8383
>>>
8484
8585
If you're only interested in where the sequences match,
@@ -908,87 +908,85 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908908
+ abcdefGhijkl
909909
? ^ ^ ^
910910
"""
911-
912-
# don't synch up unless the lines have a similarity score of at
913-
# least cutoff; best_ratio tracks the best score seen so far
914-
best_ratio, cutoff = 0.74, 0.75
911+
# Don't synch up unless the lines have a similarity score above
912+
# cutoff. Previously only the smallest pair was handled here,
913+
# and if there are many pairs with the best ratio, recursion
914+
# could grow very deep, and runtime cubic. See:
915+
# https://github.com/python/cpython/issues/119105
916+
#
917+
# Later, more pathological cases prompted removing recursion
918+
# entirely.
919+
cutoff = 0.74999
915920
cruncher = SequenceMatcher(self.charjunk)
916-
eqi, eqj = None, None # 1st indices of equal lines (if any)
921+
crqr = cruncher.real_quick_ratio
922+
cqr = cruncher.quick_ratio
923+
cr = cruncher.ratio
917924

918-
# search for the pair that matches best without being identical
919-
# (identical lines must be junk lines, & we don't want to synch up
920-
# on junk -- unless we have to)
925+
WINDOW = 10
926+
best_i = best_j = None
927+
dump_i, dump_j = alo, blo # smallest indices not yet resolved
921928
for j in range(blo, bhi):
922-
bj = b[j]
923-
cruncher.set_seq2(bj)
924-
for i in range(alo, ahi):
925-
ai = a[i]
926-
if ai == bj:
927-
if eqi is None:
928-
eqi, eqj = i, j
929-
continue
930-
cruncher.set_seq1(ai)
931-
# computing similarity is expensive, so use the quick
932-
# upper bounds first -- have seen this speed up messy
933-
# compares by a factor of 3.
934-
# note that ratio() is only expensive to compute the first
935-
# time it's called on a sequence pair; the expensive part
936-
# of the computation is cached by cruncher
937-
if cruncher.real_quick_ratio() > best_ratio and \
938-
cruncher.quick_ratio() > best_ratio and \
939-
cruncher.ratio() > best_ratio:
940-
best_ratio, best_i, best_j = cruncher.ratio(), i, j
941-
if best_ratio < cutoff:
942-
# no non-identical "pretty close" pair
943-
if eqi is None:
944-
# no identical pair either -- treat it as a straight replace
945-
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
946-
return
947-
# no close pair, but an identical pair -- synch up on that
948-
best_i, best_j, best_ratio = eqi, eqj, 1.0
949-
else:
950-
# there's a close pair, so forget the identical pair (if any)
951-
eqi = None
952-
953-
# a[best_i] very similar to b[best_j]; eqi is None iff they're not
954-
# identical
955-
956-
# pump out diffs from before the synch point
957-
yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
958-
959-
# do intraline marking on the synch pair
960-
aelt, belt = a[best_i], b[best_j]
961-
if eqi is None:
962-
# pump out a '-', '?', '+', '?' quad for the synched lines
963-
atags = btags = ""
964-
cruncher.set_seqs(aelt, belt)
965-
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
966-
la, lb = ai2 - ai1, bj2 - bj1
967-
if tag == 'replace':
968-
atags += '^' * la
969-
btags += '^' * lb
970-
elif tag == 'delete':
971-
atags += '-' * la
972-
elif tag == 'insert':
973-
btags += '+' * lb
974-
elif tag == 'equal':
975-
atags += ' ' * la
976-
btags += ' ' * lb
977-
else:
978-
raise ValueError('unknown tag %r' % (tag,))
979-
yield from self._qformat(aelt, belt, atags, btags)
980-
else:
981-
# the synch pair is identical
982-
yield ' ' + aelt
929+
cruncher.set_seq2(b[j])
930+
# Search the corresponding i's within WINDOW for rhe highest
931+
# ratio greater than `cutoff`.
932+
aequiv = alo + (j - blo)
933+
arange = range(max(aequiv - WINDOW, dump_i),
934+
min(aequiv + WINDOW + 1, ahi))
935+
if not arange: # likely exit if `a` is shorter than `b`
936+
break
937+
best_ratio = cutoff
938+
for i in arange:
939+
cruncher.set_seq1(a[i])
940+
# Ordering by cheapest to most expensive ratio is very
941+
# valuable, most often getting out early.
942+
if (crqr() > best_ratio
943+
and cqr() > best_ratio
944+
and cr() > best_ratio):
945+
best_i, best_j, best_ratio = i, j, cr()
946+
947+
if best_i is None:
948+
# found nothing to synch on yet - move to next j
949+
continue
983950

984-
# pump out diffs from after the synch point
985-
yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
951+
# pump out straight replace from before this synch pair
952+
yield from self._fancy_helper(a, dump_i, best_i,
953+
b, dump_j, best_j)
954+
# do intraline marking on the synch pair
955+
aelt, belt = a[best_i], b[best_j]
956+
if aelt != belt:
957+
# pump out a '-', '?', '+', '?' quad for the synched lines
958+
atags = btags = ""
959+
cruncher.set_seqs(aelt, belt)
960+
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
961+
la, lb = ai2 - ai1, bj2 - bj1
962+
if tag == 'replace':
963+
atags += '^' * la
964+
btags += '^' * lb
965+
elif tag == 'delete':
966+
atags += '-' * la
967+
elif tag == 'insert':
968+
btags += '+' * lb
969+
elif tag == 'equal':
970+
atags += ' ' * la
971+
btags += ' ' * lb
972+
else:
973+
raise ValueError('unknown tag %r' % (tag,))
974+
yield from self._qformat(aelt, belt, atags, btags)
975+
else:
976+
# the synch pair is identical
977+
yield ' ' + aelt
978+
dump_i, dump_j = best_i + 1, best_j + 1
979+
best_i = best_j = None
980+
981+
# pump out straight replace from after the last synch pair
982+
yield from self._fancy_helper(a, dump_i, ahi,
983+
b, dump_j, bhi)
986984

987985
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
988986
g = []
989987
if alo < ahi:
990988
if blo < bhi:
991-
g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
989+
g = self._plain_replace(a, alo, ahi, b, blo, bhi)
992990
else:
993991
g = self._dump('-', a, alo, ahi)
994992
elif blo < bhi:
@@ -1040,11 +1038,9 @@ def _qformat(self, aline, bline, atags, btags):
10401038
# remaining is that perhaps it was really the case that " volatile"
10411039
# was inserted after "private". I can live with that <wink>.
10421040

1043-
import re
1044-
1045-
def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
1041+
def IS_LINE_JUNK(line, pat=None):
10461042
r"""
1047-
Return True for ignorable line: iff `line` is blank or contains a single '#'.
1043+
Return True for ignorable line: if `line` is blank or contains a single '#'.
10481044
10491045
Examples:
10501046
@@ -1056,6 +1052,11 @@ def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
10561052
False
10571053
"""
10581054

1055+
if pat is None:
1056+
# Default: match '#' or the empty string
1057+
return line.strip() in '#'
1058+
# Previous versions used the undocumented parameter 'pat' as a
1059+
# match function. Retain this behaviour for compatibility.
10591060
return pat(line) is not None
10601061

10611062
def IS_CHARACTER_JUNK(ch, ws=" \t"):
@@ -1266,6 +1267,12 @@ def _check_types(a, b, *args):
12661267
if b and not isinstance(b[0], str):
12671268
raise TypeError('lines to compare must be str, not %s (%r)' %
12681269
(type(b[0]).__name__, b[0]))
1270+
if isinstance(a, str):
1271+
raise TypeError('input must be a sequence of strings, not %s' %
1272+
type(a).__name__)
1273+
if isinstance(b, str):
1274+
raise TypeError('input must be a sequence of strings, not %s' %
1275+
type(b).__name__)
12691276
for arg in args:
12701277
if not isinstance(arg, str):
12711278
raise TypeError('all arguments must be str, not: %r' % (arg,))
@@ -1628,13 +1635,22 @@ def _line_pair_iterator():
16281635
</html>"""
16291636

16301637
_styles = """
1638+
:root {color-scheme: light dark}
16311639
table.diff {font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; border:medium}
16321640
.diff_header {background-color:#e0e0e0}
16331641
td.diff_header {text-align:right}
16341642
.diff_next {background-color:#c0c0c0}
1635-
.diff_add {background-color:#aaffaa}
1643+
.diff_add {background-color:palegreen}
16361644
.diff_chg {background-color:#ffff77}
1637-
.diff_sub {background-color:#ffaaaa}"""
1645+
.diff_sub {background-color:#ffaaaa}
1646+
1647+
@media (prefers-color-scheme: dark) {
1648+
.diff_header {background-color:#666}
1649+
.diff_next {background-color:#393939}
1650+
.diff_add {background-color:darkgreen}
1651+
.diff_chg {background-color:#847415}
1652+
.diff_sub {background-color:darkred}
1653+
}"""
16381654

16391655
_table_template = """
16401656
<table class="diff" id="difflib_chg_%(prefix)s_top"
@@ -2014,7 +2030,6 @@ def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
20142030
replace('\1','</span>'). \
20152031
replace('\t','&nbsp;')
20162032

2017-
del re
20182033

20192034
def restore(delta, which):
20202035
r"""
@@ -2047,10 +2062,3 @@ def restore(delta, which):
20472062
for line in delta:
20482063
if line[:2] in prefixes:
20492064
yield line[2:]
2050-
2051-
def _test():
2052-
import doctest, difflib
2053-
return doctest.testmod(difflib)
2054-
2055-
if __name__ == "__main__":
2056-
_test()

Lib/test/test_difflib.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,26 @@ def test_make_file_usascii_charset_with_nonascii_input(self):
282282
self.assertIn('content="text/html; charset=us-ascii"', output)
283283
self.assertIn('&#305;mpl&#305;c&#305;t', output)
284284

285+
class TestDiffer(unittest.TestCase):
286+
def test_close_matches_aligned(self):
287+
# Of the 4 closely matching pairs, we want 1 to match with 3,
288+
# and 2 with 4, to align with a "top to bottom" mental model.
289+
a = ["cat\n", "dog\n", "close match 1\n", "close match 2\n"]
290+
b = ["close match 3\n", "close match 4\n", "kitten\n", "puppy\n"]
291+
m = difflib.Differ().compare(a, b)
292+
self.assertEqual(list(m),
293+
['- cat\n',
294+
'- dog\n',
295+
'- close match 1\n',
296+
'? ^\n',
297+
'+ close match 3\n',
298+
'? ^\n',
299+
'- close match 2\n',
300+
'? ^\n',
301+
'+ close match 4\n',
302+
'? ^\n',
303+
'+ kitten\n',
304+
'+ puppy\n'])
285305

286306
def test_one_insert(self):
287307
m = difflib.Differ().compare('b' * 2, 'a' + 'b' * 2)
@@ -294,7 +314,7 @@ def test_one_delete(self):
294314

295315
class TestOutputFormat(unittest.TestCase):
296316
def test_tab_delimiter(self):
297-
args = ['one', 'two', 'Original', 'Current',
317+
args = [['one'], ['two'], 'Original', 'Current',
298318
'2005-01-26 23:30:50', '2010-04-02 10:20:52']
299319
ud = difflib.unified_diff(*args, lineterm='')
300320
self.assertEqual(list(ud)[0:2], [
@@ -306,7 +326,7 @@ def test_tab_delimiter(self):
306326
"--- Current\t2010-04-02 10:20:52"])
307327

308328
def test_no_trailing_tab_on_empty_filedate(self):
309-
args = ['one', 'two', 'Original', 'Current']
329+
args = [['one'], ['two'], 'Original', 'Current']
310330
ud = difflib.unified_diff(*args, lineterm='')
311331
self.assertEqual(list(ud)[0:2], ["--- Original", "+++ Current"])
312332

@@ -446,6 +466,28 @@ def assertDiff(expect, actual):
446466
lineterm=b'')
447467
assertDiff(expect, actual)
448468

469+
470+
class TestInputTypes(unittest.TestCase):
471+
def _assert_type_error(self, msg, generator, *args):
472+
with self.assertRaises(TypeError) as ctx:
473+
list(generator(*args))
474+
self.assertEqual(msg, str(ctx.exception))
475+
476+
def test_input_type_checks(self):
477+
unified = difflib.unified_diff
478+
context = difflib.context_diff
479+
480+
expect = "input must be a sequence of strings, not str"
481+
self._assert_type_error(expect, unified, 'a', ['b'])
482+
self._assert_type_error(expect, context, 'a', ['b'])
483+
484+
self._assert_type_error(expect, unified, ['a'], 'b')
485+
self._assert_type_error(expect, context, ['a'], 'b')
486+
487+
expect = "lines to compare must be str, not NoneType (None)"
488+
self._assert_type_error(expect, unified, ['a'], [None])
489+
self._assert_type_error(expect, context, ['a'], [None])
490+
449491
def test_mixed_types_content(self):
450492
# type of input content must be consistent: all str or all bytes
451493
a = [b'hello']
@@ -494,10 +536,6 @@ def test_mixed_types_dates(self):
494536
b = ['bar\n']
495537
list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb))
496538

497-
def _assert_type_error(self, msg, generator, *args):
498-
with self.assertRaises(TypeError) as ctx:
499-
list(generator(*args))
500-
self.assertEqual(msg, str(ctx.exception))
501539

502540
class TestJunkAPIs(unittest.TestCase):
503541
def test_is_line_junk_true(self):

Lib/test/test_difflib_expect.html

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,22 @@
99
content="text/html; charset=utf-8" />
1010
<title></title>
1111
<style type="text/css">
12+
:root {color-scheme: light dark}
1213
table.diff {font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; border:medium}
1314
.diff_header {background-color:#e0e0e0}
1415
td.diff_header {text-align:right}
1516
.diff_next {background-color:#c0c0c0}
16-
.diff_add {background-color:#aaffaa}
17+
.diff_add {background-color:palegreen}
1718
.diff_chg {background-color:#ffff77}
1819
.diff_sub {background-color:#ffaaaa}
20+
21+
@media (prefers-color-scheme: dark) {
22+
.diff_header {background-color:#666}
23+
.diff_next {background-color:#393939}
24+
.diff_add {background-color:darkgreen}
25+
.diff_chg {background-color:#847415}
26+
.diff_sub {background-color:darkred}
27+
}
1928
</style>
2029
</head>
2130

0 commit comments

Comments
 (0)