@@ -78,8 +78,8 @@ class SequenceMatcher:
7878 sequences. As a rule of thumb, a .ratio() value over 0.6 means the
7979 sequences are close matches:
8080
81- >>> print(round(s.ratio(), 3 ))
82- 0.866
81+ >>> print(round(s.ratio(), 2 ))
82+ 0.87
8383 >>>
8484
8585 If you're only interested in where the sequences match,
@@ -908,87 +908,85 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908908 + abcdefGhijkl
909909 ? ^ ^ ^
910910 """
911-
912- # don't synch up unless the lines have a similarity score of at
913- # least cutoff; best_ratio tracks the best score seen so far
914- best_ratio , cutoff = 0.74 , 0.75
911+ # Don't synch up unless the lines have a similarity score above
912+ # cutoff. Previously only the smallest pair was handled here,
913+ # and if there are many pairs with the best ratio, recursion
914+ # could grow very deep, and runtime cubic. See:
915+ # https://github.com/python/cpython/issues/119105
916+ #
917+ # Later, more pathological cases prompted removing recursion
918+ # entirely.
919+ cutoff = 0.74999
915920 cruncher = SequenceMatcher (self .charjunk )
916- eqi , eqj = None , None # 1st indices of equal lines (if any)
921+ crqr = cruncher .real_quick_ratio
922+ cqr = cruncher .quick_ratio
923+ cr = cruncher .ratio
917924
918- # search for the pair that matches best without being identical
919- # (identical lines must be junk lines, & we don't want to synch up
920- # on junk -- unless we have to)
925+ WINDOW = 10
926+ best_i = best_j = None
927+ dump_i , dump_j = alo , blo # smallest indices not yet resolved
921928 for j in range (blo , bhi ):
922- bj = b [j ]
923- cruncher .set_seq2 (bj )
924- for i in range (alo , ahi ):
925- ai = a [i ]
926- if ai == bj :
927- if eqi is None :
928- eqi , eqj = i , j
929- continue
930- cruncher .set_seq1 (ai )
931- # computing similarity is expensive, so use the quick
932- # upper bounds first -- have seen this speed up messy
933- # compares by a factor of 3.
934- # note that ratio() is only expensive to compute the first
935- # time it's called on a sequence pair; the expensive part
936- # of the computation is cached by cruncher
937- if cruncher .real_quick_ratio () > best_ratio and \
938- cruncher .quick_ratio () > best_ratio and \
939- cruncher .ratio () > best_ratio :
940- best_ratio , best_i , best_j = cruncher .ratio (), i , j
941- if best_ratio < cutoff :
942- # no non-identical "pretty close" pair
943- if eqi is None :
944- # no identical pair either -- treat it as a straight replace
945- yield from self ._plain_replace (a , alo , ahi , b , blo , bhi )
946- return
947- # no close pair, but an identical pair -- synch up on that
948- best_i , best_j , best_ratio = eqi , eqj , 1.0
949- else :
950- # there's a close pair, so forget the identical pair (if any)
951- eqi = None
952-
953- # a[best_i] very similar to b[best_j]; eqi is None iff they're not
954- # identical
955-
956- # pump out diffs from before the synch point
957- yield from self ._fancy_helper (a , alo , best_i , b , blo , best_j )
958-
959- # do intraline marking on the synch pair
960- aelt , belt = a [best_i ], b [best_j ]
961- if eqi is None :
962- # pump out a '-', '?', '+', '?' quad for the synched lines
963- atags = btags = ""
964- cruncher .set_seqs (aelt , belt )
965- for tag , ai1 , ai2 , bj1 , bj2 in cruncher .get_opcodes ():
966- la , lb = ai2 - ai1 , bj2 - bj1
967- if tag == 'replace' :
968- atags += '^' * la
969- btags += '^' * lb
970- elif tag == 'delete' :
971- atags += '-' * la
972- elif tag == 'insert' :
973- btags += '+' * lb
974- elif tag == 'equal' :
975- atags += ' ' * la
976- btags += ' ' * lb
977- else :
978- raise ValueError ('unknown tag %r' % (tag ,))
979- yield from self ._qformat (aelt , belt , atags , btags )
980- else :
981- # the synch pair is identical
982- yield ' ' + aelt
929+ cruncher .set_seq2 (b [j ])
930+ # Search the corresponding i's within WINDOW for rhe highest
931+ # ratio greater than `cutoff`.
932+ aequiv = alo + (j - blo )
933+ arange = range (max (aequiv - WINDOW , dump_i ),
934+ min (aequiv + WINDOW + 1 , ahi ))
935+ if not arange : # likely exit if `a` is shorter than `b`
936+ break
937+ best_ratio = cutoff
938+ for i in arange :
939+ cruncher .set_seq1 (a [i ])
940+ # Ordering by cheapest to most expensive ratio is very
941+ # valuable, most often getting out early.
942+ if (crqr () > best_ratio
943+ and cqr () > best_ratio
944+ and cr () > best_ratio ):
945+ best_i , best_j , best_ratio = i , j , cr ()
946+
947+ if best_i is None :
948+ # found nothing to synch on yet - move to next j
949+ continue
983950
984- # pump out diffs from after the synch point
985- yield from self ._fancy_helper (a , best_i + 1 , ahi , b , best_j + 1 , bhi )
951+ # pump out straight replace from before this synch pair
952+ yield from self ._fancy_helper (a , dump_i , best_i ,
953+ b , dump_j , best_j )
954+ # do intraline marking on the synch pair
955+ aelt , belt = a [best_i ], b [best_j ]
956+ if aelt != belt :
957+ # pump out a '-', '?', '+', '?' quad for the synched lines
958+ atags = btags = ""
959+ cruncher .set_seqs (aelt , belt )
960+ for tag , ai1 , ai2 , bj1 , bj2 in cruncher .get_opcodes ():
961+ la , lb = ai2 - ai1 , bj2 - bj1
962+ if tag == 'replace' :
963+ atags += '^' * la
964+ btags += '^' * lb
965+ elif tag == 'delete' :
966+ atags += '-' * la
967+ elif tag == 'insert' :
968+ btags += '+' * lb
969+ elif tag == 'equal' :
970+ atags += ' ' * la
971+ btags += ' ' * lb
972+ else :
973+ raise ValueError ('unknown tag %r' % (tag ,))
974+ yield from self ._qformat (aelt , belt , atags , btags )
975+ else :
976+ # the synch pair is identical
977+ yield ' ' + aelt
978+ dump_i , dump_j = best_i + 1 , best_j + 1
979+ best_i = best_j = None
980+
981+ # pump out straight replace from after the last synch pair
982+ yield from self ._fancy_helper (a , dump_i , ahi ,
983+ b , dump_j , bhi )
986984
987985 def _fancy_helper (self , a , alo , ahi , b , blo , bhi ):
988986 g = []
989987 if alo < ahi :
990988 if blo < bhi :
991- g = self ._fancy_replace (a , alo , ahi , b , blo , bhi )
989+ g = self ._plain_replace (a , alo , ahi , b , blo , bhi )
992990 else :
993991 g = self ._dump ('-' , a , alo , ahi )
994992 elif blo < bhi :
@@ -1040,11 +1038,9 @@ def _qformat(self, aline, bline, atags, btags):
10401038# remaining is that perhaps it was really the case that " volatile"
10411039# was inserted after "private". I can live with that <wink>.
10421040
1043- import re
1044-
1045- def IS_LINE_JUNK (line , pat = re .compile (r"\s*(?:#\s*)?$" ).match ):
1041+ def IS_LINE_JUNK (line , pat = None ):
10461042 r"""
1047- Return True for ignorable line: iff `line` is blank or contains a single '#'.
1043+ Return True for ignorable line: if `line` is blank or contains a single '#'.
10481044
10491045 Examples:
10501046
@@ -1056,6 +1052,11 @@ def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
10561052 False
10571053 """
10581054
1055+ if pat is None :
1056+ # Default: match '#' or the empty string
1057+ return line .strip () in '#'
1058+ # Previous versions used the undocumented parameter 'pat' as a
1059+ # match function. Retain this behaviour for compatibility.
10591060 return pat (line ) is not None
10601061
10611062def IS_CHARACTER_JUNK (ch , ws = " \t " ):
@@ -1266,6 +1267,12 @@ def _check_types(a, b, *args):
12661267 if b and not isinstance (b [0 ], str ):
12671268 raise TypeError ('lines to compare must be str, not %s (%r)' %
12681269 (type (b [0 ]).__name__ , b [0 ]))
1270+ if isinstance (a , str ):
1271+ raise TypeError ('input must be a sequence of strings, not %s' %
1272+ type (a ).__name__ )
1273+ if isinstance (b , str ):
1274+ raise TypeError ('input must be a sequence of strings, not %s' %
1275+ type (b ).__name__ )
12691276 for arg in args :
12701277 if not isinstance (arg , str ):
12711278 raise TypeError ('all arguments must be str, not: %r' % (arg ,))
@@ -1628,13 +1635,22 @@ def _line_pair_iterator():
16281635</html>"""
16291636
16301637_styles = """
1638+ :root {color-scheme: light dark}
16311639 table.diff {font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; border:medium}
16321640 .diff_header {background-color:#e0e0e0}
16331641 td.diff_header {text-align:right}
16341642 .diff_next {background-color:#c0c0c0}
1635- .diff_add {background-color:#aaffaa }
1643+ .diff_add {background-color:palegreen }
16361644 .diff_chg {background-color:#ffff77}
1637- .diff_sub {background-color:#ffaaaa}"""
1645+ .diff_sub {background-color:#ffaaaa}
1646+
1647+ @media (prefers-color-scheme: dark) {
1648+ .diff_header {background-color:#666}
1649+ .diff_next {background-color:#393939}
1650+ .diff_add {background-color:darkgreen}
1651+ .diff_chg {background-color:#847415}
1652+ .diff_sub {background-color:darkred}
1653+ }"""
16381654
16391655_table_template = """
16401656 <table class="diff" id="difflib_chg_%(prefix)s_top"
@@ -2014,7 +2030,6 @@ def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
20142030 replace ('\1 ' ,'</span>' ). \
20152031 replace ('\t ' ,' ' )
20162032
2017- del re
20182033
20192034def restore (delta , which ):
20202035 r"""
@@ -2047,10 +2062,3 @@ def restore(delta, which):
20472062 for line in delta :
20482063 if line [:2 ] in prefixes :
20492064 yield line [2 :]
2050-
2051- def _test ():
2052- import doctest , difflib
2053- return doctest .testmod (difflib )
2054-
2055- if __name__ == "__main__" :
2056- _test ()
0 commit comments