diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst
index e5afa174413541..b4b539b150bafc 100644
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -40,6 +40,15 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
complicated way on how many elements the sequences have in common; best case
time is linear.
+ .. impl-detail::
+
+ On CPython, the :class:`SequenceMatcher` class is implemented in C for
+ speed. The pure-Python reference implementation remains available as
+ :mod:`!_pydifflib` for alternative Python implementations. Output is
+ bit-identical between the two implementations, including tie-breaks;
+ typical workloads run 5--25x faster than the pure-Python version, with
+ character/byte sequences seeing the largest gains.
+
**Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that
automatically treats certain sequence items as junk. The heuristic counts how many
times each individual item appears in the sequence. If an item's duplicates (after
diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h
index 99a1ffb8ad5229..8fe950b7dbb346 100644
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@@ -1579,12 +1579,14 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(adobe));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(after_in_child));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(after_in_parent));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ahi));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alias));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(align));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all_interpreters));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all_threads));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(allow_code));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alo));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alphabet));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(any));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(append));
@@ -1599,13 +1601,16 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(athrow));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(attribute));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(autocommit));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(autojunk));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(backtick));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(base));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(before));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bhi));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(big));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(binary_form));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bit_offset));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bit_size));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(blo));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(block));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(blocking));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bound));
@@ -1850,6 +1855,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(is_struct));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isatty));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isinstance));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isjunk));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isoformat));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isolation_level));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(istext));
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h
index d5818402a508cb..31ea6eed898769 100644
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@@ -302,12 +302,14 @@ struct _Py_global_strings {
STRUCT_FOR_ID(adobe)
STRUCT_FOR_ID(after_in_child)
STRUCT_FOR_ID(after_in_parent)
+ STRUCT_FOR_ID(ahi)
STRUCT_FOR_ID(alias)
STRUCT_FOR_ID(align)
STRUCT_FOR_ID(all)
STRUCT_FOR_ID(all_interpreters)
STRUCT_FOR_ID(all_threads)
STRUCT_FOR_ID(allow_code)
+ STRUCT_FOR_ID(alo)
STRUCT_FOR_ID(alphabet)
STRUCT_FOR_ID(any)
STRUCT_FOR_ID(append)
@@ -322,13 +324,16 @@ struct _Py_global_strings {
STRUCT_FOR_ID(athrow)
STRUCT_FOR_ID(attribute)
STRUCT_FOR_ID(autocommit)
+ STRUCT_FOR_ID(autojunk)
STRUCT_FOR_ID(backtick)
STRUCT_FOR_ID(base)
STRUCT_FOR_ID(before)
+ STRUCT_FOR_ID(bhi)
STRUCT_FOR_ID(big)
STRUCT_FOR_ID(binary_form)
STRUCT_FOR_ID(bit_offset)
STRUCT_FOR_ID(bit_size)
+ STRUCT_FOR_ID(blo)
STRUCT_FOR_ID(block)
STRUCT_FOR_ID(blocking)
STRUCT_FOR_ID(bound)
@@ -573,6 +578,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(is_struct)
STRUCT_FOR_ID(isatty)
STRUCT_FOR_ID(isinstance)
+ STRUCT_FOR_ID(isjunk)
STRUCT_FOR_ID(isoformat)
STRUCT_FOR_ID(isolation_level)
STRUCT_FOR_ID(istext)
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
index 8227f3fa9eedcf..15c583b897b106 100644
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -1577,12 +1577,14 @@ extern "C" {
INIT_ID(adobe), \
INIT_ID(after_in_child), \
INIT_ID(after_in_parent), \
+ INIT_ID(ahi), \
INIT_ID(alias), \
INIT_ID(align), \
INIT_ID(all), \
INIT_ID(all_interpreters), \
INIT_ID(all_threads), \
INIT_ID(allow_code), \
+ INIT_ID(alo), \
INIT_ID(alphabet), \
INIT_ID(any), \
INIT_ID(append), \
@@ -1597,13 +1599,16 @@ extern "C" {
INIT_ID(athrow), \
INIT_ID(attribute), \
INIT_ID(autocommit), \
+ INIT_ID(autojunk), \
INIT_ID(backtick), \
INIT_ID(base), \
INIT_ID(before), \
+ INIT_ID(bhi), \
INIT_ID(big), \
INIT_ID(binary_form), \
INIT_ID(bit_offset), \
INIT_ID(bit_size), \
+ INIT_ID(blo), \
INIT_ID(block), \
INIT_ID(blocking), \
INIT_ID(bound), \
@@ -1848,6 +1853,7 @@ extern "C" {
INIT_ID(is_struct), \
INIT_ID(isatty), \
INIT_ID(isinstance), \
+ INIT_ID(isjunk), \
INIT_ID(isoformat), \
INIT_ID(isolation_level), \
INIT_ID(istext), \
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h
index cb731e9a688878..7abcd0349b0532 100644
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@@ -988,6 +988,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
+ string = &_Py_ID(ahi);
+ _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(alias);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@@ -1012,6 +1016,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
+ string = &_Py_ID(alo);
+ _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(alphabet);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@@ -1068,6 +1076,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
+ string = &_Py_ID(autojunk);
+ _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(backtick);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@@ -1080,6 +1092,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
+ string = &_Py_ID(bhi);
+ _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(big);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@@ -1096,6 +1112,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
+ string = &_Py_ID(blo);
+ _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(block);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@@ -2072,6 +2092,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
+ string = &_Py_ID(isjunk);
+ _PyUnicode_InternStatic(interp, &string);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(isoformat);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
diff --git a/Lib/_pydifflib.py b/Lib/_pydifflib.py
new file mode 100644
index 00000000000000..351490f065f1e8
--- /dev/null
+++ b/Lib/_pydifflib.py
@@ -0,0 +1,2087 @@
+"""Pure-Python implementation of the difflib module.
+
+This is the reference implementation; the public :mod:`difflib` module
+prefers the C-accelerated SequenceMatcher provided by ``_difflib`` when
+available, falling back to the classes defined here. Alternative Python
+implementations may use this file as a self-contained reference.
+"""
+
+__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
+ 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
+ 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
+
+from heapq import nlargest as _nlargest
+from collections import namedtuple as _namedtuple
+from types import GenericAlias
+lazy from _colorize import can_colorize, get_theme
+
+Match = _namedtuple('Match', 'a b size', module='difflib')
+
+def _calculate_ratio(matches, length):
+ if length:
+ return 2.0 * matches / length
+ return 1.0
+
+class SequenceMatcher:
+
+ """
+ SequenceMatcher is a flexible class for comparing pairs of sequences of
+ any type, so long as the sequence elements are hashable. The basic
+ algorithm predates, and is a little fancier than, an algorithm
+ published in the late 1980's by Ratcliff and Obershelp under the
+ hyperbolic name "gestalt pattern matching". The basic idea is to find
+ the longest contiguous matching subsequence that contains no "junk"
+ elements (R-O doesn't address junk). The same idea is then applied
+ recursively to the pieces of the sequences to the left and to the right
+ of the matching subsequence. This does not yield minimal edit
+ sequences, but does tend to yield matches that "look right" to people.
+
+ SequenceMatcher tries to compute a "human-friendly diff" between two
+ sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the
+ longest *contiguous* & junk-free matching subsequence. That's what
+ catches people's eyes. The Windows(tm) windiff has another interesting
+ notion, pairing up elements that appear uniquely in each sequence.
+ That, and the method here, appear to yield more intuitive difference
+ reports than does diff. This method appears to be the least vulnerable
+ to syncing up on blocks of "junk lines", though (like blank lines in
+ ordinary text files, or maybe "<P>" lines in HTML files). That may be
+ because this is the only method of the 3 that has a *concept* of
+ "junk" .
+
+ Example, comparing two strings, and considering blanks to be "junk":
+
+ >>> s = SequenceMatcher(lambda x: x == " ",
+ ... "private Thread currentThread;",
+ ... "private volatile Thread currentThread;")
+ >>>
+
+ .ratio() returns a float in [0, 1], measuring the "similarity" of the
+ sequences. As a rule of thumb, a .ratio() value over 0.6 means the
+ sequences are close matches:
+
+ >>> print(round(s.ratio(), 2))
+ 0.87
+ >>>
+
+ If you're only interested in where the sequences match,
+ .get_matching_blocks() is handy:
+
+ >>> for block in s.get_matching_blocks():
+ ... print("a[%d] and b[%d] match for %d elements" % block)
+ a[0] and b[0] match for 8 elements
+ a[8] and b[17] match for 21 elements
+ a[29] and b[38] match for 0 elements
+
+ Note that the last tuple returned by .get_matching_blocks() is always a
+ dummy, (len(a), len(b), 0), and this is the only case in which the last
+ tuple element (number of elements matched) is 0.
+
+ If you want to know how to change the first sequence into the second,
+ use .get_opcodes():
+
+ >>> for opcode in s.get_opcodes():
+ ... print("%6s a[%d:%d] b[%d:%d]" % opcode)
+ equal a[0:8] b[0:8]
+ insert a[8:8] b[8:17]
+ equal a[8:29] b[17:38]
+
+ See the Differ class for a fancy human-friendly file differencer, which
+ uses SequenceMatcher both to compare sequences of lines, and to compare
+ sequences of characters within similar (near-matching) lines.
+
+ See also function get_close_matches() in this module, which shows how
+ simple code building on SequenceMatcher can be used to do useful work.
+
+ Timing: Basic R-O is cubic time worst case and quadratic time expected
+ case. SequenceMatcher is quadratic time for the worst case and has
+ expected-case behavior dependent in a complicated way on how many
+ elements the sequences have in common; best case time is linear.
+ """
+
+ def __init__(self, isjunk=None, a='', b='', autojunk=True):
+ """Construct a SequenceMatcher.
+
+ Optional arg isjunk is None (the default), or a one-argument
+ function that takes a sequence element and returns true iff the
+ element is junk. None is equivalent to passing "lambda x: 0", i.e.
+ no elements are considered to be junk. For example, pass
+ lambda x: x in " \\t"
+ if you're comparing lines as sequences of characters, and don't
+ want to synch up on blanks or hard tabs.
+
+ Optional arg a is the first of two sequences to be compared. By
+ default, an empty string. The elements of a must be hashable. See
+ also .set_seqs() and .set_seq1().
+
+ Optional arg b is the second of two sequences to be compared. By
+ default, an empty string. The elements of b must be hashable. See
+ also .set_seqs() and .set_seq2().
+
+ Optional arg autojunk should be set to False to disable the
+ "automatic junk heuristic" that treats popular elements as junk
+ (see module documentation for more information).
+ """
+
+ # Members:
+ # a
+ # first sequence
+ # b
+ # second sequence; differences are computed as "what do
+ # we need to do to 'a' to change it into 'b'?"
+ # b2j
+ # for x in b, b2j[x] is a list of the indices (into b)
+ # at which x appears; junk and popular elements do not appear
+ # fullbcount
+ # for x in b, fullbcount[x] == the number of times x
+ # appears in b; only materialized if really needed (used
+ # only for computing quick_ratio())
+ # matching_blocks
+ # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
+ # ascending & non-overlapping in i and in j; terminated by
+ # a dummy (len(a), len(b), 0) sentinel
+ # opcodes
+ # a list of (tag, i1, i2, j1, j2) tuples, where tag is
+ # one of
+ # 'replace' a[i1:i2] should be replaced by b[j1:j2]
+ # 'delete' a[i1:i2] should be deleted
+ # 'insert' b[j1:j2] should be inserted
+ # 'equal' a[i1:i2] == b[j1:j2]
+ # isjunk
+ # a user-supplied function taking a sequence element and
+ # returning true iff the element is "junk" -- this has
+ # subtle but helpful effects on the algorithm, which I'll
+ # get around to writing up someday <0.9 wink>.
+ # DON'T USE! Only __chain_b uses this. Use "in self.bjunk".
+ # bjunk
+ # the items in b for which isjunk is True.
+ # bpopular
+ # nonjunk items in b treated as junk by the heuristic (if used).
+
+ self.isjunk = isjunk
+ self.a = self.b = None
+ self.autojunk = autojunk
+ self.set_seqs(a, b)
+
+ def set_seqs(self, a, b):
+ """Set the two sequences to be compared.
+
+ >>> s = SequenceMatcher()
+ >>> s.set_seqs("abcd", "bcde")
+ >>> s.ratio()
+ 0.75
+ """
+
+ self.set_seq1(a)
+ self.set_seq2(b)
+
+ def set_seq1(self, a):
+ """Set the first sequence to be compared.
+
+ The second sequence to be compared is not changed.
+
+ >>> s = SequenceMatcher(None, "abcd", "bcde")
+ >>> s.ratio()
+ 0.75
+ >>> s.set_seq1("bcde")
+ >>> s.ratio()
+ 1.0
+ >>>
+
+ SequenceMatcher computes and caches detailed information about the
+ second sequence, so if you want to compare one sequence S against
+ many sequences, use .set_seq2(S) once and call .set_seq1(x)
+ repeatedly for each of the other sequences.
+
+ See also set_seqs() and set_seq2().
+ """
+
+ if a is self.a:
+ return
+ self.a = a
+ self.matching_blocks = self.opcodes = None
+
+ def set_seq2(self, b):
+ """Set the second sequence to be compared.
+
+ The first sequence to be compared is not changed.
+
+ >>> s = SequenceMatcher(None, "abcd", "bcde")
+ >>> s.ratio()
+ 0.75
+ >>> s.set_seq2("abcd")
+ >>> s.ratio()
+ 1.0
+ >>>
+
+ SequenceMatcher computes and caches detailed information about the
+ second sequence, so if you want to compare one sequence S against
+ many sequences, use .set_seq2(S) once and call .set_seq1(x)
+ repeatedly for each of the other sequences.
+
+ See also set_seqs() and set_seq1().
+ """
+
+ if b is self.b:
+ return
+ self.b = b
+ self.matching_blocks = self.opcodes = None
+ self.fullbcount = None
+ self.__chain_b()
+
+ # For each element x in b, set b2j[x] to a list of the indices in
+ # b where x appears; the indices are in increasing order; note that
+ # the number of times x appears in b is len(b2j[x]) ...
+ # when self.isjunk is defined, junk elements don't show up in this
+ # map at all, which stops the central find_longest_match method
+ # from starting any matching block at a junk element ...
+ # b2j also does not contain entries for "popular" elements, meaning
+ # elements that occur more than 1 + 1% of len(b) times, but only when
+ # autojunk is enabled and b is reasonably large (>= 200 elements); this can
+ # be viewed as an adaptive notion of semi-junk, and yields an enormous
+ # speedup when, e.g., comparing program files with hundreds of
+ # instances of "return NULL;" ...
+ # note that this is only called when b changes; so for cross-product
+ # kinds of matches, it's best to call set_seq2 once, then set_seq1
+ # repeatedly
+
+ def __chain_b(self):
+ # Because isjunk is a user-defined (not C) function, and we test
+ # for junk a LOT, it's important to minimize the number of calls.
+ # Before the tricks described here, __chain_b was by far the most
+ # time-consuming routine in the whole module! If anyone sees
+ # Jim Roskind, thank him again for profile.py -- I never would
+ # have guessed that.
+ # The first trick is to build b2j ignoring the possibility
+ # of junk. I.e., we don't call isjunk at all yet. Throwing
+ # out the junk later is much cheaper than building b2j "right"
+ # from the start.
+ b = self.b
+ self.b2j = b2j = {}
+
+ for i, elt in enumerate(b):
+ indices = b2j.setdefault(elt, [])
+ indices.append(i)
+
+ # Purge junk elements
+ self.bjunk = junk = set()
+ isjunk = self.isjunk
+ if isjunk:
+ for elt in b2j.keys():
+ if isjunk(elt):
+ junk.add(elt)
+ for elt in junk: # separate loop avoids separate list of keys
+ del b2j[elt]
+
+ # Purge popular elements that are not junk
+ self.bpopular = popular = set()
+ n = len(b)
+ if self.autojunk and n >= 200:
+ ntest = n // 100 + 1
+ for elt, idxs in b2j.items():
+ if len(idxs) > ntest:
+ popular.add(elt)
+ for elt in popular: # ditto; as fast for 1% deletion
+ del b2j[elt]
+
+ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
+ """Find longest matching block in a[alo:ahi] and b[blo:bhi].
+
+ By default it will find the longest match in the entirety of a and b.
+
+ If isjunk is not defined:
+
+ Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
+ alo <= i <= i+k <= ahi
+ blo <= j <= j+k <= bhi
+ and for all (i',j',k') meeting those conditions,
+ k >= k'
+ i <= i'
+ and if i == i', j <= j'
+
+ In other words, of all maximal matching blocks, return one that
+ starts earliest in a, and of all those maximal matching blocks that
+ start earliest in a, return the one that starts earliest in b.
+
+ >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
+ >>> s.find_longest_match(0, 5, 0, 9)
+ Match(a=0, b=4, size=5)
+
+ If isjunk is defined, first the longest matching block is
+ determined as above, but with the additional restriction that no
+ junk element appears in the block. Then that block is extended as
+ far as possible by matching (only) junk elements on both sides. So
+ the resulting block never matches on junk except as identical junk
+ happens to be adjacent to an "interesting" match.
+
+ Here's the same example as before, but considering blanks to be
+ junk. That prevents " abcd" from matching the " abcd" at the tail
+ end of the second sequence directly. Instead only the "abcd" can
+ match, and matches the leftmost "abcd" in the second sequence:
+
+ >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
+ >>> s.find_longest_match(0, 5, 0, 9)
+ Match(a=1, b=0, size=4)
+
+ If no blocks match, return (alo, blo, 0).
+
+ >>> s = SequenceMatcher(None, "ab", "c")
+ >>> s.find_longest_match(0, 2, 0, 1)
+ Match(a=0, b=0, size=0)
+ """
+
+ # CAUTION: stripping common prefix or suffix would be incorrect.
+ # E.g.,
+ # ab
+ # acab
+ # Longest matching block is "ab", but if common prefix is
+ # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
+ # strip, so ends up claiming that ab is changed to acab by
+ # inserting "ca" in the middle. That's minimal but unintuitive:
+ # "it's obvious" that someone inserted "ac" at the front.
+ # Windiff ends up at the same place as diff, but by pairing up
+ # the unique 'b's and then matching the first two 'a's.
+
+ a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
+ if ahi is None:
+ ahi = len(a)
+ if bhi is None:
+ bhi = len(b)
+ besti, bestj, bestsize = alo, blo, 0
+ # find longest junk-free match
+ # during an iteration of the loop, j2len[j] = length of longest
+ # junk-free match ending with a[i-1] and b[j]
+ j2len = {}
+ nothing = []
+ for i in range(alo, ahi):
+ # look at all instances of a[i] in b; note that because
+ # b2j has no junk keys, the loop is skipped if a[i] is junk
+ j2lenget = j2len.get
+ newj2len = {}
+ for j in b2j.get(a[i], nothing):
+ # a[i] matches b[j]
+ if j < blo:
+ continue
+ if j >= bhi:
+ break
+ k = newj2len[j] = j2lenget(j-1, 0) + 1
+ if k > bestsize:
+ besti, bestj, bestsize = i-k+1, j-k+1, k
+ j2len = newj2len
+
+ # Extend the best by non-junk elements on each end. In particular,
+ # "popular" non-junk elements aren't in b2j, which greatly speeds
+ # the inner loop above, but also means "the best" match so far
+ # doesn't contain any junk *or* popular non-junk elements.
+ while besti > alo and bestj > blo and \
+ not isbjunk(b[bestj-1]) and \
+ a[besti-1] == b[bestj-1]:
+ besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+ while besti+bestsize < ahi and bestj+bestsize < bhi and \
+ not isbjunk(b[bestj+bestsize]) and \
+ a[besti+bestsize] == b[bestj+bestsize]:
+ bestsize += 1
+
+ # Now that we have a wholly interesting match (albeit possibly
+ # empty!), we may as well suck up the matching junk on each
+ # side of it too. Can't think of a good reason not to, and it
+ # saves post-processing the (possibly considerable) expense of
+ # figuring out what to do with it. In the case of an empty
+ # interesting match, this is clearly the right thing to do,
+ # because no other kind of match is possible in the regions.
+ while besti > alo and bestj > blo and \
+ isbjunk(b[bestj-1]) and \
+ a[besti-1] == b[bestj-1]:
+ besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+ while besti+bestsize < ahi and bestj+bestsize < bhi and \
+ isbjunk(b[bestj+bestsize]) and \
+ a[besti+bestsize] == b[bestj+bestsize]:
+ bestsize = bestsize + 1
+
+ return Match(besti, bestj, bestsize)
+
+ def get_matching_blocks(self):
+ """Return list of triples describing matching subsequences.
+
+ Each triple is of the form (i, j, n), and means that
+ a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
+ i and in j. New in Python 2.5, it's also guaranteed that if
+ (i, j, n) and (i', j', n') are adjacent triples in the list, and
+ the second is not the last triple in the list, then i+n != i' or
+ j+n != j'. IOW, adjacent triples never describe adjacent equal
+ blocks.
+
+ The last triple is a dummy, (len(a), len(b), 0), and is the only
+ triple with n==0.
+
+ >>> s = SequenceMatcher(None, "abxcd", "abcd")
+ >>> list(s.get_matching_blocks())
+ [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
+ """
+
+ if self.matching_blocks is not None:
+ return self.matching_blocks
+ la, lb = len(self.a), len(self.b)
+
+ # This is most naturally expressed as a recursive algorithm, but
+ # at least one user bumped into extreme use cases that exceeded
+ # the recursion limit on their box. So, now we maintain a list
+ # (`queue`) of blocks we still need to look at, and append partial
+ # results to `matching_blocks` in a loop; the matches are sorted
+ # at the end.
+ queue = [(0, la, 0, lb)]
+ matching_blocks = []
+ while queue:
+ alo, ahi, blo, bhi = queue.pop()
+ i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
+ # a[alo:i] vs b[blo:j] unknown
+ # a[i:i+k] same as b[j:j+k]
+ # a[i+k:ahi] vs b[j+k:bhi] unknown
+ if k: # if k is 0, there was no matching block
+ matching_blocks.append(x)
+ if alo < i and blo < j:
+ queue.append((alo, i, blo, j))
+ if i+k < ahi and j+k < bhi:
+ queue.append((i+k, ahi, j+k, bhi))
+ matching_blocks.sort()
+
+ # It's possible that we have adjacent equal blocks in the
+ # matching_blocks list now. Starting with 2.5, this code was added
+ # to collapse them.
+ i1 = j1 = k1 = 0
+ non_adjacent = []
+ for i2, j2, k2 in matching_blocks:
+ # Is this block adjacent to i1, j1, k1?
+ if i1 + k1 == i2 and j1 + k1 == j2:
+ # Yes, so collapse them -- this just increases the length of
+ # the first block by the length of the second, and the first
+ # block so lengthened remains the block to compare against.
+ k1 += k2
+ else:
+ # Not adjacent. Remember the first block (k1==0 means it's
+ # the dummy we started with), and make the second block the
+ # new block to compare against.
+ if k1:
+ non_adjacent.append((i1, j1, k1))
+ i1, j1, k1 = i2, j2, k2
+ if k1:
+ non_adjacent.append((i1, j1, k1))
+
+ non_adjacent.append( (la, lb, 0) )
+ self.matching_blocks = list(map(Match._make, non_adjacent))
+ return self.matching_blocks
+
+ def get_opcodes(self):
+ """Return list of 5-tuples describing how to turn a into b.
+
+ Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
+ has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
+ tuple preceding it, and likewise for j1 == the previous j2.
+
+ The tags are strings, with these meanings:
+
+ 'replace': a[i1:i2] should be replaced by b[j1:j2]
+ 'delete': a[i1:i2] should be deleted.
+ Note that j1==j2 in this case.
+ 'insert': b[j1:j2] should be inserted at a[i1:i1].
+ Note that i1==i2 in this case.
+ 'equal': a[i1:i2] == b[j1:j2]
+
+ >>> a = "qabxcd"
+ >>> b = "abycdf"
+ >>> s = SequenceMatcher(None, a, b)
+ >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
+ ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
+ ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
+ delete a[0:1] (q) b[0:0] ()
+ equal a[1:3] (ab) b[0:2] (ab)
+ replace a[3:4] (x) b[2:3] (y)
+ equal a[4:6] (cd) b[3:5] (cd)
+ insert a[6:6] () b[5:6] (f)
+ """
+
+ if self.opcodes is not None:
+ return self.opcodes
+ i = j = 0
+ self.opcodes = answer = []
+ for ai, bj, size in self.get_matching_blocks():
+ # invariant: we've pumped out correct diffs to change
+ # a[:i] into b[:j], and the next matching block is
+ # a[ai:ai+size] == b[bj:bj+size]. So we need to pump
+ # out a diff to change a[i:ai] into b[j:bj], pump out
+ # the matching block, and move (i,j) beyond the match
+ tag = ''
+ if i < ai and j < bj:
+ tag = 'replace'
+ elif i < ai:
+ tag = 'delete'
+ elif j < bj:
+ tag = 'insert'
+ if tag:
+ answer.append( (tag, i, ai, j, bj) )
+ i, j = ai+size, bj+size
+ # the list of matching blocks is terminated by a
+ # sentinel with size 0
+ if size:
+ answer.append( ('equal', ai, i, bj, j) )
+ return answer
+
+ def get_grouped_opcodes(self, n=3):
+ """ Isolate change clusters by eliminating ranges with no changes.
+
+ Return a generator of groups with up to n lines of context.
+ Each group is in the same format as returned by get_opcodes().
+
+ >>> from pprint import pprint
+ >>> a = list(map(str, range(1,40)))
+ >>> b = a[:]
+ >>> b[8:8] = ['i'] # Make an insertion
+ >>> b[20] += 'x' # Make a replacement
+ >>> b[23:28] = [] # Make a deletion
+ >>> b[30] += 'y' # Make another replacement
+ >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
+ [
+ [('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
+ [
+ ('equal', 16, 19, 17, 20),
+ ('replace', 19, 20, 20, 21),
+ ('equal', 20, 22, 21, 23),
+ ('delete', 22, 27, 23, 23),
+ ('equal', 27, 30, 23, 26),
+ ],
+ [('equal', 31, 34, 27, 30), ('replace', 34, 35, 30, 31), ('equal', 35, 38, 31, 34)],
+ ]
+ """
+
+ codes = self.get_opcodes()
+ if not codes:
+ codes = [("equal", 0, 1, 0, 1)]
+ # Fixup leading and trailing groups if they show no changes.
+ if codes[0][0] == 'equal':
+ tag, i1, i2, j1, j2 = codes[0]
+ codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
+ if codes[-1][0] == 'equal':
+ tag, i1, i2, j1, j2 = codes[-1]
+ codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
+
+ nn = n + n
+ group = []
+ for tag, i1, i2, j1, j2 in codes:
+ # End the current group and start a new one whenever
+ # there is a large range with no changes.
+ if tag == 'equal' and i2-i1 > nn:
+ group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
+ yield group
+ group = []
+ i1, j1 = max(i1, i2-n), max(j1, j2-n)
+ group.append((tag, i1, i2, j1 ,j2))
+ if group and not (len(group)==1 and group[0][0] == 'equal'):
+ yield group
+
+ def ratio(self):
+ """Return a measure of the sequences' similarity (float in [0,1]).
+
+ Where T is the total number of elements in both sequences, and
+ M is the number of matches, this is 2.0*M / T.
+ Note that this is 1 if the sequences are identical, and 0 if
+ they have nothing in common.
+
+ .ratio() is expensive to compute if you haven't already computed
+ .get_matching_blocks() or .get_opcodes(), in which case you may
+ want to try .quick_ratio() or .real_quick_ratio() first to get an
+ upper bound.
+
+ >>> s = SequenceMatcher(None, "abcd", "bcde")
+ >>> s.ratio()
+ 0.75
+ >>> s.quick_ratio()
+ 0.75
+ >>> s.real_quick_ratio()
+ 1.0
+ """
+
+ matches = sum(triple[-1] for triple in self.get_matching_blocks())
+ return _calculate_ratio(matches, len(self.a) + len(self.b))
+
+    def quick_ratio(self):
+        """Return an upper bound on ratio() relatively quickly.
+
+        Nothing is promised about the value beyond its being an upper
+        bound on .ratio() that is faster to compute.
+        """
+
+        # Treat a and b as multisets: the cardinality of their intersection
+        # counts matches with order ignored, so it cannot be less than the
+        # ordered match count.
+        if self.fullbcount is None:
+            # first use: count every element of b and keep the table on
+            # the instance
+            self.fullbcount = counts = {}
+            for item in self.b:
+                counts[item] = counts.get(item, 0) + 1
+        counts = self.fullbcount
+        # remaining[x] starts at the count of x in b and drops by one each
+        # time x is met in a
+        remaining = {}
+        hits = 0
+        for item in self.a:
+            left = remaining[item] if item in remaining else counts.get(item, 0)
+            remaining[item] = left - 1
+            if left > 0:
+                hits += 1
+        return _calculate_ratio(hits, len(self.a) + len(self.b))
+
+    def real_quick_ratio(self):
+        """Return an upper bound on ratio() very quickly.
+
+        Nothing is promised about the value beyond its being an upper
+        bound on .ratio() that is faster to compute than both .ratio()
+        and .quick_ratio().
+        """
+
+        len_a = len(self.a)
+        len_b = len(self.b)
+        # the shorter sequence caps the number of possible matches
+        most_matches = min(len_a, len_b)
+        return _calculate_ratio(most_matches, len_a + len_b)
+
+ __class_getitem__ = classmethod(GenericAlias)
+
+
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """Return a list of the best "good enough" matches, via SequenceMatcher.
+
+    word is the sequence to find close matches for (typically a string).
+
+    possibilities is a list of sequences to match word against (typically
+    a list of strings).
+
+    Optional arg n (default 3) caps the number of close matches returned;
+    it must be > 0.
+
+    Optional arg cutoff (default 0.6) is a float in [0, 1].  A possibility
+    scoring less than that in similarity to word is ignored.
+
+    The result is a list of the best (at most n) matches among the
+    possibilities, ordered by similarity score, most similar first.
+
+    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
+    ['apple', 'ape']
+    >>> import keyword as _keyword
+    >>> get_close_matches("wheel", _keyword.kwlist)
+    ['while']
+    >>> get_close_matches("Apple", _keyword.kwlist)
+    []
+    >>> get_close_matches("accept", _keyword.kwlist)
+    ['except']
+    """
+
+    if not n > 0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+
+    matcher = SequenceMatcher()
+    matcher.set_seq2(word)
+    scored = []
+    for candidate in possibilities:
+        matcher.set_seq1(candidate)
+        # the two cheap upper bounds are consulted first, cheapest leading
+        below_cutoff = (matcher.real_quick_ratio() < cutoff
+                        or matcher.quick_ratio() < cutoff)
+        if below_cutoff:
+            continue
+
+        score = matcher.ratio()
+        if score >= cutoff:
+            scored.append((score, candidate))
+
+    # pick the n highest (score, candidate) pairs, then drop the scores
+    best = _nlargest(n, scored)
+    return [candidate for _, candidate in best]
+
+
+def _keep_original_ws(s, tag_s):
+    """Return `tag_s` with its blanks replaced by the whitespace found in `s`.
+
+    The two strings are walked in parallel.  Where the tag character is a
+    space and the character of `s` at the same position is whitespace, the
+    character from `s` is emitted; everywhere else the tag character is.
+    The walk stops at the end of the shorter string.
+    """
+    pieces = []
+    for src_ch, tag_ch in zip(s, tag_s):
+        if tag_ch == " " and src_ch.isspace():
+            pieces.append(src_ch)
+        else:
+            pieces.append(tag_ch)
+    return ''.join(pieces)
+
+
+
+class Differ:
+ r"""
+ Differ is a class for comparing sequences of lines of text, and
+ producing human-readable differences or deltas. Differ uses
+ SequenceMatcher both to compare sequences of lines, and to compare
+ sequences of characters within similar (near-matching) lines.
+
+ Each line of a Differ delta begins with a two-letter code:
+
+ '- ' line unique to sequence 1
+ '+ ' line unique to sequence 2
+ ' ' line common to both sequences
+ '? ' line not present in either input sequence
+
+ Lines beginning with '? ' attempt to guide the eye to intraline
+ differences, and were not present in either input sequence. These lines
+ can be confusing if the sequences contain tab characters.
+
+ Note that Differ makes no claim to produce a *minimal* diff. To the
+ contrary, minimal diffs are often counter-intuitive, because they synch
+ up anywhere possible, sometimes accidental matches 100 pages apart.
+ Restricting synch points to contiguous matches preserves some notion of
+ locality, at the occasional cost of producing a longer diff.
+
+ Example: Comparing two texts.
+
+ First we set up the texts, sequences of individual single-line strings
+ ending with newlines (such sequences can also be obtained from the
+ `readlines()` method of file-like objects):
+
+ >>> text1 = ''' 1. Beautiful is better than ugly.
+ ... 2. Explicit is better than implicit.
+ ... 3. Simple is better than complex.
+ ... 4. Complex is better than complicated.
+ ... '''.splitlines(keepends=True)
+ >>> len(text1)
+ 4
+ >>> text1[0][-1]
+ '\n'
+ >>> text2 = ''' 1. Beautiful is better than ugly.
+ ... 3. Simple is better than complex.
+ ... 4. Complicated is better than complex.
+ ... 5. Flat is better than nested.
+ ... '''.splitlines(keepends=True)
+
+ Next we instantiate a Differ object:
+
+ >>> d = Differ()
+
+ Note that when instantiating a Differ object we may pass functions to
+ filter out line and character 'junk'. See Differ.__init__ for details.
+
+ Finally, we compare the two:
+
+ >>> result = list(d.compare(text1, text2))
+
+ 'result' is a list of strings, so let's pretty-print it:
+
+ >>> from pprint import pprint as _pprint
+ >>> _pprint(result)
+ [
+ ' 1. Beautiful is better than ugly.\n',
+ '- 2. Explicit is better than implicit.\n',
+ '- 3. Simple is better than complex.\n',
+ '+ 3. Simple is better than complex.\n',
+ '? ++\n',
+ '- 4. Complex is better than complicated.\n',
+ '? ^ ---- ^\n',
+ '+ 4. Complicated is better than complex.\n',
+ '? ++++ ^ ^\n',
+ '+ 5. Flat is better than nested.\n',
+ ]
+
+ As a single multi-line string it looks like this:
+
+ >>> print(''.join(result), end="")
+ 1. Beautiful is better than ugly.
+ - 2. Explicit is better than implicit.
+ - 3. Simple is better than complex.
+ + 3. Simple is better than complex.
+ ? ++
+ - 4. Complex is better than complicated.
+ ? ^ ---- ^
+ + 4. Complicated is better than complex.
+ ? ++++ ^ ^
+ + 5. Flat is better than nested.
+ """
+
+ def __init__(self, linejunk=None, charjunk=None):
+ """
+ Construct a text differencer, with optional filters.
+
+ The two optional keyword parameters are for filter functions:
+
+ - `linejunk`: A function that should accept a single string argument,
+ and return true iff the string is junk. The module-level function
+ `IS_LINE_JUNK` may be used to filter out lines without visible
+ characters, except for at most one splat ('#'). It is recommended
+ to leave linejunk None; the underlying SequenceMatcher class has
+ an adaptive notion of "noise" lines that's better than any static
+ definition the author has ever been able to craft.
+
+ - `charjunk`: A function that should accept a string of length 1. The
+ module-level function `IS_CHARACTER_JUNK` may be used to filter out
+ whitespace characters (a blank or tab; **note**: bad idea to include
+ newline in this!). Use of IS_CHARACTER_JUNK is recommended.
+ """
+
+ self.linejunk = linejunk
+ self.charjunk = charjunk
+
+ def compare(self, a, b):
+ r"""
+ Compare two sequences of lines; generate the resulting delta.
+
+ Each sequence must contain individual single-line strings ending with
+ newlines. Such sequences can be obtained from the `readlines()` method
+ of file-like objects. The delta generated also consists of newline-
+ terminated strings, ready to be printed as-is via the writelines()
+ method of a file-like object.
+
+ Example:
+
+ >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
+ ... 'ore\ntree\nemu\n'.splitlines(True))),
+ ... end="")
+ - one
+ ? ^
+ + ore
+ ? ^
+ - two
+ - three
+ ? -
+ + tree
+ + emu
+ """
+
+ # Line-level opcodes drive the output; only 'replace' ranges are handed
+ # to _fancy_replace, every other tag is dumped as-is with its prefix.
+ cruncher = SequenceMatcher(self.linejunk, a, b)
+ for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
+ if tag == 'replace':
+ g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
+ elif tag == 'delete':
+ g = self._dump('-', a, alo, ahi)
+ elif tag == 'insert':
+ g = self._dump('+', b, blo, bhi)
+ elif tag == 'equal':
+ g = self._dump(' ', a, alo, ahi)
+ else:
+ raise ValueError('unknown tag %r' % (tag,))
+
+ yield from g
+
+ def _dump(self, tag, x, lo, hi):
+ """Generate comparison results for a same-tagged range."""
+ for i in range(lo, hi):
+ yield '%s %s' % (tag, x[i])
+
+ def _plain_replace(self, a, alo, ahi, b, blo, bhi):
+ """Generate a[alo:ahi] as deletions and b[blo:bhi] as insertions, shorter block first."""
+ assert alo < ahi and blo < bhi
+ # dump the shorter block first -- reduces the burden on short-term
+ # memory if the blocks are of very different sizes
+ if bhi - blo < ahi - alo:
+ first = self._dump('+', b, blo, bhi)
+ second = self._dump('-', a, alo, ahi)
+ else:
+ first = self._dump('-', a, alo, ahi)
+ second = self._dump('+', b, blo, bhi)
+
+ for g in first, second:
+ yield from g
+
+ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
+ r"""
+ When replacing one block of lines with another, search the blocks
+ for *similar* lines; the best-matching pair (if any) is used as a
+ synch point, and intraline difference marking is done on the
+ similar pair. Lots of work, but often worth it.
+
+ Example:
+
+ >>> d = Differ()
+ >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
+ ... ['abcdefGhijkl\n'], 0, 1)
+ >>> print(''.join(results), end="")
+ - abcDefghiJkl
+ ? ^ ^ ^
+ + abcdefGhijkl
+ ? ^ ^ ^
+ """
+ # Don't synch up unless the lines have a similarity score above
+ # cutoff. Previously only the smallest pair was handled here,
+ # and if there are many pairs with the best ratio, recursion
+ # could grow very deep, and runtime cubic. See:
+ # https://github.com/python/cpython/issues/119105
+ #
+ # Later, more pathological cases prompted removing recursion
+ # entirely.
+ cutoff = 0.74999
+ cruncher = SequenceMatcher(self.charjunk)
+ crqr = cruncher.real_quick_ratio
+ cqr = cruncher.quick_ratio
+ cr = cruncher.ratio
+
+ WINDOW = 10
+ best_i = best_j = None
+ dump_i, dump_j = alo, blo # smallest indices not yet resolved
+ for j in range(blo, bhi):
+ cruncher.set_seq2(b[j])
+ # Search the corresponding i's within WINDOW for the highest
+ # ratio greater than `cutoff`.
+ aequiv = alo + (j - blo)
+ arange = range(max(aequiv - WINDOW, dump_i),
+ min(aequiv + WINDOW + 1, ahi))
+ if not arange: # likely exit if `a` is shorter than `b`
+ break
+ best_ratio = cutoff
+ for i in arange:
+ cruncher.set_seq1(a[i])
+ # Ordering by cheapest to most expensive ratio is very
+ # valuable, most often getting out early.
+ if crqr() <= best_ratio or cqr() <= best_ratio:
+ continue
+
+ ratio = cr()
+ if ratio > best_ratio:
+ best_i, best_j, best_ratio = i, j, ratio
+
+ if best_i is None:
+ # found nothing to synch on yet - move to next j
+ continue
+
+ # pump out straight replace from before this synch pair
+ yield from self._fancy_helper(a, dump_i, best_i,
+ b, dump_j, best_j)
+ # do intraline marking on the synch pair
+ aelt, belt = a[best_i], b[best_j]
+ if aelt != belt:
+ # pump out a '-', '?', '+', '?' quad for the synched lines
+ atags = btags = ""
+ cruncher.set_seqs(aelt, belt)
+ for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
+ la, lb = ai2 - ai1, bj2 - bj1
+ if tag == 'replace':
+ atags += '^' * la
+ btags += '^' * lb
+ elif tag == 'delete':
+ atags += '-' * la
+ elif tag == 'insert':
+ btags += '+' * lb
+ elif tag == 'equal':
+ atags += ' ' * la
+ btags += ' ' * lb
+ else:
+ raise ValueError('unknown tag %r' % (tag,))
+ yield from self._qformat(aelt, belt, atags, btags)
+ else:
+ # the synch pair is identical
+ yield ' ' + aelt
+ # everything up to and including the synch pair is now resolved
+ dump_i, dump_j = best_i + 1, best_j + 1
+ best_i = best_j = None
+
+ # pump out straight replace from after the last synch pair
+ yield from self._fancy_helper(a, dump_i, ahi,
+ b, dump_j, bhi)
+
+ def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
+ """Generate a[alo:ahi] / b[blo:bhi] as a plain replace, a pure delete, a pure insert, or nothing when both ranges are empty."""
+ g = []
+ if alo < ahi:
+ if blo < bhi:
+ g = self._plain_replace(a, alo, ahi, b, blo, bhi)
+ else:
+ g = self._dump('-', a, alo, ahi)
+ elif blo < bhi:
+ g = self._dump('+', b, blo, bhi)
+
+ yield from g
+
+ def _qformat(self, aline, bline, atags, btags):
+ r"""
+ Format "?" output and deal with tabs.
+
+ Example:
+
+ >>> d = Differ()
+ >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
+ ... ' ^ ^ ^ ', ' ^ ^ ^ ')
+ >>> for line in results: print(repr(line))
+ ...
+ '- \tabcDefghiJkl\n'
+ '? \t ^ ^ ^\n'
+ '+ \tabcdefGhijkl\n'
+ '? \t ^ ^ ^\n'
+ """
+ # copy the lines' own whitespace into the tag strings, then drop
+ # trailing whitespace so an all-blank tag line becomes empty
+ atags = _keep_original_ws(aline, atags).rstrip()
+ btags = _keep_original_ws(bline, btags).rstrip()
+
+ yield "- " + aline
+ if atags:
+ yield f"? {atags}\n"
+
+ yield "+ " + bline
+ if btags:
+ yield f"? {btags}\n"
+
+# With respect to junk, an earlier version of ndiff simply refused to
+# *start* a match with a junk element. The result was cases like this:
+# before: private Thread currentThread;
+# after: private volatile Thread currentThread;
+# If you consider whitespace to be junk, the longest contiguous match
+# not starting with junk is "e Thread currentThread". So ndiff reported
+# that "e volatil" was inserted between the 't' and the 'e' in "private".
+# While an accurate view, to people that's absurd. The current version
+# looks for matching blocks that are entirely junk-free, then extends the
+# longest one of those as far as possible but only with matching junk.
+# So now "currentThread" is matched, then extended to suck up the
+# preceding blank; then "private" is matched, and extended to suck up the
+# following blank; then "Thread" is matched; and finally ndiff reports
+# that "volatile " was inserted before "Thread". The only quibble
+# remaining is that perhaps it was really the case that " volatile"
+# was inserted after "private". I can live with that.
+
+def IS_LINE_JUNK(line, pat=None):
+    r"""
+    Return True for an ignorable line: `line` is blank or holds a single '#'.
+
+    Examples:
+
+    >>> IS_LINE_JUNK('\n')
+    True
+    >>> IS_LINE_JUNK(' # \n')
+    True
+    >>> IS_LINE_JUNK('hello\n')
+    False
+    """
+
+    if pat is not None:
+        # Earlier versions accepted the undocumented 'pat' parameter as a
+        # match function; that behaviour is kept for compatibility.
+        return pat(line) is not None
+    # Default: the stripped line must be '#' or the empty string, both of
+    # which are substrings of '#'
+    stripped = line.strip()
+    return stripped in '#'
+
+def IS_CHARACTER_JUNK(ch, ws=" \t"):
+    r"""
+    Return True for an ignorable character: iff `ch` is a space or tab.
+
+    Examples:
+
+    >>> IS_CHARACTER_JUNK(' ')
+    True
+    >>> IS_CHARACTER_JUNK('\t')
+    True
+    >>> IS_CHARACTER_JUNK('\n')
+    False
+    >>> IS_CHARACTER_JUNK('x')
+    False
+    """
+
+    # membership in `ws` is the whole test
+    is_junk = ch in ws
+    return is_junk
+
+
+########################################################################
+### Unified Diff
+########################################################################
+
+def _format_range_unified(start, stop):
+    'Convert range to the "ed" format'
+    # Per the diff spec at http://www.unix.org/single_unix_specification/
+    length = stop - start
+    if length == 1:
+        # a one-line range is written as the bare 1-based line number
+        return f'{start + 1}'
+    # line numbering is 1-based, except that an empty range is reported
+    # at the line just before it
+    first = start + 1 if length else start
+    return f'{first},{length}'
+
+def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
+                 tofiledate='', n=3, lineterm='\n', *, color=False):
+    r"""
+    Compare two sequences of lines; generate the delta as a unified diff.
+
+    A unified diff shows line changes compactly, together with a few lines
+    of context.  'n' sets the number of context lines and defaults to
+    three.
+
+    By default the diff control lines (those with ---, +++, or @@) are
+    created with a trailing newline, so that inputs taken from
+    file.readlines() produce diffs suitable for file.writelines(): inputs
+    and outputs alike have trailing newlines.
+
+    For inputs without trailing newlines, set the lineterm argument to ""
+    so that the output is uniformly newline free.
+
+    Set 'color' to True to enable output in color, similar to
+    'git diff --color'.  Even if enabled, it can be controlled using
+    environment variables such as 'NO_COLOR'.
+
+    The unidiff format normally has a header for filenames and
+    modification times.  Any or all of these may be given as strings via
+    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The
+    modification times are normally expressed in the ISO 8601 format.
+
+    Example:
+
+    >>> for line in unified_diff('one two three four'.split(),
+    ... 'zero one tree four'.split(), 'Original', 'Current',
+    ... '2005-01-26 23:30:50', '2010-04-02 10:20:52',
+    ... lineterm=''):
+    ...     print(line) # doctest: +NORMALIZE_WHITESPACE
+    --- Original 2005-01-26 23:30:50
+    +++ Current 2010-04-02 10:20:52
+    @@ -1,4 +1,4 @@
+    +zero
+     one
+    -two
+    -three
+    +tree
+     four
+    """
+
+    # color only when asked for AND the environment permits it
+    use_color = color and can_colorize()
+    theme = get_theme(force_color=True) if use_color else get_theme(force_no_color=True)
+    colors = theme.difflib
+
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
+    header_sent = False
+    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
+        if not header_sent:
+            # the file header goes out once, ahead of the first hunk; no
+            # hunks means no header
+            header_sent = True
+            fromdate = f'\t{fromfiledate}' if fromfiledate else ''
+            todate = f'\t{tofiledate}' if tofiledate else ''
+            yield f'{colors.header}--- {fromfile}{fromdate}{lineterm}{colors.reset}'
+            yield f'{colors.header}+++ {tofile}{todate}{lineterm}{colors.reset}'
+
+        head, tail = group[0], group[-1]
+        old_range = _format_range_unified(head[1], tail[2])
+        new_range = _format_range_unified(head[3], tail[4])
+        yield f'{colors.hunk}@@ -{old_range} +{new_range} @@{lineterm}{colors.reset}'
+
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'equal':
+                for line in a[i1:i2]:
+                    yield f'{colors.context} {line}{colors.reset}'
+            else:
+                # a 'replace' emits its removed lines, then its added lines
+                if tag in ('replace', 'delete'):
+                    for line in a[i1:i2]:
+                        yield f'{colors.removed}-{line}{colors.reset}'
+                if tag in ('replace', 'insert'):
+                    for line in b[j1:j2]:
+                        yield f'{colors.added}+{line}{colors.reset}'
+
+
+########################################################################
+### Context Diff
+########################################################################
+
+def _format_range_context(start, stop):
+    'Convert range to the "ed" format'
+    # Per the diff spec at http://www.unix.org/single_unix_specification/
+    length = stop - start
+    # line numbering is 1-based, except that an empty range is reported
+    # at the line just before it
+    first = start + 1 if length else start
+    if length <= 1:
+        return f'{first}'
+    return f'{first},{first + length - 1}'
+
+# See http://www.unix.org/single_unix_specification/
+def context_diff(a, b, fromfile='', tofile='',
+                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
+    r"""
+    Compare two sequences of lines; generate the delta as a context diff.
+
+    A context diff shows line changes compactly, together with a few lines
+    of context.  'n' sets the number of context lines and defaults to
+    three.
+
+    By default the diff control lines (those with *** or ---) are created
+    with a trailing newline, so that inputs taken from file.readlines()
+    produce diffs suitable for file.writelines(): inputs and outputs alike
+    have trailing newlines.
+
+    For inputs without trailing newlines, set the lineterm argument to ""
+    so that the output is uniformly newline free.
+
+    The context diff format normally has a header for filenames and
+    modification times.  Any or all of these may be given as strings via
+    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The
+    modification times are normally expressed in the ISO 8601 format.
+    If not specified, the strings default to blanks.
+
+    Example:
+
+    >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
+    ... 'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
+    ... end="")
+    *** Original
+    --- Current
+    ***************
+    *** 1,4 ****
+     one
+    ! two
+    ! three
+     four
+    --- 1,4 ----
+    + zero
+     one
+    ! tree
+     four
+    """
+
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
+    markers = {'insert': '+ ', 'delete': '- ', 'replace': '! ', 'equal': ' '}
+    header_sent = False
+    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
+        if not header_sent:
+            # the file header goes out once, ahead of the first hunk; no
+            # hunks means no header
+            header_sent = True
+            fromdate = f'\t{fromfiledate}' if fromfiledate else ''
+            todate = f'\t{tofiledate}' if tofiledate else ''
+            yield f'*** {fromfile}{fromdate}{lineterm}'
+            yield f'--- {tofile}{todate}{lineterm}'
+
+        head, tail = group[0], group[-1]
+        yield '***************' + lineterm
+
+        # "from" half: its lines appear only when the hunk changes or
+        # removes something on that side
+        old_range = _format_range_context(head[1], tail[2])
+        yield f'*** {old_range} ****{lineterm}'
+
+        if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group):
+            for tag, i1, i2, _, _ in group:
+                if tag == 'insert':
+                    continue
+                for line in a[i1:i2]:
+                    yield markers[tag] + line
+
+        # "to" half, symmetric to the above
+        new_range = _format_range_context(head[3], tail[4])
+        yield f'--- {new_range} ----{lineterm}'
+
+        if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group):
+            for tag, _, _, j1, j2 in group:
+                if tag == 'delete':
+                    continue
+                for line in b[j1:j2]:
+                    yield markers[tag] + line
+
+def _check_types(a, b, *args):
+    """Raise TypeError unless the diff inputs are consistently str.
+
+    `a` and `b` are the line sequences; only the first element of each is
+    inspected, and neither may itself be a str.  Every extra positional
+    argument must be a str.
+    """
+    # Checking types is weird, but the alternative is garbled output when
+    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
+    # without this check, passing filenames as bytes results in output like
+    # --- b'oldfile.txt'
+    # +++ b'newfile.txt'
+    # because of how str.format() incorporates bytes objects.
+    for lines in (a, b):
+        if lines and not isinstance(lines[0], str):
+            first = lines[0]
+            raise TypeError('lines to compare must be str, not %s (%r)' %
+                            (type(first).__name__, first))
+    for lines in (a, b):
+        if isinstance(lines, str):
+            raise TypeError('input must be a sequence of strings, not %s' %
+                            type(lines).__name__)
+    for extra in args:
+        if not isinstance(extra, str):
+            raise TypeError('all arguments must be str, not: %r' % (extra,))
+
+def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
+               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
+    r"""
+    Compare `a` and `b`, two sequences of lines given as bytes rather than
+    str.  This wraps `dfunc`, typically unified_diff() or context_diff().
+    The inputs are losslessly converted to strings, so `dfunc` only ever
+    deals with strings, and its output is encoded back to bytes.  This is
+    needed to compare files whose encoding is unknown or inconsistent.
+    All other inputs (except `n`) must be bytes rather than str.
+    """
+    def to_text(raw):
+        # ascii + surrogateescape round-trips arbitrary bytes unchanged
+        try:
+            return raw.decode('ascii', 'surrogateescape')
+        except AttributeError as err:
+            msg = ('all arguments must be bytes, not %s (%r)' %
+                   (type(raw).__name__, raw))
+            raise TypeError(msg) from err
+
+    a = [to_text(line) for line in a]
+    b = [to_text(line) for line in b]
+    header = [to_text(field)
+              for field in (fromfile, tofile, fromfiledate, tofiledate)]
+    lineterm = to_text(lineterm)
+
+    for text in dfunc(a, b, *header, n, lineterm):
+        yield text.encode('ascii', 'surrogateescape')
+
+def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
+    r"""
+    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
+
+    The optional keyword parameters `linejunk` and `charjunk` take filter
+    functions, or None:
+
+    - linejunk: A function accepting a single string argument and
+      returning true iff the string is junk.  The default, None, is
+      recommended; the underlying SequenceMatcher class has an adaptive
+      notion of "noise" lines.
+
+    - charjunk: A function accepting a character (string of length 1)
+      and returning true iff the character is junk.  The default is the
+      module-level function IS_CHARACTER_JUNK, which filters out
+      whitespace characters (a blank or tab; note: it's a bad idea to
+      include newline in this!).
+
+    Tools/scripts/ndiff.py is a command-line front-end to this function.
+
+    Example:
+
+    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
+    ... 'ore\ntree\nemu\n'.splitlines(keepends=True))
+    >>> print(''.join(diff), end="")
+    - one
+    ? ^
+    + ore
+    ? ^
+    - two
+    - three
+    ? -
+    + tree
+    + emu
+    """
+    differ = Differ(linejunk, charjunk)
+    return differ.compare(a, b)
+
+def _mdiff(fromlines, tolines, context=None, linejunk=None,
+ charjunk=IS_CHARACTER_JUNK):
+ r"""Returns generator yielding marked up from/to side by side differences.
+
+ Arguments:
+ fromlines -- list of text lines to be compared to tolines
+ tolines -- list of text lines to be compared to fromlines
+ context -- number of context lines to display on each side of difference,
+ if None, all from/to text lines will be generated.
+ linejunk -- passed on to ndiff (see ndiff documentation)
+ charjunk -- passed on to ndiff (see ndiff documentation)
+
+ This function returns an iterator which returns a tuple:
+ (from line tuple, to line tuple, boolean flag)
+
+ from/to line tuple -- (line num, line text)
+ line num -- integer or None (to indicate a context separation)
+ line text -- original line text with following markers inserted:
+ '\0+' -- marks start of added text
+ '\0-' -- marks start of deleted text
+ '\0^' -- marks start of changed text
+ '\1' -- marks end of added/deleted/changed text
+
+ boolean flag -- None indicates context separation, True indicates
+ either "from" or "to" line contains a change, otherwise False.
+
+ This function/iterator was originally developed to generate side by side
+ file difference for making HTML pages (see HtmlDiff class for example
+ usage).
+
+ Note, this function utilizes the ndiff function to generate the side by
+ side difference markup. Optional ndiff arguments may be passed to this
+ function and they in turn will be passed to ndiff.
+ """
+ import re
+
+ # regular expression for finding intraline change indices
+ change_re = re.compile(r'(\++|\-+|\^+)')
+
+ # create the difference iterator to generate the differences
+ diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
+
+ def _make_line(lines, format_key, side, num_lines=[0,0]):
+ """Returns line of text with user's change markup and line formatting.
+
+ lines -- list of lines from the ndiff generator to produce a line of
+ text from. When producing the line of text to return, the
+ lines used are removed from this list.
+ format_key -- '+' return first line in list with "add" markup around
+ the entire line.
+ '-' return first line in list with "delete" markup around
+ the entire line.
+ '?' return first line in list with add/delete/change
+ intraline markup (indices obtained from second line)
+ None return first line in list with no markup
+ side -- index into the num_lines list (0=from,1=to)
+ num_lines -- from/to current line number. This is NOT intended to be a
+ passed parameter. It is present as a keyword argument to
+ maintain memory of the current line numbers between calls
+ of this function.
+
+ Note, this function is purposefully not defined at the module scope so
+ that data it needs from its parent function (within whose context it
+ is defined) does not need to be of module scope.
+ """
+ num_lines[side] += 1
+ # Handle case where no user markup is to be added, just return line of
+ # text with user's line format to allow for usage of the line number.
+ if format_key is None:
+ return (num_lines[side],lines.pop(0)[2:])
+ # Handle case of intraline changes
+ if format_key == '?':
+ text, markers = lines.pop(0), lines.pop(0)
+ # find intraline changes (store change type and indices in tuples)
+ sub_info = []
+ def record_sub_info(match_object,sub_info=sub_info):
+ sub_info.append([match_object.group(1)[0],match_object.span()])
+ return match_object.group(1)
+ change_re.sub(record_sub_info,markers)
+ # process each tuple inserting our special marks that won't be
+ # noticed by an xml/html escaper.
+ for key,(begin,end) in reversed(sub_info):
+ text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
+ text = text[2:]
+ # Handle case of add/delete entire line
+ else:
+ text = lines.pop(0)[2:]
+ # if line of text is just a newline, insert a space so there is
+ # something for the user to highlight and see.
+ if not text:
+ text = ' '
+ # insert marks that won't be noticed by an xml/html escaper.
+ text = '\0' + format_key + text + '\1'
+ # Return line of text, first allow user's line formatter to do its
+ # thing (such as adding the line number) then replace the special
+ # marks with the user's change markup.
+ return (num_lines[side],text)
+
+ def _line_iterator():
+ """Yields from/to lines of text with a change indication.
+
+ This function is an iterator. It itself pulls lines from a
+ differencing iterator, processes them and yields them. When it can
+ it yields both a "from" and a "to" line, otherwise it will yield one
+ or the other. In addition to yielding the lines of from/to text, a
+ boolean flag is yielded to indicate if the text line(s) have
+ differences in them.
+
+ Note, this function is purposefully not defined at the module scope so
+ that data it needs from its parent function (within whose context it
+ is defined) does not need to be of module scope.
+ """
+ lines = []
+ num_blanks_pending, num_blanks_to_yield = 0, 0
+ while True:
+ # Load up next 4 lines so we can look ahead, create strings which
+ # are a concatenation of the first character of each of the 4 lines
+ # so we can do some very readable comparisons.
+ while len(lines) < 4:
+ lines.append(next(diff_lines_iterator, 'X'))
+ s = ''.join([line[0] for line in lines])
+ if s.startswith('X'):
+ # When no more lines, pump out any remaining blank lines so the
+ # corresponding add/delete lines get a matching blank line so
+ # all line pairs get yielded at the next level.
+ num_blanks_to_yield = num_blanks_pending
+ elif s.startswith('-?+?'):
+ # simple intraline change
+ yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
+ continue
+ elif s.startswith('--++'):
+ # in delete block, add block coming: we do NOT want to get
+ # caught up on blank lines yet, just process the delete line
+ num_blanks_pending -= 1
+ yield _make_line(lines,'-',0), None, True
+ continue
+ elif s.startswith(('--?+', '--+', '- ')):
+ # in delete block and see an intraline change or unchanged line
+ # coming: yield the delete line and then blanks
+ from_line,to_line = _make_line(lines,'-',0), None
+ num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
+ elif s.startswith('-+?'):
+ # intraline change
+ yield _make_line(lines,None,0), _make_line(lines,'?',1), True
+ continue
+ elif s.startswith('-?+'):
+ # intraline change
+ yield _make_line(lines,'?',0), _make_line(lines,None,1), True
+ continue
+ elif s.startswith('-'):
+ # delete FROM line
+ num_blanks_pending -= 1
+ yield _make_line(lines,'-',0), None, True
+ continue
+ elif s.startswith('+--'):
+ # in add block, delete block coming: we do NOT want to get
+ # caught up on blank lines yet, just process the add line
+ num_blanks_pending += 1
+ yield None, _make_line(lines,'+',1), True
+ continue
+ elif s.startswith(('+ ', '+-')):
+ # will be leaving an add block: yield blanks then add line
+ from_line, to_line = None, _make_line(lines,'+',1)
+ num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
+ elif s.startswith('+'):
+ # inside an add block, yield the add line
+ num_blanks_pending += 1
+ yield None, _make_line(lines,'+',1), True
+ continue
+ elif s.startswith(' '):
+ # unchanged text, yield it to both sides
+ yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
+ continue
+ # Catch up on the blank lines so when we yield the next from/to
+ # pair, they are lined up.
+ while(num_blanks_to_yield < 0):
+ num_blanks_to_yield += 1
+ yield None,('','\n'),True
+ while(num_blanks_to_yield > 0):
+ num_blanks_to_yield -= 1
+ yield ('','\n'),None,True
+ if s.startswith('X'):
+ return
+ else:
+ yield from_line,to_line,True
+
+ def _line_pair_iterator():
+ """Yields from/to lines of text with a change indication.
+
+ This function is an iterator. It itself pulls lines from the line
+ iterator. Its difference from that iterator is that this function
+ always yields a pair of from/to text lines (with the change
+ indication). If necessary it will collect single from/to lines
+ until it has a matching from/to pair to yield.
+
+ Note, this function is purposefully not defined at the module scope so
+ that data it needs from its parent function (within whose context it
+ is defined) does not need to be of module scope.
+ """
+ line_iterator = _line_iterator()
+ fromlines,tolines=[],[]
+ while True:
+ # Collecting lines of text until we have a from/to pair
+ while (len(fromlines)==0 or len(tolines)==0):
+ try:
+ from_line, to_line, found_diff = next(line_iterator)
+ except StopIteration:
+ return
+ if from_line is not None:
+ fromlines.append((from_line,found_diff))
+ if to_line is not None:
+ tolines.append((to_line,found_diff))
+ # Once we have a pair, remove them from the collection and yield it
+ from_line, fromDiff = fromlines.pop(0)
+ to_line, to_diff = tolines.pop(0)
+ yield (from_line,to_line,fromDiff or to_diff)
+
+ # Handle case where user does not want context differencing, just yield
+ # them up without doing anything else with them.
+ line_pair_iterator = _line_pair_iterator()
+ if context is None:
+ yield from line_pair_iterator
+ # Handle case where user wants context differencing. We must do some
+ # storage of lines until we know for sure that they are to be yielded.
+ else:
+ # one extra slot: the circular queue below also stores the changed
+ # line itself, in addition to the context lines leading up to it
+ context += 1
+ lines_to_write = 0
+ while True:
+ # Store lines up until we find a difference, note use of a
+ # circular queue because we only need to keep around what
+ # we need for context.
+ index, contextLines = 0, [None]*(context)
+ found_diff = False
+ while(found_diff is False):
+ try:
+ from_line, to_line, found_diff = next(line_pair_iterator)
+ except StopIteration:
+ return
+ i = index % context
+ contextLines[i] = (from_line, to_line, found_diff)
+ index += 1
+ # Yield lines that we have collected so far, but first yield
+ # the user's separator.
+ if index > context:
+ yield None, None, None
+ lines_to_write = context
+ else:
+ lines_to_write = index
+ index = 0
+ while(lines_to_write):
+ i = index % context
+ index += 1
+ yield contextLines[i]
+ lines_to_write -= 1
+ # Now yield the context lines after the change
+ lines_to_write = context-1
+ try:
+ while(lines_to_write):
+ from_line, to_line, found_diff = next(line_pair_iterator)
+ # If another change within the context, extend the context
+ if found_diff:
+ lines_to_write = context-1
+ else:
+ lines_to_write -= 1
+ yield from_line, to_line, found_diff
+ except StopIteration:
+ # Catch exception from next() and return normally
+ return
+
+
+_file_template = """
+
+
+
+
+
+ Diff comparison
+
+
+
+
+ %(table)s%(legend)s
+
+
+"""
+
+_styles = """
+ :root {color-scheme: light dark}
+ table.diff {
+ font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace;
+ border: medium;
+ }
+ .diff_header {
+ background-color: #e0e0e0;
+ font-weight: bold;
+ }
+ td.diff_header {
+ text-align: right;
+ padding: 0 8px;
+ }
+ .diff_next {
+ background-color: #c0c0c0;
+ padding: 4px 0;
+ }
+ .diff_add {background-color:palegreen}
+ .diff_chg {background-color:#ffff77}
+ .diff_sub {background-color:#ffaaaa}
+ table.diff[summary="Legends"] {
+ margin-top: 20px;
+ border: 1px solid #ccc;
+ }
+ table.diff[summary="Legends"] th {
+ background-color: #e0e0e0;
+ padding: 4px 8px;
+ }
+ table.diff[summary="Legends"] td {
+ padding: 4px 8px;
+ }
+
+ @media (prefers-color-scheme: dark) {
+ .diff_header {background-color:#666}
+ .diff_next {background-color:#393939}
+ .diff_add {background-color:darkgreen}
+ .diff_chg {background-color:#847415}
+ .diff_sub {background-color:darkred}
+ table.diff[summary="Legends"] {border-color:#555}
+ table.diff[summary="Legends"] th{background-color:#666}
+ }"""
+
+_table_template = """
+
+
+
+ %(header_row)s
+
+%(data_rows)s
+
"""
+
+_legend = """
+
+ | Legends |
+
+ | Colors |
+ | Added |
+ | Changed |
+ | Deleted |
+ |
+
+ | Links |
+ | (f)irst change |
+ | (n)ext change |
+ | (t)op |
+ |
+
"""
+
+class HtmlDiff(object):
+ """For producing HTML side by side comparison with change highlights.
+
+ This class can be used to create an HTML table (or a complete HTML file
+ containing the table) showing a side by side, line by line comparison
+ of text with inter-line and intra-line change highlights. The table can
+ be generated in either full or contextual difference mode.
+
+ The following methods are provided for HTML generation:
+
+ make_table -- generates HTML for a single side by side table
+ make_file -- generates complete HTML file with a single side by side table
+
+ See Doc/includes/diff.py for an example usage of this class.
+ """
+
+ _file_template = _file_template
+ _styles = _styles
+ _table_template = _table_template
+ _legend = _legend
+ _default_prefix = 0
+
+ def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
+ charjunk=IS_CHARACTER_JUNK):
+ """HtmlDiff instance initializer
+
+ Arguments:
+ tabsize -- tab stop spacing, defaults to 8.
+ wrapcolumn -- column number where lines are broken and wrapped,
+ defaults to None where lines are not wrapped.
+ linejunk,charjunk -- keyword arguments passed into ndiff() (used by
+ HtmlDiff() to generate the side by side HTML differences). See
+ ndiff() documentation for argument default values and descriptions.
+ """
+ self._tabsize = tabsize
+ self._wrapcolumn = wrapcolumn
+ self._linejunk = linejunk
+ self._charjunk = charjunk
+
+ def make_file(self, fromlines, tolines, fromdesc='', todesc='',
+ context=False, numlines=5, *, charset='utf-8'):
+ """Returns HTML file of side by side comparison with change highlights
+
+ Arguments:
+ fromlines -- list of "from" lines
+ tolines -- list of "to" lines
+ fromdesc -- "from" file column header string
+ todesc -- "to" file column header string
+ context -- set to True for contextual differences (defaults to False
+ which shows full differences).
+ numlines -- number of context lines. When context is set True,
+ controls number of lines displayed before and after the change.
+ When context is False, controls the number of lines to place
+ the "next" link anchors before the next change (so click of
+ "next" link jumps to just before the change).
+ charset -- charset of the HTML document
+ """
+
+ return (self._file_template % dict(
+ styles=self._styles,
+ legend=self._legend,
+ table=self.make_table(fromlines, tolines, fromdesc, todesc,
+ context=context, numlines=numlines),
+ charset=charset
+ )).encode(charset, 'xmlcharrefreplace').decode(charset)
+
+ def _tab_newline_replace(self,fromlines,tolines):
+ """Returns from/to line lists with tabs expanded and newlines removed.
+
+ Instead of tab characters being replaced by the number of spaces
+ needed to fill in to the next tab stop, this function will fill
+ the space with tab characters. This is done so that the difference
+ algorithms can identify changes in a file when tabs are replaced by
+ spaces and vice versa. At the end of the HTML generation, the tab
+ characters will be replaced with a nonbreakable space.
+ """
+ def expand_tabs(line):
+ # hide real spaces
+ line = line.replace(' ','\0')
+ # expand tabs into spaces
+ line = line.expandtabs(self._tabsize)
+ # replace spaces from expanded tabs back into tab characters
+ # (we'll replace them with markup after we do differencing)
+ line = line.replace(' ','\t')
+ return line.replace('\0',' ').rstrip('\n')
+ fromlines = [expand_tabs(line) for line in fromlines]
+ tolines = [expand_tabs(line) for line in tolines]
+ return fromlines,tolines
+
+ def _split_line(self,data_list,line_num,text):
+ """Builds list of text lines by splitting text lines at wrap point
+
+ This function will determine if the input text line needs to be
+ wrapped (split) into separate lines. If so, the first wrap point
+ will be determined and the first line appended to the output
+ text line list. This function is used recursively to handle
+ the second part of the split line to further split it.
+ """
+ # if blank line or context separator, just add it to the output list
+ if not line_num:
+ data_list.append((line_num,text))
+ return
+
+ # if line text doesn't need wrapping, just add it to the output list
+ size = len(text)
+ max = self._wrapcolumn
+ if (size <= max) or ((size -(text.count('\0')*3)) <= max):
+ data_list.append((line_num,text))
+ return
+
+ # scan text looking for the wrap point, keeping track if the wrap
+ # point is inside markers
+ i = 0
+ n = 0
+ mark = ''
+ while n < max and i < size:
+ if text[i] == '\0':
+ i += 1
+ mark = text[i]
+ i += 1
+ elif text[i] == '\1':
+ i += 1
+ mark = ''
+ else:
+ i += 1
+ n += 1
+
+ # wrap point is inside text, break it up into separate lines
+ line1 = text[:i]
+ line2 = text[i:]
+
+ # if wrap point is inside markers, place end marker at end of first
+ # line and start marker at beginning of second line because each
+ # line will have its own table tag markup around it.
+ if mark:
+ line1 = line1 + '\1'
+ line2 = '\0' + mark + line2
+
+ # tack on first line onto the output list
+ data_list.append((line_num,line1))
+
+ # use this routine again to wrap the remaining text
+ self._split_line(data_list,'>',line2)
+
+ def _line_wrapper(self,diffs):
+ """Returns iterator that splits (wraps) mdiff text lines"""
+
+ # pull from/to data and flags from mdiff iterator
+ for fromdata,todata,flag in diffs:
+ # check for context separators and pass them through
+ if flag is None:
+ yield fromdata,todata,flag
+ continue
+ (fromline,fromtext),(toline,totext) = fromdata,todata
+ # for each from/to line split it at the wrap column to form
+ # list of text lines.
+ fromlist,tolist = [],[]
+ self._split_line(fromlist,fromline,fromtext)
+ self._split_line(tolist,toline,totext)
+ # yield from/to line in pairs inserting blank lines as
+ # necessary when one side has more wrapped lines
+ while fromlist or tolist:
+ if fromlist:
+ fromdata = fromlist.pop(0)
+ else:
+ fromdata = ('',' ')
+ if tolist:
+ todata = tolist.pop(0)
+ else:
+ todata = ('',' ')
+ yield fromdata,todata,flag
+
+ def _collect_lines(self,diffs):
+ """Collects mdiff output into separate lists
+
+ Before storing the mdiff from/to data into a list, it is converted
+ into a single line of text with HTML markup.
+ """
+
+ fromlist,tolist,flaglist = [],[],[]
+ # pull from/to data and flags from mdiff style iterator
+ for fromdata,todata,flag in diffs:
+ try:
+ # store HTML markup of the lines into the lists
+ fromlist.append(self._format_line(0,flag,*fromdata))
+ tolist.append(self._format_line(1,flag,*todata))
+ except TypeError:
+ # exceptions occur for lines where context separators go
+ fromlist.append(None)
+ tolist.append(None)
+ flaglist.append(flag)
+ return fromlist,tolist,flaglist
+
+ def _format_line(self,side,flag,linenum,text):
+ """Returns HTML markup of "from" / "to" text lines
+
+ side -- 0 or 1 indicating "from" or "to" text
+ flag -- indicates if difference on line
+ linenum -- line number (used for line number column)
+ text -- line text to be marked up
+ """
+ try:
+ linenum = '%d' % linenum
+ id = ' id="%s%s"' % (self._prefix[side],linenum)
+ except TypeError:
+ # handle blank lines where linenum is '>' or ''
+ id = ''
+ # replace those things that would get confused with HTML symbols
+ text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
+
+ # make space non-breakable so they don't get compressed or line wrapped
+ text = text.replace(' ','&nbsp;').rstrip()
+
+ # add a class to the td tag if there is a difference on the line
+ css_class = ' class="diff_changed" ' if flag else ' '
+
+ return f'' \
+ + f'{text} | '
+
+ def _make_prefix(self):
+ """Create unique anchor prefixes"""
+
+ # Generate a unique anchor prefix so multiple tables
+ # can exist on the same HTML page without conflicts.
+ fromprefix = "from%d_" % HtmlDiff._default_prefix
+ toprefix = "to%d_" % HtmlDiff._default_prefix
+ HtmlDiff._default_prefix += 1
+ # store prefixes so line format method has access
+ self._prefix = [fromprefix,toprefix]
+
+ def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
+ """Makes list of "next" links"""
+
+ # all anchor names will be generated using the unique "to" prefix
+ toprefix = self._prefix[1]
+
+ # process change flags, generating middle column of next anchors/links
+ next_id = ['']*len(flaglist)
+ next_href = ['']*len(flaglist)
+ num_chg, in_change = 0, False
+ last = 0
+ for i,flag in enumerate(flaglist):
+ if flag:
+ if not in_change:
+ in_change = True
+ last = i
+ # at the beginning of a change, drop an anchor a few lines
+ # (the context lines) before the change for the previous
+ # link
+ i = max([0,i-numlines])
+ next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
+ # at the beginning of a change, drop a link to the next
+ # change
+ num_chg += 1
+ next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
+ toprefix,num_chg)
+ else:
+ in_change = False
+ # check for cases where there is no content to avoid exceptions
+ if not flaglist:
+ flaglist = [False]
+ next_id = ['']
+ next_href = ['']
+ last = 0
+ if context:
+ fromlist = [' | No Differences Found | ']
+ tolist = fromlist
+ else:
+ fromlist = tolist = [' | Empty File | ']
+ # if not a change on first line, drop a link
+ if not flaglist[0]:
+ next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
+ # redo the last link to link to the top
+ next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
+
+ return fromlist,tolist,flaglist,next_href,next_id
+
+ def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
+ numlines=5):
+ """Returns HTML table of side by side comparison with change highlights
+
+ Arguments:
+ fromlines -- list of "from" lines
+ tolines -- list of "to" lines
+ fromdesc -- "from" file column header string
+ todesc -- "to" file column header string
+ context -- set to True for contextual differences (defaults to False
+ which shows full differences).
+ numlines -- number of context lines. When context is set True,
+ controls number of lines displayed before and after the change.
+ When context is False, controls the number of lines to place
+ the "next" link anchors before the next change (so click of
+ "next" link jumps to just before the change).
+ """
+
+ # make unique anchor prefixes so that multiple tables may exist
+ # on the same page without conflict.
+ self._make_prefix()
+
+ # change tabs to spaces before it gets more difficult after we insert
+ # markup
+ fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
+
+ # create diffs iterator which generates side by side from/to data
+ if context:
+ context_lines = numlines
+ else:
+ context_lines = None
+ diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
+ charjunk=self._charjunk)
+
+ # set up iterator to wrap lines that exceed desired width
+ if self._wrapcolumn:
+ diffs = self._line_wrapper(diffs)
+
+ # collect up from/to lines and flags into lists (also format the lines)
+ fromlist,tolist,flaglist = self._collect_lines(diffs)
+
+ # process change flags, generating middle column of next anchors/links
+ fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
+ fromlist,tolist,flaglist,context,numlines)
+
+ s = []
+ fmt = ' | %s | %s' + \
+ '%s | %s
\n'
+ for i in range(len(flaglist)):
+ if flaglist[i] is None:
+ # mdiff yields None on separator lines; skip the bogus ones
+ # generated for the first line
+ if i > 0:
+ s.append(' \n \n')
+ else:
+ s.append( fmt % (next_id[i],next_href[i],fromlist[i],
+ next_href[i],tolist[i]))
+ if fromdesc or todesc:
+ header_row = '%s%s%s%s
' % (
+ '
| ',
+ '' % fromdesc,
+ '
| ',
+ '' % todesc)
+ else:
+ header_row = ''
+
+ table = self._table_template % dict(
+ data_rows=''.join(s),
+ header_row=header_row,
+ prefix=self._prefix[1])
+
+ return table.replace('\0+','<span class="diff_add">'). \
+ replace('\0-','<span class="diff_sub">'). \
+ replace('\0^','<span class="diff_chg">'). \
+ replace('\1','</span>'). \
+ replace('\t','&nbsp;')
+
+
+def restore(delta, which):
+ r"""
+ Generate one of the two sequences that generated a delta.
+
+ Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
+ lines originating from file 1 or 2 (parameter `which`), stripping off line
+ prefixes.
+
+ Examples:
+
+ >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
+ ... 'ore\ntree\nemu\n'.splitlines(keepends=True))
+ >>> diff = list(diff)
+ >>> print(''.join(restore(diff, 1)), end="")
+ one
+ two
+ three
+ >>> print(''.join(restore(diff, 2)), end="")
+ ore
+ tree
+ emu
+ """
+ try:
+ tag = {1: "- ", 2: "+ "}[int(which)]
+ except KeyError:
+ raise ValueError('unknown delta choice (must be 1 or 2): %r'
+ % which) from None
+ prefixes = ("  ", tag)
+ for line in delta:
+ if line[:2] in prefixes:
+ yield line[2:]
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 7a4ff15c34267b..fd193f51f4df55 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -24,2084 +24,71 @@
Class HtmlDiff:
For producing HTML side by side comparison with change highlights.
-"""
-
-__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
- 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
- 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
-
-from heapq import nlargest as _nlargest
-from collections import namedtuple as _namedtuple
-from types import GenericAlias
-lazy from _colorize import can_colorize, get_theme
-
-Match = _namedtuple('Match', 'a b size')
-
-def _calculate_ratio(matches, length):
- if length:
- return 2.0 * matches / length
- return 1.0
-
-class SequenceMatcher:
-
- """
- SequenceMatcher is a flexible class for comparing pairs of sequences of
- any type, so long as the sequence elements are hashable. The basic
- algorithm predates, and is a little fancier than, an algorithm
- published in the late 1980's by Ratcliff and Obershelp under the
- hyperbolic name "gestalt pattern matching". The basic idea is to find
- the longest contiguous matching subsequence that contains no "junk"
- elements (R-O doesn't address junk). The same idea is then applied
- recursively to the pieces of the sequences to the left and to the right
- of the matching subsequence. This does not yield minimal edit
- sequences, but does tend to yield matches that "look right" to people.
-
- SequenceMatcher tries to compute a "human-friendly diff" between two
- sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the
- longest *contiguous* & junk-free matching subsequence. That's what
- catches peoples' eyes. The Windows(tm) windiff has another interesting
- notion, pairing up elements that appear uniquely in each sequence.
- That, and the method here, appear to yield more intuitive difference
- reports than does diff. This method appears to be the least vulnerable
- to syncing up on blocks of "junk lines", though (like blank lines in
- ordinary text files, or maybe "" lines in HTML files). That may be
- because this is the only method of the 3 that has a *concept* of
- "junk" .
-
- Example, comparing two strings, and considering blanks to be "junk":
-
- >>> s = SequenceMatcher(lambda x: x == " ",
- ... "private Thread currentThread;",
- ... "private volatile Thread currentThread;")
- >>>
-
- .ratio() returns a float in [0, 1], measuring the "similarity" of the
- sequences. As a rule of thumb, a .ratio() value over 0.6 means the
- sequences are close matches:
-
- >>> print(round(s.ratio(), 2))
- 0.87
- >>>
-
- If you're only interested in where the sequences match,
- .get_matching_blocks() is handy:
-
- >>> for block in s.get_matching_blocks():
- ... print("a[%d] and b[%d] match for %d elements" % block)
- a[0] and b[0] match for 8 elements
- a[8] and b[17] match for 21 elements
- a[29] and b[38] match for 0 elements
-
- Note that the last tuple returned by .get_matching_blocks() is always a
- dummy, (len(a), len(b), 0), and this is the only case in which the last
- tuple element (number of elements matched) is 0.
-
- If you want to know how to change the first sequence into the second,
- use .get_opcodes():
-
- >>> for opcode in s.get_opcodes():
- ... print("%6s a[%d:%d] b[%d:%d]" % opcode)
- equal a[0:8] b[0:8]
- insert a[8:8] b[8:17]
- equal a[8:29] b[17:38]
-
- See the Differ class for a fancy human-friendly file differencer, which
- uses SequenceMatcher both to compare sequences of lines, and to compare
- sequences of characters within similar (near-matching) lines.
-
- See also function get_close_matches() in this module, which shows how
- simple code building on SequenceMatcher can be used to do useful work.
-
- Timing: Basic R-O is cubic time worst case and quadratic time expected
- case. SequenceMatcher is quadratic time for the worst case and has
- expected-case behavior dependent in a complicated way on how many
- elements the sequences have in common; best case time is linear.
- """
-
- def __init__(self, isjunk=None, a='', b='', autojunk=True):
- """Construct a SequenceMatcher.
-
- Optional arg isjunk is None (the default), or a one-argument
- function that takes a sequence element and returns true iff the
- element is junk. None is equivalent to passing "lambda x: 0", i.e.
- no elements are considered to be junk. For example, pass
- lambda x: x in " \\t"
- if you're comparing lines as sequences of characters, and don't
- want to synch up on blanks or hard tabs.
-
- Optional arg a is the first of two sequences to be compared. By
- default, an empty string. The elements of a must be hashable. See
- also .set_seqs() and .set_seq1().
-
- Optional arg b is the second of two sequences to be compared. By
- default, an empty string. The elements of b must be hashable. See
- also .set_seqs() and .set_seq2().
-
- Optional arg autojunk should be set to False to disable the
- "automatic junk heuristic" that treats popular elements as junk
- (see module documentation for more information).
- """
-
- # Members:
- # a
- # first sequence
- # b
- # second sequence; differences are computed as "what do
- # we need to do to 'a' to change it into 'b'?"
- # b2j
- # for x in b, b2j[x] is a list of the indices (into b)
- # at which x appears; junk and popular elements do not appear
- # fullbcount
- # for x in b, fullbcount[x] == the number of times x
- # appears in b; only materialized if really needed (used
- # only for computing quick_ratio())
- # matching_blocks
- # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
- # ascending & non-overlapping in i and in j; terminated by
- # a dummy (len(a), len(b), 0) sentinel
- # opcodes
- # a list of (tag, i1, i2, j1, j2) tuples, where tag is
- # one of
- # 'replace' a[i1:i2] should be replaced by b[j1:j2]
- # 'delete' a[i1:i2] should be deleted
- # 'insert' b[j1:j2] should be inserted
- # 'equal' a[i1:i2] == b[j1:j2]
- # isjunk
- # a user-supplied function taking a sequence element and
- # returning true iff the element is "junk" -- this has
- # subtle but helpful effects on the algorithm, which I'll
- # get around to writing up someday <0.9 wink>.
- # DON'T USE! Only __chain_b uses this. Use "in self.bjunk".
- # bjunk
- # the items in b for which isjunk is True.
- # bpopular
- # nonjunk items in b treated as junk by the heuristic (if used).
-
- self.isjunk = isjunk
- self.a = self.b = None
- self.autojunk = autojunk
- self.set_seqs(a, b)
-
- def set_seqs(self, a, b):
- """Set the two sequences to be compared.
-
- >>> s = SequenceMatcher()
- >>> s.set_seqs("abcd", "bcde")
- >>> s.ratio()
- 0.75
- """
-
- self.set_seq1(a)
- self.set_seq2(b)
-
- def set_seq1(self, a):
- """Set the first sequence to be compared.
-
- The second sequence to be compared is not changed.
-
- >>> s = SequenceMatcher(None, "abcd", "bcde")
- >>> s.ratio()
- 0.75
- >>> s.set_seq1("bcde")
- >>> s.ratio()
- 1.0
- >>>
-
- SequenceMatcher computes and caches detailed information about the
- second sequence, so if you want to compare one sequence S against
- many sequences, use .set_seq2(S) once and call .set_seq1(x)
- repeatedly for each of the other sequences.
-
- See also set_seqs() and set_seq2().
- """
-
- if a is self.a:
- return
- self.a = a
- self.matching_blocks = self.opcodes = None
-
- def set_seq2(self, b):
- """Set the second sequence to be compared.
-
- The first sequence to be compared is not changed.
-
- >>> s = SequenceMatcher(None, "abcd", "bcde")
- >>> s.ratio()
- 0.75
- >>> s.set_seq2("abcd")
- >>> s.ratio()
- 1.0
- >>>
-
- SequenceMatcher computes and caches detailed information about the
- second sequence, so if you want to compare one sequence S against
- many sequences, use .set_seq2(S) once and call .set_seq1(x)
- repeatedly for each of the other sequences.
-
- See also set_seqs() and set_seq1().
- """
-
- if b is self.b:
- return
- self.b = b
- self.matching_blocks = self.opcodes = None
- self.fullbcount = None
- self.__chain_b()
-
- # For each element x in b, set b2j[x] to a list of the indices in
- # b where x appears; the indices are in increasing order; note that
- # the number of times x appears in b is len(b2j[x]) ...
- # when self.isjunk is defined, junk elements don't show up in this
- # map at all, which stops the central find_longest_match method
- # from starting any matching block at a junk element ...
- # b2j also does not contain entries for "popular" elements, meaning
- # elements that account for more than 1 + 1% of the total elements, and
- # when the sequence is reasonably large (>= 200 elements); this can
- # be viewed as an adaptive notion of semi-junk, and yields an enormous
- # speedup when, e.g., comparing program files with hundreds of
- # instances of "return NULL;" ...
- # note that this is only called when b changes; so for cross-product
- # kinds of matches, it's best to call set_seq2 once, then set_seq1
- # repeatedly
-
- def __chain_b(self):
- # Because isjunk is a user-defined (not C) function, and we test
- # for junk a LOT, it's important to minimize the number of calls.
- # Before the tricks described here, __chain_b was by far the most
- # time-consuming routine in the whole module! If anyone sees
- # Jim Roskind, thank him again for profile.py -- I never would
- # have guessed that.
- # The first trick is to build b2j ignoring the possibility
- # of junk. I.e., we don't call isjunk at all yet. Throwing
- # out the junk later is much cheaper than building b2j "right"
- # from the start.
- b = self.b
- self.b2j = b2j = {}
-
- for i, elt in enumerate(b):
- indices = b2j.setdefault(elt, [])
- indices.append(i)
-
- # Purge junk elements
- self.bjunk = junk = set()
- isjunk = self.isjunk
- if isjunk:
- for elt in b2j.keys():
- if isjunk(elt):
- junk.add(elt)
- for elt in junk: # separate loop avoids separate list of keys
- del b2j[elt]
-
- # Purge popular elements that are not junk
- self.bpopular = popular = set()
- n = len(b)
- if self.autojunk and n >= 200:
- ntest = n // 100 + 1
- for elt, idxs in b2j.items():
- if len(idxs) > ntest:
- popular.add(elt)
- for elt in popular: # ditto; as fast for 1% deletion
- del b2j[elt]
-
- def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
- """Find longest matching block in a[alo:ahi] and b[blo:bhi].
-
- By default it will find the longest match in the entirety of a and b.
-
- If isjunk is not defined:
-
- Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
- alo <= i <= i+k <= ahi
- blo <= j <= j+k <= bhi
- and for all (i',j',k') meeting those conditions,
- k >= k'
- i <= i'
- and if i == i', j <= j'
-
- In other words, of all maximal matching blocks, return one that
- starts earliest in a, and of all those maximal matching blocks that
- start earliest in a, return the one that starts earliest in b.
-
- >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
- >>> s.find_longest_match(0, 5, 0, 9)
- Match(a=0, b=4, size=5)
-
- If isjunk is defined, first the longest matching block is
- determined as above, but with the additional restriction that no
- junk element appears in the block. Then that block is extended as
- far as possible by matching (only) junk elements on both sides. So
- the resulting block never matches on junk except as identical junk
- happens to be adjacent to an "interesting" match.
-
- Here's the same example as before, but considering blanks to be
- junk. That prevents " abcd" from matching the " abcd" at the tail
- end of the second sequence directly. Instead only the "abcd" can
- match, and matches the leftmost "abcd" in the second sequence:
-
- >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
- >>> s.find_longest_match(0, 5, 0, 9)
- Match(a=1, b=0, size=4)
-
- If no blocks match, return (alo, blo, 0).
-
- >>> s = SequenceMatcher(None, "ab", "c")
- >>> s.find_longest_match(0, 2, 0, 1)
- Match(a=0, b=0, size=0)
- """
-
- # CAUTION: stripping common prefix or suffix would be incorrect.
- # E.g.,
- # ab
- # acab
- # Longest matching block is "ab", but if common prefix is
- # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
- # strip, so ends up claiming that ab is changed to acab by
- # inserting "ca" in the middle. That's minimal but unintuitive:
- # "it's obvious" that someone inserted "ac" at the front.
- # Windiff ends up at the same place as diff, but by pairing up
- # the unique 'b's and then matching the first two 'a's.
-
- a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
- if ahi is None:
- ahi = len(a)
- if bhi is None:
- bhi = len(b)
- besti, bestj, bestsize = alo, blo, 0
- # find longest junk-free match
- # during an iteration of the loop, j2len[j] = length of longest
- # junk-free match ending with a[i-1] and b[j]
- j2len = {}
- nothing = []
- for i in range(alo, ahi):
- # look at all instances of a[i] in b; note that because
- # b2j has no junk keys, the loop is skipped if a[i] is junk
- j2lenget = j2len.get
- newj2len = {}
- for j in b2j.get(a[i], nothing):
- # a[i] matches b[j]
- if j < blo:
- continue
- if j >= bhi:
- break
- k = newj2len[j] = j2lenget(j-1, 0) + 1
- if k > bestsize:
- besti, bestj, bestsize = i-k+1, j-k+1, k
- j2len = newj2len
-
- # Extend the best by non-junk elements on each end. In particular,
- # "popular" non-junk elements aren't in b2j, which greatly speeds
- # the inner loop above, but also means "the best" match so far
- # doesn't contain any junk *or* popular non-junk elements.
- while besti > alo and bestj > blo and \
- not isbjunk(b[bestj-1]) and \
- a[besti-1] == b[bestj-1]:
- besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
- while besti+bestsize < ahi and bestj+bestsize < bhi and \
- not isbjunk(b[bestj+bestsize]) and \
- a[besti+bestsize] == b[bestj+bestsize]:
- bestsize += 1
-
- # Now that we have a wholly interesting match (albeit possibly
- # empty!), we may as well suck up the matching junk on each
- # side of it too. Can't think of a good reason not to, and it
- # saves post-processing the (possibly considerable) expense of
- # figuring out what to do with it. In the case of an empty
- # interesting match, this is clearly the right thing to do,
- # because no other kind of match is possible in the regions.
- while besti > alo and bestj > blo and \
- isbjunk(b[bestj-1]) and \
- a[besti-1] == b[bestj-1]:
- besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
- while besti+bestsize < ahi and bestj+bestsize < bhi and \
- isbjunk(b[bestj+bestsize]) and \
- a[besti+bestsize] == b[bestj+bestsize]:
- bestsize = bestsize + 1
-
- return Match(besti, bestj, bestsize)
-
- def get_matching_blocks(self):
- """Return list of triples describing matching subsequences.
-
- Each triple is of the form (i, j, n), and means that
- a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
- i and in j. New in Python 2.5, it's also guaranteed that if
- (i, j, n) and (i', j', n') are adjacent triples in the list, and
- the second is not the last triple in the list, then i+n != i' or
- j+n != j'. IOW, adjacent triples never describe adjacent equal
- blocks.
-
- The last triple is a dummy, (len(a), len(b), 0), and is the only
- triple with n==0.
-
- >>> s = SequenceMatcher(None, "abxcd", "abcd")
- >>> list(s.get_matching_blocks())
- [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
- """
-
- if self.matching_blocks is not None:
- return self.matching_blocks
- la, lb = len(self.a), len(self.b)
-
- # This is most naturally expressed as a recursive algorithm, but
- # at least one user bumped into extreme use cases that exceeded
- # the recursion limit on their box. So, now we maintain a list
- # ('queue`) of blocks we still need to look at, and append partial
- # results to `matching_blocks` in a loop; the matches are sorted
- # at the end.
- queue = [(0, la, 0, lb)]
- matching_blocks = []
- while queue:
- alo, ahi, blo, bhi = queue.pop()
- i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
- # a[alo:i] vs b[blo:j] unknown
- # a[i:i+k] same as b[j:j+k]
- # a[i+k:ahi] vs b[j+k:bhi] unknown
- if k: # if k is 0, there was no matching block
- matching_blocks.append(x)
- if alo < i and blo < j:
- queue.append((alo, i, blo, j))
- if i+k < ahi and j+k < bhi:
- queue.append((i+k, ahi, j+k, bhi))
- matching_blocks.sort()
-
- # It's possible that we have adjacent equal blocks in the
- # matching_blocks list now. Starting with 2.5, this code was added
- # to collapse them.
- i1 = j1 = k1 = 0
- non_adjacent = []
- for i2, j2, k2 in matching_blocks:
- # Is this block adjacent to i1, j1, k1?
- if i1 + k1 == i2 and j1 + k1 == j2:
- # Yes, so collapse them -- this just increases the length of
- # the first block by the length of the second, and the first
- # block so lengthened remains the block to compare against.
- k1 += k2
- else:
- # Not adjacent. Remember the first block (k1==0 means it's
- # the dummy we started with), and make the second block the
- # new block to compare against.
- if k1:
- non_adjacent.append((i1, j1, k1))
- i1, j1, k1 = i2, j2, k2
- if k1:
- non_adjacent.append((i1, j1, k1))
-
- non_adjacent.append( (la, lb, 0) )
- self.matching_blocks = list(map(Match._make, non_adjacent))
- return self.matching_blocks
-
- def get_opcodes(self):
- """Return list of 5-tuples describing how to turn a into b.
-
- Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
- has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
- tuple preceding it, and likewise for j1 == the previous j2.
-
- The tags are strings, with these meanings:
-
- 'replace': a[i1:i2] should be replaced by b[j1:j2]
- 'delete': a[i1:i2] should be deleted.
- Note that j1==j2 in this case.
- 'insert': b[j1:j2] should be inserted at a[i1:i1].
- Note that i1==i2 in this case.
- 'equal': a[i1:i2] == b[j1:j2]
-
- >>> a = "qabxcd"
- >>> b = "abycdf"
- >>> s = SequenceMatcher(None, a, b)
- >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
- ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
- ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
- delete a[0:1] (q) b[0:0] ()
- equal a[1:3] (ab) b[0:2] (ab)
- replace a[3:4] (x) b[2:3] (y)
- equal a[4:6] (cd) b[3:5] (cd)
- insert a[6:6] () b[5:6] (f)
- """
-
- if self.opcodes is not None:
- return self.opcodes
- i = j = 0
- self.opcodes = answer = []
- for ai, bj, size in self.get_matching_blocks():
- # invariant: we've pumped out correct diffs to change
- # a[:i] into b[:j], and the next matching block is
- # a[ai:ai+size] == b[bj:bj+size]. So we need to pump
- # out a diff to change a[i:ai] into b[j:bj], pump out
- # the matching block, and move (i,j) beyond the match
- tag = ''
- if i < ai and j < bj:
- tag = 'replace'
- elif i < ai:
- tag = 'delete'
- elif j < bj:
- tag = 'insert'
- if tag:
- answer.append( (tag, i, ai, j, bj) )
- i, j = ai+size, bj+size
- # the list of matching blocks is terminated by a
- # sentinel with size 0
- if size:
- answer.append( ('equal', ai, i, bj, j) )
- return answer
-
- def get_grouped_opcodes(self, n=3):
- """ Isolate change clusters by eliminating ranges with no changes.
-
- Return a generator of groups with up to n lines of context.
- Each group is in the same format as returned by get_opcodes().
-
- >>> from pprint import pprint
- >>> a = list(map(str, range(1,40)))
- >>> b = a[:]
- >>> b[8:8] = ['i'] # Make an insertion
- >>> b[20] += 'x' # Make a replacement
- >>> b[23:28] = [] # Make a deletion
- >>> b[30] += 'y' # Make another replacement
- >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
- [
- [('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
- [
- ('equal', 16, 19, 17, 20),
- ('replace', 19, 20, 20, 21),
- ('equal', 20, 22, 21, 23),
- ('delete', 22, 27, 23, 23),
- ('equal', 27, 30, 23, 26),
- ],
- [('equal', 31, 34, 27, 30), ('replace', 34, 35, 30, 31), ('equal', 35, 38, 31, 34)],
- ]
- """
-
- codes = self.get_opcodes()
- if not codes:
- codes = [("equal", 0, 1, 0, 1)]
- # Fixup leading and trailing groups if they show no changes.
- if codes[0][0] == 'equal':
- tag, i1, i2, j1, j2 = codes[0]
- codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
- if codes[-1][0] == 'equal':
- tag, i1, i2, j1, j2 = codes[-1]
- codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
-
- nn = n + n
- group = []
- for tag, i1, i2, j1, j2 in codes:
- # End the current group and start a new one whenever
- # there is a large range with no changes.
- if tag == 'equal' and i2-i1 > nn:
- group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
- yield group
- group = []
- i1, j1 = max(i1, i2-n), max(j1, j2-n)
- group.append((tag, i1, i2, j1 ,j2))
- if group and not (len(group)==1 and group[0][0] == 'equal'):
- yield group
-
- def ratio(self):
- """Return a measure of the sequences' similarity (float in [0,1]).
-
- Where T is the total number of elements in both sequences, and
- M is the number of matches, this is 2.0*M / T.
- Note that this is 1 if the sequences are identical, and 0 if
- they have nothing in common.
-
- .ratio() is expensive to compute if you haven't already computed
- .get_matching_blocks() or .get_opcodes(), in which case you may
- want to try .quick_ratio() or .real_quick_ratio() first to get an
- upper bound.
-
- >>> s = SequenceMatcher(None, "abcd", "bcde")
- >>> s.ratio()
- 0.75
- >>> s.quick_ratio()
- 0.75
- >>> s.real_quick_ratio()
- 1.0
- """
-
- matches = sum(triple[-1] for triple in self.get_matching_blocks())
- return _calculate_ratio(matches, len(self.a) + len(self.b))
-
- def quick_ratio(self):
- """Return an upper bound on ratio() relatively quickly.
-
- This isn't defined beyond that it is an upper bound on .ratio(), and
- is faster to compute.
- """
-
- # viewing a and b as multisets, set matches to the cardinality
- # of their intersection; this counts the number of matches
- # without regard to order, so is clearly an upper bound
- if self.fullbcount is None:
- self.fullbcount = fullbcount = {}
- for elt in self.b:
- fullbcount[elt] = fullbcount.get(elt, 0) + 1
- fullbcount = self.fullbcount
- # avail[x] is the number of times x appears in 'b' less the
- # number of times we've seen it in 'a' so far ... kinda
- avail = {}
- matches = 0
- for elt in self.a:
- if elt in avail:
- numb = avail[elt]
- else:
- numb = fullbcount.get(elt, 0)
- avail[elt] = numb - 1
- if numb > 0:
- matches += 1
- return _calculate_ratio(matches, len(self.a) + len(self.b))
-
- def real_quick_ratio(self):
- """Return an upper bound on ratio() very quickly.
-
- This isn't defined beyond that it is an upper bound on .ratio(), and
- is faster to compute than either .ratio() or .quick_ratio().
- """
-
- la, lb = len(self.a), len(self.b)
- # can't have more matches than the number of elements in the
- # shorter sequence
- return _calculate_ratio(min(la, lb), la + lb)
-
- __class_getitem__ = classmethod(GenericAlias)
-
-
-def get_close_matches(word, possibilities, n=3, cutoff=0.6):
- """Use SequenceMatcher to return list of the best "good enough" matches.
-
- word is a sequence for which close matches are desired (typically a
- string).
-
- possibilities is a list of sequences against which to match word
- (typically a list of strings).
-
- Optional arg n (default 3) is the maximum number of close matches to
- return. n must be > 0.
-
- Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
- that don't score at least that similar to word are ignored.
-
- The best (no more than n) matches among the possibilities are returned
- in a list, sorted by similarity score, most similar first.
-
- >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
- ['apple', 'ape']
- >>> import keyword as _keyword
- >>> get_close_matches("wheel", _keyword.kwlist)
- ['while']
- >>> get_close_matches("Apple", _keyword.kwlist)
- []
- >>> get_close_matches("accept", _keyword.kwlist)
- ['except']
- """
-
- if not n > 0:
- raise ValueError("n must be > 0: %r" % (n,))
- if not 0.0 <= cutoff <= 1.0:
- raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
- result = []
- s = SequenceMatcher()
- s.set_seq2(word)
- for x in possibilities:
- s.set_seq1(x)
- if s.real_quick_ratio() < cutoff or s.quick_ratio() < cutoff:
- continue
-
- ratio = s.ratio()
- if ratio >= cutoff:
- result.append((ratio, x))
-
- # Move the best scorers to head of list
- result = _nlargest(n, result)
- # Strip scores for the best n matches
- return [x for score, x in result]
-
-
-def _keep_original_ws(s, tag_s):
- """Replace whitespace with the original whitespace characters in `s`"""
- return ''.join(
- c if tag_c == " " and c.isspace() else tag_c
- for c, tag_c in zip(s, tag_s)
- )
-
-
-
-class Differ:
- r"""
- Differ is a class for comparing sequences of lines of text, and
- producing human-readable differences or deltas. Differ uses
- SequenceMatcher both to compare sequences of lines, and to compare
- sequences of characters within similar (near-matching) lines.
-
- Each line of a Differ delta begins with a two-letter code:
-
- '- ' line unique to sequence 1
- '+ ' line unique to sequence 2
- ' ' line common to both sequences
- '? ' line not present in either input sequence
-
- Lines beginning with '? ' attempt to guide the eye to intraline
- differences, and were not present in either input sequence. These lines
- can be confusing if the sequences contain tab characters.
-
- Note that Differ makes no claim to produce a *minimal* diff. To the
- contrary, minimal diffs are often counter-intuitive, because they synch
- up anywhere possible, sometimes accidental matches 100 pages apart.
- Restricting synch points to contiguous matches preserves some notion of
- locality, at the occasional cost of producing a longer diff.
-
- Example: Comparing two texts.
-
- First we set up the texts, sequences of individual single-line strings
- ending with newlines (such sequences can also be obtained from the
- `readlines()` method of file-like objects):
-
- >>> text1 = ''' 1. Beautiful is better than ugly.
- ... 2. Explicit is better than implicit.
- ... 3. Simple is better than complex.
- ... 4. Complex is better than complicated.
- ... '''.splitlines(keepends=True)
- >>> len(text1)
- 4
- >>> text1[0][-1]
- '\n'
- >>> text2 = ''' 1. Beautiful is better than ugly.
- ... 3. Simple is better than complex.
- ... 4. Complicated is better than complex.
- ... 5. Flat is better than nested.
- ... '''.splitlines(keepends=True)
-
- Next we instantiate a Differ object:
-
- >>> d = Differ()
-
- Note that when instantiating a Differ object we may pass functions to
- filter out line and character 'junk'. See Differ.__init__ for details.
-
- Finally, we compare the two:
-
- >>> result = list(d.compare(text1, text2))
-
- 'result' is a list of strings, so let's pretty-print it:
-
- >>> from pprint import pprint as _pprint
- >>> _pprint(result)
- [
- ' 1. Beautiful is better than ugly.\n',
- '- 2. Explicit is better than implicit.\n',
- '- 3. Simple is better than complex.\n',
- '+ 3. Simple is better than complex.\n',
- '? ++\n',
- '- 4. Complex is better than complicated.\n',
- '? ^ ---- ^\n',
- '+ 4. Complicated is better than complex.\n',
- '? ++++ ^ ^\n',
- '+ 5. Flat is better than nested.\n',
- ]
-
- As a single multi-line string it looks like this:
-
- >>> print(''.join(result), end="")
- 1. Beautiful is better than ugly.
- - 2. Explicit is better than implicit.
- - 3. Simple is better than complex.
- + 3. Simple is better than complex.
- ? ++
- - 4. Complex is better than complicated.
- ? ^ ---- ^
- + 4. Complicated is better than complex.
- ? ++++ ^ ^
- + 5. Flat is better than nested.
- """
-
- def __init__(self, linejunk=None, charjunk=None):
- """
- Construct a text differencer, with optional filters.
-
- The two optional keyword parameters are for filter functions:
-
- - `linejunk`: A function that should accept a single string argument,
- and return true iff the string is junk. The module-level function
- `IS_LINE_JUNK` may be used to filter out lines without visible
- characters, except for at most one splat ('#'). It is recommended
- to leave linejunk None; the underlying SequenceMatcher class has
- an adaptive notion of "noise" lines that's better than any static
- definition the author has ever been able to craft.
-
- - `charjunk`: A function that should accept a string of length 1. The
- module-level function `IS_CHARACTER_JUNK` may be used to filter out
- whitespace characters (a blank or tab; **note**: bad idea to include
- newline in this!). Use of IS_CHARACTER_JUNK is recommended.
- """
-
- self.linejunk = linejunk
- self.charjunk = charjunk
-
- def compare(self, a, b):
- r"""
- Compare two sequences of lines; generate the resulting delta.
-
- Each sequence must contain individual single-line strings ending with
- newlines. Such sequences can be obtained from the `readlines()` method
- of file-like objects. The delta generated also consists of newline-
- terminated strings, ready to be printed as-is via the writelines()
- method of a file-like object.
-
- Example:
-
- >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
- ... 'ore\ntree\nemu\n'.splitlines(True))),
- ... end="")
- - one
- ? ^
- + ore
- ? ^
- - two
- - three
- ? -
- + tree
- + emu
- """
-
- cruncher = SequenceMatcher(self.linejunk, a, b)
- for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
- if tag == 'replace':
- g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
- elif tag == 'delete':
- g = self._dump('-', a, alo, ahi)
- elif tag == 'insert':
- g = self._dump('+', b, blo, bhi)
- elif tag == 'equal':
- g = self._dump(' ', a, alo, ahi)
- else:
- raise ValueError('unknown tag %r' % (tag,))
-
- yield from g
-
- def _dump(self, tag, x, lo, hi):
- """Generate comparison results for a same-tagged range."""
- for i in range(lo, hi):
- yield '%s %s' % (tag, x[i])
-
- def _plain_replace(self, a, alo, ahi, b, blo, bhi):
- assert alo < ahi and blo < bhi
- # dump the shorter block first -- reduces the burden on short-term
- # memory if the blocks are of very different sizes
- if bhi - blo < ahi - alo:
- first = self._dump('+', b, blo, bhi)
- second = self._dump('-', a, alo, ahi)
- else:
- first = self._dump('-', a, alo, ahi)
- second = self._dump('+', b, blo, bhi)
-
- for g in first, second:
- yield from g
-
- def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
- r"""
- When replacing one block of lines with another, search the blocks
- for *similar* lines; the best-matching pair (if any) is used as a
- synch point, and intraline difference marking is done on the
- similar pair. Lots of work, but often worth it.
-
- Example:
- >>> d = Differ()
- >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
- ... ['abcdefGhijkl\n'], 0, 1)
- >>> print(''.join(results), end="")
- - abcDefghiJkl
- ? ^ ^ ^
- + abcdefGhijkl
- ? ^ ^ ^
- """
- # Don't synch up unless the lines have a similarity score above
- # cutoff. Previously only the smallest pair was handled here,
- # and if there are many pairs with the best ratio, recursion
- # could grow very deep, and runtime cubic. See:
- # https://github.com/python/cpython/issues/119105
- #
- # Later, more pathological cases prompted removing recursion
- # entirely.
- cutoff = 0.74999
- cruncher = SequenceMatcher(self.charjunk)
- crqr = cruncher.real_quick_ratio
- cqr = cruncher.quick_ratio
- cr = cruncher.ratio
-
- WINDOW = 10
- best_i = best_j = None
- dump_i, dump_j = alo, blo # smallest indices not yet resolved
- for j in range(blo, bhi):
- cruncher.set_seq2(b[j])
- # Search the corresponding i's within WINDOW for rhe highest
- # ratio greater than `cutoff`.
- aequiv = alo + (j - blo)
- arange = range(max(aequiv - WINDOW, dump_i),
- min(aequiv + WINDOW + 1, ahi))
- if not arange: # likely exit if `a` is shorter than `b`
- break
- best_ratio = cutoff
- for i in arange:
- cruncher.set_seq1(a[i])
- # Ordering by cheapest to most expensive ratio is very
- # valuable, most often getting out early.
- if crqr() <= best_ratio or cqr() <= best_ratio:
- continue
-
- ratio = cr()
- if ratio > best_ratio:
- best_i, best_j, best_ratio = i, j, ratio
-
- if best_i is None:
- # found nothing to synch on yet - move to next j
- continue
-
- # pump out straight replace from before this synch pair
- yield from self._fancy_helper(a, dump_i, best_i,
- b, dump_j, best_j)
- # do intraline marking on the synch pair
- aelt, belt = a[best_i], b[best_j]
- if aelt != belt:
- # pump out a '-', '?', '+', '?' quad for the synched lines
- atags = btags = ""
- cruncher.set_seqs(aelt, belt)
- for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
- la, lb = ai2 - ai1, bj2 - bj1
- if tag == 'replace':
- atags += '^' * la
- btags += '^' * lb
- elif tag == 'delete':
- atags += '-' * la
- elif tag == 'insert':
- btags += '+' * lb
- elif tag == 'equal':
- atags += ' ' * la
- btags += ' ' * lb
- else:
- raise ValueError('unknown tag %r' % (tag,))
- yield from self._qformat(aelt, belt, atags, btags)
- else:
- # the synch pair is identical
- yield ' ' + aelt
- dump_i, dump_j = best_i + 1, best_j + 1
- best_i = best_j = None
-
- # pump out straight replace from after the last synch pair
- yield from self._fancy_helper(a, dump_i, ahi,
- b, dump_j, bhi)
-
- def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
- g = []
- if alo < ahi:
- if blo < bhi:
- g = self._plain_replace(a, alo, ahi, b, blo, bhi)
- else:
- g = self._dump('-', a, alo, ahi)
- elif blo < bhi:
- g = self._dump('+', b, blo, bhi)
-
- yield from g
-
- def _qformat(self, aline, bline, atags, btags):
- r"""
- Format "?" output and deal with tabs.
-
- Example:
-
- >>> d = Differ()
- >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
- ... ' ^ ^ ^ ', ' ^ ^ ^ ')
- >>> for line in results: print(repr(line))
- ...
- '- \tabcDefghiJkl\n'
- '? \t ^ ^ ^\n'
- '+ \tabcdefGhijkl\n'
- '? \t ^ ^ ^\n'
- """
- atags = _keep_original_ws(aline, atags).rstrip()
- btags = _keep_original_ws(bline, btags).rstrip()
-
- yield "- " + aline
- if atags:
- yield f"? {atags}\n"
-
- yield "+ " + bline
- if btags:
- yield f"? {btags}\n"
-
-# With respect to junk, an earlier version of ndiff simply refused to
-# *start* a match with a junk element. The result was cases like this:
-# before: private Thread currentThread;
-# after: private volatile Thread currentThread;
-# If you consider whitespace to be junk, the longest contiguous match
-# not starting with junk is "e Thread currentThread". So ndiff reported
-# that "e volatil" was inserted between the 't' and the 'e' in "private".
-# While an accurate view, to people that's absurd. The current version
-# looks for matching blocks that are entirely junk-free, then extends the
-# longest one of those as far as possible but only with matching junk.
-# So now "currentThread" is matched, then extended to suck up the
-# preceding blank; then "private" is matched, and extended to suck up the
-# following blank; then "Thread" is matched; and finally ndiff reports
-# that "volatile " was inserted before "Thread". The only quibble
-# remaining is that perhaps it was really the case that " volatile"
-# was inserted after "private". I can live with that .
-
-def IS_LINE_JUNK(line, pat=None):
- r"""
- Return True for ignorable line: if `line` is blank or contains a single '#'.
-
- Examples:
-
- >>> IS_LINE_JUNK('\n')
- True
- >>> IS_LINE_JUNK(' # \n')
- True
- >>> IS_LINE_JUNK('hello\n')
- False
- """
-
- if pat is None:
- # Default: match '#' or the empty string
- return line.strip() in '#'
- # Previous versions used the undocumented parameter 'pat' as a
- # match function. Retain this behaviour for compatibility.
- return pat(line) is not None
-
-def IS_CHARACTER_JUNK(ch, ws=" \t"):
- r"""
- Return True for ignorable character: iff `ch` is a space or tab.
-
- Examples:
-
- >>> IS_CHARACTER_JUNK(' ')
- True
- >>> IS_CHARACTER_JUNK('\t')
- True
- >>> IS_CHARACTER_JUNK('\n')
- False
- >>> IS_CHARACTER_JUNK('x')
- False
- """
-
- return ch in ws
-
-
-########################################################################
-### Unified Diff
-########################################################################
-
-def _format_range_unified(start, stop):
- 'Convert range to the "ed" format'
- # Per the diff spec at http://www.unix.org/single_unix_specification/
- beginning = start + 1 # lines start numbering with one
- length = stop - start
- if length == 1:
- return '{}'.format(beginning)
- if not length:
- beginning -= 1 # empty ranges begin at line just before the range
- return '{},{}'.format(beginning, length)
-
-def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
- tofiledate='', n=3, lineterm='\n', *, color=False):
- r"""
- Compare two sequences of lines; generate the delta as a unified diff.
-
- Unified diffs are a compact way of showing line changes and a few
- lines of context. The number of context lines is set by 'n' which
- defaults to three.
-
- By default, the diff control lines (those with ---, +++, or @@) are
- created with a trailing newline. This is helpful so that inputs
- created from file.readlines() result in diffs that are suitable for
- file.writelines() since both the inputs and outputs have trailing
- newlines.
-
- For inputs that do not have trailing newlines, set the lineterm
- argument to "" so that the output will be uniformly newline free.
-
- Set 'color' to True to enable output in color, similar to
- 'git diff --color'. Even if enabled, it can be
- controlled using environment variables such as 'NO_COLOR'.
-
- The unidiff format normally has a header for filenames and modification
- times. Any or all of these may be specified using strings for
- 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
- The modification times are normally expressed in the ISO 8601 format.
-
- Example:
-
- >>> for line in unified_diff('one two three four'.split(),
- ... 'zero one tree four'.split(), 'Original', 'Current',
- ... '2005-01-26 23:30:50', '2010-04-02 10:20:52',
- ... lineterm=''):
- ... print(line) # doctest: +NORMALIZE_WHITESPACE
- --- Original 2005-01-26 23:30:50
- +++ Current 2010-04-02 10:20:52
- @@ -1,4 +1,4 @@
- +zero
- one
- -two
- -three
- +tree
- four
- """
-
- if color and can_colorize():
- t = get_theme(force_color=True).difflib
- else:
- t = get_theme(force_no_color=True).difflib
-
- _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
- started = False
- for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
- if not started:
- started = True
- fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
- todate = '\t{}'.format(tofiledate) if tofiledate else ''
- yield f'{t.header}--- {fromfile}{fromdate}{lineterm}{t.reset}'
- yield f'{t.header}+++ {tofile}{todate}{lineterm}{t.reset}'
-
- first, last = group[0], group[-1]
- file1_range = _format_range_unified(first[1], last[2])
- file2_range = _format_range_unified(first[3], last[4])
- yield f'{t.hunk}@@ -{file1_range} +{file2_range} @@{lineterm}{t.reset}'
-
- for tag, i1, i2, j1, j2 in group:
- if tag == 'equal':
- for line in a[i1:i2]:
- yield f'{t.context} {line}{t.reset}'
- continue
- if tag in {'replace', 'delete'}:
- for line in a[i1:i2]:
- yield f'{t.removed}-{line}{t.reset}'
- if tag in {'replace', 'insert'}:
- for line in b[j1:j2]:
- yield f'{t.added}+{line}{t.reset}'
-
-
-########################################################################
-### Context Diff
-########################################################################
-
-def _format_range_context(start, stop):
- 'Convert range to the "ed" format'
- # Per the diff spec at http://www.unix.org/single_unix_specification/
- beginning = start + 1 # lines start numbering with one
- length = stop - start
- if not length:
- beginning -= 1 # empty ranges begin at line just before the range
- if length <= 1:
- return '{}'.format(beginning)
- return '{},{}'.format(beginning, beginning + length - 1)
-
-# See http://www.unix.org/single_unix_specification/
-def context_diff(a, b, fromfile='', tofile='',
- fromfiledate='', tofiledate='', n=3, lineterm='\n'):
- r"""
- Compare two sequences of lines; generate the delta as a context diff.
-
- Context diffs are a compact way of showing line changes and a few
- lines of context. The number of context lines is set by 'n' which
- defaults to three.
-
- By default, the diff control lines (those with *** or ---) are
- created with a trailing newline. This is helpful so that inputs
- created from file.readlines() result in diffs that are suitable for
- file.writelines() since both the inputs and outputs have trailing
- newlines.
-
- For inputs that do not have trailing newlines, set the lineterm
- argument to "" so that the output will be uniformly newline free.
-
- The context diff format normally has a header for filenames and
- modification times. Any or all of these may be specified using
- strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
- The modification times are normally expressed in the ISO 8601 format.
- If not specified, the strings default to blanks.
-
- Example:
-
- >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
- ... 'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
- ... end="")
- *** Original
- --- Current
- ***************
- *** 1,4 ****
- one
- ! two
- ! three
- four
- --- 1,4 ----
- + zero
- one
- ! tree
- four
- """
-
- _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
- prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ')
- started = False
- for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
- if not started:
- started = True
- fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
- todate = '\t{}'.format(tofiledate) if tofiledate else ''
- yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
- yield '--- {}{}{}'.format(tofile, todate, lineterm)
-
- first, last = group[0], group[-1]
- yield '***************' + lineterm
-
- file1_range = _format_range_context(first[1], last[2])
- yield '*** {} ****{}'.format(file1_range, lineterm)
-
- if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
- for tag, i1, i2, _, _ in group:
- if tag != 'insert':
- for line in a[i1:i2]:
- yield prefix[tag] + line
-
- file2_range = _format_range_context(first[3], last[4])
- yield '--- {} ----{}'.format(file2_range, lineterm)
-
- if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
- for tag, _, _, j1, j2 in group:
- if tag != 'delete':
- for line in b[j1:j2]:
- yield prefix[tag] + line
-
-def _check_types(a, b, *args):
- # Checking types is weird, but the alternative is garbled output when
- # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
- # without this check, passing filenames as bytes results in output like
- # --- b'oldfile.txt'
- # +++ b'newfile.txt'
- # because of how str.format() incorporates bytes objects.
- if a and not isinstance(a[0], str):
- raise TypeError('lines to compare must be str, not %s (%r)' %
- (type(a[0]).__name__, a[0]))
- if b and not isinstance(b[0], str):
- raise TypeError('lines to compare must be str, not %s (%r)' %
- (type(b[0]).__name__, b[0]))
- if isinstance(a, str):
- raise TypeError('input must be a sequence of strings, not %s' %
- type(a).__name__)
- if isinstance(b, str):
- raise TypeError('input must be a sequence of strings, not %s' %
- type(b).__name__)
- for arg in args:
- if not isinstance(arg, str):
- raise TypeError('all arguments must be str, not: %r' % (arg,))
-
-def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
- fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
- r"""
- Compare `a` and `b`, two sequences of lines represented as bytes rather
- than str. This is a wrapper for `dfunc`, which is typically either
- unified_diff() or context_diff(). Inputs are losslessly converted to
- strings so that `dfunc` only has to worry about strings, and encoded
- back to bytes on return. This is necessary to compare files with
- unknown or inconsistent encoding. All other inputs (except `n`) must be
- bytes rather than str.
- """
- def decode(s):
- try:
- return s.decode('ascii', 'surrogateescape')
- except AttributeError as err:
- msg = ('all arguments must be bytes, not %s (%r)' %
- (type(s).__name__, s))
- raise TypeError(msg) from err
- a = list(map(decode, a))
- b = list(map(decode, b))
- fromfile = decode(fromfile)
- tofile = decode(tofile)
- fromfiledate = decode(fromfiledate)
- tofiledate = decode(tofiledate)
- lineterm = decode(lineterm)
-
- lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
- for line in lines:
- yield line.encode('ascii', 'surrogateescape')
-
-def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
- r"""
- Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
-
- Optional keyword parameters `linejunk` and `charjunk` are for filter
- functions, or can be None:
-
- - linejunk: A function that should accept a single string argument and
- return true iff the string is junk. The default is None, and is
- recommended; the underlying SequenceMatcher class has an adaptive
- notion of "noise" lines.
-
- - charjunk: A function that accepts a character (string of length
- 1), and returns true iff the character is junk. The default is
- the module-level function IS_CHARACTER_JUNK, which filters out
- whitespace characters (a blank or tab; note: it's a bad idea to
- include newline in this!).
-
- Tools/scripts/ndiff.py is a command-line front-end to this function.
-
- Example:
-
- >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
- ... 'ore\ntree\nemu\n'.splitlines(keepends=True))
- >>> print(''.join(diff), end="")
- - one
- ? ^
- + ore
- ? ^
- - two
- - three
- ? -
- + tree
- + emu
- """
- return Differ(linejunk, charjunk).compare(a, b)
-
-def _mdiff(fromlines, tolines, context=None, linejunk=None,
- charjunk=IS_CHARACTER_JUNK):
- r"""Returns generator yielding marked up from/to side by side differences.
-
- Arguments:
- fromlines -- list of text lines to compared to tolines
- tolines -- list of text lines to be compared to fromlines
- context -- number of context lines to display on each side of difference,
- if None, all from/to text lines will be generated.
- linejunk -- passed on to ndiff (see ndiff documentation)
- charjunk -- passed on to ndiff (see ndiff documentation)
-
- This function returns an iterator which returns a tuple:
- (from line tuple, to line tuple, boolean flag)
-
- from/to line tuple -- (line num, line text)
- line num -- integer or None (to indicate a context separation)
- line text -- original line text with following markers inserted:
- '\0+' -- marks start of added text
- '\0-' -- marks start of deleted text
- '\0^' -- marks start of changed text
- '\1' -- marks end of added/deleted/changed text
-
- boolean flag -- None indicates context separation, True indicates
- either "from" or "to" line contains a change, otherwise False.
-
- This function/iterator was originally developed to generate side by side
- file difference for making HTML pages (see HtmlDiff class for example
- usage).
-
- Note, this function utilizes the ndiff function to generate the side by
- side difference markup. Optional ndiff arguments may be passed to this
- function and they in turn will be passed to ndiff.
- """
- import re
-
- # regular expression for finding intraline change indices
- change_re = re.compile(r'(\++|\-+|\^+)')
-
- # create the difference iterator to generate the differences
- diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
-
- def _make_line(lines, format_key, side, num_lines=[0,0]):
- """Returns line of text with user's change markup and line formatting.
-
- lines -- list of lines from the ndiff generator to produce a line of
- text from. When producing the line of text to return, the
- lines used are removed from this list.
- format_key -- '+' return first line in list with "add" markup around
- the entire line.
- '-' return first line in list with "delete" markup around
- the entire line.
- '?' return first line in list with add/delete/change
- intraline markup (indices obtained from second line)
- None return first line in list with no markup
- side -- indice into the num_lines list (0=from,1=to)
- num_lines -- from/to current line number. This is NOT intended to be a
- passed parameter. It is present as a keyword argument to
- maintain memory of the current line numbers between calls
- of this function.
-
- Note, this function is purposefully not defined at the module scope so
- that data it needs from its parent function (within whose context it
- is defined) does not need to be of module scope.
- """
- num_lines[side] += 1
- # Handle case where no user markup is to be added, just return line of
- # text with user's line format to allow for usage of the line number.
- if format_key is None:
- return (num_lines[side],lines.pop(0)[2:])
- # Handle case of intraline changes
- if format_key == '?':
- text, markers = lines.pop(0), lines.pop(0)
- # find intraline changes (store change type and indices in tuples)
- sub_info = []
- def record_sub_info(match_object,sub_info=sub_info):
- sub_info.append([match_object.group(1)[0],match_object.span()])
- return match_object.group(1)
- change_re.sub(record_sub_info,markers)
- # process each tuple inserting our special marks that won't be
- # noticed by an xml/html escaper.
- for key,(begin,end) in reversed(sub_info):
- text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
- text = text[2:]
- # Handle case of add/delete entire line
- else:
- text = lines.pop(0)[2:]
- # if line of text is just a newline, insert a space so there is
- # something for the user to highlight and see.
- if not text:
- text = ' '
- # insert marks that won't be noticed by an xml/html escaper.
- text = '\0' + format_key + text + '\1'
- # Return line of text, first allow user's line formatter to do its
- # thing (such as adding the line number) then replace the special
- # marks with what the user's change markup.
- return (num_lines[side],text)
-
- def _line_iterator():
- """Yields from/to lines of text with a change indication.
-
- This function is an iterator. It itself pulls lines from a
- differencing iterator, processes them and yields them. When it can
- it yields both a "from" and a "to" line, otherwise it will yield one
- or the other. In addition to yielding the lines of from/to text, a
- boolean flag is yielded to indicate if the text line(s) have
- differences in them.
-
- Note, this function is purposefully not defined at the module scope so
- that data it needs from its parent function (within whose context it
- is defined) does not need to be of module scope.
- """
- lines = []
- num_blanks_pending, num_blanks_to_yield = 0, 0
- while True:
- # Load up next 4 lines so we can look ahead, create strings which
- # are a concatenation of the first character of each of the 4 lines
- # so we can do some very readable comparisons.
- while len(lines) < 4:
- lines.append(next(diff_lines_iterator, 'X'))
- s = ''.join([line[0] for line in lines])
- if s.startswith('X'):
- # When no more lines, pump out any remaining blank lines so the
- # corresponding add/delete lines get a matching blank line so
- # all line pairs get yielded at the next level.
- num_blanks_to_yield = num_blanks_pending
- elif s.startswith('-?+?'):
- # simple intraline change
- yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
- continue
- elif s.startswith('--++'):
- # in delete block, add block coming: we do NOT want to get
- # caught up on blank lines yet, just process the delete line
- num_blanks_pending -= 1
- yield _make_line(lines,'-',0), None, True
- continue
- elif s.startswith(('--?+', '--+', '- ')):
- # in delete block and see an intraline change or unchanged line
- # coming: yield the delete line and then blanks
- from_line,to_line = _make_line(lines,'-',0), None
- num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
- elif s.startswith('-+?'):
- # intraline change
- yield _make_line(lines,None,0), _make_line(lines,'?',1), True
- continue
- elif s.startswith('-?+'):
- # intraline change
- yield _make_line(lines,'?',0), _make_line(lines,None,1), True
- continue
- elif s.startswith('-'):
- # delete FROM line
- num_blanks_pending -= 1
- yield _make_line(lines,'-',0), None, True
- continue
- elif s.startswith('+--'):
- # in add block, delete block coming: we do NOT want to get
- # caught up on blank lines yet, just process the add line
- num_blanks_pending += 1
- yield None, _make_line(lines,'+',1), True
- continue
- elif s.startswith(('+ ', '+-')):
- # will be leaving an add block: yield blanks then add line
- from_line, to_line = None, _make_line(lines,'+',1)
- num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
- elif s.startswith('+'):
- # inside an add block, yield the add line
- num_blanks_pending += 1
- yield None, _make_line(lines,'+',1), True
- continue
- elif s.startswith(' '):
- # unchanged text, yield it to both sides
- yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
- continue
- # Catch up on the blank lines so when we yield the next from/to
- # pair, they are lined up.
- while(num_blanks_to_yield < 0):
- num_blanks_to_yield += 1
- yield None,('','\n'),True
- while(num_blanks_to_yield > 0):
- num_blanks_to_yield -= 1
- yield ('','\n'),None,True
- if s.startswith('X'):
- return
- else:
- yield from_line,to_line,True
-
- def _line_pair_iterator():
- """Yields from/to lines of text with a change indication.
-
- This function is an iterator. It itself pulls lines from the line
- iterator. Its difference from that iterator is that this function
- always yields a pair of from/to text lines (with the change
- indication). If necessary it will collect single from/to lines
- until it has a matching pair from/to pair to yield.
-
- Note, this function is purposefully not defined at the module scope so
- that data it needs from its parent function (within whose context it
- is defined) does not need to be of module scope.
- """
- line_iterator = _line_iterator()
- fromlines,tolines=[],[]
- while True:
- # Collecting lines of text until we have a from/to pair
- while (len(fromlines)==0 or len(tolines)==0):
- try:
- from_line, to_line, found_diff = next(line_iterator)
- except StopIteration:
- return
- if from_line is not None:
- fromlines.append((from_line,found_diff))
- if to_line is not None:
- tolines.append((to_line,found_diff))
- # Once we have a pair, remove them from the collection and yield it
- from_line, fromDiff = fromlines.pop(0)
- to_line, to_diff = tolines.pop(0)
- yield (from_line,to_line,fromDiff or to_diff)
-
- # Handle case where user does not want context differencing, just yield
- # them up without doing anything else with them.
- line_pair_iterator = _line_pair_iterator()
- if context is None:
- yield from line_pair_iterator
- # Handle case where user wants context differencing. We must do some
- # storage of lines until we know for sure that they are to be yielded.
- else:
- context += 1
- lines_to_write = 0
- while True:
- # Store lines up until we find a difference, note use of a
- # circular queue because we only need to keep around what
- # we need for context.
- index, contextLines = 0, [None]*(context)
- found_diff = False
- while(found_diff is False):
- try:
- from_line, to_line, found_diff = next(line_pair_iterator)
- except StopIteration:
- return
- i = index % context
- contextLines[i] = (from_line, to_line, found_diff)
- index += 1
- # Yield lines that we have collected so far, but first yield
- # the user's separator.
- if index > context:
- yield None, None, None
- lines_to_write = context
- else:
- lines_to_write = index
- index = 0
- while(lines_to_write):
- i = index % context
- index += 1
- yield contextLines[i]
- lines_to_write -= 1
- # Now yield the context lines after the change
- lines_to_write = context-1
- try:
- while(lines_to_write):
- from_line, to_line, found_diff = next(line_pair_iterator)
- # If another change within the context, extend the context
- if found_diff:
- lines_to_write = context-1
- else:
- lines_to_write -= 1
- yield from_line, to_line, found_diff
- except StopIteration:
- # Catch exception from next() and return normally
- return
-
-
-_file_template = """
-
-
-
-
-
- Diff comparison
-
-
-
-
- %(table)s%(legend)s
-
-
-"""
-
-_styles = """
- :root {color-scheme: light dark}
- table.diff {
- font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace;
- border: medium;
- }
- .diff_header {
- background-color: #e0e0e0;
- font-weight: bold;
- }
- td.diff_header {
- text-align: right;
- padding: 0 8px;
- }
- .diff_next {
- background-color: #c0c0c0;
- padding: 4px 0;
- }
- .diff_add {background-color:palegreen}
- .diff_chg {background-color:#ffff77}
- .diff_sub {background-color:#ffaaaa}
- table.diff[summary="Legends"] {
- margin-top: 20px;
- border: 1px solid #ccc;
- }
- table.diff[summary="Legends"] th {
- background-color: #e0e0e0;
- padding: 4px 8px;
- }
- table.diff[summary="Legends"] td {
- padding: 4px 8px;
- }
-
- @media (prefers-color-scheme: dark) {
- .diff_header {background-color:#666}
- .diff_next {background-color:#393939}
- .diff_add {background-color:darkgreen}
- .diff_chg {background-color:#847415}
- .diff_sub {background-color:darkred}
- table.diff[summary="Legends"] {border-color:#555}
- table.diff[summary="Legends"] th{background-color:#666}
- }"""
-
-_table_template = """
-
-
-
- %(header_row)s
-
-%(data_rows)s
-
"""
-
-_legend = """
-
- | Legends |
-
- | Colors |
- | Added |
- | Changed |
- | Deleted |
- |
-
- | Links |
- | (f)irst change |
- | (n)ext change |
- | (t)op |
- |
-
"""
-
-class HtmlDiff(object):
- """For producing HTML side by side comparison with change highlights.
-
- This class can be used to create an HTML table (or a complete HTML file
- containing the table) showing a side by side, line by line comparison
- of text with inter-line and intra-line change highlights. The table can
- be generated in either full or contextual difference mode.
-
- The following methods are provided for HTML generation:
-
- make_table -- generates HTML for a single side by side table
- make_file -- generates complete HTML file with a single side by side table
-
- See Doc/includes/diff.py for an example usage of this class.
- """
-
- _file_template = _file_template
- _styles = _styles
- _table_template = _table_template
- _legend = _legend
- _default_prefix = 0
-
- def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
- charjunk=IS_CHARACTER_JUNK):
- """HtmlDiff instance initializer
-
- Arguments:
- tabsize -- tab stop spacing, defaults to 8.
- wrapcolumn -- column number where lines are broken and wrapped,
- defaults to None where lines are not wrapped.
- linejunk,charjunk -- keyword arguments passed into ndiff() (used by
- HtmlDiff() to generate the side by side HTML differences). See
- ndiff() documentation for argument default values and descriptions.
- """
- self._tabsize = tabsize
- self._wrapcolumn = wrapcolumn
- self._linejunk = linejunk
- self._charjunk = charjunk
-
- def make_file(self, fromlines, tolines, fromdesc='', todesc='',
- context=False, numlines=5, *, charset='utf-8'):
- """Returns HTML file of side by side comparison with change highlights
-
- Arguments:
- fromlines -- list of "from" lines
- tolines -- list of "to" lines
- fromdesc -- "from" file column header string
- todesc -- "to" file column header string
- context -- set to True for contextual differences (defaults to False
- which shows full differences).
- numlines -- number of context lines. When context is set True,
- controls number of lines displayed before and after the change.
- When context is False, controls the number of lines to place
- the "next" link anchors before the next change (so click of
- "next" link jumps to just before the change).
- charset -- charset of the HTML document
- """
-
- return (self._file_template % dict(
- styles=self._styles,
- legend=self._legend,
- table=self.make_table(fromlines, tolines, fromdesc, todesc,
- context=context, numlines=numlines),
- charset=charset
- )).encode(charset, 'xmlcharrefreplace').decode(charset)
-
- def _tab_newline_replace(self,fromlines,tolines):
- """Returns from/to line lists with tabs expanded and newlines removed.
-
- Instead of tab characters being replaced by the number of spaces
- needed to fill in to the next tab stop, this function will fill
- the space with tab characters. This is done so that the difference
- algorithms can identify changes in a file when tabs are replaced by
- spaces and vice versa. At the end of the HTML generation, the tab
- characters will be replaced with a nonbreakable space.
- """
- def expand_tabs(line):
- # hide real spaces
- line = line.replace(' ','\0')
- # expand tabs into spaces
- line = line.expandtabs(self._tabsize)
- # replace spaces from expanded tabs back into tab characters
- # (we'll replace them with markup after we do differencing)
- line = line.replace(' ','\t')
- return line.replace('\0',' ').rstrip('\n')
- fromlines = [expand_tabs(line) for line in fromlines]
- tolines = [expand_tabs(line) for line in tolines]
- return fromlines,tolines
-
- def _split_line(self,data_list,line_num,text):
- """Builds list of text lines by splitting text lines at wrap point
-
- This function will determine if the input text line needs to be
- wrapped (split) into separate lines. If so, the first wrap point
- will be determined and the first line appended to the output
- text line list. This function is used recursively to handle
- the second part of the split line to further split it.
- """
- # if blank line or context separator, just add it to the output list
- if not line_num:
- data_list.append((line_num,text))
- return
-
- # if line text doesn't need wrapping, just add it to the output list
- size = len(text)
- max = self._wrapcolumn
- if (size <= max) or ((size -(text.count('\0')*3)) <= max):
- data_list.append((line_num,text))
- return
-
- # scan text looking for the wrap point, keeping track if the wrap
- # point is inside markers
- i = 0
- n = 0
- mark = ''
- while n < max and i < size:
- if text[i] == '\0':
- i += 1
- mark = text[i]
- i += 1
- elif text[i] == '\1':
- i += 1
- mark = ''
- else:
- i += 1
- n += 1
-
- # wrap point is inside text, break it up into separate lines
- line1 = text[:i]
- line2 = text[i:]
-
- # if wrap point is inside markers, place end marker at end of first
- # line and start marker at beginning of second line because each
- # line will have its own table tag markup around it.
- if mark:
- line1 = line1 + '\1'
- line2 = '\0' + mark + line2
-
- # tack on first line onto the output list
- data_list.append((line_num,line1))
-
- # use this routine again to wrap the remaining text
- self._split_line(data_list,'>',line2)
-
- def _line_wrapper(self,diffs):
- """Returns iterator that splits (wraps) mdiff text lines"""
-
- # pull from/to data and flags from mdiff iterator
- for fromdata,todata,flag in diffs:
- # check for context separators and pass them through
- if flag is None:
- yield fromdata,todata,flag
- continue
- (fromline,fromtext),(toline,totext) = fromdata,todata
- # for each from/to line split it at the wrap column to form
- # list of text lines.
- fromlist,tolist = [],[]
- self._split_line(fromlist,fromline,fromtext)
- self._split_line(tolist,toline,totext)
- # yield from/to line in pairs inserting blank lines as
- # necessary when one side has more wrapped lines
- while fromlist or tolist:
- if fromlist:
- fromdata = fromlist.pop(0)
- else:
- fromdata = ('',' ')
- if tolist:
- todata = tolist.pop(0)
- else:
- todata = ('',' ')
- yield fromdata,todata,flag
-
- def _collect_lines(self,diffs):
- """Collects mdiff output into separate lists
-
- Before storing the mdiff from/to data into a list, it is converted
- into a single line of text with HTML markup.
- """
-
- fromlist,tolist,flaglist = [],[],[]
- # pull from/to data and flags from mdiff style iterator
- for fromdata,todata,flag in diffs:
- try:
- # store HTML markup of the lines into the lists
- fromlist.append(self._format_line(0,flag,*fromdata))
- tolist.append(self._format_line(1,flag,*todata))
- except TypeError:
- # exceptions occur for lines where context separators go
- fromlist.append(None)
- tolist.append(None)
- flaglist.append(flag)
- return fromlist,tolist,flaglist
-
- def _format_line(self,side,flag,linenum,text):
- """Returns HTML markup of "from" / "to" text lines
-
- side -- 0 or 1 indicating "from" or "to" text
- flag -- indicates if difference on line
- linenum -- line number (used for line number column)
- text -- line text to be marked up
- """
- try:
- linenum = '%d' % linenum
- id = ' id="%s%s"' % (self._prefix[side],linenum)
- except TypeError:
- # handle blank lines where linenum is '>' or ''
- id = ''
- # replace those things that would get confused with HTML symbols
- text=text.replace("&","&").replace(">",">").replace("<","<")
-
- # make space non-breakable so they don't get compressed or line wrapped
- text = text.replace(' ',' ').rstrip()
-
- # add a class to the td tag if there is a difference on the line
- css_class = ' class="diff_changed" ' if flag else ' '
-
- return f'' \
- + f'{text} | '
-
- def _make_prefix(self):
- """Create unique anchor prefixes"""
-
- # Generate a unique anchor prefix so multiple tables
- # can exist on the same HTML page without conflicts.
- fromprefix = "from%d_" % HtmlDiff._default_prefix
- toprefix = "to%d_" % HtmlDiff._default_prefix
- HtmlDiff._default_prefix += 1
- # store prefixes so line format method has access
- self._prefix = [fromprefix,toprefix]
-
- def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
- """Makes list of "next" links"""
-
- # all anchor names will be generated using the unique "to" prefix
- toprefix = self._prefix[1]
-
- # process change flags, generating middle column of next anchors/links
- next_id = ['']*len(flaglist)
- next_href = ['']*len(flaglist)
- num_chg, in_change = 0, False
- last = 0
- for i,flag in enumerate(flaglist):
- if flag:
- if not in_change:
- in_change = True
- last = i
- # at the beginning of a change, drop an anchor a few lines
- # (the context lines) before the change for the previous
- # link
- i = max([0,i-numlines])
- next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
- # at the beginning of a change, drop a link to the next
- # change
- num_chg += 1
- next_href[last] = 'n' % (
- toprefix,num_chg)
- else:
- in_change = False
- # check for cases where there is no content to avoid exceptions
- if not flaglist:
- flaglist = [False]
- next_id = ['']
- next_href = ['']
- last = 0
- if context:
- fromlist = [' | No Differences Found | ']
- tolist = fromlist
- else:
- fromlist = tolist = [' | Empty File | ']
- # if not a change on first line, drop a link
- if not flaglist[0]:
- next_href[0] = 'f' % toprefix
- # redo the last link to link to the top
- next_href[last] = 't' % (toprefix)
-
- return fromlist,tolist,flaglist,next_href,next_id
-
- def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
- numlines=5):
- """Returns HTML table of side by side comparison with change highlights
-
- Arguments:
- fromlines -- list of "from" lines
- tolines -- list of "to" lines
- fromdesc -- "from" file column header string
- todesc -- "to" file column header string
- context -- set to True for contextual differences (defaults to False
- which shows full differences).
- numlines -- number of context lines. When context is set True,
- controls number of lines displayed before and after the change.
- When context is False, controls the number of lines to place
- the "next" link anchors before the next change (so click of
- "next" link jumps to just before the change).
- """
-
- # make unique anchor prefixes so that multiple tables may exist
- # on the same page without conflict.
- self._make_prefix()
-
- # change tabs to spaces before it gets more difficult after we insert
- # markup
- fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
-
- # create diffs iterator which generates side by side from/to data
- if context:
- context_lines = numlines
- else:
- context_lines = None
- diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
- charjunk=self._charjunk)
-
- # set up iterator to wrap lines that exceed desired width
- if self._wrapcolumn:
- diffs = self._line_wrapper(diffs)
-
- # collect up from/to lines and flags into lists (also format the lines)
- fromlist,tolist,flaglist = self._collect_lines(diffs)
-
- # process change flags, generating middle column of next anchors/links
- fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
- fromlist,tolist,flaglist,context,numlines)
-
- s = []
- fmt = ' | %s | %s' + \
- '%s | %s
\n'
- for i in range(len(flaglist)):
- if flaglist[i] is None:
- # mdiff yields None on separator lines skip the bogus ones
- # generated for the first line
- if i > 0:
- s.append('
\n \n')
- else:
- s.append( fmt % (next_id[i],next_href[i],fromlist[i],
- next_href[i],tolist[i]))
- if fromdesc or todesc:
- header_row = '%s%s%s%s
' % (
- '
| ',
- '' % fromdesc,
- '
| ',
- '' % todesc)
- else:
- header_row = ''
-
- table = self._table_template % dict(
- data_rows=''.join(s),
- header_row=header_row,
- prefix=self._prefix[1])
-
- return table.replace('\0+',''). \
- replace('\0-',''). \
- replace('\0^',''). \
- replace('\1',''). \
- replace('\t',' ')
-
-
-def restore(delta, which):
- r"""
- Generate one of the two sequences that generated a delta.
-
- Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
- lines originating from file 1 or 2 (parameter `which`), stripping off line
- prefixes.
-
- Examples:
+This module dispatches to a faster C-coded SequenceMatcher (the
+``_difflib`` accelerator module) when available, falling back to the
+pure-Python reference implementation in ``_pydifflib``. The pure-Python
+module is preserved so that alternative Python implementations have a
+self-contained reference; CPython prefers the C version automatically.
+"""
- >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
- ... 'ore\ntree\nemu\n'.splitlines(keepends=True))
- >>> diff = list(diff)
- >>> print(''.join(restore(diff, 1)), end="")
- one
- two
- three
- >>> print(''.join(restore(diff, 2)), end="")
- ore
- tree
- emu
- """
- try:
- tag = {1: "- ", 2: "+ "}[int(which)]
- except KeyError:
- raise ValueError('unknown delta choice (must be 1 or 2): %r'
- % which) from None
- prefixes = (" ", tag)
- for line in delta:
- if line[:2] in prefixes:
- yield line[2:]
+from _pydifflib import * # noqa: F401, F403
+from _pydifflib import __all__ # noqa: F401
+from _pydifflib import SequenceMatcher as _PySequenceMatcher
+# Private helpers referenced by the test suite and (potentially) by other
+# stdlib callers; re-exported to keep ``difflib.X`` working transparently.
+from _pydifflib import ( # noqa: F401
+ _calculate_ratio,
+ _format_range_context,
+ _format_range_unified,
+ _mdiff,
+)
+# can_colorize / get_theme were lazily-imported names on the original
+# Lib/difflib.py module; re-export them here so they remain runtime
+# attributes of ``difflib`` (test_pyclbr.test_easy verifies this) while
+# keeping the lazy contract (test_difflib.LazyImportTest verifies that
+# importing difflib does not import ``_colorize``).
+lazy from _pydifflib import can_colorize, get_theme # noqa: F401
+from _pydifflib import SequenceMatcher as _PySequenceMatcher
+from types import GenericAlias as _GenericAlias
+
+# Use the C-accelerated SequenceMatcher when available. The C type covers
+# the hot methods (__init__, set_seqs/set_seq1/set_seq2, find_longest_match,
+# get_matching_blocks, get_opcodes, ratio); the slow-path methods that the
+# rest of the module needs (quick_ratio, real_quick_ratio,
+# get_grouped_opcodes) are forwarded to the pure-Python class.
+try:
+ # Imported under its own name (not aliased) so pyclbr's static analysis
+ # sees the subclass's base as ``SequenceMatcher`` -- matching the
+ # runtime ``__bases__[0].__name__`` from the C type.
+ from _difflib import SequenceMatcher
+except ImportError:
+ pass
+else:
+ class SequenceMatcher(SequenceMatcher): # noqa: F811
+ __doc__ = _PySequenceMatcher.__doc__
+ __class_getitem__ = classmethod(_GenericAlias)
+
+ # Forward the pure-Python slow-path methods. These are defined as
+ # ``def``s (rather than direct attribute assignments) so the source
+ # parser used by pyclbr sees them as methods of this class --
+ # otherwise test_pyclbr.test_easy reports them as missing.
+ def quick_ratio(self):
+ return _PySequenceMatcher.quick_ratio(self)
+
+ def real_quick_ratio(self):
+ return _PySequenceMatcher.real_quick_ratio(self)
+
+ def get_grouped_opcodes(self, n=3):
+ return _PySequenceMatcher.get_grouped_opcodes(self, n)
+
+ # Re-bind the name inside _pydifflib so the helper functions defined
+ # there (unified_diff, context_diff, ndiff, get_close_matches, Differ,
+ # HtmlDiff) -- which look up ``SequenceMatcher`` in their own module's
+ # globals -- pick up the C-accelerated subclass instead of the
+ # pure-Python class. Without this rebind, ``difflib.unified_diff`` would
+ # see no speedup from the accelerator even though ``difflib.SequenceMatcher``
+ # itself is the C class.
+ import _pydifflib as _pyd
+ _pyd.SequenceMatcher = SequenceMatcher
+ del _pyd
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index 46c9b2c1d8c9fc..9d6051d70a9206 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -1,4 +1,5 @@
import difflib
+import _pydifflib
from test import support
from test.support import findfile, force_colorized
from test.support.import_helper import ensure_lazy_imports
@@ -7,6 +8,28 @@
import sys
+# Tests below reference ``difflib.SequenceMatcher``. By default that is the
+# C-accelerated subclass (when ``_difflib`` is available) or the pure-Python
+# class otherwise. The mixin below temporarily swaps it to the pure-Python
+# class from ``_pydifflib`` so the same test suite covers both implementations
+# whenever the accelerator is built.
+_PySequenceMatcher = _pydifflib.SequenceMatcher
+_has_c_accelerator = difflib.SequenceMatcher is not _PySequenceMatcher
+
+
+class _PyImplMixin:
+ """Run a TestCase with ``difflib.SequenceMatcher`` patched to pure Python."""
+
+ def setUp(self):
+ super().setUp()
+ self._orig_SequenceMatcher = difflib.SequenceMatcher
+ difflib.SequenceMatcher = _PySequenceMatcher
+
+ def tearDown(self):
+ difflib.SequenceMatcher = self._orig_SequenceMatcher
+ super().tearDown()
+
+
class TestWithAscii(unittest.TestCase):
def test_one_insert(self):
sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
@@ -201,6 +224,10 @@ class TestSFpatches(unittest.TestCase):
def test_html_diff(self):
# Check SF patch 914575 for generating HTML differences
+ # Reset the global ``HtmlDiff._default_prefix`` counter so that
+ # generated element IDs are stable when this test runs twice
+ # (e.g. once per implementation; see _PyImplMixin above).
+ difflib.HtmlDiff._default_prefix = 0
f1a = ((patch914575_from1 + '123\n'*10)*3)
t1a = (patch914575_to1 + '123\n'*10)*3
f1b = '456\n'*10 + f1a
@@ -657,5 +684,31 @@ def load_tests(loader, tests, pattern):
return tests
+# When the C accelerator is present, generate a parallel ``*_PurePython``
+# class for each TestCase above so the same tests run against the pure-Python
+# implementation as well. Tests that probe import behaviour (LazyImportTest)
+# or are inherently implementation-specific are skipped.
+def _generate_pure_python_variants():
+ if not _has_c_accelerator:
+ return
+ skip = {"LazyImportTest"}
+ module = sys.modules[__name__]
+ for name in list(vars(module)):
+ cls = getattr(module, name)
+ if (isinstance(cls, type)
+ and issubclass(cls, unittest.TestCase)
+ and cls is not unittest.TestCase
+ and not name.endswith("_PurePython")
+ and name not in skip):
+ new_name = name + "_PurePython"
+ new_cls = type(new_name, (_PyImplMixin, cls), {})
+ setattr(module, new_name, new_cls)
+
+
+_generate_pure_python_variants()
+
+
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2026-05-21-10-00-30.gh-issue-150184.P1MnrD.rst b/Misc/NEWS.d/next/Library/2026-05-21-10-00-30.gh-issue-150184.P1MnrD.rst
new file mode 100644
index 00000000000000..281a986560129c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-21-10-00-30.gh-issue-150184.P1MnrD.rst
@@ -0,0 +1,7 @@
+Add a C accelerator (``_difflib``) for :class:`difflib.SequenceMatcher`.
+Output is bit-identical to the pure-Python implementation; typical
+workloads run 5--15x faster, character-level diffs of long strings up to
+9x, and ``bytes`` diffs up to 15x. The pure-Python reference
+implementation is preserved as :mod:`!_pydifflib` so alternative Python
+implementations have a self-contained fallback.
+Patch by Ben Hsing.
diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in
index c3dd47a5e40a67..9e2760aa88b684 100644
--- a/Modules/Setup.stdlib.in
+++ b/Modules/Setup.stdlib.in
@@ -34,6 +34,7 @@
@MODULE_ARRAY_TRUE@array arraymodule.c
@MODULE__BISECT_TRUE@_bisect _bisectmodule.c
@MODULE__CSV_TRUE@_csv _csv.c
+@MODULE__DIFFLIB_TRUE@_difflib _difflibmodule.c
@MODULE__HEAPQ_TRUE@_heapq _heapqmodule.c
@MODULE__JSON_TRUE@_json _json.c
@MODULE__LSPROF_TRUE@_lsprof _lsprof.c rotatingtree.c
diff --git a/Modules/_difflibmodule.c b/Modules/_difflibmodule.c
new file mode 100644
index 00000000000000..79c56046cacd9b
--- /dev/null
+++ b/Modules/_difflibmodule.c
@@ -0,0 +1,1736 @@
+/*
+ * _difflib accelerator module.
+ *
+ * Provides a fast C implementation of difflib.SequenceMatcher that is used
+ * by Lib/difflib.py when available. The algorithm mirrors the pure-Python
+ * implementation exactly (output is bit-identical, including tie breaks);
+ * the performance comes from operating on integer-label arrays rather than
+ * Python objects in the inner DP and recursion loops.
+ *
+ * The implementation was built incrementally; comments throughout the file
+ * use [phase N] tags to mark code that was added in each optimisation
+ * step. See the design discussion at
+ * https://discuss.python.org/t/
+ * for benchmarks per phase.
+ *
+ * Phase summary:
+ * [phase 1] C port of find_longest_match: paired j2len_val/j2len_ver
+ * int arrays + generation counter replace the per-row
+ * j2len = {} dict. Lives in flm_core().
+ * [phase 2] C port of chain_b: builds b2j without per-element
+ * setdefault/append. Type-specialised iteration of b for
+ * str/list/tuple/bytes. Lives in chain_b().
+ * [phase 3] Full Ratcliff-Obershelp recursion in C: position-indexed
+ * int32 label arrays (a_lbl, a_dp, b_lbl, junk_mask) carry
+ * the work; DP and extension passes are pure C. Lives in
+ * flm_core() (extension passes) and compute_matching_blocks().
+ * [phase 4] Codepoint-keyed cp_full[] / cp_dp[] lookup tables for
+ * str. Lives in chain_b() (table construction, str branch)
+ * and build_a_labels() (str fast path).
+ * [phase 5] Bytes fast path (cp arrays of size 256), persistent DP
+ * scratch (j2len_val2/ver2, no per-call alloca/memset),
+ * skip the max-codepoint scan for UCS1 strings. Lives in
+ * chain_b() (bytes branch + UCS1 shortcut) and flm_core()
+ * (persistent scratch).
+ */
+
+#define PY_SSIZE_T_CLEAN
+
+// clinic/_difflibmodule.c.h uses internal pycore_modsupport.h API
+#ifndef Py_BUILD_CORE_BUILTIN
+# define Py_BUILD_CORE_MODULE 1
+#endif
+
+#include "Python.h"
+#include
+#include
+#include
+#include
+
+
+/*[clinic input]
+module _difflib
+class _difflib.SequenceMatcher "SequenceMatcherObject *" "clinic_state()->SequenceMatcher_Type"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=ee484bc2f95ade86]*/
+
+
+/* ====================================================================== */
+/* Module state and per-instance state. */
+/* ====================================================================== */
+
+typedef struct {                         /* per-module state, fetched with get_module_state() */
+    PyTypeObject *SequenceMatcher_Type;  /* type object the clinic code reaches via clinic_state() */
+    PyObject *Match;                     /* difflib.Match namedtuple; build_match() calls it as Match(i, j, k) */
+} _difflib_state;
+
+/* Return the _difflib_state stored on `module`.  The state is asserted to
+   be non-NULL; the pointer is not otherwise validated. */
+static inline _difflib_state *
+get_module_state(PyObject *module)
+{
+    _difflib_state *st = (_difflib_state *)PyModule_GetState(module);
+    assert(st != NULL);
+    return st;
+}
+
+static struct PyModuleDef _difflib_module;
+
+typedef struct {
+    PyObject_HEAD
+
+    /* Public attributes (mirror difflib.SequenceMatcher). */
+    PyObject *isjunk;           /* callable or None; chain_b() calls it once per distinct element of b */
+    PyObject *a;                /* current sequence a */
+    PyObject *b;                /* current sequence b */
+    PyObject *b2j;              /* dict: elt -> list[int] of positions in b; junk and popular keys are deleted by chain_b() */
+    PyObject *bjunk;            /* set of elements for which isjunk() returned true */
+    PyObject *bpopular;         /* set of elements dropped by the autojunk heuristic */
+    PyObject *matching_blocks;  /* cached list[Match] or None (the C slot is NULL after invalidate_caches()) */
+    PyObject *opcodes;          /* cached list[tuple] or None (the C slot is NULL after invalidate_caches()) */
+    PyObject *fullbcount;       /* cached dict or None (quick_ratio); reset to None by __init__ and set_seq2 */
+    int autojunk;               /* nonzero: chain_b() applies the popularity heuristic when len(b) >= 200 */
+
+    /* Private C state. */
+    Py_ssize_t la;              /* len(a) as seen by the last successful build_a_labels() */
+    Py_ssize_t lb;              /* len(b) as seen by the last successful chain_b() */
+    /* [phase 3] Integer-label arrays.  Every distinct element of b gets a
+       "full label"; b2j-survivors also get a "DP label" (-1 otherwise).
+       Position-indexed so the DP and extension passes never touch a
+       PyObject in the hot loop.
+       All raw arrays below are PyMem allocations: the b-side ones are
+       released by free_b_state(), a_lbl/a_dp by free_a_state(). */
+    Py_ssize_t nlbl;            /* number of DP labels */
+    int32_t *a_lbl;             /* len la, full label or -1 */
+    int32_t *a_dp;              /* len la, DP label or -1 */
+    int32_t *b_lbl;             /* len lb, full label */
+    int32_t *jbuf;              /* concatenated index lists per DP label */
+    int32_t *jstart;            /* len nlbl; offset of each label's run inside jbuf */
+    int32_t *jcount;            /* len nlbl; number of entries in each label's run */
+    uint8_t *junk_mask;         /* len lb, 1 if b[i] in bjunk */
+    PyObject *elt_to_lbl_full;  /* dict: elt -> full label (int), one key per distinct element of b */
+    PyObject *elt_to_lbl_dp;    /* dict: elt -> DP label (int), only keys still present in b2j */
+    /* [phase 4/5] Codepoint-keyed lookup tables for str/bytes inputs.
+       Phase 4 added them for str; phase 5 added the bytes branch.  NULL
+       when neither applies. */
+    int32_t *cp_full;           /* codepoint-keyed fast path (str/bytes): code point or byte -> full label, -1 if absent */
+    int32_t *cp_dp;             /* same index space as cp_full, value is the DP label or -1 */
+    Py_ssize_t cp_max_plus1;    /* number of entries in cp_full/cp_dp; 0 when the tables are unused */
+    /* [phase 1] j2len_val/ver pair: paired int + generation array
+       replacing the per-row Python `j2len = {}` dict.
+       [phase 5] j2len_val2/ver2: persistent second pair so flm_core
+       no longer alloca()s + memset()s per call. */
+    Py_ssize_t *j2len_val;
+    Py_ssize_t *j2len_ver;
+    Py_ssize_t *j2len_val2;
+    Py_ssize_t *j2len_ver2;
+    Py_ssize_t j2len_size;      /* entries in each j2len_* array; chain_b() sets it to lb + 1 */
+    Py_ssize_t gen;             /* last generation stamp handed out by flm_core(); chain_b() resets it to 0 */
+
+    int b_ready;                /* chain_b() has run for current b */
+    int a_ready;                /* a labels built for current a */
+} SequenceMatcherObject;
+
+#include "clinic/_difflibmodule.c.h"
+
+
+/* ====================================================================== */
+/* Small helpers. */
+/* ====================================================================== */
+
+/* Release everything chain_b() derived from b and return the b-side fields
+   to their empty state.  Does not touch b2j/bjunk/bpopular or self->b. */
+static void
+free_b_state(SequenceMatcherObject *self)
+{
+    /* Raw buffers first: free each one and null its slot immediately. */
+    PyMem_Free(self->b_lbl);
+    self->b_lbl = NULL;
+    PyMem_Free(self->jbuf);
+    self->jbuf = NULL;
+    PyMem_Free(self->jstart);
+    self->jstart = NULL;
+    PyMem_Free(self->jcount);
+    self->jcount = NULL;
+    PyMem_Free(self->junk_mask);
+    self->junk_mask = NULL;
+    PyMem_Free(self->cp_full);
+    self->cp_full = NULL;
+    PyMem_Free(self->cp_dp);
+    self->cp_dp = NULL;
+    PyMem_Free(self->j2len_val);
+    self->j2len_val = NULL;
+    PyMem_Free(self->j2len_ver);
+    self->j2len_ver = NULL;
+    PyMem_Free(self->j2len_val2);
+    self->j2len_val2 = NULL;
+    PyMem_Free(self->j2len_ver2);
+    self->j2len_ver2 = NULL;
+
+    /* Then the label dictionaries. */
+    Py_CLEAR(self->elt_to_lbl_full);
+    Py_CLEAR(self->elt_to_lbl_dp);
+
+    /* Finally the scalar bookkeeping. */
+    self->lb = 0;
+    self->nlbl = 0;
+    self->cp_max_plus1 = 0;
+    self->j2len_size = 0;
+    self->gen = 0;
+    self->b_ready = 0;
+}
+
+/* Release the label arrays built from a and mark the a-side as not ready. */
+static void
+free_a_state(SequenceMatcherObject *self)
+{
+    PyMem_Free(self->a_lbl);
+    self->a_lbl = NULL;
+    PyMem_Free(self->a_dp);
+    self->a_dp = NULL;
+
+    self->la = 0;
+    self->a_ready = 0;
+}
+
+static void
+invalidate_caches(SequenceMatcherObject *self)
+{   /* Drop the cached results; the callers in this file invoke it after rebinding a or b. */
+    Py_CLEAR(self->matching_blocks);   /* slot becomes NULL, not None */
+    Py_CLEAR(self->opcodes);           /* slot becomes NULL, not None */
+}
+
+
+/* ====================================================================== */
+/* chain_b: build b2j, bjunk, bpopular, and integer-label state. */
+/* */
+/* [phase 2] This is the C port of SequenceMatcher.__chain_b(). The */
+/* per-element setdefault/append loop from Python is replaced by direct */
+/* dict access, with type-specialised reads for str/list/tuple/bytes so */
+/* we never go through PySequence_GetItem for those four common cases. */
+/* */
+/* [phase 3] After building b2j, we assign small integer labels: a "full */
+/* label" for every distinct element of b (used by the extension passes */
+/* via b_lbl[]) and a separate "DP label" for elements that survived */
+/* junk/popular pruning (used by the DP via a_dp[]/jbuf[]). */
+/* */
+/* [phase 4/5] If b is str or bytes, we also build codepoint-keyed */
+/* lookup tables cp_full[]/cp_dp[] so build_a_labels can skip dict probes */
+/* entirely. See the cp_sz construction near the bottom of the function. */
+/* ====================================================================== */
+
+/*
+ * Rebuild all b-derived state from self->b.
+ *
+ * On success (return 0) the instance receives fresh b2j/bjunk/bpopular
+ * objects, the integer-label arrays, the DP scratch rows and, for str or
+ * bytes b, the codepoint tables; b_ready is set and a_ready is cleared.
+ * On failure (return -1, exception set) nothing is committed: the previous
+ * b-side state of `self` is left exactly as it was.
+ *
+ * Every temporary is declared up front and released at `done`, so error
+ * paths only need `goto done`.
+ */
+static int
+chain_b(SequenceMatcherObject *self)
+{
+    /* Python temporaries; all released at `done`. */
+    PyObject *b2j = NULL;
+    PyObject *bjunk = NULL;
+    PyObject *bpopular = NULL;
+    PyObject *elt_to_lbl_full = NULL;
+    PyObject *elt_to_lbl_dp = NULL;
+    PyObject *junk_keys = NULL;
+    PyObject *pop_keys = NULL;
+    PyObject *iter_keys = NULL;
+    PyObject *items = NULL;
+    /* Raw buffers.  Each one is handed to `self` on success and reset to
+       NULL here, so the unconditional PyMem_Free() calls at `done` only
+       release what was never committed. */
+    int32_t *b_lbl = NULL;
+    uint8_t *junk_mask = NULL;
+    int32_t *jbuf = NULL;
+    int32_t *jstart = NULL;
+    int32_t *jcount = NULL;
+    Py_ssize_t *jv = NULL;
+    Py_ssize_t *jver = NULL;
+    Py_ssize_t *jv2 = NULL;
+    Py_ssize_t *jver2 = NULL;
+    int32_t *cp_full = NULL;
+    int32_t *cp_dp = NULL;
+    int rc = -1;
+
+    /* Strong references: __hash__, __eq__ and isjunk run arbitrary Python
+       code below and could rebind self->b or self->isjunk underneath us. */
+    PyObject *b = Py_XNewRef(self->b);
+    PyObject *isjunk = Py_XNewRef(self->isjunk);
+
+    Py_ssize_t lb = PyObject_Length(b);
+    if (lb < 0) {
+        goto done;
+    }
+    /* Positions and counts are stored as int32_t (jbuf/jstart/jcount);
+       refuse inputs that would be silently truncated. */
+    if (lb > INT32_MAX) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "sequence b is too long for the C SequenceMatcher");
+        goto done;
+    }
+
+    b2j = PyDict_New();
+    bjunk = PySet_New(NULL);
+    bpopular = PySet_New(NULL);
+    if (b2j == NULL || bjunk == NULL || bpopular == NULL) {
+        goto done;
+    }
+
+    int is_str = PyUnicode_Check(b);
+    int is_list = PyList_Check(b);
+    int is_tuple = PyTuple_Check(b);
+    int is_bytes = PyBytes_Check(b);
+    int kind = 0;
+    const void *udata = NULL;
+    const unsigned char *bdata = NULL;
+
+    if (is_str) {
+        if (PyUnicode_READY(b) < 0) {
+            goto done;
+        }
+        kind = PyUnicode_KIND(b);
+        udata = PyUnicode_DATA(b);
+    }
+    else if (is_bytes) {
+        bdata = (const unsigned char *)PyBytes_AS_STRING(b);
+    }
+
+    /* Pass 1: build b2j (elt -> list of indices). */
+    for (Py_ssize_t i = 0; i < lb; i++) {
+        PyObject *elt;
+        if (is_str) {
+            elt = PyUnicode_FromOrdinal(PyUnicode_READ(kind, udata, i));
+        }
+        else if (is_bytes) {
+            elt = PyLong_FromLong(bdata[i]);
+        }
+        else if (is_list) {
+            /* The dict operations of an earlier iteration may have run
+               user code that shrank the list; never index past its end. */
+            if (i >= PyList_GET_SIZE(b)) {
+                PyErr_SetString(PyExc_RuntimeError,
+                                "sequence b changed size during iteration");
+                goto done;
+            }
+            elt = Py_NewRef(PyList_GET_ITEM(b, i));
+        }
+        else if (is_tuple) {
+            elt = Py_NewRef(PyTuple_GET_ITEM(b, i));
+        }
+        else {
+            elt = PySequence_GetItem(b, i);
+        }
+        if (elt == NULL) {
+            goto done;
+        }
+
+        /* Equivalent of b2j.setdefault(elt, []).append(i).  We keep our
+           own reference to the list, so no second lookup is needed after
+           inserting a new one. */
+        PyObject *lst = PyDict_GetItemWithError(b2j, elt);
+        if (lst != NULL) {
+            Py_INCREF(lst);
+        }
+        else {
+            if (PyErr_Occurred()) {
+                Py_DECREF(elt);
+                goto done;
+            }
+            lst = PyList_New(0);
+            if (lst == NULL || PyDict_SetItem(b2j, elt, lst) < 0) {
+                Py_XDECREF(lst);
+                Py_DECREF(elt);
+                goto done;
+            }
+        }
+        PyObject *idx = PyLong_FromSsize_t(i);
+        int append_rc = (idx == NULL) ? -1 : PyList_Append(lst, idx);
+        Py_XDECREF(idx);
+        Py_DECREF(lst);
+        Py_DECREF(elt);
+        if (append_rc < 0) {
+            goto done;
+        }
+    }
+
+    /* Assign FULL labels to all distinct b elements; fill b_lbl. */
+    elt_to_lbl_full = PyDict_New();
+    if (elt_to_lbl_full == NULL) {
+        goto done;
+    }
+    b_lbl = (int32_t *)PyMem_Malloc(
+        sizeof(int32_t) * (size_t)(lb > 0 ? lb : 1));
+    junk_mask = (uint8_t *)PyMem_Calloc((size_t)(lb > 0 ? lb : 1), 1);
+    if (b_lbl == NULL || junk_mask == NULL) {
+        PyErr_NoMemory();
+        goto done;
+    }
+    {
+        PyObject *k, *v;
+        Py_ssize_t pos = 0;
+        int32_t lbl = 0;
+        while (PyDict_Next(b2j, &pos, &k, &v)) {
+            PyObject *lbl_obj = PyLong_FromLong(lbl);
+            if (lbl_obj == NULL) {
+                goto done;
+            }
+            int set_rc = PyDict_SetItem(elt_to_lbl_full, k, lbl_obj);
+            Py_DECREF(lbl_obj);
+            if (set_rc < 0) {
+                goto done;
+            }
+            Py_ssize_t n = PyList_GET_SIZE(v);
+            for (Py_ssize_t i = 0; i < n; i++) {
+                Py_ssize_t bi = PyLong_AsSsize_t(PyList_GET_ITEM(v, i));
+                if (bi == -1 && PyErr_Occurred()) {
+                    goto done;
+                }
+                b_lbl[bi] = lbl;
+            }
+            lbl++;
+        }
+    }
+
+    /* Apply isjunk callback (if any) and mark junk_mask. */
+    if (isjunk != NULL && isjunk != Py_None) {
+        junk_keys = PyList_New(0);
+        iter_keys = PyDict_Keys(b2j);
+        if (junk_keys == NULL || iter_keys == NULL) {
+            goto done;
+        }
+        Py_ssize_t nk = PyList_GET_SIZE(iter_keys);
+        for (Py_ssize_t i = 0; i < nk; i++) {
+            PyObject *k = PyList_GET_ITEM(iter_keys, i);
+            PyObject *res = PyObject_CallOneArg(isjunk, k);
+            if (res == NULL) {
+                goto done;
+            }
+            int truthy = PyObject_IsTrue(res);
+            Py_DECREF(res);
+            if (truthy < 0) {
+                goto done;
+            }
+            if (truthy
+                && (PySet_Add(bjunk, k) < 0
+                    || PyList_Append(junk_keys, k) < 0))
+            {
+                goto done;
+            }
+        }
+        Py_ssize_t njk = PyList_GET_SIZE(junk_keys);
+        for (Py_ssize_t i = 0; i < njk; i++) {
+            PyObject *k = PyList_GET_ITEM(junk_keys, i);
+            /* b2j[k] already lists every position holding k, so mark
+               those directly instead of rescanning b_lbl per junk key. */
+            PyObject *positions = PyDict_GetItemWithError(b2j, k);
+            if (positions == NULL) {
+                /* A plain miss sets no exception; report it the way
+                   `del b2j[k]` would. */
+                if (!PyErr_Occurred()) {
+                    PyErr_SetObject(PyExc_KeyError, k);
+                }
+                goto done;
+            }
+            Py_ssize_t n = PyList_GET_SIZE(positions);
+            for (Py_ssize_t p = 0; p < n; p++) {
+                Py_ssize_t bi = PyLong_AsSsize_t(
+                    PyList_GET_ITEM(positions, p));
+                if (bi == -1 && PyErr_Occurred()) {
+                    goto done;
+                }
+                junk_mask[bi] = 1;
+            }
+            if (PyDict_DelItem(b2j, k) < 0) {
+                goto done;
+            }
+        }
+    }
+
+    /* Autojunk popular elements (matches Lib/difflib.py semantics). */
+    if (self->autojunk && lb >= 200) {
+        Py_ssize_t ntest = lb / 100 + 1;
+        pop_keys = PyList_New(0);
+        items = PyDict_Items(b2j);
+        if (pop_keys == NULL || items == NULL) {
+            goto done;
+        }
+        Py_ssize_t ni = PyList_GET_SIZE(items);
+        for (Py_ssize_t i = 0; i < ni; i++) {
+            PyObject *kv = PyList_GET_ITEM(items, i);
+            PyObject *k = PyTuple_GET_ITEM(kv, 0);
+            PyObject *v = PyTuple_GET_ITEM(kv, 1);
+            if (PyList_GET_SIZE(v) > ntest
+                && (PySet_Add(bpopular, k) < 0
+                    || PyList_Append(pop_keys, k) < 0))
+            {
+                goto done;
+            }
+        }
+        Py_ssize_t npk = PyList_GET_SIZE(pop_keys);
+        for (Py_ssize_t i = 0; i < npk; i++) {
+            if (PyDict_DelItem(b2j, PyList_GET_ITEM(pop_keys, i)) < 0) {
+                goto done;
+            }
+        }
+    }
+
+    /* Build DP labels + jbuf from post-junk b2j. */
+    elt_to_lbl_dp = PyDict_New();
+    if (elt_to_lbl_dp == NULL) {
+        goto done;
+    }
+    Py_ssize_t nlbl = PyDict_GET_SIZE(b2j);
+    Py_ssize_t total = 0;
+    {
+        PyObject *k, *v;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(b2j, &pos, &k, &v)) {
+            total += PyList_GET_SIZE(v);
+        }
+    }
+    jbuf = (int32_t *)PyMem_Malloc(
+        sizeof(int32_t) * (size_t)(total > 0 ? total : 1));
+    jstart = (int32_t *)PyMem_Malloc(
+        sizeof(int32_t) * (size_t)(nlbl > 0 ? nlbl : 1));
+    jcount = (int32_t *)PyMem_Malloc(
+        sizeof(int32_t) * (size_t)(nlbl > 0 ? nlbl : 1));
+    if (jbuf == NULL || jstart == NULL || jcount == NULL) {
+        PyErr_NoMemory();
+        goto done;
+    }
+    {
+        PyObject *k, *v;
+        Py_ssize_t pos = 0;
+        int32_t lbl = 0;
+        int32_t cursor = 0;     /* cannot overflow: total <= lb <= INT32_MAX */
+        while (PyDict_Next(b2j, &pos, &k, &v)) {
+            Py_ssize_t n = PyList_GET_SIZE(v);
+            jstart[lbl] = cursor;
+            jcount[lbl] = (int32_t)n;
+            for (Py_ssize_t i = 0; i < n; i++) {
+                Py_ssize_t bi = PyLong_AsSsize_t(PyList_GET_ITEM(v, i));
+                if (bi == -1 && PyErr_Occurred()) {
+                    goto done;
+                }
+                jbuf[cursor + i] = (int32_t)bi;
+            }
+            cursor += (int32_t)n;
+            PyObject *lbl_obj = PyLong_FromLong(lbl);
+            if (lbl_obj == NULL) {
+                goto done;
+            }
+            int set_rc = PyDict_SetItem(elt_to_lbl_dp, k, lbl_obj);
+            Py_DECREF(lbl_obj);
+            if (set_rc < 0) {
+                goto done;
+            }
+            lbl++;
+        }
+    }
+
+    /* DP scratch (two versioned pairs).  The value rows may stay
+       uninitialised: flm_core() only reads a value whose stamp row
+       matches the current generation, and the stamp rows start at 0. */
+    Py_ssize_t scratch_sz = lb + 1;
+    jv = (Py_ssize_t *)PyMem_Malloc(
+        sizeof(Py_ssize_t) * (size_t)scratch_sz);
+    jver = (Py_ssize_t *)PyMem_Calloc(
+        (size_t)scratch_sz, sizeof(Py_ssize_t));
+    jv2 = (Py_ssize_t *)PyMem_Malloc(
+        sizeof(Py_ssize_t) * (size_t)scratch_sz);
+    jver2 = (Py_ssize_t *)PyMem_Calloc(
+        (size_t)scratch_sz, sizeof(Py_ssize_t));
+    if (jv == NULL || jver == NULL || jv2 == NULL || jver2 == NULL) {
+        PyErr_NoMemory();
+        goto done;
+    }
+
+    /* [phase 4/5] Codepoint-keyed fast path.
+       For str/bytes b we build cp_full[ch] -> full label and
+       cp_dp[ch] -> DP label, indexed directly by the codepoint.
+       build_a_labels can then skip the PyUnicode_FromOrdinal /
+       PyLong_FromLong + dict-probe per element of a.
+         - [phase 4] str branch.
+         - [phase 5] bytes branch (cp_sz fixed at 256) and the UCS1
+           shortcut (cp_sz fixed at 256 instead of scanning to find
+           the max codepoint).
+    */
+    Py_ssize_t cp_sz = 0;
+    if (is_str) {
+        if (kind == PyUnicode_1BYTE_KIND) {
+            cp_sz = 256;
+        }
+        else {
+            Py_UCS4 maxch = 0;
+            for (Py_ssize_t i = 0; i < lb; i++) {
+                Py_UCS4 c = PyUnicode_READ(kind, udata, i);
+                if (c > maxch) {
+                    maxch = c;
+                }
+            }
+            cp_sz = (Py_ssize_t)maxch + 1;
+        }
+    }
+    else if (is_bytes) {
+        cp_sz = 256;
+    }
+    if (cp_sz > 0) {
+        cp_full = (int32_t *)PyMem_Malloc(sizeof(int32_t) * (size_t)cp_sz);
+        cp_dp = (int32_t *)PyMem_Malloc(sizeof(int32_t) * (size_t)cp_sz);
+        if (cp_full == NULL || cp_dp == NULL) {
+            PyErr_NoMemory();
+            goto done;
+        }
+        for (Py_ssize_t i = 0; i < cp_sz; i++) {
+            cp_full[i] = -1;
+            cp_dp[i] = -1;
+        }
+        if (is_str) {
+            for (Py_ssize_t i = 0; i < lb; i++) {
+                Py_UCS4 c = PyUnicode_READ(kind, udata, i);
+                cp_full[c] = b_lbl[i];
+            }
+        }
+        else {
+            for (Py_ssize_t i = 0; i < lb; i++) {
+                cp_full[bdata[i]] = b_lbl[i];
+            }
+        }
+        PyObject *k, *v;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(elt_to_lbl_dp, &pos, &k, &v)) {
+            long dp = PyLong_AsLong(v);
+            if (dp == -1 && PyErr_Occurred()) {
+                goto done;
+            }
+            Py_ssize_t cp_idx;
+            if (is_str) {
+                if (!PyUnicode_Check(k) || PyUnicode_GET_LENGTH(k) != 1) {
+                    continue;
+                }
+                cp_idx = (Py_ssize_t)PyUnicode_READ_CHAR(k, 0);
+            }
+            else {
+                if (!PyLong_Check(k)) {
+                    continue;
+                }
+                long c = PyLong_AsLong(k);
+                if (c < 0 || c >= cp_sz) {
+                    continue;
+                }
+                cp_idx = c;
+            }
+            if (cp_idx < cp_sz) {
+                cp_dp[cp_idx] = (int32_t)dp;
+            }
+        }
+    }
+
+    /* Commit.  Ownership of every buffer and dict moves to `self`; the
+       locals are nulled so the cleanup below leaves them alone. */
+    free_b_state(self);
+    self->lb = lb;
+    self->nlbl = nlbl;
+    self->b_lbl = b_lbl;
+    b_lbl = NULL;
+    self->jbuf = jbuf;
+    jbuf = NULL;
+    self->jstart = jstart;
+    jstart = NULL;
+    self->jcount = jcount;
+    jcount = NULL;
+    self->junk_mask = junk_mask;
+    junk_mask = NULL;
+    self->cp_full = cp_full;
+    cp_full = NULL;
+    self->cp_dp = cp_dp;
+    cp_dp = NULL;
+    self->cp_max_plus1 = cp_sz;
+    self->j2len_val = jv;
+    jv = NULL;
+    self->j2len_ver = jver;
+    jver = NULL;
+    self->j2len_val2 = jv2;
+    jv2 = NULL;
+    self->j2len_ver2 = jver2;
+    jver2 = NULL;
+    self->j2len_size = scratch_sz;
+    self->gen = 0;
+    self->elt_to_lbl_full = elt_to_lbl_full;
+    elt_to_lbl_full = NULL;
+    self->elt_to_lbl_dp = elt_to_lbl_dp;
+    elt_to_lbl_dp = NULL;
+
+    Py_XSETREF(self->b2j, b2j);
+    b2j = NULL;
+    Py_XSETREF(self->bjunk, bjunk);
+    bjunk = NULL;
+    Py_XSETREF(self->bpopular, bpopular);
+    bpopular = NULL;
+    self->b_ready = 1;
+    self->a_ready = 0;      /* a's labels refer to the old b's label space */
+    rc = 0;
+
+done:
+    /* PyMem_Free(NULL) and Py_XDECREF(NULL) are no-ops, so this is safe
+       on every path, including after a successful commit. */
+    PyMem_Free(cp_full);
+    PyMem_Free(cp_dp);
+    PyMem_Free(jv);
+    PyMem_Free(jver);
+    PyMem_Free(jv2);
+    PyMem_Free(jver2);
+    PyMem_Free(jbuf);
+    PyMem_Free(jstart);
+    PyMem_Free(jcount);
+    PyMem_Free(b_lbl);
+    PyMem_Free(junk_mask);
+    Py_XDECREF(b2j);
+    Py_XDECREF(bjunk);
+    Py_XDECREF(bpopular);
+    Py_XDECREF(elt_to_lbl_full);
+    Py_XDECREF(elt_to_lbl_dp);
+    Py_XDECREF(junk_keys);
+    Py_XDECREF(pop_keys);
+    Py_XDECREF(iter_keys);
+    Py_XDECREF(items);
+    Py_XDECREF(isjunk);
+    Py_XDECREF(b);
+    return rc;
+}
+
+
+/* ====================================================================== */
+/* build_a_labels: assign full and DP labels to each position of a. */
+/* */
+/* [phase 3] General path (any sequence type): walk a, look each element */
+/* up in elt_to_lbl_full / elt_to_lbl_dp, store the resulting int32 */
+/* labels in a_lbl[] and a_dp[]. */
+/* */
+/* [phase 4/5] str / bytes fast paths use the codepoint-keyed cp_full[] */
+/* and cp_dp[] tables built by chain_b(), so the per-position element */
+/* reconstruction (PyUnicode_FromOrdinal / PyLong_FromLong) and dict */
+/* probe go away entirely. */
+/* ====================================================================== */
+
+/*
+ * Compute a_lbl[] (full label) and a_dp[] (DP label) for every position of
+ * self->a, using the label tables that chain_b() built for the current b.
+ * An element of a that does not occur in b gets -1 in both arrays.
+ *
+ * Returns 0 on success (arrays committed, a_ready set) or -1 with an
+ * exception set, in which case the previous a-side state is untouched.
+ *
+ * The codepoint tables are indexed by code point when b is a str and by
+ * byte value when b is bytes.  They may therefore only be used when a has
+ * the same type as b: a str element never equals an int element, so mixed
+ * str/bytes inputs must take the generic dict path (and find no matches).
+ */
+static int
+build_a_labels(SequenceMatcherObject *self)
+{
+    int32_t *a_lbl = NULL;
+    int32_t *a_dp = NULL;
+    int rc = -1;
+    /* Strong reference: the dict lookups on the generic path can run
+       __hash__/__eq__, which could rebind self->a. */
+    PyObject *a = Py_XNewRef(self->a);
+
+    Py_ssize_t la = PyObject_Length(a);
+    if (la < 0) {
+        goto done;
+    }
+
+    a_lbl = (int32_t *)PyMem_Malloc(
+        sizeof(int32_t) * (size_t)(la > 0 ? la : 1));
+    a_dp = (int32_t *)PyMem_Malloc(
+        sizeof(int32_t) * (size_t)(la > 0 ? la : 1));
+    if (a_lbl == NULL || a_dp == NULL) {
+        PyErr_NoMemory();
+        goto done;
+    }
+
+    int is_str = PyUnicode_Check(a);
+    int is_list = PyList_Check(a);
+    int is_tuple = PyTuple_Check(a);
+    int is_bytes = PyBytes_Check(a);
+    int kind = 0;
+    const void *udata = NULL;
+    const unsigned char *adata = NULL;
+
+    if (is_str) {
+        if (PyUnicode_READY(a) < 0) {
+            goto done;
+        }
+        kind = PyUnicode_KIND(a);
+        udata = PyUnicode_DATA(a);
+    }
+    else if (is_bytes) {
+        adata = (const unsigned char *)PyBytes_AS_STRING(a);
+    }
+
+    /* Which kind of b were the codepoint tables built from?  chain_b()
+       prefers the str branch, so test in the same order. */
+    int have_cp = (self->cp_full != NULL && self->cp_dp != NULL
+                   && self->b != NULL);
+    int cp_from_str = have_cp && PyUnicode_Check(self->b);
+    int cp_from_bytes = have_cp && !cp_from_str && PyBytes_Check(self->b);
+
+    /* [phase 4] Fast path for str a paired with str b.  cp_full / cp_dp
+       were built once in chain_b(); each position of a turns into two
+       array indexings, no PyObject construction or dict lookup. */
+    if (is_str && cp_from_str) {
+        Py_ssize_t cpmax = self->cp_max_plus1;
+        const int32_t *cf = self->cp_full;
+        const int32_t *cd = self->cp_dp;
+        for (Py_ssize_t i = 0; i < la; i++) {
+            Py_UCS4 c = PyUnicode_READ(kind, udata, i);
+            if ((Py_ssize_t)c < cpmax) {
+                a_lbl[i] = cf[c];
+                a_dp[i] = cd[c];
+            }
+            else {
+                /* Code point beyond anything that occurs in b. */
+                a_lbl[i] = -1;
+                a_dp[i] = -1;
+            }
+        }
+    }
+    /* [phase 5] Fast path for bytes a paired with bytes b (the tables
+       then have exactly 256 entries, so no bounds check is needed). */
+    else if (is_bytes && cp_from_bytes) {
+        const int32_t *cf = self->cp_full;
+        const int32_t *cd = self->cp_dp;
+        for (Py_ssize_t i = 0; i < la; i++) {
+            a_lbl[i] = cf[adata[i]];
+            a_dp[i] = cd[adata[i]];
+        }
+    }
+    else {
+        /* Generic path: materialise each element and probe the dicts. */
+        for (Py_ssize_t i = 0; i < la; i++) {
+            PyObject *elt;
+            if (is_str) {
+                elt = PyUnicode_FromOrdinal(PyUnicode_READ(kind, udata, i));
+            }
+            else if (is_bytes) {
+                elt = PyLong_FromLong(adata[i]);
+            }
+            else if (is_list) {
+                /* A lookup in an earlier iteration may have run user
+                   code that shrank the list; never index past its end. */
+                if (i >= PyList_GET_SIZE(a)) {
+                    PyErr_SetString(PyExc_RuntimeError,
+                                    "sequence a changed size during iteration");
+                    goto done;
+                }
+                elt = Py_NewRef(PyList_GET_ITEM(a, i));
+            }
+            else if (is_tuple) {
+                elt = Py_NewRef(PyTuple_GET_ITEM(a, i));
+            }
+            else {
+                elt = PySequence_GetItem(a, i);
+            }
+            if (elt == NULL) {
+                goto done;
+            }
+
+            PyObject *vf = PyDict_GetItemWithError(self->elt_to_lbl_full, elt);
+            if (vf == NULL) {
+                if (PyErr_Occurred()) {
+                    Py_DECREF(elt);
+                    goto done;
+                }
+                a_lbl[i] = -1;
+            }
+            else {
+                a_lbl[i] = (int32_t)PyLong_AsLong(vf);
+            }
+            PyObject *vd = PyDict_GetItemWithError(self->elt_to_lbl_dp, elt);
+            if (vd == NULL) {
+                if (PyErr_Occurred()) {
+                    Py_DECREF(elt);
+                    goto done;
+                }
+                a_dp[i] = -1;
+            }
+            else {
+                a_dp[i] = (int32_t)PyLong_AsLong(vd);
+            }
+            Py_DECREF(elt);
+        }
+    }
+
+    /* Commit: the arrays now belong to self. */
+    free_a_state(self);
+    self->a_lbl = a_lbl;
+    a_lbl = NULL;
+    self->a_dp = a_dp;
+    a_dp = NULL;
+    self->la = la;
+    self->a_ready = 1;
+    rc = 0;
+
+done:
+    PyMem_Free(a_lbl);
+    PyMem_Free(a_dp);
+    Py_XDECREF(a);
+    return rc;
+}
+
+
+/* ====================================================================== */
+/* ensure_ready: lazily run chain_b()/build_a_labels() as needed. */
+/* ====================================================================== */
+
+/* Bring the derived state up to date: run chain_b() if the b-side is
+   stale, then build_a_labels() if the a-side is stale.  Returns 0, or -1
+   as soon as either step fails. */
+static int
+ensure_ready(SequenceMatcherObject *self)
+{
+    if (!self->b_ready && chain_b(self) < 0) {
+        return -1;
+    }
+    /* Checked after the b step on purpose: chain_b() clears a_ready. */
+    if (!self->a_ready && build_a_labels(self) < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+
+/* ====================================================================== */
+/* find_longest_match (core): pure-C DP on integer-label arrays. */
+/* */
+/* [phase 1] The j2len mapping that pure-Python builds with */
+/* newj2len = {} */
+/* ... newj2len[j] = j2lenget(j-1, 0) + 1 */
+/* is replaced by two paired int arrays (val + ver) and a generation */
+/* counter. `cur_ver[j-1] == cur_gen` means the previous row has a real */
+/* value at j-1, so no per-row clear is needed. */
+/* */
+/* [phase 3] The DP itself works on integer DP labels (a_dp[]), and the */
+/* extension passes compare a_lbl[i] / b_lbl[j] (also int32) instead of */
+/* a[i] / b[j] via PyObject_RichCompareBool. */
+/* */
+/* [phase 5] cur_val/cur_ver and nxt_val/nxt_ver live on the instance */
+/* (j2len_val/ver and j2len_val2/ver2), so no alloca/memset per call. */
+/* ====================================================================== */
+
+/*
+ * Find the longest matching block of a[alo:ahi] and b[blo:bhi] using only
+ * the integer-label arrays on `self`, and store it as
+ * (*out_i, *out_j, *out_k).  When nothing matches the result is
+ * (alo, blo, 0).  Bounds are not validated here.
+ */
+static void
+flm_core(SequenceMatcherObject *self,
+         Py_ssize_t alo, Py_ssize_t ahi,
+         Py_ssize_t blo, Py_ssize_t bhi,
+         Py_ssize_t *out_i, Py_ssize_t *out_j, Py_ssize_t *out_k)
+{
+    const int32_t *dp_of_a = self->a_dp;
+    const int32_t *positions = self->jbuf;
+    const int32_t *run_start = self->jstart;
+    const int32_t *run_len = self->jcount;
+    const int32_t *full_a = self->a_lbl;
+    const int32_t *full_b = self->b_lbl;
+    const uint8_t *b_is_junk = self->junk_mask;
+    const Py_ssize_t nlabels = self->nlbl;
+
+    /* Two (value, stamp) rows stand in for j2len / newj2len.  A cell of
+       the previous row is meaningful only if its stamp equals prev_stamp,
+       so rows never need clearing. */
+    Py_ssize_t *prev_val = self->j2len_val;
+    Py_ssize_t *prev_ver = self->j2len_ver;
+    Py_ssize_t *next_val = self->j2len_val2;
+    Py_ssize_t *next_ver = self->j2len_ver2;
+    Py_ssize_t stamp = self->gen;
+    Py_ssize_t prev_stamp = ++stamp;
+    Py_ssize_t next_stamp = ++stamp;
+
+    Py_ssize_t best_i = alo;
+    Py_ssize_t best_j = blo;
+    Py_ssize_t best_len = 0;
+
+    for (Py_ssize_t row = alo; row < ahi; row++) {
+        const int32_t label = dp_of_a[row];
+        if (label >= 0 && label < nlabels) {
+            const int32_t *cols = positions + run_start[label];
+            const Py_ssize_t ncols = run_len[label];
+            Py_ssize_t c = 0;
+            /* Skip the positions that lie before the window... */
+            while (c < ncols && cols[c] < blo) {
+                c++;
+            }
+            /* ...and stop at the first one at or past its end. */
+            for (; c < ncols && cols[c] < bhi; c++) {
+                const Py_ssize_t col = cols[c];
+                Py_ssize_t run = 1;
+                if (col > 0 && prev_ver[col - 1] == prev_stamp) {
+                    run += prev_val[col - 1];
+                }
+                next_val[col] = run;
+                next_ver[col] = next_stamp;
+                if (run > best_len) {
+                    best_i = row - run + 1;
+                    best_j = col - run + 1;
+                    best_len = run;
+                }
+            }
+        }
+        /* Advance one row, whether or not this row had a usable label:
+           the row just written becomes "previous", and the other buffer
+           is recycled under a fresh stamp. */
+        Py_ssize_t *swap_val = prev_val;
+        Py_ssize_t *swap_ver = prev_ver;
+        prev_val = next_val;
+        prev_ver = next_ver;
+        next_val = swap_val;
+        next_ver = swap_ver;
+        prev_stamp = next_stamp;
+        next_stamp = ++stamp;
+    }
+    self->gen = stamp;
+
+    /* Extension passes on integer labels.  A full label of -1 in a means
+       "element absent from b", so it can never extend a match. */
+
+    /* 1. Grow backwards over equal, non-junk elements. */
+    while (best_i > alo && best_j > blo
+           && !b_is_junk[best_j - 1]
+           && full_a[best_i - 1] >= 0
+           && full_a[best_i - 1] == full_b[best_j - 1])
+    {
+        best_i--;
+        best_j--;
+        best_len++;
+    }
+    /* 2. Grow forwards over equal, non-junk elements. */
+    while (best_i + best_len < ahi && best_j + best_len < bhi
+           && !b_is_junk[best_j + best_len]
+           && full_a[best_i + best_len] >= 0
+           && full_a[best_i + best_len] == full_b[best_j + best_len])
+    {
+        best_len++;
+    }
+    /* 3. Grow backwards over equal junk elements. */
+    while (best_i > alo && best_j > blo
+           && b_is_junk[best_j - 1]
+           && full_a[best_i - 1] >= 0
+           && full_a[best_i - 1] == full_b[best_j - 1])
+    {
+        best_i--;
+        best_j--;
+        best_len++;
+    }
+    /* 4. Grow forwards over equal junk elements. */
+    while (best_i + best_len < ahi && best_j + best_len < bhi
+           && b_is_junk[best_j + best_len]
+           && full_a[best_i + best_len] >= 0
+           && full_a[best_i + best_len] == full_b[best_j + best_len])
+    {
+        best_len++;
+    }
+
+    *out_i = best_i;
+    *out_j = best_j;
+    *out_k = best_len;
+}
+
+
+/* ====================================================================== */
+/* matching_blocks recursion (in C). */
+/* */
+/* [phase 3] The full Ratcliff-Obershelp recursion lives here. The */
+/* pure-Python equivalent does: */
+/* queue = [(0, la, 0, lb)] */
+/* while queue: */
+/* alo, ahi, blo, bhi = queue.pop() */
+/* i, j, k = self.find_longest_match(alo, ahi, blo, bhi) */
+/* ... */
+/* and crosses the Python/C boundary on every recursive call. This */
+/* function runs the whole queue + flm_core() + sort + collapse loop in */
+/* one C call, so per-recursion overhead disappears. */
+/* ====================================================================== */
+
+typedef struct {                     /* pending work item of compute_matching_blocks(): a[alo:ahi] against b[blo:bhi] */
+    Py_ssize_t alo, ahi, blo, bhi;   /* half-open bounds, as consumed by flm_core() */
+} range_t;
+
+typedef struct {             /* one matching block as reported by flm_core() */
+    Py_ssize_t i, j, k;      /* start in a, start in b, length */
+} triple_t;
+
+/* qsort() comparator for triple_t: orders by i, then j, then k, ascending.
+   Returns -1, 0 or 1. */
+static int
+triple_compare(const void *a, const void *b)
+{
+    const triple_t *lhs = (const triple_t *)a;
+    const triple_t *rhs = (const triple_t *)b;
+    const Py_ssize_t left[3] = { lhs->i, lhs->j, lhs->k };
+    const Py_ssize_t right[3] = { rhs->i, rhs->j, rhs->k };
+
+    /* The first differing field decides. */
+    for (int field = 0; field < 3; field++) {
+        if (left[field] < right[field]) {
+            return -1;
+        }
+        if (left[field] > right[field]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* Push `item` onto the growable stack (*stack, *len, *cap), doubling the
+   capacity when full.  Returns 0, or -1 if the reallocation failed; in
+   that case the stack is unchanged and no exception has been set. */
+static int
+push_range(range_t **stack, Py_ssize_t *len, Py_ssize_t *cap, range_t item)
+{
+    if (*len >= *cap) {
+        Py_ssize_t grown = *cap * 2;
+        range_t *bigger = (range_t *)PyMem_Realloc(
+            *stack, sizeof(range_t) * (size_t)grown);
+        if (bigger == NULL) {
+            return -1;
+        }
+        *stack = bigger;
+        *cap = grown;
+    }
+    (*stack)[(*len)++] = item;
+    return 0;
+}
+
+/*
+ * Run the whole matching-blocks computation in C.
+ *
+ * Repeatedly pops a range, asks flm_core() for its longest match, records
+ * it and pushes the unmatched pieces to its left and right.  The recorded
+ * matches are then sorted, adjacent ones are merged, and a terminating
+ * (la, lb, 0) entry is appended.
+ *
+ * On success returns 0 and stores a PyMem-allocated array in *out_blocks
+ * with its length in *out_n; the caller owns the array.  On allocation
+ * failure returns -1 with MemoryError set and leaves the outputs alone.
+ */
+static int
+compute_matching_blocks(SequenceMatcherObject *self,
+                        triple_t **out_blocks, Py_ssize_t *out_n)
+{
+    Py_ssize_t stack_cap = 64;
+    Py_ssize_t stack_len = 0;
+    Py_ssize_t found_cap = 64;
+    Py_ssize_t found_len = 0;
+    triple_t *merged = NULL;
+    range_t *stack = (range_t *)PyMem_Malloc(
+        sizeof(range_t) * (size_t)stack_cap);
+    triple_t *found = (triple_t *)PyMem_Malloc(
+        sizeof(triple_t) * (size_t)found_cap);
+    if (stack == NULL || found == NULL) {
+        goto oom;
+    }
+
+    /* Seed with the full ranges of both sequences. */
+    stack[stack_len++] = (range_t){
+        .alo = 0, .ahi = self->la, .blo = 0, .bhi = self->lb };
+
+    while (stack_len > 0) {
+        const range_t cur = stack[--stack_len];
+        Py_ssize_t i, j, k;
+        flm_core(self, cur.alo, cur.ahi, cur.blo, cur.bhi, &i, &j, &k);
+        if (k == 0) {
+            continue;           /* nothing in common in this range */
+        }
+
+        if (found_len >= found_cap) {
+            Py_ssize_t grown = found_cap * 2;
+            triple_t *bigger = (triple_t *)PyMem_Realloc(
+                found, sizeof(triple_t) * (size_t)grown);
+            if (bigger == NULL) {
+                goto oom;
+            }
+            found = bigger;
+            found_cap = grown;
+        }
+        found[found_len++] = (triple_t){ .i = i, .j = j, .k = k };
+
+        /* Unmatched piece to the left of the block, if non-empty on
+           both sides. */
+        if (cur.alo < i && cur.blo < j) {
+            const range_t left = {
+                .alo = cur.alo, .ahi = i, .blo = cur.blo, .bhi = j };
+            if (push_range(&stack, &stack_len, &stack_cap, left) < 0) {
+                goto oom;
+            }
+        }
+        /* Unmatched piece to the right of the block. */
+        if (i + k < cur.ahi && j + k < cur.bhi) {
+            const range_t right = {
+                .alo = i + k, .ahi = cur.ahi, .blo = j + k, .bhi = cur.bhi };
+            if (push_range(&stack, &stack_len, &stack_cap, right) < 0) {
+                goto oom;
+            }
+        }
+    }
+    PyMem_Free(stack);
+    stack = NULL;
+
+    qsort(found, (size_t)found_len, sizeof(triple_t), triple_compare);
+
+    /* One extra slot for the terminating entry. */
+    merged = (triple_t *)PyMem_Malloc(
+        sizeof(triple_t) * (size_t)(found_len + 1));
+    if (merged == NULL) {
+        goto oom;
+    }
+
+    /* Merge blocks that touch in both sequences.  `run` is the block
+       being accumulated; k == 0 means "nothing pending". */
+    Py_ssize_t nmerged = 0;
+    triple_t run = { .i = 0, .j = 0, .k = 0 };
+    for (Py_ssize_t idx = 0; idx < found_len; idx++) {
+        const triple_t *blk = &found[idx];
+        if (run.i + run.k == blk->i && run.j + run.k == blk->j) {
+            run.k += blk->k;
+            continue;
+        }
+        if (run.k) {
+            merged[nmerged++] = run;
+        }
+        run = *blk;
+    }
+    if (run.k) {
+        merged[nmerged++] = run;
+    }
+    merged[nmerged++] = (triple_t){ .i = self->la, .j = self->lb, .k = 0 };
+    PyMem_Free(found);
+
+    *out_blocks = merged;
+    *out_n = nmerged;
+    return 0;
+
+oom:
+    PyMem_Free(stack);
+    PyMem_Free(found);
+    PyErr_NoMemory();
+    return -1;
+}
+
+
+/* ====================================================================== */
+/* Helpers for building Match namedtuples. */
+/* ====================================================================== */
+
+/* Call the Match type stored in the module state as Match(i, j, k).
+   Returns the result of that call, or NULL with an exception set. */
+static PyObject *
+build_match(PyObject *module, Py_ssize_t i, Py_ssize_t j, Py_ssize_t k)
+{
+    PyObject *match_type = get_module_state(module)->Match;
+    return PyObject_CallFunction(match_type, "nnn", i, j, k);
+}
+
+/* Look up the _difflib module object that the type of `self` belongs to.
+   Per the C API documentation, PyType_GetModuleByDef() returns a borrowed
+   reference, or NULL with an exception set if no matching module is
+   found. */
+static PyObject *
+module_of(SequenceMatcherObject *self)
+{
+    PyTypeObject *tp = Py_TYPE(self);
+    return PyType_GetModuleByDef(tp, &_difflib_module);
+}
+
+
+/* ====================================================================== */
+/* Method implementations. */
+/* ====================================================================== */
+
+/*[clinic input]
+_difflib.SequenceMatcher.__init__
+
+ isjunk: object = None
+ a: object(c_default="NULL") = ''
+ b: object(c_default="NULL") = ''
+ autojunk: bool = True
+
+Construct a SequenceMatcher.
+
+See difflib.py for the full documentation; output is identical to the
+pure-Python SequenceMatcher class.
+[clinic start generated code]*/
+
+static int
+_difflib_SequenceMatcher___init___impl(SequenceMatcherObject *self,
+                                       PyObject *isjunk, PyObject *a,
+                                       PyObject *b, int autojunk)
+/*[clinic end generated code: output=0d5ef8814b30159b input=aab0c2f4f8a063b4]*/
+{
+    /* Store the arguments.  a and b arrive as NULL when omitted and then
+       default to an empty str.  Returns 0, or -1 with an exception set. */
+    Py_XSETREF(self->isjunk, Py_NewRef(isjunk));
+    self->autojunk = autojunk;
+
+    PyObject *empty = PyUnicode_FromStringAndSize(NULL, 0);
+    if (empty == NULL) {
+        return -1;
+    }
+    Py_XSETREF(self->a, Py_NewRef(a != NULL ? a : empty));
+    Py_XSETREF(self->b, Py_NewRef(b != NULL ? b : empty));
+    Py_DECREF(empty);
+
+    /* Everything derived from the previous a/b is now stale. */
+    self->b_ready = 0;
+    self->a_ready = 0;
+    invalidate_caches(self);
+    Py_XSETREF(self->fullbcount, Py_NewRef(Py_None));
+
+    /* Run chain_b() right away rather than lazily, so b2j, bjunk and
+       bpopular are already populated when the constructor returns, as
+       in the pure-Python class. */
+    return chain_b(self) < 0 ? -1 : 0;
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.set_seq1
+
+ a: object
+ /
+
+Set the first sequence to be compared.
+
+The second sequence to be compared is not changed.
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_set_seq1_impl(SequenceMatcherObject *self,
+                                       PyObject *a)
+/*[clinic end generated code: output=d7bd77eb821dd8b8 input=9445bdbeb31d0bf2]*/
+{
+    /* Re-binding the very same object (identity, not equality) is a
+       no-op; anything else replaces `a`, marks the a-side as unprepared
+       and invalidates the caches.  Always returns None. */
+    if (a != self->a) {
+        Py_XSETREF(self->a, Py_NewRef(a));
+        self->a_ready = 0;
+        invalidate_caches(self);
+    }
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.set_seq2
+
+ b: object
+ /
+
+Set the second sequence to be compared.
+
+The first sequence to be compared is not changed.
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_set_seq2_impl(SequenceMatcherObject *self,
+                                       PyObject *b)
+/*[clinic end generated code: output=1c21f4e4b95dfad8 input=8a4295ec082859be]*/
+{
+    /* Re-binding the very same object (identity, not equality) is a
+       no-op.  Otherwise `b` is replaced, both readiness flags and the
+       caches are reset, fullbcount goes back to None and chain_b() is
+       re-run.  Returns None, or NULL with an exception set when
+       chain_b() fails. */
+    if (b != self->b) {
+        Py_XSETREF(self->b, Py_NewRef(b));
+        self->a_ready = 0;
+        self->b_ready = 0;
+        invalidate_caches(self);
+        Py_XSETREF(self->fullbcount, Py_NewRef(Py_None));
+        if (chain_b(self) < 0) {
+            return NULL;
+        }
+    }
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.set_seqs
+
+ a: object
+ b: object
+ /
+
+Set the two sequences to be compared.
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_set_seqs_impl(SequenceMatcherObject *self,
+                                       PyObject *a, PyObject *b)
+/*[clinic end generated code: output=6125de76c8b14cda input=d045d9c013583673]*/
+{
+    /* Equivalent to set_seq1(a) followed by set_seq2(b); the result of
+       the second call is what the caller receives. */
+    PyObject *first = _difflib_SequenceMatcher_set_seq1_impl(self, a);
+    if (first != NULL) {
+        Py_DECREF(first);
+        return _difflib_SequenceMatcher_set_seq2_impl(self, b);
+    }
+    return NULL;
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.find_longest_match
+
+ alo: Py_ssize_t = 0
+ ahi: object = None
+ blo: Py_ssize_t = 0
+ bhi: object = None
+
+Find longest matching block in a[alo:ahi] and b[blo:bhi].
+
+By default the entire sequences are searched. Returns Match(i, j, k)
+such that a[i:i+k] equals b[j:j+k].
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_find_longest_match_impl(SequenceMatcherObject *self,
+                                                 Py_ssize_t alo,
+                                                 PyObject *ahi,
+                                                 Py_ssize_t blo,
+                                                 PyObject *bhi)
+/*[clinic end generated code: output=1650f5386c4d5669 input=849e78330a319475]*/
+{
+    if (ensure_ready(self) < 0) {
+        return NULL;
+    }
+
+    /* An upper bound of None means "to the end of the sequence";
+       anything else must be convertible to Py_ssize_t, with
+       OverflowError raised when it does not fit. */
+    Py_ssize_t a_end = self->la;
+    if (ahi != Py_None) {
+        a_end = PyNumber_AsSsize_t(ahi, PyExc_OverflowError);
+        if (a_end == -1 && PyErr_Occurred()) {
+            return NULL;
+        }
+    }
+    Py_ssize_t b_end = self->lb;
+    if (bhi != Py_None) {
+        b_end = PyNumber_AsSsize_t(bhi, PyExc_OverflowError);
+        if (b_end == -1 && PyErr_Occurred()) {
+            return NULL;
+        }
+    }
+
+    /* NOTE(review): the four bounds are forwarded to flm_core() without
+       any range check here -- confirm flm_core() copes with values
+       outside [0, la] / [0, lb]. */
+    Py_ssize_t best_i, best_j, best_k;
+    flm_core(self, alo, a_end, blo, b_end, &best_i, &best_j, &best_k);
+    return build_match(module_of(self), best_i, best_j, best_k);
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.get_matching_blocks
+
+Return list of triples describing matching subsequences.
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_get_matching_blocks_impl(SequenceMatcherObject *self)
+/*[clinic end generated code: output=3b59fa10d3ad4613 input=b11de093158a3d8a]*/
+{
+    /* Serve the cached list when there is one.  The caller receives a
+       new reference to the very list object kept in the cache. */
+    PyObject *cached = self->matching_blocks;
+    if (cached != NULL) {
+        return Py_NewRef(cached);
+    }
+    if (ensure_ready(self) < 0) {
+        return NULL;
+    }
+
+    /* `raw` is a PyMem buffer handed over by compute_matching_blocks();
+       it is released on every path below. */
+    triple_t *raw = NULL;
+    Py_ssize_t count = 0;
+    if (compute_matching_blocks(self, &raw, &count) < 0) {
+        return NULL;
+    }
+
+    PyObject *module = NULL;
+    PyObject *result = PyList_New(count);
+    if (result == NULL) {
+        goto done;
+    }
+    module = module_of(self);
+    for (Py_ssize_t pos = 0; pos < count; pos++) {
+        const triple_t *t = &raw[pos];
+        PyObject *m = build_match(module, t->i, t->j, t->k);
+        if (m == NULL) {
+            /* Drop the partially filled list; NULL is returned. */
+            Py_CLEAR(result);
+            goto done;
+        }
+        PyList_SET_ITEM(result, pos, m);   /* steals the reference */
+    }
+    /* The cache keeps its own reference to the returned list. */
+    Py_XSETREF(self->matching_blocks, Py_NewRef(result));
+
+done:
+    PyMem_Free(raw);
+    return result;
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.get_opcodes
+
+Return list of 5-tuples describing how to turn a into b.
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_get_opcodes_impl(SequenceMatcherObject *self)
+/*[clinic end generated code: output=be7b94a026664a7d input=4d38c91ce94a560e]*/
+{
+    /* Return a list of (tag, i1, i2, j1, j2) tuples.  The list is cached
+       on self->opcodes and later calls return that same list.  Returns
+       NULL with an exception set on failure. */
+    if (self->opcodes != NULL) {
+        return Py_NewRef(self->opcodes);
+    }
+    PyObject *blocks = _difflib_SequenceMatcher_get_matching_blocks_impl(self);
+    if (blocks == NULL) {
+        return NULL;
+    }
+    PyObject *answer = PyList_New(0);
+    if (answer == NULL) {
+        Py_DECREF(blocks);
+        return NULL;
+    }
+
+    /* (i, j) is the position just past the previous matching block. */
+    Py_ssize_t i = 0, j = 0;
+    const Py_ssize_t n = PyList_GET_SIZE(blocks);
+    for (Py_ssize_t bidx = 0; bidx < n; bidx++) {
+        PyObject *m = PyList_GET_ITEM(blocks, bidx);
+
+        /* `blocks` is the cached list that get_matching_blocks() also
+           hands out to Python code, so its items may have been replaced
+           by arbitrary objects.  Check the shape before using the
+           unchecked PyTuple_GET_ITEM macro. */
+        if (!PyTuple_Check(m) || PyTuple_GET_SIZE(m) != 3) {
+            PyErr_SetString(PyExc_TypeError,
+                            "matching blocks must be (a, b, size) tuples");
+            goto error;
+        }
+        Py_ssize_t field[3];
+        for (int f = 0; f < 3; f++) {
+            field[f] = PyLong_AsSsize_t(PyTuple_GET_ITEM(m, f));
+            if (field[f] == -1 && PyErr_Occurred()) {
+                goto error;
+            }
+        }
+        const Py_ssize_t ai = field[0];
+        const Py_ssize_t bj = field[1];
+        const Py_ssize_t size = field[2];
+
+        /* Describe the gap between the previous block and this one. */
+        const char *tag = NULL;
+        if (i < ai && j < bj) {
+            tag = "replace";
+        }
+        else if (i < ai) {
+            tag = "delete";
+        }
+        else if (j < bj) {
+            tag = "insert";
+        }
+        if (tag != NULL) {
+            PyObject *t = Py_BuildValue("(snnnn)", tag, i, ai, j, bj);
+            if (t == NULL || PyList_Append(answer, t) < 0) {
+                Py_XDECREF(t);
+                goto error;
+            }
+            Py_DECREF(t);
+        }
+
+        i = ai + size;
+        j = bj + size;
+        /* Zero-length blocks produce no "equal" opcode. */
+        if (size > 0) {
+            PyObject *t = Py_BuildValue("(snnnn)", "equal", ai, i, bj, j);
+            if (t == NULL || PyList_Append(answer, t) < 0) {
+                Py_XDECREF(t);
+                goto error;
+            }
+            Py_DECREF(t);
+        }
+    }
+    Py_DECREF(blocks);
+    Py_XSETREF(self->opcodes, Py_NewRef(answer));
+    return answer;
+
+error:
+    Py_DECREF(answer);
+    Py_DECREF(blocks);
+    return NULL;
+}
+
+/*[clinic input]
+_difflib.SequenceMatcher.ratio
+
+Return a measure of the sequences' similarity (float in [0, 1]).
+[clinic start generated code]*/
+
+static PyObject *
+_difflib_SequenceMatcher_ratio_impl(SequenceMatcherObject *self)
+/*[clinic end generated code: output=1691c4582d293748 input=f8c99bdde6e27e60]*/
+{
+    /* ratio = 2 * M / T, where M is the total size of all matching
+       blocks and T = la + lb; defined as 1.0 when T is not positive.
+       The blocks are recomputed on every call: self->matching_blocks is
+       neither consulted nor populated here. */
+    if (ensure_ready(self) < 0) {
+        return NULL;
+    }
+    triple_t *raw = NULL;
+    Py_ssize_t count = 0;
+    if (compute_matching_blocks(self, &raw, &count) < 0) {
+        return NULL;
+    }
+
+    Py_ssize_t matched = 0;
+    for (Py_ssize_t pos = count; pos-- > 0; ) {
+        matched += raw[pos].k;
+    }
+    PyMem_Free(raw);
+
+    const Py_ssize_t total_len = self->la + self->lb;
+    if (total_len <= 0) {
+        return PyFloat_FromDouble(1.0);
+    }
+    return PyFloat_FromDouble(2.0 * (double)matched / (double)total_len);
+}
+
+
+/* ====================================================================== */
+/* Type definition. */
+/* ====================================================================== */
+
+/* Method table for SequenceMatcher.  Each entry is a *_METHODDEF macro
+   (expected to come from the Argument Clinic header for this module); the
+   table ends with the all-NULL sentinel entry. */
+static PyMethodDef sequence_matcher_methods[] = {
+ _DIFFLIB_SEQUENCEMATCHER_SET_SEQS_METHODDEF
+ _DIFFLIB_SEQUENCEMATCHER_SET_SEQ1_METHODDEF
+ _DIFFLIB_SEQUENCEMATCHER_SET_SEQ2_METHODDEF
+ _DIFFLIB_SEQUENCEMATCHER_FIND_LONGEST_MATCH_METHODDEF
+ _DIFFLIB_SEQUENCEMATCHER_GET_MATCHING_BLOCKS_METHODDEF
+ _DIFFLIB_SEQUENCEMATCHER_GET_OPCODES_METHODDEF
+ _DIFFLIB_SEQUENCEMATCHER_RATIO_METHODDEF
+ {NULL, NULL, 0, NULL}
+};
+
+/* Attributes exposed directly from the instance struct.  Everything is
+   read-only from Python except fullbcount, whose flags are 0.
+   Py_T_OBJECT_EX members raise AttributeError while the underlying
+   pointer is NULL.
+   NOTE(review): Py_T_BOOL reads the field as a C `char`; the struct
+   declaration is not visible here and __init__ assigns an `int` to
+   self->autojunk -- confirm the field really is declared as `char`. */
+static PyMemberDef sequence_matcher_members[] = {
+ {"isjunk", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, isjunk), Py_READONLY},
+ {"a", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, a), Py_READONLY},
+ {"b", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, b), Py_READONLY},
+ {"b2j", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, b2j), Py_READONLY},
+ {"bjunk", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, bjunk), Py_READONLY},
+ {"bpopular", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, bpopular), Py_READONLY},
+ {"autojunk", Py_T_BOOL,
+ offsetof(SequenceMatcherObject, autojunk), Py_READONLY},
+ /* Flags 0: the only member that Python code may assign to. */
+ {"fullbcount", Py_T_OBJECT_EX,
+ offsetof(SequenceMatcherObject, fullbcount), 0},
+ {NULL, 0, 0, 0, NULL}
+};
+
+/* tp_traverse: report every object reference held by the instance to the
+   cyclic garbage collector.  Py_VISIT returns early from this function if
+   the visitor returns non-zero.  The set of fields visited here mirrors the
+   set cleared in sequence_matcher_clear(). */
+static int
+sequence_matcher_traverse(PyObject *op, visitproc visit, void *arg)
+{
+ SequenceMatcherObject *self = (SequenceMatcherObject *)op;
+ /* The type is created with PyType_FromModuleAndSpec(), i.e. it is a heap
+ type, so the instance's reference to it is visited as well. */
+ Py_VISIT(Py_TYPE(self));
+ Py_VISIT(self->isjunk);
+ Py_VISIT(self->a);
+ Py_VISIT(self->b);
+ Py_VISIT(self->b2j);
+ Py_VISIT(self->bjunk);
+ Py_VISIT(self->bpopular);
+ Py_VISIT(self->matching_blocks);
+ Py_VISIT(self->opcodes);
+ Py_VISIT(self->fullbcount);
+ Py_VISIT(self->elt_to_lbl_full);
+ Py_VISIT(self->elt_to_lbl_dp);
+ return 0;
+}
+
+/* tp_clear: drop every object reference held by the instance.  Py_CLEAR
+   sets each field to NULL before releasing the old value, so calling this
+   more than once is harmless; sequence_matcher_dealloc() also calls it.
+   Always returns 0. */
+static int
+sequence_matcher_clear(PyObject *op)
+{
+ SequenceMatcherObject *self = (SequenceMatcherObject *)op;
+ Py_CLEAR(self->isjunk);
+ Py_CLEAR(self->a);
+ Py_CLEAR(self->b);
+ Py_CLEAR(self->b2j);
+ Py_CLEAR(self->bjunk);
+ Py_CLEAR(self->bpopular);
+ Py_CLEAR(self->matching_blocks);
+ Py_CLEAR(self->opcodes);
+ Py_CLEAR(self->fullbcount);
+ Py_CLEAR(self->elt_to_lbl_full);
+ Py_CLEAR(self->elt_to_lbl_dp);
+ return 0;
+}
+
+/* tp_dealloc.  The order of the steps matters: stop GC tracking first,
+   release the per-sequence state and the object references, free the
+   instance memory, and only then drop the reference to the type. */
+static void
+sequence_matcher_dealloc(PyObject *op)
+{
+ SequenceMatcherObject *self = (SequenceMatcherObject *)op;
+ /* Fetch the type up front: it is still needed after the instance memory
+ has been freed. */
+ PyTypeObject *tp = Py_TYPE(self);
+ PyObject_GC_UnTrack(self);
+ /* Presumably these release the C-level buffers built for a and b --
+ TODO confirm against free_a_state()/free_b_state(). */
+ free_a_state(self);
+ free_b_state(self);
+ (void)sequence_matcher_clear(op);
+ tp->tp_free((PyObject *)self);
+ /* Instances of heap types hold a reference to their type. */
+ Py_DECREF(tp);
+}
+
+/* Type slots for SequenceMatcher: docstring and __init__ (both generated by
+   Argument Clinic), GC support (dealloc/traverse/clear) and the method and
+   member tables.  Terminated by the {0, NULL} sentinel. */
+static PyType_Slot sequence_matcher_slots[] = {
+ {Py_tp_doc, (void *)_difflib_SequenceMatcher___init____doc__},
+ {Py_tp_init, _difflib_SequenceMatcher___init__},
+ {Py_tp_dealloc, sequence_matcher_dealloc},
+ {Py_tp_traverse, sequence_matcher_traverse},
+ {Py_tp_clear, sequence_matcher_clear},
+ {Py_tp_methods, sequence_matcher_methods},
+ {Py_tp_members, sequence_matcher_members},
+ {0, NULL}
+};
+
+/* Type spec handed to PyType_FromModuleAndSpec() in _difflib_exec().
+   Py_TPFLAGS_BASETYPE allows Python subclasses; Py_TPFLAGS_HAVE_GC goes
+   together with the traverse/clear slots above. */
+static PyType_Spec sequence_matcher_spec = {
+ .name = "_difflib.SequenceMatcher",
+ .basicsize = sizeof(SequenceMatcherObject),
+ .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE
+ | Py_TPFLAGS_HAVE_GC,
+ .slots = sequence_matcher_slots,
+};
+
+
+/* ====================================================================== */
+/* Module init. */
+/* ====================================================================== */
+
+/* Py_mod_exec slot: populate the module state and namespace.
+
+   Creates the Match namedtuple class through collections.namedtuple and
+   the SequenceMatcher heap type, stores both in the module state (which
+   owns one reference to each) and publishes both as module attributes.
+   Returns 0 on success, -1 with an exception set on failure.  References
+   already stored in the state when a later step fails are left for the
+   module's clear function to release. */
+static int
+_difflib_exec(PyObject *module)
+{
+    _difflib_state *state = get_module_state(module);
+
+    PyObject *collections = PyImport_ImportModule("collections");
+    if (collections == NULL) {
+        return -1;
+    }
+    PyObject *namedtuple = PyObject_GetAttrString(collections, "namedtuple");
+    Py_DECREF(collections);
+    if (namedtuple == NULL) {
+        return -1;
+    }
+
+    /* Let the call machinery build the argument strings.  PyTuple_Pack()
+       does not steal references, so packing freshly created strings into
+       a tuple by hand would leak one reference to each of them.
+       NOTE(review): namedtuple() derives __module__ from the calling
+       Python frame; confirm Match ends up with the intended __module__. */
+    PyObject *Match = PyObject_CallFunction(namedtuple, "ss",
+                                            "Match", "a b size");
+    Py_DECREF(namedtuple);
+    if (Match == NULL) {
+        return -1;
+    }
+    state->Match = Match;   /* the state takes over this reference */
+
+    PyObject *type = PyType_FromModuleAndSpec(module, &sequence_matcher_spec,
+                                              NULL);
+    if (type == NULL) {
+        return -1;
+    }
+    state->SequenceMatcher_Type = (PyTypeObject *)type;
+
+    /* Neither call below steals a reference; the module dict gets its
+       own. */
+    if (PyModule_AddType(module, (PyTypeObject *)type) < 0) {
+        return -1;
+    }
+    if (PyModule_AddObjectRef(module, "Match", Match) < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+/* m_traverse: report the object references held in the module state to the
+   cyclic garbage collector.  Py_VISIT returns early if the visitor returns
+   non-zero. */
+static int
+_difflib_traverse(PyObject *module, visitproc visit, void *arg)
+{
+ _difflib_state *state = get_module_state(module);
+ Py_VISIT(state->SequenceMatcher_Type);
+ Py_VISIT(state->Match);
+ return 0;
+}
+
+/* m_clear: release the object references held in the module state.
+   Py_CLEAR leaves each field NULL, so repeated calls are harmless.
+   Always returns 0. */
+static int
+_difflib_clear(PyObject *module)
+{
+ _difflib_state *state = get_module_state(module);
+ Py_CLEAR(state->SequenceMatcher_Type);
+ Py_CLEAR(state->Match);
+ return 0;
+}
+
+/* Multi-phase initialisation slots: run _difflib_exec() at import time and
+   declare that the module supports a per-interpreter GIL and does not need
+   the GIL on free-threaded builds. */
+static PyModuleDef_Slot _difflib_slots[] = {
+ {Py_mod_exec, _difflib_exec},
+ {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
+ /* The module's own state is per-interpreter (no globals); per-instance
+ state on SequenceMatcherObject is unsynchronised, matching the
+ pure-Python contract that callers don't share an instance across
+ threads. Both are fine under free-threading.
+ NOTE(review): with Py_MOD_GIL_NOT_USED, two threads using one instance
+ concurrently would race on C-level state rather than on Python
+ objects -- confirm that relying on the caller contract alone is
+ acceptable here. */
+ {Py_mod_gil, Py_MOD_GIL_NOT_USED},
+ {0, NULL}
+};
+
+/* m_free: release the references held in the module state.  m_clear is
+   not guaranteed to run before a module is deallocated (the cyclic GC may
+   not be involved at all), so the same cleanup is hooked in here too;
+   _difflib_clear() is safe to call more than once. */
+static void
+_difflib_free(void *module)
+{
+    (void)_difflib_clear((PyObject *)module);
+}
+
+/* Module definition: multi-phase init with per-module state. */
+static struct PyModuleDef _difflib_module = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "_difflib",
+    .m_doc = PyDoc_STR(
+        "C accelerator for difflib.SequenceMatcher."),
+    .m_size = sizeof(_difflib_state),
+    .m_slots = _difflib_slots,
+    .m_traverse = _difflib_traverse,
+    .m_clear = _difflib_clear,
+    .m_free = _difflib_free,
+};
+
+/* Import entry point.  With multi-phase initialisation this only hands the
+   module definition back to the import machinery; the actual set-up is
+   done by the Py_mod_exec slot (_difflib_exec). */
+PyMODINIT_FUNC
+PyInit__difflib(void)
+{
+ return PyModuleDef_Init(&_difflib_module);
+}
diff --git a/Modules/clinic/_difflibmodule.c.h b/Modules/clinic/_difflibmodule.c.h
new file mode 100644
index 00000000000000..9d71a41da9ce9c
--- /dev/null
+++ b/Modules/clinic/_difflibmodule.c.h
@@ -0,0 +1,350 @@
+/*[clinic input]
+preserve
+[clinic start generated code]*/
+
+#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+# include "pycore_gc.h" // PyGC_Head
+# include "pycore_runtime.h" // _Py_ID()
+#endif
+#include "pycore_abstract.h" // _PyNumber_Index()
+#include "pycore_modsupport.h" // _PyArg_UnpackKeywords()
+
+PyDoc_STRVAR(_difflib_SequenceMatcher___init____doc__,
+"SequenceMatcher(isjunk=None, a=\'\', b=\'\', autojunk=True)\n"
+"--\n"
+"\n"
+"Construct a SequenceMatcher.\n"
+"\n"
+"See difflib.py for the full documentation; output is identical to the\n"
+"pure-Python SequenceMatcher class.");
+
+static int
+_difflib_SequenceMatcher___init___impl(SequenceMatcherObject *self,
+ PyObject *isjunk, PyObject *a,
+ PyObject *b, int autojunk);
+
+static int
+_difflib_SequenceMatcher___init__(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+ int return_value = -1;
+ #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+ #define NUM_KEYWORDS 4
+ static struct {
+ PyGC_Head _this_is_not_used;
+ PyObject_VAR_HEAD
+ Py_hash_t ob_hash;
+ PyObject *ob_item[NUM_KEYWORDS];
+ } _kwtuple = {
+ .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+ .ob_hash = -1,
+ .ob_item = { &_Py_ID(isjunk), _Py_LATIN1_CHR('a'), _Py_LATIN1_CHR('b'), &_Py_ID(autojunk), },
+ };
+ #undef NUM_KEYWORDS
+ #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+ #else // !Py_BUILD_CORE
+ # define KWTUPLE NULL
+ #endif // !Py_BUILD_CORE
+
+ static const char * const _keywords[] = {"isjunk", "a", "b", "autojunk", NULL};
+ static _PyArg_Parser _parser = {
+ .keywords = _keywords,
+ .fname = "SequenceMatcher",
+ .kwtuple = KWTUPLE,
+ };
+ #undef KWTUPLE
+ PyObject *argsbuf[4];
+ PyObject * const *fastargs;
+ Py_ssize_t nargs = PyTuple_GET_SIZE(args);
+ Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 0;
+ PyObject *isjunk = Py_None;
+ PyObject *a = NULL;
+ PyObject *b = NULL;
+ int autojunk = 1;
+
+ fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
+ /*minpos*/ 0, /*maxpos*/ 4, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+ if (!fastargs) {
+ goto exit;
+ }
+ if (!noptargs) {
+ goto skip_optional_pos;
+ }
+ if (fastargs[0]) {
+ isjunk = fastargs[0];
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ if (fastargs[1]) {
+ a = fastargs[1];
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ if (fastargs[2]) {
+ b = fastargs[2];
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ autojunk = PyObject_IsTrue(fastargs[3]);
+ if (autojunk < 0) {
+ goto exit;
+ }
+skip_optional_pos:
+ return_value = _difflib_SequenceMatcher___init___impl((SequenceMatcherObject *)self, isjunk, a, b, autojunk);
+
+exit:
+ return return_value;
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_set_seq1__doc__,
+"set_seq1($self, a, /)\n"
+"--\n"
+"\n"
+"Set the first sequence to be compared.\n"
+"\n"
+"The second sequence to be compared is not changed.");
+
+#define _DIFFLIB_SEQUENCEMATCHER_SET_SEQ1_METHODDEF \
+ {"set_seq1", (PyCFunction)_difflib_SequenceMatcher_set_seq1, METH_O, _difflib_SequenceMatcher_set_seq1__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_set_seq1_impl(SequenceMatcherObject *self,
+ PyObject *a);
+
+static PyObject *
+_difflib_SequenceMatcher_set_seq1(PyObject *self, PyObject *a)
+{
+ PyObject *return_value = NULL;
+
+ return_value = _difflib_SequenceMatcher_set_seq1_impl((SequenceMatcherObject *)self, a);
+
+ return return_value;
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_set_seq2__doc__,
+"set_seq2($self, b, /)\n"
+"--\n"
+"\n"
+"Set the second sequence to be compared.\n"
+"\n"
+"The first sequence to be compared is not changed.");
+
+#define _DIFFLIB_SEQUENCEMATCHER_SET_SEQ2_METHODDEF \
+ {"set_seq2", (PyCFunction)_difflib_SequenceMatcher_set_seq2, METH_O, _difflib_SequenceMatcher_set_seq2__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_set_seq2_impl(SequenceMatcherObject *self,
+ PyObject *b);
+
+static PyObject *
+_difflib_SequenceMatcher_set_seq2(PyObject *self, PyObject *b)
+{
+ PyObject *return_value = NULL;
+
+ return_value = _difflib_SequenceMatcher_set_seq2_impl((SequenceMatcherObject *)self, b);
+
+ return return_value;
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_set_seqs__doc__,
+"set_seqs($self, a, b, /)\n"
+"--\n"
+"\n"
+"Set the two sequences to be compared.");
+
+#define _DIFFLIB_SEQUENCEMATCHER_SET_SEQS_METHODDEF \
+ {"set_seqs", _PyCFunction_CAST(_difflib_SequenceMatcher_set_seqs), METH_FASTCALL, _difflib_SequenceMatcher_set_seqs__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_set_seqs_impl(SequenceMatcherObject *self,
+ PyObject *a, PyObject *b);
+
+static PyObject *
+_difflib_SequenceMatcher_set_seqs(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
+{
+ PyObject *return_value = NULL;
+ PyObject *a;
+ PyObject *b;
+
+ if (!_PyArg_CheckPositional("set_seqs", nargs, 2, 2)) {
+ goto exit;
+ }
+ a = args[0];
+ b = args[1];
+ return_value = _difflib_SequenceMatcher_set_seqs_impl((SequenceMatcherObject *)self, a, b);
+
+exit:
+ return return_value;
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_find_longest_match__doc__,
+"find_longest_match($self, /, alo=0, ahi=None, blo=0, bhi=None)\n"
+"--\n"
+"\n"
+"Find longest matching block in a[alo:ahi] and b[blo:bhi].\n"
+"\n"
+"By default the entire sequences are searched. Returns Match(i, j, k)\n"
+"such that a[i:i+k] equals b[j:j+k].");
+
+#define _DIFFLIB_SEQUENCEMATCHER_FIND_LONGEST_MATCH_METHODDEF \
+ {"find_longest_match", _PyCFunction_CAST(_difflib_SequenceMatcher_find_longest_match), METH_FASTCALL|METH_KEYWORDS, _difflib_SequenceMatcher_find_longest_match__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_find_longest_match_impl(SequenceMatcherObject *self,
+ Py_ssize_t alo,
+ PyObject *ahi,
+ Py_ssize_t blo,
+ PyObject *bhi);
+
+static PyObject *
+_difflib_SequenceMatcher_find_longest_match(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+ PyObject *return_value = NULL;
+ #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+ #define NUM_KEYWORDS 4
+ static struct {
+ PyGC_Head _this_is_not_used;
+ PyObject_VAR_HEAD
+ Py_hash_t ob_hash;
+ PyObject *ob_item[NUM_KEYWORDS];
+ } _kwtuple = {
+ .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+ .ob_hash = -1,
+ .ob_item = { &_Py_ID(alo), &_Py_ID(ahi), &_Py_ID(blo), &_Py_ID(bhi), },
+ };
+ #undef NUM_KEYWORDS
+ #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+ #else // !Py_BUILD_CORE
+ # define KWTUPLE NULL
+ #endif // !Py_BUILD_CORE
+
+ static const char * const _keywords[] = {"alo", "ahi", "blo", "bhi", NULL};
+ static _PyArg_Parser _parser = {
+ .keywords = _keywords,
+ .fname = "find_longest_match",
+ .kwtuple = KWTUPLE,
+ };
+ #undef KWTUPLE
+ PyObject *argsbuf[4];
+ Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;
+ Py_ssize_t alo = 0;
+ PyObject *ahi = Py_None;
+ Py_ssize_t blo = 0;
+ PyObject *bhi = Py_None;
+
+ args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+ /*minpos*/ 0, /*maxpos*/ 4, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+ if (!args) {
+ goto exit;
+ }
+ if (!noptargs) {
+ goto skip_optional_pos;
+ }
+ if (args[0]) {
+ {
+ Py_ssize_t ival = -1;
+ PyObject *iobj = _PyNumber_Index(args[0]);
+ if (iobj != NULL) {
+ ival = PyLong_AsSsize_t(iobj);
+ Py_DECREF(iobj);
+ }
+ if (ival == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ alo = ival;
+ }
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ if (args[1]) {
+ ahi = args[1];
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ if (args[2]) {
+ {
+ Py_ssize_t ival = -1;
+ PyObject *iobj = _PyNumber_Index(args[2]);
+ if (iobj != NULL) {
+ ival = PyLong_AsSsize_t(iobj);
+ Py_DECREF(iobj);
+ }
+ if (ival == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ blo = ival;
+ }
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ bhi = args[3];
+skip_optional_pos:
+ return_value = _difflib_SequenceMatcher_find_longest_match_impl((SequenceMatcherObject *)self, alo, ahi, blo, bhi);
+
+exit:
+ return return_value;
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_get_matching_blocks__doc__,
+"get_matching_blocks($self, /)\n"
+"--\n"
+"\n"
+"Return list of triples describing matching subsequences.");
+
+#define _DIFFLIB_SEQUENCEMATCHER_GET_MATCHING_BLOCKS_METHODDEF \
+ {"get_matching_blocks", (PyCFunction)_difflib_SequenceMatcher_get_matching_blocks, METH_NOARGS, _difflib_SequenceMatcher_get_matching_blocks__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_get_matching_blocks_impl(SequenceMatcherObject *self);
+
+static PyObject *
+_difflib_SequenceMatcher_get_matching_blocks(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+ return _difflib_SequenceMatcher_get_matching_blocks_impl((SequenceMatcherObject *)self);
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_get_opcodes__doc__,
+"get_opcodes($self, /)\n"
+"--\n"
+"\n"
+"Return list of 5-tuples describing how to turn a into b.");
+
+#define _DIFFLIB_SEQUENCEMATCHER_GET_OPCODES_METHODDEF \
+ {"get_opcodes", (PyCFunction)_difflib_SequenceMatcher_get_opcodes, METH_NOARGS, _difflib_SequenceMatcher_get_opcodes__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_get_opcodes_impl(SequenceMatcherObject *self);
+
+static PyObject *
+_difflib_SequenceMatcher_get_opcodes(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+ return _difflib_SequenceMatcher_get_opcodes_impl((SequenceMatcherObject *)self);
+}
+
+PyDoc_STRVAR(_difflib_SequenceMatcher_ratio__doc__,
+"ratio($self, /)\n"
+"--\n"
+"\n"
+"Return a measure of the sequences\' similarity (float in [0, 1]).");
+
+#define _DIFFLIB_SEQUENCEMATCHER_RATIO_METHODDEF \
+ {"ratio", (PyCFunction)_difflib_SequenceMatcher_ratio, METH_NOARGS, _difflib_SequenceMatcher_ratio__doc__},
+
+static PyObject *
+_difflib_SequenceMatcher_ratio_impl(SequenceMatcherObject *self);
+
+static PyObject *
+_difflib_SequenceMatcher_ratio(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+ return _difflib_SequenceMatcher_ratio_impl((SequenceMatcherObject *)self);
+}
+/*[clinic end generated code: output=359d12bb49bcc3bd input=a9049054013a1b77]*/
diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h
index 8937e666bbbdd5..54ba572604caee 100644
--- a/Python/stdlib_module_names.h
+++ b/Python/stdlib_module_names.h
@@ -32,6 +32,7 @@ static const char* _Py_stdlib_module_names[] = {
"_datetime",
"_dbm",
"_decimal",
+"_difflib",
"_elementtree",
"_frozen_importlib",
"_frozen_importlib_external",
@@ -67,6 +68,7 @@ static const char* _Py_stdlib_module_names[] = {
"_py_warnings",
"_pydatetime",
"_pydecimal",
+"_pydifflib",
"_pyio",
"_pylong",
"_pyrepl",
diff --git a/configure b/configure
index a320a397fe10d6..ba4daef4511846 100755
--- a/configure
+++ b/configure
@@ -812,6 +812,8 @@ MODULE__JSON_FALSE
MODULE__JSON_TRUE
MODULE__HEAPQ_FALSE
MODULE__HEAPQ_TRUE
+MODULE__DIFFLIB_FALSE
+MODULE__DIFFLIB_TRUE
MODULE__CSV_FALSE
MODULE__CSV_TRUE
MODULE__BISECT_FALSE
@@ -32119,6 +32121,28 @@ then :
+fi
+
+
+ if test "$py_cv_module__difflib" != "n/a"
+then :
+ py_cv_module__difflib=yes
+fi
+ if test "$py_cv_module__difflib" = yes; then
+ MODULE__DIFFLIB_TRUE=
+ MODULE__DIFFLIB_FALSE='#'
+else
+ MODULE__DIFFLIB_TRUE='#'
+ MODULE__DIFFLIB_FALSE=
+fi
+
+ as_fn_append MODULE_BLOCK "MODULE__DIFFLIB_STATE=$py_cv_module__difflib$as_nl"
+ if test "x$py_cv_module__difflib" = xyes
+then :
+
+
+
+
fi
@@ -35235,6 +35259,10 @@ if test -z "${MODULE__CSV_TRUE}" && test -z "${MODULE__CSV_FALSE}"; then
as_fn_error $? "conditional \"MODULE__CSV\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${MODULE__DIFFLIB_TRUE}" && test -z "${MODULE__DIFFLIB_FALSE}"; then
+ as_fn_error $? "conditional \"MODULE__DIFFLIB\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${MODULE__HEAPQ_TRUE}" && test -z "${MODULE__HEAPQ_FALSE}"; then
as_fn_error $? "conditional \"MODULE__HEAPQ\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
diff --git a/configure.ac b/configure.ac
index d975d9685caa7d..328525b2ac31d4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8091,6 +8091,7 @@ PY_STDLIB_MOD_SIMPLE([_math_integer])
PY_STDLIB_MOD_SIMPLE([_asyncio])
PY_STDLIB_MOD_SIMPLE([_bisect])
PY_STDLIB_MOD_SIMPLE([_csv])
+PY_STDLIB_MOD_SIMPLE([_difflib])
PY_STDLIB_MOD_SIMPLE([_heapq])
PY_STDLIB_MOD_SIMPLE([_json])
PY_STDLIB_MOD_SIMPLE([_lsprof])