Added functions for creating context diffs and unified diffs.

rhettinger · rhettinger · commit e6444c19208f · 2003-06-08T11:07:08.000Z
Documentation update and NEWS item are forthcoming. git-svn-id: http://svn.python.org/projects/python/trunk@33037 6015fed2-1504-0410-9fe1-9d1591cc4771
diff --git a/Lib/difflib.py b/Lib/difflib.py
@@ -6,12 +6,18 @@
 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
     Use SequenceMatcher to return list of the best "good enough" matches.
 
+Function context_diff(a, b):
+    For two lists of strings, return a delta in context diff format.
+
 Function ndiff(a, b):
     Return a delta: the difference between `a` and `b` (lists of strings).
 
 Function restore(delta, which):
     Return one of the two sequences that generated an ndiff delta.
 
+Function unified_diff(a, b):
+    For two lists of strings, return a delta in unified diff format.
+
 Class SequenceMatcher:
     A flexible class for comparing pairs of sequences of any type.
 
@@ -20,7 +26,8 @@
 """
 
 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
-           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK']
+           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
+           'unified_diff']
 
 class SequenceMatcher:
 
@@ -532,6 +539,54 @@ def get_opcodes(self):
                 answer.append( ('equal', ai, i, bj, j) )
         return answer
 
+    def get_grouped_opcodes(self, n=3):
+        """ Isolate change clusters by eliminating ranges with no changes.
+
+        Return a generator of groups with up to n lines of context.
+        Each group is in the same format as returned by get_opcodes().
+
+        >>> from pprint import pprint
+        >>> a = map(str, range(1,40))
+        >>> b = a[:]
+        >>> b[8:8] = ['i']     # Make an insertion
+        >>> b[20] += 'x'       # Make a replacement
+        >>> b[23:28] = []      # Make a deletion
+        >>> b[30] += 'y'       # Make another replacement
+        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
+        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
+         [('equal', 16, 19, 17, 20),
+          ('replace', 19, 20, 20, 21),
+          ('equal', 20, 22, 21, 23),
+          ('delete', 22, 27, 23, 23),
+          ('equal', 27, 30, 23, 26)],
+         [('equal', 31, 34, 27, 30),
+          ('replace', 34, 35, 30, 31),
+          ('equal', 35, 38, 31, 34)]]
+        """
+        # NOTE(review): codes[0] raises IndexError if get_opcodes() returns [].
+        codes = self.get_opcodes()
+        # Clip leading/trailing 'equal' opcodes to n lines, in place in codes.
+        if codes[0][0] == 'equal':
+            tag, i1, i2, j1, j2 = codes[0]
+            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
+        if codes[-1][0] == 'equal':
+            tag, i1, i2, j1, j2 = codes[-1]
+            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
+        # An 'equal' run longer than 2*n lines separates one group from the next.
+        nn = n + n
+        group = []
+        for tag, i1, i2, j1, j2 in codes:
+            # End the current group and start a new one whenever
+            # there is a large range with no changes.
+            if tag == 'equal' and i2-i1 > nn:
+                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
+                yield group
+                group = []
+                i1, j1 = max(i1, i2-n), max(j1, j2-n)
+            group.append((tag, i1, i2, j1 ,j2))
+        if group and not (len(group)==1 and group[0][0] == 'equal'):
+            yield group
+
     def ratio(self):
         """Return a measure of the sequences' similarity (float in [0,1]).
 
@@ -1042,6 +1097,150 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"):
 
 del re
 
+
+def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
+                 tofiledate='', n=3, lineterm='\n'):
+    r"""
+    Compare two sequences of lines; generate the delta as a unified diff.
+
+    Unified diffs are a compact way of showing line changes and a few
+    lines of context.  The number of context lines is set by 'n' which
+    defaults to three.
+
+    By default, the diff control lines (those with ---, +++, or @@) are
+    created with a trailing newline.  This is helpful so that inputs
+    created from file.readlines() result in diffs that are suitable for
+    file.writelines() since both the inputs and outputs have trailing
+    newlines.
+
+    For inputs that do not have trailing newlines, set the lineterm
+    argument to "" so that the output will be uniformly newline free.
+
+    The unidiff format normally has a header for filenames and modification
+    times.  Any or all of these may be specified using strings for
+    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
+    times are normally expressed in the format returned by time.ctime().
+
+    Example:
+
+    >>> for line in unified_diff('one two three four'.split(),
+    ...             'zero one tree four'.split(), 'Original', 'Current',
+    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
+    ...             lineterm=''):
+    ...     print line
+    --- Original Sat Jan 26 23:30:50 1991
+    +++ Current Fri Jun 06 10:20:52 2003
+    @@ -1,4 +1,4 @@
+    +zero
+     one
+    -two
+    -three
+    +tree
+     four
+    """
+    # The file header is emitted with the first group, so no groups, no output.
+    started = False
+    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
+        if not started:
+            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
+            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
+            started = True
+        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
+        yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'equal':
+                for line in a[i1:i2]:
+                    yield ' ' + line
+                continue
+            if tag == 'replace' or tag == 'delete':
+                for line in a[i1:i2]:
+                    yield '-' + line
+            if tag == 'replace' or tag == 'insert':
+                for line in b[j1:j2]:
+                    yield '+' + line
+
+# See http://www.unix.org/single_unix_specification/
+def context_diff(a, b, fromfile='', tofile='',
+                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
+    r"""
+    Compare two sequences of lines; generate the delta as a context diff.
+
+    Context diffs are a compact way of showing line changes and a few
+    lines of context.  The number of context lines is set by 'n' which
+    defaults to three.
+
+    By default, the diff control lines (those with *** or ---) are
+    created with a trailing newline.  This is helpful so that inputs
+    created from file.readlines() result in diffs that are suitable for
+    file.writelines() since both the inputs and outputs have trailing
+    newlines.
+
+    For inputs that do not have trailing newlines, set the lineterm
+    argument to "" so that the output will be uniformly newline free.
+
+    The context diff format normally has a header for filenames and
+    modification times.  Any or all of these may be specified using
+    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
+    The modification times are normally expressed in the format returned
+    by time.ctime().  If not specified, the strings default to empty strings.
+
+    Example:
+
+    >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
+    ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current',
+    ...       'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')),
+    *** Original Sat Jan 26 23:30:50 1991
+    --- Current Fri Jun 06 10:22:46 2003
+    ***************
+    *** 1,4 ****
+      one
+    ! two
+    ! three
+      four
+    --- 1,4 ----
+    + zero
+      one
+    ! tree
+      four
+    """
+    # The file header is emitted with the first group, so no groups, no output.
+    started = False
+    prefixmap = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
+    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
+        if not started:
+            yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
+            yield '--- %s %s%s' % (tofile, tofiledate, lineterm)
+            started = True
+        yield '***************%s' % (lineterm,)
+        if group[-1][2] - group[0][1] >= 2:
+            yield '*** %d,%d ****%s' % (group[0][1]+1, group[-1][2], lineterm)
+        else:
+            yield '*** %d ****%s' % (group[-1][2], lineterm)
+        empty = True
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'replace' or tag == 'delete':
+                empty = False
+                break
+        if not empty:
+            for tag, i1, i2, j1, j2 in group:
+                if tag != 'insert':
+                    for line in a[i1:i2]:
+                        yield prefixmap[tag] + line
+        if group[-1][4] - group[0][3] >= 2:
+            yield '--- %d,%d ----%s' % (group[0][3]+1, group[-1][4], lineterm)
+        else:
+            yield '--- %d ----%s' % (group[-1][4], lineterm)
+        empty = True
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'replace' or tag == 'insert':
+                empty = False
+                break
+        if not empty:
+            for tag, i1, i2, j1, j2 in group:
+                if tag != 'delete':
+                    for line in b[j1:j2]:
+                        yield prefixmap[tag] + line
+
 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
     r"""
     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.