|
6 | 6 | Function get_close_matches(word, possibilities, n=3, cutoff=0.6): |
7 | 7 | Use SequenceMatcher to return list of the best "good enough" matches. |
8 | 8 |
|
| 9 | +Function context_diff(a, b): |
| 10 | + For two lists of strings, return a delta in context diff format. |
| 11 | +
|
9 | 12 | Function ndiff(a, b): |
10 | 13 | Return a delta: the difference between `a` and `b` (lists of strings). |
11 | 14 |
|
12 | 15 | Function restore(delta, which): |
13 | 16 | Return one of the two sequences that generated an ndiff delta. |
14 | 17 |
|
| 18 | +Function unified_diff(a, b): |
| 19 | + For two lists of strings, return a delta in unified diff format. |
| 20 | +
|
15 | 21 | Class SequenceMatcher: |
16 | 22 | A flexible class for comparing pairs of sequences of any type. |
17 | 23 |
|
|
20 | 26 | """ |
21 | 27 |
|
22 | 28 | __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', |
23 | | - 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK'] |
| 29 | + 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', |
| 30 | + 'unified_diff'] |
24 | 31 |
|
25 | 32 | class SequenceMatcher: |
26 | 33 |
|
@@ -532,6 +539,54 @@ def get_opcodes(self): |
532 | 539 | answer.append( ('equal', ai, i, bj, j) ) |
533 | 540 | return answer |
534 | 541 |
|
| 542 | + def get_grouped_opcodes(self, n=3): |
| 543 | + """ Isolate change clusters by eliminating ranges with no changes. |
| 544 | +
|
| 545 | + Return a generator of groups with upto n lines of context. |
| 546 | + Each group is in the same format as returned by get_opcodes(). |
| 547 | +
|
| 548 | + >>> from pprint import pprint |
| 549 | + >>> a = map(str, range(1,40)) |
| 550 | + >>> b = a[:] |
| 551 | + >>> b[8:8] = ['i'] # Make an insertion |
| 552 | + >>> b[20] += 'x' # Make a replacement |
| 553 | + >>> b[23:28] = [] # Make a deletion |
| 554 | + >>> b[30] += 'y' # Make another replacement |
| 555 | + >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) |
| 556 | + [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], |
| 557 | + [('equal', 16, 19, 17, 20), |
| 558 | + ('replace', 19, 20, 20, 21), |
| 559 | + ('equal', 20, 22, 21, 23), |
| 560 | + ('delete', 22, 27, 23, 23), |
| 561 | + ('equal', 27, 30, 23, 26)], |
| 562 | + [('equal', 31, 34, 27, 30), |
| 563 | + ('replace', 34, 35, 30, 31), |
| 564 | + ('equal', 35, 38, 31, 34)]] |
| 565 | + """ |
| 566 | + |
| 567 | + codes = self.get_opcodes() |
| 568 | + # Fixup leading and trailing groups if they show no changes. |
| 569 | + if codes[0][0] == 'equal': |
| 570 | + tag, i1, i2, j1, j2 = codes[0] |
| 571 | + codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 |
| 572 | + if codes[-1][0] == 'equal': |
| 573 | + tag, i1, i2, j1, j2 = codes[-1] |
| 574 | + codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) |
| 575 | + |
| 576 | + nn = n + n |
| 577 | + group = [] |
| 578 | + for tag, i1, i2, j1, j2 in codes: |
| 579 | + # End the current group and start a new one whenever |
| 580 | + # there is a large range with no changes. |
| 581 | + if tag == 'equal' and i2-i1 > nn: |
| 582 | + group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) |
| 583 | + yield group |
| 584 | + group = [] |
| 585 | + i1, j1 = max(i1, i2-n), max(j1, j2-n) |
| 586 | + group.append((tag, i1, i2, j1 ,j2)) |
| 587 | + if group and not (len(group)==1 and group[0][0] == 'equal'): |
| 588 | + yield group |
| 589 | + |
535 | 590 | def ratio(self): |
536 | 591 | """Return a measure of the sequences' similarity (float in [0,1]). |
537 | 592 |
|
@@ -1042,6 +1097,150 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"): |
1042 | 1097 |
|
1043 | 1098 | del re |
1044 | 1099 |
|
| 1100 | + |
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
    times are normally expressed in the format returned by time.ctime().

    Nothing at all is generated (not even the header) when the two
    sequences produce no change groups.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print(line)
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            # The file header is emitted once, before the first hunk.
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            started = True

        # Span of the whole hunk in each sequence (0-based, half-open).
        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]

        # Hunk headers use 1-based line numbers, but an empty range is
        # identified by the line just before it, which is the 0-based
        # start index itself (so adding to an empty file gives "-0,0").
        from_start = i1 + 1
        if i2 == i1:
            from_start = i1
        to_start = j1 + 1
        if j2 == j1:
            to_start = j1
        yield "@@ -%d,%d +%d,%d @@%s" % (from_start, i2-i1,
                                         to_start, j2-j1, lineterm)

        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            # A 'replace' is shown as the removed lines followed by the
            # added lines, so both branches below may run for one opcode.
            if tag == 'replace' or tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            if tag == 'replace' or tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line
| 1161 | + |
| 1162 | +# See http://www.unix.org/single_unix_specification/ |
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with *** or ---) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The context diff format normally has a header for filenames and
    modification times.  Any or all of these may be specified using
    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the format returned
    by time.ctime().  If not specified, the strings default to blanks.

    Every content line carries a two-character prefix: '! ' for a line
    that differs between the sequences, '- ' for a line only in 'a',
    '+ ' for a line only in 'b', and two spaces for an unchanged line.
    Nothing at all is generated (not even the header) when the two
    sequences produce no change groups.

    Example:

    >>> import sys
    >>> sys.stdout.writelines(context_diff(
    ...       'one\ntwo\nthree\nfour\n'.splitlines(True),
    ...       'zero\none\ntree\nfour\n'.splitlines(True),
    ...       'Original', 'Current',
    ...       'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003'))
    *** Original Sat Jan 26 23:30:50 1991
    --- Current Fri Jun 06 10:22:46 2003
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    started = False
    # All four prefixes are two characters wide so that the line text
    # starts in the same column whatever kind of line it is.
    prefixmap = {'insert': '+ ', 'delete': '- ', 'replace': '! ',
                 'equal': '  '}
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            # The file header is emitted once, before the first hunk.
            yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '--- %s %s%s' % (tofile, tofiledate, lineterm)
            started = True

        first, last = group[0], group[-1]
        tags = [opcode[0] for opcode in group]

        yield '***************%s' % (lineterm,)

        # "From" half.  Ranges are printed as 1-based "first,last"; a range
        # of fewer than two lines is printed as its end line number only.
        if last[2] - first[1] >= 2:
            yield '*** %d,%d ****%s' % (first[1]+1, last[2], lineterm)
        else:
            yield '*** %d ****%s' % (last[2], lineterm)
        # The lines of 'a' are listed only if the hunk removes or changes
        # something; otherwise the range line stands alone.
        if 'replace' in tags or 'delete' in tags:
            for tag, i1, i2, j1, j2 in group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefixmap[tag] + line

        # "To" half, built the same way from the lines of 'b'.
        if last[4] - first[3] >= 2:
            yield '--- %d,%d ----%s' % (first[3]+1, last[4], lineterm)
        else:
            yield '--- %d ----%s' % (last[4], lineterm)
        # The lines of 'b' are listed only if the hunk adds or changes
        # something.
        if 'replace' in tags or 'insert' in tags:
            for tag, i1, i2, j1, j2 in group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefixmap[tag] + line
| 1243 | + |
1045 | 1244 | def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): |
1046 | 1245 | r""" |
1047 | 1246 | Compare `a` and `b` (lists of strings); return a `Differ`-style delta. |
|
0 commit comments