@@ -161,8 +161,6 @@ def __init__(self, isjunk=None, a='', b=''):
161161 # b2j
162162 # for x in b, b2j[x] is a list of the indices (into b)
163163 # at which x appears; junk elements do not appear
164- # b2jhas
165- # b2j.has_key
166164 # fullbcount
167165 # for x in b, fullbcount[x] == the number of times x
168166 # appears in b; only materialized if really needed (used
@@ -188,6 +186,10 @@ def __init__(self, isjunk=None, a='', b=''):
188186 # for x in b, isbjunk(x) == isjunk(x) but much faster;
189187 # it's really the has_key method of a hidden dict.
190188 # DOES NOT WORK for x in a!
189+ # isbpopular
190+ # for x in b, isbpopular(x) is true iff b is reasonably long
191+ # (at least 200 elements) and x accounts for more than 1% of
192+ # its elements. DOES NOT WORK for x in a!
191193
192194 self .isjunk = isjunk
193195 self .a = self .b = None
@@ -266,6 +268,12 @@ def set_seq2(self, b):
266268 # map at all, which stops the central find_longest_match method
267269 # from starting any matching block at a junk element ...
268270 # also creates the fast isbjunk function ...
271+ # b2j also does not contain entries for "popular" elements, meaning
272+ # elements that account for more than 1% of the total elements, and
273+ # when the sequence is reasonably large (>= 200 elements); this can
274+ # be viewed as an adaptive notion of semi-junk, and yields an enormous
275+ # speedup when, e.g., comparing program files with hundreds of
276+ # instances of "return NULL;" ...
269277 # note that this is only called when b changes; so for cross-product
270278 # kinds of matches, it's best to call set_seq2 once, then set_seq1
271279 # repeatedly
@@ -282,32 +290,44 @@ def __chain_b(self):
282290 # out the junk later is much cheaper than building b2j "right"
283291 # from the start.
284292 b = self .b
293+ n = len (b )
285294 self .b2j = b2j = {}
286- self .b2jhas = b2jhas = b2j .has_key
287- for i in xrange (len (b )):
288- elt = b [i ]
289- if b2jhas (elt ):
290- b2j [elt ].append (i )
295+ populardict = {}
296+ for i , elt in enumerate (b ):
297+ if elt in b2j :
298+ indices = b2j [elt ]
299+ if n >= 200 and len (indices ) * 100 > n :
300+ populardict [elt ] = 1
301+ del indices [:]
302+ else :
303+ indices .append (i )
291304 else :
292305 b2j [elt ] = [i ]
293306
307+ # Purge leftover indices for popular elements.
308+ for elt in populardict :
309+ del b2j [elt ]
310+
294311 # Now b2j.keys() contains elements uniquely, and especially when
295312 # the sequence is a string, that's usually a good deal smaller
296313 # than len(string). The difference is the number of isjunk calls
297314 # saved.
298- isjunk , junkdict = self .isjunk , {}
315+ isjunk = self .isjunk
316+ junkdict = {}
299317 if isjunk :
300- for elt in b2j .keys ():
301- if isjunk (elt ):
302- junkdict [elt ] = 1 # value irrelevant; it's a set
303- del b2j [elt ]
318+ for d in populardict , b2j :
319+ for elt in d .keys ():
320+ if isjunk (elt ):
321+ junkdict [elt ] = 1
322+ del d [elt ]
304323
305324 # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
306325 # latter is much faster. Note too that while there may be a
307326 # lot of junk in the sequence, the number of *unique* junk
308327 # elements is probably small. So the memory burden of keeping
309328 # this dict alive is likely trivial compared to the size of b2j.
310329 self .isbjunk = junkdict .has_key
330+ self .isbpopular = populardict .has_key
311331
312332 def find_longest_match (self , alo , ahi , blo , bhi ):
313333 """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -388,6 +408,19 @@ def find_longest_match(self, alo, ahi, blo, bhi):
388408 besti , bestj , bestsize = i - k + 1 , j - k + 1 , k
389409 j2len = newj2len
390410
411+ # Extend the best by non-junk elements on each end. In particular,
412+ # "popular" non-junk elements aren't in b2j, which greatly speeds
413+ # the inner loop above, but also means "the best" match so far
414+ # doesn't contain any junk *or* popular non-junk elements.
415+ while besti > alo and bestj > blo and \
416+ not isbjunk (b [bestj - 1 ]) and \
417+ a [besti - 1 ] == b [bestj - 1 ]:
418+ besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
419+ while besti + bestsize < ahi and bestj + bestsize < bhi and \
420+ not isbjunk (b [bestj + bestsize ]) and \
421+ a [besti + bestsize ] == b [bestj + bestsize ]:
422+ bestsize += 1
423+
391424 # Now that we have a wholly interesting match (albeit possibly
392425 # empty!), we may as well suck up the matching junk on each
393426 # side of it too. Can't think of a good reason not to, and it
@@ -736,12 +769,16 @@ def __init__(self, linejunk=None, charjunk=None):
736769 - `linejunk`: A function that should accept a single string argument,
737770 and return true iff the string is junk. The module-level function
738771 `IS_LINE_JUNK` may be used to filter out lines without visible
739- characters, except for at most one splat ('#').
772+ characters, except for at most one splat ('#'). It is recommended
773+ to leave linejunk None; as of Python 2.3, the underlying
774+ SequenceMatcher class has grown an adaptive notion of "noise" lines
775+ that's better than any static definition the author has ever been
776+ able to craft.
740777
741778 - `charjunk`: A function that should accept a string of length 1. The
742779 module-level function `IS_CHARACTER_JUNK` may be used to filter out
743780 whitespace characters (a blank or tab; **note**: bad idea to include
744- newline in this!).
781+ newline in this!). Use of IS_CHARACTER_JUNK is recommended.
745782 """
746783
747784 self .linejunk = linejunk
@@ -1005,17 +1042,17 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"):
10051042
10061043del re
10071044
1008- def ndiff (a , b , linejunk = IS_LINE_JUNK , charjunk = IS_CHARACTER_JUNK ):
1045+ def ndiff (a , b , linejunk = None , charjunk = IS_CHARACTER_JUNK ):
10091046 r"""
10101047 Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
10111048
10121049 Optional keyword parameters `linejunk` and `charjunk` are for filter
10131050 functions (or None):
10141051
10151052 - linejunk: A function that should accept a single string argument, and
1016- return true iff the string is junk. The default is module-level function
1017- IS_LINE_JUNK, which filters out lines without visible characters, except
1018- for at most one splat ('#') .
1053+ return true iff the string is junk. The default is None, and is
1054+ recommended; as of Python 2.3, an adaptive notion of "noise" lines is
1055+ used that does a good job on its own .
10191056
10201057 - charjunk: A function that should accept a string of length 1. The
10211058 default is module-level function IS_CHARACTER_JUNK, which filters out
0 commit comments