From 9f0702c4822a680dedf127df3e1d7a1411b64e55 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 11 May 2017 12:38:01 +0300 Subject: [PATCH 1/4] bpo-30340: Enhanced regular expressions optimization. This increased the performance of matching some patterns up to 25 times. --- Lib/sre_compile.py | 7 +-- Lib/sre_parse.py | 140 +++++++++++++++++++++++++++++++++----------- Lib/test/test_re.py | 26 ++++---- Misc/NEWS | 3 + 4 files changed, 121 insertions(+), 55 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index cebecb93c0ab80..19c56698dcdd6d 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -125,7 +125,7 @@ def _compile(code, pattern, flags): elif op in REPEATING_CODES: if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator %r" % (op,)) - elif _simple(av) and op is not REPEAT: + if av[2].getwidth() == (1, 1) and not av[2].hasgroups(): if op is MAX_REPEAT: emit(REPEAT_ONE) else: @@ -404,11 +404,6 @@ def _bytes_to_codes(b): assert len(a) * a.itemsize == len(b) return a.tolist() -def _simple(av): - # check if av is a "simple" operator - lo, hi = av[2].getwidth() - return lo == hi == 1 and av[2][0][0] != SUBPATTERN - def _generate_overlap_table(prefix): """ Generate an overlap table for the following prefix. 
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index d8d1bd552fbee0..b441902c7ca0ca 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -27,6 +27,7 @@ _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) +_ASSERT_CODES = frozenset({ASSERT, ASSERT_NOT}) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -114,6 +115,8 @@ def __init__(self, pattern, data=None): data = [] self.data = data self.width = None + self._hasgroups = None + def dump(self, level=0): nl = True seqtypes = (tuple, list) @@ -219,6 +222,38 @@ def getwidth(self): self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) return self.width + def hasgroups(self): + # determine whether this subpattern contains capturing groups + if self._hasgroups is not None: + return self._hasgroups + v = False + for op, av in self.data: + if op is BRANCH: + for av in av[1]: + v = av.hasgroups() + if v: + break + elif op is CALL: + v = av.hasgroups() + elif op is SUBPATTERN: + v = av[0] is not None or av[-1].hasgroups() + elif op in _REPEATCODES: + v = av[2].hasgroups() + elif op in _ASSERT_CODES: + v = av[1].hasgroups() + elif op is GROUPREF_EXISTS: + v = av[1].hasgroups() + if not v and av[2] is not None: + v = av[2].hasgroups() + elif op is SUCCESS: + break + else: + continue + if v: + break + self._hasgroups = v + return v + class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str) @@ -404,6 +439,16 @@ def _escape(source, escape, state): pass raise source.error("bad escape %s" % escape, len(escape)) +def _uniq(items): + return items + if len(set(items)) == len(items): + return items + newitems = [] + for item in items: + if item not in newitems: + newitems.append(item) + return newitems + def _parse_sub(source, state, verbose, nested=True): # parse an alternation: a|b|c @@ -420,7 +465,6 @@ def _parse_sub(source, state, verbose, nested=True): return items[0] subpattern = SubPattern(state) - subpatternappend = 
subpattern.append # check if all items share a common prefix while True: @@ -437,35 +481,31 @@ def _parse_sub(source, state, verbose, nested=True): # move it out of the branch for item in items: del item[0] - subpatternappend(prefix) + subpattern.append(prefix) continue # check next one break # check if the branch can be replaced by a character set + set = [] for item in items: - if len(item) != 1 or item[0][0] is not LITERAL: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - subpatternappend((IN, [item[0] for item in items])) + subpattern.append((IN, _uniq(set))) return subpattern subpattern.append((BRANCH, (None, items))) return subpattern -def _parse_sub_cond(source, state, condgroup, verbose): - item_yes = _parse(source, state, verbose) - if source.match("|"): - item_no = _parse(source, state, verbose) - if source.next == "|": - raise source.error("conditional backref with more than two branches") - else: - item_no = None - subpattern = SubPattern(state) - subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - return subpattern - def _parse(source, state, verbose, first=False): # parse a simple pattern subpattern = SubPattern(state) @@ -511,16 +551,14 @@ def _parse(source, state, verbose, first=False): setappend = set.append ## if sourcematch(":"): ## pass # handle character classes - if sourcematch("^"): - setappend((NEGATE, None)) + negate = sourcematch("^") # check remaining characters - start = set[:] while True: this = sourceget() if this is None: raise source.error("unterminated character set", source.tell() - here) - if this == "]" and set != start: + if this == "]" and set: break elif this[0] == "\\": code1 = _class_escape(source, this) @@ -556,13 +594,19 @@ def _parse(source, state, verbose, first=False): code1 = 
code1[1][0] setappend(code1) + set = _uniq(set) # XXX: should move set optimization to compiler! - if _len(set)==1 and set[0][0] is LITERAL: - subpatternappend(set[0]) # optimization - elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpatternappend((NOT_LITERAL, set[1][1])) # optimization + if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) else: - # XXX: should add charmap optimization here + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known subpatternappend((IN, set)) elif this in REPEAT_CHARS: @@ -579,6 +623,7 @@ def _parse(source, state, verbose, first=False): if source.next == "}": subpatternappend((LITERAL, _ord(this))) continue + min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: @@ -592,6 +637,7 @@ def _parse(source, state, verbose, first=False): subpatternappend((LITERAL, _ord(this))) source.seek(here) continue + if lo: min = int(lo) if min >= MAXREPEAT: @@ -610,12 +656,16 @@ def _parse(source, state, verbose, first=False): item = subpattern[-1:] else: item = None - if not item or (_len(item) == 1 and item[0][0] is AT): + if not item or item[0][0] is AT: raise source.error("nothing to repeat", source.tell() - here + len(this)) if item[0][0] in _REPEATCODES: raise source.error("multiple repeat", source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: + group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -628,7 +678,6 @@ def _parse(source, state, verbose, first=False): start = source.tell() - 1 group = True name = None - condgroup = None add_flags = 0 del_flags = 0 if sourcematch("?"): @@ -660,6 +709,7 @@ def _parse(source, state, verbose, first=False): state.checklookbehindgroup(gid, source) 
subpatternappend((GROUPREF, gid)) continue + else: char = sourceget() if char is None: @@ -678,6 +728,7 @@ def _parse(source, state, verbose, first=False): if sourceget() == ")": break continue + elif char in "=!<": # lookahead assertions dir = 1 @@ -704,10 +755,10 @@ def _parse(source, state, verbose, first=False): else: subpatternappend((ASSERT_NOT, (dir, p))) continue + elif char == "(": # conditional backreference group condname = source.getuntil(")") - group = None if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -728,6 +779,19 @@ def _parse(source, state, verbose, first=False): msg = "invalid group reference %d" % condgroup raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose) + if source.match("|"): + item_no = _parse(source, state, verbose) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + elif char in FLAGS or char == "-": # flags flags = _parse_flags(source, state, char) @@ -744,6 +808,7 @@ def _parse(source, state, verbose, first=False): if (state.flags & SRE_FLAG_VERBOSE) and not verbose: raise Verbose continue + add_flags, del_flags = flags group = None else: @@ -756,12 +821,9 @@ def _parse(source, state, verbose, first=False): group = state.opengroup(name) except error as err: raise source.error(err.msg, len(name) + 1) from None - if condgroup: - p = _parse_sub_cond(source, state, condgroup, verbose) - else: - sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and - not (del_flags & SRE_FLAG_VERBOSE)) - p = _parse_sub(source, state, sub_verbose) + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = 
_parse_sub(source, state, sub_verbose) if not source.match(")"): raise source.error("missing ), unterminated subpattern", source.tell() - start) @@ -773,11 +835,19 @@ def _parse(source, state, verbose, first=False): subpatternappend((AT, AT_BEGINNING)) elif this == "$": - subpattern.append((AT, AT_END)) + subpatternappend((AT, AT_END)) else: raise AssertionError("unsupported special character %r" % (char,)) + # unpack non-capturing groups + for i in range(len(subpattern))[::-1]: + op, av = subpattern[i] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group is None and not add_flags and not del_flags: + subpattern[i: i+1] = p + return subpattern def _parse_flags(source, state, char): diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 4d71eea517e3e4..5d36b54680d905 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1695,20 +1695,18 @@ def test_debug_flag(self): dump = '''\ SUBPATTERN 1 0 0 LITERAL 46 -SUBPATTERN None 0 0 - BRANCH - IN - LITERAL 99 - LITERAL 104 - OR - LITERAL 112 - LITERAL 121 -SUBPATTERN None 0 0 - GROUPREF_EXISTS 1 - AT AT_END - ELSE - LITERAL 58 - LITERAL 32 +BRANCH + IN + LITERAL 99 + LITERAL 104 +OR + LITERAL 112 + LITERAL 121 +GROUPREF_EXISTS 1 + AT AT_END +ELSE + LITERAL 58 + LITERAL 32 ''' self.assertEqual(out.getvalue(), dump) # Debug output is output again even a second time (bypassing diff --git a/Misc/NEWS b/Misc/NEWS index 5ee7ea65ff1df5..abb4c579462b05 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -320,6 +320,9 @@ Extension Modules Library ------- +- bpo-30340: Enhanced regular expressions optimization. This increased + the performance of matching some patterns up to 25 times. + - bpo-30298: Weaken the condition of deprecation warnings for inline modifiers. Now allowed several subsequential inline modifiers at the start of the pattern (e.g. ``'(?i)(?s)...'``). 
In verbose mode whitespaces and comments From 1f6d33af3d0e3d98d8eab16d03320262cb025aa8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 11 May 2017 19:21:45 +0300 Subject: [PATCH 2/4] Make _uniq() actually work. --- Lib/sre_parse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index b441902c7ca0ca..3b8a331ffb0519 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -440,7 +440,6 @@ def _escape(source, escape, state): raise source.error("bad escape %s" % escape, len(escape)) def _uniq(items): - return items if len(set(items)) == len(items): return items newitems = [] From 8a611100655a9a42c3eee27425a94f73c7bf8745 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 13 May 2017 18:18:08 +0300 Subject: [PATCH 3/4] Fix compiling to REPEAT_ONE. --- Lib/sre_compile.py | 12 +++++++++++- Lib/sre_parse.py | 32 -------------------------------- 2 files changed, 11 insertions(+), 33 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 19c56698dcdd6d..aeb89bcc7b4739 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -20,6 +20,7 @@ _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} +_UNIT_CODES = _LITERAL_CODES | {ANY, IN} # Sets of lowercase characters which have the same uppercase. 
_equivalences = ( @@ -125,7 +126,7 @@ def _compile(code, pattern, flags): elif op in REPEATING_CODES: if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator %r" % (op,)) - if av[2].getwidth() == (1, 1) and not av[2].hasgroups(): + if _simple(av[2]): if op is MAX_REPEAT: emit(REPEAT_ONE) else: @@ -404,6 +405,15 @@ def _bytes_to_codes(b): assert len(a) * a.itemsize == len(b) return a.tolist() +def _simple(p): + # check if this subpattern is a "simple" operator + if len(p) != 1: + return False + op, av = p[0] + if op is SUBPATTERN: + return av[0] is None and _simple(av[-1]) + return op in _UNIT_CODES + def _generate_overlap_table(prefix): """ Generate an overlap table for the following prefix. diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 3b8a331ffb0519..beceeafa4b98a2 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -222,38 +222,6 @@ def getwidth(self): self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) return self.width - def hasgroups(self): - # determine whether this subpattern contains capturing groups - if self._hasgroups is not None: - return self._hasgroups - v = False - for op, av in self.data: - if op is BRANCH: - for av in av[1]: - v = av.hasgroups() - if v: - break - elif op is CALL: - v = av.hasgroups() - elif op is SUBPATTERN: - v = av[0] is not None or av[-1].hasgroups() - elif op in _REPEATCODES: - v = av[2].hasgroups() - elif op in _ASSERT_CODES: - v = av[1].hasgroups() - elif op is GROUPREF_EXISTS: - v = av[1].hasgroups() - if not v and av[2] is not None: - v = av[2].hasgroups() - elif op is SUCCESS: - break - else: - continue - if v: - break - self._hasgroups = v - return v - class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str) From dcd7cccaa269ff32d4a9f5e30176cd538d2694a0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 13 May 2017 18:22:54 +0300 Subject: [PATCH 4/4] Remove unused code. 
--- Lib/sre_parse.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index beceeafa4b98a2..f72408f010b1fa 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -27,7 +27,6 @@ _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) -_ASSERT_CODES = frozenset({ASSERT, ASSERT_NOT}) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -115,7 +114,6 @@ def __init__(self, pattern, data=None): data = [] self.data = data self.width = None - self._hasgroups = None def dump(self, level=0): nl = True