Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions Lib/sre_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# Sets of lowercase characters which have the same uppercase.
_equivalences = (
Expand Down Expand Up @@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
elif _simple(av) and op is not REPEAT:
if _simple(av[2]):
if op is MAX_REPEAT:
emit(REPEAT_ONE)
else:
Expand Down Expand Up @@ -404,10 +405,14 @@ def _bytes_to_codes(b):
assert len(a) * a.itemsize == len(b)
return a.tolist()

def _simple(av):
# check if av is a "simple" operator
lo, hi = av[2].getwidth()
return lo == hi == 1 and av[2][0][0] != SUBPATTERN
def _simple(p):
# check if this subpattern is a "simple" operator
if len(p) != 1:
return False
op, av = p[0]
if op is SUBPATTERN:
return av[0] is None and _simple(av[-1])
return op in _UNIT_CODES

def _generate_overlap_table(prefix):
"""
Expand Down
105 changes: 70 additions & 35 deletions Lib/sre_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def __init__(self, pattern, data=None):
data = []
self.data = data
self.width = None

def dump(self, level=0):
nl = True
seqtypes = (tuple, list)
Expand Down Expand Up @@ -404,6 +405,15 @@ def _escape(source, escape, state):
pass
raise source.error("bad escape %s" % escape, len(escape))

def _uniq(items):
if len(set(items)) == len(items):
return items
newitems = []
for item in items:
if item not in newitems:
newitems.append(item)
return newitems

def _parse_sub(source, state, verbose, nested=True):
# parse an alternation: a|b|c

Expand All @@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
return items[0]

subpattern = SubPattern(state)
subpatternappend = subpattern.append

# check if all items share a common prefix
while True:
Expand All @@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
# move it out of the branch
for item in items:
del item[0]
subpatternappend(prefix)
subpattern.append(prefix)
continue # check next one
break

# check if the branch can be replaced by a character set
set = []
for item in items:
if len(item) != 1 or item[0][0] is not LITERAL:
if len(item) != 1:
break
op, av = item[0]
if op is LITERAL:
set.append((op, av))
elif op is IN and av[0][0] is not NEGATE:
set.extend(av)
else:
break
else:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
subpatternappend((IN, [item[0] for item in items]))
subpattern.append((IN, _uniq(set)))
return subpattern

subpattern.append((BRANCH, (None, items)))
return subpattern

def _parse_sub_cond(source, state, condgroup, verbose):
item_yes = _parse(source, state, verbose)
if source.match("|"):
item_no = _parse(source, state, verbose)
if source.next == "|":
raise source.error("conditional backref with more than two branches")
else:
item_no = None
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern

def _parse(source, state, verbose, first=False):
# parse a simple pattern
subpattern = SubPattern(state)
Expand Down Expand Up @@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
setappend = set.append
## if sourcematch(":"):
## pass # handle character classes
if sourcematch("^"):
setappend((NEGATE, None))
negate = sourcematch("^")
# check remaining characters
start = set[:]
while True:
this = sourceget()
if this is None:
raise source.error("unterminated character set",
source.tell() - here)
if this == "]" and set != start:
if this == "]" and set:
break
elif this[0] == "\\":
code1 = _class_escape(source, this)
Expand Down Expand Up @@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
code1 = code1[1][0]
setappend(code1)

set = _uniq(set)
# XXX: <fl> should move set optimization to compiler!
if _len(set)==1 and set[0][0] is LITERAL:
subpatternappend(set[0]) # optimization
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
if _len(set) == 1 and set[0][0] is LITERAL:
# optimization
if negate:
subpatternappend((NOT_LITERAL, set[0][1]))
else:
subpatternappend(set[0])
else:
# XXX: <fl> should add charmap optimization here
if negate:
set.insert(0, (NEGATE, None))
# charmap optimization can't be added here because
# global flags still are not known
subpatternappend((IN, set))

elif this in REPEAT_CHARS:
Expand All @@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
if source.next == "}":
subpatternappend((LITERAL, _ord(this)))
continue

min, max = 0, MAXREPEAT
lo = hi = ""
while source.next in DIGITS:
Expand All @@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
subpatternappend((LITERAL, _ord(this)))
source.seek(here)
continue

if lo:
min = int(lo)
if min >= MAXREPEAT:
Expand All @@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
item = subpattern[-1:]
else:
item = None
if not item or (_len(item) == 1 and item[0][0] is AT):
if not item or item[0][0] is AT:
raise source.error("nothing to repeat",
source.tell() - here + len(this))
if item[0][0] in _REPEATCODES:
raise source.error("multiple repeat",
source.tell() - here + len(this))
if item[0][0] is SUBPATTERN:
group, add_flags, del_flags, p = item[0][1]
if group is None and not add_flags and not del_flags:
item = p
if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
Expand All @@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
start = source.tell() - 1
group = True
name = None
condgroup = None
add_flags = 0
del_flags = 0
if sourcematch("?"):
Expand Down Expand Up @@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid))
continue

else:
char = sourceget()
if char is None:
Expand All @@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
if sourceget() == ")":
break
continue

elif char in "=!<":
# lookahead assertions
dir = 1
Expand All @@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
else:
subpatternappend((ASSERT_NOT, (dir, p)))
continue

elif char == "(":
# conditional backreference group
condname = source.getuntil(")")
group = None
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
Expand All @@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
msg = "invalid group reference %d" % condgroup
raise source.error(msg, len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose)
if source.match("|"):
item_no = _parse(source, state, verbose)
if source.next == "|":
raise source.error("conditional backref with more than two branches")
else:
item_no = None
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
continue

elif char in FLAGS or char == "-":
# flags
flags = _parse_flags(source, state, char)
Expand All @@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
raise Verbose
continue

add_flags, del_flags = flags
group = None
else:
Expand All @@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
if condgroup:
p = _parse_sub_cond(source, state, condgroup, verbose)
else:
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose)
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose)
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
Expand All @@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
subpatternappend((AT, AT_BEGINNING))

elif this == "$":
subpattern.append((AT, AT_END))
subpatternappend((AT, AT_END))

else:
raise AssertionError("unsupported special character %r" % (char,))

# unpack non-capturing groups
for i in range(len(subpattern))[::-1]:
op, av = subpattern[i]
if op is SUBPATTERN:
group, add_flags, del_flags, p = av
if group is None and not add_flags and not del_flags:
subpattern[i: i+1] = p

return subpattern

def _parse_flags(source, state, char):
Expand Down
26 changes: 12 additions & 14 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1695,20 +1695,18 @@ def test_debug_flag(self):
dump = '''\
SUBPATTERN 1 0 0
LITERAL 46
SUBPATTERN None 0 0
BRANCH
IN
LITERAL 99
LITERAL 104
OR
LITERAL 112
LITERAL 121
SUBPATTERN None 0 0
GROUPREF_EXISTS 1
AT AT_END
ELSE
LITERAL 58
LITERAL 32
BRANCH
IN
LITERAL 99
LITERAL 104
OR
LITERAL 112
LITERAL 121
GROUPREF_EXISTS 1
AT AT_END
ELSE
LITERAL 58
LITERAL 32
'''
self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing
Expand Down
3 changes: 3 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ Library
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``.

- bpo-30340: Enhanced regular expressions optimization. This increased
the performance of matching some patterns up to 25 times.

- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
Now allowed several subsequential inline modifiers at the start of the
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments
Expand Down