From 9f0702c4822a680dedf127df3e1d7a1411b64e55 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Thu, 11 May 2017 12:38:01 +0300
Subject: [PATCH 1/4] bpo-30340: Enhanced regular expressions optimization.

This increases the performance of matching some patterns by up to 25 times.
---
 Lib/sre_compile.py  |   7 +--
 Lib/sre_parse.py    | 140 +++++++++++++++++++++++++++++++++-----------
 Lib/test/test_re.py |  26 ++++----
 Misc/NEWS           |   3 +
 4 files changed, 121 insertions(+), 55 deletions(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index cebecb93c0ab80..19c56698dcdd6d 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -125,7 +125,7 @@ def _compile(code, pattern, flags):
         elif op in REPEATING_CODES:
             if flags & SRE_FLAG_TEMPLATE:
                 raise error("internal: unsupported template operator %r" % (op,))
-            elif _simple(av) and op is not REPEAT:
+            if av[2].getwidth() == (1, 1) and not av[2].hasgroups():
                 if op is MAX_REPEAT:
                     emit(REPEAT_ONE)
                 else:
@@ -404,11 +404,6 @@ def _bytes_to_codes(b):
     assert len(a) * a.itemsize == len(b)
     return a.tolist()
 
-def _simple(av):
-    # check if av is a "simple" operator
-    lo, hi = av[2].getwidth()
-    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
-
 def _generate_overlap_table(prefix):
     """
     Generate an overlap table for the following prefix.
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index d8d1bd552fbee0..b441902c7ca0ca 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -27,6 +27,7 @@
 
 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
+_ASSERT_CODES = frozenset({ASSERT, ASSERT_NOT})
 
 ESCAPES = {
     r"\a": (LITERAL, ord("\a")),
@@ -114,6 +115,8 @@ def __init__(self, pattern, data=None):
             data = []
         self.data = data
         self.width = None
+        self._hasgroups = None
+
     def dump(self, level=0):
         nl = True
         seqtypes = (tuple, list)
@@ -219,6 +222,38 @@ def getwidth(self):
         self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
         return self.width
 
+    def hasgroups(self):
+        # determine whether this subpattern contains capturing groups
+        if self._hasgroups is not None:
+            return self._hasgroups
+        v = False
+        for op, av in self.data:
+            if op is BRANCH:
+                for av in av[1]:
+                    v = av.hasgroups()
+                    if v:
+                        break
+            elif op is CALL:
+                v = av.hasgroups()
+            elif op is SUBPATTERN:
+                v = av[0] is not None or av[-1].hasgroups()
+            elif op in _REPEATCODES:
+                v = av[2].hasgroups()
+            elif op in _ASSERT_CODES:
+                v = av[1].hasgroups()
+            elif op is GROUPREF_EXISTS:
+                v = av[1].hasgroups()
+                if not v and av[2] is not None:
+                    v = av[2].hasgroups()
+            elif op is SUCCESS:
+                break
+            else:
+                continue
+            if v:
+                break
+        self._hasgroups = v
+        return v
+
 class Tokenizer:
     def __init__(self, string):
         self.istext = isinstance(string, str)
@@ -404,6 +439,16 @@ def _escape(source, escape, state):
         pass
     raise source.error("bad escape %s" % escape, len(escape))
 
+def _uniq(items):
+    return items
+    if len(set(items)) == len(items):
+        return items
+    newitems = []
+    for item in items:
+        if item not in newitems:
+            newitems.append(item)
+    return newitems
+
 def _parse_sub(source, state, verbose, nested=True):
     # parse an alternation: a|b|c
 
@@ -420,7 +465,6 @@ def _parse_sub(source, state, verbose, nested=True):
         return items[0]
 
     subpattern = SubPattern(state)
-    subpatternappend = subpattern.append
 
     # check if all items share a common prefix
     while True:
@@ -437,35 +481,31 @@ def _parse_sub(source, state, verbose, nested=True):
             # move it out of the branch
             for item in items:
                 del item[0]
-            subpatternappend(prefix)
+            subpattern.append(prefix)
             continue # check next one
         break
 
     # check if the branch can be replaced by a character set
+    set = []
     for item in items:
-        if len(item) != 1 or item[0][0] is not LITERAL:
+        if len(item) != 1:
+            break
+        op, av = item[0]
+        if op is LITERAL:
+            set.append((op, av))
+        elif op is IN and av[0][0] is not NEGATE:
+            set.extend(av)
+        else:
             break
     else:
         # we can store this as a character set instead of a
         # branch (the compiler may optimize this even more)
-        subpatternappend((IN, [item[0] for item in items]))
+        subpattern.append((IN, _uniq(set)))
         return subpattern
 
     subpattern.append((BRANCH, (None, items)))
     return subpattern
 
-def _parse_sub_cond(source, state, condgroup, verbose):
-    item_yes = _parse(source, state, verbose)
-    if source.match("|"):
-        item_no = _parse(source, state, verbose)
-        if source.next == "|":
-            raise source.error("conditional backref with more than two branches")
-    else:
-        item_no = None
-    subpattern = SubPattern(state)
-    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
-    return subpattern
-
 def _parse(source, state, verbose, first=False):
     # parse a simple pattern
     subpattern = SubPattern(state)
@@ -511,16 +551,14 @@ def _parse(source, state, verbose, first=False):
             setappend = set.append
 ##          if sourcematch(":"):
 ##              pass # handle character classes
-            if sourcematch("^"):
-                setappend((NEGATE, None))
+            negate = sourcematch("^")
             # check remaining characters
-            start = set[:]
             while True:
                 this = sourceget()
                 if this is None:
                     raise source.error("unterminated character set",
                                        source.tell() - here)
-                if this == "]" and set != start:
+                if this == "]" and set:
                     break
                 elif this[0] == "\\":
                     code1 = _class_escape(source, this)
@@ -556,13 +594,19 @@ def _parse(source, state, verbose, first=False):
                         code1 = code1[1][0]
                     setappend(code1)
 
+            set = _uniq(set)
             # XXX: <fl> should move set optimization to compiler!
-            if _len(set)==1 and set[0][0] is LITERAL:
-                subpatternappend(set[0]) # optimization
-            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
-                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
+            if _len(set) == 1 and set[0][0] is LITERAL:
+                # optimization
+                if negate:
+                    subpatternappend((NOT_LITERAL, set[0][1]))
+                else:
+                    subpatternappend(set[0])
             else:
-                # XXX: <fl> should add charmap optimization here
+                if negate:
+                    set.insert(0, (NEGATE, None))
+                # charmap optimization can't be added here because
+                # global flags still are not known
                 subpatternappend((IN, set))
 
         elif this in REPEAT_CHARS:
@@ -579,6 +623,7 @@ def _parse(source, state, verbose, first=False):
                 if source.next == "}":
                     subpatternappend((LITERAL, _ord(this)))
                     continue
+
                 min, max = 0, MAXREPEAT
                 lo = hi = ""
                 while source.next in DIGITS:
@@ -592,6 +637,7 @@ def _parse(source, state, verbose, first=False):
                     subpatternappend((LITERAL, _ord(this)))
                     source.seek(here)
                     continue
+
                 if lo:
                     min = int(lo)
                     if min >= MAXREPEAT:
@@ -610,12 +656,16 @@ def _parse(source, state, verbose, first=False):
                 item = subpattern[-1:]
             else:
                 item = None
-            if not item or (_len(item) == 1 and item[0][0] is AT):
+            if not item or item[0][0] is AT:
                 raise source.error("nothing to repeat",
                                    source.tell() - here + len(this))
             if item[0][0] in _REPEATCODES:
                 raise source.error("multiple repeat",
                                    source.tell() - here + len(this))
+            if item[0][0] is SUBPATTERN:
+                group, add_flags, del_flags, p = item[0][1]
+                if group is None and not add_flags and not del_flags:
+                    item = p
             if sourcematch("?"):
                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
             else:
@@ -628,7 +678,6 @@ def _parse(source, state, verbose, first=False):
             start = source.tell() - 1
             group = True
             name = None
-            condgroup = None
             add_flags = 0
             del_flags = 0
             if sourcematch("?"):
@@ -660,6 +709,7 @@ def _parse(source, state, verbose, first=False):
                         state.checklookbehindgroup(gid, source)
                         subpatternappend((GROUPREF, gid))
                         continue
+
                     else:
                         char = sourceget()
                         if char is None:
@@ -678,6 +728,7 @@ def _parse(source, state, verbose, first=False):
                         if sourceget() == ")":
                             break
                     continue
+
                 elif char in "=!<":
                     # lookahead assertions
                     dir = 1
@@ -704,10 +755,10 @@ def _parse(source, state, verbose, first=False):
                     else:
                         subpatternappend((ASSERT_NOT, (dir, p)))
                     continue
+
                 elif char == "(":
                     # conditional backreference group
                     condname = source.getuntil(")")
-                    group = None
                     if condname.isidentifier():
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
@@ -728,6 +779,19 @@ def _parse(source, state, verbose, first=False):
                             msg = "invalid group reference %d" % condgroup
                             raise source.error(msg, len(condname) + 1)
                     state.checklookbehindgroup(condgroup, source)
+                    item_yes = _parse(source, state, verbose)
+                    if source.match("|"):
+                        item_no = _parse(source, state, verbose)
+                        if source.next == "|":
+                            raise source.error("conditional backref with more than two branches")
+                    else:
+                        item_no = None
+                    if not source.match(")"):
+                        raise source.error("missing ), unterminated subpattern",
+                                           source.tell() - start)
+                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
+                    continue
+
                 elif char in FLAGS or char == "-":
                     # flags
                     flags = _parse_flags(source, state, char)
@@ -744,6 +808,7 @@ def _parse(source, state, verbose, first=False):
                         if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
                             raise Verbose
                         continue
+
                     add_flags, del_flags = flags
                     group = None
                 else:
@@ -756,12 +821,9 @@ def _parse(source, state, verbose, first=False):
                     group = state.opengroup(name)
                 except error as err:
                     raise source.error(err.msg, len(name) + 1) from None
-            if condgroup:
-                p = _parse_sub_cond(source, state, condgroup, verbose)
-            else:
-                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
-                               not (del_flags & SRE_FLAG_VERBOSE))
-                p = _parse_sub(source, state, sub_verbose)
+            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
+                           not (del_flags & SRE_FLAG_VERBOSE))
+            p = _parse_sub(source, state, sub_verbose)
             if not source.match(")"):
                 raise source.error("missing ), unterminated subpattern",
                                    source.tell() - start)
@@ -773,11 +835,19 @@ def _parse(source, state, verbose, first=False):
             subpatternappend((AT, AT_BEGINNING))
 
         elif this == "$":
-            subpattern.append((AT, AT_END))
+            subpatternappend((AT, AT_END))
 
         else:
             raise AssertionError("unsupported special character %r" % (char,))
 
+    # unpack non-capturing groups
+    for i in range(len(subpattern))[::-1]:
+        op, av = subpattern[i]
+        if op is SUBPATTERN:
+            group, add_flags, del_flags, p = av
+            if group is None and not add_flags and not del_flags:
+                subpattern[i: i+1] = p
+
     return subpattern
 
 def _parse_flags(source, state, char):
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 4d71eea517e3e4..5d36b54680d905 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1695,20 +1695,18 @@ def test_debug_flag(self):
         dump = '''\
 SUBPATTERN 1 0 0
   LITERAL 46
-SUBPATTERN None 0 0
-  BRANCH
-    IN
-      LITERAL 99
-      LITERAL 104
-  OR
-    LITERAL 112
-    LITERAL 121
-SUBPATTERN None 0 0
-  GROUPREF_EXISTS 1
-    AT AT_END
-  ELSE
-    LITERAL 58
-    LITERAL 32
+BRANCH
+  IN
+    LITERAL 99
+    LITERAL 104
+OR
+  LITERAL 112
+  LITERAL 121
+GROUPREF_EXISTS 1
+  AT AT_END
+ELSE
+  LITERAL 58
+  LITERAL 32
 '''
         self.assertEqual(out.getvalue(), dump)
         # Debug output is output again even a second time (bypassing
diff --git a/Misc/NEWS b/Misc/NEWS
index 5ee7ea65ff1df5..abb4c579462b05 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -320,6 +320,9 @@ Extension Modules
 Library
 -------
 
+- bpo-30340: Enhanced regular expressions optimization. This increased
+  the performance of matching some patterns up to 25 times.
+
 - bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
   Now allowed several subsequential inline modifiers at the start of the
   pattern (e.g. ``'(?i)(?s)...'``).  In verbose mode whitespaces and comments

From 1f6d33af3d0e3d98d8eab16d03320262cb025aa8 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Thu, 11 May 2017 19:21:45 +0300
Subject: [PATCH 2/4] Make _uniq() actually work.

---
 Lib/sre_parse.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index b441902c7ca0ca..3b8a331ffb0519 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -440,7 +440,6 @@ def _escape(source, escape, state):
     raise source.error("bad escape %s" % escape, len(escape))
 
 def _uniq(items):
-    return items
     if len(set(items)) == len(items):
         return items
     newitems = []

From 8a611100655a9a42c3eee27425a94f73c7bf8745 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 13 May 2017 18:18:08 +0300
Subject: [PATCH 3/4] Fix compiling to REPEAT_ONE.

---
 Lib/sre_compile.py | 12 +++++++++++-
 Lib/sre_parse.py   | 32 --------------------------------
 2 files changed, 11 insertions(+), 33 deletions(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 19c56698dcdd6d..aeb89bcc7b4739 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -20,6 +20,7 @@
 _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
 _SUCCESS_CODES = {SUCCESS, FAILURE}
 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
+_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
 
 # Sets of lowercase characters which have the same uppercase.
 _equivalences = (
@@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
         elif op in REPEATING_CODES:
             if flags & SRE_FLAG_TEMPLATE:
                 raise error("internal: unsupported template operator %r" % (op,))
-            if av[2].getwidth() == (1, 1) and not av[2].hasgroups():
+            if _simple(av[2]):
                 if op is MAX_REPEAT:
                     emit(REPEAT_ONE)
                 else:
@@ -404,6 +405,15 @@ def _bytes_to_codes(b):
     assert len(a) * a.itemsize == len(b)
     return a.tolist()
 
+def _simple(p):
+    # check if this subpattern is a "simple" operator
+    if len(p) != 1:
+        return False
+    op, av = p[0]
+    if op is SUBPATTERN:
+        return av[0] is None and _simple(av[-1])
+    return op in _UNIT_CODES
+
 def _generate_overlap_table(prefix):
     """
     Generate an overlap table for the following prefix.
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 3b8a331ffb0519..beceeafa4b98a2 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -222,38 +222,6 @@ def getwidth(self):
         self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
         return self.width
 
-    def hasgroups(self):
-        # determine whether this subpattern contains capturing groups
-        if self._hasgroups is not None:
-            return self._hasgroups
-        v = False
-        for op, av in self.data:
-            if op is BRANCH:
-                for av in av[1]:
-                    v = av.hasgroups()
-                    if v:
-                        break
-            elif op is CALL:
-                v = av.hasgroups()
-            elif op is SUBPATTERN:
-                v = av[0] is not None or av[-1].hasgroups()
-            elif op in _REPEATCODES:
-                v = av[2].hasgroups()
-            elif op in _ASSERT_CODES:
-                v = av[1].hasgroups()
-            elif op is GROUPREF_EXISTS:
-                v = av[1].hasgroups()
-                if not v and av[2] is not None:
-                    v = av[2].hasgroups()
-            elif op is SUCCESS:
-                break
-            else:
-                continue
-            if v:
-                break
-        self._hasgroups = v
-        return v
-
 class Tokenizer:
     def __init__(self, string):
         self.istext = isinstance(string, str)

From dcd7cccaa269ff32d4a9f5e30176cd538d2694a0 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 13 May 2017 18:22:54 +0300
Subject: [PATCH 4/4] Remove unused code.

---
 Lib/sre_parse.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index beceeafa4b98a2..f72408f010b1fa 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -27,7 +27,6 @@
 
 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
-_ASSERT_CODES = frozenset({ASSERT, ASSERT_NOT})
 
 ESCAPES = {
     r"\a": (LITERAL, ord("\a")),
@@ -115,7 +114,6 @@ def __init__(self, pattern, data=None):
             data = []
         self.data = data
         self.width = None
-        self._hasgroups = None
 
     def dump(self, level=0):
         nl = True