Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Doc/whatsnew/3.16.rst
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,15 @@ zipfile
Optimizations
=============

re
--

* Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and ``\W``)
outside a character set are now compiled to a single ``CATEGORY`` opcode
instead of being wrapped in an ``IN`` block. This speeds up matching of
patterns such as ``\d+`` and reduces the size of the compiled byte code.
(Contributed by Serhiy Storchaka in :gh:`152033`.)

module_name
-----------

Expand Down
4 changes: 3 additions & 1 deletion Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN, CATEGORY}

_REPEATING_CODES = {
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
Expand Down Expand Up @@ -495,6 +495,8 @@ def _get_charset_prefix(pattern, flags):
if iscased and iscased(av):
return None
return [(op, av)]
elif op is CATEGORY:
return [(op, av)]
elif op is BRANCH:
charset = []
charsetappend = charset.append
Expand Down
21 changes: 9 additions & 12 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
_SETITEMCODES = frozenset({LITERAL, CATEGORY})

ESCAPES = {
r"\a": (LITERAL, ord("\a")),
Expand All @@ -43,12 +44,12 @@
r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\d": (CATEGORY, CATEGORY_DIGIT),
r"\D": (CATEGORY, CATEGORY_NOT_DIGIT),
r"\s": (CATEGORY, CATEGORY_SPACE),
r"\S": (CATEGORY, CATEGORY_NOT_SPACE),
r"\w": (CATEGORY, CATEGORY_WORD),
r"\W": (CATEGORY, CATEGORY_NOT_WORD),
r"\z": (AT, AT_END_STRING), # end of string
r"\Z": (AT, AT_END_STRING), # end of string (obsolete)
}
Expand Down Expand Up @@ -315,7 +316,7 @@ def _class_escape(source, escape):
if code:
return code
code = CATEGORIES.get(escape)
if code and code[0] is IN:
if code and code[0] is CATEGORY:
return code
try:
c = escape[1:2]
Expand Down Expand Up @@ -493,7 +494,7 @@ def _parse_sub(source, state, verbose, nested):
if len(item) != 1:
break
op, av = item[0]
if op is LITERAL:
if op in _SETITEMCODES:
set.append((op, av))
elif op is IN and av[0][0] is not NEGATE:
set.extend(av)
Expand Down Expand Up @@ -590,8 +591,6 @@ def _parse(source, state, verbose, nested, first=False):
raise source.error("unterminated character set",
source.tell() - here)
if that == "]":
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
setappend((LITERAL, _ord("-")))
break
Expand All @@ -616,8 +615,6 @@ def _parse(source, state, verbose, nested, first=False):
raise source.error(msg, len(this) + 1 + len(that))
setappend((RANGE, (lo, hi)))
else:
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)

set = _uniq(set)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Optimize matching of character class escapes (``\d``, ``\D``, ``\s``,
``\S``, ``\w`` and ``\W``) that occur outside a character set: they are now
compiled to a single ``CATEGORY`` opcode instead of being wrapped in an
``IN`` block. This speeds up patterns such as ``\d+`` and reduces the size
of the compiled byte code.
57 changes: 36 additions & 21 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -1842,6 +1842,34 @@ _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
} while (0)
#define GET_SKIP GET_SKIP_ADJ(0)

static int
_validate_category(SRE_CODE arg)
{
switch (arg) {
case SRE_CATEGORY_DIGIT:
case SRE_CATEGORY_NOT_DIGIT:
case SRE_CATEGORY_SPACE:
case SRE_CATEGORY_NOT_SPACE:
case SRE_CATEGORY_WORD:
case SRE_CATEGORY_NOT_WORD:
case SRE_CATEGORY_LINEBREAK:
case SRE_CATEGORY_NOT_LINEBREAK:
case SRE_CATEGORY_LOC_WORD:
case SRE_CATEGORY_LOC_NOT_WORD:
case SRE_CATEGORY_UNI_DIGIT:
case SRE_CATEGORY_UNI_NOT_DIGIT:
case SRE_CATEGORY_UNI_SPACE:
case SRE_CATEGORY_UNI_NOT_SPACE:
case SRE_CATEGORY_UNI_WORD:
case SRE_CATEGORY_UNI_NOT_WORD:
case SRE_CATEGORY_UNI_LINEBREAK:
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
return 1;
default:
return 0;
}
}

static int
_validate_charset(SRE_CODE *code, SRE_CODE *end)
{
Expand Down Expand Up @@ -1894,27 +1922,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)

case SRE_OP_CATEGORY:
GET_ARG;
switch (arg) {
case SRE_CATEGORY_DIGIT:
case SRE_CATEGORY_NOT_DIGIT:
case SRE_CATEGORY_SPACE:
case SRE_CATEGORY_NOT_SPACE:
case SRE_CATEGORY_WORD:
case SRE_CATEGORY_NOT_WORD:
case SRE_CATEGORY_LINEBREAK:
case SRE_CATEGORY_NOT_LINEBREAK:
case SRE_CATEGORY_LOC_WORD:
case SRE_CATEGORY_LOC_NOT_WORD:
case SRE_CATEGORY_UNI_DIGIT:
case SRE_CATEGORY_UNI_NOT_DIGIT:
case SRE_CATEGORY_UNI_SPACE:
case SRE_CATEGORY_UNI_NOT_SPACE:
case SRE_CATEGORY_UNI_WORD:
case SRE_CATEGORY_UNI_NOT_WORD:
case SRE_CATEGORY_UNI_LINEBREAK:
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
break;
default:
if (!_validate_category(arg)) {
FAIL;
}
break;
Expand Down Expand Up @@ -1995,6 +2003,13 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
}
break;

case SRE_OP_CATEGORY:
GET_ARG;
if (!_validate_category(arg)) {
FAIL;
}
break;

case SRE_OP_ANY:
case SRE_OP_ANY_ALL:
/* These have no operands */
Expand Down
8 changes: 8 additions & 0 deletions Modules/_sre/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ LOCAL(Py_ssize_t)
SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
{
SRE_CODE chr;
SRE_CODE arg;
SRE_CHAR c;
const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr;
const SRE_CHAR* end = (const SRE_CHAR *)state->end;
Expand Down Expand Up @@ -302,6 +303,13 @@ SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;

case SRE_OP_CATEGORY:
arg = pattern[1];
TRACE(("|%p|%p|COUNT CATEGORY %d\n", pattern, ptr, arg));
while (ptr < end && sre_category(arg, *ptr))
ptr++;
break;

default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Expand Down
Loading