Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,15 @@ The special characters are:
Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid
Python identifiers, and in :class:`bytes` patterns they can only contain
bytes in the ASCII range. Each group name must be defined only once within
a regular expression. A symbolic group is also a numbered group, just as if
the group were not named.
bytes in the ASCII range. A symbolic group is also a numbered group, just as
if the group were not named.

A group name may be used for more than one group. All such groups share a
single group number, and the name (and that number) refers to whichever of
them matched; if more than one matched, it refers to the last one that
matched. This is chiefly useful for giving the same name to corresponding
groups in alternative spellings of a pattern, for example
``(?P<y>\d{4})-(?P<m>\d\d)|(?P<m>\d\d)/(?P<y>\d{4})``.

Named groups can be referenced in three contexts. If the pattern is
``(?P<quote>['"]).*?(?P=quote)`` (i.e. matching a string quoted with either
Expand All @@ -440,6 +446,10 @@ The special characters are:
In :class:`bytes` patterns, group *name* can only contain bytes
in the ASCII range (``b'\x00'``-``b'\x7f'``).

.. versionchanged:: next
A group name can be used for more than one group. Previously each name
could be defined only once in a regular expression.

.. index:: single: (?P=; in regular expressions

``(?P=name)``
Expand Down
11 changes: 11 additions & 0 deletions Doc/whatsnew/3.16.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,17 @@ os
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`149464`.)


re
--

* A capturing group name can now be used for more than one group in a regular
expression. Such groups share a single group number, and the name refers to
whichever of them matched (the last one, if more than one matched). This is
useful for giving the same name to corresponding groups in alternative
spellings of a pattern, such as
``(?P<y>\d{4})-(?P<m>\d\d)|(?P<m>\d\d)/(?P<y>\d{4})``.
(Contributed by Serhiy Storchaka in :gh:`152026`.)


shlex
-----

Expand Down
16 changes: 12 additions & 4 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,26 @@ def groups(self):
return len(self.groupwidths)
def opengroup(self, name=None):
gid = self.groups
self.groupwidths.append(None)
if self.groups > MAXGROUPS:
raise error("too many groups")
if name is not None:
ogid = self.groupdict.get(name, None)
if ogid is not None:
raise error("redefinition of group name %r as group %d; "
"was group %d" % (name, gid, ogid))
# The same name may be used for more than one group. All such
# groups share a single group number, and the name refers to
# whichever of them matched.
return ogid
self.groupdict[name] = gid
self.groupwidths.append(None)
return gid
def closegroup(self, gid, p):
self.groupwidths[gid] = p.getwidth()
# A reused group number may be closed more than once; its width spans
# the union of all the definitions.
w = p.getwidth()
wold = self.groupwidths[gid]
if wold is not None:
w = (min(wold[0], w[0]), max(wold[1], w[1]))
self.groupwidths[gid] = w
def checkgroup(self, gid):
return gid < self.groups and self.groupwidths[gid] is not None

Expand Down
66 changes: 63 additions & 3 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,6 @@ def test_symbolic_groups(self):
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

def test_symbolic_groups_errors(self):
self.checkPatternError(r'(?P<a>)(?P<a>)',
"redefinition of group name 'a' as group 2; "
"was group 1")
self.checkPatternError(r'(?P<a>(?P=a))',
"cannot refer to an open group", 10)
self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
Expand Down Expand Up @@ -380,6 +377,69 @@ def test_symbolic_refs_errors(self):
self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
"bad character in group name '१'", 3)

def test_redefined_named_groups(self):
# The same name may be used for more than one group.
p = re.compile(r'(?P<m>\d+)/(?P<d>\d+)/(?P<y>\d+)|'
r'(?P<y>\d+)-(?P<m>\d+)-(?P<d>\d+)')
self.assertEqual(p.fullmatch('07/09/2023').groupdict(),
{'m': '07', 'd': '09', 'y': '2023'})
self.assertEqual(p.fullmatch('2023-07-09').groupdict(),
{'m': '07', 'd': '09', 'y': '2023'})
# Reused groups share a single group number.
self.assertEqual(p.groups, 3)
self.assertEqual(p.groupindex, {'m': 1, 'd': 2, 'y': 3})
# If more than one of the groups matches, the name refers to the last.
p = re.compile(r'(?P<a>\w)(?P<a>\w)')
self.assertEqual(p.match('xy').group('a'), 'y')
# When an optional later definition does not participate in the match,
# the group keeps the value captured by the earlier definition.
p = re.compile(r'(?P<a>\d)(?:-(?P<a>\d))?')
self.assertEqual(p.match('1').group('a'), '1')
self.assertEqual(p.match('1-2').group('a'), '2')
# A definition that is backtracked away does not leak into the group.
p = re.compile(r'(?P<g>x)(?:(?P<g>y)|z)')
self.assertEqual(p.match('xz').group('g'), 'x')
self.assertEqual(p.match('xy').group('g'), 'y')

def test_redefined_named_groups_backref(self):
# A backreference to a redefined name refers to whichever definition
# participated in the match.
p = re.compile(r'(?:(?P<g>a)|(?P<g>b))(?P=g)')
self.assertEqual(p.match('aa').group(), 'aa')
self.assertEqual(p.match('bb').group(), 'bb')
self.assertIsNone(p.match('ab'))
# A numeric backreference to the shared group number works too.
p = re.compile(r'(?:(?P<g>a)|(?P<g>b))\1')
self.assertEqual(p.match('aa').group(), 'aa')
self.assertIsNone(p.match('ab'))

def test_redefined_named_groups_conditional(self):
p = re.compile(r'(?:(?P<g>a)|b)(?(g)X|Y)')
self.assertEqual(p.match('aX').group(), 'aX')
self.assertEqual(p.match('bY').group(), 'bY')
self.assertIsNone(p.match('aY'))
self.assertIsNone(p.match('bX'))

def test_redefined_named_groups_width(self):
# closegroup() widens the shared group to the union of the definitions.
p = re.compile(r'(?:(?P<g>a)|(?P<g>bb))!')
self.assertEqual(p.match('a!').group('g'), 'a')
self.assertEqual(p.match('bb!').group('g'), 'bb')
# A fixed-width union may be used in a look-behind...
p = re.compile(r'(?:(?P<g>aa)|(?P<g>bb))(?<=(?P=g))')
self.assertEqual(p.match('aa').group(), 'aa')
# ...but a variable-width union may not.
self.checkPatternError(r'x(?<=(?:(?P<g>a)|(?P<g>bb)))',
'look-behind requires fixed-width pattern')

def test_redefined_named_groups_backtracking(self):
# A definition that matches and is then backtracked away (here the
# other branch of the inner alternation is taken) must not leak a stale
# capture into the shared group number.
p = re.compile(r'(?:(?P<g>a)|(?P<h>(?P<g>b)(?:b|a)|(?:bbb)*))')
self.assertIsNone(p.match('b').group('g'))
self.assertEqual(p.match('a').group('g'), 'a')
self.assertEqual(p.match('bb').group('g'), 'b')

def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
A group name can now be used for more than one group in a regular expression.
All such groups share a single group number, and the name refers to whichever
of them matched (the last one, if more than one matched). This is chiefly
useful for giving the same name to corresponding groups in alternative
spellings of a pattern, e.g. ``(?P<y>\d{4})-(?P<m>\d\d)|(?P<m>\d\d)/(?P<y>\d{4})``.
50 changes: 36 additions & 14 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->charsize = charsize;
state->match_all = 0;
state->must_advance = 0;
state->save_marks = pattern->reused_groups;
state->debug = ((pattern->flags & SRE_FLAG_DEBUG) != 0);

state->beginning = ptr;
Expand Down Expand Up @@ -1647,6 +1648,7 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
self->pattern = NULL;
self->groupindex = NULL;
self->indexgroup = NULL;
self->reused_groups = 0;
#ifdef Py_DEBUG
self->fail_after_count = -1;
self->fail_after_exc = NULL;
Expand Down Expand Up @@ -1930,7 +1932,8 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)

/* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
static int
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups,
char *seen, int *reused)
{
/* Some variables are manipulated by the macros above */
SRE_CODE op;
Expand All @@ -1955,6 +1958,14 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
FAIL;
}
/* A mark index appears exactly once in well-formed code, unless
the same group number is opened in more than one place (a
redefined named group). Such groups need full mark save/restore
on backtracking. */
if (seen[arg])
*reused = 1;
else
seen[arg] = 1;
break;

case SRE_OP_LITERAL:
Expand Down Expand Up @@ -2081,7 +2092,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
if (skip == 0)
break;
/* Stop 2 before the end; we check the JUMP below */
if (_validate_inner(code, code+skip-3, groups))
if (_validate_inner(code, code+skip-3, groups, seen, reused))
FAIL;
code += skip-3;
/* Check that it ends with a JUMP, and that each JUMP
Expand Down Expand Up @@ -2112,7 +2123,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
FAIL;
if (max > SRE_MAXREPEAT)
FAIL;
if (_validate_inner(code, code+skip-4, groups))
if (_validate_inner(code, code+skip-4, groups, seen, reused))
FAIL;
code += skip-4;
GET_OP;
Expand All @@ -2132,7 +2143,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
FAIL;
if (max > SRE_MAXREPEAT)
FAIL;
if (_validate_inner(code, code+skip-3, groups))
if (_validate_inner(code, code+skip-3, groups, seen, reused))
FAIL;
code += skip-3;
GET_OP;
Expand All @@ -2150,7 +2161,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_ATOMIC_GROUP:
{
GET_SKIP;
if (_validate_inner(code, code+skip-2, groups))
if (_validate_inner(code, code+skip-2, groups, seen, reused))
FAIL;
code += skip-2;
GET_OP;
Expand Down Expand Up @@ -2203,12 +2214,12 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
for a JUMP opcode preceding our skip target.
*/
VTRACE(("then part:\n"));
int rc = _validate_inner(code+1, code+skip-1, groups);
int rc = _validate_inner(code+1, code+skip-1, groups, seen, reused);
if (rc == 1) {
VTRACE(("else part:\n"));
code += skip-2; /* Position after JUMP, at <skipno> */
GET_SKIP;
rc = _validate_inner(code, code+skip-1, groups);
rc = _validate_inner(code, code+skip-1, groups, seen, reused);
}
if (rc)
FAIL;
Expand All @@ -2221,7 +2232,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
GET_ARG; /* 0 for lookahead, width for lookbehind */
code--; /* Back up over arg to simplify math below */
/* Stop 1 before the end; we check the SUCCESS below */
if (_validate_inner(code+1, code+skip-2, groups))
if (_validate_inner(code+1, code+skip-2, groups, seen, reused))
FAIL;
code += skip-2;
GET_OP;
Expand All @@ -2246,24 +2257,35 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
}

static int
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups,
char *seen, int *reused)
{
if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
code >= end || end[-1] != SRE_OP_SUCCESS)
FAIL;
return _validate_inner(code, end-1, groups);
return _validate_inner(code, end-1, groups, seen, reused);
}

static int
_validate(PatternObject *self)
{
if (_validate_outer(self->code, self->code+self->codesize, self->groups))
{
/* seen[i] tracks whether mark index i has already been emitted, so that a
reused group number (the same mark seen twice) can be detected. */
int reused = 0;
char *seen = PyMem_Calloc(2 * (size_t)self->groups + 1, 1);
if (seen == NULL) {
PyErr_NoMemory();
return 0;
}
int invalid = _validate_outer(self->code, self->code+self->codesize,
self->groups, seen, &reused);
PyMem_Free(seen);
if (invalid) {
PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
return 0;
}
else
VTRACE(("Success!\n"));
self->reused_groups = reused;
VTRACE(("Success!\n"));
return 1;
}

Expand Down
7 changes: 7 additions & 0 deletions Modules/_sre/sre.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ typedef struct {
int flags; /* flags used when compiling pattern source */
PyObject *weakreflist; /* List of weak references */
int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
int reused_groups; /* a group number is opened in more than one place
(a redefined named group) */
#ifdef Py_DEBUG
/* for simulation of user interruption */
int fail_after_count;
Expand Down Expand Up @@ -97,6 +99,11 @@ typedef struct {
int lastmark;
int lastindex;
const void** mark;
int save_marks; /* if nonzero, save and restore mark values on
backtracking instead of only rewinding the lastmark
index; needed when a group inside the current region can
be revisited (reused group numbers, or the body of a
possessive repeat) */
/* dynamically allocated stuff */
char* data_stack;
size_t data_stack_size;
Expand Down
Loading
Loading