From ba3cc47f83a49dedb8e06fcb4be25069cdf75917 Mon Sep 17 00:00:00 2001 From: Dodothereal <129273127+Dodothereal@users.noreply.github.com> Date: Mon, 22 Jun 2026 10:31:16 +0200 Subject: [PATCH] docs: document backtracking clears capture in conditional regex (gh-151819) Closes python/cpython#151819 The (?(id/name)yes-pattern|no-pattern) documentation claims an example pattern (<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$) will not match '<user@host.com', but re.search() backtracks and matches 'user@host.com' from it. The same backtracking behaviour occurs in simpler cases such as (<)?\w+(?(1)>) matching only '3' from '<3'. This change documents the backtracking semantics explicitly and corrects the embedded example. Adds a regression test that locks in the visible behavior. --- Doc/library/re.rst | 14 +++++++-- Lib/test/test_re.py | 31 +++++++++++++++++++ ...-06-22-00-00-00.gh-issue-151819.nULCij.rst | 4 +++ 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Documentation/2026-06-22-00-00-00.gh-issue-151819.nULCij.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 4745c1b98a4554..56d952d8840041 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -508,9 +508,17 @@ The special characters are: Will try to match with ``yes-pattern`` if the group with given *id* or *name* exists, and with ``no-pattern`` if it doesn't. ``no-pattern`` is optional and can be omitted. For example, - ``(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)`` is a poor email matching pattern, which - will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but - not with ``'<user@host.com'`` nor ``'user@host.com>'``. + ``(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)`` is a poor email matching pattern, + which will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, + and will not match with ``'user@host.com>'``. + + Note that when ``yes-pattern`` is not matched while the captured group + was set, backtracking clears the capture (the optional group falls + back to its no-match state). 
For example, + ``(<)?\w+(?(1)>)`` applied to ``'<3'`` matches only ``'3'`` at + position 1 with ``group(1) is None``: the engine first consumes the + leading ``<`` to satisfy group 1, fails to match ``>`` at position + 2, then retries without consuming ``<``. .. versionchanged:: 3.12 Group *id* can only contain ASCII digits. diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 69d730c49387be..553db8efb43d27 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -706,6 +706,37 @@ def test_re_groupref_exists_errors(self): self.checkPatternError(r'()(?(2)a)', "invalid group reference 2", 5) + def test_re_conditional_drops_capture_on_backtrack(self): + # Issue: a captured optional group is cleared when backtracking + # causes the ``yes-pattern`` of a (?(id/name)yes|no) construct + # to not match after the capture was set. See: + # https://github.com/python/cpython/issues/151819 + # Minimal reproduction from the issue: + m = re.search(r'(<)?\w+(?(1)>)', '<3') + self.assertEqual(m.group(), '3') + self.assertEqual(m.span(), (1, 2)) + self.assertEqual(m.group(1), None) + + # The successful case keeps the capture intact: + m = re.search(r'(<)?\w+(?(1)>)', '') + self.assertEqual(m.group(), '') + self.assertEqual(m.span(), (0, 6)) + self.assertEqual(m.group(1), '<') + + # Same effect with ``\w`` style groups and a longer input: + m = re.search(r'(<)?[A-Za-z]+(?(1)>)', '|$)', '