Skip to content

Commit 64dd760

Browse files
fix: Ignore combining characters in SRE
Closes: #7518
1 parent cd8b11d commit 64dd760

File tree

4 files changed

+6
-2
lines changed

4 files changed

+6
-2
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/sre_engine/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
1919
num_enum = { workspace = true }
2020
bitflags = { workspace = true }
2121
optional = { workspace = true }
22+
icu_properties = { workspace = true }
2223

2324
[dev-dependencies]
2425
criterion = { workspace = true }

crates/sre_engine/src/string.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
use icu_properties::{CodePointMapData, props::CanonicalCombiningClass};
12
use rustpython_wtf8::Wtf8;
23

34
#[derive(Debug, Clone, Copy)]
@@ -442,8 +443,9 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
442443
#[inline]
443444
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
444445
// TODO: check with cpython
446+
let map = CodePointMapData::<CanonicalCombiningClass>::new();
445447
char::try_from(ch)
446-
.map(|x| x.is_alphanumeric())
448+
.map(|x| x.is_alphanumeric() && map.get(x) == CanonicalCombiningClass::NotReordered)
447449
.unwrap_or(false)
448450
}
449451
#[inline]

extra_tests/snippets/stdlib_re.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,4 +81,4 @@
8181
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"
8282

8383
# Combining characters; issue #7518
84-
# assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"
84+
assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"

0 commit comments

Comments
 (0)