From d3af1c54ec029d29cf8708f2bd5f64478a28f99e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 07:11:29 +0000 Subject: [PATCH 1/7] Initial plan From 67485b5b7781692c0deb8d4ca6a72a6b0d3befa7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 07:39:45 +0000 Subject: [PATCH 2/7] Extract shared unicode crate Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- Cargo.lock | 26 ++- Cargo.toml | 1 + crates/literal/Cargo.toml | 2 +- crates/literal/src/char.rs | 16 +- crates/sre_engine/Cargo.toml | 1 + crates/sre_engine/src/string.rs | 93 ++------ crates/stdlib/Cargo.toml | 8 +- crates/stdlib/src/unicodedata.rs | 210 +++++------------- crates/unicode/Cargo.toml | 29 +++ crates/unicode/src/case.rs | 111 +++++++++ crates/unicode/src/classify.rs | 65 ++++++ crates/unicode/src/data.rs | 132 +++++++++++ crates/unicode/src/identifier.rs | 27 +++ crates/unicode/src/lib.rs | 77 +++++++ crates/unicode/src/normalize.rs | 40 ++++ crates/unicode/src/regex.rs | 85 +++++++ crates/vm/Cargo.toml | 2 +- crates/vm/src/builtins/str.rs | 52 +---- extra_tests/snippets/stdlib_unicode_shared.py | 20 ++ 19 files changed, 687 insertions(+), 310 deletions(-) create mode 100644 crates/unicode/Cargo.toml create mode 100644 crates/unicode/src/case.rs create mode 100644 crates/unicode/src/classify.rs create mode 100644 crates/unicode/src/data.rs create mode 100644 crates/unicode/src/identifier.rs create mode 100644 crates/unicode/src/lib.rs create mode 100644 crates/unicode/src/normalize.rs create mode 100644 crates/unicode/src/regex.rs create mode 100644 extra_tests/snippets/stdlib_unicode_shared.py diff --git a/Cargo.lock b/Cargo.lock index 5fd981fd135..553d18c7be3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3242,11 +3242,11 @@ name = "rustpython-literal" version = "0.5.0" dependencies = [ "hexf-parse", - "icu_properties", "is-macro", "lexical-parse-float", "num-traits", "rand 0.9.2", + "rustpython-unicode", "rustpython-wtf8", ] @@ -3338,6 +3338,7 @@ dependencies = [ "criterion", "num_enum", "optional", + "rustpython-unicode", "rustpython-wtf8", ] @@ -3368,8 +3369,6 @@ dependencies = [ "gethostname", "hex", "hmac", - "icu_normalizer", - "icu_properties", "indexmap", "itertools 0.14.0", "libc", @@ -3411,6 +3410,7 @@ dependencies = [ "rustpython-ruff_python_parser", "rustpython-ruff_source_file", "rustpython-ruff_text_size", + "rustpython-unicode", "rustpython-vm", "schannel", "sha-1", @@ -3421,9 +3421,6 @@ dependencies = [ "tcl-sys", "termios", "tk-sys", - "ucd", - "unic-ucd-age", - "unicode_names2 2.0.0", "uuid", "webpki-roots", "widestring", @@ -3433,6 +3430,21 @@ dependencies = [ "xml", ] +[[package]] +name = "rustpython-unicode" +version = "0.5.0" +dependencies = [ + "caseless", + "icu_normalizer", + "icu_properties", + "itertools 0.14.0", + "rustpython-wtf8", + "ucd", + "unic-ucd-age", + "unicode-casing", + "unicode_names2 2.0.0", +] + [[package]] name = "rustpython-venvlauncher" version = "0.5.0" @@ -3458,7 +3470,6 @@ dependencies = [ "glob", "half", "hex", - "icu_properties", "indexmap", "is-macro", "itertools 0.14.0", @@ -3492,6 +3503,7 @@ dependencies = [ "rustpython-ruff_python_parser", "rustpython-ruff_text_size", "rustpython-sre_engine", + "rustpython-unicode", "rustyline", "scoped-tls", "scopeguard", diff --git a/Cargo.toml b/Cargo.toml index 7bd8b8f3374..87ad486f079 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -153,6 +153,7 @@ rustpython-vm = { path = "crates/vm", default-features = false, version = "0.5.0 rustpython-pylib = { path = "crates/pylib", version = "0.5.0" } rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" } rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" } +rustpython-unicode = { path = "crates/unicode", default-features = false, version = "0.5.0" } rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" } rustpython-doc = { path = "crates/doc", version = "0.5.0" } diff --git a/crates/literal/Cargo.toml b/crates/literal/Cargo.toml index 3f0bec33c30..929b3393807 100644 --- a/crates/literal/Cargo.toml +++ b/crates/literal/Cargo.toml @@ -9,13 +9,13 @@ license = { workspace = true } rust-version = { workspace = true } [dependencies] +rustpython-unicode = { workspace = true, default-features = false } rustpython-wtf8 = { workspace = true } hexf-parse = "0.2.1" is-macro.workspace = true lexical-parse-float = { version = "1.0.6", features = ["format"] } num-traits = { workspace = true } -icu_properties = { workspace = true } [dev-dependencies] rand = { workspace = true } diff --git a/crates/literal/src/char.rs b/crates/literal/src/char.rs index 5b446cc1a19..4544133d3b6 100644 --- a/crates/literal/src/char.rs +++ b/crates/literal/src/char.rs @@ -1,5 +1,3 @@ -use icu_properties::props::{EnumeratedProperty, GeneralCategory}; - /// According to python following categories aren't printable: /// * Cc (Other, Control) /// * Cf (Other, Format) @@ -10,17 +8,5 @@ use icu_properties::props::{EnumeratedProperty, GeneralCategory}; /// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) /// * Zs (Separator, Space) other than ASCII space('\x20'). pub fn is_printable(c: char) -> bool { - let cat = GeneralCategory::for_char(c); - - !matches!( - cat, - GeneralCategory::SpaceSeparator - | GeneralCategory::LineSeparator - | GeneralCategory::ParagraphSeparator - | GeneralCategory::Control - | GeneralCategory::Format - | GeneralCategory::Surrogate - | GeneralCategory::PrivateUse - | GeneralCategory::Unassigned - ) + rustpython_unicode::classify::is_repr_printable(c as u32) } diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml index 4f899e6b3e9..d95e00693ae 100644 --- a/crates/sre_engine/Cargo.toml +++ b/crates/sre_engine/Cargo.toml @@ -15,6 +15,7 @@ name = "benches" harness = false [dependencies] +rustpython-unicode = { workspace = true, default-features = false } rustpython-wtf8 = { workspace = true } num_enum = { workspace = true } bitflags = { workspace = true } diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs index 489819bfb3e..bb2974bca5b 100644 --- a/crates/sre_engine/src/string.rs +++ b/crates/sre_engine/src/string.rs @@ -332,35 +332,22 @@ const fn utf8_is_cont_byte(byte: u8) -> bool { /// Mask of the value bits of a continuation byte. const CONT_MASK: u8 = 0b0011_1111; -const fn is_py_ascii_whitespace(b: u8) -> bool { - matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') -} - #[inline] pub(crate) fn is_word(ch: u32) -> bool { - ch == '_' as u32 - || u8::try_from(ch) - .map(|x| x.is_ascii_alphanumeric()) - .unwrap_or(false) + rustpython_unicode::regex::is_word(ch) } #[inline] pub(crate) fn is_space(ch: u32) -> bool { - u8::try_from(ch) - .map(is_py_ascii_whitespace) - .unwrap_or(false) + rustpython_unicode::regex::is_space(ch) } #[inline] pub(crate) fn is_digit(ch: u32) -> bool { - u8::try_from(ch) - .map(|x| x.is_ascii_digit()) - .unwrap_or(false) + rustpython_unicode::regex::is_digit(ch) } #[inline] pub(crate) fn is_loc_alnum(ch: u32) -> bool { // FIXME: Ignore the locales - u8::try_from(ch) - .map(|x| x.is_ascii_alphanumeric()) - .unwrap_or(false) + rustpython_unicode::regex::is_locale_alnum(ch) } #[inline] pub(crate) fn is_loc_word(ch: u32) -> bool { @@ -368,83 +355,37 @@ pub(crate) fn is_loc_word(ch: u32) -> bool { } #[inline] pub(crate) const fn is_linebreak(ch: u32) -> bool { - ch == '\n' as u32 + rustpython_unicode::regex::is_linebreak(ch) } #[inline] pub fn lower_ascii(ch: u32) -> u32 { - u8::try_from(ch) - .map(|x| x.to_ascii_lowercase() as u32) - .unwrap_or(ch) + rustpython_unicode::regex::lower_ascii(ch) } #[inline] pub(crate) fn lower_locate(ch: u32) -> u32 { // FIXME: Ignore the locales - lower_ascii(ch) + rustpython_unicode::regex::lower_locale(ch) } #[inline] pub(crate) fn upper_locate(ch: u32) -> u32 { // FIXME: Ignore the locales - u8::try_from(ch) - .map(|x| x.to_ascii_uppercase() as u32) - .unwrap_or(ch) + rustpython_unicode::regex::upper_locale(ch) } #[inline] pub(crate) fn is_uni_digit(ch: u32) -> bool { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.is_ascii_digit()) - .unwrap_or(false) + rustpython_unicode::regex::is_unicode_digit(ch) } #[inline] pub(crate) fn is_uni_space(ch: u32) -> bool { - // TODO: check with cpython - is_space(ch) - || matches!( - ch, - 0x0009 - | 0x000A - | 0x000B - | 0x000C - | 0x000D - | 0x001C - | 0x001D - | 0x001E - | 0x001F - | 0x0020 - | 0x0085 - | 0x00A0 - | 0x1680 - | 0x2000 - | 0x2001 - | 0x2002 - | 0x2003 - | 0x2004 - | 0x2005 - | 0x2006 - | 0x2007 - | 0x2008 - | 0x2009 - | 0x200A - | 0x2028 - | 0x2029 - | 0x202F - | 0x205F - | 0x3000 - ) + rustpython_unicode::regex::is_unicode_space(ch) } #[inline] pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { - matches!( - ch, - 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 - ) + rustpython_unicode::regex::is_unicode_linebreak(ch) } #[inline] pub(crate) fn is_uni_alnum(ch: u32) -> bool { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.is_alphanumeric()) - .unwrap_or(false) + rustpython_unicode::regex::is_unicode_alnum(ch) } #[inline] pub(crate) fn is_uni_word(ch: u32) -> bool { @@ -452,15 +393,9 @@ pub(crate) fn is_uni_word(ch: u32) -> bool { } #[inline] pub fn lower_unicode(ch: u32) -> u32 { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.to_lowercase().next().unwrap() as u32) - .unwrap_or(ch) + rustpython_unicode::regex::lower_unicode(ch) } #[inline] pub fn upper_unicode(ch: u32) -> u32 { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.to_uppercase().next().unwrap() as u32) - .unwrap_or(ch) + rustpython_unicode::regex::upper_unicode(ch) } diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index f828507d6cf..6945132243f 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -28,6 +28,7 @@ flame-it = ["flame"] [dependencies] # rustpython crates rustpython-derive = { workspace = true } +rustpython-unicode = { workspace = true, features = ["std", "casefold"] } rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]} rustpython-common = { workspace = true } @@ -76,13 +77,6 @@ pbkdf2 = { version = "0.12", features = ["hmac"] } constant_time_eq = { workspace = true } ## unicode stuff -unicode_names2 = { workspace = true } -# update version all at the same time -icu_properties = { workspace = true } -icu_normalizer = { workspace = true } -unic-ucd-age = { workspace = true } -ucd = "0.1.1" - # compression adler32 = "1.2.0" crc32fast = "1.3.2" diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index 6ee5b0c2ee8..622a7854157 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -17,6 +17,17 @@ enum NormalizeForm { Nfkd, } +impl From for rustpython_unicode::NormalizeForm { + fn from(value: NormalizeForm) -> Self { + match value { + NormalizeForm::Nfc => Self::Nfc, + NormalizeForm::Nfkc => Self::Nfkc, + NormalizeForm::Nfd => Self::Nfd, + NormalizeForm::Nfkd => Self::Nfkd, + } + } +} + impl<'a> TryFromBorrowedObject<'a> for NormalizeForm { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { obj.try_value_with( @@ -34,25 +45,15 @@ impl<'a> TryFromBorrowedObject<'a> for NormalizeForm { #[pymodule] mod unicodedata { - use super::NormalizeForm::*; use crate::vm::{ Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine, builtins::{PyModule, PyStrRef}, function::OptionalArg, }; - use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; - use icu_properties::{ - CodePointSetData, - props::{ - BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty, - GeneralCategory, NamedEnumeratedProperty, - }, - }; use itertools::Itertools; use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; - use ucd::{Codepoint, DecompositionType, Number, NumericType}; - use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; + use rustpython_unicode::{UNICODE_VERSION, UnicodeVersion, data, normalize}; pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py) -> PyResult<()> { __module_exec(vm, module); @@ -94,8 +95,7 @@ mod unicodedata { } fn check_age(&self, c: CodePoint) -> bool { - c.to_char() - .is_none_or(|c| Age::of(c).is_some_and(|age| age.actual() <= self.unic_version)) + data::is_assigned_in_version(c.to_u32(), self.unic_version) } fn extract_char( @@ -119,21 +119,19 @@ mod unicodedata { fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self .extract_char(character, vm)? - .map_or(GeneralCategory::Unassigned, |c| { - c.to_char() - .map_or(GeneralCategory::Surrogate, GeneralCategory::for_char) - }) - .short_name() + .map_or("Cn", |c| data::category(c.to_u32())) .to_owned()) } #[pymethod] fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult { if let Some(name_str) = name.to_str() - && let Some(character) = unicode_names2::character(name_str) - && self.check_age(character.into()) + && let Some(character) = data::lookup(name_str) + && self.check_age(CodePoint::from_u32(character).expect("valid Unicode code point")) { - return Ok(character.to_string()); + return Ok(char::from_u32(character) + .expect("unicode_names2 only returns Unicode scalar values") + .to_string()); } Err(vm.new_key_error( vm.ctx @@ -153,9 +151,9 @@ mod unicodedata { if let Some(c) = c && self.check_age(c) - && let Some(name) = c.to_char().and_then(unicode_names2::name) + && let Some(name) = data::name(c.to_u32()) { - return Ok(vm.ctx.new_str(name.to_string()).into()); + return Ok(vm.ctx.new_str(name).into()); } default.ok_or_else(|| vm.new_value_error("no such name")) } @@ -166,14 +164,9 @@ mod unicodedata { character: PyStrRef, vm: &VirtualMachine, ) -> PyResult<&'static str> { - let bidi = match self.extract_char(character, vm)? { - Some(c) => c - .to_char() - .map_or(BidiClass::LeftToRight, BidiClass::for_char) - .short_name(), - None => "", - }; - Ok(bidi) + Ok(self + .extract_char(character, vm)? + .map_or("", |c| data::bidirectional(c.to_u32()))) } /// NOTE: This function uses 9.0.0 database instead of 3.2.0 @@ -185,110 +178,38 @@ mod unicodedata { ) -> PyResult<&'static str> { Ok(self .extract_char(character, vm)? - .and_then(|c| c.to_char()) - .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char) - .short_name()) + .map_or("N", |c| data::east_asian_width(c.to_u32()))) } #[pymethod] fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { - let text = unistr.as_wtf8(); - let normalized_text = match form { - Nfc => { - let normalizer = ComposingNormalizerBorrowed::new_nfc(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - Nfkc => { - let normalizer = ComposingNormalizerBorrowed::new_nfkc(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - Nfd => { - let normalizer = DecomposingNormalizerBorrowed::new_nfd(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - Nfkd => { - let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - }; - Ok(normalized_text) + Ok(normalize::normalize(form.into(), unistr.as_wtf8())) } #[pymethod] fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { - let text = unistr.as_wtf8(); - let normalized: Wtf8Buf = match form { - Nfc => { - let normalizer = ComposingNormalizerBorrowed::new_nfc(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - Nfkc => { - let normalizer = ComposingNormalizerBorrowed::new_nfkc(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - Nfd => { - let normalizer = DecomposingNormalizerBorrowed::new_nfd(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - Nfkd => { - let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); - text.map_utf8(|s| normalizer.normalize_iter(s.chars())) - .collect() - } - }; - Ok(text == &*normalized) + Ok(normalize::is_normalized(form.into(), unistr.as_wtf8())) } #[pymethod] fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { - match self.extract_char(character, vm)? { - Some(c) => { - if let Some(ch) = c.to_char() { - // Check if the character is mirrored in bidirectional text using Unicode standard - let bidi_mirrored = CodePointSetData::new::(); - Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 }) - } else { - Ok(0) - } - } - None => Ok(0), - } + Ok(self + .extract_char(character, vm)? + .is_some_and(|c| data::mirrored(c.to_u32())) as i32) } #[pymethod] fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self .extract_char(character, vm)? - .and_then(|c| c.to_char()) - .map_or(0, |ch| { - CanonicalCombiningClass::for_char(ch).to_icu4c_value() - })) + .map_or(0, |c| data::combining(c.to_u32()))) } #[pymethod] fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { - let ch = match self.extract_char(character, vm)?.and_then(|c| c.to_char()) { - Some(ch) => ch, - None => return Ok(String::new()), - }; - let chars: Vec = ch.decomposition_map().collect(); - // If decomposition maps to just the character itself, there's no decomposition - if chars.len() == 1 && chars[0] == ch { - return Ok(String::new()); - } - let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" "); - let tag = match ch.decomposition_type() { - Some(DecompositionType::Canonical) | None => return Ok(hex_parts), - Some(dt) => decomposition_type_tag(dt), - }; - Ok(format!("<{tag}> {hex_parts}")) + Ok(self + .extract_char(character, vm)? + .map_or_else(String::new, |c| data::decomposition(c.to_u32()))) } #[pymethod] @@ -298,15 +219,11 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); - if let Some(ch) = ch - && matches!( - ch.numeric_type(), - Some(NumericType::Decimal) | Some(NumericType::Digit) - ) - && let Some(Number::Integer(n)) = ch.numeric_value() + if let Some(value) = self + .extract_char(character, vm)? + .and_then(|c| data::digit(c.to_u32())) { - return Ok(vm.ctx.new_int(n).into()); + return Ok(vm.ctx.new_int(value).into()); } default.ok_or_else(|| vm.new_value_error("not a digit")) } @@ -318,12 +235,11 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); - if let Some(ch) = ch - && ch.numeric_type() == Some(NumericType::Decimal) - && let Some(Number::Integer(n)) = ch.numeric_value() + if let Some(value) = self + .extract_char(character, vm)? + .and_then(|c| data::decimal(c.to_u32())) { - return Ok(vm.ctx.new_int(n).into()); + return Ok(vm.ctx.new_int(value).into()); } default.ok_or_else(|| vm.new_value_error("not a decimal")) } @@ -335,17 +251,15 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); - if let Some(ch) = ch { - match ch.numeric_value() { - Some(Number::Integer(n)) => { - return Ok(vm.ctx.new_float(n as f64).into()); - } - Some(Number::Rational(num, den)) => { - return Ok(vm.ctx.new_float(num as f64 / den as f64).into()); - } - None => {} - } + if let Some(value) = self + .extract_char(character, vm)? + .and_then(|c| data::numeric(c.to_u32())) + { + let value = match value { + data::NumericValue::Integer(n) => n as f64, + data::NumericValue::Rational(num, den) => num as f64 / den as f64, + }; + return Ok(vm.ctx.new_float(value).into()); } default.ok_or_else(|| vm.new_value_error("not a numeric character")) } @@ -356,28 +270,6 @@ mod unicodedata { } } - fn decomposition_type_tag(dt: DecompositionType) -> &'static str { - match dt { - DecompositionType::Canonical => "canonical", - DecompositionType::Compat => "compat", - DecompositionType::Circle => "circle", - DecompositionType::Final => "final", - DecompositionType::Font => "font", - DecompositionType::Fraction => "fraction", - DecompositionType::Initial => "initial", - DecompositionType::Isolated => "isolated", - DecompositionType::Medial => "medial", - DecompositionType::Narrow => "narrow", - DecompositionType::Nobreak => "noBreak", - DecompositionType::Small => "small", - DecompositionType::Square => "square", - DecompositionType::Sub => "sub", - DecompositionType::Super => "super", - DecompositionType::Vertical => "vertical", - DecompositionType::Wide => "wide", - } - } - #[pyattr] fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef { Ucd { diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml new file mode 100644 index 00000000000..51fc781e790 --- /dev/null +++ b/crates/unicode/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "rustpython-unicode" +description = "Shared Unicode semantics and data for RustPython and related Python tooling." +version.workspace = true +authors.workspace = true +edition.workspace = true +rust-version.workspace = true +repository.workspace = true +license.workspace = true + +[features] +default = ["std", "casefold"] +std = [] +casefold = ["std", "dep:caseless"] + +[dependencies] +rustpython-wtf8 = { workspace = true } + +icu_normalizer = { workspace = true } +icu_properties = { workspace = true } +itertools = { workspace = true } +unicode-casing = { workspace = true } +unicode_names2 = { version = "2.0.0", default-features = false, features = ["no_std"] } +unic-ucd-age = { workspace = true } +ucd = "0.1.1" +caseless = { version = "0.2.2", optional = true } + +[lints] +workspace = true diff --git a/crates/unicode/src/case.rs b/crates/unicode/src/case.rs new file mode 100644 index 00000000000..dbd71929f2d --- /dev/null +++ b/crates/unicode/src/case.rs @@ -0,0 +1,111 @@ +#[cfg(feature = "casefold")] +use alloc::string::String; + +#[cfg(feature = "casefold")] +use rustpython_wtf8::Wtf8Chunk; +use rustpython_wtf8::{Wtf8, Wtf8Buf}; +use unicode_casing::CharExt; + +use crate::char_from_codepoint; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CaseMapping { + len: u8, + codepoints: [u32; 3], +} + +impl CaseMapping { + pub const fn identity(cp: u32) -> Self { + Self { + len: 1, + codepoints: [cp, 0, 0], + } + } + + pub const fn first(self) -> Option { + if self.len == 0 { + None + } else { + Some(self.codepoints[0]) + } + } + + pub fn iter(self) -> impl Iterator { + self.codepoints.into_iter().take(usize::from(self.len)) + } +} + +fn mapping_from_chars(chars: impl Iterator) -> CaseMapping { + let mut codepoints = [0; 3]; + let mut len = 0; + for ch in chars.take(codepoints.len()) { + codepoints[len] = ch as u32; + len += 1; + } + CaseMapping { + len: len as u8, + codepoints, + } +} + +#[cfg(feature = "casefold")] +fn mapping_from_string(text: String) -> CaseMapping { + mapping_from_chars(text.chars()) +} + +pub fn to_lowercase(cp: u32) -> CaseMapping { + char_from_codepoint(cp).map_or_else( + || CaseMapping::identity(cp), + |ch| mapping_from_chars(ch.to_lowercase()), + ) +} + +pub fn to_uppercase(cp: u32) -> CaseMapping { + char_from_codepoint(cp).map_or_else( + || CaseMapping::identity(cp), + |ch| mapping_from_chars(ch.to_uppercase()), + ) +} + +pub fn to_titlecase(cp: u32) -> CaseMapping { + char_from_codepoint(cp).map_or_else( + || CaseMapping::identity(cp), + |ch| mapping_from_chars(ch.to_titlecase()), + ) +} + +pub fn to_lowercase_wtf8(text: &Wtf8) -> Wtf8Buf { + text.map_utf8(|s| s.chars().flat_map(char::to_lowercase)) + .collect() +} + +pub fn to_uppercase_wtf8(text: &Wtf8) -> Wtf8Buf { + text.map_utf8(|s| s.chars().flat_map(char::to_uppercase)) + .collect() +} + +#[cfg(feature = "casefold")] +pub fn casefold(cp: u32) -> CaseMapping { + char_from_codepoint(cp).map_or_else( + || CaseMapping::identity(cp), + |ch| { + let mut buf = [0; 4]; + mapping_from_string(caseless::default_case_fold_str(ch.encode_utf8(&mut buf))) + }, + ) +} + +#[cfg(feature = "casefold")] +pub fn casefold_str(text: &str) -> String { + caseless::default_case_fold_str(text) +} + +#[cfg(feature = "casefold")] +pub fn casefold_wtf8(text: &Wtf8) -> Wtf8Buf { + text.chunks() + .map(|chunk| match chunk { + Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(casefold_str(s)), + Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c), + }) + .collect() +} diff --git a/crates/unicode/src/classify.rs b/crates/unicode/src/classify.rs new file mode 100644 index 00000000000..56c6679e545 --- /dev/null +++ b/crates/unicode/src/classify.rs @@ -0,0 +1,65 @@ +use icu_properties::props::{BidiClass, EnumeratedProperty, GeneralCategory}; +use ucd::{Codepoint, NumericType}; + +use crate::{char_from_codepoint, is_surrogate}; + +pub fn general_category(cp: u32) -> GeneralCategory { + if is_surrogate(cp) { + GeneralCategory::Surrogate + } else { + char_from_codepoint(cp).map_or(GeneralCategory::Unassigned, GeneralCategory::for_char) + } +} + +pub fn is_alpha(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(char::is_alphabetic) +} + +pub fn is_alnum(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(char::is_alphanumeric) +} + +pub fn is_decimal(cp: u32) -> bool { + matches!(general_category(cp), GeneralCategory::DecimalNumber) +} + +pub fn is_digit(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(|ch| { + matches!( + ch.numeric_type(), + Some(NumericType::Decimal) | Some(NumericType::Digit) + ) + }) +} + +pub fn is_numeric(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(|ch| ch.numeric_value().is_some()) +} + +pub fn is_space(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(|ch| { + matches!(general_category(cp), GeneralCategory::SpaceSeparator) + || matches!( + BidiClass::for_char(ch), + BidiClass::WhiteSpace | BidiClass::ParagraphSeparator | BidiClass::SegmentSeparator + ) + }) +} + +pub fn is_printable(cp: u32) -> bool { + cp == '\u{0020}' as u32 || is_repr_printable(cp) +} + +pub fn is_repr_printable(cp: u32) -> bool { + !matches!( + general_category(cp), + GeneralCategory::SpaceSeparator + | GeneralCategory::LineSeparator + | GeneralCategory::ParagraphSeparator + | GeneralCategory::Control + | GeneralCategory::Format + | GeneralCategory::Surrogate + | GeneralCategory::PrivateUse + | GeneralCategory::Unassigned + ) +} diff --git a/crates/unicode/src/data.rs b/crates/unicode/src/data.rs new file mode 100644 index 00000000000..28d9e0b9553 --- /dev/null +++ b/crates/unicode/src/data.rs @@ -0,0 +1,132 @@ +use alloc::{format, string::String, vec::Vec}; + +use icu_properties::{ + CodePointSetData, + props::{ + BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty, + NamedEnumeratedProperty, + }, +}; +use itertools::Itertools; +use ucd::{Codepoint, DecompositionType, Number, NumericType}; +use unic_ucd_age::{Age, UnicodeVersion}; + +use crate::{char_from_codepoint, classify, is_surrogate}; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum NumericValue { + Integer(i64), + Rational(i64, i64), +} + +pub fn is_assigned_in_version(cp: u32, version: UnicodeVersion) -> bool { + if is_surrogate(cp) { + true + } else { + char_from_codepoint(cp) + .is_some_and(|ch| Age::of(ch).is_some_and(|age| age.actual() <= version)) + } +} + +pub fn category(cp: u32) -> &'static str { + classify::general_category(cp).short_name() +} + +pub fn lookup(name: &str) -> Option { + unicode_names2::character(name).map(u32::from) +} + +pub fn name(cp: u32) -> Option { + char_from_codepoint(cp) + .and_then(unicode_names2::name) + .map(|name| name.collect()) +} + +pub fn bidirectional(cp: u32) -> &'static str { + char_from_codepoint(cp) + .map_or(BidiClass::LeftToRight, BidiClass::for_char) + .short_name() +} + +pub fn east_asian_width(cp: u32) -> &'static str { + char_from_codepoint(cp) + .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char) + .short_name() +} + +pub fn mirrored(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(|ch| CodePointSetData::new::().contains(ch)) +} + +pub fn combining(cp: u32) -> u8 { + char_from_codepoint(cp).map_or(0, |ch| { + CanonicalCombiningClass::for_char(ch).to_icu4c_value() + }) +} + +pub fn decomposition(cp: u32) -> String { + let ch = match char_from_codepoint(cp) { + Some(ch) => ch, + None => return String::new(), + }; + let chars: Vec = ch.decomposition_map().collect(); + if chars.len() == 1 && chars[0] == ch { + return String::new(); + } + let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" "); + match ch.decomposition_type() { + Some(DecompositionType::Canonical) | None => hex_parts, + Some(dt) => format!("<{}> {hex_parts}", decomposition_type_tag(dt)), + } +} + +pub fn digit(cp: u32) -> Option { + let ch = char_from_codepoint(cp)?; + if matches!( + ch.numeric_type(), + Some(NumericType::Decimal) | Some(NumericType::Digit) + ) && let Some(Number::Integer(value)) = ch.numeric_value() + { + return u32::try_from(value).ok(); + } + None +} + +pub fn decimal(cp: u32) -> Option { + let ch = char_from_codepoint(cp)?; + if ch.numeric_type() == Some(NumericType::Decimal) + && let Some(Number::Integer(value)) = ch.numeric_value() + { + return u32::try_from(value).ok(); + } + None +} + +pub fn numeric(cp: u32) -> Option { + match char_from_codepoint(cp)?.numeric_value()? { + Number::Integer(value) => Some(NumericValue::Integer(value)), + Number::Rational(num, den) => Some(NumericValue::Rational(num.into(), den.into())), + } +} + +fn decomposition_type_tag(dt: DecompositionType) -> &'static str { + match dt { + DecompositionType::Canonical => "canonical", + DecompositionType::Compat => "compat", + DecompositionType::Circle => "circle", + DecompositionType::Final => "final", + DecompositionType::Font => "font", + DecompositionType::Fraction => "fraction", + DecompositionType::Initial => "initial", + DecompositionType::Isolated => "isolated", + DecompositionType::Medial => "medial", + DecompositionType::Narrow => "narrow", + DecompositionType::Nobreak => "noBreak", + DecompositionType::Small => "small", + DecompositionType::Square => "square", + DecompositionType::Sub => "sub", + DecompositionType::Super => "super", + DecompositionType::Vertical => "vertical", + DecompositionType::Wide => "wide", + } +} diff --git a/crates/unicode/src/identifier.rs b/crates/unicode/src/identifier.rs new file mode 100644 index 00000000000..30a3ae32d5e --- /dev/null +++ b/crates/unicode/src/identifier.rs @@ -0,0 +1,27 @@ +use icu_properties::props::{BinaryProperty, XidContinue, XidStart}; + +use crate::char_from_codepoint; + +pub fn is_xid_start(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(XidStart::for_char) +} + +pub fn is_xid_continue(cp: u32) -> bool { + char_from_codepoint(cp).is_some_and(XidContinue::for_char) +} + +pub fn is_python_identifier_start(cp: u32) -> bool { + cp == '_' as u32 || is_xid_start(cp) +} + +pub fn is_python_identifier_continue(cp: u32) -> bool { + is_xid_continue(cp) +} + +pub fn is_python_identifier(text: &str) -> bool { + let mut chars = text.chars(); + let is_identifier_start = chars + .next() + .is_some_and(|ch| is_python_identifier_start(ch as u32)); + is_identifier_start && chars.all(|ch| is_python_identifier_continue(ch as u32)) +} diff --git a/crates/unicode/src/lib.rs b/crates/unicode/src/lib.rs new file mode 100644 index 00000000000..7ac71be12f4 --- /dev/null +++ b/crates/unicode/src/lib.rs @@ -0,0 +1,77 @@ +#![cfg_attr(not(feature = "std"), no_std)] + +extern crate alloc; + +pub mod case; +pub mod classify; +pub mod data; +pub mod identifier; +pub mod normalize; +pub mod regex; + +pub use normalize::NormalizeForm; +pub use unic_ucd_age::{UNICODE_VERSION, UnicodeVersion}; + +use core::char; + +pub(crate) fn char_from_codepoint(cp: u32) -> Option { + char::from_u32(cp) +} + +pub(crate) const fn is_surrogate(cp: u32) -> bool { + matches!(cp, 0xD800..=0xDFFF) +} + +#[cfg(test)] +mod tests { + use alloc::vec::Vec; + use rustpython_wtf8::Wtf8Buf; + + use crate::{NormalizeForm, case, classify, data, identifier, normalize, regex}; + + #[test] + fn printable_and_repr_printable_follow_python_rules() { + assert!(classify::is_printable(' ' as u32)); + assert!(!classify::is_repr_printable(' ' as u32)); + assert!(!classify::is_printable('\n' as u32)); + } + + #[test] + fn identifier_and_regex_predicates_share_unicode_tables() { + assert!(identifier::is_python_identifier_start('_' as u32)); + assert!(identifier::is_python_identifier("유니코드")); + assert!(regex::is_unicode_word('가' as u32)); + assert!(regex::is_unicode_digit('५' as u32)); + assert!(regex::is_unicode_space('\u{3000}' as u32)); + } + + #[test] + fn case_and_normalization_helpers_support_full_mappings() { + let upper: Vec<_> = case::to_uppercase('ß' as u32).iter().collect(); + assert_eq!(upper, vec!['S' as u32, 'S' as u32]); + + let text = Wtf8Buf::from("e\u{301}"); + assert_eq!( + normalize::normalize(NormalizeForm::Nfc, &text), + Wtf8Buf::from("é") + ); + assert!(normalize::is_normalized( + NormalizeForm::Nfd, + &normalize::normalize(NormalizeForm::Nfd, &Wtf8Buf::from("é")) + )); + } + + #[test] + fn unicode_data_queries_match_existing_unicodedata_behavior() { + assert_eq!(data::category('A' as u32), "Lu"); + assert_eq!(data::category(0xD800), "Cs"); + assert_eq!(data::lookup("SNOWMAN"), Some('☃' as u32)); + assert_eq!(data::name('☃' as u32).as_deref(), Some("SNOWMAN")); + assert_eq!(data::decimal('५' as u32), Some(5)); + assert_eq!(data::digit('²' as u32), Some(2)); + assert_eq!( + data::numeric('⅓' as u32), + Some(data::NumericValue::Rational(1, 3)) + ); + } +} diff --git a/crates/unicode/src/normalize.rs b/crates/unicode/src/normalize.rs new file mode 100644 index 00000000000..a5a53504262 --- /dev/null +++ b/crates/unicode/src/normalize.rs @@ -0,0 +1,40 @@ +use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; +use rustpython_wtf8::{Wtf8, Wtf8Buf}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum NormalizeForm { + Nfc, + Nfkc, + Nfd, + Nfkd, +} + +pub fn normalize(form: NormalizeForm, text: &Wtf8) -> Wtf8Buf { + match form { + NormalizeForm::Nfc => { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + NormalizeForm::Nfkc => { + let normalizer = ComposingNormalizerBorrowed::new_nfkc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + NormalizeForm::Nfd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + NormalizeForm::Nfkd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + } +} + +pub fn is_normalized(form: NormalizeForm, text: &Wtf8) -> bool { + let normalized = normalize(form, text); + text == &*normalized +} diff --git a/crates/unicode/src/regex.rs b/crates/unicode/src/regex.rs new file mode 100644 index 00000000000..dcc057f7ad9 --- /dev/null +++ b/crates/unicode/src/regex.rs @@ -0,0 +1,85 @@ +use crate::{case, classify}; + +const fn is_py_ascii_whitespace(byte: u8) -> bool { + matches!(byte, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} + +pub fn is_word(cp: u32) -> bool { + cp == '_' as u32 + || u8::try_from(cp) + .map(|byte| byte.is_ascii_alphanumeric()) + .unwrap_or(false) +} + +pub fn is_space(cp: u32) -> bool { + u8::try_from(cp) + .map(is_py_ascii_whitespace) + .unwrap_or(false) +} + +pub fn is_digit(cp: u32) -> bool { + u8::try_from(cp) + .map(|byte| byte.is_ascii_digit()) + .unwrap_or(false) +} + +pub fn is_locale_alnum(cp: u32) -> bool { + u8::try_from(cp) + .map(|byte| byte.is_ascii_alphanumeric()) + .unwrap_or(false) +} + +pub fn is_locale_word(cp: u32) -> bool { + cp == '_' as u32 || is_locale_alnum(cp) +} + +pub const fn is_linebreak(cp: u32) -> bool { + cp == '\n' as u32 +} + +pub fn lower_ascii(cp: u32) -> u32 { + u8::try_from(cp) + .map(|byte| byte.to_ascii_lowercase() as u32) + .unwrap_or(cp) +} + +pub fn lower_locale(cp: u32) -> u32 { + lower_ascii(cp) +} + +pub fn upper_locale(cp: u32) -> u32 { + u8::try_from(cp) + .map(|byte| byte.to_ascii_uppercase() as u32) + .unwrap_or(cp) +} + +pub fn is_unicode_digit(cp: u32) -> bool { + classify::is_decimal(cp) +} + +pub fn is_unicode_space(cp: u32) -> bool { + classify::is_space(cp) +} + +pub const fn is_unicode_linebreak(cp: u32) -> bool { + matches!( + cp, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 + ) +} + +pub fn is_unicode_alnum(cp: u32) -> bool { + classify::is_alnum(cp) +} + +pub fn is_unicode_word(cp: u32) -> bool { + cp == '_' as u32 || is_unicode_alnum(cp) +} + +pub fn lower_unicode(cp: u32) -> u32 { + case::to_lowercase(cp).first().unwrap_or(cp) +} + +pub fn upper_unicode(cp: u32) -> u32 { + case::to_uppercase(cp).first().unwrap_or(cp) +} diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index b721418a4cc..22bf943fd1c 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -41,6 +41,7 @@ ruff_text_size = { workspace = true, optional = true } rustpython-compiler-core = { workspace = true } rustpython-literal = { workspace = true } rustpython-sre_engine = { workspace = true } +rustpython-unicode = { workspace = true, features = ["std", "casefold"] } ascii = { workspace = true } ahash = { workspace = true } @@ -86,7 +87,6 @@ timsort = "0.1.2" # TODO: use unic for this; needed for title case: # https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 unicode-casing = { workspace = true } -icu_properties = { workspace = true } [target.'cfg(unix)'.dependencies] rustix = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index b31dc6ccc9d..5731eac6475 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -41,12 +41,9 @@ use rustpython_common::{ hash, lock::PyMutex, str::DeduceStrKind, - wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, + wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Concat}, }; -use icu_properties::props::{ - BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart, -}; use unicode_casing::CharExt; impl<'a> TryFromBorrowedObject<'a> for String { @@ -698,7 +695,7 @@ impl PyStr { match self.as_str_kind() { PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(), PyKindStr::Utf8(s) => s.to_lowercase().into(), - PyKindStr::Wtf8(w) => w.to_lowercase().into(), + PyKindStr::Wtf8(w) => rustpython_unicode::case::to_lowercase_wtf8(w).into(), } } @@ -706,16 +703,9 @@ impl PyStr { #[pymethod] fn casefold(&self) -> Self { match self.as_str_kind() { - PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(), - PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(), - PyKindStr::Wtf8(w) => w - .chunks() - .map(|c| match c { - Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)), - Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c), - }) - .collect::() - .into(), + PyKindStr::Ascii(s) => rustpython_unicode::case::casefold_str(s.as_str()).into(), + PyKindStr::Utf8(s) => rustpython_unicode::case::casefold_str(s).into(), + PyKindStr::Wtf8(w) => rustpython_unicode::case::casefold_wtf8(w).into(), } } @@ -724,7 +714,7 @@ impl PyStr { match self.as_str_kind() { PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(), PyKindStr::Utf8(s) => s.to_uppercase().into(), - PyKindStr::Wtf8(w) => w.to_uppercase().into(), + PyKindStr::Wtf8(w) => rustpython_unicode::case::to_uppercase_wtf8(w).into(), } } @@ -967,9 +957,7 @@ impl PyStr { #[pymethod] fn isdecimal(&self) -> bool { !self.data.is_empty() - && self.char_all(|c| { - matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber) - }) + && self.char_all(|c| rustpython_unicode::classify::is_decimal(c as u32)) } fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult { @@ -1089,23 +1077,12 @@ impl PyStr { #[pymethod] fn isprintable(&self) -> bool { - self.char_all(|c| c == '\u{0020}' || rustpython_literal::char::is_printable(c)) + self.char_all(|c| rustpython_unicode::classify::is_printable(c as u32)) } #[pymethod] fn isspace(&self) -> bool { - !self.data.is_empty() - && self.char_all(|c| { - matches!( - GeneralCategory::for_char(c), - GeneralCategory::SpaceSeparator - ) || matches!( - BidiClass::for_char(c), - BidiClass::WhiteSpace - | BidiClass::ParagraphSeparator - | BidiClass::SegmentSeparator - ) - }) + !self.data.is_empty() && self.char_all(|c| rustpython_unicode::classify::is_space(c as u32)) } // Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise. @@ -1362,15 +1339,8 @@ impl PyStr { #[pymethod] pub fn isidentifier(&self) -> bool { - let Some(s) = self.to_str() else { return false }; - let mut chars = s.chars(); - - let is_identifier_start = chars - .next() - .is_some_and(|c| c == '_' || XidStart::for_char(c)); - - // a string is not an identifier if it has whitespace or starts with a number - is_identifier_start && chars.all(XidContinue::for_char) + self.to_str() + .is_some_and(rustpython_unicode::identifier::is_python_identifier) } // https://docs.python.org/3/library/stdtypes.html#str.translate diff --git a/extra_tests/snippets/stdlib_unicode_shared.py b/extra_tests/snippets/stdlib_unicode_shared.py new file mode 100644 index 00000000000..94aef5cdf2b --- /dev/null +++ b/extra_tests/snippets/stdlib_unicode_shared.py @@ -0,0 +1,20 @@ +import re +import unicodedata + +assert "유니코드".isidentifier() +assert "५".isdecimal() +assert "\u3000".isspace() +assert " ".isprintable() +assert not "\n".isprintable() + +assert unicodedata.category("\ud800") == "Cs" +assert unicodedata.lookup("SNOWMAN") == "☃" +assert unicodedata.name("☃") == "SNOWMAN" +assert unicodedata.normalize("NFC", "e\u0301") == "é" +assert unicodedata.digit("²") == 2 +assert unicodedata.decimal("५") == 5 +assert unicodedata.numeric("⅓") == 1 / 3 + +assert re.fullmatch(r"\w+", "가나다") +assert re.fullmatch(r"\d+", "५६७") +assert re.fullmatch(r"\s+", "\u3000") From e968d838082a3655b4f9c29192535d07325ae120 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 07:44:23 +0000 Subject: [PATCH 3/7] Tidy shared unicode review feedback Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- crates/sre_engine/src/string.rs | 13 ++----------- crates/unicode/src/classify.rs | 2 ++ 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs index bb2974bca5b..b2333330a46 100644 --- a/crates/sre_engine/src/string.rs +++ b/crates/sre_engine/src/string.rs @@ -345,13 +345,8 @@ pub(crate) fn is_digit(ch: u32) -> bool { rustpython_unicode::regex::is_digit(ch) } #[inline] -pub(crate) fn is_loc_alnum(ch: u32) -> bool { - // FIXME: Ignore the locales - rustpython_unicode::regex::is_locale_alnum(ch) -} -#[inline] pub(crate) fn is_loc_word(ch: u32) -> bool { - ch == '_' as u32 || is_loc_alnum(ch) + rustpython_unicode::regex::is_locale_word(ch) } #[inline] pub(crate) const fn is_linebreak(ch: u32) -> bool { @@ -384,12 +379,8 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { rustpython_unicode::regex::is_unicode_linebreak(ch) } #[inline] -pub(crate) fn is_uni_alnum(ch: u32) -> bool { - rustpython_unicode::regex::is_unicode_alnum(ch) -} -#[inline] pub(crate) fn is_uni_word(ch: u32) -> bool { - ch == '_' as u32 || is_uni_alnum(ch) + rustpython_unicode::regex::is_unicode_word(ch) } #[inline] pub fn lower_unicode(ch: u32) -> u32 { diff --git a/crates/unicode/src/classify.rs b/crates/unicode/src/classify.rs index 56c6679e545..a6bd732e3cb 100644 --- a/crates/unicode/src/classify.rs +++ b/crates/unicode/src/classify.rs @@ -46,10 +46,12 @@ pub fn is_space(cp: u32) -> bool { }) } +/// Python's `str.isprintable()` semantics, which treat ASCII space as printable. pub fn is_printable(cp: u32) -> bool { cp == '\u{0020}' as u32 || is_repr_printable(cp) } +/// Repr/escape printable semantics, which exclude all Unicode space separators. pub fn is_repr_printable(cp: u32) -> bool { !matches!( general_category(cp), From 5cf1bd666709a79096a30109c0f4bbd2896c88e4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 07:45:20 +0000 Subject: [PATCH 4/7] Polish unicode regex helpers Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- crates/unicode/src/regex.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/unicode/src/regex.rs b/crates/unicode/src/regex.rs index dcc057f7ad9..1f2045b96bd 100644 --- a/crates/unicode/src/regex.rs +++ b/crates/unicode/src/regex.rs @@ -1,11 +1,13 @@ use crate::{case, classify}; +const UNDERSCORE: u32 = '_' as u32; + const fn is_py_ascii_whitespace(byte: u8) -> bool { matches!(byte, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } pub fn is_word(cp: u32) -> bool { - cp == '_' as u32 + cp == UNDERSCORE || u8::try_from(cp) .map(|byte| byte.is_ascii_alphanumeric()) .unwrap_or(false) @@ -30,7 +32,7 @@ pub fn is_locale_alnum(cp: u32) -> bool { } pub fn is_locale_word(cp: u32) -> bool { - cp == '_' as u32 || is_locale_alnum(cp) + cp == UNDERSCORE || is_locale_alnum(cp) } pub const fn is_linebreak(cp: u32) -> bool { @@ -73,7 +75,7 @@ pub fn is_unicode_alnum(cp: u32) -> bool { } pub fn is_unicode_word(cp: u32) -> bool { - cp == '_' as u32 || is_unicode_alnum(cp) + cp == UNDERSCORE || is_unicode_alnum(cp) } pub fn lower_unicode(cp: u32) -> u32 { From 0a340de9c30e00c6794464104397ef021244aeab Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:53:23 +0000 Subject: [PATCH 5/7] Finish unicode crate follow-up refactors Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- Cargo.lock | 4 +- crates/codegen/Cargo.toml | 2 +- crates/codegen/src/string_parser.rs | 4 +- crates/common/Cargo.toml | 2 +- crates/common/src/encodings.rs | 2 +- crates/stdlib/Cargo.toml | 2 +- crates/stdlib/src/unicodedata.rs | 163 ++++++++++------------------ crates/unicode/Cargo.toml | 5 +- crates/unicode/src/data.rs | 100 ++++++++++++++++- crates/unicode/src/lib.rs | 2 +- crates/unicode/src/normalize.rs | 15 +++ crates/vm/Cargo.toml | 2 +- 12 files changed, 185 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 553d18c7be3..0e272c3eaf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3123,9 +3123,9 @@ dependencies = [ "rustpython-ruff_python_ast", "rustpython-ruff_python_parser", "rustpython-ruff_text_size", + "rustpython-unicode", "rustpython-wtf8", "thiserror 2.0.18", - "unicode_names2 2.0.0", ] [[package]] @@ -3148,9 +3148,9 @@ dependencies = [ "parking_lot", "radium", "rustpython-literal", + "rustpython-unicode", "rustpython-wtf8", "siphasher", - "unicode_names2 2.0.0", "widestring", "windows-sys 0.61.2", ] diff --git a/crates/codegen/Cargo.toml b/crates/codegen/Cargo.toml index 78065962fff..3a5a5acb810 100644 --- a/crates/codegen/Cargo.toml +++ b/crates/codegen/Cargo.toml @@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"] [dependencies] rustpython-compiler-core = { workspace = true } +rustpython-unicode = { workspace = true, default-features = false } rustpython-literal = {workspace = true } rustpython-wtf8 = { workspace = true } ruff_python_ast = { workspace = true } @@ -29,7 +30,6 @@ num-traits = { workspace = true } thiserror = { workspace = true } malachite-bigint = { workspace = true } memchr = { workspace = true } -unicode_names2 = { workspace = true } [dev-dependencies] ruff_python_parser = { workspace = true } diff --git a/crates/codegen/src/string_parser.rs b/crates/codegen/src/string_parser.rs index a7ad8c35a46..8934e1868cd 100644 --- a/crates/codegen/src/string_parser.rs +++ b/crates/codegen/src/string_parser.rs @@ -113,7 +113,9 @@ impl StringParser { let name_and_ending = self.skip_bytes(close_idx + 1); let name = &name_and_ending[..name_and_ending.len() - 1]; - unicode_names2::character(name).ok_or_else(|| unreachable!()) + rustpython_unicode::data::lookup(name) + .and_then(char::from_u32) + .ok_or_else(|| unreachable!()) } /// Parse an escaped character, returning the new character. diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 555336f059a..a6694ad6180 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"] [dependencies] rustpython-literal = { workspace = true } +rustpython-unicode = { workspace = true, default-features = false } rustpython-wtf8 = { workspace = true } ascii = { workspace = true } @@ -29,7 +30,6 @@ malachite-q = { workspace = true } malachite-base = { workspace = true } num-traits = { workspace = true } parking_lot = { workspace = true, optional = true } -unicode_names2 = { workspace = true } radium = { workspace = true } lock_api = "0.4" diff --git a/crates/common/src/encodings.rs b/crates/common/src/encodings.rs index 913f0521e16..54a757de358 100644 --- a/crates/common/src/encodings.rs +++ b/crates/common/src/encodings.rs @@ -414,7 +414,7 @@ pub mod errors { let mut out = String::with_capacity(num_chars * 4); for c in err_str.code_points() { let c_u32 = c.to_u32(); - if let Some(c_name) = c.to_char().and_then(unicode_names2::name) { + if let Some(c_name) = rustpython_unicode::data::name(c_u32) { write!(out, "\\N{{{c_name}}}").unwrap(); } else if c_u32 >= 0x10000 { write!(out, "\\U{c_u32:08x}").unwrap(); diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index 6945132243f..9e3c5382ee0 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -28,7 +28,7 @@ flame-it = ["flame"] [dependencies] # rustpython crates rustpython-derive = { workspace = true } -rustpython-unicode = { workspace = true, features = ["std", "casefold"] } +rustpython-unicode = { workspace = true, features = ["casefold"] } rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]} rustpython-common = { workspace = true } diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index 622a7854157..2633ba4c0a0 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -6,43 +6,6 @@ pub(crate) use unicodedata::module_def; -use crate::vm::{ - PyObject, PyResult, VirtualMachine, builtins::PyStr, convert::TryFromBorrowedObject, -}; - -enum NormalizeForm { - Nfc, - Nfkc, - Nfd, - Nfkd, -} - -impl From for rustpython_unicode::NormalizeForm { - fn from(value: NormalizeForm) -> Self { - match value { - NormalizeForm::Nfc => Self::Nfc, - NormalizeForm::Nfkc => Self::Nfkc, - NormalizeForm::Nfd => Self::Nfd, - NormalizeForm::Nfkd => Self::Nfkd, - } - } -} - -impl<'a> TryFromBorrowedObject<'a> for NormalizeForm { - fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { - obj.try_value_with( - |form: &PyStr| match form.as_bytes() { - b"NFC" => Ok(Self::Nfc), - b"NFKC" => Ok(Self::Nfkc), - b"NFD" => Ok(Self::Nfd), - b"NFKD" => Ok(Self::Nfkd), - _ => Err(vm.new_value_error("invalid normalization form")), - }, - vm, - ) - } -} - #[pymodule] mod unicodedata { use crate::vm::{ @@ -53,13 +16,20 @@ mod unicodedata { use itertools::Itertools; use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; - use rustpython_unicode::{UNICODE_VERSION, UnicodeVersion, data, normalize}; + use rustpython_unicode::{NormalizeForm, UNICODE_VERSION, UnicodeVersion, data}; + + fn parse_normalize_form(form: PyStrRef, vm: &VirtualMachine) -> PyResult { + form.to_str() + .ok_or_else(|| vm.new_value_error("invalid normalization form"))? + .parse() + .map_err(|()| vm.new_value_error("invalid normalization form")) + } pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py) -> PyResult<()> { __module_exec(vm, module); // Add UCD methods as module-level functions - let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into(); + let ucd: PyObjectRef = PyUcd::new(data::Ucd::default()).into_ref(&vm.ctx).into(); for attr in [ "category", @@ -85,49 +55,36 @@ mod unicodedata { #[pyattr] #[pyclass(name = "UCD")] #[derive(Debug, PyPayload)] - pub(super) struct Ucd { - unic_version: UnicodeVersion, - } - - impl Ucd { - pub const fn new(unic_version: UnicodeVersion) -> Self { - Self { unic_version } - } + pub(super) struct PyUcd(data::Ucd); - fn check_age(&self, c: CodePoint) -> bool { - data::is_assigned_in_version(c.to_u32(), self.unic_version) + impl PyUcd { + pub const fn new(ucd: data::Ucd) -> Self { + Self(ucd) } - fn extract_char( - &self, - character: PyStrRef, - vm: &VirtualMachine, - ) -> PyResult> { - let c = character + fn extract_char(character: PyStrRef, vm: &VirtualMachine) -> PyResult { + character .as_wtf8() .code_points() .exactly_one() - .map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))?; - - Ok(self.check_age(c).then_some(c)) + .map_err(|_| vm.new_type_error("argument must be an unicode character, not str")) } } #[pyclass(flags(DISALLOW_INSTANTIATION))] - impl Ucd { + impl PyUcd { #[pymethod] fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self - .extract_char(character, vm)? - .map_or("Cn", |c| data::category(c.to_u32())) + .0 + .category(Self::extract_char(character, vm)?.to_u32()) .to_owned()) } #[pymethod] fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult { if let Some(name_str) = name.to_str() - && let Some(character) = data::lookup(name_str) - && self.check_age(CodePoint::from_u32(character).expect("valid Unicode code point")) + && let Some(character) = self.0.lookup(name_str) { return Ok(char::from_u32(character) .expect("unicode_names2 only returns Unicode scalar values") @@ -147,12 +104,7 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - let c = self.extract_char(character, vm)?; - - if let Some(c) = c - && self.check_age(c) - && let Some(name) = data::name(c.to_u32()) - { + if let Some(name) = self.0.name(Self::extract_char(character, vm)?.to_u32()) { return Ok(vm.ctx.new_str(name).into()); } default.ok_or_else(|| vm.new_value_error("no such name")) @@ -165,8 +117,8 @@ mod unicodedata { vm: &VirtualMachine, ) -> PyResult<&'static str> { Ok(self - .extract_char(character, vm)? - .map_or("", |c| data::bidirectional(c.to_u32()))) + .0 + .bidirectional(Self::extract_char(character, vm)?.to_u32())) } /// NOTE: This function uses 9.0.0 database instead of 3.2.0 @@ -177,39 +129,51 @@ mod unicodedata { vm: &VirtualMachine, ) -> PyResult<&'static str> { Ok(self - .extract_char(character, vm)? - .map_or("N", |c| data::east_asian_width(c.to_u32()))) + .0 + .east_asian_width(Self::extract_char(character, vm)?.to_u32())) } #[pymethod] - fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { - Ok(normalize::normalize(form.into(), unistr.as_wtf8())) + fn normalize( + &self, + form: PyStrRef, + unistr: PyStrRef, + vm: &VirtualMachine, + ) -> PyResult { + Ok(self + .0 + .normalize(parse_normalize_form(form, vm)?, unistr.as_wtf8())) } #[pymethod] - fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { - Ok(normalize::is_normalized(form.into(), unistr.as_wtf8())) + fn is_normalized( + &self, + form: PyStrRef, + unistr: PyStrRef, + vm: &VirtualMachine, + ) -> PyResult { + Ok(self + .0 + .is_normalized(parse_normalize_form(form, vm)?, unistr.as_wtf8())) } #[pymethod] fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { - Ok(self - .extract_char(character, vm)? - .is_some_and(|c| data::mirrored(c.to_u32())) as i32) + Ok(self.0.mirrored(Self::extract_char(character, vm)?.to_u32()) as i32) } #[pymethod] fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self - .extract_char(character, vm)? - .map_or(0, |c| data::combining(c.to_u32()))) + .0 + .combining(Self::extract_char(character, vm)?.to_u32())) } #[pymethod] fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self - .extract_char(character, vm)? - .map_or_else(String::new, |c| data::decomposition(c.to_u32()))) + .0 + .decomposition(Self::extract_char(character, vm)?.to_u32())) } #[pymethod] @@ -219,10 +183,7 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - if let Some(value) = self - .extract_char(character, vm)? - .and_then(|c| data::digit(c.to_u32())) - { + if let Some(value) = self.0.digit(Self::extract_char(character, vm)?.to_u32()) { return Ok(vm.ctx.new_int(value).into()); } default.ok_or_else(|| vm.new_value_error("not a digit")) @@ -235,10 +196,7 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - if let Some(value) = self - .extract_char(character, vm)? - .and_then(|c| data::decimal(c.to_u32())) - { + if let Some(value) = self.0.decimal(Self::extract_char(character, vm)?.to_u32()) { return Ok(vm.ctx.new_int(value).into()); } default.ok_or_else(|| vm.new_value_error("not a decimal")) @@ -251,10 +209,7 @@ mod unicodedata { default: OptionalArg, vm: &VirtualMachine, ) -> PyResult { - if let Some(value) = self - .extract_char(character, vm)? - .and_then(|c| data::numeric(c.to_u32())) - { + if let Some(value) = self.0.numeric(Self::extract_char(character, vm)?.to_u32()) { let value = match value { data::NumericValue::Integer(n) => n as f64, data::NumericValue::Rational(num, den) => num as f64 / den as f64, @@ -266,19 +221,17 @@ mod unicodedata { #[pygetset] fn unidata_version(&self) -> String { - self.unic_version.to_string() + self.0.unicode_version().to_string() } } #[pyattr] - fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef { - Ucd { - unic_version: UnicodeVersion { - major: 3, - minor: 2, - micro: 0, - }, - } + fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef { + PyUcd::new(data::Ucd::new(UnicodeVersion { + major: 3, + minor: 2, + micro: 0, + })) .into_ref(&vm.ctx) } diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml index 51fc781e790..ff59bb0dd77 100644 --- a/crates/unicode/Cargo.toml +++ b/crates/unicode/Cargo.toml @@ -9,9 +9,8 @@ repository.workspace = true license.workspace = true [features] -default = ["std", "casefold"] -std = [] -casefold = ["std", "dep:caseless"] +default = ["casefold"] +casefold = ["dep:caseless"] [dependencies] rustpython-wtf8 = { workspace = true } diff --git a/crates/unicode/src/data.rs b/crates/unicode/src/data.rs index 28d9e0b9553..5c9a48a071a 100644 --- a/crates/unicode/src/data.rs +++ b/crates/unicode/src/data.rs @@ -9,7 +9,7 @@ use icu_properties::{ }; use itertools::Itertools; use ucd::{Codepoint, DecompositionType, Number, NumericType}; -use unic_ucd_age::{Age, UnicodeVersion}; +use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; use crate::{char_from_codepoint, classify, is_surrogate}; @@ -19,6 +19,104 @@ pub enum NumericValue { Rational(i64, i64), } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Ucd { + unic_version: UnicodeVersion, +} + +impl Default for Ucd { + fn default() -> Self { + Self::new(UNICODE_VERSION) + } +} + +impl Ucd { + pub const fn new(unic_version: UnicodeVersion) -> Self { + Self { unic_version } + } + + pub const fn unicode_version(&self) -> UnicodeVersion { + self.unic_version + } + + pub fn category(&self, cp: u32) -> &'static str { + if self.contains(cp) { + category(cp) + } else { + "Cn" + } + } + + pub fn lookup(&self, name: &str) -> Option { + let cp = lookup(name)?; + self.contains(cp).then_some(cp) + } + + pub fn name(&self, cp: u32) -> Option { + self.contains(cp).then(|| name(cp)).flatten() + } + + pub fn bidirectional(&self, cp: u32) -> &'static str { + if self.contains(cp) { + bidirectional(cp) + } else { + "" + } + } + + pub fn east_asian_width(&self, cp: u32) -> &'static str { + if self.contains(cp) { + east_asian_width(cp) + } else { + "N" + } + } + + pub fn normalize( + &self, + form: crate::NormalizeForm, + text: &rustpython_wtf8::Wtf8, + ) -> rustpython_wtf8::Wtf8Buf { + crate::normalize::normalize(form, text) + } + + pub fn is_normalized(&self, form: crate::NormalizeForm, text: &rustpython_wtf8::Wtf8) -> bool { + crate::normalize::is_normalized(form, text) + } + + pub fn mirrored(&self, cp: u32) -> bool { + self.contains(cp) && mirrored(cp) + } + + pub fn combining(&self, cp: u32) -> u8 { + if self.contains(cp) { combining(cp) } else { 0 } + } + + pub fn decomposition(&self, cp: u32) -> String { + if self.contains(cp) { + decomposition(cp) + } else { + String::new() + } + } + + pub fn digit(&self, cp: u32) -> Option { + self.contains(cp).then(|| digit(cp)).flatten() + } + + pub fn decimal(&self, cp: u32) -> Option { + self.contains(cp).then(|| decimal(cp)).flatten() + } + + pub fn numeric(&self, cp: u32) -> Option { + self.contains(cp).then(|| numeric(cp)).flatten() + } + + fn contains(&self, cp: u32) -> bool { + is_assigned_in_version(cp, self.unic_version) + } +} + pub fn is_assigned_in_version(cp: u32, version: UnicodeVersion) -> bool { if is_surrogate(cp) { true diff --git a/crates/unicode/src/lib.rs b/crates/unicode/src/lib.rs index 7ac71be12f4..6ccc9fb3307 100644 --- a/crates/unicode/src/lib.rs +++ b/crates/unicode/src/lib.rs @@ -1,4 +1,4 @@ -#![cfg_attr(not(feature = "std"), no_std)] +#![cfg_attr(not(feature = "casefold"), no_std)] extern crate alloc; diff --git a/crates/unicode/src/normalize.rs b/crates/unicode/src/normalize.rs index a5a53504262..702f27569a1 100644 --- a/crates/unicode/src/normalize.rs +++ b/crates/unicode/src/normalize.rs @@ -1,3 +1,4 @@ +use core::str::FromStr; use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; use rustpython_wtf8::{Wtf8, Wtf8Buf}; @@ -9,6 +10,20 @@ pub enum NormalizeForm { Nfkd, } +impl FromStr for NormalizeForm { + type Err = (); + + fn from_str(s: &str) -> Result { + match s { + "NFC" => Ok(Self::Nfc), + "NFKC" => Ok(Self::Nfkc), + "NFD" => Ok(Self::Nfd), + "NFKD" => Ok(Self::Nfkd), + _ => Err(()), + } + } +} + pub fn normalize(form: NormalizeForm, text: &Wtf8) -> Wtf8Buf { match form { NormalizeForm::Nfc => { diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 22bf943fd1c..d1c52d9e40b 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -41,7 +41,7 @@ ruff_text_size = { workspace = true, optional = true } rustpython-compiler-core = { workspace = true } rustpython-literal = { workspace = true } rustpython-sre_engine = { workspace = true } -rustpython-unicode = { workspace = true, features = ["std", "casefold"] } +rustpython-unicode = { workspace = true, features = ["casefold"] } ascii = { workspace = true } ahash = { workspace = true } From 2934897035a4a169bd6a669ed59972725e13e64a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:54:41 +0000 Subject: [PATCH 6/7] Polish unicode follow-up review fixes Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- crates/stdlib/src/unicodedata.rs | 2 +- crates/unicode/Cargo.toml | 5 +++-- crates/unicode/src/lib.rs | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index 2633ba4c0a0..d563021e22d 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -67,7 +67,7 @@ mod unicodedata { .as_wtf8() .code_points() .exactly_one() - .map_err(|_| vm.new_type_error("argument must be an unicode character, not str")) + .map_err(|_| vm.new_type_error("argument must be a Unicode character, not str")) } } diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml index ff59bb0dd77..51fc781e790 100644 --- a/crates/unicode/Cargo.toml +++ b/crates/unicode/Cargo.toml @@ -9,8 +9,9 @@ repository.workspace = true license.workspace = true [features] -default = ["casefold"] -casefold = ["dep:caseless"] +default = ["std", "casefold"] +std = [] +casefold = ["std", "dep:caseless"] [dependencies] rustpython-wtf8 = { workspace = true } diff --git a/crates/unicode/src/lib.rs b/crates/unicode/src/lib.rs index 6ccc9fb3307..7ac71be12f4 100644 --- a/crates/unicode/src/lib.rs +++ b/crates/unicode/src/lib.rs @@ -1,4 +1,4 @@ -#![cfg_attr(not(feature = "casefold"), no_std)] +#![cfg_attr(not(feature = "std"), no_std)] extern crate alloc; From 4efa5da5f68851b7b5c28fc53355590266d51db9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 02:07:56 +0000 Subject: [PATCH 7/7] Simplify unicode regex call sites Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/b894057a-9bed-4f35-8400-a5731c63602d Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com> --- Cargo.lock | 1 - crates/literal/src/char.rs | 12 ---- crates/literal/src/escape.rs | 6 +- crates/literal/src/lib.rs | 1 - crates/sre_engine/src/engine.rs | 108 ++++++++++++++++++-------------- crates/sre_engine/src/string.rs | 59 ----------------- crates/vm/Cargo.toml | 1 - crates/vm/src/stdlib/_sre.rs | 12 ++-- 8 files changed, 69 insertions(+), 131 deletions(-) delete mode 100644 crates/literal/src/char.rs diff --git a/Cargo.lock b/Cargo.lock index 0e272c3eaf7..6ac26758e5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3457,7 +3457,6 @@ dependencies = [ "ascii", "bitflags 2.11.0", "bstr", - "caseless", "cfg-if", "chrono", "constant_time_eq", diff --git a/crates/literal/src/char.rs b/crates/literal/src/char.rs deleted file mode 100644 index 4544133d3b6..00000000000 --- a/crates/literal/src/char.rs +++ /dev/null @@ -1,12 +0,0 @@ -/// According to python following categories aren't printable: -/// * Cc (Other, Control) -/// * Cf (Other, Format) -/// * Cs (Other, Surrogate) -/// * Co (Other, Private Use) -/// * Cn (Other, Not Assigned) -/// * Zl Separator, Line ('\u2028', LINE SEPARATOR) -/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) -/// * Zs (Separator, Space) other than ASCII space('\x20'). -pub fn is_printable(c: char) -> bool { - rustpython_unicode::classify::is_repr_printable(c as u32) -} diff --git a/crates/literal/src/escape.rs b/crates/literal/src/escape.rs index 1099c0a02bc..01df100a004 100644 --- a/crates/literal/src/escape.rs +++ b/crates/literal/src/escape.rs @@ -204,7 +204,7 @@ impl UnicodeEscape<'_> { '\\' | '\t' | '\r' | '\n' => 2, ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH ch if ch.is_ascii() => 1, - ch if crate::char::is_printable(ch) => { + ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => { // max = std::cmp::max(ch, max); ch.len_utf8() } @@ -238,7 +238,9 @@ impl UnicodeEscape<'_> { ch if ch.is_ascii() => { write!(formatter, "\\x{:02x}", ch as u8) } - ch if crate::char::is_printable(ch) => formatter.write_char(ch), + ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => { + formatter.write_char(ch) + } '\0'..='\u{ff}' => { write!(formatter, "\\x{:02x}", ch as u32) } diff --git a/crates/literal/src/lib.rs b/crates/literal/src/lib.rs index a863dd87738..6d520900142 100644 --- a/crates/literal/src/lib.rs +++ b/crates/literal/src/lib.rs @@ -2,7 +2,6 @@ extern crate alloc; -pub mod char; pub mod complex; pub mod escape; pub mod float; diff --git a/crates/sre_engine/src/engine.rs b/crates/sre_engine/src/engine.rs index 73e263012fc..c23d3477fbb 100644 --- a/crates/sre_engine/src/engine.rs +++ b/crates/sre_engine/src/engine.rs @@ -1,14 +1,10 @@ // good luck to those that follow; here be dragons -use crate::string::{ - is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space, - is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode, -}; - use super::{MAXREPEAT, SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor}; use alloc::{vec, vec::Vec}; use core::{convert::TryFrom, ptr::null}; use optional::Optioned; +use rustpython_unicode::regex as unicode_regex; #[derive(Debug, Clone, Copy)] pub struct Request<'a, S> { @@ -659,10 +655,10 @@ fn _match(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo } SreOpcode::IN => general_op_in!(charset), SreOpcode::IN_IGNORE => { - general_op_in!(|set, c| charset(set, lower_ascii(c))) + general_op_in!(|set, c| charset(set, unicode_regex::lower_ascii(c))) } SreOpcode::IN_UNI_IGNORE => { - general_op_in!(|set, c| charset(set, lower_unicode(c))) + general_op_in!(|set, c| charset(set, unicode_regex::lower_unicode(c))) } SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), SreOpcode::MARK => { @@ -803,25 +799,31 @@ fn _match(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo SreOpcode::LITERAL => general_op_literal!(|code, c| code == c), SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c), SreOpcode::LITERAL_IGNORE => { - general_op_literal!(|code, c| code == lower_ascii(c)) + general_op_literal!(|code, c| code == unicode_regex::lower_ascii(c)) } SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal!(|code, c| code != lower_ascii(c)) + general_op_literal!(|code, c| code != unicode_regex::lower_ascii(c)) } SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal!(|code, c| code == lower_unicode(c)) + general_op_literal!(|code, c| code == unicode_regex::lower_unicode(c)) } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal!(|code, c| code != lower_unicode(c)) + general_op_literal!(|code, c| code != unicode_regex::lower_unicode(c)) } SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore), SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_op_literal!(|code, c| !char_loc_ignore(code, c)) } SreOpcode::GROUPREF => general_op_groupref!(|x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode), + SreOpcode::GROUPREF_IGNORE => { + general_op_groupref!(unicode_regex::lower_ascii) + } + SreOpcode::GROUPREF_LOC_IGNORE => { + general_op_groupref!(unicode_regex::lower_locale) + } + SreOpcode::GROUPREF_UNI_IGNORE => { + general_op_groupref!(unicode_regex::lower_unicode) + } SreOpcode::GROUPREF_EXISTS => { let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); @@ -1125,7 +1127,7 @@ impl MatchContext { } fn at_linebreak(&self, req: &Request<'_, S>) -> bool { - !self.at_end(req) && is_linebreak(self.peek_char::()) + !self.at_end(req) && unicode_regex::is_linebreak(self.peek_char::()) } fn at_boundary bool>( @@ -1192,54 +1194,56 @@ impl MatchContext { fn at(req: &Request<'_, S>, ctx: &MatchContext, at_code: SreAtCode) -> bool { match at_code { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), - SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::()), - SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), - SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), + SreAtCode::BEGINNING_LINE => { + ctx.at_beginning() || unicode_regex::is_linebreak(ctx.back_peek_char::()) + } + SreAtCode::BOUNDARY => ctx.at_boundary(req, unicode_regex::is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_word), SreAtCode::END => { (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req) } SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req), SreAtCode::END_STRING => ctx.at_end(req), - SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word), - SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_locale_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_locale_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_unicode_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_unicode_word), } } fn char_loc_ignore(code: u32, c: u32) -> bool { - code == c || code == lower_locate(c) || code == upper_locate(c) + code == c || code == unicode_regex::lower_locale(c) || code == unicode_regex::upper_locale(c) } fn charset_loc_ignore(set: &[u32], c: u32) -> bool { - let lo = lower_locate(c); + let lo = unicode_regex::lower_locale(c); if charset(set, c) { return true; } - let up = upper_locate(c); + let up = unicode_regex::upper_locale(c); up != lo && charset(set, up) } fn category(cat_code: SreCatCode, c: u32) -> bool { match cat_code { - SreCatCode::DIGIT => is_digit(c), - SreCatCode::NOT_DIGIT => !is_digit(c), - SreCatCode::SPACE => is_space(c), - SreCatCode::NOT_SPACE => !is_space(c), - SreCatCode::WORD => is_word(c), - SreCatCode::NOT_WORD => !is_word(c), - SreCatCode::LINEBREAK => is_linebreak(c), - SreCatCode::NOT_LINEBREAK => !is_linebreak(c), - SreCatCode::LOC_WORD => is_loc_word(c), - SreCatCode::LOC_NOT_WORD => !is_loc_word(c), - SreCatCode::UNI_DIGIT => is_uni_digit(c), - SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c), - SreCatCode::UNI_SPACE => is_uni_space(c), - SreCatCode::UNI_NOT_SPACE => !is_uni_space(c), - SreCatCode::UNI_WORD => is_uni_word(c), - SreCatCode::UNI_NOT_WORD => !is_uni_word(c), - SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c), - SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c), + SreCatCode::DIGIT => unicode_regex::is_digit(c), + SreCatCode::NOT_DIGIT => !unicode_regex::is_digit(c), + SreCatCode::SPACE => unicode_regex::is_space(c), + SreCatCode::NOT_SPACE => !unicode_regex::is_space(c), + SreCatCode::WORD => unicode_regex::is_word(c), + SreCatCode::NOT_WORD => !unicode_regex::is_word(c), + SreCatCode::LINEBREAK => unicode_regex::is_linebreak(c), + SreCatCode::NOT_LINEBREAK => !unicode_regex::is_linebreak(c), + SreCatCode::LOC_WORD => unicode_regex::is_locale_word(c), + SreCatCode::LOC_NOT_WORD => !unicode_regex::is_locale_word(c), + SreCatCode::UNI_DIGIT => unicode_regex::is_unicode_digit(c), + SreCatCode::UNI_NOT_DIGIT => !unicode_regex::is_unicode_digit(c), + SreCatCode::UNI_SPACE => unicode_regex::is_unicode_space(c), + SreCatCode::UNI_NOT_SPACE => !unicode_regex::is_unicode_space(c), + SreCatCode::UNI_WORD => unicode_regex::is_unicode_word(c), + SreCatCode::UNI_NOT_WORD => !unicode_regex::is_unicode_word(c), + SreCatCode::UNI_LINEBREAK => unicode_regex::is_unicode_linebreak(c), + SreCatCode::UNI_NOT_LINEBREAK => !unicode_regex::is_unicode_linebreak(c), } } @@ -1320,7 +1324,7 @@ fn charset(set: &[u32], ch: u32) -> bool { if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } - let ch = upper_unicode(ch); + let ch = unicode_regex::upper_unicode(ch); if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } @@ -1368,10 +1372,14 @@ fn _count( general_count_literal(req, ctx, end, |code, c| code != c); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c)); + general_count_literal(req, ctx, end, |code, c| { + code == unicode_regex::lower_ascii(c) + }); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c)); + general_count_literal(req, ctx, end, |code, c| { + code != unicode_regex::lower_ascii(c) + }); } SreOpcode::LITERAL_LOC_IGNORE => { general_count_literal(req, ctx, end, char_loc_ignore); @@ -1380,10 +1388,14 @@ fn _count( general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c)); + general_count_literal(req, ctx, end, |code, c| { + code == unicode_regex::lower_unicode(c) + }); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c)); + general_count_literal(req, ctx, end, |code, c| { + code != unicode_regex::lower_unicode(c) + }); } _ => { /* General case */ diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs index b2333330a46..0c548ded214 100644 --- a/crates/sre_engine/src/string.rs +++ b/crates/sre_engine/src/string.rs @@ -331,62 +331,3 @@ const fn utf8_is_cont_byte(byte: u8) -> bool { /// Mask of the value bits of a continuation byte. const CONT_MASK: u8 = 0b0011_1111; - -#[inline] -pub(crate) fn is_word(ch: u32) -> bool { - rustpython_unicode::regex::is_word(ch) -} -#[inline] -pub(crate) fn is_space(ch: u32) -> bool { - rustpython_unicode::regex::is_space(ch) -} -#[inline] -pub(crate) fn is_digit(ch: u32) -> bool { - rustpython_unicode::regex::is_digit(ch) -} -#[inline] -pub(crate) fn is_loc_word(ch: u32) -> bool { - rustpython_unicode::regex::is_locale_word(ch) -} -#[inline] -pub(crate) const fn is_linebreak(ch: u32) -> bool { - rustpython_unicode::regex::is_linebreak(ch) -} -#[inline] -pub fn lower_ascii(ch: u32) -> u32 { - rustpython_unicode::regex::lower_ascii(ch) -} -#[inline] -pub(crate) fn lower_locate(ch: u32) -> u32 { - // FIXME: Ignore the locales - rustpython_unicode::regex::lower_locale(ch) -} -#[inline] -pub(crate) fn upper_locate(ch: u32) -> u32 { - // FIXME: Ignore the locales - rustpython_unicode::regex::upper_locale(ch) -} -#[inline] -pub(crate) fn is_uni_digit(ch: u32) -> bool { - rustpython_unicode::regex::is_unicode_digit(ch) -} -#[inline] -pub(crate) fn is_uni_space(ch: u32) -> bool { - rustpython_unicode::regex::is_unicode_space(ch) -} -#[inline] -pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { - rustpython_unicode::regex::is_unicode_linebreak(ch) -} -#[inline] -pub(crate) fn is_uni_word(ch: u32) -> bool { - rustpython_unicode::regex::is_unicode_word(ch) -} -#[inline] -pub fn lower_unicode(ch: u32) -> u32 { - rustpython_unicode::regex::lower_unicode(ch) -} -#[inline] -pub fn upper_unicode(ch: u32) -> u32 { - rustpython_unicode::regex::upper_unicode(ch) -} diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index d1c52d9e40b..4f097ff03d4 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -75,7 +75,6 @@ strum_macros = { workspace = true } thiserror = { workspace = true } memchr = { workspace = true } -caseless = "0.2.2" flamer = { version = "0.5", optional = true } half = "2" psm = "0.1" diff --git a/crates/vm/src/stdlib/_sre.rs b/crates/vm/src/stdlib/_sre.rs index ba7044fb5a9..19d114b50a3 100644 --- a/crates/vm/src/stdlib/_sre.rs +++ b/crates/vm/src/stdlib/_sre.rs @@ -21,10 +21,8 @@ mod _sre { use crossbeam_utils::atomic::AtomicCell; use itertools::Itertools; use num_traits::ToPrimitive; - use rustpython_sre_engine::{ - Request, SearchIter, SreFlag, State, StrDrive, - string::{lower_ascii, lower_unicode, upper_unicode}, - }; + use rustpython_sre_engine::{Request, SearchIter, SreFlag, State, StrDrive}; + use rustpython_unicode::regex as unicode_regex; #[pyattr] pub use rustpython_sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC}; @@ -42,17 +40,17 @@ mod _sre { #[pyfunction] fn unicode_iscased(ch: i32) -> bool { let ch = ch as u32; - ch != lower_unicode(ch) || ch != upper_unicode(ch) + ch != unicode_regex::lower_unicode(ch) || ch != unicode_regex::upper_unicode(ch) } #[pyfunction] fn ascii_tolower(ch: i32) -> i32 { - lower_ascii(ch as u32) as i32 + unicode_regex::lower_ascii(ch as u32) as i32 } #[pyfunction] fn unicode_tolower(ch: i32) -> i32 { - lower_unicode(ch as u32) as i32 + unicode_regex::lower_unicode(ch as u32) as i32 } trait SreStr: StrDrive {