Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 215 additions & 86 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,11 @@ strum = "0.28"
strum_macros = "0.28"
syn = "2"
thiserror = "2.0"
icu_properties = "2"
icu_normalizer = "2"
unicode-casing = "0.1.1"
unic-char-property = "0.9.0"
unic-normal = "0.9.0"
unic-ucd-age = "0.9.0"
unic-ucd-bidi = "0.9.0"
unic-ucd-category = "0.9.0"
unic-ucd-ident = "0.9.0"
unicode_names2 = "2.0.0"
unicode-bidi-mirroring = "0.4"
widestring = "1.2.0"
windows-sys = "0.61.2"
wasm-bindgen = "0.2.106"
Expand Down
1 change: 1 addition & 0 deletions Lib/test/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,7 @@ def test_isprintable(self):
self.assertTrue('\U0001F46F'.isprintable())
self.assertFalse('\U000E0020'.isprintable())

@unittest.expectedFailure # TODO: RUSTPYTHON
@support.requires_resource('cpu')
def test_isprintable_invariant(self):
Copy link
Copy Markdown
Contributor Author

@ShaharNaveh ShaharNaveh Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was giving us a false sense that it works, because it used to report \u{0560} as non-printable (but now it's saying it's printable).

use icu_properties::props::{EnumeratedProperty, GeneralCategory};
 
const CH: char = '\u{0560}';

// Minimal repro: print the general category of `CH` as reported by each crate.
fn main() {
    dbg!(unic_ucd_category::GeneralCategory::of(CH)); // Unassigned (wrong)
    dbg!(GeneralCategory::for_char(CH)); // LowercaseLetter (correct)
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what should we do?

for codepoint in range(sys.maxunicode + 1):
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,6 @@ def test_issue10254(self):
b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b)

@unittest.expectedFailure # TODO: RUSTPYTHON; ? +
def test_issue29456(self):
# Fix #29456
u1176_str_a = '\u1100\u1176\u11a8'
Expand Down Expand Up @@ -389,6 +388,7 @@ def unistr(data):
data = [int(x, 16) for x in data.split(" ")]
return "".join([chr(x) for x in data])

@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true : 13055
@requires_resource('network')
@requires_resource('cpu')
def test_normalization(self):
Expand Down
2 changes: 1 addition & 1 deletion crates/literal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ hexf-parse = "0.2.1"
is-macro.workspace = true
lexical-parse-float = { version = "1.0.6", features = ["format"] }
num-traits = { workspace = true }
unic-ucd-category = { workspace = true }
icu_properties = { workspace = true }

[dev-dependencies]
rand = { workspace = true }
17 changes: 14 additions & 3 deletions crates/literal/src/char.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use unic_ucd_category::GeneralCategory;
use icu_properties::props::{EnumeratedProperty, GeneralCategory};

/// According to Python, the following categories aren't printable:
/// * Cc (Other, Control)
Expand All @@ -10,6 +10,17 @@ use unic_ucd_category::GeneralCategory;
/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
/// * Zs (Separator, Space) other than ASCII space ('\x20').
pub fn is_printable(c: char) -> bool {
let cat = GeneralCategory::of(c);
!(cat.is_other() || cat.is_separator())
let cat = GeneralCategory::for_char(c);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like we need a script that compares results across the entire Unicode character set, to ensure the outputs are actually the same.


!matches!(
cat,
GeneralCategory::SpaceSeparator
| GeneralCategory::LineSeparator
| GeneralCategory::ParagraphSeparator
| GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
)
}
7 changes: 2 additions & 5 deletions crates/stdlib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,10 @@ constant_time_eq = { workspace = true }
## unicode stuff
unicode_names2 = { workspace = true }
# update version all at the same time
unic-char-property = { workspace = true }
unic-normal = { workspace = true }
unic-ucd-bidi = { workspace = true }
unic-ucd-category = { workspace = true }
icu_properties = { workspace = true }
icu_normalizer = { workspace = true }
unic-ucd-age = { workspace = true }
ucd = "0.1.1"
unicode-bidi-mirroring = { workspace = true }

# compression
adler32 = "1.2.0"
Expand Down
102 changes: 62 additions & 40 deletions crates/stdlib/src/unicodedata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,19 @@ mod unicodedata {
builtins::{PyModule, PyStrRef},
function::OptionalArg,
};

use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
use icu_properties::{
CodePointSetData,
props::{
BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
GeneralCategory, NamedEnumeratedProperty,
},
};
use itertools::Itertools;
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType};
use unic_char_property::EnumeratedCharProperty;
use unic_normal::StrNormalForm;
use ucd::{Codepoint, DecompositionType, Number, NumericType};
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
Comment on lines +44 to 55
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai bot Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

wc -l crates/stdlib/src/unicodedata.rs

Repository: RustPython/RustPython

Length of output: 101


🏁 Script executed:

cat -n crates/stdlib/src/unicodedata.rs | head -250

Repository: RustPython/RustPython

Length of output: 10161


🏁 Script executed:

# Search for unidata_version and how it's constructed
rg "unidata_version" crates/stdlib/src/unicodedata.rs -B 2 -A 2

Repository: RustPython/RustPython

Length of output: 300


🏁 Script executed:

# Search for check_age and Age usage
rg "check_age|Age::|UNICODE_VERSION" crates/stdlib/src/unicodedata.rs -B 2 -A 2

Repository: RustPython/RustPython

Length of output: 1397


🏁 Script executed:

fd -type f Cargo.toml | grep -E "(stdlib|Cargo)" | head -5

Repository: RustPython/RustPython

Length of output: 235


🏁 Script executed:

# Find and examine the Cargo.toml for the stdlib crate
cat Cargo.toml.in 2>/dev/null || find . -name "Cargo.toml" -path "*/stdlib/*" | head -1 | xargs cat

Repository: RustPython/RustPython

Length of output: 5918


🏁 Script executed:

# Search for icu and unic_ucd_age dependency versions
rg "icu_|unic_ucd_age" Cargo.toml --type toml -A 1

Repository: RustPython/RustPython

Length of output: 131


🏁 Script executed:

# Check the root Cargo.toml for workspace dependencies
head -150 Cargo.toml | grep -A 30 "^\[workspace.dependencies\]"

Repository: RustPython/RustPython

Length of output: 575


🏁 Script executed:

# Search for the unic_ucd_age version definition
grep -A 5 "unic-ucd-age" Cargo.toml

Repository: RustPython/RustPython

Length of output: 182


🏁 Script executed:

# Get version info from Cargo.lock if available
ls -la Cargo.lock 2>/dev/null && grep -A 5 "unic-ucd-age\|icu_properties\|icu_normalizer" Cargo.lock | head -40

Repository: RustPython/RustPython

Length of output: 1168


🏁 Script executed:

# Search for UNICODE_VERSION constant value or documentation
rg "UNICODE_VERSION" --type rust -A 3 -B 3 | head -50

Repository: RustPython/RustPython

Length of output: 1335


🏁 Script executed:

# Check if there's any documentation or comments about Unicode versions in the code
rg "Unicode|unicode" crates/stdlib/src/unicodedata.rs | head -20

Repository: RustPython/RustPython

Length of output: 792


🏁 Script executed:

# Look at the comment on line 179 about version mismatch
sed -n '175,185p' crates/stdlib/src/unicodedata.rs

Repository: RustPython/RustPython

Length of output: 352


🏁 Script executed:

# Get more context about the ICU data versions used
rg "2.1" --type rust -B 2 -A 2 | grep -E "(icu_|version|9.0|3.2)" | head -20

Repository: RustPython/RustPython

Length of output: 2575


🏁 Script executed:

# Check what Unicode version the ICU libraries support
web_search ICU 2.1.2 unicode version support

Repository: RustPython/RustPython

Length of output: 115


🌐 Web query:

ICU 2.1.2 unicode version support which unicode standard version

💡 Result:

ICU 2.1.2 supports Unicode Standard version 2.1.2.

Citations:


🌐 Web query:

ICU Unicode data version 15 16 icu_normalizer icu_properties 2.1

💡 Result:

ICU Unicode data versions 15 and 16 are supported in ICU4C releases: ICU 72/74 for Unicode 15/15.1, ICU 76 for Unicode 16. ICU 78 uses Unicode 17. In ICU4X (Rust), icu_normalizer and icu_properties crates at version 2.1 (released 2025-10-28) use data generated from ICU version release-78.1rc / icu4x/2025-10-28 tag and CLDR 48, corresponding to Unicode 17 (not 15 or 16). Earlier ICU4X versions aligned with Unicode 15 (ICU 74, 2023) and 16 (ICU 76, 2024). icu_normalizer depends on icu_properties for decomposition/composition properties. No specific "icu_properties 2.1" data version beyond the shared ICU4X data pipeline.

Citations:


🏁 Script executed:

# Check the actual Unicode version that unic_ucd_age provides
rg "unic.ucd.age" Cargo.lock -A 10 | head -40

Repository: RustPython/RustPython

Length of output: 493


🌐 Web query:

unic-ucd-age 0.9.0 unicode version support UNIC

💡 Result:

The unic-ucd-age 0.9.0 crate, part of the UNIC (Unicode and Internationalization Crates for Rust) project, provides access to the Unicode character Age property from the Unicode Character Database (UCD). It supports Unicode 10.0.0 data, as indicated in the project's documentation for the unic super-crate version 0.9.0, which explicitly states it includes Unicode 10.0.0 data and algorithms. The crate exposes a UNICODE_VERSION constant representing the Unicode version of its data and includes the Age struct and CharAge trait for querying when characters were first assigned in Unicode versions. Released on 2019-03-03, it depends on unic-ucd-version ^0.9.0 and is marked as unmaintained. Example usage: Age::of('A') returns UnicodeVersion { major: 1, minor: 1, micro: 0 }, matching Unicode 1.1 assignment.

Citations:


🏁 Script executed:

# Check what unicode version unic_ucd_age uses
rg "unic-ucd-version" Cargo.lock -A 5

Repository: RustPython/RustPython

Length of output: 320


Resolve the mixed Unicode database versions.

Age::of() and UNICODE_VERSION come from unic_ucd_age 0.9.0 (Unicode 10.0.0), but category(), bidirectional(), east_asian_width(), normalize(), is_normalized(), and mirrored() all use ICU properties and normalization (Unicode 17.0). This causes unidata_version to report 10.0.0 while queries run against Unicode 17.0 data, breaking Python's contract that unicodedata.unidata_version describes the actual Unicode database used by the module. Characters added after Unicode 10.0.0 will either be filtered out by check_age() or return incorrect properties, causing silent data corruption.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/stdlib/src/unicodedata.rs` around lines 44 - 55, The code mixes
Unicode data sources: Age::of()/UNICODE_VERSION (unic_ucd_age, Unicode 10) vs.
ICU-based APIs (icu_properties/icu_normalizer, Unicode 17), so update the module
to use one consistent source—preferably the ICU data already used by category(),
bidirectional(), east_asian_width(), normalize(), is_normalized(), and
mirrored(): remove references to unic_ucd_age::Age and UNICODE_VERSION, derive
unidata_version from the ICU provider/metadata (the same runtime data backing
icu_properties and icu_normalizer), and change check_age() to use ICU's age
information (or remove age-based filtering) so that unidata_version accurately
reflects the data used by those functions. Ensure all places that previously
consulted Age::of() now query the ICU data provider or use the ICU-provided
UnicodeVersion so behavior and reported unidata_version remain in sync.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the recommended replacement crate for unic-ucd-age?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShaharNaveh, the recommended replacement for unic-ucd-age is the icu_properties crate — which you're already pulling in as a workspace dependency in this PR.

The icu_properties crate provides definitions of Unicode Properties and APIs for retrieving property data in an appropriate data structure, and is published both as its own crate and as part of the icu meta-crate.

The Age property is available via icu_properties::props::Age (an enumerated property), accessible through the standard CodePointMapData API:

use icu_properties::{CodePointMapData, props::Age};

// Replaces: Age::of(c)
let age = CodePointMapData::<Age>::new().get(c);

For UNICODE_VERSION (the constant used to report unidata_version and drive check_age), you can derive it from icu_properties_data. icu_properties_data ships data generated with a specific ICU and Unicode version, so you'd hard-code or derive the version string to match the icu_properties/icu_normalizer data actually bundled. This is also the correct fix for the mixed-version issue flagged in my earlier review comment — once unic_ucd_age is removed, unidata_version will accurately reflect the Unicode version used by all the ICU-backed functions.

The migration would look roughly like:

  • Remove unic_ucd_age from crates/stdlib/Cargo.toml
  • Replace Age::of(c) in check_age() with CodePointMapData::<Age>::new().get(c) (returning an Age enum value you can compare against a target version)
  • Replace UNICODE_VERSION with the corresponding ICU Unicode version constant (e.g., hard-coded or from a const based on the compiled data version)

Would you like me to draft the full diff for this migration?


🧠 Learnings used
Learnt from: ShaharNaveh
Repo: RustPython/RustPython PR: 7181
File: crates/stdlib/src/csv.rs:20-20
Timestamp: 2026-02-17T07:54:36.200Z
Learning: In RustPython, ensure that modules using Python sequence-like types bring in extension methods by importing the trait (for example, 'use rustpython_vm::sliceable::SliceableSequenceOp;'). Even if the trait name isn't directly referenced in code, its methods (such as len()) become in-scope on types like PyStr. Add this import in modules that rely on those extension methods to avoid compilation errors.

Copy link
Copy Markdown
Contributor Author

@ShaharNaveh ShaharNaveh Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

icu_properties::props::Age doesn't exist

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

@ShaharNaveh, you're absolutely right — I apologize for the incorrect suggestion. After verifying the icu_properties::props module docs, Age is indeed not exposed as a property in icu_properties v2. The props module includes things like BidiClass, GeneralCategory, EastAsianWidth, CanonicalCombiningClass, XidStart/XidContinue, etc., but not Age.

So the options for resolving the mixed-version concern are:

  1. Keep unic_ucd_age as-is — it's the last remaining unic crate in use, but it serves a narrow and specific purpose (age-based filtering in check_age() and reporting UNICODE_VERSION). Since replacing it would require either a different crate or significant rework, this may be acceptable as a known limitation to address in a follow-up.

  2. Hard-code the Unicode version constant to match the ICU data version (e.g. Unicode 16 or 17 depending on which icu_properties/icu_normalizer data version is bundled), and remove the age-based filtering from check_age() entirely. This eliminates unic_ucd_age at the cost of dropping the age-based character filtering that UCD.ucd_3_2_0 relies on.

  3. Use a different crate that provides the Age property, such as the unicode-age or ucd-trie-based crates, but those may have their own maintenance concerns.

Given the scope of this PR (replacing unmaintained unic crates), option 1 (keeping unic_ucd_age for now) seems most pragmatic. The mixed-version issue I flagged is a real concern but may be pre-existing and better addressed separately. Would you like me to open a follow-up issue to track this?


🌐 Web query:

icu_properties 2 Age property Rust API docs CodePointMapData

💡 Result:

In ICU4X icu_properties v2.x (incl. icu_properties 2.1.2) there is currently no props::Age type, so you can’t do CodePointMapData::<props::Age> (or load an “Age” property map) via the Rust API. The props module lists the supported properties, and Age is not among them. [1]

What is supported is the general pattern: enumerated properties (like Script) are queried via CodePointMapData<T>, e.g. CodePointMapData::<Script>::new().get('木'). [2]

Sources:
[1] icu_properties::props module items (supported properties list) (docs.rs)
[2] icu::properties docs showing CodePointMapData usage for enumerated properties (docs.rs)

Citations:


🌐 Web query:

icu4x icu_properties Age unicode property Rust crate API

💡 Result:

icu4x’s Rust Unicode-properties API is the icu_properties crate (also re-exported as icu::properties). It exposes properties via marker types in icu_properties::props (used with CodePointSetData/CodePointMapData) and helper modules like sets/maps. (docs.rs)

However, the Unicode Age property (the Unicode version a code point was first assigned) is not currently implemented/exposed by icu_properties: it does not appear in the crate’s “all items” list of available properties. (docs.rs)

If you specifically need Age, you have to source it elsewhere, for example:

  • Parse Unicode’s DerivedAge data yourself (the Age property is defined as the version of Unicode in which the code point was assigned). (unicode.org)
  • Use ICU4C’s API (ICU exposes Age/u_charAge). (unicode-org.github.io)

Citations:


@ShaharNaveh, you're correct — I apologize for the bad suggestion. The Age property is not currently implemented or exposed by icu_properties; it does not appear in the icu_properties::props module's list of supported properties.

So unic_ucd_age cannot be replaced by icu_properties at this time. The practical options are:

  1. Keep unic_ucd_age as-is for this PR — it's the only remaining unic crate after your changes, serving the specific purpose of age-based filtering and version reporting. This is a reasonable stopping point for this PR.
  2. Hard-code the Unicode version to match the ICU data, and drop age-based filtering — but that's a larger behavioral change that goes beyond just swapping dependencies.

Given the scope of this PR, keeping unic_ucd_age and deferring full replacement seems most pragmatic. Would you like me to open a follow-up issue to track replacing unic_ucd_age once icu_properties gains Age support, or if a suitable alternative crate is found?


✏️ Learnings added
Learnt from: ShaharNaveh
Repo: RustPython/RustPython PR: 7555
File: crates/stdlib/src/unicodedata.rs:44-55
Timestamp: 2026-04-01T10:07:36.832Z
Learning: In icu_properties v2 (icu4x), the Unicode Age property is NOT implemented or exposed. `icu_properties::props::Age` does not exist and is not in the props module items list. There is no direct replacement for `unic_ucd_age::Age` in icu4x v2 crates.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


🧠 Learnings used
Learnt from: ShaharNaveh
Repo: RustPython/RustPython PR: 7181
File: crates/stdlib/src/csv.rs:20-20
Timestamp: 2026-02-17T07:54:36.200Z
Learning: In RustPython, ensure that modules using Python sequence-like types bring in extension methods by importing the trait (for example, 'use rustpython_vm::sliceable::SliceableSequenceOp;'). Even if the trait name isn't directly referenced in code, its methods (such as len()) become in-scope on types like PyStr. Add this import in modules that rely on those extension methods to avoid compilation errors.

use unic_ucd_bidi::BidiClass;
use unic_ucd_category::GeneralCategory;
use unicode_bidi_mirroring::is_mirroring;

pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
__module_exec(vm, module);
Expand Down Expand Up @@ -117,9 +121,9 @@ mod unicodedata {
.extract_char(character, vm)?
.map_or(GeneralCategory::Unassigned, |c| {
c.to_char()
.map_or(GeneralCategory::Surrogate, GeneralCategory::of)
.map_or(GeneralCategory::Surrogate, GeneralCategory::for_char)
})
.abbr_name()
.short_name()
.to_owned())
}

Expand Down Expand Up @@ -165,8 +169,8 @@ mod unicodedata {
let bidi = match self.extract_char(character, vm)? {
Some(c) => c
.to_char()
.map_or(BidiClass::LeftToRight, BidiClass::of)
.abbr_name(),
.map_or(BidiClass::LeftToRight, BidiClass::for_char)
.short_name(),
None => "",
};
Ok(bidi)
Expand All @@ -182,18 +186,34 @@ mod unicodedata {
Ok(self
.extract_char(character, vm)?
.and_then(|c| c.to_char())
.map_or(EastAsianWidth::Neutral, |c| c.east_asian_width())
.abbr_name())
.map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char)
.short_name())
}

#[pymethod]
fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
let text = unistr.as_wtf8();
let normalized_text = match form {
Nfc => text.map_utf8(|s| s.nfc()).collect(),
Nfkc => text.map_utf8(|s| s.nfkc()).collect(),
Nfd => text.map_utf8(|s| s.nfd()).collect(),
Nfkd => text.map_utf8(|s| s.nfkd()).collect(),
Nfc => {
let normalizer = ComposingNormalizerBorrowed::new_nfc();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
Nfkc => {
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
Nfd => {
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
Nfkd => {
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
};
Ok(normalized_text)
}
Expand All @@ -202,10 +222,26 @@ mod unicodedata {
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
let text = unistr.as_wtf8();
let normalized: Wtf8Buf = match form {
Nfc => text.map_utf8(|s| s.nfc()).collect(),
Nfkc => text.map_utf8(|s| s.nfkc()).collect(),
Nfd => text.map_utf8(|s| s.nfd()).collect(),
Nfkd => text.map_utf8(|s| s.nfkd()).collect(),
Nfc => {
let normalizer = ComposingNormalizerBorrowed::new_nfc();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
Nfkc => {
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
Nfd => {
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
Nfkd => {
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
.collect()
}
};
Ok(text == &*normalized)
}
Expand All @@ -216,7 +252,8 @@ mod unicodedata {
Some(c) => {
if let Some(ch) = c.to_char() {
// Check if the character is mirrored in bidirectional text using Unicode standard
Ok(if is_mirroring(ch) { 1 } else { 0 })
let bidi_mirrored = CodePointSetData::new::<BidiMirrored>();
Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 })
} else {
Ok(0)
}
Expand All @@ -226,11 +263,13 @@ mod unicodedata {
}

#[pymethod]
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
Ok(self
.extract_char(character, vm)?
.and_then(|c| c.to_char())
.map_or(0, |ch| ch.canonical_combining_class() as i32))
.map_or(0, |ch| {
CanonicalCombiningClass::for_char(ch).to_icu4c_value()
}))
}

#[pymethod]
Expand Down Expand Up @@ -339,23 +378,6 @@ mod unicodedata {
}
}

trait EastAsianWidthAbbrName {
fn abbr_name(&self) -> &'static str;
}

impl EastAsianWidthAbbrName for EastAsianWidth {
fn abbr_name(&self) -> &'static str {
match self {
Self::Narrow => "Na",
Self::Wide => "W",
Self::Neutral => "N",
Self::Ambiguous => "A",
Self::FullWidth => "F",
Self::HalfWidth => "H",
}
}
}

#[pyattr]
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
Ucd {
Expand Down
5 changes: 1 addition & 4 deletions crates/vm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,7 @@ timsort = "0.1.2"
# TODO: use unic for this; needed for title case:
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
unicode-casing = { workspace = true }
# update version all at the same time
unic-ucd-bidi = { workspace = true }
unic-ucd-category = { workspace = true }
unic-ucd-ident = { workspace = true }
icu_properties = { workspace = true }

[target.'cfg(unix)'.dependencies]
rustix = { workspace = true }
Expand Down
31 changes: 22 additions & 9 deletions crates/vm/src/builtins/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ use rustpython_common::{
str::DeduceStrKind,
wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat},
};
use unic_ucd_bidi::BidiClass;
use unic_ucd_category::GeneralCategory;
use unic_ucd_ident::{is_xid_continue, is_xid_start};

use icu_properties::props::{
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
};
use unicode_casing::CharExt;

impl<'a> TryFromBorrowedObject<'a> for String {
Expand Down Expand Up @@ -966,7 +967,9 @@ impl PyStr {
#[pymethod]
fn isdecimal(&self) -> bool {
!self.data.is_empty()
&& self.char_all(|c| GeneralCategory::of(c) == GeneralCategory::DecimalNumber)
&& self.char_all(|c| {
matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber)
})
}

fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
Expand Down Expand Up @@ -1091,11 +1094,17 @@ impl PyStr {

#[pymethod]
fn isspace(&self) -> bool {
use unic_ucd_bidi::bidi_class::abbr_names::*;
!self.data.is_empty()
&& self.char_all(|c| {
GeneralCategory::of(c) == GeneralCategory::SpaceSeparator
|| matches!(BidiClass::of(c), WS | B | S)
matches!(
GeneralCategory::for_char(c),
GeneralCategory::SpaceSeparator
) || matches!(
BidiClass::for_char(c),
BidiClass::WhiteSpace
| BidiClass::ParagraphSeparator
| BidiClass::SegmentSeparator
)
})
}

Expand Down Expand Up @@ -1355,9 +1364,13 @@ impl PyStr {
pub fn isidentifier(&self) -> bool {
let Some(s) = self.to_str() else { return false };
let mut chars = s.chars();
let is_identifier_start = chars.next().is_some_and(|c| c == '_' || is_xid_start(c));

let is_identifier_start = chars
.next()
.is_some_and(|c| c == '_' || XidStart::for_char(c));

// a string is not an identifier if it has whitespace or starts with a number
is_identifier_start && chars.all(is_xid_continue)
is_identifier_start && chars.all(XidContinue::for_char)
}

// https://docs.python.org/3/library/stdtypes.html#str.translate
Expand Down
Loading