-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Replace unmaintained unic crates
#7555
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6abe80a
36c3886
23335bd
de8cd0a
15b4a82
4789fc0
b0583ab
47dadaf
273ad65
5a776f5
0cc47eb
3245cd1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| use unic_ucd_category::GeneralCategory; | ||
| use icu_properties::props::{EnumeratedProperty, GeneralCategory}; | ||
|
|
||
| /// According to Python, the following categories aren't printable: | ||
| /// * Cc (Other, Control) | ||
|
|
@@ -10,6 +10,17 @@ use unic_ucd_category::GeneralCategory; | |
| /// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) | ||
| /// * Zs (Separator, Space) other than ASCII space('\x20'). | ||
| pub fn is_printable(c: char) -> bool { | ||
| let cat = GeneralCategory::of(c); | ||
| !(cat.is_other() || cat.is_separator()) | ||
| let cat = GeneralCategory::for_char(c); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel like we need a script that compares the results across the entire Unicode character set, to ensure the outputs are actually the same. |
||
|
|
||
| !matches!( | ||
| cat, | ||
| GeneralCategory::SpaceSeparator | ||
| | GeneralCategory::LineSeparator | ||
| | GeneralCategory::ParagraphSeparator | ||
| | GeneralCategory::Control | ||
| | GeneralCategory::Format | ||
| | GeneralCategory::Surrogate | ||
| | GeneralCategory::PrivateUse | ||
| | GeneralCategory::Unassigned | ||
| ) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,15 +40,19 @@ mod unicodedata { | |
| builtins::{PyModule, PyStrRef}, | ||
| function::OptionalArg, | ||
| }; | ||
|
|
||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
| use icu_properties::{ | ||
| CodePointSetData, | ||
| props::{ | ||
| BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty, | ||
| GeneralCategory, NamedEnumeratedProperty, | ||
| }, | ||
| }; | ||
| use itertools::Itertools; | ||
| use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; | ||
| use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType}; | ||
| use unic_char_property::EnumeratedCharProperty; | ||
| use unic_normal::StrNormalForm; | ||
| use ucd::{Codepoint, DecompositionType, Number, NumericType}; | ||
| use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; | ||
|
Comment on lines
+44
to
55
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: wc -l crates/stdlib/src/unicodedata.rsRepository: RustPython/RustPython Length of output: 101 🏁 Script executed: cat -n crates/stdlib/src/unicodedata.rs | head -250Repository: RustPython/RustPython Length of output: 10161 🏁 Script executed: # Search for unidata_version and how it's constructed
rg "unidata_version" crates/stdlib/src/unicodedata.rs -B 2 -A 2Repository: RustPython/RustPython Length of output: 300 🏁 Script executed: # Search for check_age and Age usage
rg "check_age|Age::|UNICODE_VERSION" crates/stdlib/src/unicodedata.rs -B 2 -A 2Repository: RustPython/RustPython Length of output: 1397 🏁 Script executed: fd -type f Cargo.toml | grep -E "(stdlib|Cargo)" | head -5Repository: RustPython/RustPython Length of output: 235 🏁 Script executed: # Find and examine the Cargo.toml for the stdlib crate
cat Cargo.toml.in 2>/dev/null || find . -name "Cargo.toml" -path "*/stdlib/*" | head -1 | xargs catRepository: RustPython/RustPython Length of output: 5918 🏁 Script executed: # Search for icu and unic_ucd_age dependency versions
rg "icu_|unic_ucd_age" Cargo.toml --type toml -A 1Repository: RustPython/RustPython Length of output: 131 🏁 Script executed: # Check the root Cargo.toml for workspace dependencies
head -150 Cargo.toml | grep -A 30 "^\[workspace.dependencies\]"Repository: RustPython/RustPython Length of output: 575 🏁 Script executed: # Search for the unic_ucd_age version definition
grep -A 5 "unic-ucd-age" Cargo.tomlRepository: RustPython/RustPython Length of output: 182 🏁 Script executed: # Get version info from Cargo.lock if available
ls -la Cargo.lock 2>/dev/null && grep -A 5 "unic-ucd-age\|icu_properties\|icu_normalizer" Cargo.lock | head -40Repository: RustPython/RustPython Length of output: 1168 🏁 Script executed: # Search for UNICODE_VERSION constant value or documentation
rg "UNICODE_VERSION" --type rust -A 3 -B 3 | head -50Repository: RustPython/RustPython Length of output: 1335 🏁 Script executed: # Check if there's any documentation or comments about Unicode versions in the code
rg "Unicode|unicode" crates/stdlib/src/unicodedata.rs | head -20Repository: RustPython/RustPython Length of output: 792 🏁 Script executed: # Look at the comment on line 179 about version mismatch
sed -n '175,185p' crates/stdlib/src/unicodedata.rsRepository: RustPython/RustPython Length of output: 352 🏁 Script executed: # Get more context about the ICU data versions used
rg "2.1" --type rust -B 2 -A 2 | grep -E "(icu_|version|9.0|3.2)" | head -20Repository: RustPython/RustPython Length of output: 2575 🏁 Script executed: # Check what Unicode version the ICU libraries support
web_search ICU 2.1.2 unicode version supportRepository: RustPython/RustPython Length of output: 115 🌐 Web query:
💡 Result: ICU 2.1.2 supports Unicode Standard version 2.1.2. Citations:
🌐 Web query:
💡 Result: ICU Unicode data versions 15 and 16 are supported in ICU4C releases: ICU 72/74 for Unicode 15/15.1, ICU 76 for Unicode 16. ICU 78 uses Unicode 17. In ICU4X (Rust), icu_normalizer and icu_properties crates at version 2.1 (released 2025-10-28) use data generated from ICU version release-78.1rc / icu4x/2025-10-28 tag and CLDR 48, corresponding to Unicode 17 (not 15 or 16). Earlier ICU4X versions aligned with Unicode 15 (ICU 74, 2023) and 16 (ICU 76, 2024). icu_normalizer depends on icu_properties for decomposition/composition properties. No specific "icu_properties 2.1" data version beyond the shared ICU4X data pipeline. Citations:
🏁 Script executed: # Check the actual Unicode version that unic_ucd_age provides
rg "unic.ucd.age" Cargo.lock -A 10 | head -40Repository: RustPython/RustPython Length of output: 493 🌐 Web query:
💡 Result: The unic-ucd-age 0.9.0 crate, part of the UNIC (Unicode and Internationalization Crates for Rust) project, provides access to the Unicode character Age property from the Unicode Character Database (UCD). It supports Unicode 10.0.0 data, as indicated in the project's documentation for the unic super-crate version 0.9.0, which explicitly states it includes Unicode 10.0.0 data and algorithms. The crate exposes a UNICODE_VERSION constant representing the Unicode version of its data and includes the Age struct and CharAge trait for querying when characters were first assigned in Unicode versions. Released on 2019-03-03, it depends on unic-ucd-version ^0.9.0 and is marked as unmaintained. Example usage: Age::of('A') returns UnicodeVersion { major: 1, minor: 1, micro: 0 }, matching Unicode 1.1 assignment. Citations:
🏁 Script executed: # Check what unicode version unic_ucd_age uses
rg "unic-ucd-version" Cargo.lock -A 5Repository: RustPython/RustPython Length of output: 320 Resolve the mixed Unicode database versions.
🤖 Prompt for AI Agents
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is the recommended replacement crate for `unic_ucd_age`?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The The use icu_properties::{CodePointMapData, props::Age};
// Replaces: Age::of(c)
let age = CodePointMapData::<Age>::new().get(c);For The migration would look roughly like:
Would you like me to draft the full diff for this migration? 🧠 Learnings used
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain
So the options for resolving the mixed-version concern are:
Given the scope of this PR (replacing unmaintained 🌐 Web query: 💡 Result: In ICU4X What is supported is the general pattern: enumerated properties (like Sources: Citations:
🌐 Web query: 💡 Result:
However, the Unicode If you specifically need
Citations:
So
Given the scope of this PR, keeping ✏️ Learnings added
🧠 Learnings used |
||
| use unic_ucd_bidi::BidiClass; | ||
| use unic_ucd_category::GeneralCategory; | ||
| use unicode_bidi_mirroring::is_mirroring; | ||
|
|
||
| pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> { | ||
| __module_exec(vm, module); | ||
|
|
@@ -117,9 +121,9 @@ mod unicodedata { | |
| .extract_char(character, vm)? | ||
| .map_or(GeneralCategory::Unassigned, |c| { | ||
| c.to_char() | ||
| .map_or(GeneralCategory::Surrogate, GeneralCategory::of) | ||
| .map_or(GeneralCategory::Surrogate, GeneralCategory::for_char) | ||
| }) | ||
| .abbr_name() | ||
| .short_name() | ||
| .to_owned()) | ||
| } | ||
|
|
||
|
|
@@ -165,8 +169,8 @@ mod unicodedata { | |
| let bidi = match self.extract_char(character, vm)? { | ||
| Some(c) => c | ||
| .to_char() | ||
| .map_or(BidiClass::LeftToRight, BidiClass::of) | ||
| .abbr_name(), | ||
| .map_or(BidiClass::LeftToRight, BidiClass::for_char) | ||
| .short_name(), | ||
| None => "", | ||
| }; | ||
| Ok(bidi) | ||
|
|
@@ -182,18 +186,34 @@ mod unicodedata { | |
| Ok(self | ||
| .extract_char(character, vm)? | ||
| .and_then(|c| c.to_char()) | ||
| .map_or(EastAsianWidth::Neutral, |c| c.east_asian_width()) | ||
| .abbr_name()) | ||
| .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char) | ||
| .short_name()) | ||
| } | ||
|
|
||
| #[pymethod] | ||
| fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> { | ||
| let text = unistr.as_wtf8(); | ||
| let normalized_text = match form { | ||
| Nfc => text.map_utf8(|s| s.nfc()).collect(), | ||
| Nfkc => text.map_utf8(|s| s.nfkc()).collect(), | ||
| Nfd => text.map_utf8(|s| s.nfd()).collect(), | ||
| Nfkd => text.map_utf8(|s| s.nfkd()).collect(), | ||
| Nfc => { | ||
| let normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| Nfkc => { | ||
| let normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| Nfd => { | ||
| let normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| Nfkd => { | ||
| let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| }; | ||
| Ok(normalized_text) | ||
| } | ||
|
|
@@ -202,10 +222,26 @@ mod unicodedata { | |
| fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> { | ||
| let text = unistr.as_wtf8(); | ||
| let normalized: Wtf8Buf = match form { | ||
| Nfc => text.map_utf8(|s| s.nfc()).collect(), | ||
| Nfkc => text.map_utf8(|s| s.nfkc()).collect(), | ||
| Nfd => text.map_utf8(|s| s.nfd()).collect(), | ||
| Nfkd => text.map_utf8(|s| s.nfkd()).collect(), | ||
| Nfc => { | ||
| let normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| Nfkc => { | ||
| let normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| Nfd => { | ||
| let normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| Nfkd => { | ||
| let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| text.map_utf8(|s| normalizer.normalize_iter(s.chars())) | ||
| .collect() | ||
| } | ||
| }; | ||
| Ok(text == &*normalized) | ||
| } | ||
|
|
@@ -216,7 +252,8 @@ mod unicodedata { | |
| Some(c) => { | ||
| if let Some(ch) = c.to_char() { | ||
| // Check if the character is mirrored in bidirectional text using Unicode standard | ||
| Ok(if is_mirroring(ch) { 1 } else { 0 }) | ||
| let bidi_mirrored = CodePointSetData::new::<BidiMirrored>(); | ||
| Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 }) | ||
| } else { | ||
| Ok(0) | ||
| } | ||
|
|
@@ -226,11 +263,13 @@ mod unicodedata { | |
| } | ||
|
|
||
| #[pymethod] | ||
| fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> { | ||
| fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> { | ||
| Ok(self | ||
| .extract_char(character, vm)? | ||
| .and_then(|c| c.to_char()) | ||
| .map_or(0, |ch| ch.canonical_combining_class() as i32)) | ||
| .map_or(0, |ch| { | ||
| CanonicalCombiningClass::for_char(ch).to_icu4c_value() | ||
| })) | ||
| } | ||
|
|
||
| #[pymethod] | ||
|
|
@@ -339,23 +378,6 @@ mod unicodedata { | |
| } | ||
| } | ||
|
|
||
| trait EastAsianWidthAbbrName { | ||
| fn abbr_name(&self) -> &'static str; | ||
| } | ||
|
|
||
| impl EastAsianWidthAbbrName for EastAsianWidth { | ||
| fn abbr_name(&self) -> &'static str { | ||
| match self { | ||
| Self::Narrow => "Na", | ||
| Self::Wide => "W", | ||
| Self::Neutral => "N", | ||
| Self::Ambiguous => "A", | ||
| Self::FullWidth => "F", | ||
| Self::HalfWidth => "H", | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[pyattr] | ||
| fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> { | ||
| Ucd { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test was giving us a false sense that it works, because it used to report
`\u{0560}` as non-printable (but now it reports it as printable).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What should we do?