Skip to content

Commit 3d96884

Browse files
authored
Replace unmaintained unic crates (#7555)
1 parent b61dfdc commit 3d96884

File tree

10 files changed

+320
-154
lines changed

10 files changed

+320
-154
lines changed

Cargo.lock

Lines changed: 214 additions & 85 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -222,15 +222,11 @@ strum = "0.28"
222222
strum_macros = "0.28"
223223
syn = "2"
224224
thiserror = "2.0"
225+
icu_properties = "2"
226+
icu_normalizer = "2"
225227
unicode-casing = "0.1.1"
226-
unic-char-property = "0.9.0"
227-
unic-normal = "0.9.0"
228228
unic-ucd-age = "0.9.0"
229-
unic-ucd-bidi = "0.9.0"
230-
unic-ucd-category = "0.9.0"
231-
unic-ucd-ident = "0.9.0"
232229
unicode_names2 = "2.0.0"
233-
unicode-bidi-mirroring = "0.4"
234230
widestring = "1.2.0"
235231
windows-sys = "0.61.2"
236232
wasm-bindgen = "0.2.106"

Lib/test/test_str.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,7 @@ def test_isprintable(self):
854854
self.assertTrue('\U0001F46F'.isprintable())
855855
self.assertFalse('\U000E0020'.isprintable())
856856

857+
@unittest.expectedFailure # TODO: RUSTPYTHON
857858
@support.requires_resource('cpu')
858859
def test_isprintable_invariant(self):
859860
for codepoint in range(sys.maxunicode + 1):

Lib/test/test_unicodedata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,6 @@ def test_issue10254(self):
232232
b = 'C\u0338' * 20 + '\xC7'
233233
self.assertEqual(self.db.normalize('NFC', a), b)
234234

235-
@unittest.expectedFailure # TODO: RUSTPYTHON; ? +
236235
def test_issue29456(self):
237236
# Fix #29456
238237
u1176_str_a = '\u1100\u1176\u11a8'
@@ -389,6 +388,7 @@ def unistr(data):
389388
data = [int(x, 16) for x in data.split(" ")]
390389
return "".join([chr(x) for x in data])
391390

391+
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true : 13055
392392
@requires_resource('network')
393393
@requires_resource('cpu')
394394
def test_normalization(self):

crates/literal/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ hexf-parse = "0.2.1"
1515
is-macro.workspace = true
1616
lexical-parse-float = { version = "1.0.6", features = ["format"] }
1717
num-traits = { workspace = true }
18-
unic-ucd-category = { workspace = true }
18+
icu_properties = { workspace = true }
1919

2020
[dev-dependencies]
2121
rand = { workspace = true }

crates/literal/src/char.rs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use unic_ucd_category::GeneralCategory;
1+
use icu_properties::props::{EnumeratedProperty, GeneralCategory};
22

33
/// According to python following categories aren't printable:
44
/// * Cc (Other, Control)
@@ -10,6 +10,17 @@ use unic_ucd_category::GeneralCategory;
1010
/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
1111
/// * Zs (Separator, Space) other than ASCII space('\x20').
1212
pub fn is_printable(c: char) -> bool {
13-
let cat = GeneralCategory::of(c);
14-
!(cat.is_other() || cat.is_separator())
13+
let cat = GeneralCategory::for_char(c);
14+
15+
!matches!(
16+
cat,
17+
GeneralCategory::SpaceSeparator
18+
| GeneralCategory::LineSeparator
19+
| GeneralCategory::ParagraphSeparator
20+
| GeneralCategory::Control
21+
| GeneralCategory::Format
22+
| GeneralCategory::Surrogate
23+
| GeneralCategory::PrivateUse
24+
| GeneralCategory::Unassigned
25+
)
1526
}

crates/stdlib/Cargo.toml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,10 @@ constant_time_eq = { workspace = true }
7878
## unicode stuff
7979
unicode_names2 = { workspace = true }
8080
# update version all at the same time
81-
unic-char-property = { workspace = true }
82-
unic-normal = { workspace = true }
83-
unic-ucd-bidi = { workspace = true }
84-
unic-ucd-category = { workspace = true }
81+
icu_properties = { workspace = true }
82+
icu_normalizer = { workspace = true }
8583
unic-ucd-age = { workspace = true }
8684
ucd = "0.1.1"
87-
unicode-bidi-mirroring = { workspace = true }
8885

8986
# compression
9087
adler32 = "1.2.0"

crates/stdlib/src/unicodedata.rs

Lines changed: 62 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,19 @@ mod unicodedata {
4040
builtins::{PyModule, PyStrRef},
4141
function::OptionalArg,
4242
};
43+
44+
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
45+
use icu_properties::{
46+
CodePointSetData,
47+
props::{
48+
BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
49+
GeneralCategory, NamedEnumeratedProperty,
50+
},
51+
};
4352
use itertools::Itertools;
4453
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
45-
use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType};
46-
use unic_char_property::EnumeratedCharProperty;
47-
use unic_normal::StrNormalForm;
54+
use ucd::{Codepoint, DecompositionType, Number, NumericType};
4855
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
49-
use unic_ucd_bidi::BidiClass;
50-
use unic_ucd_category::GeneralCategory;
51-
use unicode_bidi_mirroring::is_mirroring;
5256

5357
pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
5458
__module_exec(vm, module);
@@ -117,9 +121,9 @@ mod unicodedata {
117121
.extract_char(character, vm)?
118122
.map_or(GeneralCategory::Unassigned, |c| {
119123
c.to_char()
120-
.map_or(GeneralCategory::Surrogate, GeneralCategory::of)
124+
.map_or(GeneralCategory::Surrogate, GeneralCategory::for_char)
121125
})
122-
.abbr_name()
126+
.short_name()
123127
.to_owned())
124128
}
125129

@@ -165,8 +169,8 @@ mod unicodedata {
165169
let bidi = match self.extract_char(character, vm)? {
166170
Some(c) => c
167171
.to_char()
168-
.map_or(BidiClass::LeftToRight, BidiClass::of)
169-
.abbr_name(),
172+
.map_or(BidiClass::LeftToRight, BidiClass::for_char)
173+
.short_name(),
170174
None => "",
171175
};
172176
Ok(bidi)
@@ -182,18 +186,34 @@ mod unicodedata {
182186
Ok(self
183187
.extract_char(character, vm)?
184188
.and_then(|c| c.to_char())
185-
.map_or(EastAsianWidth::Neutral, |c| c.east_asian_width())
186-
.abbr_name())
189+
.map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char)
190+
.short_name())
187191
}
188192

189193
#[pymethod]
190194
fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
191195
let text = unistr.as_wtf8();
192196
let normalized_text = match form {
193-
Nfc => text.map_utf8(|s| s.nfc()).collect(),
194-
Nfkc => text.map_utf8(|s| s.nfkc()).collect(),
195-
Nfd => text.map_utf8(|s| s.nfd()).collect(),
196-
Nfkd => text.map_utf8(|s| s.nfkd()).collect(),
197+
Nfc => {
198+
let normalizer = ComposingNormalizerBorrowed::new_nfc();
199+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
200+
.collect()
201+
}
202+
Nfkc => {
203+
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
204+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
205+
.collect()
206+
}
207+
Nfd => {
208+
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
209+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
210+
.collect()
211+
}
212+
Nfkd => {
213+
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
214+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
215+
.collect()
216+
}
197217
};
198218
Ok(normalized_text)
199219
}
@@ -202,10 +222,26 @@ mod unicodedata {
202222
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
203223
let text = unistr.as_wtf8();
204224
let normalized: Wtf8Buf = match form {
205-
Nfc => text.map_utf8(|s| s.nfc()).collect(),
206-
Nfkc => text.map_utf8(|s| s.nfkc()).collect(),
207-
Nfd => text.map_utf8(|s| s.nfd()).collect(),
208-
Nfkd => text.map_utf8(|s| s.nfkd()).collect(),
225+
Nfc => {
226+
let normalizer = ComposingNormalizerBorrowed::new_nfc();
227+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
228+
.collect()
229+
}
230+
Nfkc => {
231+
let normalizer = ComposingNormalizerBorrowed::new_nfkc();
232+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
233+
.collect()
234+
}
235+
Nfd => {
236+
let normalizer = DecomposingNormalizerBorrowed::new_nfd();
237+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
238+
.collect()
239+
}
240+
Nfkd => {
241+
let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
242+
text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
243+
.collect()
244+
}
209245
};
210246
Ok(text == &*normalized)
211247
}
@@ -216,7 +252,8 @@ mod unicodedata {
216252
Some(c) => {
217253
if let Some(ch) = c.to_char() {
218254
// Check if the character is mirrored in bidirectional text using Unicode standard
219-
Ok(if is_mirroring(ch) { 1 } else { 0 })
255+
let bidi_mirrored = CodePointSetData::new::<BidiMirrored>();
256+
Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 })
220257
} else {
221258
Ok(0)
222259
}
@@ -226,11 +263,13 @@ mod unicodedata {
226263
}
227264

228265
#[pymethod]
229-
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
266+
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
230267
Ok(self
231268
.extract_char(character, vm)?
232269
.and_then(|c| c.to_char())
233-
.map_or(0, |ch| ch.canonical_combining_class() as i32))
270+
.map_or(0, |ch| {
271+
CanonicalCombiningClass::for_char(ch).to_icu4c_value()
272+
}))
234273
}
235274

236275
#[pymethod]
@@ -339,23 +378,6 @@ mod unicodedata {
339378
}
340379
}
341380

342-
trait EastAsianWidthAbbrName {
343-
fn abbr_name(&self) -> &'static str;
344-
}
345-
346-
impl EastAsianWidthAbbrName for EastAsianWidth {
347-
fn abbr_name(&self) -> &'static str {
348-
match self {
349-
Self::Narrow => "Na",
350-
Self::Wide => "W",
351-
Self::Neutral => "N",
352-
Self::Ambiguous => "A",
353-
Self::FullWidth => "F",
354-
Self::HalfWidth => "H",
355-
}
356-
}
357-
}
358-
359381
#[pyattr]
360382
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
361383
Ucd {

crates/vm/Cargo.toml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,7 @@ timsort = "0.1.2"
8686
# TODO: use unic for this; needed for title case:
8787
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
8888
unicode-casing = { workspace = true }
89-
# update version all at the same time
90-
unic-ucd-bidi = { workspace = true }
91-
unic-ucd-category = { workspace = true }
92-
unic-ucd-ident = { workspace = true }
89+
icu_properties = { workspace = true }
9390

9491
[target.'cfg(unix)'.dependencies]
9592
rustix = { workspace = true }

crates/vm/src/builtins/str.rs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@ use rustpython_common::{
4343
str::DeduceStrKind,
4444
wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat},
4545
};
46-
use unic_ucd_bidi::BidiClass;
47-
use unic_ucd_category::GeneralCategory;
48-
use unic_ucd_ident::{is_xid_continue, is_xid_start};
46+
47+
use icu_properties::props::{
48+
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
49+
};
4950
use unicode_casing::CharExt;
5051

5152
impl<'a> TryFromBorrowedObject<'a> for String {
@@ -966,7 +967,9 @@ impl PyStr {
966967
#[pymethod]
967968
fn isdecimal(&self) -> bool {
968969
!self.data.is_empty()
969-
&& self.char_all(|c| GeneralCategory::of(c) == GeneralCategory::DecimalNumber)
970+
&& self.char_all(|c| {
971+
matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber)
972+
})
970973
}
971974

972975
fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
@@ -1091,11 +1094,17 @@ impl PyStr {
10911094

10921095
#[pymethod]
10931096
fn isspace(&self) -> bool {
1094-
use unic_ucd_bidi::bidi_class::abbr_names::*;
10951097
!self.data.is_empty()
10961098
&& self.char_all(|c| {
1097-
GeneralCategory::of(c) == GeneralCategory::SpaceSeparator
1098-
|| matches!(BidiClass::of(c), WS | B | S)
1099+
matches!(
1100+
GeneralCategory::for_char(c),
1101+
GeneralCategory::SpaceSeparator
1102+
) || matches!(
1103+
BidiClass::for_char(c),
1104+
BidiClass::WhiteSpace
1105+
| BidiClass::ParagraphSeparator
1106+
| BidiClass::SegmentSeparator
1107+
)
10991108
})
11001109
}
11011110

@@ -1355,9 +1364,13 @@ impl PyStr {
13551364
pub fn isidentifier(&self) -> bool {
13561365
let Some(s) = self.to_str() else { return false };
13571366
let mut chars = s.chars();
1358-
let is_identifier_start = chars.next().is_some_and(|c| c == '_' || is_xid_start(c));
1367+
1368+
let is_identifier_start = chars
1369+
.next()
1370+
.is_some_and(|c| c == '_' || XidStart::for_char(c));
1371+
13591372
// a string is not an identifier if it has whitespace or starts with a number
1360-
is_identifier_start && chars.all(is_xid_continue)
1373+
is_identifier_start && chars.all(XidContinue::for_char)
13611374
}
13621375

13631376
// https://docs.python.org/3/library/stdtypes.html#str.translate

0 commit comments

Comments
 (0)