Skip to content

Commit 5a42103

Browse files
unicodedata: Const, embedded version
1 parent fe2a7db commit 5a42103

2 files changed

Lines changed: 35 additions & 55 deletions

File tree

crates/stdlib/build.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,13 @@ fn main() {
656656
}
657657
}
658658

659+
println!(
660+
"cargo:rustc-env=RUST_UNICODE_VERSION={}.{}.{}", // no quotes: cargo passes the value verbatim to env!()
661+
char::UNICODE_VERSION.0,
662+
char::UNICODE_VERSION.1,
663+
char::UNICODE_VERSION.2
664+
);
665+
659666
println!("cargo:rerun-if-changed=unicode/ucd32");
660667
println!("cargo:rerun-if-changed=unicode/latest");
661668

crates/stdlib/src/unicodedata.rs

Lines changed: 28 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,7 @@
44

55
// spell-checker:ignore codep decomp DECOMP nfkc unistr unidata
66

7-
use core::{
8-
cmp::Ordering,
9-
fmt::{self, Display, Formatter},
10-
hint::cold_path,
11-
};
7+
use core::{cmp::Ordering, hint::cold_path};
128

139
pub(crate) use unicodedata::module_def;
1410

@@ -28,25 +24,6 @@ include!(concat!(
2824
"/generated/unicode_numeric_value.rs"
2925
));
3026

31-
#[derive(Clone, Copy, Debug, PartialEq)]
32-
struct UnicodeVersion {
33-
pub major: u8,
34-
pub minor: u8,
35-
pub micro: u8,
36-
}
37-
38-
impl Display for UnicodeVersion {
39-
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
40-
write!(f, "{}.{}.{}", self.major, self.minor, self.micro)
41-
}
42-
}
43-
44-
const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
45-
major: char::UNICODE_VERSION.0,
46-
minor: char::UNICODE_VERSION.1,
47-
micro: char::UNICODE_VERSION.2,
48-
};
49-
5027
#[derive(Clone, Copy)]
5128
#[repr(u8)]
5229
enum DecompositionType {
@@ -118,8 +95,8 @@ fn lookup_property<T: Copy>(table: &[(u32, u32, T)], ch: char) -> Option<T> {
11895
.map(|i| table[i].2)
11996
}
12097

121-
fn lookup_numeric_val(ch: char, version: UnicodeVersion) -> Option<f64> {
122-
if version.major > 3 {
98+
fn lookup_numeric_val(ch: char, modern: bool) -> Option<f64> {
99+
if modern {
123100
lookup_property(NUMERIC_VALUES, ch)
124101
} else {
125102
cold_path();
@@ -162,8 +139,8 @@ mod unicodedata {
162139

163140
use super::{
164141
BIDI_CLASS, BIDI_MIRRORED, COMBINING_CLASS, DECOMP_COMPAT, DECOMP_RANGE, DECOMP_UPDATES,
165-
EAST_ASIAN_WIDTH, GENERAL_CATEGORY, NUMERIC_TYPE_DIFF, NormalizeForm, UNICODE_VERSION,
166-
UnicodeVersion, lookup_numeric_val, lookup_property,
142+
EAST_ASIAN_WIDTH, GENERAL_CATEGORY, NUMERIC_TYPE_DIFF, NormalizeForm, lookup_numeric_val,
143+
lookup_property,
167144
};
168145
use crate::vm::{
169146
Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
@@ -186,7 +163,7 @@ mod unicodedata {
186163
__module_exec(vm, module);
187164

188165
// Add UCD methods as module-level functions
189-
let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into();
166+
let ucd: PyObjectRef = Ucd::new(true).into_ref(&vm.ctx).into();
190167

191168
for attr in [
192169
"category",
@@ -213,12 +190,12 @@ mod unicodedata {
213190
#[pyclass(name = "UCD")]
214191
#[derive(Debug, PyPayload)]
215192
pub(super) struct Ucd {
216-
unic_version: UnicodeVersion,
193+
modern: bool,
217194
}
218195

219196
impl Ucd {
220-
pub(super) const fn new(unic_version: UnicodeVersion) -> Self {
221-
Self { unic_version }
197+
pub(super) const fn new(modern: bool) -> Self {
198+
Self { modern }
222199
}
223200

224201
fn extract_char(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<CodePoint> {
@@ -238,7 +215,7 @@ mod unicodedata {
238215
let Some(c) = c.to_char() else {
239216
return GeneralCategory::Surrogate.short_name();
240217
};
241-
if self.unic_version.major > 3 {
218+
if self.modern {
242219
Some(GeneralCategory::for_char(c))
243220
} else {
244221
cold_path();
@@ -291,7 +268,7 @@ mod unicodedata {
291268
self.extract_char(character, vm).map(|c| {
292269
c.to_char()
293270
.and_then(|c| {
294-
if self.unic_version.major > 3 {
271+
if self.modern {
295272
Some(BidiClass::for_char(c))
296273
} else {
297274
cold_path();
@@ -312,7 +289,7 @@ mod unicodedata {
312289
self.extract_char(character, vm).map(|c| {
313290
c.to_char()
314291
.and_then(|c| {
315-
if self.unic_version.major > 3 {
292+
if self.modern {
316293
Some(EastAsianWidth::for_char(c))
317294
} else {
318295
cold_path();
@@ -392,7 +369,7 @@ mod unicodedata {
392369
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
393370
self.extract_char(character, vm).map(|c| {
394371
c.to_char().map_or(0, |c| {
395-
(if self.unic_version.major > 3 {
372+
(if self.modern {
396373
BidiMirrored::for_char(c)
397374
} else {
398375
cold_path();
@@ -418,7 +395,7 @@ mod unicodedata {
418395
self.extract_char(character, vm).map(|c| {
419396
c.to_char()
420397
.and_then(|c| {
421-
if self.unic_version.major > 3 {
398+
if self.modern {
422399
Some(CanonicalCombiningClass::for_char(c))
423400
} else {
424401
cold_path();
@@ -442,7 +419,7 @@ mod unicodedata {
442419
// For 3.2.0, we use the original decomp for compatibility while ignoring the update.
443420
//
444421
// Finally, we don't have to do anything for the latest UCD as it's already updated.
445-
if self.unic_version.major == 3
422+
if !self.modern // only the legacy 3.2.0 database restores the pre-update decomposition
446423
&& let Some((_, original)) = DECOMP_UPDATES
447424
.iter()
448425
.find(|&&(codep, _original)| codep == ch as u32)
@@ -485,7 +462,7 @@ mod unicodedata {
485462
fn numeric_type_matches(&self, ch: CodePoint, expected: &[NumericType]) -> Option<char> {
486463
let ch = ch.to_char()?;
487464

488-
let actual = if self.unic_version.major > 3 {
465+
let actual = if self.modern {
489466
NumericType::for_char(ch)
490467
} else {
491468
cold_path();
@@ -506,7 +483,7 @@ mod unicodedata {
506483
let expected = [NumericType::Decimal, NumericType::Digit];
507484
self.numeric_type_matches(ch, &expected)
508485
.and_then(|ch| {
509-
let value = lookup_numeric_val(ch, UNICODE_VERSION)?;
486+
let value = lookup_numeric_val(ch, self.modern)?; // honor this object's database, like the sibling numeric lookups
510487
(value.trunc() == value).then(|| vm.ctx.new_int(value as u64).into())
511488
})
512489
.or_else(|| default.present())
@@ -525,7 +502,7 @@ mod unicodedata {
525502
let expected = [NumericType::Decimal];
526503
self.numeric_type_matches(ch, &expected)
527504
.and_then(|ch| {
528-
let value = lookup_numeric_val(ch, self.unic_version)?;
505+
let value = lookup_numeric_val(ch, self.modern)?;
529506
(value.trunc() == value).then(|| vm.ctx.new_int(value as u64).into())
530507
})
531508
.or_else(|| default.present())
@@ -544,34 +521,30 @@ mod unicodedata {
544521
let expected = &NumericType::ALL_VALUES[1..];
545522
self.numeric_type_matches(ch, expected)
546523
.and_then(|ch| {
547-
lookup_numeric_val(ch, self.unic_version)
548-
.map(|value| vm.ctx.new_float(value).into())
524+
lookup_numeric_val(ch, self.modern).map(|value| vm.ctx.new_float(value).into())
549525
})
550526
.or_else(|| default.present())
551527
.map(Option::Some)
552528
.ok_or_else(|| vm.new_value_error("not a numeric character"))
553529
}
554530

555531
#[pygetset]
556-
fn unidata_version(&self) -> String {
557-
self.unic_version.to_string()
532+
const fn unidata_version(&self) -> &'static str {
533+
if self.modern {
534+
env!("RUST_UNICODE_VERSION")
535+
} else {
536+
"3.2.0"
537+
}
558538
}
559539
}
560540

561541
#[pyattr]
562542
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
563-
Ucd {
564-
unic_version: UnicodeVersion {
565-
major: 3,
566-
minor: 2,
567-
micro: 0,
568-
},
569-
}
570-
.into_ref(&vm.ctx)
543+
Ucd::new(false).into_ref(&vm.ctx)
571544
}
572545

573546
#[pyattr]
574-
fn unidata_version(_vm: &VirtualMachine) -> String {
575-
UNICODE_VERSION.to_string()
547+
const fn unidata_version(_vm: &VirtualMachine) -> &'static str {
548+
env!("RUST_UNICODE_VERSION")
576549
}
577550
}

0 commit comments

Comments
 (0)