Finish unicode crate follow-up refactors

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4 Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
RustPython · Copilot · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026
commit 0a340de9c30e00c6794464104397ef021244aeab
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/codegen/Cargo.toml b/crates/codegen/Cargo.toml
@@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"]
 
 [dependencies]
 rustpython-compiler-core = { workspace = true }
+rustpython-unicode = { workspace = true, default-features = false }
 rustpython-literal = {workspace = true }
 rustpython-wtf8 = { workspace = true }
 ruff_python_ast = { workspace = true }
@@ -29,7 +30,6 @@ num-traits = { workspace = true }
 thiserror = { workspace = true }
 malachite-bigint = { workspace = true }
 memchr = { workspace = true }
-unicode_names2 = { workspace = true }
 
 [dev-dependencies]
 ruff_python_parser = { workspace = true }

diff --git a/crates/codegen/src/string_parser.rs b/crates/codegen/src/string_parser.rs
@@ -113,7 +113,9 @@ impl StringParser {
         let name_and_ending = self.skip_bytes(close_idx + 1);
         let name = &name_and_ending[..name_and_ending.len() - 1];
 
-        unicode_names2::character(name).ok_or_else(|| unreachable!())
+        rustpython_unicode::data::lookup(name)
+            .and_then(char::from_u32)
+            .ok_or_else(|| unreachable!())
     }
 
     /// Parse an escaped character, returning the new character.

diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml
@@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"]
 
 [dependencies]
 rustpython-literal = { workspace = true }
+rustpython-unicode = { workspace = true, default-features = false }
 rustpython-wtf8 = { workspace = true }
 
 ascii = { workspace = true }
@@ -29,7 +30,6 @@ malachite-q = { workspace = true }
 malachite-base = { workspace = true }
 num-traits = { workspace = true }
 parking_lot = { workspace = true, optional = true }
-unicode_names2 = { workspace = true }
 radium = { workspace = true }
 
 lock_api = "0.4"

diff --git a/crates/common/src/encodings.rs b/crates/common/src/encodings.rs
@@ -414,7 +414,7 @@ pub mod errors {
             let mut out = String::with_capacity(num_chars * 4);
             for c in err_str.code_points() {
                 let c_u32 = c.to_u32();
-                if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
+                if let Some(c_name) = rustpython_unicode::data::name(c_u32) {
                     write!(out, "\\N{{{c_name}}}").unwrap();
                 } else if c_u32 >= 0x10000 {
                     write!(out, "\\U{c_u32:08x}").unwrap();

diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml
@@ -28,7 +28,7 @@ flame-it = ["flame"]
 [dependencies]
 # rustpython crates
 rustpython-derive = { workspace = true }
-rustpython-unicode = { workspace = true, features = ["std", "casefold"] }
+rustpython-unicode = { workspace = true, features = ["casefold"] }
 rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
 rustpython-common = { workspace = true }
 

diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs
@@ -6,43 +6,6 @@
 
 pub(crate) use unicodedata::module_def;
 
-use crate::vm::{
-    PyObject, PyResult, VirtualMachine, builtins::PyStr, convert::TryFromBorrowedObject,
-};
-
-enum NormalizeForm {
-    Nfc,
-    Nfkc,
-    Nfd,
-    Nfkd,
-}
-
-impl From<NormalizeForm> for rustpython_unicode::NormalizeForm {
-    fn from(value: NormalizeForm) -> Self {
-        match value {
-            NormalizeForm::Nfc => Self::Nfc,
-            NormalizeForm::Nfkc => Self::Nfkc,
-            NormalizeForm::Nfd => Self::Nfd,
-            NormalizeForm::Nfkd => Self::Nfkd,
-        }
-    }
-}
-
-impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
-    fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
-        obj.try_value_with(
-            |form: &PyStr| match form.as_bytes() {
-                b"NFC" => Ok(Self::Nfc),
-                b"NFKC" => Ok(Self::Nfkc),
-                b"NFD" => Ok(Self::Nfd),
-                b"NFKD" => Ok(Self::Nfkd),
-                _ => Err(vm.new_value_error("invalid normalization form")),
-            },
-            vm,
-        )
-    }
-}
-
 #[pymodule]
 mod unicodedata {
     use crate::vm::{
@@ -53,13 +16,20 @@ mod unicodedata {
 
     use itertools::Itertools;
     use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
-    use rustpython_unicode::{UNICODE_VERSION, UnicodeVersion, data, normalize};
+    use rustpython_unicode::{NormalizeForm, UNICODE_VERSION, UnicodeVersion, data};
+
+    fn parse_normalize_form(form: PyStrRef, vm: &VirtualMachine) -> PyResult<NormalizeForm> {
+        form.to_str()
+            .ok_or_else(|| vm.new_value_error("invalid normalization form"))?
+            .parse()
+            .map_err(|()| vm.new_value_error("invalid normalization form"))
+    }
 
     pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
         __module_exec(vm, module);
 
         // Add UCD methods as module-level functions
-        let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into();
+        let ucd: PyObjectRef = PyUcd::new(data::Ucd::default()).into_ref(&vm.ctx).into();
 
         for attr in [
             "category",
@@ -85,49 +55,36 @@ mod unicodedata {
     #[pyattr]
     #[pyclass(name = "UCD")]
     #[derive(Debug, PyPayload)]
-    pub(super) struct Ucd {
-        unic_version: UnicodeVersion,
-    }
-
-    impl Ucd {
-        pub const fn new(unic_version: UnicodeVersion) -> Self {
-            Self { unic_version }
-        }
+    pub(super) struct PyUcd(data::Ucd);
 
-        fn check_age(&self, c: CodePoint) -> bool {
-            data::is_assigned_in_version(c.to_u32(), self.unic_version)
+    impl PyUcd {
+        pub const fn new(ucd: data::Ucd) -> Self {
+            Self(ucd)
         }
 
-        fn extract_char(
-            &self,
-            character: PyStrRef,
-            vm: &VirtualMachine,
-        ) -> PyResult<Option<CodePoint>> {
-            let c = character
+        fn extract_char(character: PyStrRef, vm: &VirtualMachine) -> PyResult<CodePoint> {
+            character
                 .as_wtf8()
                 .code_points()
                 .exactly_one()
-                .map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))?;
-
-            Ok(self.check_age(c).then_some(c))
+                .map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))
         }
     }
 
     #[pyclass(flags(DISALLOW_INSTANTIATION))]
-    impl Ucd {
+    impl PyUcd {
         #[pymethod]
         fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or("Cn", |c| data::category(c.to_u32()))
+                .0
+                .category(Self::extract_char(character, vm)?.to_u32())
                 .to_owned())
         }
 
         #[pymethod]
         fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             if let Some(name_str) = name.to_str()
-                && let Some(character) = data::lookup(name_str)
-                && self.check_age(CodePoint::from_u32(character).expect("valid Unicode code point"))
+                && let Some(character) = self.0.lookup(name_str)
             {
                 return Ok(char::from_u32(character)
                     .expect("unicode_names2 only returns Unicode scalar values")
@@ -147,12 +104,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            let c = self.extract_char(character, vm)?;
-
-            if let Some(c) = c
-                && self.check_age(c)
-                && let Some(name) = data::name(c.to_u32())
-            {
+            if let Some(name) = self.0.name(Self::extract_char(character, vm)?.to_u32()) {
                 return Ok(vm.ctx.new_str(name).into());
             }
             default.ok_or_else(|| vm.new_value_error("no such name"))
@@ -165,8 +117,8 @@ mod unicodedata {
             vm: &VirtualMachine,
         ) -> PyResult<&'static str> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or("", |c| data::bidirectional(c.to_u32())))
+                .0
+                .bidirectional(Self::extract_char(character, vm)?.to_u32()))
         }
 
         /// NOTE: This function uses 9.0.0 database instead of 3.2.0
@@ -177,39 +129,51 @@ mod unicodedata {
             vm: &VirtualMachine,
         ) -> PyResult<&'static str> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or("N", |c| data::east_asian_width(c.to_u32())))
+                .0
+                .east_asian_width(Self::extract_char(character, vm)?.to_u32()))
         }
 
         #[pymethod]
-        fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
-            Ok(normalize::normalize(form.into(), unistr.as_wtf8()))
+        fn normalize(
+            &self,
+            form: PyStrRef,
+            unistr: PyStrRef,
+            vm: &VirtualMachine,
+        ) -> PyResult<Wtf8Buf> {
+            Ok(self
+                .0
+                .normalize(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
         }
 
         #[pymethod]
-        fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
-            Ok(normalize::is_normalized(form.into(), unistr.as_wtf8()))
+        fn is_normalized(
+            &self,
+            form: PyStrRef,
+            unistr: PyStrRef,
+            vm: &VirtualMachine,
+        ) -> PyResult<bool> {
+            Ok(self
+                .0
+                .is_normalized(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
         }
 
         #[pymethod]
         fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
-            Ok(self
-                .extract_char(character, vm)?
-                .is_some_and(|c| data::mirrored(c.to_u32())) as i32)
+            Ok(self.0.mirrored(Self::extract_char(character, vm)?.to_u32()) as i32)
         }
 
         #[pymethod]
         fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or(0, |c| data::combining(c.to_u32())))
+                .0
+                .combining(Self::extract_char(character, vm)?.to_u32()))
         }
 
         #[pymethod]
         fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or_else(String::new, |c| data::decomposition(c.to_u32())))
+                .0
+                .decomposition(Self::extract_char(character, vm)?.to_u32()))
         }
 
         #[pymethod]
@@ -219,10 +183,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            if let Some(value) = self
-                .extract_char(character, vm)?
-                .and_then(|c| data::digit(c.to_u32()))
-            {
+            if let Some(value) = self.0.digit(Self::extract_char(character, vm)?.to_u32()) {
                 return Ok(vm.ctx.new_int(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a digit"))
@@ -235,10 +196,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            if let Some(value) = self
-                .extract_char(character, vm)?
-                .and_then(|c| data::decimal(c.to_u32()))
-            {
+            if let Some(value) = self.0.decimal(Self::extract_char(character, vm)?.to_u32()) {
                 return Ok(vm.ctx.new_int(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a decimal"))
@@ -251,10 +209,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            if let Some(value) = self
-                .extract_char(character, vm)?
-                .and_then(|c| data::numeric(c.to_u32()))
-            {
+            if let Some(value) = self.0.numeric(Self::extract_char(character, vm)?.to_u32()) {
                 let value = match value {
                     data::NumericValue::Integer(n) => n as f64,
                     data::NumericValue::Rational(num, den) => num as f64 / den as f64,
@@ -266,19 +221,17 @@ mod unicodedata {
 
         #[pygetset]
         fn unidata_version(&self) -> String {
-            self.unic_version.to_string()
+            self.0.unicode_version().to_string()
         }
     }
 
     #[pyattr]
-    fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
-        Ucd {
-            unic_version: UnicodeVersion {
-                major: 3,
-                minor: 2,
-                micro: 0,
-            },
-        }
+    fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<PyUcd> {
+        PyUcd::new(data::Ucd::new(UnicodeVersion {
+            major: 3,
+            minor: 2,
+            micro: 0,
+        }))
         .into_ref(&vm.ctx)
     }
 

diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml
@@ -9,9 +9,8 @@ repository.workspace = true
 license.workspace = true
 
 [features]
-default = ["std", "casefold"]
-std = []
-casefold = ["std", "dep:caseless"]
+default = ["casefold"]
+casefold = ["dep:caseless"]
 
 [dependencies]
 rustpython-wtf8 = { workspace = true }