From d3af1c54ec029d29cf8708f2bd5f64478a28f99e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 5 Apr 2026 07:11:29 +0000
Subject: [PATCH 1/7] Initial plan


From 67485b5b7781692c0deb8d4ca6a72a6b0d3befa7 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 5 Apr 2026 07:39:45 +0000
Subject: [PATCH 2/7] Extract shared unicode crate

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 Cargo.lock                                    |  26 ++-
 Cargo.toml                                    |   1 +
 crates/literal/Cargo.toml                     |   2 +-
 crates/literal/src/char.rs                    |  16 +-
 crates/sre_engine/Cargo.toml                  |   1 +
 crates/sre_engine/src/string.rs               |  93 ++------
 crates/stdlib/Cargo.toml                      |   8 +-
 crates/stdlib/src/unicodedata.rs              | 210 +++++-------------
 crates/unicode/Cargo.toml                     |  29 +++
 crates/unicode/src/case.rs                    | 111 +++++++++
 crates/unicode/src/classify.rs                |  65 ++++++
 crates/unicode/src/data.rs                    | 132 +++++++++++
 crates/unicode/src/identifier.rs              |  27 +++
 crates/unicode/src/lib.rs                     |  77 +++++++
 crates/unicode/src/normalize.rs               |  40 ++++
 crates/unicode/src/regex.rs                   |  85 +++++++
 crates/vm/Cargo.toml                          |   2 +-
 crates/vm/src/builtins/str.rs                 |  52 +----
 extra_tests/snippets/stdlib_unicode_shared.py |  20 ++
 19 files changed, 687 insertions(+), 310 deletions(-)
 create mode 100644 crates/unicode/Cargo.toml
 create mode 100644 crates/unicode/src/case.rs
 create mode 100644 crates/unicode/src/classify.rs
 create mode 100644 crates/unicode/src/data.rs
 create mode 100644 crates/unicode/src/identifier.rs
 create mode 100644 crates/unicode/src/lib.rs
 create mode 100644 crates/unicode/src/normalize.rs
 create mode 100644 crates/unicode/src/regex.rs
 create mode 100644 extra_tests/snippets/stdlib_unicode_shared.py

diff --git a/Cargo.lock b/Cargo.lock
index 5fd981fd135..553d18c7be3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3242,11 +3242,11 @@ name = "rustpython-literal"
 version = "0.5.0"
 dependencies = [
  "hexf-parse",
- "icu_properties",
  "is-macro",
  "lexical-parse-float",
  "num-traits",
  "rand 0.9.2",
+ "rustpython-unicode",
  "rustpython-wtf8",
 ]
 
@@ -3338,6 +3338,7 @@ dependencies = [
  "criterion",
  "num_enum",
  "optional",
+ "rustpython-unicode",
  "rustpython-wtf8",
 ]
 
@@ -3368,8 +3369,6 @@ dependencies = [
  "gethostname",
  "hex",
  "hmac",
- "icu_normalizer",
- "icu_properties",
  "indexmap",
  "itertools 0.14.0",
  "libc",
@@ -3411,6 +3410,7 @@ dependencies = [
  "rustpython-ruff_python_parser",
  "rustpython-ruff_source_file",
  "rustpython-ruff_text_size",
+ "rustpython-unicode",
  "rustpython-vm",
  "schannel",
  "sha-1",
@@ -3421,9 +3421,6 @@ dependencies = [
  "tcl-sys",
  "termios",
  "tk-sys",
- "ucd",
- "unic-ucd-age",
- "unicode_names2 2.0.0",
  "uuid",
  "webpki-roots",
  "widestring",
@@ -3433,6 +3430,21 @@ dependencies = [
  "xml",
 ]
 
+[[package]]
+name = "rustpython-unicode"
+version = "0.5.0"
+dependencies = [
+ "caseless",
+ "icu_normalizer",
+ "icu_properties",
+ "itertools 0.14.0",
+ "rustpython-wtf8",
+ "ucd",
+ "unic-ucd-age",
+ "unicode-casing",
+ "unicode_names2 2.0.0",
+]
+
 [[package]]
 name = "rustpython-venvlauncher"
 version = "0.5.0"
@@ -3458,7 +3470,6 @@ dependencies = [
  "glob",
  "half",
  "hex",
- "icu_properties",
  "indexmap",
  "is-macro",
  "itertools 0.14.0",
@@ -3492,6 +3503,7 @@ dependencies = [
  "rustpython-ruff_python_parser",
  "rustpython-ruff_text_size",
  "rustpython-sre_engine",
+ "rustpython-unicode",
  "rustyline",
  "scoped-tls",
  "scopeguard",
diff --git a/Cargo.toml b/Cargo.toml
index 7bd8b8f3374..87ad486f079 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -153,6 +153,7 @@ rustpython-vm = { path = "crates/vm", default-features = false, version = "0.5.0
 rustpython-pylib = { path = "crates/pylib", version = "0.5.0" }
 rustpython-stdlib = { path = "crates/stdlib", default-features = false, version = "0.5.0" }
 rustpython-sre_engine = { path = "crates/sre_engine", version = "0.5.0" }
+rustpython-unicode = { path = "crates/unicode", default-features = false, version = "0.5.0" }
 rustpython-wtf8 = { path = "crates/wtf8", version = "0.5.0" }
 rustpython-doc = { path = "crates/doc", version = "0.5.0" }
 
diff --git a/crates/literal/Cargo.toml b/crates/literal/Cargo.toml
index 3f0bec33c30..929b3393807 100644
--- a/crates/literal/Cargo.toml
+++ b/crates/literal/Cargo.toml
@@ -9,13 +9,13 @@ license = { workspace = true }
 rust-version = { workspace = true }
 
 [dependencies]
+rustpython-unicode = { workspace = true, default-features = false }
 rustpython-wtf8 = { workspace = true }
 
 hexf-parse = "0.2.1"
 is-macro.workspace = true
 lexical-parse-float = { version = "1.0.6", features = ["format"] }
 num-traits = { workspace = true }
-icu_properties = { workspace = true }
 
 [dev-dependencies]
 rand = { workspace = true }
diff --git a/crates/literal/src/char.rs b/crates/literal/src/char.rs
index 5b446cc1a19..4544133d3b6 100644
--- a/crates/literal/src/char.rs
+++ b/crates/literal/src/char.rs
@@ -1,5 +1,3 @@
-use icu_properties::props::{EnumeratedProperty, GeneralCategory};
-
 /// According to python following categories aren't printable:
 /// * Cc (Other, Control)
 /// * Cf (Other, Format)
@@ -10,17 +8,5 @@ use icu_properties::props::{EnumeratedProperty, GeneralCategory};
 /// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
 /// * Zs (Separator, Space) other than ASCII space('\x20').
 pub fn is_printable(c: char) -> bool {
-    let cat = GeneralCategory::for_char(c);
-
-    !matches!(
-        cat,
-        GeneralCategory::SpaceSeparator
-            | GeneralCategory::LineSeparator
-            | GeneralCategory::ParagraphSeparator
-            | GeneralCategory::Control
-            | GeneralCategory::Format
-            | GeneralCategory::Surrogate
-            | GeneralCategory::PrivateUse
-            | GeneralCategory::Unassigned
-    )
+    rustpython_unicode::classify::is_repr_printable(c as u32)
 }
diff --git a/crates/sre_engine/Cargo.toml b/crates/sre_engine/Cargo.toml
index 4f899e6b3e9..d95e00693ae 100644
--- a/crates/sre_engine/Cargo.toml
+++ b/crates/sre_engine/Cargo.toml
@@ -15,6 +15,7 @@ name = "benches"
 harness = false
 
 [dependencies]
+rustpython-unicode = { workspace = true, default-features = false }
 rustpython-wtf8 = { workspace = true }
 num_enum = { workspace = true }
 bitflags = { workspace = true }
diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs
index 489819bfb3e..bb2974bca5b 100644
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -332,35 +332,22 @@ const fn utf8_is_cont_byte(byte: u8) -> bool {
 /// Mask of the value bits of a continuation byte.
 const CONT_MASK: u8 = 0b0011_1111;
 
-const fn is_py_ascii_whitespace(b: u8) -> bool {
-    matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
-}
-
 #[inline]
 pub(crate) fn is_word(ch: u32) -> bool {
-    ch == '_' as u32
-        || u8::try_from(ch)
-            .map(|x| x.is_ascii_alphanumeric())
-            .unwrap_or(false)
+    rustpython_unicode::regex::is_word(ch)
 }
 #[inline]
 pub(crate) fn is_space(ch: u32) -> bool {
-    u8::try_from(ch)
-        .map(is_py_ascii_whitespace)
-        .unwrap_or(false)
+    rustpython_unicode::regex::is_space(ch)
 }
 #[inline]
 pub(crate) fn is_digit(ch: u32) -> bool {
-    u8::try_from(ch)
-        .map(|x| x.is_ascii_digit())
-        .unwrap_or(false)
+    rustpython_unicode::regex::is_digit(ch)
 }
 #[inline]
 pub(crate) fn is_loc_alnum(ch: u32) -> bool {
     // FIXME: Ignore the locales
-    u8::try_from(ch)
-        .map(|x| x.is_ascii_alphanumeric())
-        .unwrap_or(false)
+    rustpython_unicode::regex::is_locale_alnum(ch)
 }
 #[inline]
 pub(crate) fn is_loc_word(ch: u32) -> bool {
@@ -368,83 +355,37 @@ pub(crate) fn is_loc_word(ch: u32) -> bool {
 }
 #[inline]
 pub(crate) const fn is_linebreak(ch: u32) -> bool {
-    ch == '\n' as u32
+    rustpython_unicode::regex::is_linebreak(ch)
 }
 #[inline]
 pub fn lower_ascii(ch: u32) -> u32 {
-    u8::try_from(ch)
-        .map(|x| x.to_ascii_lowercase() as u32)
-        .unwrap_or(ch)
+    rustpython_unicode::regex::lower_ascii(ch)
 }
 #[inline]
 pub(crate) fn lower_locate(ch: u32) -> u32 {
     // FIXME: Ignore the locales
-    lower_ascii(ch)
+    rustpython_unicode::regex::lower_locale(ch)
 }
 #[inline]
 pub(crate) fn upper_locate(ch: u32) -> u32 {
     // FIXME: Ignore the locales
-    u8::try_from(ch)
-        .map(|x| x.to_ascii_uppercase() as u32)
-        .unwrap_or(ch)
+    rustpython_unicode::regex::upper_locale(ch)
 }
 #[inline]
 pub(crate) fn is_uni_digit(ch: u32) -> bool {
-    // TODO: check with cpython
-    char::try_from(ch)
-        .map(|x| x.is_ascii_digit())
-        .unwrap_or(false)
+    rustpython_unicode::regex::is_unicode_digit(ch)
 }
 #[inline]
 pub(crate) fn is_uni_space(ch: u32) -> bool {
-    // TODO: check with cpython
-    is_space(ch)
-        || matches!(
-            ch,
-            0x0009
-                | 0x000A
-                | 0x000B
-                | 0x000C
-                | 0x000D
-                | 0x001C
-                | 0x001D
-                | 0x001E
-                | 0x001F
-                | 0x0020
-                | 0x0085
-                | 0x00A0
-                | 0x1680
-                | 0x2000
-                | 0x2001
-                | 0x2002
-                | 0x2003
-                | 0x2004
-                | 0x2005
-                | 0x2006
-                | 0x2007
-                | 0x2008
-                | 0x2009
-                | 0x200A
-                | 0x2028
-                | 0x2029
-                | 0x202F
-                | 0x205F
-                | 0x3000
-        )
+    rustpython_unicode::regex::is_unicode_space(ch)
 }
 #[inline]
 pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
-    matches!(
-        ch,
-        0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
-    )
+    rustpython_unicode::regex::is_unicode_linebreak(ch)
 }
 #[inline]
 pub(crate) fn is_uni_alnum(ch: u32) -> bool {
-    // TODO: check with cpython
-    char::try_from(ch)
-        .map(|x| x.is_alphanumeric())
-        .unwrap_or(false)
+    rustpython_unicode::regex::is_unicode_alnum(ch)
 }
 #[inline]
 pub(crate) fn is_uni_word(ch: u32) -> bool {
@@ -452,15 +393,9 @@ pub(crate) fn is_uni_word(ch: u32) -> bool {
 }
 #[inline]
 pub fn lower_unicode(ch: u32) -> u32 {
-    // TODO: check with cpython
-    char::try_from(ch)
-        .map(|x| x.to_lowercase().next().unwrap() as u32)
-        .unwrap_or(ch)
+    rustpython_unicode::regex::lower_unicode(ch)
 }
 #[inline]
 pub fn upper_unicode(ch: u32) -> u32 {
-    // TODO: check with cpython
-    char::try_from(ch)
-        .map(|x| x.to_uppercase().next().unwrap() as u32)
-        .unwrap_or(ch)
+    rustpython_unicode::regex::upper_unicode(ch)
 }
diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml
index f828507d6cf..6945132243f 100644
--- a/crates/stdlib/Cargo.toml
+++ b/crates/stdlib/Cargo.toml
@@ -28,6 +28,7 @@ flame-it = ["flame"]
 [dependencies]
 # rustpython crates
 rustpython-derive = { workspace = true }
+rustpython-unicode = { workspace = true, features = ["std", "casefold"] }
 rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
 rustpython-common = { workspace = true }
 
@@ -76,13 +77,6 @@ pbkdf2 = { version = "0.12", features = ["hmac"] }
 constant_time_eq = { workspace = true }
 
 ## unicode stuff
-unicode_names2 = { workspace = true }
-# update version all at the same time
-icu_properties = { workspace = true }
-icu_normalizer = { workspace = true }
-unic-ucd-age       = { workspace = true }
-ucd = "0.1.1"
-
 # compression
 adler32 = "1.2.0"
 crc32fast = "1.3.2"
diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs
index 6ee5b0c2ee8..622a7854157 100644
--- a/crates/stdlib/src/unicodedata.rs
+++ b/crates/stdlib/src/unicodedata.rs
@@ -17,6 +17,17 @@ enum NormalizeForm {
     Nfkd,
 }
 
+impl From<NormalizeForm> for rustpython_unicode::NormalizeForm {
+    fn from(value: NormalizeForm) -> Self {
+        match value {
+            NormalizeForm::Nfc => Self::Nfc,
+            NormalizeForm::Nfkc => Self::Nfkc,
+            NormalizeForm::Nfd => Self::Nfd,
+            NormalizeForm::Nfkd => Self::Nfkd,
+        }
+    }
+}
+
 impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
     fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
         obj.try_value_with(
@@ -34,25 +45,15 @@ impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
 
 #[pymodule]
 mod unicodedata {
-    use super::NormalizeForm::*;
     use crate::vm::{
         Py, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
         builtins::{PyModule, PyStrRef},
         function::OptionalArg,
     };
 
-    use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
-    use icu_properties::{
-        CodePointSetData,
-        props::{
-            BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
-            GeneralCategory, NamedEnumeratedProperty,
-        },
-    };
     use itertools::Itertools;
     use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
-    use ucd::{Codepoint, DecompositionType, Number, NumericType};
-    use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
+    use rustpython_unicode::{UNICODE_VERSION, UnicodeVersion, data, normalize};
 
     pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
         __module_exec(vm, module);
@@ -94,8 +95,7 @@ mod unicodedata {
         }
 
         fn check_age(&self, c: CodePoint) -> bool {
-            c.to_char()
-                .is_none_or(|c| Age::of(c).is_some_and(|age| age.actual() <= self.unic_version))
+            data::is_assigned_in_version(c.to_u32(), self.unic_version)
         }
 
         fn extract_char(
@@ -119,21 +119,19 @@ mod unicodedata {
         fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             Ok(self
                 .extract_char(character, vm)?
-                .map_or(GeneralCategory::Unassigned, |c| {
-                    c.to_char()
-                        .map_or(GeneralCategory::Surrogate, GeneralCategory::for_char)
-                })
-                .short_name()
+                .map_or("Cn", |c| data::category(c.to_u32()))
                 .to_owned())
         }
 
         #[pymethod]
         fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             if let Some(name_str) = name.to_str()
-                && let Some(character) = unicode_names2::character(name_str)
-                && self.check_age(character.into())
+                && let Some(character) = data::lookup(name_str)
+                && self.check_age(CodePoint::from_u32(character).expect("valid Unicode code point"))
             {
-                return Ok(character.to_string());
+                return Ok(char::from_u32(character)
+                    .expect("unicode_names2 only returns Unicode scalar values")
+                    .to_string());
             }
             Err(vm.new_key_error(
                 vm.ctx
@@ -153,9 +151,9 @@ mod unicodedata {
 
             if let Some(c) = c
                 && self.check_age(c)
-                && let Some(name) = c.to_char().and_then(unicode_names2::name)
+                && let Some(name) = data::name(c.to_u32())
             {
-                return Ok(vm.ctx.new_str(name.to_string()).into());
+                return Ok(vm.ctx.new_str(name).into());
             }
             default.ok_or_else(|| vm.new_value_error("no such name"))
         }
@@ -166,14 +164,9 @@ mod unicodedata {
             character: PyStrRef,
             vm: &VirtualMachine,
         ) -> PyResult<&'static str> {
-            let bidi = match self.extract_char(character, vm)? {
-                Some(c) => c
-                    .to_char()
-                    .map_or(BidiClass::LeftToRight, BidiClass::for_char)
-                    .short_name(),
-                None => "",
-            };
-            Ok(bidi)
+            Ok(self
+                .extract_char(character, vm)?
+                .map_or("", |c| data::bidirectional(c.to_u32())))
         }
 
         /// NOTE: This function uses 9.0.0 database instead of 3.2.0
@@ -185,110 +178,38 @@ mod unicodedata {
         ) -> PyResult<&'static str> {
             Ok(self
                 .extract_char(character, vm)?
-                .and_then(|c| c.to_char())
-                .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char)
-                .short_name())
+                .map_or("N", |c| data::east_asian_width(c.to_u32())))
         }
 
         #[pymethod]
         fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
-            let text = unistr.as_wtf8();
-            let normalized_text = match form {
-                Nfc => {
-                    let normalizer = ComposingNormalizerBorrowed::new_nfc();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-                Nfkc => {
-                    let normalizer = ComposingNormalizerBorrowed::new_nfkc();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-                Nfd => {
-                    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-                Nfkd => {
-                    let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-            };
-            Ok(normalized_text)
+            Ok(normalize::normalize(form.into(), unistr.as_wtf8()))
         }
 
         #[pymethod]
         fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
-            let text = unistr.as_wtf8();
-            let normalized: Wtf8Buf = match form {
-                Nfc => {
-                    let normalizer = ComposingNormalizerBorrowed::new_nfc();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-                Nfkc => {
-                    let normalizer = ComposingNormalizerBorrowed::new_nfkc();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-                Nfd => {
-                    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-                Nfkd => {
-                    let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
-                    text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
-                        .collect()
-                }
-            };
-            Ok(text == &*normalized)
+            Ok(normalize::is_normalized(form.into(), unistr.as_wtf8()))
         }
 
         #[pymethod]
         fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
-            match self.extract_char(character, vm)? {
-                Some(c) => {
-                    if let Some(ch) = c.to_char() {
-                        // Check if the character is mirrored in bidirectional text using Unicode standard
-                        let bidi_mirrored = CodePointSetData::new::<BidiMirrored>();
-                        Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 })
-                    } else {
-                        Ok(0)
-                    }
-                }
-                None => Ok(0),
-            }
+            Ok(self
+                .extract_char(character, vm)?
+                .is_some_and(|c| data::mirrored(c.to_u32())) as i32)
         }
 
         #[pymethod]
         fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
             Ok(self
                 .extract_char(character, vm)?
-                .and_then(|c| c.to_char())
-                .map_or(0, |ch| {
-                    CanonicalCombiningClass::for_char(ch).to_icu4c_value()
-                }))
+                .map_or(0, |c| data::combining(c.to_u32())))
         }
 
         #[pymethod]
         fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
-            let ch = match self.extract_char(character, vm)?.and_then(|c| c.to_char()) {
-                Some(ch) => ch,
-                None => return Ok(String::new()),
-            };
-            let chars: Vec<char> = ch.decomposition_map().collect();
-            // If decomposition maps to just the character itself, there's no decomposition
-            if chars.len() == 1 && chars[0] == ch {
-                return Ok(String::new());
-            }
-            let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" ");
-            let tag = match ch.decomposition_type() {
-                Some(DecompositionType::Canonical) | None => return Ok(hex_parts),
-                Some(dt) => decomposition_type_tag(dt),
-            };
-            Ok(format!("<{tag}> {hex_parts}"))
+            Ok(self
+                .extract_char(character, vm)?
+                .map_or_else(String::new, |c| data::decomposition(c.to_u32())))
         }
 
         #[pymethod]
@@ -298,15 +219,11 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
-            if let Some(ch) = ch
-                && matches!(
-                    ch.numeric_type(),
-                    Some(NumericType::Decimal) | Some(NumericType::Digit)
-                )
-                && let Some(Number::Integer(n)) = ch.numeric_value()
+            if let Some(value) = self
+                .extract_char(character, vm)?
+                .and_then(|c| data::digit(c.to_u32()))
             {
-                return Ok(vm.ctx.new_int(n).into());
+                return Ok(vm.ctx.new_int(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a digit"))
         }
@@ -318,12 +235,11 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
-            if let Some(ch) = ch
-                && ch.numeric_type() == Some(NumericType::Decimal)
-                && let Some(Number::Integer(n)) = ch.numeric_value()
+            if let Some(value) = self
+                .extract_char(character, vm)?
+                .and_then(|c| data::decimal(c.to_u32()))
             {
-                return Ok(vm.ctx.new_int(n).into());
+                return Ok(vm.ctx.new_int(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a decimal"))
         }
@@ -335,17 +251,15 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
-            if let Some(ch) = ch {
-                match ch.numeric_value() {
-                    Some(Number::Integer(n)) => {
-                        return Ok(vm.ctx.new_float(n as f64).into());
-                    }
-                    Some(Number::Rational(num, den)) => {
-                        return Ok(vm.ctx.new_float(num as f64 / den as f64).into());
-                    }
-                    None => {}
-                }
+            if let Some(value) = self
+                .extract_char(character, vm)?
+                .and_then(|c| data::numeric(c.to_u32()))
+            {
+                let value = match value {
+                    data::NumericValue::Integer(n) => n as f64,
+                    data::NumericValue::Rational(num, den) => num as f64 / den as f64,
+                };
+                return Ok(vm.ctx.new_float(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a numeric character"))
         }
@@ -356,28 +270,6 @@ mod unicodedata {
         }
     }
 
-    fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
-        match dt {
-            DecompositionType::Canonical => "canonical",
-            DecompositionType::Compat => "compat",
-            DecompositionType::Circle => "circle",
-            DecompositionType::Final => "final",
-            DecompositionType::Font => "font",
-            DecompositionType::Fraction => "fraction",
-            DecompositionType::Initial => "initial",
-            DecompositionType::Isolated => "isolated",
-            DecompositionType::Medial => "medial",
-            DecompositionType::Narrow => "narrow",
-            DecompositionType::Nobreak => "noBreak",
-            DecompositionType::Small => "small",
-            DecompositionType::Square => "square",
-            DecompositionType::Sub => "sub",
-            DecompositionType::Super => "super",
-            DecompositionType::Vertical => "vertical",
-            DecompositionType::Wide => "wide",
-        }
-    }
-
     #[pyattr]
     fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
         Ucd {
diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml
new file mode 100644
index 00000000000..51fc781e790
--- /dev/null
+++ b/crates/unicode/Cargo.toml
@@ -0,0 +1,29 @@
+[package]
+name = "rustpython-unicode"
+description = "Shared Unicode semantics and data for RustPython and related Python tooling."
+version.workspace = true
+authors.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+repository.workspace = true
+license.workspace = true
+
+[features]
+default = ["std", "casefold"]
+std = []
+casefold = ["std", "dep:caseless"]
+
+[dependencies]
+rustpython-wtf8 = { workspace = true }
+
+icu_normalizer = { workspace = true }
+icu_properties = { workspace = true }
+itertools = { workspace = true }
+unicode-casing = { workspace = true }
+unicode_names2 = { version = "2.0.0", default-features = false, features = ["no_std"] }
+unic-ucd-age = { workspace = true }
+ucd = "0.1.1"
+caseless = { version = "0.2.2", optional = true }
+
+[lints]
+workspace = true
diff --git a/crates/unicode/src/case.rs b/crates/unicode/src/case.rs
new file mode 100644
index 00000000000..dbd71929f2d
--- /dev/null
+++ b/crates/unicode/src/case.rs
@@ -0,0 +1,111 @@
+#[cfg(feature = "casefold")]
+use alloc::string::String;
+
+#[cfg(feature = "casefold")]
+use rustpython_wtf8::Wtf8Chunk;
+use rustpython_wtf8::{Wtf8, Wtf8Buf};
+use unicode_casing::CharExt;
+
+use crate::char_from_codepoint;
+
+/// Up to three code points produced by case-mapping one code point.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct CaseMapping {
+    len: u8,
+    codepoints: [u32; 3],
+}
+
+impl CaseMapping {
+    /// Mapping that consists of `cp` alone.
+    pub const fn identity(cp: u32) -> Self {
+        let codepoints = [cp, 0, 0];
+        Self { len: 1, codepoints }
+    }
+
+    /// First code point of the mapping, or `None` when it is empty.
+    pub const fn first(self) -> Option<u32> {
+        match self.len {
+            0 => None,
+            _ => Some(self.codepoints[0]),
+        }
+    }
+
+    pub fn iter(self) -> impl Iterator<Item = u32> {
+        self.codepoints.into_iter().take(self.len.into())
+    }
+}
+
+/// Packs the first three chars of `chars` into a `CaseMapping`.
+///
+/// Anything past the third char is dropped.
+fn mapping_from_chars(chars: impl Iterator<Item = char>) -> CaseMapping {
+    let mut codepoints = [0u32; 3];
+    let mut len = 0u8;
+    for (slot, ch) in codepoints.iter_mut().zip(chars) {
+        *slot = u32::from(ch);
+        len += 1;
+    }
+    CaseMapping { len, codepoints }
+}
+
+#[cfg(feature = "casefold")]
+fn mapping_from_string(text: String) -> CaseMapping {
+    mapping_from_chars(text.chars()) // keeps at most the first three chars
+}
+
+/// Lowercase mapping of `cp`, computed with `char::to_lowercase`.
+/// Falls back to the identity mapping when `char_from_codepoint` yields `None`.
+pub fn to_lowercase(cp: u32) -> CaseMapping {
+    let lower = |ch: char| mapping_from_chars(ch.to_lowercase());
+    char_from_codepoint(cp).map_or_else(|| CaseMapping::identity(cp), lower)
+}
+
+/// Uppercase mapping of `cp`, computed with `char::to_uppercase`.
+/// Falls back to the identity mapping when `char_from_codepoint` yields `None`.
+pub fn to_uppercase(cp: u32) -> CaseMapping {
+    let upper = |ch: char| mapping_from_chars(ch.to_uppercase());
+    char_from_codepoint(cp).map_or_else(|| CaseMapping::identity(cp), upper)
+}
+
+/// Titlecase mapping of `cp`, computed with `CharExt::to_titlecase`.
+/// Falls back to the identity mapping when `char_from_codepoint` yields `None`.
+pub fn to_titlecase(cp: u32) -> CaseMapping {
+    let title = |ch: char| mapping_from_chars(ch.to_titlecase());
+    char_from_codepoint(cp).map_or_else(|| CaseMapping::identity(cp), title)
+}
+
+pub fn to_lowercase_wtf8(text: &Wtf8) -> Wtf8Buf {
+    let lowered = text.map_utf8(|utf8| utf8.chars().flat_map(char::to_lowercase));
+    lowered.collect()
+}
+
+pub fn to_uppercase_wtf8(text: &Wtf8) -> Wtf8Buf {
+    let raised = text.map_utf8(|utf8| utf8.chars().flat_map(char::to_uppercase));
+    raised.collect()
+}
+
+/// Case-folds `cp` with `caseless::default_case_fold_str`.
+/// Falls back to the identity mapping when `char_from_codepoint` yields `None`.
+#[cfg(feature = "casefold")]
+pub fn casefold(cp: u32) -> CaseMapping {
+    let Some(ch) = char_from_codepoint(cp) else {
+        return CaseMapping::identity(cp);
+    };
+    let mut utf8 = [0u8; 4];
+    mapping_from_string(caseless::default_case_fold_str(ch.encode_utf8(&mut utf8)))
+}
+
+#[cfg(feature = "casefold")]
+pub fn casefold_str(text: &str) -> String {
+    caseless::default_case_fold_str(text) // `caseless` is only linked with the `casefold` feature
+}
+
+/// Case-folds every UTF-8 chunk of `text`; surrogate chunks go through `Wtf8Buf::from`.
+#[cfg(feature = "casefold")]
+pub fn casefold_wtf8(text: &Wtf8) -> Wtf8Buf {
+    let fold_chunk = |chunk| match chunk {
+        Wtf8Chunk::Utf8(utf8) => Wtf8Buf::from_string(casefold_str(utf8)),
+        Wtf8Chunk::Surrogate(surrogate) => Wtf8Buf::from(surrogate),
+    };
+    text.chunks().map(fold_chunk).collect()
+}
diff --git a/crates/unicode/src/classify.rs b/crates/unicode/src/classify.rs
new file mode 100644
index 00000000000..56c6679e545
--- /dev/null
+++ b/crates/unicode/src/classify.rs
@@ -0,0 +1,65 @@
+use icu_properties::props::{BidiClass, EnumeratedProperty, GeneralCategory};
+use ucd::{Codepoint, NumericType};
+
+use crate::{char_from_codepoint, is_surrogate};
+
+/// General category of `cp`; surrogates are reported as `Surrogate`.
+pub fn general_category(cp: u32) -> GeneralCategory {
+    if is_surrogate(cp) {
+        return GeneralCategory::Surrogate;
+    }
+    char_from_codepoint(cp).map_or(GeneralCategory::Unassigned, GeneralCategory::for_char)
+}
+
+pub fn is_alpha(cp: u32) -> bool {
+    matches!(char_from_codepoint(cp), Some(ch) if ch.is_alphabetic())
+}
+
+pub fn is_alnum(cp: u32) -> bool {
+    matches!(char_from_codepoint(cp), Some(ch) if ch.is_alphanumeric())
+}
+
+pub fn is_decimal(cp: u32) -> bool {
+    general_category(cp) == GeneralCategory::DecimalNumber
+}
+
+/// Whether `cp` has the `ucd` numeric type `Decimal` or `Digit`.
+pub fn is_digit(cp: u32) -> bool {
+    let Some(ch) = char_from_codepoint(cp) else {
+        return false;
+    };
+    let kind = ch.numeric_type();
+    matches!(kind, Some(NumericType::Decimal | NumericType::Digit))
+}
+
+pub fn is_numeric(cp: u32) -> bool {
+    matches!(char_from_codepoint(cp), Some(ch) if ch.numeric_value().is_some())
+}
+
+pub fn is_space(cp: u32) -> bool {
+    char_from_codepoint(cp).is_some_and(|ch| match general_category(cp) {
+        GeneralCategory::SpaceSeparator => true,
+        _ => matches!(
+            BidiClass::for_char(ch),
+            BidiClass::WhiteSpace | BidiClass::ParagraphSeparator | BidiClass::SegmentSeparator
+        ),
+    })
+}
+
+pub fn is_printable(cp: u32) -> bool {
+    cp == u32::from(' ') || is_repr_printable(cp)
+}
+
+pub fn is_repr_printable(cp: u32) -> bool {
+    !matches!(
+        general_category(cp),
+        GeneralCategory::SpaceSeparator // Zs
+            | GeneralCategory::LineSeparator // Zl
+            | GeneralCategory::ParagraphSeparator // Zp
+            | GeneralCategory::Control // Cc
+            | GeneralCategory::Format // Cf
+            | GeneralCategory::Surrogate // Cs
+            | GeneralCategory::PrivateUse // Co
+            | GeneralCategory::Unassigned // Cn
+    )
+}
diff --git a/crates/unicode/src/data.rs b/crates/unicode/src/data.rs
new file mode 100644
index 00000000000..28d9e0b9553
--- /dev/null
+++ b/crates/unicode/src/data.rs
@@ -0,0 +1,132 @@
+use alloc::{format, string::String, vec::Vec};
+
+use icu_properties::{
+    CodePointSetData,
+    props::{
+        BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty,
+        NamedEnumeratedProperty,
+    },
+};
+use itertools::Itertools;
+use ucd::{Codepoint, DecompositionType, Number, NumericType};
+use unic_ucd_age::{Age, UnicodeVersion};
+
+use crate::{char_from_codepoint, classify, is_surrogate};
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum NumericValue {
+    Integer(i64),
+    Rational(i64, i64),
+}
+
+pub fn is_assigned_in_version(cp: u32, version: UnicodeVersion) -> bool {
+    if is_surrogate(cp) {
+        true
+    } else {
+        char_from_codepoint(cp)
+            .is_some_and(|ch| Age::of(ch).is_some_and(|age| age.actual() <= version))
+    }
+}
+
+pub fn category(cp: u32) -> &'static str {
+    classify::general_category(cp).short_name()
+}
+
+pub fn lookup(name: &str) -> Option<u32> {
+    unicode_names2::character(name).map(u32::from)
+}
+
+pub fn name(cp: u32) -> Option<String> {
+    char_from_codepoint(cp)
+        .and_then(unicode_names2::name)
+        .map(|name| name.collect())
+}
+
+pub fn bidirectional(cp: u32) -> &'static str {
+    char_from_codepoint(cp)
+        .map_or(BidiClass::LeftToRight, BidiClass::for_char)
+        .short_name()
+}
+
+pub fn east_asian_width(cp: u32) -> &'static str {
+    char_from_codepoint(cp)
+        .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char)
+        .short_name()
+}
+
+pub fn mirrored(cp: u32) -> bool {
+    char_from_codepoint(cp).is_some_and(|ch| CodePointSetData::new::<BidiMirrored>().contains(ch))
+}
+
+pub fn combining(cp: u32) -> u8 {
+    char_from_codepoint(cp).map_or(0, |ch| {
+        CanonicalCombiningClass::for_char(ch).to_icu4c_value()
+    })
+}
+
+pub fn decomposition(cp: u32) -> String {
+    let ch = match char_from_codepoint(cp) {
+        Some(ch) => ch,
+        None => return String::new(),
+    };
+    let chars: Vec<char> = ch.decomposition_map().collect();
+    if chars.len() == 1 && chars[0] == ch {
+        return String::new();
+    }
+    let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" ");
+    match ch.decomposition_type() {
+        Some(DecompositionType::Canonical) | None => hex_parts,
+        Some(dt) => format!("<{}> {hex_parts}", decomposition_type_tag(dt)),
+    }
+}
+
+pub fn digit(cp: u32) -> Option<u32> {
+    let ch = char_from_codepoint(cp)?;
+    if matches!(
+        ch.numeric_type(),
+        Some(NumericType::Decimal) | Some(NumericType::Digit)
+    ) && let Some(Number::Integer(value)) = ch.numeric_value()
+    {
+        return u32::try_from(value).ok();
+    }
+    None
+}
+
+pub fn decimal(cp: u32) -> Option<u32> {
+    let ch = char_from_codepoint(cp)?;
+    if ch.numeric_type() == Some(NumericType::Decimal)
+        && let Some(Number::Integer(value)) = ch.numeric_value()
+    {
+        return u32::try_from(value).ok();
+    }
+    None
+}
+
+pub fn numeric(cp: u32) -> Option<NumericValue> {
+    match char_from_codepoint(cp)?.numeric_value()? {
+        Number::Integer(value) => Some(NumericValue::Integer(value)),
+        Number::Rational(num, den) => Some(NumericValue::Rational(num.into(), den.into())),
+    }
+}
+
+fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
+    match dt {
+        DecompositionType::Canonical => "canonical",
+        DecompositionType::Compat => "compat",
+        DecompositionType::Circle => "circle",
+        DecompositionType::Final => "final",
+        DecompositionType::Font => "font",
+        DecompositionType::Fraction => "fraction",
+        DecompositionType::Initial => "initial",
+        DecompositionType::Isolated => "isolated",
+        DecompositionType::Medial => "medial",
+        DecompositionType::Narrow => "narrow",
+        DecompositionType::Nobreak => "noBreak",
+        DecompositionType::Small => "small",
+        DecompositionType::Square => "square",
+        DecompositionType::Sub => "sub",
+        DecompositionType::Super => "super",
+        DecompositionType::Vertical => "vertical",
+        DecompositionType::Wide => "wide",
+    }
+}
diff --git a/crates/unicode/src/identifier.rs b/crates/unicode/src/identifier.rs
new file mode 100644
index 00000000000..30a3ae32d5e
--- /dev/null
+++ b/crates/unicode/src/identifier.rs
@@ -0,0 +1,27 @@
+use icu_properties::props::{BinaryProperty, XidContinue, XidStart};
+
+use crate::char_from_codepoint;
+
+pub fn is_xid_start(cp: u32) -> bool {
+    char_from_codepoint(cp).is_some_and(XidStart::for_char)
+}
+
+pub fn is_xid_continue(cp: u32) -> bool {
+    char_from_codepoint(cp).is_some_and(XidContinue::for_char)
+}
+
+pub fn is_python_identifier_start(cp: u32) -> bool {
+    cp == '_' as u32 || is_xid_start(cp)
+}
+
+pub fn is_python_identifier_continue(cp: u32) -> bool {
+    is_xid_continue(cp)
+}
+
+pub fn is_python_identifier(text: &str) -> bool {
+    let mut chars = text.chars();
+    let is_identifier_start = chars
+        .next()
+        .is_some_and(|ch| is_python_identifier_start(ch as u32));
+    is_identifier_start && chars.all(|ch| is_python_identifier_continue(ch as u32))
+}
diff --git a/crates/unicode/src/lib.rs b/crates/unicode/src/lib.rs
new file mode 100644
index 00000000000..7ac71be12f4
--- /dev/null
+++ b/crates/unicode/src/lib.rs
@@ -0,0 +1,77 @@
+#![cfg_attr(not(feature = "std"), no_std)]
+
+extern crate alloc;
+
+pub mod case;
+pub mod classify;
+pub mod data;
+pub mod identifier;
+pub mod normalize;
+pub mod regex;
+
+pub use normalize::NormalizeForm;
+pub use unic_ucd_age::{UNICODE_VERSION, UnicodeVersion};
+
+use core::char;
+
+pub(crate) fn char_from_codepoint(cp: u32) -> Option<char> {
+    char::from_u32(cp)
+}
+
+pub(crate) const fn is_surrogate(cp: u32) -> bool {
+    matches!(cp, 0xD800..=0xDFFF)
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::vec::Vec;
+    use rustpython_wtf8::Wtf8Buf;
+
+    use crate::{NormalizeForm, case, classify, data, identifier, normalize, regex};
+
+    #[test]
+    fn printable_and_repr_printable_follow_python_rules() {
+        assert!(classify::is_printable(' ' as u32));
+        assert!(!classify::is_repr_printable(' ' as u32));
+        assert!(!classify::is_printable('\n' as u32));
+    }
+
+    #[test]
+    fn identifier_and_regex_predicates_share_unicode_tables() {
+        assert!(identifier::is_python_identifier_start('_' as u32));
+        assert!(identifier::is_python_identifier("유니코드"));
+        assert!(regex::is_unicode_word('가' as u32));
+        assert!(regex::is_unicode_digit('५' as u32));
+        assert!(regex::is_unicode_space('\u{3000}' as u32));
+    }
+
+    #[test]
+    fn case_and_normalization_helpers_support_full_mappings() {
+        let upper: Vec<_> = case::to_uppercase('ß' as u32).iter().collect();
+        assert_eq!(upper, vec!['S' as u32, 'S' as u32]);
+
+        let text = Wtf8Buf::from("e\u{301}");
+        assert_eq!(
+            normalize::normalize(NormalizeForm::Nfc, &text),
+            Wtf8Buf::from("é")
+        );
+        assert!(normalize::is_normalized(
+            NormalizeForm::Nfd,
+            &normalize::normalize(NormalizeForm::Nfd, &Wtf8Buf::from("é"))
+        ));
+    }
+
+    #[test]
+    fn unicode_data_queries_match_existing_unicodedata_behavior() {
+        assert_eq!(data::category('A' as u32), "Lu");
+        assert_eq!(data::category(0xD800), "Cs");
+        assert_eq!(data::lookup("SNOWMAN"), Some('☃' as u32));
+        assert_eq!(data::name('☃' as u32).as_deref(), Some("SNOWMAN"));
+        assert_eq!(data::decimal('५' as u32), Some(5));
+        assert_eq!(data::digit('²' as u32), Some(2));
+        assert_eq!(
+            data::numeric('⅓' as u32),
+            Some(data::NumericValue::Rational(1, 3))
+        );
+    }
+}
diff --git a/crates/unicode/src/normalize.rs b/crates/unicode/src/normalize.rs
new file mode 100644
index 00000000000..a5a53504262
--- /dev/null
+++ b/crates/unicode/src/normalize.rs
@@ -0,0 +1,40 @@
+use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
+use rustpython_wtf8::{Wtf8, Wtf8Buf};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum NormalizeForm {
+    Nfc,
+    Nfkc,
+    Nfd,
+    Nfkd,
+}
+
+pub fn normalize(form: NormalizeForm, text: &Wtf8) -> Wtf8Buf {
+    match form {
+        NormalizeForm::Nfc => {
+            let normalizer = ComposingNormalizerBorrowed::new_nfc();
+            text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
+                .collect()
+        }
+        NormalizeForm::Nfkc => {
+            let normalizer = ComposingNormalizerBorrowed::new_nfkc();
+            text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
+                .collect()
+        }
+        NormalizeForm::Nfd => {
+            let normalizer = DecomposingNormalizerBorrowed::new_nfd();
+            text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
+                .collect()
+        }
+        NormalizeForm::Nfkd => {
+            let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
+            text.map_utf8(|s| normalizer.normalize_iter(s.chars()))
+                .collect()
+        }
+    }
+}
+
+pub fn is_normalized(form: NormalizeForm, text: &Wtf8) -> bool {
+    let normalized = normalize(form, text);
+    text == &*normalized
+}
diff --git a/crates/unicode/src/regex.rs b/crates/unicode/src/regex.rs
new file mode 100644
index 00000000000..dcc057f7ad9
--- /dev/null
+++ b/crates/unicode/src/regex.rs
@@ -0,0 +1,85 @@
+use crate::{case, classify};
+
+const fn is_py_ascii_whitespace(byte: u8) -> bool {
+    matches!(byte, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
+}
+
+pub fn is_word(cp: u32) -> bool {
+    cp == '_' as u32
+        || u8::try_from(cp)
+            .map(|byte| byte.is_ascii_alphanumeric())
+            .unwrap_or(false)
+}
+
+pub fn is_space(cp: u32) -> bool {
+    u8::try_from(cp)
+        .map(is_py_ascii_whitespace)
+        .unwrap_or(false)
+}
+
+pub fn is_digit(cp: u32) -> bool {
+    u8::try_from(cp)
+        .map(|byte| byte.is_ascii_digit())
+        .unwrap_or(false)
+}
+
+pub fn is_locale_alnum(cp: u32) -> bool {
+    u8::try_from(cp)
+        .map(|byte| byte.is_ascii_alphanumeric())
+        .unwrap_or(false)
+}
+
+pub fn is_locale_word(cp: u32) -> bool {
+    cp == '_' as u32 || is_locale_alnum(cp)
+}
+
+pub const fn is_linebreak(cp: u32) -> bool {
+    cp == '\n' as u32
+}
+
+pub fn lower_ascii(cp: u32) -> u32 {
+    u8::try_from(cp)
+        .map(|byte| byte.to_ascii_lowercase() as u32)
+        .unwrap_or(cp)
+}
+
+pub fn lower_locale(cp: u32) -> u32 {
+    lower_ascii(cp)
+}
+
+pub fn upper_locale(cp: u32) -> u32 {
+    u8::try_from(cp)
+        .map(|byte| byte.to_ascii_uppercase() as u32)
+        .unwrap_or(cp)
+}
+
+pub fn is_unicode_digit(cp: u32) -> bool {
+    classify::is_decimal(cp)
+}
+
+pub fn is_unicode_space(cp: u32) -> bool {
+    classify::is_space(cp)
+}
+
+pub const fn is_unicode_linebreak(cp: u32) -> bool {
+    matches!(
+        cp,
+        0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029
+    )
+}
+
+pub fn is_unicode_alnum(cp: u32) -> bool {
+    classify::is_alnum(cp)
+}
+
+pub fn is_unicode_word(cp: u32) -> bool {
+    cp == '_' as u32 || is_unicode_alnum(cp)
+}
+
+pub fn lower_unicode(cp: u32) -> u32 {
+    case::to_lowercase(cp).first().unwrap_or(cp)
+}
+
+pub fn upper_unicode(cp: u32) -> u32 {
+    case::to_uppercase(cp).first().unwrap_or(cp)
+}
diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml
index b721418a4cc..22bf943fd1c 100644
--- a/crates/vm/Cargo.toml
+++ b/crates/vm/Cargo.toml
@@ -41,6 +41,7 @@ ruff_text_size = { workspace = true, optional = true }
 rustpython-compiler-core = { workspace = true }
 rustpython-literal = { workspace = true }
 rustpython-sre_engine = { workspace = true }
+rustpython-unicode = { workspace = true, features = ["std", "casefold"] }
 
 ascii = { workspace = true }
 ahash = { workspace = true }
@@ -86,7 +87,6 @@ timsort = "0.1.2"
 # TODO: use unic for this; needed for title case:
 # https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
 unicode-casing = { workspace = true }
-icu_properties = { workspace = true }
 
 [target.'cfg(unix)'.dependencies]
 rustix = { workspace = true }
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
index b31dc6ccc9d..5731eac6475 100644
--- a/crates/vm/src/builtins/str.rs
+++ b/crates/vm/src/builtins/str.rs
@@ -41,12 +41,9 @@ use rustpython_common::{
     hash,
     lock::PyMutex,
     str::DeduceStrKind,
-    wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat},
+    wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Concat},
 };
 
-use icu_properties::props::{
-    BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
-};
 use unicode_casing::CharExt;
 
 impl<'a> TryFromBorrowedObject<'a> for String {
@@ -698,7 +695,7 @@ impl PyStr {
         match self.as_str_kind() {
             PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(),
             PyKindStr::Utf8(s) => s.to_lowercase().into(),
-            PyKindStr::Wtf8(w) => w.to_lowercase().into(),
+            PyKindStr::Wtf8(w) => rustpython_unicode::case::to_lowercase_wtf8(w).into(),
         }
     }
 
@@ -706,16 +703,9 @@ impl PyStr {
     #[pymethod]
     fn casefold(&self) -> Self {
         match self.as_str_kind() {
-            PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(),
-            PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(),
-            PyKindStr::Wtf8(w) => w
-                .chunks()
-                .map(|c| match c {
-                    Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)),
-                    Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c),
-                })
-                .collect::<Wtf8Buf>()
-                .into(),
+            PyKindStr::Ascii(s) => rustpython_unicode::case::casefold_str(s.as_str()).into(),
+            PyKindStr::Utf8(s) => rustpython_unicode::case::casefold_str(s).into(),
+            PyKindStr::Wtf8(w) => rustpython_unicode::case::casefold_wtf8(w).into(),
         }
     }
 
@@ -724,7 +714,7 @@ impl PyStr {
         match self.as_str_kind() {
             PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(),
             PyKindStr::Utf8(s) => s.to_uppercase().into(),
-            PyKindStr::Wtf8(w) => w.to_uppercase().into(),
+            PyKindStr::Wtf8(w) => rustpython_unicode::case::to_uppercase_wtf8(w).into(),
         }
     }
 
@@ -967,9 +957,7 @@ impl PyStr {
     #[pymethod]
     fn isdecimal(&self) -> bool {
         !self.data.is_empty()
-            && self.char_all(|c| {
-                matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber)
-            })
+            && self.char_all(|c| rustpython_unicode::classify::is_decimal(c as u32))
     }
 
     fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
@@ -1089,23 +1077,12 @@ impl PyStr {
 
     #[pymethod]
     fn isprintable(&self) -> bool {
-        self.char_all(|c| c == '\u{0020}' || rustpython_literal::char::is_printable(c))
+        self.char_all(|c| rustpython_unicode::classify::is_printable(c as u32))
     }
 
     #[pymethod]
     fn isspace(&self) -> bool {
-        !self.data.is_empty()
-            && self.char_all(|c| {
-                matches!(
-                    GeneralCategory::for_char(c),
-                    GeneralCategory::SpaceSeparator
-                ) || matches!(
-                    BidiClass::for_char(c),
-                    BidiClass::WhiteSpace
-                        | BidiClass::ParagraphSeparator
-                        | BidiClass::SegmentSeparator
-                )
-            })
+        !self.data.is_empty() && self.char_all(|c| rustpython_unicode::classify::is_space(c as u32))
     }
 
     // Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise.
@@ -1362,15 +1339,8 @@ impl PyStr {
 
     #[pymethod]
     pub fn isidentifier(&self) -> bool {
-        let Some(s) = self.to_str() else { return false };
-        let mut chars = s.chars();
-
-        let is_identifier_start = chars
-            .next()
-            .is_some_and(|c| c == '_' || XidStart::for_char(c));
-
-        // a string is not an identifier if it has whitespace or starts with a number
-        is_identifier_start && chars.all(XidContinue::for_char)
+        self.to_str()
+            .is_some_and(rustpython_unicode::identifier::is_python_identifier)
     }
 
     // https://docs.python.org/3/library/stdtypes.html#str.translate
diff --git a/extra_tests/snippets/stdlib_unicode_shared.py b/extra_tests/snippets/stdlib_unicode_shared.py
new file mode 100644
index 00000000000..94aef5cdf2b
--- /dev/null
+++ b/extra_tests/snippets/stdlib_unicode_shared.py
@@ -0,0 +1,20 @@
+import re
+import unicodedata
+
+assert "유니코드".isidentifier()
+assert "५".isdecimal()
+assert "\u3000".isspace()
+assert " ".isprintable()
+assert not "\n".isprintable()
+
+assert unicodedata.category("\ud800") == "Cs"
+assert unicodedata.lookup("SNOWMAN") == "☃"
+assert unicodedata.name("☃") == "SNOWMAN"
+assert unicodedata.normalize("NFC", "e\u0301") == "é"
+assert unicodedata.digit("²") == 2
+assert unicodedata.decimal("५") == 5
+assert unicodedata.numeric("⅓") == 1 / 3
+
+assert re.fullmatch(r"\w+", "가나다")
+assert re.fullmatch(r"\d+", "५६७")
+assert re.fullmatch(r"\s+", "\u3000")

From e968d838082a3655b4f9c29192535d07325ae120 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 5 Apr 2026 07:44:23 +0000
Subject: [PATCH 3/7] Tidy shared unicode review feedback

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 crates/sre_engine/src/string.rs | 13 ++-----------
 crates/unicode/src/classify.rs  |  2 ++
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs
index bb2974bca5b..b2333330a46 100644
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -345,13 +345,8 @@ pub(crate) fn is_digit(ch: u32) -> bool {
     rustpython_unicode::regex::is_digit(ch)
 }
 #[inline]
-pub(crate) fn is_loc_alnum(ch: u32) -> bool {
-    // FIXME: Ignore the locales
-    rustpython_unicode::regex::is_locale_alnum(ch)
-}
-#[inline]
 pub(crate) fn is_loc_word(ch: u32) -> bool {
-    ch == '_' as u32 || is_loc_alnum(ch)
+    rustpython_unicode::regex::is_locale_word(ch)
 }
 #[inline]
 pub(crate) const fn is_linebreak(ch: u32) -> bool {
@@ -384,12 +379,8 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
     rustpython_unicode::regex::is_unicode_linebreak(ch)
 }
 #[inline]
-pub(crate) fn is_uni_alnum(ch: u32) -> bool {
-    rustpython_unicode::regex::is_unicode_alnum(ch)
-}
-#[inline]
 pub(crate) fn is_uni_word(ch: u32) -> bool {
-    ch == '_' as u32 || is_uni_alnum(ch)
+    rustpython_unicode::regex::is_unicode_word(ch)
 }
 #[inline]
 pub fn lower_unicode(ch: u32) -> u32 {
diff --git a/crates/unicode/src/classify.rs b/crates/unicode/src/classify.rs
index 56c6679e545..a6bd732e3cb 100644
--- a/crates/unicode/src/classify.rs
+++ b/crates/unicode/src/classify.rs
@@ -46,10 +46,12 @@ pub fn is_space(cp: u32) -> bool {
     })
 }
 
+/// Python's `str.isprintable()` semantics, which treat ASCII space as printable.
 pub fn is_printable(cp: u32) -> bool {
     cp == '\u{0020}' as u32 || is_repr_printable(cp)
 }
 
+/// Repr/escape printable semantics, which exclude all Unicode space separators.
 pub fn is_repr_printable(cp: u32) -> bool {
     !matches!(
         general_category(cp),

From 5cf1bd666709a79096a30109c0f4bbd2896c88e4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 5 Apr 2026 07:45:20 +0000
Subject: [PATCH 4/7] Polish unicode regex helpers

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/e4115dbc-b1a5-4a77-90a7-38cf9ac1cdf2

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 crates/unicode/src/regex.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/crates/unicode/src/regex.rs b/crates/unicode/src/regex.rs
index dcc057f7ad9..1f2045b96bd 100644
--- a/crates/unicode/src/regex.rs
+++ b/crates/unicode/src/regex.rs
@@ -1,11 +1,13 @@
 use crate::{case, classify};
 
+const UNDERSCORE: u32 = '_' as u32;
+
 const fn is_py_ascii_whitespace(byte: u8) -> bool {
     matches!(byte, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
 }
 
 pub fn is_word(cp: u32) -> bool {
-    cp == '_' as u32
+    cp == UNDERSCORE
         || u8::try_from(cp)
             .map(|byte| byte.is_ascii_alphanumeric())
             .unwrap_or(false)
@@ -30,7 +32,7 @@ pub fn is_locale_alnum(cp: u32) -> bool {
 }
 
 pub fn is_locale_word(cp: u32) -> bool {
-    cp == '_' as u32 || is_locale_alnum(cp)
+    cp == UNDERSCORE || is_locale_alnum(cp)
 }
 
 pub const fn is_linebreak(cp: u32) -> bool {
@@ -73,7 +75,7 @@ pub fn is_unicode_alnum(cp: u32) -> bool {
 }
 
 pub fn is_unicode_word(cp: u32) -> bool {
-    cp == '_' as u32 || is_unicode_alnum(cp)
+    cp == UNDERSCORE || is_unicode_alnum(cp)
 }
 
 pub fn lower_unicode(cp: u32) -> u32 {

From 0a340de9c30e00c6794464104397ef021244aeab Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 5 Apr 2026 08:53:23 +0000
Subject: [PATCH 5/7] Finish unicode crate follow-up refactors

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 Cargo.lock                          |   4 +-
 crates/codegen/Cargo.toml           |   2 +-
 crates/codegen/src/string_parser.rs |   4 +-
 crates/common/Cargo.toml            |   2 +-
 crates/common/src/encodings.rs      |   2 +-
 crates/stdlib/Cargo.toml            |   2 +-
 crates/stdlib/src/unicodedata.rs    | 163 ++++++++++------------------
 crates/unicode/Cargo.toml           |   5 +-
 crates/unicode/src/data.rs          | 100 ++++++++++++++++-
 crates/unicode/src/lib.rs           |   2 +-
 crates/unicode/src/normalize.rs     |  15 +++
 crates/vm/Cargo.toml                |   2 +-
 12 files changed, 185 insertions(+), 118 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 553d18c7be3..0e272c3eaf7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3123,9 +3123,9 @@ dependencies = [
  "rustpython-ruff_python_ast",
  "rustpython-ruff_python_parser",
  "rustpython-ruff_text_size",
+ "rustpython-unicode",
  "rustpython-wtf8",
  "thiserror 2.0.18",
- "unicode_names2 2.0.0",
 ]
 
 [[package]]
@@ -3148,9 +3148,9 @@ dependencies = [
  "parking_lot",
  "radium",
  "rustpython-literal",
+ "rustpython-unicode",
  "rustpython-wtf8",
  "siphasher",
- "unicode_names2 2.0.0",
  "widestring",
  "windows-sys 0.61.2",
 ]
diff --git a/crates/codegen/Cargo.toml b/crates/codegen/Cargo.toml
index 78065962fff..3a5a5acb810 100644
--- a/crates/codegen/Cargo.toml
+++ b/crates/codegen/Cargo.toml
@@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"]
 
 [dependencies]
 rustpython-compiler-core = { workspace = true }
+rustpython-unicode = { workspace = true, default-features = false }
 rustpython-literal = {workspace = true }
 rustpython-wtf8 = { workspace = true }
 ruff_python_ast = { workspace = true }
@@ -29,7 +30,6 @@ num-traits = { workspace = true }
 thiserror = { workspace = true }
 malachite-bigint = { workspace = true }
 memchr = { workspace = true }
-unicode_names2 = { workspace = true }
 
 [dev-dependencies]
 ruff_python_parser = { workspace = true }
diff --git a/crates/codegen/src/string_parser.rs b/crates/codegen/src/string_parser.rs
index a7ad8c35a46..8934e1868cd 100644
--- a/crates/codegen/src/string_parser.rs
+++ b/crates/codegen/src/string_parser.rs
@@ -113,7 +113,9 @@ impl StringParser {
         let name_and_ending = self.skip_bytes(close_idx + 1);
         let name = &name_and_ending[..name_and_ending.len() - 1];
 
-        unicode_names2::character(name).ok_or_else(|| unreachable!())
+        rustpython_unicode::data::lookup(name)
+            .and_then(char::from_u32)
+            .ok_or_else(|| unreachable!())
     }
 
     /// Parse an escaped character, returning the new character.
diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml
index 555336f059a..a6694ad6180 100644
--- a/crates/common/Cargo.toml
+++ b/crates/common/Cargo.toml
@@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"]
 
 [dependencies]
 rustpython-literal = { workspace = true }
+rustpython-unicode = { workspace = true, default-features = false }
 rustpython-wtf8 = { workspace = true }
 
 ascii = { workspace = true }
@@ -29,7 +30,6 @@ malachite-q = { workspace = true }
 malachite-base = { workspace = true }
 num-traits = { workspace = true }
 parking_lot = { workspace = true, optional = true }
-unicode_names2 = { workspace = true }
 radium = { workspace = true }
 
 lock_api = "0.4"
diff --git a/crates/common/src/encodings.rs b/crates/common/src/encodings.rs
index 913f0521e16..54a757de358 100644
--- a/crates/common/src/encodings.rs
+++ b/crates/common/src/encodings.rs
@@ -414,7 +414,7 @@ pub mod errors {
             let mut out = String::with_capacity(num_chars * 4);
             for c in err_str.code_points() {
                 let c_u32 = c.to_u32();
-                if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
+                if let Some(c_name) = rustpython_unicode::data::name(c_u32) {
                     write!(out, "\\N{{{c_name}}}").unwrap();
                 } else if c_u32 >= 0x10000 {
                     write!(out, "\\U{c_u32:08x}").unwrap();
diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml
index 6945132243f..9e3c5382ee0 100644
--- a/crates/stdlib/Cargo.toml
+++ b/crates/stdlib/Cargo.toml
@@ -28,7 +28,7 @@ flame-it = ["flame"]
 [dependencies]
 # rustpython crates
 rustpython-derive = { workspace = true }
-rustpython-unicode = { workspace = true, features = ["std", "casefold"] }
+rustpython-unicode = { workspace = true, features = ["casefold"] }
 rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
 rustpython-common = { workspace = true }
 
diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs
index 622a7854157..2633ba4c0a0 100644
--- a/crates/stdlib/src/unicodedata.rs
+++ b/crates/stdlib/src/unicodedata.rs
@@ -6,43 +6,6 @@
 
 pub(crate) use unicodedata::module_def;
 
-use crate::vm::{
-    PyObject, PyResult, VirtualMachine, builtins::PyStr, convert::TryFromBorrowedObject,
-};
-
-enum NormalizeForm {
-    Nfc,
-    Nfkc,
-    Nfd,
-    Nfkd,
-}
-
-impl From<NormalizeForm> for rustpython_unicode::NormalizeForm {
-    fn from(value: NormalizeForm) -> Self {
-        match value {
-            NormalizeForm::Nfc => Self::Nfc,
-            NormalizeForm::Nfkc => Self::Nfkc,
-            NormalizeForm::Nfd => Self::Nfd,
-            NormalizeForm::Nfkd => Self::Nfkd,
-        }
-    }
-}
-
-impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
-    fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
-        obj.try_value_with(
-            |form: &PyStr| match form.as_bytes() {
-                b"NFC" => Ok(Self::Nfc),
-                b"NFKC" => Ok(Self::Nfkc),
-                b"NFD" => Ok(Self::Nfd),
-                b"NFKD" => Ok(Self::Nfkd),
-                _ => Err(vm.new_value_error("invalid normalization form")),
-            },
-            vm,
-        )
-    }
-}
-
 #[pymodule]
 mod unicodedata {
     use crate::vm::{
@@ -53,13 +16,20 @@ mod unicodedata {
 
     use itertools::Itertools;
     use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
-    use rustpython_unicode::{UNICODE_VERSION, UnicodeVersion, data, normalize};
+    use rustpython_unicode::{NormalizeForm, UNICODE_VERSION, UnicodeVersion, data};
+
+    fn parse_normalize_form(form: PyStrRef, vm: &VirtualMachine) -> PyResult<NormalizeForm> {
+        form.to_str()
+            .ok_or_else(|| vm.new_value_error("invalid normalization form"))?
+            .parse()
+            .map_err(|()| vm.new_value_error("invalid normalization form"))
+    }
 
     pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
         __module_exec(vm, module);
 
         // Add UCD methods as module-level functions
-        let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into();
+        let ucd: PyObjectRef = PyUcd::new(data::Ucd::default()).into_ref(&vm.ctx).into();
 
         for attr in [
             "category",
@@ -85,49 +55,36 @@ mod unicodedata {
     #[pyattr]
     #[pyclass(name = "UCD")]
     #[derive(Debug, PyPayload)]
-    pub(super) struct Ucd {
-        unic_version: UnicodeVersion,
-    }
-
-    impl Ucd {
-        pub const fn new(unic_version: UnicodeVersion) -> Self {
-            Self { unic_version }
-        }
+    pub(super) struct PyUcd(data::Ucd);
 
-        fn check_age(&self, c: CodePoint) -> bool {
-            data::is_assigned_in_version(c.to_u32(), self.unic_version)
+    impl PyUcd {
+        pub const fn new(ucd: data::Ucd) -> Self {
+            Self(ucd)
         }
 
-        fn extract_char(
-            &self,
-            character: PyStrRef,
-            vm: &VirtualMachine,
-        ) -> PyResult<Option<CodePoint>> {
-            let c = character
+        fn extract_char(character: PyStrRef, vm: &VirtualMachine) -> PyResult<CodePoint> {
+            character
                 .as_wtf8()
                 .code_points()
                 .exactly_one()
-                .map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))?;
-
-            Ok(self.check_age(c).then_some(c))
+                .map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))
         }
     }
 
     #[pyclass(flags(DISALLOW_INSTANTIATION))]
-    impl Ucd {
+    impl PyUcd {
         #[pymethod]
         fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or("Cn", |c| data::category(c.to_u32()))
+                .0
+                .category(Self::extract_char(character, vm)?.to_u32())
                 .to_owned())
         }
 
         #[pymethod]
         fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             if let Some(name_str) = name.to_str()
-                && let Some(character) = data::lookup(name_str)
-                && self.check_age(CodePoint::from_u32(character).expect("valid Unicode code point"))
+                && let Some(character) = self.0.lookup(name_str)
             {
                 return Ok(char::from_u32(character)
                     .expect("unicode_names2 only returns Unicode scalar values")
@@ -147,12 +104,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            let c = self.extract_char(character, vm)?;
-
-            if let Some(c) = c
-                && self.check_age(c)
-                && let Some(name) = data::name(c.to_u32())
-            {
+            if let Some(name) = self.0.name(Self::extract_char(character, vm)?.to_u32()) {
                 return Ok(vm.ctx.new_str(name).into());
             }
             default.ok_or_else(|| vm.new_value_error("no such name"))
@@ -165,8 +117,8 @@ mod unicodedata {
             vm: &VirtualMachine,
         ) -> PyResult<&'static str> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or("", |c| data::bidirectional(c.to_u32())))
+                .0
+                .bidirectional(Self::extract_char(character, vm)?.to_u32()))
         }
 
         /// NOTE: This function uses 9.0.0 database instead of 3.2.0
@@ -177,39 +129,51 @@ mod unicodedata {
             vm: &VirtualMachine,
         ) -> PyResult<&'static str> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or("N", |c| data::east_asian_width(c.to_u32())))
+                .0
+                .east_asian_width(Self::extract_char(character, vm)?.to_u32()))
         }
 
         #[pymethod]
-        fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
-            Ok(normalize::normalize(form.into(), unistr.as_wtf8()))
+        fn normalize(
+            &self,
+            form: PyStrRef,
+            unistr: PyStrRef,
+            vm: &VirtualMachine,
+        ) -> PyResult<Wtf8Buf> {
+            Ok(self
+                .0
+                .normalize(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
         }
 
         #[pymethod]
-        fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
-            Ok(normalize::is_normalized(form.into(), unistr.as_wtf8()))
+        fn is_normalized(
+            &self,
+            form: PyStrRef,
+            unistr: PyStrRef,
+            vm: &VirtualMachine,
+        ) -> PyResult<bool> {
+            Ok(self
+                .0
+                .is_normalized(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
         }
 
         #[pymethod]
         fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
-            Ok(self
-                .extract_char(character, vm)?
-                .is_some_and(|c| data::mirrored(c.to_u32())) as i32)
+            Ok(self.0.mirrored(Self::extract_char(character, vm)?.to_u32()) as i32)
         }
 
         #[pymethod]
         fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or(0, |c| data::combining(c.to_u32())))
+                .0
+                .combining(Self::extract_char(character, vm)?.to_u32()))
         }
 
         #[pymethod]
         fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
             Ok(self
-                .extract_char(character, vm)?
-                .map_or_else(String::new, |c| data::decomposition(c.to_u32())))
+                .0
+                .decomposition(Self::extract_char(character, vm)?.to_u32()))
         }
 
         #[pymethod]
@@ -219,10 +183,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            if let Some(value) = self
-                .extract_char(character, vm)?
-                .and_then(|c| data::digit(c.to_u32()))
-            {
+            if let Some(value) = self.0.digit(Self::extract_char(character, vm)?.to_u32()) {
                 return Ok(vm.ctx.new_int(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a digit"))
@@ -235,10 +196,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            if let Some(value) = self
-                .extract_char(character, vm)?
-                .and_then(|c| data::decimal(c.to_u32()))
-            {
+            if let Some(value) = self.0.decimal(Self::extract_char(character, vm)?.to_u32()) {
                 return Ok(vm.ctx.new_int(value).into());
             }
             default.ok_or_else(|| vm.new_value_error("not a decimal"))
@@ -251,10 +209,7 @@ mod unicodedata {
             default: OptionalArg<PyObjectRef>,
             vm: &VirtualMachine,
         ) -> PyResult {
-            if let Some(value) = self
-                .extract_char(character, vm)?
-                .and_then(|c| data::numeric(c.to_u32()))
-            {
+            if let Some(value) = self.0.numeric(Self::extract_char(character, vm)?.to_u32()) {
                 let value = match value {
                     data::NumericValue::Integer(n) => n as f64,
                     data::NumericValue::Rational(num, den) => num as f64 / den as f64,
@@ -266,19 +221,17 @@ mod unicodedata {
 
         #[pygetset]
         fn unidata_version(&self) -> String {
-            self.unic_version.to_string()
+            self.0.unicode_version().to_string()
         }
     }
 
     #[pyattr]
-    fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
-        Ucd {
-            unic_version: UnicodeVersion {
-                major: 3,
-                minor: 2,
-                micro: 0,
-            },
-        }
+    fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<PyUcd> {
+        PyUcd::new(data::Ucd::new(UnicodeVersion {
+            major: 3,
+            minor: 2,
+            micro: 0,
+        }))
         .into_ref(&vm.ctx)
     }
 
diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml
index 51fc781e790..ff59bb0dd77 100644
--- a/crates/unicode/Cargo.toml
+++ b/crates/unicode/Cargo.toml
@@ -9,9 +9,8 @@ repository.workspace = true
 license.workspace = true
 
 [features]
-default = ["std", "casefold"]
-std = []
-casefold = ["std", "dep:caseless"]
+default = ["casefold"]
+casefold = ["dep:caseless"]
 
 [dependencies]
 rustpython-wtf8 = { workspace = true }
diff --git a/crates/unicode/src/data.rs b/crates/unicode/src/data.rs
index 28d9e0b9553..5c9a48a071a 100644
--- a/crates/unicode/src/data.rs
+++ b/crates/unicode/src/data.rs
@@ -9,7 +9,7 @@ use icu_properties::{
 };
 use itertools::Itertools;
 use ucd::{Codepoint, DecompositionType, Number, NumericType};
-use unic_ucd_age::{Age, UnicodeVersion};
+use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
 
 use crate::{char_from_codepoint, classify, is_surrogate};
 
@@ -19,6 +19,104 @@ pub enum NumericValue {
     Rational(i64, i64),
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Ucd {
+    unic_version: UnicodeVersion,
+}
+
+impl Default for Ucd {
+    fn default() -> Self {
+        Self::new(UNICODE_VERSION)
+    }
+}
+
+impl Ucd {
+    pub const fn new(unic_version: UnicodeVersion) -> Self {
+        Self { unic_version }
+    }
+
+    pub const fn unicode_version(&self) -> UnicodeVersion {
+        self.unic_version
+    }
+
+    pub fn category(&self, cp: u32) -> &'static str {
+        if self.contains(cp) {
+            category(cp)
+        } else {
+            "Cn"
+        }
+    }
+
+    pub fn lookup(&self, name: &str) -> Option<u32> {
+        let cp = lookup(name)?;
+        self.contains(cp).then_some(cp)
+    }
+
+    pub fn name(&self, cp: u32) -> Option<String> {
+        self.contains(cp).then(|| name(cp)).flatten()
+    }
+
+    pub fn bidirectional(&self, cp: u32) -> &'static str {
+        if self.contains(cp) {
+            bidirectional(cp)
+        } else {
+            ""
+        }
+    }
+
+    pub fn east_asian_width(&self, cp: u32) -> &'static str {
+        if self.contains(cp) {
+            east_asian_width(cp)
+        } else {
+            "N"
+        }
+    }
+
+    pub fn normalize(
+        &self,
+        form: crate::NormalizeForm,
+        text: &rustpython_wtf8::Wtf8,
+    ) -> rustpython_wtf8::Wtf8Buf {
+        crate::normalize::normalize(form, text)
+    }
+
+    pub fn is_normalized(&self, form: crate::NormalizeForm, text: &rustpython_wtf8::Wtf8) -> bool {
+        crate::normalize::is_normalized(form, text)
+    }
+
+    pub fn mirrored(&self, cp: u32) -> bool {
+        self.contains(cp) && mirrored(cp)
+    }
+
+    pub fn combining(&self, cp: u32) -> u8 {
+        if self.contains(cp) { combining(cp) } else { 0 }
+    }
+
+    pub fn decomposition(&self, cp: u32) -> String {
+        if self.contains(cp) {
+            decomposition(cp)
+        } else {
+            String::new()
+        }
+    }
+
+    pub fn digit(&self, cp: u32) -> Option<u32> {
+        self.contains(cp).then(|| digit(cp)).flatten()
+    }
+
+    pub fn decimal(&self, cp: u32) -> Option<u32> {
+        self.contains(cp).then(|| decimal(cp)).flatten()
+    }
+
+    pub fn numeric(&self, cp: u32) -> Option<NumericValue> {
+        self.contains(cp).then(|| numeric(cp)).flatten()
+    }
+
+    fn contains(&self, cp: u32) -> bool {
+        is_assigned_in_version(cp, self.unic_version)
+    }
+}
+
 pub fn is_assigned_in_version(cp: u32, version: UnicodeVersion) -> bool {
     if is_surrogate(cp) {
         true
diff --git a/crates/unicode/src/lib.rs b/crates/unicode/src/lib.rs
index 7ac71be12f4..6ccc9fb3307 100644
--- a/crates/unicode/src/lib.rs
+++ b/crates/unicode/src/lib.rs
@@ -1,4 +1,4 @@
-#![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(not(feature = "casefold"), no_std)]
 
 extern crate alloc;
 
diff --git a/crates/unicode/src/normalize.rs b/crates/unicode/src/normalize.rs
index a5a53504262..702f27569a1 100644
--- a/crates/unicode/src/normalize.rs
+++ b/crates/unicode/src/normalize.rs
@@ -1,3 +1,4 @@
+use core::str::FromStr;
 use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
 use rustpython_wtf8::{Wtf8, Wtf8Buf};
 
@@ -9,6 +10,20 @@ pub enum NormalizeForm {
     Nfkd,
 }
 
+impl FromStr for NormalizeForm {
+    type Err = ();
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "NFC" => Ok(Self::Nfc),
+            "NFKC" => Ok(Self::Nfkc),
+            "NFD" => Ok(Self::Nfd),
+            "NFKD" => Ok(Self::Nfkd),
+            _ => Err(()),
+        }
+    }
+}
+
 pub fn normalize(form: NormalizeForm, text: &Wtf8) -> Wtf8Buf {
     match form {
         NormalizeForm::Nfc => {
diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml
index 22bf943fd1c..d1c52d9e40b 100644
--- a/crates/vm/Cargo.toml
+++ b/crates/vm/Cargo.toml
@@ -41,7 +41,7 @@ ruff_text_size = { workspace = true, optional = true }
 rustpython-compiler-core = { workspace = true }
 rustpython-literal = { workspace = true }
 rustpython-sre_engine = { workspace = true }
-rustpython-unicode = { workspace = true, features = ["std", "casefold"] }
+rustpython-unicode = { workspace = true, features = ["casefold"] }
 
 ascii = { workspace = true }
 ahash = { workspace = true }

From 2934897035a4a169bd6a669ed59972725e13e64a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 5 Apr 2026 08:54:41 +0000
Subject: [PATCH 6/7] Polish unicode follow-up review fixes

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 crates/stdlib/src/unicodedata.rs | 2 +-
 crates/unicode/Cargo.toml        | 5 +++--
 crates/unicode/src/lib.rs        | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs
index 2633ba4c0a0..d563021e22d 100644
--- a/crates/stdlib/src/unicodedata.rs
+++ b/crates/stdlib/src/unicodedata.rs
@@ -67,7 +67,7 @@ mod unicodedata {
                 .as_wtf8()
                 .code_points()
                 .exactly_one()
-                .map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))
+                .map_err(|_| vm.new_type_error("argument must be a Unicode character, not str"))
         }
     }
 
diff --git a/crates/unicode/Cargo.toml b/crates/unicode/Cargo.toml
index ff59bb0dd77..51fc781e790 100644
--- a/crates/unicode/Cargo.toml
+++ b/crates/unicode/Cargo.toml
@@ -9,8 +9,9 @@ repository.workspace = true
 license.workspace = true
 
 [features]
-default = ["casefold"]
-casefold = ["dep:caseless"]
+default = ["std", "casefold"]
+std = []
+casefold = ["std", "dep:caseless"]
 
 [dependencies]
 rustpython-wtf8 = { workspace = true }
diff --git a/crates/unicode/src/lib.rs b/crates/unicode/src/lib.rs
index 6ccc9fb3307..7ac71be12f4 100644
--- a/crates/unicode/src/lib.rs
+++ b/crates/unicode/src/lib.rs
@@ -1,4 +1,4 @@
-#![cfg_attr(not(feature = "casefold"), no_std)]
+#![cfg_attr(not(feature = "std"), no_std)]
 
 extern crate alloc;
 

From 4efa5da5f68851b7b5c28fc53355590266d51db9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 6 Apr 2026 02:07:56 +0000
Subject: [PATCH 7/7] Simplify unicode regex call sites

Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/b894057a-9bed-4f35-8400-a5731c63602d

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
---
 Cargo.lock                      |   1 -
 crates/literal/src/char.rs      |  12 ----
 crates/literal/src/escape.rs    |   6 +-
 crates/literal/src/lib.rs       |   1 -
 crates/sre_engine/src/engine.rs | 108 ++++++++++++++++++--------------
 crates/sre_engine/src/string.rs |  59 -----------------
 crates/vm/Cargo.toml            |   1 -
 crates/vm/src/stdlib/_sre.rs    |  12 ++--
 8 files changed, 69 insertions(+), 131 deletions(-)
 delete mode 100644 crates/literal/src/char.rs

diff --git a/Cargo.lock b/Cargo.lock
index 0e272c3eaf7..6ac26758e5b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3457,7 +3457,6 @@ dependencies = [
  "ascii",
  "bitflags 2.11.0",
  "bstr",
- "caseless",
  "cfg-if",
  "chrono",
  "constant_time_eq",
diff --git a/crates/literal/src/char.rs b/crates/literal/src/char.rs
deleted file mode 100644
index 4544133d3b6..00000000000
--- a/crates/literal/src/char.rs
+++ /dev/null
@@ -1,12 +0,0 @@
-/// According to python following categories aren't printable:
-/// * Cc (Other, Control)
-/// * Cf (Other, Format)
-/// * Cs (Other, Surrogate)
-/// * Co (Other, Private Use)
-/// * Cn (Other, Not Assigned)
-/// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
-/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
-/// * Zs (Separator, Space) other than ASCII space('\x20').
-pub fn is_printable(c: char) -> bool {
-    rustpython_unicode::classify::is_repr_printable(c as u32)
-}
diff --git a/crates/literal/src/escape.rs b/crates/literal/src/escape.rs
index 1099c0a02bc..01df100a004 100644
--- a/crates/literal/src/escape.rs
+++ b/crates/literal/src/escape.rs
@@ -204,7 +204,7 @@ impl UnicodeEscape<'_> {
             '\\' | '\t' | '\r' | '\n' => 2,
             ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH
             ch if ch.is_ascii() => 1,
-            ch if crate::char::is_printable(ch) => {
+            ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => {
                 // max = std::cmp::max(ch, max);
                 ch.len_utf8()
             }
@@ -238,7 +238,9 @@ impl UnicodeEscape<'_> {
             ch if ch.is_ascii() => {
                 write!(formatter, "\\x{:02x}", ch as u8)
             }
-            ch if crate::char::is_printable(ch) => formatter.write_char(ch),
+            ch if rustpython_unicode::classify::is_repr_printable(ch as u32) => {
+                formatter.write_char(ch)
+            }
             '\0'..='\u{ff}' => {
                 write!(formatter, "\\x{:02x}", ch as u32)
             }
diff --git a/crates/literal/src/lib.rs b/crates/literal/src/lib.rs
index a863dd87738..6d520900142 100644
--- a/crates/literal/src/lib.rs
+++ b/crates/literal/src/lib.rs
@@ -2,7 +2,6 @@
 
 extern crate alloc;
 
-pub mod char;
 pub mod complex;
 pub mod escape;
 pub mod float;
diff --git a/crates/sre_engine/src/engine.rs b/crates/sre_engine/src/engine.rs
index 73e263012fc..c23d3477fbb 100644
--- a/crates/sre_engine/src/engine.rs
+++ b/crates/sre_engine/src/engine.rs
@@ -1,14 +1,10 @@
 // good luck to those that follow; here be dragons
 
-use crate::string::{
-    is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space,
-    is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode,
-};
-
 use super::{MAXREPEAT, SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor};
 use alloc::{vec, vec::Vec};
 use core::{convert::TryFrom, ptr::null};
 use optional::Optioned;
+use rustpython_unicode::regex as unicode_regex;
 
 #[derive(Debug, Clone, Copy)]
 pub struct Request<'a, S> {
@@ -659,10 +655,10 @@ fn _match<S: StrDrive>(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo
                         }
                         SreOpcode::IN => general_op_in!(charset),
                         SreOpcode::IN_IGNORE => {
-                            general_op_in!(|set, c| charset(set, lower_ascii(c)))
+                            general_op_in!(|set, c| charset(set, unicode_regex::lower_ascii(c)))
                         }
                         SreOpcode::IN_UNI_IGNORE => {
-                            general_op_in!(|set, c| charset(set, lower_unicode(c)))
+                            general_op_in!(|set, c| charset(set, unicode_regex::lower_unicode(c)))
                         }
                         SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore),
                         SreOpcode::MARK => {
@@ -803,25 +799,31 @@ fn _match<S: StrDrive>(req: &Request<'_, S>, state: &mut State, mut ctx: MatchCo
                         SreOpcode::LITERAL => general_op_literal!(|code, c| code == c),
                         SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c),
                         SreOpcode::LITERAL_IGNORE => {
-                            general_op_literal!(|code, c| code == lower_ascii(c))
+                            general_op_literal!(|code, c| code == unicode_regex::lower_ascii(c))
                         }
                         SreOpcode::NOT_LITERAL_IGNORE => {
-                            general_op_literal!(|code, c| code != lower_ascii(c))
+                            general_op_literal!(|code, c| code != unicode_regex::lower_ascii(c))
                         }
                         SreOpcode::LITERAL_UNI_IGNORE => {
-                            general_op_literal!(|code, c| code == lower_unicode(c))
+                            general_op_literal!(|code, c| code == unicode_regex::lower_unicode(c))
                         }
                         SreOpcode::NOT_LITERAL_UNI_IGNORE => {
-                            general_op_literal!(|code, c| code != lower_unicode(c))
+                            general_op_literal!(|code, c| code != unicode_regex::lower_unicode(c))
                         }
                         SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore),
                         SreOpcode::NOT_LITERAL_LOC_IGNORE => {
                             general_op_literal!(|code, c| !char_loc_ignore(code, c))
                         }
                         SreOpcode::GROUPREF => general_op_groupref!(|x| x),
-                        SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii),
-                        SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate),
-                        SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode),
+                        SreOpcode::GROUPREF_IGNORE => {
+                            general_op_groupref!(unicode_regex::lower_ascii)
+                        }
+                        SreOpcode::GROUPREF_LOC_IGNORE => {
+                            general_op_groupref!(unicode_regex::lower_locale)
+                        }
+                        SreOpcode::GROUPREF_UNI_IGNORE => {
+                            general_op_groupref!(unicode_regex::lower_unicode)
+                        }
                         SreOpcode::GROUPREF_EXISTS => {
                             let (group_start, group_end) =
                                 state.marks.get(ctx.peek_code(req, 1) as usize);
@@ -1125,7 +1127,7 @@ impl MatchContext {
     }
 
     fn at_linebreak<S: StrDrive>(&self, req: &Request<'_, S>) -> bool {
-        !self.at_end(req) && is_linebreak(self.peek_char::<S>())
+        !self.at_end(req) && unicode_regex::is_linebreak(self.peek_char::<S>())
     }
 
     fn at_boundary<S: StrDrive, F: FnMut(u32) -> bool>(
@@ -1192,54 +1194,56 @@ impl MatchContext {
 fn at<S: StrDrive>(req: &Request<'_, S>, ctx: &MatchContext, at_code: SreAtCode) -> bool {
     match at_code {
         SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(),
-        SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::<S>()),
-        SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word),
-        SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word),
+        SreAtCode::BEGINNING_LINE => {
+            ctx.at_beginning() || unicode_regex::is_linebreak(ctx.back_peek_char::<S>())
+        }
+        SreAtCode::BOUNDARY => ctx.at_boundary(req, unicode_regex::is_word),
+        SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_word),
         SreAtCode::END => {
             (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req)
         }
         SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req),
         SreAtCode::END_STRING => ctx.at_end(req),
-        SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word),
-        SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word),
-        SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word),
-        SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word),
+        SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_locale_word),
+        SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_locale_word),
+        SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, unicode_regex::is_unicode_word),
+        SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, unicode_regex::is_unicode_word),
     }
 }
 
 fn char_loc_ignore(code: u32, c: u32) -> bool {
-    code == c || code == lower_locate(c) || code == upper_locate(c)
+    code == c || code == unicode_regex::lower_locale(c) || code == unicode_regex::upper_locale(c)
 }
 
 fn charset_loc_ignore(set: &[u32], c: u32) -> bool {
-    let lo = lower_locate(c);
+    let lo = unicode_regex::lower_locale(c);
     if charset(set, c) {
         return true;
     }
-    let up = upper_locate(c);
+    let up = unicode_regex::upper_locale(c);
     up != lo && charset(set, up)
 }
 
 fn category(cat_code: SreCatCode, c: u32) -> bool {
     match cat_code {
-        SreCatCode::DIGIT => is_digit(c),
-        SreCatCode::NOT_DIGIT => !is_digit(c),
-        SreCatCode::SPACE => is_space(c),
-        SreCatCode::NOT_SPACE => !is_space(c),
-        SreCatCode::WORD => is_word(c),
-        SreCatCode::NOT_WORD => !is_word(c),
-        SreCatCode::LINEBREAK => is_linebreak(c),
-        SreCatCode::NOT_LINEBREAK => !is_linebreak(c),
-        SreCatCode::LOC_WORD => is_loc_word(c),
-        SreCatCode::LOC_NOT_WORD => !is_loc_word(c),
-        SreCatCode::UNI_DIGIT => is_uni_digit(c),
-        SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c),
-        SreCatCode::UNI_SPACE => is_uni_space(c),
-        SreCatCode::UNI_NOT_SPACE => !is_uni_space(c),
-        SreCatCode::UNI_WORD => is_uni_word(c),
-        SreCatCode::UNI_NOT_WORD => !is_uni_word(c),
-        SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c),
-        SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c),
+        SreCatCode::DIGIT => unicode_regex::is_digit(c),
+        SreCatCode::NOT_DIGIT => !unicode_regex::is_digit(c),
+        SreCatCode::SPACE => unicode_regex::is_space(c),
+        SreCatCode::NOT_SPACE => !unicode_regex::is_space(c),
+        SreCatCode::WORD => unicode_regex::is_word(c),
+        SreCatCode::NOT_WORD => !unicode_regex::is_word(c),
+        SreCatCode::LINEBREAK => unicode_regex::is_linebreak(c),
+        SreCatCode::NOT_LINEBREAK => !unicode_regex::is_linebreak(c),
+        SreCatCode::LOC_WORD => unicode_regex::is_locale_word(c),
+        SreCatCode::LOC_NOT_WORD => !unicode_regex::is_locale_word(c),
+        SreCatCode::UNI_DIGIT => unicode_regex::is_unicode_digit(c),
+        SreCatCode::UNI_NOT_DIGIT => !unicode_regex::is_unicode_digit(c),
+        SreCatCode::UNI_SPACE => unicode_regex::is_unicode_space(c),
+        SreCatCode::UNI_NOT_SPACE => !unicode_regex::is_unicode_space(c),
+        SreCatCode::UNI_WORD => unicode_regex::is_unicode_word(c),
+        SreCatCode::UNI_NOT_WORD => !unicode_regex::is_unicode_word(c),
+        SreCatCode::UNI_LINEBREAK => unicode_regex::is_unicode_linebreak(c),
+        SreCatCode::UNI_NOT_LINEBREAK => !unicode_regex::is_unicode_linebreak(c),
     }
 }
 
@@ -1320,7 +1324,7 @@ fn charset(set: &[u32], ch: u32) -> bool {
                 if set[i + 1] <= ch && ch <= set[i + 2] {
                     return ok;
                 }
-                let ch = upper_unicode(ch);
+                let ch = unicode_regex::upper_unicode(ch);
                 if set[i + 1] <= ch && ch <= set[i + 2] {
                     return ok;
                 }
@@ -1368,10 +1372,14 @@ fn _count<S: StrDrive>(
             general_count_literal(req, ctx, end, |code, c| code != c);
         }
         SreOpcode::LITERAL_IGNORE => {
-            general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c));
+            general_count_literal(req, ctx, end, |code, c| {
+                code == unicode_regex::lower_ascii(c)
+            });
         }
         SreOpcode::NOT_LITERAL_IGNORE => {
-            general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c));
+            general_count_literal(req, ctx, end, |code, c| {
+                code != unicode_regex::lower_ascii(c)
+            });
         }
         SreOpcode::LITERAL_LOC_IGNORE => {
             general_count_literal(req, ctx, end, char_loc_ignore);
@@ -1380,10 +1388,14 @@ fn _count<S: StrDrive>(
             general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c));
         }
         SreOpcode::LITERAL_UNI_IGNORE => {
-            general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c));
+            general_count_literal(req, ctx, end, |code, c| {
+                code == unicode_regex::lower_unicode(c)
+            });
         }
         SreOpcode::NOT_LITERAL_UNI_IGNORE => {
-            general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c));
+            general_count_literal(req, ctx, end, |code, c| {
+                code != unicode_regex::lower_unicode(c)
+            });
         }
         _ => {
             /* General case */
diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs
index b2333330a46..0c548ded214 100644
--- a/crates/sre_engine/src/string.rs
+++ b/crates/sre_engine/src/string.rs
@@ -331,62 +331,3 @@ const fn utf8_is_cont_byte(byte: u8) -> bool {
 
 /// Mask of the value bits of a continuation byte.
 const CONT_MASK: u8 = 0b0011_1111;
-
-#[inline]
-pub(crate) fn is_word(ch: u32) -> bool {
-    rustpython_unicode::regex::is_word(ch)
-}
-#[inline]
-pub(crate) fn is_space(ch: u32) -> bool {
-    rustpython_unicode::regex::is_space(ch)
-}
-#[inline]
-pub(crate) fn is_digit(ch: u32) -> bool {
-    rustpython_unicode::regex::is_digit(ch)
-}
-#[inline]
-pub(crate) fn is_loc_word(ch: u32) -> bool {
-    rustpython_unicode::regex::is_locale_word(ch)
-}
-#[inline]
-pub(crate) const fn is_linebreak(ch: u32) -> bool {
-    rustpython_unicode::regex::is_linebreak(ch)
-}
-#[inline]
-pub fn lower_ascii(ch: u32) -> u32 {
-    rustpython_unicode::regex::lower_ascii(ch)
-}
-#[inline]
-pub(crate) fn lower_locate(ch: u32) -> u32 {
-    // FIXME: Ignore the locales
-    rustpython_unicode::regex::lower_locale(ch)
-}
-#[inline]
-pub(crate) fn upper_locate(ch: u32) -> u32 {
-    // FIXME: Ignore the locales
-    rustpython_unicode::regex::upper_locale(ch)
-}
-#[inline]
-pub(crate) fn is_uni_digit(ch: u32) -> bool {
-    rustpython_unicode::regex::is_unicode_digit(ch)
-}
-#[inline]
-pub(crate) fn is_uni_space(ch: u32) -> bool {
-    rustpython_unicode::regex::is_unicode_space(ch)
-}
-#[inline]
-pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
-    rustpython_unicode::regex::is_unicode_linebreak(ch)
-}
-#[inline]
-pub(crate) fn is_uni_word(ch: u32) -> bool {
-    rustpython_unicode::regex::is_unicode_word(ch)
-}
-#[inline]
-pub fn lower_unicode(ch: u32) -> u32 {
-    rustpython_unicode::regex::lower_unicode(ch)
-}
-#[inline]
-pub fn upper_unicode(ch: u32) -> u32 {
-    rustpython_unicode::regex::upper_unicode(ch)
-}
diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml
index d1c52d9e40b..4f097ff03d4 100644
--- a/crates/vm/Cargo.toml
+++ b/crates/vm/Cargo.toml
@@ -75,7 +75,6 @@ strum_macros = { workspace = true }
 thiserror = { workspace = true }
 memchr = { workspace = true }
 
-caseless = "0.2.2"
 flamer = { version = "0.5", optional = true }
 half = "2"
 psm = "0.1"
diff --git a/crates/vm/src/stdlib/_sre.rs b/crates/vm/src/stdlib/_sre.rs
index ba7044fb5a9..19d114b50a3 100644
--- a/crates/vm/src/stdlib/_sre.rs
+++ b/crates/vm/src/stdlib/_sre.rs
@@ -21,10 +21,8 @@ mod _sre {
     use crossbeam_utils::atomic::AtomicCell;
     use itertools::Itertools;
     use num_traits::ToPrimitive;
-    use rustpython_sre_engine::{
-        Request, SearchIter, SreFlag, State, StrDrive,
-        string::{lower_ascii, lower_unicode, upper_unicode},
-    };
+    use rustpython_sre_engine::{Request, SearchIter, SreFlag, State, StrDrive};
+    use rustpython_unicode::regex as unicode_regex;
 
     #[pyattr]
     pub use rustpython_sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC};
@@ -42,17 +40,17 @@ mod _sre {
     #[pyfunction]
     fn unicode_iscased(ch: i32) -> bool {
         let ch = ch as u32;
-        ch != lower_unicode(ch) || ch != upper_unicode(ch)
+        ch != unicode_regex::lower_unicode(ch) || ch != unicode_regex::upper_unicode(ch)
     }
 
     #[pyfunction]
     fn ascii_tolower(ch: i32) -> i32 {
-        lower_ascii(ch as u32) as i32
+        unicode_regex::lower_ascii(ch as u32) as i32
     }
 
     #[pyfunction]
     fn unicode_tolower(ch: i32) -> i32 {
-        lower_unicode(ch as u32) as i32
+        unicode_regex::lower_unicode(ch as u32) as i32
     }
 
     trait SreStr: StrDrive {