Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Finish unicode crate follow-up refactors
Agent-Logs-Url: https://github.com/RustPython/RustPython/sessions/1d30ae08-d8f0-431c-9299-8aea5c21f7d4

Co-authored-by: youknowone <69878+youknowone@users.noreply.github.com>
  • Loading branch information
Copilot and youknowone authored Apr 5, 2026
commit 0a340de9c30e00c6794464104397ef021244aeab
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion crates/codegen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ std = ["thiserror/std", "itertools/use_std"]

[dependencies]
rustpython-compiler-core = { workspace = true }
rustpython-unicode = { workspace = true, default-features = false }
rustpython-literal = {workspace = true }
rustpython-wtf8 = { workspace = true }
ruff_python_ast = { workspace = true }
Expand All @@ -29,7 +30,6 @@ num-traits = { workspace = true }
thiserror = { workspace = true }
malachite-bigint = { workspace = true }
memchr = { workspace = true }
unicode_names2 = { workspace = true }

[dev-dependencies]
ruff_python_parser = { workspace = true }
Expand Down
4 changes: 3 additions & 1 deletion crates/codegen/src/string_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,9 @@ impl StringParser {
let name_and_ending = self.skip_bytes(close_idx + 1);
let name = &name_and_ending[..name_and_ending.len() - 1];

unicode_names2::character(name).ok_or_else(|| unreachable!())
rustpython_unicode::data::lookup(name)
.and_then(char::from_u32)
.ok_or_else(|| unreachable!())
}

/// Parse an escaped character, returning the new character.
Expand Down
2 changes: 1 addition & 1 deletion crates/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ wasm_js = ["getrandom/wasm_js"]

[dependencies]
rustpython-literal = { workspace = true }
rustpython-unicode = { workspace = true, default-features = false }
rustpython-wtf8 = { workspace = true }

ascii = { workspace = true }
Expand All @@ -29,7 +30,6 @@ malachite-q = { workspace = true }
malachite-base = { workspace = true }
num-traits = { workspace = true }
parking_lot = { workspace = true, optional = true }
unicode_names2 = { workspace = true }
radium = { workspace = true }

lock_api = "0.4"
Expand Down
2 changes: 1 addition & 1 deletion crates/common/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ pub mod errors {
let mut out = String::with_capacity(num_chars * 4);
for c in err_str.code_points() {
let c_u32 = c.to_u32();
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
if let Some(c_name) = rustpython_unicode::data::name(c_u32) {
write!(out, "\\N{{{c_name}}}").unwrap();
} else if c_u32 >= 0x10000 {
write!(out, "\\U{c_u32:08x}").unwrap();
Expand Down
2 changes: 1 addition & 1 deletion crates/stdlib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ flame-it = ["flame"]
[dependencies]
# rustpython crates
rustpython-derive = { workspace = true }
rustpython-unicode = { workspace = true, features = ["std", "casefold"] }
rustpython-unicode = { workspace = true, features = ["casefold"] }
rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
rustpython-common = { workspace = true }

Expand Down
163 changes: 58 additions & 105 deletions crates/stdlib/src/unicodedata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,6 @@

pub(crate) use unicodedata::module_def;

use crate::vm::{
PyObject, PyResult, VirtualMachine, builtins::PyStr, convert::TryFromBorrowedObject,
};

/// Normalization form requested by a Python caller of this module.
///
/// The variants carry no data; each one only names a form. Within this file
/// the enum is built from a Python string argument and then converted into
/// `rustpython_unicode::NormalizeForm`.
enum NormalizeForm {
/// Selected by the string `"NFC"`.
Nfc,
/// Selected by the string `"NFKC"`.
Nfkc,
/// Selected by the string `"NFD"`.
Nfd,
/// Selected by the string `"NFKD"`.
Nfkd,
}

/// Converts this module's `NormalizeForm` into
/// `rustpython_unicode::NormalizeForm` by mapping every variant to the
/// variant of the same name on the target type.
impl From<NormalizeForm> for rustpython_unicode::NormalizeForm {
/// Returns the `rustpython_unicode::NormalizeForm` variant that shares its
/// name with `value`. The conversion is infallible.
fn from(value: NormalizeForm) -> Self {
// No wildcard arm: the match is exhaustive over the local enum, so adding
// a variant there fails to compile until it is mapped here as well.
match value {
NormalizeForm::Nfc => Self::Nfc,
NormalizeForm::Nfkc => Self::Nfkc,
NormalizeForm::Nfd => Self::Nfd,
NormalizeForm::Nfkd => Self::Nfkd,
}
}
}

/// Lets a `NormalizeForm` be extracted from a borrowed Python object.
impl<'a> TryFromBorrowedObject<'a> for NormalizeForm {
/// Reads `obj` as a `PyStr` and matches its bytes against the four accepted
/// names.
///
/// The comparison is an exact byte match, so only the upper-case spellings
/// `"NFC"`, `"NFKC"`, `"NFD"` and `"NFKD"` are accepted.
///
/// # Errors
///
/// Returns a value error with the message `invalid normalization form` for
/// any other string. Errors produced by `try_value_with` itself are
/// propagated unchanged (presumably raised when `obj` is not a `str` —
/// confirm against `try_value_with`).
fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
obj.try_value_with(
|form: &PyStr| match form.as_bytes() {
b"NFC" => Ok(Self::Nfc),
b"NFKC" => Ok(Self::Nfkc),
b"NFD" => Ok(Self::Nfd),
b"NFKD" => Ok(Self::Nfkd),
// Anything else, including lower-case or padded spellings, is rejected.
_ => Err(vm.new_value_error("invalid normalization form")),
},
vm,
)
}
}

#[pymodule]
mod unicodedata {
use crate::vm::{
Expand All @@ -53,13 +16,20 @@ mod unicodedata {

use itertools::Itertools;
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
use rustpython_unicode::{UNICODE_VERSION, UnicodeVersion, data, normalize};
use rustpython_unicode::{NormalizeForm, UNICODE_VERSION, UnicodeVersion, data};

/// Parses the Python string `form` into a `NormalizeForm`.
///
/// The string is first viewed through `to_str()` and then handed to `parse()`,
/// i.e. the `FromStr` implementation of `NormalizeForm`, whose error type is
/// `()`.
///
/// # Errors
///
/// Returns a value error with the message `invalid normalization form` both
/// when `to_str()` yields `None` and when `parse()` fails; the two cases are
/// indistinguishable to the caller.
fn parse_normalize_form(form: PyStrRef, vm: &VirtualMachine) -> PyResult<NormalizeForm> {
form.to_str()
// `to_str()` returned `None`: report it exactly like an unknown form name.
.ok_or_else(|| vm.new_value_error("invalid normalization form"))?
.parse()
// The parse error is `()`, so there is no detail to forward.
.map_err(|()| vm.new_value_error("invalid normalization form"))
}

pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
__module_exec(vm, module);

// Add UCD methods as module-level functions
let ucd: PyObjectRef = Ucd::new(UNICODE_VERSION).into_ref(&vm.ctx).into();
let ucd: PyObjectRef = PyUcd::new(data::Ucd::default()).into_ref(&vm.ctx).into();

for attr in [
"category",
Expand All @@ -85,49 +55,36 @@ mod unicodedata {
#[pyattr]
#[pyclass(name = "UCD")]
#[derive(Debug, PyPayload)]
pub(super) struct Ucd {
unic_version: UnicodeVersion,
}

impl Ucd {
pub const fn new(unic_version: UnicodeVersion) -> Self {
Self { unic_version }
}
pub(super) struct PyUcd(data::Ucd);

fn check_age(&self, c: CodePoint) -> bool {
data::is_assigned_in_version(c.to_u32(), self.unic_version)
impl PyUcd {
pub const fn new(ucd: data::Ucd) -> Self {
Self(ucd)
}

fn extract_char(
&self,
character: PyStrRef,
vm: &VirtualMachine,
) -> PyResult<Option<CodePoint>> {
let c = character
fn extract_char(character: PyStrRef, vm: &VirtualMachine) -> PyResult<CodePoint> {
character
.as_wtf8()
.code_points()
.exactly_one()
.map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))?;

Ok(self.check_age(c).then_some(c))
.map_err(|_| vm.new_type_error("argument must be an unicode character, not str"))
}
}

#[pyclass(flags(DISALLOW_INSTANTIATION))]
impl Ucd {
impl PyUcd {
#[pymethod]
fn category(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
Ok(self
.extract_char(character, vm)?
.map_or("Cn", |c| data::category(c.to_u32()))
.0
.category(Self::extract_char(character, vm)?.to_u32())
.to_owned())
}

#[pymethod]
fn lookup(&self, name: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
if let Some(name_str) = name.to_str()
&& let Some(character) = data::lookup(name_str)
&& self.check_age(CodePoint::from_u32(character).expect("valid Unicode code point"))
&& let Some(character) = self.0.lookup(name_str)
{
return Ok(char::from_u32(character)
.expect("unicode_names2 only returns Unicode scalar values")
Expand All @@ -147,12 +104,7 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine,
) -> PyResult {
let c = self.extract_char(character, vm)?;

if let Some(c) = c
&& self.check_age(c)
&& let Some(name) = data::name(c.to_u32())
{
if let Some(name) = self.0.name(Self::extract_char(character, vm)?.to_u32()) {
return Ok(vm.ctx.new_str(name).into());
}
default.ok_or_else(|| vm.new_value_error("no such name"))
Expand All @@ -165,8 +117,8 @@ mod unicodedata {
vm: &VirtualMachine,
) -> PyResult<&'static str> {
Ok(self
.extract_char(character, vm)?
.map_or("", |c| data::bidirectional(c.to_u32())))
.0
.bidirectional(Self::extract_char(character, vm)?.to_u32()))
}

/// NOTE: This function uses 9.0.0 database instead of 3.2.0
Expand All @@ -177,39 +129,51 @@ mod unicodedata {
vm: &VirtualMachine,
) -> PyResult<&'static str> {
Ok(self
.extract_char(character, vm)?
.map_or("N", |c| data::east_asian_width(c.to_u32())))
.0
.east_asian_width(Self::extract_char(character, vm)?.to_u32()))
}

#[pymethod]
fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
Ok(normalize::normalize(form.into(), unistr.as_wtf8()))
fn normalize(
&self,
form: PyStrRef,
unistr: PyStrRef,
vm: &VirtualMachine,
) -> PyResult<Wtf8Buf> {
Ok(self
.0
.normalize(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
}

#[pymethod]
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
Ok(normalize::is_normalized(form.into(), unistr.as_wtf8()))
fn is_normalized(
&self,
form: PyStrRef,
unistr: PyStrRef,
vm: &VirtualMachine,
) -> PyResult<bool> {
Ok(self
.0
.is_normalized(parse_normalize_form(form, vm)?, unistr.as_wtf8()))
}

#[pymethod]
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
Ok(self
.extract_char(character, vm)?
.is_some_and(|c| data::mirrored(c.to_u32())) as i32)
Ok(self.0.mirrored(Self::extract_char(character, vm)?.to_u32()) as i32)
}

#[pymethod]
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<u8> {
Ok(self
.extract_char(character, vm)?
.map_or(0, |c| data::combining(c.to_u32())))
.0
.combining(Self::extract_char(character, vm)?.to_u32()))
}

#[pymethod]
fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
Ok(self
.extract_char(character, vm)?
.map_or_else(String::new, |c| data::decomposition(c.to_u32())))
.0
.decomposition(Self::extract_char(character, vm)?.to_u32()))
}

#[pymethod]
Expand All @@ -219,10 +183,7 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine,
) -> PyResult {
if let Some(value) = self
.extract_char(character, vm)?
.and_then(|c| data::digit(c.to_u32()))
{
if let Some(value) = self.0.digit(Self::extract_char(character, vm)?.to_u32()) {
return Ok(vm.ctx.new_int(value).into());
}
default.ok_or_else(|| vm.new_value_error("not a digit"))
Expand All @@ -235,10 +196,7 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine,
) -> PyResult {
if let Some(value) = self
.extract_char(character, vm)?
.and_then(|c| data::decimal(c.to_u32()))
{
if let Some(value) = self.0.decimal(Self::extract_char(character, vm)?.to_u32()) {
return Ok(vm.ctx.new_int(value).into());
}
default.ok_or_else(|| vm.new_value_error("not a decimal"))
Expand All @@ -251,10 +209,7 @@ mod unicodedata {
default: OptionalArg<PyObjectRef>,
vm: &VirtualMachine,
) -> PyResult {
if let Some(value) = self
.extract_char(character, vm)?
.and_then(|c| data::numeric(c.to_u32()))
{
if let Some(value) = self.0.numeric(Self::extract_char(character, vm)?.to_u32()) {
let value = match value {
data::NumericValue::Integer(n) => n as f64,
data::NumericValue::Rational(num, den) => num as f64 / den as f64,
Expand All @@ -266,19 +221,17 @@ mod unicodedata {

#[pygetset]
fn unidata_version(&self) -> String {
self.unic_version.to_string()
self.0.unicode_version().to_string()
}
}

#[pyattr]
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
Ucd {
unic_version: UnicodeVersion {
major: 3,
minor: 2,
micro: 0,
},
}
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<PyUcd> {
PyUcd::new(data::Ucd::new(UnicodeVersion {
major: 3,
minor: 2,
micro: 0,
}))
.into_ref(&vm.ctx)
}

Expand Down
5 changes: 2 additions & 3 deletions crates/unicode/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ repository.workspace = true
license.workspace = true

[features]
default = ["std", "casefold"]
std = []
casefold = ["std", "dep:caseless"]
default = ["casefold"]
casefold = ["dep:caseless"]

[dependencies]
rustpython-wtf8 = { workspace = true }
Expand Down
Loading
Loading