Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Implement missing unicodedata functions and fix lookup error type
Add combining, decomposition, digit, decimal, numeric methods to Ucd.
Change lookup() to raise KeyError instead of LookupError.
Remove expectedFailure markers from 9 passing tests.
Add unicodedata.is_normalized() method.
Rename decomp_chars to chars to fix spell check.
Remove expectedFailure from test_named_unicode_escapes and
test_urlsplit_normalization.
  • Loading branch information
youknowone committed Feb 14, 2026
commit 334936045dd49937f49fd558ffdcca24ac26f243
1 change: 0 additions & 1 deletion Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -851,7 +851,6 @@ def test_other_escapes(self):
with self.subTest(c):
self.assertRaises(re.PatternError, re.compile, '[\\%c]' % c)

@unittest.expectedFailure # TODO: RUSTPYTHON
def test_named_unicode_escapes(self):
# test individual Unicode named escapes
self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
Expand Down
1 change: 0 additions & 1 deletion Lib/test/test_ucn.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ def check_version(testfile):
with self.assertRaises(KeyError):
unicodedata.ucd_3_2_0.lookup(seqname)

@unittest.expectedFailure # TODO: RUSTPYTHON
def test_errors(self):
self.assertRaises(TypeError, unicodedata.name)
self.assertRaises(TypeError, unicodedata.name, 'xx')
Expand Down
8 changes: 0 additions & 8 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def test_no_names_in_pua(self):
char = chr(i)
self.assertRaises(ValueError, self.db.name, char)

@unittest.expectedFailure # TODO: RUSTPYTHON; LookupError: undefined character name 'LATIN SMLL LETR A'
def test_lookup_nonexistant(self):
# just make sure that lookup can fail
for nonexistent in [
Expand All @@ -133,7 +132,6 @@ def test_lookup_nonexistant(self):
]:
self.assertRaises(KeyError, self.db.lookup, nonexistent)

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit'
def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)
Expand All @@ -146,7 +144,6 @@ def test_digit(self):
self.assertRaises(TypeError, self.db.digit, 'xx')
self.assertRaises(ValueError, self.db.digit, 'x')

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'numeric'
def test_numeric(self):
self.assertEqual(self.db.numeric('A',None), None)
self.assertEqual(self.db.numeric('9'), 9)
Expand All @@ -160,7 +157,6 @@ def test_numeric(self):
self.assertRaises(TypeError, self.db.numeric, 'xx')
self.assertRaises(ValueError, self.db.numeric, 'x')

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decimal'
def test_decimal(self):
self.assertEqual(self.db.decimal('A',None), None)
self.assertEqual(self.db.decimal('9'), 9)
Expand Down Expand Up @@ -193,7 +189,6 @@ def test_bidirectional(self):
self.assertRaises(TypeError, self.db.bidirectional)
self.assertRaises(TypeError, self.db.bidirectional, 'xx')

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decomposition'
def test_decomposition(self):
self.assertEqual(self.db.decomposition('\uFFFE'),'')
self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
Expand All @@ -210,7 +205,6 @@ def test_mirrored(self):
self.assertRaises(TypeError, self.db.mirrored)
self.assertRaises(TypeError, self.db.mirrored, 'xx')

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'combining'
def test_combining(self):
self.assertEqual(self.db.combining('\uFFFE'), 0)
self.assertEqual(self.db.combining('a'), 0)
Expand Down Expand Up @@ -313,7 +307,6 @@ def test_failed_import_during_compiling(self):
"(can't load unicodedata module)"
self.assertIn(error, result.err.decode("ascii"))

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decimal'
def test_decimal_numeric_consistent(self):
# Test that decimal and numeric are consistent,
# i.e. if a character has a decimal value,
Expand All @@ -327,7 +320,6 @@ def test_decimal_numeric_consistent(self):
count += 1
self.assertTrue(count >= 10) # should have tested at least the ASCII digits

@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit'
def test_digit_numeric_consistent(self):
# Test that digit and numeric are consistent,
# i.e. if a character has a digit value,
Expand Down
1 change: 0 additions & 1 deletion Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1495,7 +1495,6 @@ def test_all(self):
expected.append(name)
self.assertCountEqual(urllib.parse.__all__, expected)

@unittest.expectedFailure # TODO: RUSTPYTHON
def test_urlsplit_normalization(self):
# Certain characters should never occur in the netloc,
# including under normalization.
Expand Down
140 changes: 137 additions & 3 deletions crates/stdlib/src/unicodedata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ mod unicodedata {
};
use itertools::Itertools;
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
use ucd::{Codepoint, EastAsianWidth};
use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType};
use unic_char_property::EnumeratedCharProperty;
use unic_normal::StrNormalForm;
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
Expand All @@ -62,9 +62,15 @@ mod unicodedata {
"lookup",
"name",
"bidirectional",
"combining",
"decimal",
"decomposition",
"digit",
"east_asian_width",
"normalize",
"is_normalized",
"mirrored",
"normalize",
"numeric",
] {
module.set_attr(attr, ucd.get_attr(attr, vm)?, vm)?;
}
Expand Down Expand Up @@ -125,7 +131,11 @@ mod unicodedata {
{
return Ok(character.to_string());
}
Err(vm.new_lookup_error(format!("undefined character name '{name}'")))
Err(vm.new_key_error(
vm.ctx
.new_str(format!("undefined character name '{name}'"))
.into(),
))
}

#[pymethod]
Expand Down Expand Up @@ -189,6 +199,19 @@ mod unicodedata {
Ok(normalized_text)
}

#[pymethod]
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
    use super::NormalizeForm::*;
    // A string is normalized iff normalizing it is a no-op, so build the
    // normalized form and compare. (Full pass every time; no quick-check
    // fast path.)
    let source = unistr.as_wtf8();
    let canonical: Wtf8Buf = match form {
        Nfc => source.map_utf8(|s| s.nfc()).collect(),
        Nfd => source.map_utf8(|s| s.nfd()).collect(),
        Nfkc => source.map_utf8(|s| s.nfkc()).collect(),
        Nfkd => source.map_utf8(|s| s.nfkd()).collect(),
    };
    Ok(source == &*canonical)
}

#[pymethod]
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
match self.extract_char(character, vm)? {
Expand All @@ -204,12 +227,123 @@ mod unicodedata {
}
}

#[pymethod]
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
    // Canonical combining class of the character; 0 when the code point
    // has no assigned class (e.g. lone surrogates that don't map to a
    // `char`), matching CPython's behavior.
    let code_point = self.extract_char(character, vm)?;
    match code_point.and_then(|cp| cp.to_char()) {
        Some(c) => Ok(c.canonical_combining_class() as i32),
        None => Ok(0),
    }
}

#[pymethod]
fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
    // Character decomposition mapping in the UnicodeData.txt field-5 format:
    // an optional "<tag>" followed by space-separated 4-digit uppercase hex
    // code points. Empty string when there is no decomposition.
    let Some(ch) = self.extract_char(character, vm)?.and_then(|c| c.to_char()) else {
        return Ok(String::new());
    };
    let mapped: Vec<char> = ch.decomposition_map().collect();
    // decomposition_map() yields the character itself when no decomposition
    // exists — report that case as the empty string.
    if mapped.len() == 1 && mapped[0] == ch {
        return Ok(String::new());
    }
    let codepoints = mapped
        .iter()
        .map(|c| format!("{:04X}", *c as u32))
        .join(" ");
    match ch.decomposition_type() {
        // Canonical (or untagged) decompositions carry no formatting tag.
        Some(DecompositionType::Canonical) | None => Ok(codepoints),
        Some(tag) => Ok(format!("<{}> {}", decomposition_type_tag(tag), codepoints)),
    }
}

#[pymethod]
fn digit(
    &self,
    character: PyStrRef,
    default: OptionalArg<PyObjectRef>,
    vm: &VirtualMachine,
) -> PyResult {
    // Digit value of the character (Numeric_Type = Decimal or Digit).
    // When the character has no digit value, return `default` if supplied,
    // otherwise raise ValueError — mirroring CPython's unicodedata.digit.
    if let Some(c) = self.extract_char(character, vm)?.and_then(|cp| cp.to_char()) {
        let has_digit_type = matches!(
            c.numeric_type(),
            Some(NumericType::Decimal) | Some(NumericType::Digit)
        );
        if has_digit_type {
            if let Some(Number::Integer(value)) = c.numeric_value() {
                return Ok(vm.ctx.new_int(value).into());
            }
        }
    }
    default.ok_or_else(|| vm.new_value_error("not a digit"))
}

#[pymethod]
fn decimal(
    &self,
    character: PyStrRef,
    default: OptionalArg<PyObjectRef>,
    vm: &VirtualMachine,
) -> PyResult {
    // Decimal digit value of the character (Numeric_Type = Decimal only,
    // stricter than digit()). When absent, return `default` if supplied,
    // otherwise raise ValueError — mirroring CPython's unicodedata.decimal.
    if let Some(c) = self.extract_char(character, vm)?.and_then(|cp| cp.to_char()) {
        if c.numeric_type() == Some(NumericType::Decimal) {
            if let Some(Number::Integer(value)) = c.numeric_value() {
                return Ok(vm.ctx.new_int(value).into());
            }
        }
    }
    default.ok_or_else(|| vm.new_value_error("not a decimal"))
}

#[pymethod]
fn numeric(
    &self,
    character: PyStrRef,
    default: OptionalArg<PyObjectRef>,
    vm: &VirtualMachine,
) -> PyResult {
    // Numeric value of the character as a float (covers integer and
    // rational values, e.g. vulgar fractions). When the character has no
    // numeric value, return `default` if supplied, otherwise raise
    // ValueError — mirroring CPython's unicodedata.numeric.
    let value = self
        .extract_char(character, vm)?
        .and_then(|cp| cp.to_char())
        .and_then(|c| c.numeric_value())
        .map(|number| match number {
            Number::Integer(i) => i as f64,
            Number::Rational(numer, denom) => numer as f64 / denom as f64,
        });
    match value {
        Some(v) => Ok(vm.ctx.new_float(v).into()),
        None => default.ok_or_else(|| vm.new_value_error("not a numeric character")),
    }
}

#[pygetset]
// Version string of the backing Unicode character database (exposed to
// Python as `unicodedata.unidata_version`).
fn unidata_version(&self) -> String {
    self.unic_version.to_string()
}
}

/// Map a `DecompositionType` to the compatibility formatting tag spelled the
/// way UnicodeData.txt field 5 spells it (e.g. `noBreak`, `super`).
///
/// The `Canonical` arm exists only to keep the match exhaustive; the caller
/// (`decomposition`) returns canonical decompositions without any tag.
fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
    match dt {
        DecompositionType::Canonical => "canonical",
        DecompositionType::Compat => "compat",
        DecompositionType::Circle => "circle",
        DecompositionType::Final => "final",
        DecompositionType::Font => "font",
        DecompositionType::Fraction => "fraction",
        DecompositionType::Initial => "initial",
        DecompositionType::Isolated => "isolated",
        DecompositionType::Medial => "medial",
        DecompositionType::Narrow => "narrow",
        DecompositionType::Nobreak => "noBreak",
        DecompositionType::Small => "small",
        DecompositionType::Square => "square",
        DecompositionType::Sub => "sub",
        DecompositionType::Super => "super",
        DecompositionType::Vertical => "vertical",
        DecompositionType::Wide => "wide",
    }
}

/// Abbreviated name for an East Asian Width category.
///
/// NOTE(review): the implementation is outside this view — presumably it
/// returns the UAX #11 short aliases ("Na", "W", "F", ...); confirm against
/// the impl below.
trait EastAsianWidthAbbrName {
    fn abbr_name(&self) -> &'static str;
}
Expand Down
Loading