From 36c57daa85625a79f1e310ccd6746fe6aa7c9cf0 Mon Sep 17 00:00:00 2001
From: Josh Megnauth
Date: Tue, 5 May 2026 14:08:14 -0400
Subject: [PATCH] Fix swapcase()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The tests for swapcase() were failing for two reasons. The first is '𐐧'
casing, which should be fixed with modern Unicode tables. The second
failure is due to CPython's sigma override, which I implemented in an
earlier change and reuse here through lowercase_or_sigma().
---
 Lib/test/test_str.py          |  1 -
 crates/vm/src/builtins/str.rs | 45 ++++++++++++++++++++++++++---------
 crates/vm/src/bytes_inner.rs  | 21 +++++++++-------
 3 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py
index 6d0e935c1c..5317bfc4dc 100644
--- a/Lib/test/test_str.py
+++ b/Lib/test/test_str.py
@@ -974,7 +974,6 @@ def test_title(self):
         self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
         self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; + 𐐧
     def test_swapcase(self):
         string_tests.StringLikeTest.test_swapcase(self)
         self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
index 402cde304a..a4b3668c1b 100644
--- a/crates/vm/src/builtins/str.rs
+++ b/crates/vm/src/builtins/str.rs
@@ -1089,19 +1089,22 @@ impl PyStr {
 
     #[pymethod]
     fn swapcase(&self) -> Wtf8Buf {
-        let mut swapped_str = Wtf8Buf::with_capacity(self.data.len());
-        for c_orig in self.as_wtf8().code_points() {
-            let c = c_orig.to_char_lossy();
-            // to_uppercase returns an iterator because case changes may be multiple bytes
-            if c.is_lowercase() {
-                swapped_str.extend(c.to_uppercase());
-            } else if c.is_uppercase() {
-                swapped_str.extend(c.to_lowercase());
-            } else {
-                swapped_str.push(c_orig);
+        match self.as_str_kind() {
+            PyKindStr::Ascii(s) => swapcase_ascii(s.as_bytes()).into(),
+            PyKindStr::Utf8(s) => {
+                let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
+                swapcase_utf8(s, &mut out);
+                out.0.into()
+            }
+            PyKindStr::Wtf8(s) => {
+                let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
+                for chunk in s.as_bytes().utf8_chunks() {
+                    swapcase_utf8(chunk.valid(), &mut out);
+                    out.0.extend(chunk.invalid());
+                }
+                out.0.into()
             }
         }
-        swapped_str
     }
 
     #[pymethod]
@@ -1555,6 +1558,26 @@ impl PyStr {
     }
 }
 
+/// Appends the case-swapped form of `s` to `out` as UTF-8.
+///
+/// Uppercase characters are passed to `lowercase_or_sigma`, lowercase
+/// characters are expanded with `char::to_uppercase` (one character may
+/// become several), and characters that are neither are copied unchanged.
+fn swapcase_utf8(s: &str, out: &mut VecFmtWriter) {
+    let mut buf = [0u8; 4];
+    for (i, ch) in s.char_indices() {
+        if ch.is_uppercase() {
+            lowercase_or_sigma(ch, s, i, out);
+        } else if ch.is_lowercase() {
+            for upper in ch.to_uppercase() {
+                out.0.extend(upper.encode_utf8(&mut buf).as_bytes());
+            }
+        } else {
+            out.0.extend(ch.encode_utf8(&mut buf).as_bytes());
+        }
+    }
+}
+
 impl PyRef<PyStr> {
     #[must_use]
     pub fn is_empty(&self) -> bool {
diff --git a/crates/vm/src/bytes_inner.rs b/crates/vm/src/bytes_inner.rs
index c864524561..9a8b182424 100644
--- a/crates/vm/src/bytes_inner.rs
+++ b/crates/vm/src/bytes_inner.rs
@@ -413,15 +413,7 @@ impl PyBytesInner {
     }
 
     pub fn swapcase(&self) -> Vec<u8> {
-        let mut new: Vec<u8> = Vec::with_capacity(self.elements.len());
-        for w in &self.elements {
-            match w {
-                b'A'..=b'Z' => new.push(w.to_ascii_lowercase()),
-                b'a'..=b'z' => new.push(w.to_ascii_uppercase()),
-                x => new.push(*x),
-            }
-        }
-        new
+        swapcase_ascii(self.as_bytes())
     }
 
     pub fn hex(
@@ -1236,3 +1228,14 @@ pub(crate) fn bytes_to_hex(
 pub(crate) const fn is_py_ascii_whitespace(b: u8) -> bool {
     matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B')
 }
+
+/// Returns a copy of `bytes` with the case of every ASCII letter flipped.
+///
+/// ASCII upper- and lowercase letters differ only in bit `0x20`, so toggling
+/// that bit swaps the case; all other bytes are copied unchanged.
+pub(crate) fn swapcase_ascii(bytes: &[u8]) -> Vec<u8> {
+    bytes
+        .iter()
+        .map(|&b| if b.is_ascii_alphabetic() { b ^ 0x20 } else { b })
+        .collect()
+}