Match CPython's islower/isupper exactly

joshuamegnauth54 · joshuamegnauth54 · commit 0acaac322696 · 2026-04-23T13:42:23.000-04:00
This PR fixes a regression from my last islower/isupper patch.
Python's bytes type doesn't assume an encoding, so methods like islower
should only consider ASCII casing.

I updated islower/isupper for UTF-8 and WTF-8 to match CPython more
closely. The two functions now use the same Unicode properties as CPython,
so their results match CPython exactly.

I updated the unit tests to pass on Python 3.15. Unicode updates
sometimes cause properties to shift. I previously tested everything on
Python 3.14, but that led to failures that I assumed were bugs but were
actually due to Unicode differences. For example, U+0295 is a lowercase
letter in older Unicode versions but is NOT in newer versions.

One of the new tests is disabled on Python 3.14 for now because it would
fail in CI until CI is bumped to 3.15.
diff --git a/crates/vm/src/anystr.rs b/crates/vm/src/anystr.rs
@@ -4,9 +4,8 @@ use crate::{
     convert::TryFromBorrowedObject,
     function::OptionalOption,
 };
-use icu_properties::{
-    CodePointSetData,
-    props::{Alphabetic, ChangesWhenLowercased, ChangesWhenUppercased},
+use icu_properties::props::{
+    BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
 };
 use num_traits::{cast::ToPrimitive, sign::Signed};
 
@@ -405,42 +404,64 @@ pub trait AnyStr {
         rustpython_common::str::zfill(self.as_bytes(), width)
     }
 
-    // Unified form of CPython functions:
-    //  _Py_bytes_islower
-    //  unicode_islower_impl
+    // _Py_bytes_islower
     fn py_islower(&self) -> bool {
-        let case_change = CodePointSetData::new::<ChangesWhenLowercased>();
-        let alphabetic = CodePointSetData::new::<Alphabetic>();
         let mut lower = false;
-        for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
-            if chunk.chars().any(|c| case_change.contains(c)) {
+        for byte in self
+            .as_bytes()
+            .iter()
+            .copied()
+            .filter(u8::is_ascii_alphabetic)
+        {
+            if byte.is_ascii_uppercase() {
                 return false;
             }
-
-            if !lower && chunk.chars().any(|c| alphabetic.contains(c)) {
-                lower = true;
-            }
+            lower = true;
         }
         lower
     }
 
-    // Unified form of CPython functions:
-    //   Py_bytes_isupper
-    //  unicode_isupper_impl
+    // _Py_bytes_isupper
     fn py_isupper(&self) -> bool {
-        let case_change = CodePointSetData::new::<ChangesWhenUppercased>();
-        let alphabetic = CodePointSetData::new::<Alphabetic>();
         let mut upper = false;
-        for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
-            if chunk.chars().any(|c| case_change.contains(c)) {
+        for byte in self
+            .as_bytes()
+            .iter()
+            .copied()
+            .filter(u8::is_ascii_alphabetic)
+        {
+            if byte.is_ascii_lowercase() {
                 return false;
             }
+            upper = true;
+        }
+        upper
+    }
 
-            if !upper && chunk.chars().any(|c| alphabetic.contains(c)) {
-                upper = true;
+    // Unified form of CPython functions:
+    //  unicode_isupper_impl
+    //  unicode_islower_impl
+    fn is_cased<VALID, INVALID>(&self) -> bool
+    where
+        VALID: BinaryProperty,
+        INVALID: BinaryProperty,
+    {
+        let mut all_cased = false;
+        for c in self
+            .as_bytes()
+            .utf8_chunks()
+            .flat_map(|c| c.valid().chars())
+        {
+            if INVALID::for_char(c)
+                || GeneralCategoryGroup::TitlecaseLetter.contains(GeneralCategory::for_char(c))
+            {
+                return false;
+            }
+            if !all_cased && VALID::for_char(c) {
+                all_cased = true;
             }
         }
-        upper
+        all_cased
     }
 }
 
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
@@ -6,14 +6,14 @@ use super::{
         builtins_iter,
     },
 };
-use crate::common::lock::LazyLock;
 use crate::{
     AsObject, Context, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
     TryFromBorrowedObject, VirtualMachine,
     anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper, adjust_indices},
     atomic_func,
     cformat::cformat_string,
     class::PyClassImpl,
+    common::lock::LazyLock,
     common::str::{PyKindStr, StrData, StrKind},
     convert::{IntoPyException, ToPyException, ToPyObject, ToPyResult},
     format::{format, format_map},
@@ -46,7 +46,7 @@ use rustpython_common::{
 
 use icu_properties::props::{
     BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
-    NumericType, XidContinue, XidStart,
+    Lowercase, NumericType, Uppercase, XidContinue, XidStart,
 };
 use unicode_casing::CharExt;
 
@@ -2330,6 +2330,14 @@ impl AnyStr for str {
         }
         splits
     }
+
+    fn py_islower(&self) -> bool {
+        self.is_cased::<Lowercase, Uppercase>()
+    }
+
+    fn py_isupper(&self) -> bool {
+        self.is_cased::<Uppercase, Lowercase>()
+    }
 }
 
 impl AnyStrContainer<Wtf8> for Wtf8Buf {
@@ -2442,6 +2450,14 @@ impl AnyStr for Wtf8 {
         }
         splits
     }
+
+    fn py_islower(&self) -> bool {
+        self.is_cased::<Lowercase, Uppercase>()
+    }
+
+    fn py_isupper(&self) -> bool {
+        self.is_cased::<Uppercase, Lowercase>()
+    }
 }
 
 impl AnyStrContainer<AsciiStr> for AsciiString {
diff --git a/crates/vm/src/stdlib/_thread.rs b/crates/vm/src/stdlib/_thread.rs
@@ -459,7 +459,7 @@ pub(crate) mod _thread {
         {
             // On Unix, use pthread ID from the handle
             use std::os::unix::thread::JoinHandleExt;
-            handle.as_pthread_t() as u64
+            handle.as_pthread_t() as _
         }
         #[cfg(not(unix))]
         {
diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py
@@ -1,3 +1,4 @@
+import sys
 from testutils import AssertRaises, assert_raises, skip_if_unsupported
 
 assert "".__eq__(1) == NotImplemented
@@ -250,7 +251,10 @@
 assert not "\U0001f431".islower()
 assert "\U0001f431 CAT".isupper()
 assert "\U0001f431 cat".islower()
-assert "\u0295".islower()
+if sys.version_info >= (3, 15):
+    assert not "\u0295".islower()
+    assert not "\u0295".isupper()
+    assert not "\u0295".istitle()
 assert "\u1c89".isupper()
 assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")
 assert "hello".partition("is") == ("hello", "", "")
@@ -525,6 +529,8 @@ def try_mutate_str():
 assert "1a".islower()
 assert "가나다a".islower()
 assert "가나다A".isupper()
+assert not "ジョジョ".isupper()
+assert not "ジョジョ".islower()
 
 # test str.format_map()
 #
diff --git a/extra_tests/snippets/builtin_str_encode.py b/extra_tests/snippets/builtin_str_encode.py
@@ -20,3 +20,15 @@ def round_trip(s, encoding="utf-8"):
 round_trip("👺♦  𝐚Şđƒ  ☆☝")
 round_trip("☢🐣  ᖇ𝓤𝕊тⓟ𝕐𝕥卄σ𝔫  ♬👣")
 round_trip("💀👌  ק𝔂tℍⓞ𝓷 ３  🔥👤")
+
+# Bytes should not assume an encoding for isupper/islower
+assert "Æ".isupper()
+assert not "Æ".encode().isupper()
+assert "æ".islower()
+assert not "æ".encode().islower()
+
+# Invalid Unicode
+assert not b"\x80\x80".islower()
+assert not b"\x80\x80".isupper()
+assert b"\x80cat\x80".islower()
+assert b"\x80CAT\x80".isupper()

Original file line number	Diff line number	Diff line change
`@@ -459,7 +459,7 @@ pub(crate) mod _thread {`
`459`	`459`	`{`
`460`	`460`	`// On Unix, use pthread ID from the handle`
`461`	`461`	`use std::os::unix::thread::JoinHandleExt;`
`462`		`- handle.as_pthread_t() as u64`
	`462`	`+ handle.as_pthread_t() as _`
`463`	`463`	`}`
`464`	`464`	`#[cfg(not(unix))]`
`465`	`465`	`{`