Match CPython's islower/isupper more closely

joshuamegnauth54 · joshuamegnauth54 · commit c5810cc09cab · 2026-04-20T23:16:52.000-04:00
This PR fixes a regression from my last islower/isupper patch.
Python's Bytes doesn't assume an encoding, so methods like islower
should only consider ASCII casing.

I updated islower/isupper for UTF-8 and WTF-8 to match CPython more
closely. The two functions now use the same properties as CPython, and in
the same way that CPython does, with the exception of Titlecase.

I updated the unit tests to pass on Python 3.15. Unicode updates
sometimes cause properties to shift. I previously tested everything on
Python 3.14, which led to failures that I assumed were bugs but that were
actually due to Unicode differences. For example, U+0295 is a lowercase
letter in older Unicode versions but is NOT in newer versions.
diff --git a/crates/vm/src/anystr.rs b/crates/vm/src/anystr.rs
@@ -4,10 +4,7 @@ use crate::{
     convert::TryFromBorrowedObject,
     function::OptionalOption,
 };
-use icu_properties::{
-    CodePointSetData,
-    props::{Alphabetic, ChangesWhenLowercased, ChangesWhenUppercased},
-};
+use icu_properties::props::BinaryProperty;
 use num_traits::{cast::ToPrimitive, sign::Signed};
 
 use core::ops::Range;
@@ -405,42 +402,58 @@ pub trait AnyStr {
         rustpython_common::str::zfill(self.as_bytes(), width)
     }
 
-    // Unified form of CPython functions:
-    //  _Py_bytes_islower
-    //  unicode_islower_impl
+    // _Py_bytes_islower
     fn py_islower(&self) -> bool {
-        let case_change = CodePointSetData::new::<ChangesWhenLowercased>();
-        let alphabetic = CodePointSetData::new::<Alphabetic>();
         let mut lower = false;
-        for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
-            if chunk.chars().any(|c| case_change.contains(c)) {
+        for byte in self
+            .as_bytes()
+            .iter()
+            .copied()
+            .filter(u8::is_ascii_alphabetic)
+        {
+            if byte.is_ascii_uppercase() {
                 return false;
             }
+            lower = true;
+        }
+        lower
+    }
 
-            if !lower && chunk.chars().any(|c| alphabetic.contains(c)) {
-                lower = true;
+    // _Py_bytes_isupper
+    fn py_isupper(&self) -> bool {
+        let mut upper = false;
+        for byte in self
+            .as_bytes()
+            .iter()
+            .copied()
+            .filter(u8::is_ascii_alphabetic)
+        {
+            if byte.is_ascii_lowercase() {
+                return false;
             }
+            upper = true;
         }
-        lower
+        upper
     }
 
     // Unified form of CPython functions:
-    //   Py_bytes_isupper
     //  unicode_isupper_impl
-    fn py_isupper(&self) -> bool {
-        let case_change = CodePointSetData::new::<ChangesWhenUppercased>();
-        let alphabetic = CodePointSetData::new::<Alphabetic>();
-        let mut upper = false;
+    //  unicode_islower_impl
+    fn is_cased<VALID, INVALID>(&self) -> bool
+    where
+        VALID: BinaryProperty,
+        INVALID: BinaryProperty,
+    {
+        let mut all_cased = false;
         for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
-            if chunk.chars().any(|c| case_change.contains(c)) {
+            if chunk.chars().any(INVALID::for_char) {
                 return false;
             }
-
-            if !upper && chunk.chars().any(|c| alphabetic.contains(c)) {
-                upper = true;
+            if !all_cased && chunk.chars().any(VALID::for_char) {
+                all_cased = true;
             }
         }
-        upper
+        all_cased
     }
 }
 
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
@@ -6,14 +6,14 @@ use super::{
         builtins_iter,
     },
 };
-use crate::common::lock::LazyLock;
 use crate::{
     AsObject, Context, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
     TryFromBorrowedObject, VirtualMachine,
     anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper, adjust_indices},
     atomic_func,
     cformat::cformat_string,
     class::PyClassImpl,
+    common::lock::LazyLock,
     common::str::{PyKindStr, StrData, StrKind},
     convert::{IntoPyException, ToPyException, ToPyObject, ToPyResult},
     format::{format, format_map},
@@ -46,7 +46,7 @@ use rustpython_common::{
 
 use icu_properties::props::{
     BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
-    NumericType, XidContinue, XidStart,
+    Lowercase, NumericType, Uppercase, XidContinue, XidStart,
 };
 use unicode_casing::CharExt;
 
@@ -2330,6 +2330,14 @@ impl AnyStr for str {
         }
         splits
     }
+
+    fn py_islower(&self) -> bool {
+        self.is_cased::<Lowercase, Uppercase>()
+    }
+
+    fn py_isupper(&self) -> bool {
+        self.is_cased::<Uppercase, Lowercase>()
+    }
 }
 
 impl AnyStrContainer<Wtf8> for Wtf8Buf {
@@ -2442,6 +2450,14 @@ impl AnyStr for Wtf8 {
         }
         splits
     }
+
+    fn py_islower(&self) -> bool {
+        self.is_cased::<Lowercase, Uppercase>()
+    }
+
+    fn py_isupper(&self) -> bool {
+        self.is_cased::<Uppercase, Lowercase>()
+    }
 }
 
 impl AnyStrContainer<AsciiStr> for AsciiString {
diff --git a/crates/vm/src/stdlib/_thread.rs b/crates/vm/src/stdlib/_thread.rs
@@ -466,7 +466,7 @@ pub(crate) mod _thread {
         {
             // On Unix, use pthread ID from the handle
             use std::os::unix::thread::JoinHandleExt;
-            handle.as_pthread_t() as u64
+            handle.as_pthread_t()
         }
         #[cfg(not(unix))]
         {
diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py
@@ -250,7 +250,9 @@
 assert not "\U0001f431".islower()
 assert "\U0001f431 CAT".isupper()
 assert "\U0001f431 cat".islower()
-assert "\u0295".islower()
+assert not "\u0295".islower()
+assert not "\u0295".isupper()
+assert not "\u0295".istitle()
 assert "\u1c89".isupper()
 assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")
 assert "hello".partition("is") == ("hello", "", "")
@@ -525,6 +527,8 @@ def try_mutate_str():
 assert "1a".islower()
 assert "가나다a".islower()
 assert "가나다A".isupper()
+assert not "ジョジョ".isupper()
+assert not "ジョジョ".islower()
 
 # test str.format_map()
 #
diff --git a/extra_tests/snippets/builtin_str_encode.py b/extra_tests/snippets/builtin_str_encode.py
@@ -20,3 +20,13 @@ def round_trip(s, encoding="utf-8"):
 round_trip("👺♦  𝐚Şđƒ  ☆☝")
 round_trip("☢🐣  ᖇ𝓤𝕊тⓟ𝕐𝕥卄σ𝔫  ♬👣")
 round_trip("💀👌  ק𝔂tℍⓞ𝓷 ３  🔥👤")
+
+# Bytes should not assume an encoding for isupper/islower
+assert not "\u0001f431 CAT".encode().isupper()
+assert "\u0001f431 cat".encode().islower()
+
+# Invalid Unicode
+assert not b"\x80\x80".islower()
+assert not b"\x80\x80".isupper()
+assert b"\x80cat\x80".islower()
+assert b"\x80CAT\x80".isupper()

Original file line number	Diff line number	Diff line change
`@@ -466,7 +466,7 @@ pub(crate) mod _thread {`
`466`	`466`	`{`
`467`	`467`	`// On Unix, use pthread ID from the handle`
`468`	`468`	`use std::os::unix::thread::JoinHandleExt;`
`469`		`- handle.as_pthread_t() as u64`
	`469`	`+ handle.as_pthread_t()`
`470`	`470`	`}`
`471`	`471`	`#[cfg(not(unix))]`
`472`	`472`	`{`