Skip to content

Commit 0acaac3

Browse files
Match CPython's islower/isupper exactly
This PR fixes a regression from my last islower/isupper patch. Python's bytes type doesn't assume an encoding, so methods like islower should only consider ASCII casing. I updated islower/isupper for UTF-8 and WTF-8 to match CPython more closely. The two functions now use the same Unicode properties as CPython and match its behavior exactly. I updated the unit tests to pass on Python 3.15. Unicode updates sometimes cause properties to shift. I previously tested everything on Python 3.14, but that led to failures that I assumed were bugs but were actually due to Unicode version differences. For example, U+0295 is a lowercase letter in older Unicode versions but is NOT in newer versions. One of the new tests is disabled on Python 3.14 for now because it will fail in CI until CI is bumped to 3.15.
1 parent 5081f76 commit 0acaac3

5 files changed

Lines changed: 83 additions & 28 deletions

File tree

crates/vm/src/anystr.rs

Lines changed: 45 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@ use crate::{
44
convert::TryFromBorrowedObject,
55
function::OptionalOption,
66
};
7-
use icu_properties::{
8-
CodePointSetData,
9-
props::{Alphabetic, ChangesWhenLowercased, ChangesWhenUppercased},
7+
use icu_properties::props::{
8+
BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
109
};
1110
use num_traits::{cast::ToPrimitive, sign::Signed};
1211

@@ -405,42 +404,64 @@ pub trait AnyStr {
405404
rustpython_common::str::zfill(self.as_bytes(), width)
406405
}
407406

408-
// Unified form of CPython functions:
409-
// _Py_bytes_islower
410-
// unicode_islower_impl
407+
// _Py_bytes_islower
411408
fn py_islower(&self) -> bool {
412-
let case_change = CodePointSetData::new::<ChangesWhenLowercased>();
413-
let alphabetic = CodePointSetData::new::<Alphabetic>();
414409
let mut lower = false;
415-
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
416-
if chunk.chars().any(|c| case_change.contains(c)) {
410+
for byte in self
411+
.as_bytes()
412+
.iter()
413+
.copied()
414+
.filter(u8::is_ascii_alphabetic)
415+
{
416+
if byte.is_ascii_uppercase() {
417417
return false;
418418
}
419-
420-
if !lower && chunk.chars().any(|c| alphabetic.contains(c)) {
421-
lower = true;
422-
}
419+
lower = true;
423420
}
424421
lower
425422
}
426423

427-
// Unified form of CPython functions:
428-
// Py_bytes_isupper
429-
// unicode_isupper_impl
424+
// _Py_bytes_isupper
430425
fn py_isupper(&self) -> bool {
431-
let case_change = CodePointSetData::new::<ChangesWhenUppercased>();
432-
let alphabetic = CodePointSetData::new::<Alphabetic>();
433426
let mut upper = false;
434-
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
435-
if chunk.chars().any(|c| case_change.contains(c)) {
427+
for byte in self
428+
.as_bytes()
429+
.iter()
430+
.copied()
431+
.filter(u8::is_ascii_alphabetic)
432+
{
433+
if byte.is_ascii_lowercase() {
436434
return false;
437435
}
436+
upper = true;
437+
}
438+
upper
439+
}
438440

439-
if !upper && chunk.chars().any(|c| alphabetic.contains(c)) {
440-
upper = true;
441+
// Unified form of CPython functions:
442+
// unicode_isupper_impl
443+
// unicode_islower_impl
444+
fn is_cased<VALID, INVALID>(&self) -> bool
445+
where
446+
VALID: BinaryProperty,
447+
INVALID: BinaryProperty,
448+
{
449+
let mut all_cased = false;
450+
for c in self
451+
.as_bytes()
452+
.utf8_chunks()
453+
.flat_map(|c| c.valid().chars())
454+
{
455+
if INVALID::for_char(c)
456+
|| GeneralCategoryGroup::TitlecaseLetter.contains(GeneralCategory::for_char(c))
457+
{
458+
return false;
459+
}
460+
if !all_cased && VALID::for_char(c) {
461+
all_cased = true;
441462
}
442463
}
443-
upper
464+
all_cased
444465
}
445466
}
446467

crates/vm/src/builtins/str.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ use super::{
66
builtins_iter,
77
},
88
};
9-
use crate::common::lock::LazyLock;
109
use crate::{
1110
AsObject, Context, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
1211
TryFromBorrowedObject, VirtualMachine,
1312
anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper, adjust_indices},
1413
atomic_func,
1514
cformat::cformat_string,
1615
class::PyClassImpl,
16+
common::lock::LazyLock,
1717
common::str::{PyKindStr, StrData, StrKind},
1818
convert::{IntoPyException, ToPyException, ToPyObject, ToPyResult},
1919
format::{format, format_map},
@@ -46,7 +46,7 @@ use rustpython_common::{
4646

4747
use icu_properties::props::{
4848
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
49-
NumericType, XidContinue, XidStart,
49+
Lowercase, NumericType, Uppercase, XidContinue, XidStart,
5050
};
5151
use unicode_casing::CharExt;
5252

@@ -2330,6 +2330,14 @@ impl AnyStr for str {
23302330
}
23312331
splits
23322332
}
2333+
2334+
fn py_islower(&self) -> bool {
2335+
self.is_cased::<Lowercase, Uppercase>()
2336+
}
2337+
2338+
fn py_isupper(&self) -> bool {
2339+
self.is_cased::<Uppercase, Lowercase>()
2340+
}
23332341
}
23342342

23352343
impl AnyStrContainer<Wtf8> for Wtf8Buf {
@@ -2442,6 +2450,14 @@ impl AnyStr for Wtf8 {
24422450
}
24432451
splits
24442452
}
2453+
2454+
fn py_islower(&self) -> bool {
2455+
self.is_cased::<Lowercase, Uppercase>()
2456+
}
2457+
2458+
fn py_isupper(&self) -> bool {
2459+
self.is_cased::<Uppercase, Lowercase>()
2460+
}
24452461
}
24462462

24472463
impl AnyStrContainer<AsciiStr> for AsciiString {

crates/vm/src/stdlib/_thread.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,7 @@ pub(crate) mod _thread {
459459
{
460460
// On Unix, use pthread ID from the handle
461461
use std::os::unix::thread::JoinHandleExt;
462-
handle.as_pthread_t() as u64
462+
handle.as_pthread_t() as _
463463
}
464464
#[cfg(not(unix))]
465465
{

extra_tests/snippets/builtin_str.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import sys
12
from testutils import AssertRaises, assert_raises, skip_if_unsupported
23

34
assert "".__eq__(1) == NotImplemented
@@ -250,7 +251,10 @@
250251
assert not "\U0001f431".islower()
251252
assert "\U0001f431 CAT".isupper()
252253
assert "\U0001f431 cat".islower()
253-
assert "\u0295".islower()
254+
if sys.version_info >= (3, 15):
255+
assert not "\u0295".islower()
256+
assert not "\u0295".isupper()
257+
assert not "\u0295".istitle()
254258
assert "\u1c89".isupper()
255259
assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")
256260
assert "hello".partition("is") == ("hello", "", "")
@@ -525,6 +529,8 @@ def try_mutate_str():
525529
assert "1a".islower()
526530
assert "가나다a".islower()
527531
assert "가나다A".isupper()
532+
assert not "ジョジョ".isupper()
533+
assert not "ジョジョ".islower()
528534

529535
# test str.format_map()
530536
#

extra_tests/snippets/builtin_str_encode.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,15 @@ def round_trip(s, encoding="utf-8"):
2020
round_trip("👺♦ 𝐚Şđƒ ☆☝")
2121
round_trip("☢🐣 ᖇ𝓤𝕊тⓟ𝕐𝕥卄σ𝔫 ♬👣")
2222
round_trip("💀👌 ק𝔂tℍⓞ𝓷 3 🔥👤")
23+
24+
# Bytes should not assume an encoding for isupper/islower
25+
assert "Æ".isupper()
26+
assert not "Æ".encode().isupper()
27+
assert "æ".islower()
28+
assert not "æ".encode().islower()
29+
30+
# Invalid Unicode
31+
assert not b"\x80\x80".islower()
32+
assert not b"\x80\x80".isupper()
33+
assert b"\x80cat\x80".islower()
34+
assert b"\x80CAT\x80".isupper()

0 commit comments

Comments
 (0)