Skip to content

Commit c5810cc

Browse files
Match CPython's islower/isupper more closely
This PR fixes a regression from my last islower/isupper patch. Python's bytes type doesn't assume an encoding, so methods like islower should only consider ASCII casing. I updated islower/isupper for UTF-8 and WTF-8 to match CPython more closely. The two functions now use the same properties as CPython, and in the same way that CPython does, with the exception of Titlecase. I updated the unit tests to pass on Python 3.15. Unicode updates sometimes cause properties to shift. I previously tested everything on Python 3.14, but that led to failures that I assumed were bugs but were actually due to differences between Unicode versions. For example, U+0295 is a lowercase letter in older Unicode versions but is NOT in newer versions.
1 parent b18b71b commit c5810cc

5 files changed

Lines changed: 71 additions & 28 deletions

File tree

crates/vm/src/anystr.rs

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@ use crate::{
44
convert::TryFromBorrowedObject,
55
function::OptionalOption,
66
};
7-
use icu_properties::{
8-
CodePointSetData,
9-
props::{Alphabetic, ChangesWhenLowercased, ChangesWhenUppercased},
10-
};
7+
use icu_properties::props::BinaryProperty;
118
use num_traits::{cast::ToPrimitive, sign::Signed};
129

1310
use core::ops::Range;
@@ -405,42 +402,58 @@ pub trait AnyStr {
405402
rustpython_common::str::zfill(self.as_bytes(), width)
406403
}
407404

408-
// Unified form of CPython functions:
409-
// _Py_bytes_islower
410-
// unicode_islower_impl
405+
// _Py_bytes_islower
411406
fn py_islower(&self) -> bool {
412-
let case_change = CodePointSetData::new::<ChangesWhenLowercased>();
413-
let alphabetic = CodePointSetData::new::<Alphabetic>();
414407
let mut lower = false;
415-
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
416-
if chunk.chars().any(|c| case_change.contains(c)) {
408+
for byte in self
409+
.as_bytes()
410+
.iter()
411+
.copied()
412+
.filter(u8::is_ascii_alphabetic)
413+
{
414+
if byte.is_ascii_uppercase() {
417415
return false;
418416
}
417+
lower = true;
418+
}
419+
lower
420+
}
419421

420-
if !lower && chunk.chars().any(|c| alphabetic.contains(c)) {
421-
lower = true;
422+
// _Py_bytes_isupper
423+
fn py_isupper(&self) -> bool {
424+
let mut upper = false;
425+
for byte in self
426+
.as_bytes()
427+
.iter()
428+
.copied()
429+
.filter(u8::is_ascii_alphabetic)
430+
{
431+
if byte.is_ascii_lowercase() {
432+
return false;
422433
}
434+
upper = true;
423435
}
424-
lower
436+
upper
425437
}
426438

427439
// Unified form of CPython functions:
428-
// Py_bytes_isupper
429440
// unicode_isupper_impl
430-
fn py_isupper(&self) -> bool {
431-
let case_change = CodePointSetData::new::<ChangesWhenUppercased>();
432-
let alphabetic = CodePointSetData::new::<Alphabetic>();
433-
let mut upper = false;
441+
// unicode_islower_impl
442+
fn is_cased<VALID, INVALID>(&self) -> bool
443+
where
444+
VALID: BinaryProperty,
445+
INVALID: BinaryProperty,
446+
{
447+
let mut all_cased = false;
434448
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
435-
if chunk.chars().any(|c| case_change.contains(c)) {
449+
if chunk.chars().any(INVALID::for_char) {
436450
return false;
437451
}
438-
439-
if !upper && chunk.chars().any(|c| alphabetic.contains(c)) {
440-
upper = true;
452+
if !all_cased && chunk.chars().any(VALID::for_char) {
453+
all_cased = true;
441454
}
442455
}
443-
upper
456+
all_cased
444457
}
445458
}
446459

crates/vm/src/builtins/str.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ use super::{
66
builtins_iter,
77
},
88
};
9-
use crate::common::lock::LazyLock;
109
use crate::{
1110
AsObject, Context, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
1211
TryFromBorrowedObject, VirtualMachine,
1312
anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper, adjust_indices},
1413
atomic_func,
1514
cformat::cformat_string,
1615
class::PyClassImpl,
16+
common::lock::LazyLock,
1717
common::str::{PyKindStr, StrData, StrKind},
1818
convert::{IntoPyException, ToPyException, ToPyObject, ToPyResult},
1919
format::{format, format_map},
@@ -46,7 +46,7 @@ use rustpython_common::{
4646

4747
use icu_properties::props::{
4848
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
49-
NumericType, XidContinue, XidStart,
49+
Lowercase, NumericType, Uppercase, XidContinue, XidStart,
5050
};
5151
use unicode_casing::CharExt;
5252

@@ -2330,6 +2330,14 @@ impl AnyStr for str {
23302330
}
23312331
splits
23322332
}
2333+
2334+
fn py_islower(&self) -> bool {
2335+
self.is_cased::<Lowercase, Uppercase>()
2336+
}
2337+
2338+
fn py_isupper(&self) -> bool {
2339+
self.is_cased::<Uppercase, Lowercase>()
2340+
}
23332341
}
23342342

23352343
impl AnyStrContainer<Wtf8> for Wtf8Buf {
@@ -2442,6 +2450,14 @@ impl AnyStr for Wtf8 {
24422450
}
24432451
splits
24442452
}
2453+
2454+
fn py_islower(&self) -> bool {
2455+
self.is_cased::<Lowercase, Uppercase>()
2456+
}
2457+
2458+
fn py_isupper(&self) -> bool {
2459+
self.is_cased::<Uppercase, Lowercase>()
2460+
}
24452461
}
24462462

24472463
impl AnyStrContainer<AsciiStr> for AsciiString {

crates/vm/src/stdlib/_thread.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ pub(crate) mod _thread {
466466
{
467467
// On Unix, use pthread ID from the handle
468468
use std::os::unix::thread::JoinHandleExt;
469-
handle.as_pthread_t() as u64
469+
handle.as_pthread_t()
470470
}
471471
#[cfg(not(unix))]
472472
{

extra_tests/snippets/builtin_str.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,9 @@
250250
assert not "\U0001f431".islower()
251251
assert "\U0001f431 CAT".isupper()
252252
assert "\U0001f431 cat".islower()
253-
assert "\u0295".islower()
253+
assert not "\u0295".islower()
254+
assert not "\u0295".isupper()
255+
assert not "\u0295".istitle()
254256
assert "\u1c89".isupper()
255257
assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")
256258
assert "hello".partition("is") == ("hello", "", "")
@@ -525,6 +527,8 @@ def try_mutate_str():
525527
assert "1a".islower()
526528
assert "가나다a".islower()
527529
assert "가나다A".isupper()
530+
assert not "ジョジョ".isupper()
531+
assert not "ジョジョ".islower()
528532

529533
# test str.format_map()
530534
#

extra_tests/snippets/builtin_str_encode.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,13 @@ def round_trip(s, encoding="utf-8"):
2020
round_trip("👺♦ 𝐚Şđƒ ☆☝")
2121
round_trip("☢🐣 ᖇ𝓤𝕊тⓟ𝕐𝕥卄σ𝔫 ♬👣")
2222
round_trip("💀👌 ק𝔂tℍⓞ𝓷 3 🔥👤")
23+
24+
# Bytes should not assume an encoding for isupper/islower
25+
# "\U0001f431" (8 hex digits) is the cat emoji; "\u0001f431" would be U+0001
# followed by the text "f431". The emoji encodes to non-ASCII bytes, which
# bytes.isupper() ignores, so only the ASCII letters "CAT" decide the result.
assert "\U0001f431 CAT".encode().isupper()
26+
# "\U0001f431" (8 hex digits) is the cat emoji; its UTF-8 bytes are non-ASCII
# and are ignored by bytes.islower(), so only the ASCII letters "cat" count.
assert "\U0001f431 cat".encode().islower()
27+
28+
# Invalid Unicode
29+
assert not b"\x80\x80".islower()
30+
assert not b"\x80\x80".isupper()
31+
assert b"\x80cat\x80".islower()
32+
assert b"\x80CAT\x80".isupper()

0 commit comments

Comments
 (0)