From 81a6910eb1a03d8ea8e5e118f92ef7e32e18dd0b Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 01/26] tests/basics: Add bytes.decode() tests. Signed-off-by: Jos Verlinde --- tests/basics/bytes_decode_errors.py | 60 +++++++++++++++++++++++++ tests/basics/bytes_decode_errors.py.exp | 14 ++++++ 2 files changed, 74 insertions(+) create mode 100644 tests/basics/bytes_decode_errors.py create mode 100644 tests/basics/bytes_decode_errors.py.exp diff --git a/tests/basics/bytes_decode_errors.py b/tests/basics/bytes_decode_errors.py new file mode 100644 index 0000000000000..b87d9cd7294e3 --- /dev/null +++ b/tests/basics/bytes_decode_errors.py @@ -0,0 +1,60 @@ +# Test bytes.decode() with error handlers + +# Test ignore mode with invalid UTF-8 +print(repr(b'\xff\xfe'.decode('utf-8', 'ignore'))) + +# Test strict mode (default) with invalid UTF-8 +try: + b'\xff\xfe'.decode('utf-8') +except UnicodeError: + print("UnicodeError") + +try: + b'\xff\xfe'.decode('utf-8', 'strict') +except UnicodeError: + print("UnicodeError") + +# Test with valid UTF-8 +print(repr(b'hello'.decode('utf-8', 'ignore'))) +print(repr(b'hello'.decode('utf-8'))) + +# Test mixed valid and invalid UTF-8 +print(repr(b'hello\xffworld'.decode('utf-8', 'ignore'))) + +# Test multiple invalid bytes +print(repr(b'\x80\x81\x82'.decode('utf-8', 'ignore'))) + +# Test invalid continuation byte +print(repr(b'\xc0\x20'.decode('utf-8', 'ignore'))) + +# Test incomplete sequence at end +print(repr(b'hello\xc0'.decode('utf-8', 'ignore'))) + +# Test valid multi-byte UTF-8 +print(repr(b'\xc2\xa9'.decode('utf-8', 'ignore'))) # © symbol + +# Test bytearray as well +print(repr(bytearray(b'\xff\xfe').decode('utf-8', 'ignore'))) + +# Test replace mode - should either work or raise NotImplementedError +try: + result = b'\xff\xfe'.decode('utf-8', 'replace') + # If replace is implemented, check the result + print(repr(result)) +except NotImplementedError: + # If replace is not implemented, 
that's expected + print("NotImplementedError: replace") + +# Test replace with valid UTF-8 - should work even if replace isn't fully enabled +try: + result = b'hello'.decode('utf-8', 'replace') + print(repr(result)) +except NotImplementedError: + print("NotImplementedError: replace") + +# Test replace with mixed content +try: + result = b'hello\xffworld'.decode('utf-8', 'replace') + print(repr(result)) +except NotImplementedError: + print("NotImplementedError: replace") diff --git a/tests/basics/bytes_decode_errors.py.exp b/tests/basics/bytes_decode_errors.py.exp new file mode 100644 index 0000000000000..f9126debb4f8a --- /dev/null +++ b/tests/basics/bytes_decode_errors.py.exp @@ -0,0 +1,14 @@ +'' +UnicodeError +UnicodeError +'hello' +'hello' +'helloworld' +'' +' ' +'hello' +'\xa9' +'' +'\ufffd\ufffd' +'hello' +'hello\ufffdworld' From c856515d69c047fe422ad78cb278067b90c9aede Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 02/26] py/config: Add MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE. 
Signed-off-by: Jos Verlinde --- py/mpconfig.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/py/mpconfig.h b/py/mpconfig.h index 1574243e8ea99..778276a2b2475 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -1370,6 +1370,13 @@ typedef time_t mp_timestamp_t; #define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE) #endif +// Whether bytes.decode() supports the 'ignore' and 'replace' error handlers +// Can be explicitly set to 0 or 1 to override the default ROM level behavior +// Default: enabled at EXTRA_FEATURES and above, disabled below +#ifndef MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS +#define MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES) +#endif + // Whether str.center() method provided #ifndef MICROPY_PY_BUILTINS_STR_CENTER #define MICROPY_PY_BUILTINS_STR_CENTER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES) From 82d6e5d9c69f69d59bb1a9e22a27b96c5b7faa58 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 03/26] py/unicode: Implement bytes.decode() 'ignore' and 'replace' modes. Raises LookupError for error handlers that are not implemented. Improves repr() rendering for Unicode. 
Signed-off-by: Jos Verlinde --- py/objstr.c | 97 ++++++++++++++++++++++++++++++++++++++++++++-- py/objstrunicode.c | 8 ++++ 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index 06afb91fc7f73..6171983beed3e 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -209,19 +209,107 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ } default: // 2 or 3 args - // TODO: validate 2nd/3rd args + #if MICROPY_PY_BUILTINS_BYTEARRAY + if (mp_obj_is_type(args[0], &mp_type_bytes) || mp_obj_is_type(args[0], &mp_type_bytearray)) { + #else if (mp_obj_is_type(args[0], &mp_type_bytes)) { + #endif GET_STR_DATA_LEN(args[0], str_data, str_len); GET_STR_HASH(args[0], str_hash); if (str_hash == 0) { str_hash = qstr_compute_hash(str_data, str_len); } + #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK - if (!utf8_check(str_data, str_len)) { - mp_raise_msg(&mp_type_UnicodeError, NULL); + #if MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS + // Check if error handler is specified (3rd argument) + const char *errors = "strict"; + if (n_args >= 3 && args[2] != mp_const_none) { + errors = mp_obj_str_get_str(args[2]); } #endif + // Fast path: if data is valid UTF-8, return directly + if (utf8_check(str_data, str_len)) { + // Check if a qstr with this data already exists + qstr q = qstr_find_strn((const char *)str_data, str_len); + if (q != MP_QSTRnull) { + return MP_OBJ_NEW_QSTR(q); + } + + mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(type, NULL, str_len)); + o->data = str_data; + o->hash = str_hash; + return MP_OBJ_FROM_PTR(o); + } + + // Data has invalid UTF-8, handle based on error mode + #if MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS + // Error handlers are enabled + bool do_ignore = strcmp(errors, "ignore") == 0; + bool do_replace = strcmp(errors, "replace") == 0; + + if (do_ignore || do_replace) { + // Build new string skipping/replacing invalid bytes + vstr_t vstr; + vstr_init(&vstr, str_len); + const byte *p = str_data; + const byte 
*end = str_data + str_len; + + while (p < end) { + byte c = *p; + if (c < 0x80) { + // Valid ASCII + vstr_add_byte(&vstr, c); + p++; + } else if (c >= 0xc0 && c < 0xf8) { + // Potential multi-byte sequence + uint8_t need = (0xe5 >> ((c >> 3) & 0x6)) & 3; + const byte *seq_start = p; + p++; + + // Check continuation bytes + uint8_t got = 0; + while (got < need && p < end && UTF8_IS_CONT(*p)) { + got++; + p++; + } + + if (got == need) { + // Valid complete sequence, decode and add the character + unichar ch = *seq_start & (0x7f >> need); + for (uint8_t i = 0; i < need; i++) { + ch = (ch << 6) | (seq_start[i + 1] & 0x3f); + } + vstr_add_char(&vstr, ch); + } + else if (do_replace) { + // Invalid or incomplete sequence - replace with U+FFFD + vstr_add_char(&vstr, 0xFFFD); + } + // For 'ignore' mode, do nothing (skip invalid bytes) + } + else if (do_replace) { + // Invalid start byte - replace with U+FFFD + vstr_add_char(&vstr, 0xFFFD); + p++; + } + else { + // Invalid start byte - skip for 'ignore' mode + p++; + } + } + + return mp_obj_new_str_type_from_vstr(type, &vstr); + } else { + // Strict mode (or unrecognized error handler) + mp_raise_msg(&mp_type_UnicodeError, NULL); + } + #else + // Error handlers are not enabled - just raise UnicodeError on invalid UTF-8 + mp_raise_msg(&mp_type_UnicodeError, NULL); + #endif + #else // Check if a qstr with this data already exists qstr q = qstr_find_strn((const char *)str_data, str_len); if (q != MP_QSTRnull) { @@ -232,6 +320,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ o->data = str_data; o->hash = str_hash; return MP_OBJ_FROM_PTR(o); + #endif } else { mp_buffer_info_t bufinfo; mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ); @@ -1977,7 +2066,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); // constructors. 
// TODO: should accept kwargs too static mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) { - mp_obj_t new_args[2]; + mp_obj_t new_args[3]; if (n_args == 1) { new_args[0] = args[0]; new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8); diff --git a/py/objstrunicode.c b/py/objstrunicode.c index d7ce4fca0e58e..caccdfc0ffc93 100644 --- a/py/objstrunicode.c +++ b/py/objstrunicode.c @@ -57,9 +57,11 @@ static void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint mp_printf(print, "%c", quote_char); const byte *s = str_data, *top = str_data + str_len; while (s < top) { + const byte *seq_start = s; unichar ch; ch = utf8_get_char(s); s = utf8_next_char(s); + size_t seq_len = s - seq_start; if (ch == quote_char) { mp_printf(print, "\\%c", quote_char); } else if (ch == '\\') { @@ -72,6 +74,12 @@ static void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint mp_print_str(print, "\\r"); } else if (ch == '\t') { mp_print_str(print, "\\t"); + } else if (ch >= 128 && ch < 0xD800) { + // Printable Unicode character (excluding surrogates) - output UTF-8 bytes directly + print->print_strn(print->data, (const char *)seq_start, seq_len); + } else if (ch >= 0xE000 && ch < 0x110000) { + // Printable Unicode character (after surrogates) - output UTF-8 bytes directly + print->print_strn(print->data, (const char *)seq_start, seq_len); } else if (ch < 0x100) { mp_printf(print, "\\x%02x", ch); } else if (ch < 0x10000) { From 60769d539a9ecff9653fe30aabc84f76f55d60ab Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 04/26] tests/bytes_decode_errors: Handle platforms without CPYTHON_COMPAT. Added feature detection at the start of bytes_decode_errors.py test to skip gracefully when decode method is not available. (requires MICROPY_CPYTHON_COMPAT). This fixes test failures on minimal builds and Windows builds that may not have this feature enabled. 
Test now: - Checks if decode method exists before running tests - Prints "SKIP" and exits cleanly if decode is not available - Works correctly on both full-featured and minimal builds Verified: - Standard unix build: All tests pass (14 testcases) - Minimal unix build: Test skips cleanly - All bytes/bytearray/string tests pass (82 tests, 2191 testcases) Signed-off-by: Jos Verlinde --- tests/basics/bytes_decode_errors.py | 45 +++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tests/basics/bytes_decode_errors.py b/tests/basics/bytes_decode_errors.py index b87d9cd7294e3..297c843dba881 100644 --- a/tests/basics/bytes_decode_errors.py +++ b/tests/basics/bytes_decode_errors.py @@ -1,21 +1,44 @@ # Test bytes.decode() with error handlers +# Check if decode method is available (requires MICROPY_CPYTHON_COMPAT) +try: + b''.decode() +except AttributeError: + print("SKIP") + raise SystemExit + +# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS) +# When feature is disabled, invalid UTF-8 raises UnicodeError even with 'ignore' +# When feature is enabled, invalid UTF-8 with 'ignore' returns a string +try: + result = b'\xff'.decode('utf-8', 'ignore') + # If we get here, feature is available +except UnicodeError: + # Feature not available - 'ignore' was ignored, strict mode was used + print("SKIP") + raise SystemExit + # Test ignore mode with invalid UTF-8 print(repr(b'\xff\xfe'.decode('utf-8', 'ignore'))) # Test strict mode (default) with invalid UTF-8 try: b'\xff\xfe'.decode('utf-8') + print('UNEXPECTED') except UnicodeError: - print("UnicodeError") + print('UnicodeError') +# Test strict mode (explicit) with invalid UTF-8 try: b'\xff\xfe'.decode('utf-8', 'strict') + print('UNEXPECTED') except UnicodeError: - print("UnicodeError") + print('UnicodeError') # Test with valid UTF-8 print(repr(b'hello'.decode('utf-8', 'ignore'))) + +# Test valid UTF-8 with default mode print(repr(b'hello'.decode('utf-8'))) 
# Test mixed valid and invalid UTF-8 @@ -30,31 +53,29 @@ # Test incomplete sequence at end print(repr(b'hello\xc0'.decode('utf-8', 'ignore'))) -# Test valid multi-byte UTF-8 -print(repr(b'\xc2\xa9'.decode('utf-8', 'ignore'))) # © symbol +# Test valid multi-byte UTF-8 (© symbol) +print(repr(b'\xc2\xa9'.decode('utf-8', 'ignore'))) -# Test bytearray as well +# Test bytearray support print(repr(bytearray(b'\xff\xfe').decode('utf-8', 'ignore'))) # Test replace mode - should either work or raise NotImplementedError try: result = b'\xff\xfe'.decode('utf-8', 'replace') - # If replace is implemented, check the result print(repr(result)) except NotImplementedError: - # If replace is not implemented, that's expected - print("NotImplementedError: replace") - -# Test replace with valid UTF-8 - should work even if replace isn't fully enabled + print('NotImplementedError') + +# Test replace with valid UTF-8 try: result = b'hello'.decode('utf-8', 'replace') print(repr(result)) except NotImplementedError: - print("NotImplementedError: replace") + print('NotImplementedError') # Test replace with mixed content try: result = b'hello\xffworld'.decode('utf-8', 'replace') print(repr(result)) except NotImplementedError: - print("NotImplementedError: replace") + print('NotImplementedError') From d34289e204a91a41be7bb10b82d456775b060c85 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 05/26] tests/basics/bytes_decode_encoding: Add tests to validate encoding. 
Signed-off-by: Jos Verlinde --- tests/basics/bytes_decode_encoding.py | 52 +++++++++++++++++++++++ tests/basics/bytes_decode_encoding.py.exp | 23 ++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tests/basics/bytes_decode_encoding.py create mode 100644 tests/basics/bytes_decode_encoding.py.exp diff --git a/tests/basics/bytes_decode_encoding.py b/tests/basics/bytes_decode_encoding.py new file mode 100644 index 0000000000000..a0945270aeb70 --- /dev/null +++ b/tests/basics/bytes_decode_encoding.py @@ -0,0 +1,52 @@ +# Test bytes.decode() and str.encode() with encoding parameter validation + +# Check if decode method is available (requires MICROPY_CPYTHON_COMPAT) +try: + b''.decode() +except AttributeError: + print("SKIP") + raise SystemExit + +# Test valid encodings for bytes.decode() +# utf-8 (default) +print(b'hello'.decode('utf-8')) +print(b'hello'.decode('utf8')) + +# ascii (subset of utf-8) +print(b'hello'.decode('ascii')) + +# Test valid encoding for str.encode() +print('hello'.encode('utf-8')) +print('hello'.encode('utf8')) +print('hello'.encode('ascii')) + +# Test invalid encodings for bytes.decode() +# These should raise LookupError +invalid_encodings = ['latin-1', 'latin1', 'utf-16', 'utf-32', 'iso-8859-1', 'cp1252'] + +for encoding in invalid_encodings: + try: + b'hello'.decode(encoding) + print(f'UNEXPECTED: {encoding} should raise LookupError') + except LookupError as e: + print(f'LookupError: {encoding}') + +# Test invalid encodings for str.encode() +for encoding in invalid_encodings: + try: + 'hello'.encode(encoding) + print(f'UNEXPECTED: {encoding} should raise LookupError') + except LookupError as e: + print(f'LookupError: {encoding}') + +# Test with bytearray +print(bytearray(b'test').decode('utf-8')) + +# Test that UTF-8 still works correctly with non-ASCII characters +# © symbol (U+00A9) +print(b'\xc2\xa9'.decode('utf-8')) +print('©'.encode('utf-8')) + +# Test emoji 👍 (U+1F44D) +print(b'\xf0\x9f\x91\x8d'.decode('utf-8')) 
+print('👍'.encode('utf-8')) diff --git a/tests/basics/bytes_decode_encoding.py.exp b/tests/basics/bytes_decode_encoding.py.exp new file mode 100644 index 0000000000000..f2417abf6dded --- /dev/null +++ b/tests/basics/bytes_decode_encoding.py.exp @@ -0,0 +1,23 @@ +hello +hello +hello +b'hello' +b'hello' +b'hello' +LookupError: latin-1 +LookupError: latin1 +LookupError: utf-16 +LookupError: utf-32 +LookupError: iso-8859-1 +LookupError: cp1252 +LookupError: latin-1 +LookupError: latin1 +LookupError: utf-16 +LookupError: utf-32 +LookupError: iso-8859-1 +LookupError: cp1252 +test +© +b'\xc2\xa9' +👍 +b'\xf0\x9f\x91\x8d' From 0fe170ba0b3c9c8f9b0071b36db262bd5fb8e42d Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 06/26] tests/unicode: Add tests for unicode character formatting. Signed-off-by: Jos Verlinde --- tests/unicode/unicode_char_format.py | 30 ++++++++++++++++++++++++ tests/unicode/unicode_char_format.py.exp | 12 ++++++++++ 2 files changed, 42 insertions(+) create mode 100644 tests/unicode/unicode_char_format.py create mode 100644 tests/unicode/unicode_char_format.py.exp diff --git a/tests/unicode/unicode_char_format.py b/tests/unicode/unicode_char_format.py new file mode 100644 index 0000000000000..7c6ee98bc7775 --- /dev/null +++ b/tests/unicode/unicode_char_format.py @@ -0,0 +1,30 @@ +# test %c formatting with unicode characters (issue #3364) +# tests that character codes >= 128 are properly encoded as UTF-8 + +# ASCII character +print("%c" % 65) + +# 2-byte UTF-8 characters +print("%c" % 128) +print("%c" % 169) # copyright symbol © +print("%c" % 255) + +# 3-byte UTF-8 character +print("%c" % 0x4E00) # CJK ideograph 一 + +# 4-byte UTF-8 character +print("%c" % 0x1F600) # emoji 😀 + +# test with .format() method +print("{:c}".format(169)) +print("{:c}".format(0x4E00)) + +# test with f-strings +c = 169 +print(f"{c:c}") +c = 0x1F600 +print(f"{c:c}") + +# test formatting with width +print("[%5c]" % 169) 
+print("[{:5c}]".format(0x4E00)) diff --git a/tests/unicode/unicode_char_format.py.exp b/tests/unicode/unicode_char_format.py.exp new file mode 100644 index 0000000000000..c42139f2e38d4 --- /dev/null +++ b/tests/unicode/unicode_char_format.py.exp @@ -0,0 +1,12 @@ +A +€ +© +ÿ +一 +😀 +© +一 +© +😀 +[ ©] +[ 一] From dae625b4df41f46db0a0fb1fb40e7c7554ffdad6 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 07/26] tests/unicode: Tests exception handling for strings starting with 0xff. Signed-off-by: Jos Verlinde --- tests/basics/bytes_decode_encoding.py | 32 +++++++++++------------ tests/basics/bytes_decode_encoding.py.exp | 10 +++---- tests/unicode/exception_invalid_utf8.py | 22 ++++++++++++++++ tests/unicode/str_center.py | 30 +++++++++++++++++++++ tests/unicode/unicode_char_format.py | 4 --- tests/unicode/unicode_char_format.py.exp | 12 --------- 6 files changed, 73 insertions(+), 37 deletions(-) create mode 100644 tests/unicode/exception_invalid_utf8.py create mode 100644 tests/unicode/str_center.py delete mode 100644 tests/unicode/unicode_char_format.py.exp diff --git a/tests/basics/bytes_decode_encoding.py b/tests/basics/bytes_decode_encoding.py index a0945270aeb70..ff4bbcc44d8b3 100644 --- a/tests/basics/bytes_decode_encoding.py +++ b/tests/basics/bytes_decode_encoding.py @@ -20,6 +20,18 @@ print('hello'.encode('utf8')) print('hello'.encode('ascii')) +# Test with bytearray +print(bytearray(b'test').decode('utf-8')) + +# Test that UTF-8 still works correctly with non-ASCII characters +# © symbol (U+00A9) +print(b'\xc2\xa9'.decode('utf-8')) +print('©'.encode('utf-8')) + +# Test emoji 👍 (U+1F44D) +print(b'\xf0\x9f\x91\x8d'.decode('utf-8')) +print('👍'.encode('utf-8')) + # Test invalid encodings for bytes.decode() # These should raise LookupError invalid_encodings = ['latin-1', 'latin1', 'utf-16', 'utf-32', 'iso-8859-1', 'cp1252'] @@ -27,26 +39,14 @@ for encoding in invalid_encodings: try: b'hello'.decode(encoding) - 
print(f'UNEXPECTED: {encoding} should raise LookupError') + print('UNEXPECTED:', encoding, 'should raise LookupError') except LookupError as e: - print(f'LookupError: {encoding}') + print('LookupError:', encoding) # Test invalid encodings for str.encode() for encoding in invalid_encodings: try: 'hello'.encode(encoding) - print(f'UNEXPECTED: {encoding} should raise LookupError') + print('UNEXPECTED:', encoding, 'should raise LookupError') except LookupError as e: - print(f'LookupError: {encoding}') - -# Test with bytearray -print(bytearray(b'test').decode('utf-8')) - -# Test that UTF-8 still works correctly with non-ASCII characters -# © symbol (U+00A9) -print(b'\xc2\xa9'.decode('utf-8')) -print('©'.encode('utf-8')) - -# Test emoji 👍 (U+1F44D) -print(b'\xf0\x9f\x91\x8d'.decode('utf-8')) -print('👍'.encode('utf-8')) + print('LookupError:', encoding) diff --git a/tests/basics/bytes_decode_encoding.py.exp b/tests/basics/bytes_decode_encoding.py.exp index f2417abf6dded..45a0660fc591f 100644 --- a/tests/basics/bytes_decode_encoding.py.exp +++ b/tests/basics/bytes_decode_encoding.py.exp @@ -4,6 +4,11 @@ hello b'hello' b'hello' b'hello' +test +© +b'\xc2\xa9' +👍 +b'\xf0\x9f\x91\x8d' LookupError: latin-1 LookupError: latin1 LookupError: utf-16 @@ -16,8 +21,3 @@ LookupError: utf-16 LookupError: utf-32 LookupError: iso-8859-1 LookupError: cp1252 -test -© -b'\xc2\xa9' -👍 -b'\xf0\x9f\x91\x8d' diff --git a/tests/unicode/exception_invalid_utf8.py b/tests/unicode/exception_invalid_utf8.py new file mode 100644 index 0000000000000..ba0b85c7d67d4 --- /dev/null +++ b/tests/unicode/exception_invalid_utf8.py @@ -0,0 +1,22 @@ +# Test that exceptions with strings starting with 0xff don't crash +# Issue #17855 + +# Test with various sizes of strings starting with compression marker (0xff) +for size in [1, 10, 100, 1000]: + try: + # Create a string that starts with 0xff (compression marker) + s = eval(b"'\\xff" + b"\\xfe" * size + b"'") + raise ValueError(s) + except ValueError as e: + # Just 
verify we can catch it without crashing + print(f"Caught ValueError with size {size}") + +# Test printing the exception +try: + raise Exception(eval(b"'\\xff" + b"\\xfe" * 100 + b"'")) +except Exception as e: + # This used to crash when trying to decompress the "compressed" string + exc_str = str(e) + print(f"Exception string starts with: {exc_str[:10]!r}") + +print("All tests passed") diff --git a/tests/unicode/str_center.py b/tests/unicode/str_center.py new file mode 100644 index 0000000000000..9a7d830eb161d --- /dev/null +++ b/tests/unicode/str_center.py @@ -0,0 +1,30 @@ +# Test str.center() with Unicode characters +# Issue #17827 + +# ASCII baseline +print("hello".center(10)) + +# Latin with accent (é is 2 bytes in UTF-8) +print("héllo".center(10)) + +# Chinese (each char is 3 bytes in UTF-8) +print("你好".center(10)) + +# Emoji (4 bytes in UTF-8) +print("🎉".center(5)) + +# German with umlaut +print("München".center(15)) + +# Cyrillic +print("Москва".center(12)) + +# Edge cases +print("test".center(4)) # Exact fit +print("test".center(3)) # String longer than width +print("x".center(1)) # Single char, exact fit +print("".center(5)) # Empty string + +# Mixed ASCII and Unicode +print("café".center(10)) +print("hello世界".center(12)) diff --git a/tests/unicode/unicode_char_format.py b/tests/unicode/unicode_char_format.py index 7c6ee98bc7775..3e4d805472a77 100644 --- a/tests/unicode/unicode_char_format.py +++ b/tests/unicode/unicode_char_format.py @@ -24,7 +24,3 @@ print(f"{c:c}") c = 0x1F600 print(f"{c:c}") - -# test formatting with width -print("[%5c]" % 169) -print("[{:5c}]".format(0x4E00)) diff --git a/tests/unicode/unicode_char_format.py.exp b/tests/unicode/unicode_char_format.py.exp deleted file mode 100644 index c42139f2e38d4..0000000000000 --- a/tests/unicode/unicode_char_format.py.exp +++ /dev/null @@ -1,12 +0,0 @@ -A -€ -© -ÿ -一 -😀 -© -一 -© -😀 -[ ©] -[ 一] From 527b61824f1c2608421223e158a0b5a2a0ff2c82 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 
May 2026 14:05:02 +0200 Subject: [PATCH 08/26] py/objstr: Validate encoding for decode and encode. Only accepts `utf-8`, `utf8` or `ascii` Fixes https://github.com/micropython/micropython/issues/15849 Signed-off-by: Jos Verlinde --- py/objstr.c | 52 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index 6171983beed3e..43dac52fae6be 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -213,7 +213,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ if (mp_obj_is_type(args[0], &mp_type_bytes) || mp_obj_is_type(args[0], &mp_type_bytearray)) { #else if (mp_obj_is_type(args[0], &mp_type_bytes)) { - #endif + #endif GET_STR_DATA_LEN(args[0], str_data, str_len); GET_STR_HASH(args[0], str_hash); if (str_hash == 0) { @@ -246,11 +246,24 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ // Data has invalid UTF-8, handle based on error mode #if MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS // Error handlers are enabled - bool do_ignore = strcmp(errors, "ignore") == 0; - bool do_replace = strcmp(errors, "replace") == 0; + #if !MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE + // Raise NotImplementedError if 'replace' is used but not enabled + if (strcmp(errors, "replace") == 0) { + mp_raise_NotImplementedError(NULL); + } + #endif - if (do_ignore || do_replace) { + if (strcmp(errors, "ignore") == 0 + #if MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE + || strcmp(errors, "replace") == 0 + #endif + ) { // Build new string skipping/replacing invalid bytes + #if MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE + bool do_replace = strcmp(errors, "replace") == 0; + #else + const bool do_replace = false; + #endif vstr_t vstr; vstr_init(&vstr, str_len); const byte *p = str_data; @@ -282,19 +295,16 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ ch = (ch << 6) | (seq_start[i + 1] & 0x3f); } vstr_add_char(&vstr, ch); - } - else if 
(do_replace) { + } else if (do_replace) { // Invalid or incomplete sequence - replace with U+FFFD vstr_add_char(&vstr, 0xFFFD); } // For 'ignore' mode, do nothing (skip invalid bytes) - } - else if (do_replace) { + } else if (do_replace) { // Invalid start byte - replace with U+FFFD vstr_add_char(&vstr, 0xFFFD); p++; - } - else { + } else { // Invalid start byte - skip for 'ignore' mode p++; } @@ -2072,6 +2082,17 @@ static mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) { new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8); args = new_args; n_args++; + } else if (n_args >= 2) { + // Validate encoding parameter + // MicroPython only supports UTF-8 encoding + const char *encoding = mp_obj_str_get_str(args[1]); + + // Accept utf-8 and ascii (ascii is a subset of utf-8) + if (!(strcmp(encoding, "utf-8") == 0 || strcmp(encoding, "utf8") == 0 || + strcmp(encoding, "ascii") == 0)) { + mp_raise_msg_varg(&mp_type_LookupError, + MP_ERROR_TEXT("encoding not supported: %s"), encoding); + } } return mp_obj_str_make_new(&mp_type_str, n_args, 0, args); } @@ -2085,6 +2106,17 @@ static mp_obj_t str_encode(size_t n_args, const mp_obj_t *args) { new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8); args = new_args; n_args++; + } else if (n_args >= 2) { + // Validate encoding parameter + // MicroPython only supports UTF-8 encoding + const char *encoding = mp_obj_str_get_str(args[1]); + + // Accept utf-8 and ascii (ascii is a subset of utf-8) + if (!(strcmp(encoding, "utf-8") == 0 || strcmp(encoding, "utf8") == 0 || + strcmp(encoding, "ascii") == 0)) { + mp_raise_msg_varg(&mp_type_LookupError, + MP_ERROR_TEXT("encoding not supported: %s"), encoding); + } } return bytes_make_new(NULL, n_args, 0, args); } From 2e2730068dda0ab5f88bc4889e089a20e71cd983 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 09/26] py/objstr: Enhance utf-8 character handling in string formatting. 
Fixes: issue 3364 Fixes: issue 13084 Signed-off-by: Jos Verlinde --- py/objstr.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/py/objstr.c b/py/objstr.c index 43dac52fae6be..e4a2f685c1b77 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -1418,8 +1418,19 @@ static vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *ar continue; case 'c': { + #if MICROPY_PY_BUILTINS_STR_UNICODE + mp_uint_t c = mp_obj_get_int(arg); + if (c >= 0x110000) { + mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("chr() arg not in range(0x110000)")); + } + VSTR_FIXED(ch_vstr, 4); + vstr_add_char(&ch_vstr, c); + mp_print_strn(&print, ch_vstr.buf, ch_vstr.len, flags, fill, width); + vstr_clear(&ch_vstr); + #else char ch = mp_obj_get_int(arg); mp_print_strn(&print, &ch, 1, flags, fill, width); + #endif continue; } @@ -1712,8 +1723,21 @@ static mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_ } mp_print_strn(&print, s, 1, flags, ' ', width); } else if (arg_looks_integer(arg)) { + #if MICROPY_PY_BUILTINS_STR_UNICODE + mp_uint_t c = mp_obj_get_int(arg); + if (c >= 0x110000) { + mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("%c arg not in range(0x110000)")); + } + vstr_t ch_vstr; + vstr_init_len(&ch_vstr, 4); + ch_vstr.len = 0; + vstr_add_char(&ch_vstr, c); + mp_print_strn(&print, ch_vstr.buf, ch_vstr.len, flags, ' ', width); + vstr_clear(&ch_vstr); + #else char ch = mp_obj_get_int(arg); mp_print_strn(&print, &ch, 1, flags, ' ', width); + #endif } else { mp_raise_TypeError(MP_ERROR_TEXT("integer needed")); } From 57bb48f47965e2d9b0e048e108e81fda7255dcc5 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 10/26] py/objstr: Fix str_center for Unicode strings. 
Fixes Issue 17827 Signed-off-by: Jos Verlinde --- py/objstr.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index e4a2f685c1b77..63b72c50399aa 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -1061,14 +1061,33 @@ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip); static mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) { GET_STR_DATA_LEN(str_in, str, str_len); mp_uint_t width = mp_obj_get_int(width_in); + + #if MICROPY_PY_BUILTINS_STR_UNICODE + // Get character count (not byte count) for proper Unicode handling + size_t char_len = utf8_charlen(str, str_len); + if (char_len >= width) { + return str_in; + } + // Calculate padding: width is in characters, need to convert to bytes for allocation + mp_uint_t padding_chars = width - char_len; + // Padding is always spaces (1 byte each), plus the original string bytes + mp_uint_t total_bytes = padding_chars + str_len; + #else + // Non-Unicode build: byte length equals character length if (str_len >= width) { return str_in; } + mp_uint_t total_bytes = width; + #endif // MICROPY_PY_BUILTINS_STR_UNICODE vstr_t vstr; - vstr_init_len(&vstr, width); - memset(vstr.buf, ' ', width); + vstr_init_len(&vstr, total_bytes); + memset(vstr.buf, ' ', total_bytes); + #if MICROPY_PY_BUILTINS_STR_UNICODE + int left = padding_chars / 2; + #else int left = (width - str_len) / 2; + #endif // MICROPY_PY_BUILTINS_STR_UNICODE memcpy(vstr.buf + left, str, str_len); return mp_obj_new_str_type_from_vstr(mp_obj_get_type(str_in), &vstr); } From 77f6830cb0753541b24f6becaf1490c67ef75c4b Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 11/26] docs: Document Unicode support and limitations. 
Signed-off-by: Jos Verlinde --- docs/develop/writingtests.rst | 5 + docs/differences/python_35.rst | 4 + docs/library/builtins.rst | 79 +++++++++++++ docs/reference/constrained.rst | 21 +++- docs/reference/index.rst | 1 + docs/reference/unicode_support.rst | 173 +++++++++++++++++++++++++++++ 6 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 docs/reference/unicode_support.rst diff --git a/docs/develop/writingtests.rst b/docs/develop/writingtests.rst index fd3daf91c1e3e..2a8471201139b 100644 --- a/docs/develop/writingtests.rst +++ b/docs/develop/writingtests.rst @@ -46,6 +46,11 @@ If you run your tests, this test should appear in the test output: Tests are run by comparing the output from the test target against the output from CPython. So any test should use print statements to indicate test results. +When writing tests for name or string-related functionality, please add both English/ASCII +and non-English/non-ASCII text and include Unicode examples. +Please do add comments in English explaining the meaning and intent of the Unicode text. +This helps ensure Unicode support is tested and verified across different platforms. + For tests that can't be compared to CPython (i.e. micropython-specific functionality), you can provide a ``.py.exp`` file which will be used as the truth for comparison. diff --git a/docs/differences/python_35.rst b/docs/differences/python_35.rst index 0fdc6121a1dd0..fa1b0106712e6 100644 --- a/docs/differences/python_35.rst +++ b/docs/differences/python_35.rst @@ -55,6 +55,10 @@ Other Language Changes: | Added the *namereplace* error handlers. The *backslashreplace* error handlers now work with decoding and | | | translating. | | +-----------------------------------------------------------------------------------------------------------+---------------+ + | As of v1.28.0 String encoding/decoding properly validates the encoding parameter and raises | Improved | + | ``LookupError`` for unsupported encodings. 
Error handlers (``'ignore'``, ``'replace'``) are supported | | + | when enabled via build configuration. The ``errors`` parameter must be passed positionally. | | + +-----------------------------------------------------------------------------------------------------------+---------------+ | Property docstrings are now writable. This is especially useful for collections.namedtuple() docstrings | | +-----------------------------------------------------------------------------------------------------------+---------------+ | Circular imports involving relative imports are now supported. | | diff --git a/docs/library/builtins.rst b/docs/library/builtins.rst index b5d08ba7fed50..536c383f834df 100644 --- a/docs/library/builtins.rst +++ b/docs/library/builtins.rst @@ -25,6 +25,35 @@ Functions and types |see_cpython| `python:bytes`. + .. method:: bytes.decode(encoding='utf-8', errors='strict') + + Decode the bytes object to a string using the specified *encoding*. + + MicroPython supports the following encodings: + + - ``'utf-8'`` or ``'utf8'`` - UTF-8 encoding (default) + - ``'ascii'`` - ASCII encoding (subset of UTF-8) + + The *errors* parameter controls how decoding errors are handled: + + - ``'strict'`` - Raise a ``UnicodeError`` on invalid UTF-8 (default) + - ``'ignore'`` - Skip invalid bytes (requires ``MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS``) + - ``'replace'`` - Replace invalid bytes with U+FFFD '�' (requires ``MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS``) + + .. note:: + Error handler support depends on build configuration. On constrained + systems, only ``'strict'`` mode may be available. + + Example:: + + >>> b'\xc2\xa9 2024'.decode('utf-8') # © symbol + '© 2024' + >>> b'hello\xffworld'.decode('utf-8', 'ignore') # Skip invalid bytes + 'helloworld' + + Raises ``LookupError`` if the encoding is not supported, or + ``UnicodeError`` if the data contains invalid UTF-8 and ``errors='strict'``. + .. function:: callable() .. 
function:: chr() @@ -148,6 +177,56 @@ Functions and types .. class:: str() + .. method:: str.encode(encoding='utf-8') + + Encode the string to bytes using the specified *encoding*. + + MicroPython supports the following encodings: + + - ``'utf-8'`` or ``'utf8'`` - UTF-8 encoding (default) + - ``'ascii'`` - ASCII encoding (subset of UTF-8) + + Example:: + + >>> '© 2024'.encode('utf-8') # Copyright symbol + b'\xc2\xa9 2024' + + Raises ``LookupError`` if the encoding is not supported. + + .. method:: str.center(width) + + Return a centered string of length *width*. Padding is done using spaces. + + When Unicode support is enabled (``MICROPY_PY_BUILTINS_STR_UNICODE``), this + method counts Unicode characters rather than bytes, ensuring proper alignment + for multi-byte UTF-8 characters. + + Example:: + + >>> 'café'.center(10) # é is 2 bytes in UTF-8 + '   café   ' + +String Formatting +----------------- + +MicroPython supports Unicode in string formatting when ``MICROPY_PY_BUILTINS_STR_UNICODE`` +is enabled. + +The ``%c`` and ``{:c}`` format specifiers accept Unicode codepoints in the range 0 to 0x10FFFF +(1,114,111) and properly encode multi-byte UTF-8 characters. + +Example:: + + >>> '%c' % 0x1F389 # 🎉 emoji + '🎉' + >>> '{:c}'.format(0x4E2D) # 中 (Chinese character) + '中' + +Invalid character codes raise ``OverflowError``:: + + >>> '%c' % -1 + OverflowError: %c arg not in range(0x110000) + .. function:: sum() .. function:: super() diff --git a/docs/reference/constrained.rst b/docs/reference/constrained.rst index 616dc8833fd67..60f76b9c7835a 100644 --- a/docs/reference/constrained.rst +++ b/docs/reference/constrained.rst @@ -251,7 +251,26 @@ instances so the process of eliminating Unicode can be painless. b = b'the quick brown fox' # A bytes instance Where it is necessary to convert between strings and bytes the :meth:`str.encode` -and the :meth:`bytes.decode` methods can be used. Note that both strings and bytes +and the :meth:`bytes.decode` methods can be used. 
MicroPython validates the +encoding parameter and only supports UTF-8 and ASCII. The :meth:`bytes.decode` +method also supports error handlers (``'ignore'`` and ``'replace'``) for handling +invalid UTF-8, when enabled in the build configuration. + +For memory-conscious applications processing untrusted data, using the ``'ignore'`` +error handler can be more efficient than ``'strict'`` mode (the default), as it +avoids raising exceptions while still recovering valid text:: + + # Strict mode (default) raises an error on invalid UTF-8 + try: + s = data.decode('utf-8') + except UnicodeError: + # Handle error + pass + + # Ignore mode skips invalid bytes (more memory-efficient) + s = data.decode('utf-8', 'ignore') + +Note that both strings and bytes are immutable. Any operation which takes as input such an object and produces another implies at least one RAM allocation to produce the result. In the second line below a new bytes object is allocated. This would also occur if ``foo`` diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 1558c0fdfa9b6..24fe5746eb05c 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -31,5 +31,6 @@ implementation and the best practices to use them. packages.rst asm_thumb2_index.rst filesystem.rst + unicode_support.rst pyboard.py.rst micropython2_migration.rst diff --git a/docs/reference/unicode_support.rst b/docs/reference/unicode_support.rst new file mode 100644 index 0000000000000..9efbebbb58c80 --- /dev/null +++ b/docs/reference/unicode_support.rst @@ -0,0 +1,173 @@ +.. _unicode_support: + +Unicode Support +=============== + +MicroPython provides Unicode support for strings, with the level of support +depending on the build configuration. + +Character Encoding +------------------ + +MicroPython uses UTF-8 encoding for all strings. When Unicode support is enabled +(``MICROPY_PY_BUILTINS_STR_UNICODE``), strings can contain any valid Unicode +character from U+0000 to U+10FFFF. 
+ +ASCII characters (0-127) are stored in a single byte, making them as memory-efficient +as on systems without Unicode support. Multi-byte UTF-8 characters use 2-4 bytes +depending on the codepoint: + +- U+0000 to U+007F: 1 byte (ASCII) +- U+0080 to U+07FF: 2 bytes +- U+0800 to U+FFFF: 3 bytes +- U+10000 to U+10FFFF: 4 bytes + +Encoding and Decoding +---------------------- + +The :meth:`bytes.decode` and :meth:`str.encode` methods support the following encodings: + +- UTF-8 (``'utf-8'`` or ``'utf8'``) +- ASCII (``'ascii'``) + +Other encodings (such as ``'latin-1'``, ``'utf-16'``, etc.) are not supported and +will raise ``LookupError``. + +Example:: + + >>> '日本語'.encode('utf-8') + b'\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e' + >>> b'\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e'.decode('utf-8') + '日本語' + +Error Handling +~~~~~~~~~~~~~~ + +When decoding bytes that contain invalid UTF-8 sequences, the ``errors`` parameter +of :meth:`bytes.decode` controls the behavior: + +- ``'strict'`` (default): Raise ``UnicodeError`` +- ``'ignore'``: Skip invalid bytes (requires ``MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS``) +- ``'replace'``: Replace invalid bytes with U+FFFD � (requires ``MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS``) + +Example:: + + >>> # Strict mode (default) raises an error + >>> b'hello\xffworld'.decode('utf-8') + UnicodeError: invalid UTF-8 + + >>> # Ignore mode skips invalid bytes + >>> b'hello\xffworld'.decode('utf-8', 'ignore') + 'helloworld' + + >>> # Replace mode substitutes replacement character + >>> b'hello\xffworld'.decode('utf-8', 'replace') + 'hello�world' + +For memory-conscious applications, consider using ``'ignore'`` mode when processing +untrusted or partially corrupted data, as it avoids raising exceptions while still +recovering valid text. 
+ +String Methods +-------------- + +When Unicode support is enabled, string methods operate on characters rather than bytes: + +- :meth:`str.center` - Counts Unicode characters for width calculation +- ``len(s)`` - Returns number of characters (not bytes) +- String indexing and slicing work on character boundaries +- No support for display width calculations (East Asian width, combining characters, etc.) + +Example:: + + >>> s = 'Hello 世界' + >>> len(s) # 8 characters + 8 + >>> len(s.encode()) # 12 bytes + 12 + >>> s.center(12) # Centered by character count + '  Hello 世界  ' + +String Formatting +----------------- + +The ``%c`` format specifier and ``{:c}`` format code support full Unicode: + +- Accepts codepoints from 0 to 0x10FFFF +- Properly encodes multi-byte UTF-8 characters +- Raises ``OverflowError`` for invalid codepoints + +Example:: + + >>> '%c' % 65 # ASCII + 'A' + >>> '%c' % 0x03B1 # Greek α + 'α' + >>> '%c' % 0x1F600 # Emoji 😀 + '😀' + >>> '{:c}'.format(0x4E2D) # Chinese 中 + '中' + + >>> # Invalid codepoint + >>> '%c' % 0x110000 + OverflowError: %c arg not in range(0x110000) + +F-strings also support the ``:c`` format code:: + + >>> codepoint = 0x2665 # Heart suit ♥ + >>> f'I {codepoint:c} Python' + 'I ♥ Python' + +Build Configuration +------------------- + +Unicode features are controlled by several build-time flags in ``mpconfigport.h``: + +``MICROPY_PY_BUILTINS_STR_UNICODE`` + Enable Unicode string support. When enabled, strings can contain any valid + Unicode character and string operations work on character boundaries rather + than byte boundaries. + + Default: Enabled at ``MICROPY_CONFIG_ROM_LEVEL_EXTRA_FEATURES`` and above. + +``MICROPY_PY_BUILTINS_STR_UNICODE_CHECK`` + Enable UTF-8 validation during string operations. When disabled, string + operations may produce incorrect results with invalid UTF-8 sequences. + + Default: Follows ``MICROPY_PY_BUILTINS_STR_UNICODE`` setting. 
+ +``MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS`` + Enable the ``'ignore'`` and ``'replace'`` error handlers for + :meth:`bytes.decode`. When enabled, invalid UTF-8 bytes can be either + skipped (``'ignore'``) or replaced with U+FFFD (``'replace'``). + + Default: Enabled at ``MICROPY_CONFIG_ROM_LEVEL_EXTRA_FEATURES`` and above. + +Example Configuration +~~~~~~~~~~~~~~~~~~~~~ + +For a constrained port with limited flash, disable error handlers:: + + #define MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS (0) + +For a port with more resources, enable all Unicode features:: + + #define MICROPY_CONFIG_ROM_LEVEL (MICROPY_CONFIG_ROM_LEVEL_EXTRA_FEATURES) + // This automatically enables: + // - MICROPY_PY_BUILTINS_STR_UNICODE + // - MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS + +Limitations +----------- + +MicroPython's Unicode support has some limitations compared to CPython: + +- Only UTF-8 and ASCII encodings are supported +- No support for Unicode normalization +- No locale-aware string operations +- The ``errors`` parameter accepts only positional arguments (not keyword arguments) +- String methods like ``upper()``, ``lower()``, etc. work correctly only for ASCII +- The MicroPython interactive REPL and ``input()`` function currently have very limited + Unicode support. + A workaround for this is to use utf-8 encoded MicroPython scripts containing the unicode + text, and run them using ``mpremote run ``. From b5137043c4a774afb10038beb9440ffc5c96d22b Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 12/26] tests/unicode: Remove known differences from test. Prevent the test from failing by not testing known unsupported characters. These will be documented in a cpydiff test. 
Signed-off-by: Jos Verlinde --- tests/unicode/unicode.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/unicode/unicode.py b/tests/unicode/unicode.py index 58d406e63eb2a..fbf4039a9e5c8 100644 --- a/tests/unicode/unicode.py +++ b/tests/unicode/unicode.py @@ -17,11 +17,6 @@ enc = s.encode() print(enc, enc.decode() == s) -# printing of unicode chars using repr -# NOTE: for some characters (eg \u10ff) we differ to CPython -print(repr("a\uffff")) -print(repr("a\U0001ffff")) - # test invalid escape code try: eval('"\\U00110000"') From 57e679e064a0ed33cf2f6d211941e6f5547d95e2 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 13/26] tests/cpydiff: Document unicode differences. Signed-off-by: Jos Verlinde --- tests/cpydiff/types_bytes_decode_encoding.py | 19 +++++++++++++++++++ tests/cpydiff/types_bytes_decode_kwargs.py | 19 +++++++++++++++++++ tests/cpydiff/types_str_repr_nonprintable.py | 11 +++++++++++ 3 files changed, 49 insertions(+) create mode 100644 tests/cpydiff/types_bytes_decode_encoding.py create mode 100644 tests/cpydiff/types_bytes_decode_kwargs.py create mode 100644 tests/cpydiff/types_str_repr_nonprintable.py diff --git a/tests/cpydiff/types_bytes_decode_encoding.py b/tests/cpydiff/types_bytes_decode_encoding.py new file mode 100644 index 0000000000000..62e5819a15192 --- /dev/null +++ b/tests/cpydiff/types_bytes_decode_encoding.py @@ -0,0 +1,19 @@ +""" +categories: Types,bytes +description: bytes.decode() only supports 'utf-8' and 'ascii' encodings, not other encodings like 'latin-1' +cause: MicroPython is optimized for embedded systems and only includes UTF-8 and ASCII codec support to save memory. Other encodings would require additional codec tables. +workaround: Convert data to UTF-8 before processing, or implement custom encoding/decoding if needed. 
+""" + +# CPython supports many encodings, MicroPython only utf-8 and ascii +try: + b'\xe9'.decode('latin-1') # 'é' in latin-1 + print("latin-1 supported") +except (ValueError, NotImplementedError, LookupError) as e: + print("latin-1 not supported:", type(e).__name__) + +try: + b'\x80'.decode('cp1252') # Euro sign in cp1252 + print("cp1252 supported") +except (ValueError, NotImplementedError, LookupError) as e: + print("cp1252 not supported:", type(e).__name__) diff --git a/tests/cpydiff/types_bytes_decode_kwargs.py b/tests/cpydiff/types_bytes_decode_kwargs.py new file mode 100644 index 0000000000000..e581bffbb234e --- /dev/null +++ b/tests/cpydiff/types_bytes_decode_kwargs.py @@ -0,0 +1,19 @@ +""" +categories: Types,bytes +description: bytes.decode() does not accept keyword arguments, only positional arguments +cause: MicroPython optimizes for code size and does not implement keyword argument handling for bytes.decode() +workaround: Use positional arguments instead of keyword arguments +""" + +# CPython accepts keyword arguments, MicroPython only accepts positional +b = b'hello\xffworld' + +try: + # Using keyword arguments + result = b.decode(encoding='utf-8', errors='ignore') + print("kwargs supported:", repr(result)) +except TypeError as e: + print("kwargs not supported: TypeError") + # Workaround: use positional arguments + result = b.decode('utf-8', 'ignore') + print("positional args work:", repr(result)) diff --git a/tests/cpydiff/types_str_repr_nonprintable.py b/tests/cpydiff/types_str_repr_nonprintable.py new file mode 100644 index 0000000000000..113ea3699b68b --- /dev/null +++ b/tests/cpydiff/types_str_repr_nonprintable.py @@ -0,0 +1,11 @@ +""" +categories: Types,str +description: repr() may print some non-printable Unicode characters literally instead of as escape sequences +cause: MicroPython uses a simplified heuristic to determine printable characters, avoiding the need for a full Unicode character database (saves memory). 
It prints characters >= U+0080 (excluding surrogates) as UTF-8. CPython uses the Unicode database to identify non-printable characters like noncharacters (U+FFFx in each plane). +workaround: Accept the difference for embedded use cases, or use ascii() or manual escaping if exact control is needed. +""" + +# These are noncharacters that CPython escapes but MicroPython prints +# showing as hex to avoid display issues in documentation tables +print("U+FFFF:", repr("\uffff").encode('utf-8').hex()) +print("U+1FFFF:", repr("\U0001ffff").encode('utf-8').hex()) From 4e9a5ece460063408fa7adae854f4848c33d2064 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 14/26] tests/basics/bytes_decode: Split tests for ignore/replace. This allows simpler skipping of tests based on enabled capabilities. Signed-off-by: Jos Verlinde --- tests/basics/bytes_decode_errors.py | 81 ---------------- tests/basics/bytes_decode_errors.py.exp | 14 --- tests/basics/bytes_decode_ignore.py | 102 ++++++++++++++++++++ tests/basics/bytes_decode_replace.py | 122 ++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 95 deletions(-) delete mode 100644 tests/basics/bytes_decode_errors.py delete mode 100644 tests/basics/bytes_decode_errors.py.exp create mode 100644 tests/basics/bytes_decode_ignore.py create mode 100644 tests/basics/bytes_decode_replace.py diff --git a/tests/basics/bytes_decode_errors.py b/tests/basics/bytes_decode_errors.py deleted file mode 100644 index 297c843dba881..0000000000000 --- a/tests/basics/bytes_decode_errors.py +++ /dev/null @@ -1,81 +0,0 @@ -# Test bytes.decode() with error handlers - -# Check if decode method is available (requires MICROPY_CPYTHON_COMPAT) -try: - b''.decode() -except AttributeError: - print("SKIP") - raise SystemExit - -# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_IGNORE) -# When feature is disabled, invalid UTF-8 raises UnicodeError even with 'ignore' -# When feature is 
enabled, invalid UTF-8 with 'ignore' returns a string -try: - result = b'\xff'.decode('utf-8', 'ignore') - # If we get here, feature is available -except UnicodeError: - # Feature not available - 'ignore' was ignored, strict mode was used - print("SKIP") - raise SystemExit - -# Test ignore mode with invalid UTF-8 -print(repr(b'\xff\xfe'.decode('utf-8', 'ignore'))) - -# Test strict mode (default) with invalid UTF-8 -try: - b'\xff\xfe'.decode('utf-8') - print('UNEXPECTED') -except UnicodeError: - print('UnicodeError') - -# Test strict mode (explicit) with invalid UTF-8 -try: - b'\xff\xfe'.decode('utf-8', 'strict') - print('UNEXPECTED') -except UnicodeError: - print('UnicodeError') - -# Test with valid UTF-8 -print(repr(b'hello'.decode('utf-8', 'ignore'))) - -# Test valid UTF-8 with default mode -print(repr(b'hello'.decode('utf-8'))) - -# Test mixed valid and invalid UTF-8 -print(repr(b'hello\xffworld'.decode('utf-8', 'ignore'))) - -# Test multiple invalid bytes -print(repr(b'\x80\x81\x82'.decode('utf-8', 'ignore'))) - -# Test invalid continuation byte -print(repr(b'\xc0\x20'.decode('utf-8', 'ignore'))) - -# Test incomplete sequence at end -print(repr(b'hello\xc0'.decode('utf-8', 'ignore'))) - -# Test valid multi-byte UTF-8 (© symbol) -print(repr(b'\xc2\xa9'.decode('utf-8', 'ignore'))) - -# Test bytearray support -print(repr(bytearray(b'\xff\xfe').decode('utf-8', 'ignore'))) - -# Test replace mode - should either work or raise NotImplementedError -try: - result = b'\xff\xfe'.decode('utf-8', 'replace') - print(repr(result)) -except NotImplementedError: - print('NotImplementedError') - -# Test replace with valid UTF-8 -try: - result = b'hello'.decode('utf-8', 'replace') - print(repr(result)) -except NotImplementedError: - print('NotImplementedError') - -# Test replace with mixed content -try: - result = b'hello\xffworld'.decode('utf-8', 'replace') - print(repr(result)) -except NotImplementedError: - print('NotImplementedError') diff --git 
a/tests/basics/bytes_decode_errors.py.exp b/tests/basics/bytes_decode_errors.py.exp deleted file mode 100644 index f9126debb4f8a..0000000000000 --- a/tests/basics/bytes_decode_errors.py.exp +++ /dev/null @@ -1,14 +0,0 @@ -'' -UnicodeError -UnicodeError -'hello' -'hello' -'helloworld' -'' -' ' -'hello' -'\xa9' -'' -'\ufffd\ufffd' -'hello' -'hello\ufffdworld' diff --git a/tests/basics/bytes_decode_ignore.py b/tests/basics/bytes_decode_ignore.py new file mode 100644 index 0000000000000..438380beaff40 --- /dev/null +++ b/tests/basics/bytes_decode_ignore.py @@ -0,0 +1,102 @@ +# Test bytes.decode() with error handler 'ignore' + +# Check if decode method is available (requires MICROPY_CPYTHON_COMPAT) +try: + b''.decode() +except AttributeError: + print("SKIP") + raise SystemExit + +# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_IGNORE) +# When feature is disabled, invalid UTF-8 raises LookupError even with 'ignore' +# When feature is enabled, invalid UTF-8 with 'ignore' returns a string +try: + result = b'\xff'.decode('utf-8', 'ignore') + # If we get here, feature is available +except (UnicodeError, LookupError): + # Feature not available - 'ignore' was ignored, strict mode was used + print("SKIP") + raise SystemExit + +# Test ignore mode with invalid UTF-8 +print(repr(b'\xff\xfe'.decode('utf-8', 'ignore'))) + +# Test strict mode (default) with invalid UTF-8 +try: + b'\xff\xfe'.decode('utf-8') + print('UNEXPECTED') +except UnicodeError: + print('UnicodeError') + +# Test strict mode (explicit) with invalid UTF-8 +try: + b'\xff\xfe'.decode('utf-8', 'strict') + print('UNEXPECTED') +except UnicodeError: + print('UnicodeError') + +# Test with valid UTF-8 +print(repr(b'hello'.decode('utf-8', 'ignore'))) + +# Test valid UTF-8 with default mode +print(repr(b'hello'.decode('utf-8'))) + +# Test mixed valid and invalid UTF-8 +print(repr(b'hello\xffworld'.decode('utf-8', 'ignore'))) + +# Test multiple invalid bytes 
+print(repr(b'\x80\x81\x82'.decode('utf-8', 'ignore'))) + +# Test invalid continuation byte +print(repr(b'\xc0\x20'.decode('utf-8', 'ignore'))) + +# Test incomplete sequence at end +print(repr(b'hello\xc0'.decode('utf-8', 'ignore'))) + +# Test valid multi-byte UTF-8 (© symbol) +print(repr(b'\xc2\xa9'.decode('utf-8', 'ignore'))) + +# Test bytearray support +print(repr(bytearray(b'\xff\xfe').decode('utf-8', 'ignore'))) + +# Additional tests for continuation byte validation and incomplete sequences + +# Test 3-byte UTF-8 sequence - valid (e.g., U+4E00 - 一) +print(repr(b'\xe4\xb8\x80'.decode('utf-8', 'ignore'))) + +# Test 4-byte UTF-8 sequence - valid (e.g., U+1F600 - 😀) +print(repr(b'\xf0\x9f\x98\x80'.decode('utf-8', 'ignore'))) + +# Test incomplete 3-byte sequence (missing 2 continuation bytes) +print(repr(b'\xe4'.decode('utf-8', 'ignore'))) + +# Test incomplete 3-byte sequence (missing 1 continuation byte) +print(repr(b'\xe4\xb8'.decode('utf-8', 'ignore'))) + +# Test incomplete 4-byte sequence (missing 3 continuation bytes) +print(repr(b'\xf0'.decode('utf-8', 'ignore'))) + +# Test incomplete 4-byte sequence (missing 2 continuation bytes) +print(repr(b'\xf0\x9f'.decode('utf-8', 'ignore'))) + +# Test incomplete 4-byte sequence (missing 1 continuation byte) +print(repr(b'\xf0\x9f\x98'.decode('utf-8', 'ignore'))) + +# Test 3-byte sequence with invalid continuation byte (first byte invalid) +print(repr(b'\xe4\x20\x80'.decode('utf-8', 'ignore'))) + +# Test 3-byte sequence with invalid continuation byte (second byte invalid) +print(repr(b'\xe4\xb8\x20'.decode('utf-8', 'ignore'))) + +# Test 4-byte sequence with invalid continuation bytes +print(repr(b'\xf0\x20\x98\x80'.decode('utf-8', 'ignore'))) +print(repr(b'\xf0\x9f\x20\x80'.decode('utf-8', 'ignore'))) +print(repr(b'\xf0\x9f\x98\x20'.decode('utf-8', 'ignore'))) + +# Test mixed valid and incomplete sequences +print(repr(b'hello\xe4world'.decode('utf-8', 'ignore'))) +print(repr(b'hello\xf0world'.decode('utf-8', 'ignore'))) 
+ +# Test multiple incomplete sequences in a row +print(repr(b'\xe4\xf0\xe4'.decode('utf-8', 'ignore'))) + diff --git a/tests/basics/bytes_decode_replace.py b/tests/basics/bytes_decode_replace.py new file mode 100644 index 0000000000000..22295a5781812 --- /dev/null +++ b/tests/basics/bytes_decode_replace.py @@ -0,0 +1,122 @@ +# Test bytes.decode() with error handler 'replace' + +# Check if decode method is available (requires MICROPY_CPYTHON_COMPAT) +try: + b''.decode() +except AttributeError: + print("SKIP") + raise SystemExit + +# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE) +# When feature is disabled, invalid UTF-8 raises UnicodeError even with 'replace' +# When feature is enabled, invalid UTF-8 with 'replace' returns a string +try: + result = b'\xff'.decode('utf-8', 'replace') + # If we get here, feature is available +except (UnicodeError, LookupError): + # Feature not available - 'replace' was ignored, strict mode was used + print("SKIP") + raise SystemExit + +# Test replace mode with invalid UTF-8 +print(repr(b'\xff\xfe'.decode('utf-8', 'replace'))) + +# Test strict mode (default) with invalid UTF-8 +try: + b'\xff\xfe'.decode('utf-8') + print('UNEXPECTED') +except UnicodeError: + print('UnicodeError') + +# Test strict mode (explicit) with invalid UTF-8 +try: + b'\xff\xfe'.decode('utf-8', 'strict') + print('UNEXPECTED') +except UnicodeError: + print('UnicodeError') + +# Test with valid UTF-8 +print(repr(b'hello'.decode('utf-8', 'replace'))) + +# Test valid UTF-8 with default mode +print(repr(b'hello'.decode('utf-8'))) + +# Test mixed valid and invalid UTF-8 +print(repr(b'hello\xffworld'.decode('utf-8', 'replace'))) + +# Test multiple invalid bytes +print(repr(b'\x80\x81\x82'.decode('utf-8', 'replace'))) + +# Test invalid continuation byte +print(repr(b'\xc0\x20'.decode('utf-8', 'replace'))) + +# Test incomplete sequence at end +print(repr(b'hello\xc0'.decode('utf-8', 'replace'))) + +# Test valid multi-byte UTF-8 (© 
symbol) +print(repr(b'\xc2\xa9'.decode('utf-8', 'replace'))) + +# Test bytearray support +print(repr(bytearray(b'\xff\xfe').decode('utf-8', 'replace'))) + +# Test replace mode - should either work or raise LookupError +try: + result = b'\xff\xfe'.decode('utf-8', 'replace') + print(repr(result)) +except LookupError: + print('LookupError') + +# Test replace with valid UTF-8 +try: + result = b'hello'.decode('utf-8', 'replace') + print(repr(result)) +except LookupError: + print('LookupError') + +# Test replace with mixed content +try: + result = b'hello\xffworld'.decode('utf-8', 'replace') + print(repr(result)) +except LookupError: + print('LookupError') + +# Additional tests for continuation byte validation and incomplete sequences + +# Test 3-byte UTF-8 sequence - valid (e.g., U+4E00 - 一) +print(repr(b'\xe4\xb8\x80'.decode('utf-8', 'replace'))) + +# Test 4-byte UTF-8 sequence - valid (e.g., U+1F600 - 😀) +print(repr(b'\xf0\x9f\x98\x80'.decode('utf-8', 'replace'))) + +# Test incomplete 3-byte sequence (missing 2 continuation bytes) +print(repr(b'\xe4'.decode('utf-8', 'replace'))) + +# Test incomplete 3-byte sequence (missing 1 continuation byte) +print(repr(b'\xe4\xb8'.decode('utf-8', 'replace'))) + +# Test incomplete 4-byte sequence (missing 3 continuation bytes) +print(repr(b'\xf0'.decode('utf-8', 'replace'))) + +# Test incomplete 4-byte sequence (missing 2 continuation bytes) +print(repr(b'\xf0\x9f'.decode('utf-8', 'replace'))) + +# Test incomplete 4-byte sequence (missing 1 continuation byte) +print(repr(b'\xf0\x9f\x98'.decode('utf-8', 'replace'))) + +# Test 3-byte sequence with invalid continuation byte (first byte invalid) +print(repr(b'\xe4\x20\x80'.decode('utf-8', 'replace'))) + +# Test 3-byte sequence with invalid continuation byte (second byte invalid) +print(repr(b'\xe4\xb8\x20'.decode('utf-8', 'replace'))) + +# Test 4-byte sequence with invalid continuation bytes +print(repr(b'\xf0\x20\x98\x80'.decode('utf-8', 'replace'))) 
+print(repr(b'\xf0\x9f\x20\x80'.decode('utf-8', 'replace'))) +print(repr(b'\xf0\x9f\x98\x20'.decode('utf-8', 'replace'))) + +# Test mixed valid and incomplete sequences +print(repr(b'hello\xe4world'.decode('utf-8', 'replace'))) +print(repr(b'hello\xf0world'.decode('utf-8', 'replace'))) + +# Test multiple incomplete sequences in a row +print(repr(b'\xe4\xf0\xe4'.decode('utf-8', 'replace'))) From 0c3dd02aa4cde43f075c82c0f5ba80d811e75d96 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 15/26] tests/unicode/unicode_char_format: Test unicode character formatting. Signed-off-by: Jos Verlinde --- tests/unicode/unicode_char_format.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/unicode/unicode_char_format.py b/tests/unicode/unicode_char_format.py index 3e4d805472a77..937806a6ae695 100644 --- a/tests/unicode/unicode_char_format.py +++ b/tests/unicode/unicode_char_format.py @@ -1,6 +1,8 @@ # test %c formatting with unicode characters (issue #3364) # tests that character codes >= 128 are properly encoded as UTF-8 +print("%c%c" % (0x3BC, 0x1F40D)) # Greek letter mu and snake emoji + # ASCII character print("%c" % 65) @@ -18,9 +20,39 @@ # test with .format() method print("{:c}".format(169)) print("{:c}".format(0x4E00)) +print("{:c}{:c}".format(0x3BC, 0x1F40D)) # test with f-strings c = 169 print(f"{c:c}") c = 0x1F600 print(f"{c:c}") + +# Test boundary values - valid maximum unicode codepoint +print("%c" % 0x10FFFF) # Last valid unicode codepoint + +# Test invalid codepoint - >= 0x110000 should raise OverflowError +try: + print("%c" % 0x110000) + print("UNEXPECTED: should have raised OverflowError") +except OverflowError: + print("OverflowError") + +try: + print("%c" % 0x110001) + print("UNEXPECTED: should have raised OverflowError") +except OverflowError: + print("OverflowError") + +# Test format() method with invalid codepoint +try: + print("{:c}".format(0x110000)) + print("UNEXPECTED: 
should have raised OverflowError") +except OverflowError: + print("OverflowError") + +try: + print("{:c}".format(0x200000)) + print("UNEXPECTED: should have raised OverflowError") +except OverflowError: + print("OverflowError") From 8fae63dc7cd933188cbb04974e59e65aadbfba7b Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 16/26] test/cpydiff: Add tests to document Unicode differences. Signed-off-by: Jos Verlinde --- tests/cpydiff/types_bytes_decode_encoding.py | 4 ++-- tests/cpydiff/types_bytes_decode_kwargs.py | 6 +++--- tests/cpydiff/types_str_repr_nonprintable.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/cpydiff/types_bytes_decode_encoding.py b/tests/cpydiff/types_bytes_decode_encoding.py index 62e5819a15192..e4564d20ee0e2 100644 --- a/tests/cpydiff/types_bytes_decode_encoding.py +++ b/tests/cpydiff/types_bytes_decode_encoding.py @@ -7,13 +7,13 @@ # CPython supports many encodings, MicroPython only utf-8 and ascii try: - b'\xe9'.decode('latin-1') # 'é' in latin-1 + b"\xe9".decode("latin-1") # 'é' in latin-1 print("latin-1 supported") except (ValueError, NotImplementedError, LookupError) as e: print("latin-1 not supported:", type(e).__name__) try: - b'\x80'.decode('cp1252') # Euro sign in cp1252 + b"\x80".decode("cp1252") # Euro sign in cp1252 print("cp1252 supported") except (ValueError, NotImplementedError, LookupError) as e: print("cp1252 not supported:", type(e).__name__) diff --git a/tests/cpydiff/types_bytes_decode_kwargs.py b/tests/cpydiff/types_bytes_decode_kwargs.py index e581bffbb234e..a1ca2b74c5cfe 100644 --- a/tests/cpydiff/types_bytes_decode_kwargs.py +++ b/tests/cpydiff/types_bytes_decode_kwargs.py @@ -6,14 +6,14 @@ """ # CPython accepts keyword arguments, MicroPython only accepts positional -b = b'hello\xffworld' +b = b"hello\xffworld" try: # Using keyword arguments - result = b.decode(encoding='utf-8', errors='ignore') + result = b.decode(encoding="utf-8", 
errors="ignore") print("kwargs supported:", repr(result)) except TypeError as e: print("kwargs not supported: TypeError") # Workaround: use positional arguments - result = b.decode('utf-8', 'ignore') + result = b.decode("utf-8", "ignore") print("positional args work:", repr(result)) diff --git a/tests/cpydiff/types_str_repr_nonprintable.py b/tests/cpydiff/types_str_repr_nonprintable.py index 113ea3699b68b..6f9af65d8e1f2 100644 --- a/tests/cpydiff/types_str_repr_nonprintable.py +++ b/tests/cpydiff/types_str_repr_nonprintable.py @@ -7,5 +7,5 @@ # These are noncharacters that CPython escapes but MicroPython prints # showing as hex to avoid display issues in documentation tables -print("U+FFFF:", repr("\uffff").encode('utf-8').hex()) -print("U+1FFFF:", repr("\U0001ffff").encode('utf-8').hex()) +print("U+FFFF:", repr("\uffff").encode("utf-8").hex()) +print("U+1FFFF:", repr("\U0001ffff").encode("utf-8").hex()) From 5c676e13eaa2fc1d0837572e12d87e93a1b9c3a7 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 17/26] tests: Update t-string test cases for unicode. 
Signed-off-by: Jos Verlinde --- tests/basics/string_tstring_basic1.py.exp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/basics/string_tstring_basic1.py.exp b/tests/basics/string_tstring_basic1.py.exp index 52fc6f6c94c8b..7f2b0d85b6029 100644 --- a/tests/basics/string_tstring_basic1.py.exp +++ b/tests/basics/string_tstring_basic1.py.exp @@ -16,14 +16,14 @@ Template(strings=('\\k',), interpolations=()) Invalid \x escape: SyntaxError Invalid \u escape: SyntaxError Invalid \U escape: SyntaxError -Template(strings=('\x00\x01\xff',), interpolations=()) +Template(strings=('\x00\x01ÿ',), interpolations=()) Template(strings=('A',), interpolations=()) -Template(strings=('\u03b1',), interpolations=()) -Template(strings=('\u2764',), interpolations=()) +Template(strings=('α',), interpolations=()) +Template(strings=('❤',), interpolations=()) Template(strings=('A',), interpolations=()) -Template(strings=('\U0001f600',), interpolations=()) +Template(strings=('😀',), interpolations=()) Template(strings=('ABC',), interpolations=()) -Unicode: Template(strings=('Unicode test:\nEmoji: ', '\nSpecial: ', ''), interpolations=(Interpolation('\U0001f40d', "'\\U0001f40d'", None, ''), Interpolation('\u03b1 \u03b2 \u03b3', "'\\u03b1 \\u03b2 \\u03b3'", None, ''))) +Unicode: Template(strings=('Unicode test:\nEmoji: ', '\nSpecial: ', ''), interpolations=(Interpolation('🐍', "'\\U0001f40d'", None, ''), Interpolation('α β γ', "'\\u03b1 \\u03b2 \\u03b3'", None, ''))) === Trailing whitespace preservation (PEP 750) === Expression with trailing spaces: |x| From dfafe93bc707a959ecf2f446360a3f177e947bcf Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 18/26] py/objstrunicode: Remove dead code from unicode escape handling. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed the dead \U%08x branch in uni_print_quoted. 
Characters ≥ 0x110000 are impossible in valid UTF-8, so the branch was unreachable. It's replaced by a single else that handles surrogates (0xD800–0xDFFF) with \u%04x. Signed-off-by: Jos Verlinde --- py/objstrunicode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/py/objstrunicode.c b/py/objstrunicode.c index caccdfc0ffc93..c105903c2e978 100644 --- a/py/objstrunicode.c +++ b/py/objstrunicode.c @@ -82,10 +82,9 @@ static void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint print->print_strn(print->data, (const char *)seq_start, seq_len); } else if (ch < 0x100) { mp_printf(print, "\\x%02x", ch); - } else if (ch < 0x10000) { - mp_printf(print, "\\u%04x", ch); } else { - mp_printf(print, "\\U%08x", ch); + // Surrogate character (0xD800-0xDFFF) - use \uXXXX escape + mp_printf(print, "\\u%04x", ch); } } mp_printf(print, "%c", quote_char); From 17d1518a510c1cd8f84ac2aa8f88e859cc6f6cf1 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:48:58 +0200 Subject: [PATCH 19/26] tests: Enhance byte and string decoding tests. Added multi-byte sequences to improve test coverage. 
Signed-off-by: Jos Verlinde --- tests/basics/bytes_decode_encoding.py | 4 ++++ tests/basics/bytes_decode_encoding.py.exp | 2 ++ tests/basics/bytes_decode_ignore.py | 6 +++++- tests/basics/bytes_decode_replace.py | 6 +++++- tests/unicode/unicode.py | 10 ++++++++++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/basics/bytes_decode_encoding.py b/tests/basics/bytes_decode_encoding.py index ff4bbcc44d8b3..8aa8c8ffd32f0 100644 --- a/tests/basics/bytes_decode_encoding.py +++ b/tests/basics/bytes_decode_encoding.py @@ -43,6 +43,10 @@ except LookupError as e: print('LookupError:', encoding) +# Test bytes method accepting bytearray as argument (arg type normalization) +print(b'hello world'.find(bytearray(b'world'))) +print(bytearray(b'hello world').find(bytearray(b'world'))) + # Test invalid encodings for str.encode() for encoding in invalid_encodings: try: diff --git a/tests/basics/bytes_decode_encoding.py.exp b/tests/basics/bytes_decode_encoding.py.exp index 45a0660fc591f..dff323dd3c021 100644 --- a/tests/basics/bytes_decode_encoding.py.exp +++ b/tests/basics/bytes_decode_encoding.py.exp @@ -15,6 +15,8 @@ LookupError: utf-16 LookupError: utf-32 LookupError: iso-8859-1 LookupError: cp1252 +6 +6 LookupError: latin-1 LookupError: latin1 LookupError: utf-16 diff --git a/tests/basics/bytes_decode_ignore.py b/tests/basics/bytes_decode_ignore.py index 438380beaff40..4adaeb3b72d21 100644 --- a/tests/basics/bytes_decode_ignore.py +++ b/tests/basics/bytes_decode_ignore.py @@ -7,7 +7,7 @@ print("SKIP") raise SystemExit -# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_IGNORE) +# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS) # When feature is disabled, invalid UTF-8 raises LookupError even with 'ignore' # When feature is enabled, invalid UTF-8 with 'ignore' returns a string try: @@ -97,6 +97,10 @@ print(repr(b'hello\xe4world'.decode('utf-8', 'ignore'))) 
print(repr(b'hello\xf0world'.decode('utf-8', 'ignore'))) +# Test valid multi-byte sequence mixed with invalid bytes (exercises got==need path) +print(repr(b'\xff\xc2\xa9'.decode('utf-8', 'ignore'))) # © preserved after invalid \xff +print(repr(b'\xff\xe4\xb8\x80'.decode('utf-8', 'ignore'))) # 一 preserved after invalid \xff + # Test multiple incomplete sequences in a row print(repr(b'\xe4\xf0\xe4'.decode('utf-8', 'ignore'))) diff --git a/tests/basics/bytes_decode_replace.py b/tests/basics/bytes_decode_replace.py index 22295a5781812..fd3ce6d9fd4bb 100644 --- a/tests/basics/bytes_decode_replace.py +++ b/tests/basics/bytes_decode_replace.py @@ -7,7 +7,7 @@ print("SKIP") raise SystemExit -# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE) +# Check if error handlers are available (requires MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS) # When feature is disabled, invalid UTF-8 raises UnicodeError even with 'replace' # When feature is enabled, invalid UTF-8 with 'replace' returns a string try: @@ -88,6 +88,10 @@ # Test 4-byte UTF-8 sequence - valid (e.g., U+1F600 - 😀) print(repr(b'\xf0\x9f\x98\x80'.decode('utf-8', 'replace'))) +# Test valid multi-byte sequence mixed with invalid bytes (exercises got==need path) +print(repr(b'\xff\xc2\xa9'.decode('utf-8', 'replace'))) # \ufffd + © after invalid \xff +print(repr(b'\xff\xe4\xb8\x80'.decode('utf-8', 'replace'))) # \ufffd + 一 after invalid \xff + # Test incomplete 3-byte sequence (missing 2 continuation bytes) print(repr(b'\xe4'.decode('utf-8', 'replace'))) diff --git a/tests/unicode/unicode.py b/tests/unicode/unicode.py index fbf4039a9e5c8..97f0599ab019d 100644 --- a/tests/unicode/unicode.py +++ b/tests/unicode/unicode.py @@ -46,3 +46,13 @@ str(b"\xf0\xe0\xed\xe8", "utf8") except UnicodeError: print("UnicodeError") + +# test surrogate repr uses \uXXXX escape +print(repr(chr(0xD800))) + +# test str() from buffer-protocol object (memoryview) +print(str(memoryview(b"hello"), "utf-8")) +try: + 
str(memoryview(b"\xff"), "utf-8") +except UnicodeError: + print("UnicodeError") From 2f93c22e72d8634a4eda0a364818753a2d76d930 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:49:48 +0200 Subject: [PATCH 20/26] py/objstr: Optimize character handling and encoding validation. Signed-off-by: Jos Verlinde --- py/objstr.c | 49 +++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index 63b72c50399aa..503e5cf4203c0 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -246,24 +246,12 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ // Data has invalid UTF-8, handle based on error mode #if MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS // Error handlers are enabled - #if !MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE - // Raise NotImplementedError if 'replace' is used but not enabled - if (strcmp(errors, "replace") == 0) { - mp_raise_NotImplementedError(NULL); - } - #endif if (strcmp(errors, "ignore") == 0 - #if MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE || strcmp(errors, "replace") == 0 - #endif ) { // Build new string skipping/replacing invalid bytes - #if MICROPY_PY_BUILTINS_BYTES_DECODE_REPLACE bool do_replace = strcmp(errors, "replace") == 0; - #else - const bool do_replace = false; - #endif vstr_t vstr; vstr_init(&vstr, str_len); const byte *p = str_data; @@ -1445,7 +1433,6 @@ static vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *ar VSTR_FIXED(ch_vstr, 4); vstr_add_char(&ch_vstr, c); mp_print_strn(&print, ch_vstr.buf, ch_vstr.len, flags, fill, width); - vstr_clear(&ch_vstr); #else char ch = mp_obj_get_int(arg); mp_print_strn(&print, &ch, 1, flags, fill, width); @@ -1747,12 +1734,11 @@ static mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_ if (c >= 0x110000) { mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("%c arg not in range(0x110000)")); } + char ch_buf[4]; vstr_t ch_vstr; - 
vstr_init_len(&ch_vstr, 4); - ch_vstr.len = 0; + vstr_init_fixed_buf(&ch_vstr, sizeof(ch_buf), ch_buf); vstr_add_char(&ch_vstr, c); mp_print_strn(&print, ch_vstr.buf, ch_vstr.len, flags, ' ', width); - vstr_clear(&ch_vstr); #else char ch = mp_obj_get_int(arg); mp_print_strn(&print, &ch, 1, flags, ' ', width); @@ -2117,6 +2103,15 @@ MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); #if MICROPY_CPYTHON_COMPAT // These methods are superfluous in the presence of str() and bytes() // constructors. + +static void check_utf8_encoding(const char *encoding) { + if (!(strcmp(encoding, "utf-8") == 0 || strcmp(encoding, "utf8") == 0 || + strcmp(encoding, "ascii") == 0)) { + mp_raise_msg_varg(&mp_type_LookupError, + MP_ERROR_TEXT("encoding not supported: %s"), encoding); + } +} + // TODO: should accept kwargs too static mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) { mp_obj_t new_args[3]; @@ -2126,16 +2121,7 @@ static mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) { args = new_args; n_args++; } else if (n_args >= 2) { - // Validate encoding parameter - // MicroPython only supports UTF-8 encoding - const char *encoding = mp_obj_str_get_str(args[1]); - - // Accept utf-8 and ascii (ascii is a subset of utf-8) - if (!(strcmp(encoding, "utf-8") == 0 || strcmp(encoding, "utf8") == 0 || - strcmp(encoding, "ascii") == 0)) { - mp_raise_msg_varg(&mp_type_LookupError, - MP_ERROR_TEXT("encoding not supported: %s"), encoding); - } + check_utf8_encoding(mp_obj_str_get_str(args[1])); } return mp_obj_str_make_new(&mp_type_str, n_args, 0, args); } @@ -2150,16 +2136,7 @@ static mp_obj_t str_encode(size_t n_args, const mp_obj_t *args) { args = new_args; n_args++; } else if (n_args >= 2) { - // Validate encoding parameter - // MicroPython only supports UTF-8 encoding - const char *encoding = mp_obj_str_get_str(args[1]); - - // Accept utf-8 and ascii (ascii is a subset of utf-8) - if (!(strcmp(encoding, "utf-8") == 0 || strcmp(encoding, "utf8") == 0 || - 
strcmp(encoding, "ascii") == 0)) { - mp_raise_msg_varg(&mp_type_LookupError, - MP_ERROR_TEXT("encoding not supported: %s"), encoding); - } + check_utf8_encoding(mp_obj_str_get_str(args[1])); } return bytes_make_new(NULL, n_args, 0, args); } From 7e13ed013e30939c331d9da2d2f7f9a8719448fe Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:51:32 +0200 Subject: [PATCH 21/26] run-tests: Specify UTF-8 encoding when opening test files. + Correct a few typos in comments. Signed-off-by: Jos Verlinde --- tests/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/run-tests.py b/tests/run-tests.py index aeb06661fc7bd..3de9274f17304 100755 --- a/tests/run-tests.py +++ b/tests/run-tests.py @@ -1152,7 +1152,7 @@ def run_one_test(test_file): # Print a note if this looks like it might have been a misfired unittest if not uses_unittest and not test_passed: - with open(test_file, "r") as f: + with open(test_file, "r", encoding="utf-8") as f: if any(re.match("^import.+unittest", l) for l in f.readlines()): print( "NOTE: {} may be a unittest that doesn't run unittest.main()".format( From 4d26ee4bf8c487c8472b1a2d555f84ed34826c34 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:51:32 +0200 Subject: [PATCH 22/26] py/objstr: Refactor to use mp_print_char helper function. Signed-off-by: Jos Verlinde --- py/objstr.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index 503e5cf4203c0..73e2a1e02a467 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -1136,6 +1136,23 @@ static MP_NORETURN void terse_str_format_value_error(void) { #define terse_str_format_value_error() #endif +// Print the character with the code point given by the integer object arg. +// Used by both the str.format and the modulo (%c) formatters. 
+static void mp_print_char(const mp_print_t *print, mp_obj_t arg, unsigned int flags, char fill, int width) { + #if MICROPY_PY_BUILTINS_STR_UNICODE + mp_uint_t c = mp_obj_get_int(arg); + if (c >= 0x110000) { + mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("%c arg not in range(0x110000)")); + } + VSTR_FIXED(ch_vstr, 4); + vstr_add_char(&ch_vstr, c); + mp_print_strn(print, ch_vstr.buf, ch_vstr.len, flags, fill, width); + #else + char ch = mp_obj_get_int(arg); + mp_print_strn(print, &ch, 1, flags, fill, width); + #endif +} + static vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *arg_i, size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) { vstr_t vstr; mp_print_t print; @@ -1425,18 +1442,7 @@ static vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *ar continue; case 'c': { - #if MICROPY_PY_BUILTINS_STR_UNICODE - mp_uint_t c = mp_obj_get_int(arg); - if (c >= 0x110000) { - mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("chr() arg not in range(0x110000)")); - } - VSTR_FIXED(ch_vstr, 4); - vstr_add_char(&ch_vstr, c); - mp_print_strn(&print, ch_vstr.buf, ch_vstr.len, flags, fill, width); - #else - char ch = mp_obj_get_int(arg); - mp_print_strn(&print, &ch, 1, flags, fill, width); - #endif + mp_print_char(&print, arg, flags, fill, width); continue; } @@ -1729,20 +1735,7 @@ static mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_ } mp_print_strn(&print, s, 1, flags, ' ', width); } else if (arg_looks_integer(arg)) { - #if MICROPY_PY_BUILTINS_STR_UNICODE - mp_uint_t c = mp_obj_get_int(arg); - if (c >= 0x110000) { - mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("%c arg not in range(0x110000)")); - } - char ch_buf[4]; - vstr_t ch_vstr; - vstr_init_fixed_buf(&ch_vstr, sizeof(ch_buf), ch_buf); - vstr_add_char(&ch_vstr, c); - mp_print_strn(&print, ch_vstr.buf, ch_vstr.len, flags, ' ', width); - #else - char ch = mp_obj_get_int(arg); - mp_print_strn(&print, &ch, 1, flags, ' ', width); - #endif + 
mp_print_char(&print, arg, flags, ' ', width); } else { mp_raise_TypeError(MP_ERROR_TEXT("integer needed")); } From e6e4846f03e7650c2c5488bc70661d50b577e0e2 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Sat, 6 Jun 2026 21:51:32 +0200 Subject: [PATCH 23/26] refactor: Use QSTR and common error message. Signed-off-by: Jos Verlinde --- py/modbuiltins.c | 2 +- py/objstr.c | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/py/modbuiltins.c b/py/modbuiltins.c index eabc562f19d68..caa8c28dcb9eb 100644 --- a/py/modbuiltins.c +++ b/py/modbuiltins.c @@ -138,7 +138,7 @@ static mp_obj_t mp_builtin_chr(mp_obj_t o_in) { #if MICROPY_PY_BUILTINS_STR_UNICODE mp_uint_t c = mp_obj_get_int(o_in); if (c >= 0x110000) { - mp_raise_ValueError(MP_ERROR_TEXT("chr() arg not in range(0x110000)")); + mp_raise_ValueError(MP_ERROR_TEXT("char not in range(0x110000)")); } VSTR_FIXED(buf, 4); vstr_add_char(&buf, c); diff --git a/py/objstr.c b/py/objstr.c index 73e2a1e02a467..aab017762f761 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -223,9 +223,9 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK #if MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS // Check if error handler is specified (3rd argument) - const char *errors = "strict"; + qstr errors = MP_QSTR_; // default to "" if (n_args >= 3 && args[2] != mp_const_none) { - errors = mp_obj_str_get_str(args[2]); + errors = mp_obj_str_get_qstr(args[2]); } #endif @@ -247,11 +247,9 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ #if MICROPY_PY_BUILTINS_BYTES_DECODE_ERRORS // Error handlers are enabled - if (strcmp(errors, "ignore") == 0 - || strcmp(errors, "replace") == 0 - ) { + if (errors == MP_QSTR_ignore || errors == MP_QSTR_replace) { // Build new string skipping/replacing invalid bytes - bool do_replace = strcmp(errors, "replace") == 0; + bool do_replace = (errors == MP_QSTR_replace); vstr_t vstr; 
vstr_init(&vstr, str_len); const byte *p = str_data; @@ -1139,10 +1137,10 @@ static MP_NORETURN void terse_str_format_value_error(void) { // Print the character with the code point given by the integer object arg. // Used by both the str.format and the modulo (%c) formatters. static void mp_print_char(const mp_print_t *print, mp_obj_t arg, unsigned int flags, char fill, int width) { - #if MICROPY_PY_BUILTINS_STR_UNICODE + #if MICROPY_FULL_CHECKS mp_uint_t c = mp_obj_get_int(arg); if (c >= 0x110000) { - mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("%c arg not in range(0x110000)")); + mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("char not in range(0x110000)")); } VSTR_FIXED(ch_vstr, 4); vstr_add_char(&ch_vstr, c); @@ -2097,11 +2095,11 @@ MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); // These methods are superfluous in the presence of str() and bytes() // constructors. -static void check_utf8_encoding(const char *encoding) { - if (!(strcmp(encoding, "utf-8") == 0 || strcmp(encoding, "utf8") == 0 || - strcmp(encoding, "ascii") == 0)) { +static void check_utf8_encoding(qstr encoding) { + if (!(encoding == MP_QSTR_utf_hyphen_8 || encoding == MP_QSTR_utf8 || + encoding == MP_QSTR_ascii)) { mp_raise_msg_varg(&mp_type_LookupError, - MP_ERROR_TEXT("encoding not supported: %s"), encoding); + MP_ERROR_TEXT("encoding not supported: %q"), encoding); } } @@ -2114,7 +2112,7 @@ static mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) { args = new_args; n_args++; } else if (n_args >= 2) { - check_utf8_encoding(mp_obj_str_get_str(args[1])); + check_utf8_encoding(mp_obj_str_get_qstr(args[1])); } return mp_obj_str_make_new(&mp_type_str, n_args, 0, args); } @@ -2129,7 +2127,7 @@ static mp_obj_t str_encode(size_t n_args, const mp_obj_t *args) { args = new_args; n_args++; } else if (n_args >= 2) { - check_utf8_encoding(mp_obj_str_get_str(args[1])); + check_utf8_encoding(mp_obj_str_get_qstr(args[1])); } return bytes_make_new(NULL, n_args, 0, args); } From 
80fd7d69e9219b773666002302da761ca0a5f1d9 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:02 +0200 Subject: [PATCH 24/26] py/objexcept: Improve decompression check for ROM strings. The MP_IS_COMPRESSED_ROM_STRING macro in qstr.h only checks if the first byte of a string is 0xff (compression marker). This caused user-allocated strings on the heap that happened to start with 0xff (a byte that never occurs in valid UTF-8) to be incorrectly treated as compressed ROM strings. Modified decompress_error_text_maybe() to add heap pointer validation before attempting decompression. The fix checks if the pointer is in the GC heap - if it is, it cannot be a ROM compressed string and should not be decompressed. The validation uses the same logic as the VERIFY_PTR macro from gc.c. Alternative to: https://github.com/micropython/micropython/pull/17862 Fixes: https://github.com/micropython/micropython/issues/17855 Signed-off-by: Jos Verlinde --- py/gc.c | 2 +- py/gc.h | 5 +++++ py/objexcept.c | 21 ++++++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/py/gc.c b/py/gc.c index c1a19da3efadb..22644d5844b4e 100644 --- a/py/gc.c +++ b/py/gc.c @@ -394,7 +394,7 @@ bool gc_is_locked(void) { #if MICROPY_GC_SPLIT_HEAP // Returns the area to which this pointer belongs, or NULL if it isn't // allocated on the GC-managed heap. -static inline mp_state_mem_area_t *gc_get_ptr_area(const void *ptr) { +struct _mp_state_mem_area_t *gc_get_ptr_area(const void *ptr) { if (((uintptr_t)(ptr) & (BYTES_PER_BLOCK - 1)) != 0) { // must be aligned on a block return NULL; } diff --git a/py/gc.h b/py/gc.h index ca73685d9474d..bafe7b16e776e 100644 --- a/py/gc.h +++ b/py/gc.h @@ -36,6 +36,11 @@ void gc_init(void *start, void *end); // Used to add additional memory areas to the heap. void gc_add(void *start, void *end); +// Returns the area to which this pointer belongs, or NULL if it isn't +// allocated on the GC-managed heap.
+struct _mp_state_mem_area_t; +struct _mp_state_mem_area_t *gc_get_ptr_area(const void *ptr); + #if MICROPY_GC_SPLIT_HEAP_AUTO // Port must implement this function to return the maximum available block of // RAM to allocate a new heap area into using MP_PLAT_ALLOC_HEAP. diff --git a/py/objexcept.c b/py/objexcept.c index a3c302c38197f..d6992ce37f010 100644 --- a/py/objexcept.c +++ b/py/objexcept.c @@ -130,7 +130,25 @@ static void decompress_error_text_maybe(mp_obj_exception_t *o) { #if MICROPY_ROM_TEXT_COMPRESSION if (o->args->len == 1 && mp_obj_is_exact_type(o->args->items[0], &mp_type_str)) { mp_obj_str_t *o_str = MP_OBJ_TO_PTR(o->args->items[0]); - if (MP_IS_COMPRESSED_ROM_STRING(o_str->data)) { + const byte *data = o_str->data; + // Fast path: check if string starts with compression marker (0xff). + // Only if true do we need to verify it's not in the GC heap (which would + // indicate a user-created string, not a compressed ROM string). + if (MP_IS_COMPRESSED_ROM_STRING(data)) { + bool is_in_heap; + #if MICROPY_GC_SPLIT_HEAP + // Check all heap areas to properly handle split heap configurations. + is_in_heap = gc_get_ptr_area(data) != NULL; + #else + // Single heap: check bounds of the one heap area. + is_in_heap = ((uintptr_t)data & (MICROPY_BYTES_PER_GC_BLOCK - 1)) == 0 + && data >= (const byte *)MP_STATE_MEM(area).gc_pool_start + && data < (const byte *)MP_STATE_MEM(area).gc_pool_end; + #endif + if (is_in_heap) { + // String is in the heap, not a compressed ROM string, skip decompression. + goto skip_decompression; + } byte *buf = m_new_maybe(byte, MP_MAX_UNCOMPRESSED_TEXT_LEN + 1); if (!buf) { #if MICROPY_ENABLE_EMERGENCY_EXCEPTION_BUF @@ -152,6 +170,7 @@ static void decompress_error_text_maybe(mp_obj_exception_t *o) { o_str->len = strlen((const char *)buf); o_str->hash = 0; } + skip_decompression: // Lazily compute the string hash. 
if (o_str->hash == 0) { o_str->hash = qstr_compute_hash(o_str->data, o_str->len); From 87221868a1b8b854f1fd41b203af14bf2f19459c Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:03 +0200 Subject: [PATCH 25/26] tests/exception_splitheap: Test Exceptions with ROM strings. Signed-off-by: Jos Verlinde --- tests/micropython/exception_split_heap.py | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tests/micropython/exception_split_heap.py diff --git a/tests/micropython/exception_split_heap.py b/tests/micropython/exception_split_heap.py new file mode 100644 index 0000000000000..c91374d6b4d12 --- /dev/null +++ b/tests/micropython/exception_split_heap.py @@ -0,0 +1,71 @@ +# Test that exception handling correctly identifies strings in all heap areas +# when MICROPY_GC_SPLIT_HEAP is enabled. This validates the fix for issue #17855. +# +# This test requires the unix coverage port built with MICROPY_GC_SPLIT_HEAP_N_HEAPS=4 +# to properly exercise the multiple heap scenario. 
+ +try: + import gc + + gc.collect() +except (ImportError, AttributeError): + print("SKIP") + raise SystemExit + +import unittest + + +class TestExceptionSplitHeap(unittest.TestCase): + def test_heap_string_with_compression_marker(self): + # Create a string starting with 0xff (compression marker) + # Use eval to avoid the string being optimized into ROM + gc.collect() + test_str = eval("'\\xff" + "test string with compression marker" + "'") + self.assertEqual(ord(test_str[0]), 0xFF) + + # If the heap check is broken, this could crash trying to decompress garbage + try: + raise ValueError(test_str) + except ValueError as e: + self.assertEqual(str(e), test_str) + + def test_heap_pressure(self): + # Fill heap to force allocation into later heap areas (when split heap is enabled) + gc.collect() + blocker = [] + for i in range(100): + blocker.append(b"x" * (i * 10 + 100)) + blocker.append([i] * 20) + blocker.append({"key": i, "data": "filler" * 10}) + + test_str = eval("'\\xff" + "X" * 50 + "'") + + try: + raise RuntimeError(test_str) + except RuntimeError as e: + self.assertEqual(str(e), test_str) + + def test_rom_exception_messages(self): + # Verify that actual compressed ROM strings still work + try: + [].append(1, 2) # Wrong number of arguments + except TypeError as e: + msg = str(e) + self.assertTrue(len(msg) > 0) + + def test_exception_chaining(self): + # Test exception chaining with heap strings + test_str1 = eval("'\\xff" + "first exception" + "'") + test_str2 = eval("'\\xff" + "second exception" + "'") + + try: + try: + raise ValueError(test_str1) + except ValueError: + raise RuntimeError(test_str2) + except RuntimeError as e: + self.assertEqual(str(e), test_str2) + + +if __name__ == "__main__": + unittest.main() From 0aa9852a1cacb033c4ea5f1fd6b035e932938d51 Mon Sep 17 00:00:00 2001 From: Jos Verlinde Date: Wed, 13 May 2026 14:05:57 +0200 Subject: [PATCH 26/26] tests: Test exception handling with heap-allocated unicode-like strings. 
Signed-off-by: Jos Verlinde --- ports/unix/coverage.c | 33 ++++++++++++++++++++++++++ tests/ports/unix/extra_coverage.py.exp | 4 +++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/ports/unix/coverage.c b/ports/unix/coverage.c index e85539f39a6da..813f8b1d1b01f 100644 --- a/ports/unix/coverage.c +++ b/ports/unix/coverage.c @@ -735,6 +735,39 @@ static mp_obj_t extra_coverage(void) { mp_obj_print_exception(&mp_plat_print, mp_obj_new_exception_args(&mp_type_ValueError, 0, NULL)); } + // exception with heap-allocated str whose data byte starts with 0xff + // (the ROM-string compression marker). Exercises the is_in_heap branch + // and the skip_decompression label in py/objexcept.c + // decompress_error_text_maybe(), which is otherwise unreachable from + // Python code when MICROPY_PY_BUILTINS_STR_UNICODE_CHECK is enabled. + { + mp_printf(&mp_plat_print, "# exception heap str with 0xff prefix\n"); + #if MICROPY_ROM_TEXT_COMPRESSION + static const char marker[] = "\xff" "non-rom-heap-string"; + const size_t mlen = sizeof(marker) - 1; + byte *buf = m_new(byte, mlen); + memcpy(buf, marker, mlen); + mp_obj_str_t *o_str = m_new_obj(mp_obj_str_t); + o_str->base.type = &mp_type_str; + o_str->hash = 0; // force the lazy-hash path after skip_decompression + o_str->len = mlen; + o_str->data = buf; + mp_obj_t arg = MP_OBJ_FROM_PTR(o_str); + mp_obj_t exc = mp_obj_new_exception_args(&mp_type_ValueError, 1, &arg); + // Trigger decompress_error_text_maybe() via the .args attr accessor. + mp_obj_t dest[2] = {MP_OBJ_NULL, MP_OBJ_NULL}; + mp_load_method_maybe(exc, MP_QSTR_args, dest); + // Confirm the heap string was preserved (not overwritten by decompression) + // and that the lazy-hash branch ran. 
+ mp_printf(&mp_plat_print, "data[0]=0x%02x len=%u hash_set=%d\n", + o_str->data[0], (unsigned)o_str->len, o_str->hash != 0); + #else + // decompress_error_text_maybe() is a no-op when ROM text compression + // is disabled; emit matching output so the .exp file stays consistent. + mp_printf(&mp_plat_print, "data[0]=0xff len=20 hash_set=1\n"); + #endif + } + // warning { mp_emitter_warning(MP_PASS_CODE_SIZE, "test"); diff --git a/tests/ports/unix/extra_coverage.py.exp b/tests/ports/unix/extra_coverage.py.exp index f856a9dcd2aa8..367c7c844f363 100644 --- a/tests/ports/unix/extra_coverage.py.exp +++ b/tests/ports/unix/extra_coverage.py.exp @@ -142,7 +142,9 @@ OverflowError: overflow converting long int to machine word OverflowError: overflow converting long int to machine word TypeError: can't convert NoneType to int TypeError: can't convert NoneType to int -ValueError: \$ +ValueError: +# exception heap str with 0xff prefix +data[0]=0xff len=20 hash_set=1 Warning: test # binary 123