Skip to content

Commit 100b870

Browse files
committed
Implement UTF-32 encode/decode and fix UTF-16 empty encode
- Add UTF-32, UTF-32-LE, UTF-32-BE encode/decode in _pycodecs.py - Register utf_32 codec functions in codecs.rs via delegate_pycodecs - Fix PyUnicode_EncodeUTF16 returning "" instead of [] for empty input - Remove resolved expectedFailure decorators in test_codecs.py - Add failure reasons to remaining expectedFailure comments
1 parent db347b3 commit 100b870

File tree

10 files changed

+334
-205
lines changed

10 files changed

+334
-205
lines changed

Lib/_pycodecs.py

Lines changed: 143 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,145 @@ def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0):
357357
return res, consumed
358358

359359

360+
def STORECHAR32(ch, byteorder):
    """Split a 32-bit value into its four bytes, ordered per ``byteorder``.

    ``ch`` is masked one byte at a time, so only its low 32 bits contribute.
    ``byteorder == 'little'`` yields least-significant byte first; any other
    value yields big-endian (most-significant byte first).

    Returns a list of four ints in range(256).
    """
    # Least-significant byte first; reversed below for big-endian output.
    octets = [(ch >> shift) & 0xff for shift in (0, 8, 16, 24)]
    if byteorder != 'little':
        octets.reverse()
    return octets
370+
371+
372+
def PyUnicode_EncodeUTF32(s, size, errors, byteorder='little'):
    """Encode a Unicode string to UTF-32, returned as a list of byte values.

    Parameters:
        s: the str to encode.
        size: length of ``s``; only consulted to detect empty input.
        errors: error handler name applied to lone surrogates
            (U+D800..U+DFFF).  ``None``/``'strict'`` raises, ``'ignore'``
            drops the character, ``'replace'`` emits ``'?'``.  Any other
            handler name (e.g. ``'surrogatepass'``) encodes the code point
            as-is, which is what this function did for every handler before.
        byteorder: ``'native'`` writes a BOM followed by data in
            ``sys.byteorder``; ``'little'`` / ``'big'`` write that order with
            no BOM; any other value writes ``sys.byteorder`` with no BOM.

    Returns:
        A list of ints in range(256).  For ``'native'`` the list always
        starts with the BOM, including when the input is empty.

    Raises:
        UnicodeEncodeError: a lone surrogate was found under strict handling.
    """
    p = []
    bom = sys.byteorder
    encoding = 'utf-32'

    if byteorder == 'native':
        # The BOM is part of the output even for an empty string.
        p += (0xFEFF).to_bytes(4, bom)
    elif byteorder == 'little':
        bom = 'little'
        encoding = 'utf-32-le'
    elif byteorder == 'big':
        bom = 'big'
        encoding = 'utf-32-be'

    if size == 0:
        # Return what was built so far (the BOM, if any) rather than
        # discarding it.
        return p

    strict = errors is None or errors == 'strict'
    for pos, c in enumerate(s):
        ch = ord(c)
        if 0xD800 <= ch <= 0xDFFF:
            # Surrogates are not valid scalar values for UTF-32.
            if strict:
                raise UnicodeEncodeError(encoding, s, pos, pos + 1,
                                         'surrogates not allowed')
            if errors == 'ignore':
                continue
            if errors == 'replace':
                ch = 0x3F  # '?'
        # UTF-32 stores every code point directly in four bytes.
        p += ch.to_bytes(4, bom)

    return p
396+
397+
398+
def utf_32_encode(obj, errors='strict'):
    """Encode ``obj`` as UTF-32 using the 'native' byte-order mode.

    Delegates to PyUnicode_EncodeUTF32 with byteorder 'native' and converts
    the resulting list of byte values to ``bytes``.

    Returns a ``(encoded_bytes, len(obj))`` tuple.
    """
    length = len(obj)
    octets = PyUnicode_EncodeUTF32(obj, length, errors, 'native')
    return bytes(octets), length
403+
404+
405+
def utf_32_le_encode(obj, errors='strict'):
    """Encode ``obj`` as little-endian UTF-32.

    Delegates to PyUnicode_EncodeUTF32 with byteorder 'little' and converts
    the resulting list of byte values to ``bytes``.

    Returns a ``(encoded_bytes, len(obj))`` tuple.
    """
    length = len(obj)
    octets = PyUnicode_EncodeUTF32(obj, length, errors, 'little')
    return bytes(octets), length
410+
411+
412+
def utf_32_be_encode(obj, errors='strict'):
    """Encode ``obj`` as big-endian UTF-32.

    Delegates to PyUnicode_EncodeUTF32 with byteorder 'big' and converts
    the resulting list of byte values to ``bytes``.

    Returns a ``(encoded_bytes, len(obj))`` tuple.
    """
    length = len(obj)
    octets = PyUnicode_EncodeUTF32(obj, length, errors, 'big')
    return bytes(octets), length
417+
418+
419+
def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final=0):
    """Decode UTF-32 encoded bytes into a list of one-character strings.

    Parameters:
        data: bytes-like object holding the encoded input.
        size: number of bytes of ``data`` to consider.
        errors: error handler name.  ``None``/``'strict'`` raises,
            ``'replace'`` substitutes U+FFFD, ``'ignore'`` drops the bad
            unit.  Other handler names skip out-of-range units, pass
            surrogate code points through unchanged, and leave a trailing
            partial unit unconsumed.
        byteorder: ``'little'`` decodes little-endian; any other value
            decodes big-endian.
        final: when false, a trailing partial 4-byte unit is left
            unconsumed so the caller can retry with more data.

    Returns:
        ``(chars, consumed, 0)``: the decoded characters, the number of
        input bytes consumed, and a constant 0 as the third element.

    Raises:
        UnicodeDecodeError: under strict handling, for truncated final
            input, a unit above U+10FFFF, or a surrogate code point.
    """
    if size == 0:
        return [], 0, 0

    strict = errors is None or errors == 'strict'
    total = size
    tail = size % 4
    truncated = False

    if tail:
        if not final:
            # Incomplete data: decode only the whole units available now.
            size -= tail
            if size == 0:
                return [], 0, 0
        elif strict:
            # Final data must be a whole number of 4-byte units.
            raise UnicodeDecodeError('utf-32', bytes(data), size - tail, size,
                                     'truncated data')
        elif errors in ('ignore', 'replace'):
            # The partial unit is handled after the loop so that it is
            # reported as consumed (and replaced when requested).
            truncated = True
            size -= tail

    order = 'little' if byteorder == 'little' else 'big'
    result = []
    pos = 0

    while pos + 3 < size:
        ch = int.from_bytes(data[pos:pos + 4], order)

        # Validate the code point.
        if ch > 0x10FFFF:
            reason = 'code point not in range(0x110000)'
        elif 0xD800 <= ch <= 0xDFFF and (strict or errors in ('ignore', 'replace')):
            reason = 'code point in surrogate code point range(0xd800, 0xe000)'
        else:
            reason = None

        if reason is None:
            result.append(chr(ch))
        elif strict:
            raise UnicodeDecodeError('utf-32', bytes(data), pos, pos + 4, reason)
        elif errors == 'replace':
            result.append('\ufffd')
        # Any other handler: drop the invalid unit.

        pos += 4

    if truncated:
        if errors == 'replace':
            result.append('\ufffd')
        # The trailing partial unit has been dealt with; count it as consumed.
        pos = total

    return result, pos, 0
463+
464+
465+
def utf_32_decode(data, errors='strict', final=0):
    """Decode UTF-32 bytes, using a leading BOM to pick the byte order.

    If ``data`` starts with the little-endian BOM (``FF FE 00 00``) or the
    big-endian BOM (``00 00 FE FF``), the BOM is stripped and the rest is
    decoded in that order.  Without a BOM the platform's native order
    (``sys.byteorder``) is used.

    Parameters:
        data: bytes-like object holding the encoded input.
        errors: error handler name, forwarded to the stateful decoder.
        final: forwarded to the stateful decoder.

    Returns:
        ``(text, consumed)``, where ``consumed`` counts bytes of the
        original ``data`` including a stripped BOM.

    Raises:
        UnicodeDecodeError: propagated from the stateful decoder, with
            positions expressed relative to the original ``data``.
    """
    # No BOM: fall back to the platform's native byte order.
    byteorder = sys.byteorder
    bom_len = 0

    if len(data) >= 4:
        head = data[0:4]
        if head == b'\xff\xfe\x00\x00':
            byteorder, bom_len = 'little', 4
        elif head == b'\x00\x00\xfe\xff':
            byteorder, bom_len = 'big', 4

    payload = data[bom_len:] if bom_len else data
    try:
        res, consumed, _ = PyUnicode_DecodeUTF32Stateful(
            payload, len(payload), errors, byteorder, final)
    except UnicodeDecodeError as exc:
        if not bom_len:
            raise
        # The helper saw the BOM-less slice; shift the reported range so it
        # indexes the buffer the caller actually passed in.
        raise UnicodeDecodeError(exc.encoding, bytes(data),
                                 exc.start + bom_len, exc.end + bom_len,
                                 exc.reason) from None

    return ''.join(res), consumed + bom_len
485+
486+
487+
def utf_32_le_decode(data, errors='strict', final=0):
    """Decode little-endian UTF-32 bytes; no BOM detection is performed.

    Delegates to PyUnicode_DecodeUTF32Stateful with byteorder 'little' and
    joins the decoded characters into a single str.

    Returns a ``(text, consumed_byte_count)`` tuple.
    """
    chars, consumed, _unused = PyUnicode_DecodeUTF32Stateful(
        data, len(data), errors, 'little', final)
    return ''.join(chars), consumed
492+
493+
494+
def utf_32_be_decode(data, errors='strict', final=0):
    """Decode big-endian UTF-32 bytes; no BOM detection is performed.

    Delegates to PyUnicode_DecodeUTF32Stateful with byteorder 'big' and
    joins the decoded characters into a single str.

    Returns a ``(text, consumed_byte_count)`` tuple.
    """
    chars, consumed, _unused = PyUnicode_DecodeUTF32Stateful(
        data, len(data), errors, 'big', final)
    return ''.join(chars), consumed
360499

361500

362501
# ----------------------------------------------------------------------
@@ -677,8 +816,8 @@ def PyUnicode_AsASCIIString(unistr):
677816

678817
if not type(unistr) == str:
679818
raise TypeError
680-
return PyUnicode_EncodeASCII(str(unistr),
681-
len(str),
819+
return PyUnicode_EncodeASCII(unistr,
820+
len(unistr),
682821
None)
683822

684823
def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True):
@@ -815,7 +954,7 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
815954
p += STORECHAR(0xFEFF, bom)
816955

817956
if (size == 0):
818-
return ""
957+
return []
819958

820959
if (byteorder == 'little' ):
821960
bom = 'little'
@@ -1084,7 +1223,7 @@ def PyUnicode_EncodeRawUnicodeEscape(s, size):
10841223
def charmapencode_output(c, mapping):
10851224

10861225
rep = mapping[c]
1087-
if isinstance(rep, int) or isinstance(rep, int):
1226+
if isinstance(rep, int):
10881227
if rep < 256:
10891228
return [rep]
10901229
else:

Lib/test/test_array.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,6 @@ def test_numbers(self):
176176
self.assertEqual(a, b,
177177
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
178178

179-
# TODO: RUSTPYTHON - requires UTF-32 encoding support in codecs and proper array reconstructor implementation
180-
@unittest.expectedFailure
181179
def test_unicode(self):
182180
teststr = "Bonne Journ\xe9e \U0002030a\U00020347"
183181
testcases = (

Lib/test/test_bigmem.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -638,8 +638,6 @@ def test_encode_utf7(self, size):
638638
except MemoryError:
639639
pass # acceptable on 32-bit
640640

641-
# TODO: RUSTPYTHON
642-
@unittest.expectedFailure
643641
@bigmemtest(size=_4G // 4 + 5, memuse=ascii_char_size + ucs4_char_size + 4)
644642
def test_encode_utf32(self, size):
645643
try:

Lib/test/test_codeccallbacks.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,6 @@ def handler2(exc):
281281
b"g[<252><223>]"
282282
)
283283

284-
# TODO: RUSTPYTHON
285-
@unittest.expectedFailure
286284
def test_longstrings(self):
287285
# test long strings to check for memory overflow problems
288286
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
@@ -684,8 +682,6 @@ def test_badandgoodsurrogateescapeexceptions(self):
684682
("\udc80", 2)
685683
)
686684

687-
# TODO: RUSTPYTHON
688-
@unittest.expectedFailure
689685
def test_badandgoodsurrogatepassexceptions(self):
690686
surrogatepass_errors = codecs.lookup_error('surrogatepass')
691687
# "surrogatepass" complains about a non-exception passed in

0 commit comments

Comments
 (0)