@@ -357,6 +357,145 @@ def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0):
357357 return res , consumed
358358
359359
def STORECHAR32(ch, byteorder):
    """Split the low 32 bits of *ch* into four byte values.

    Returns a list of four ints in the range 0..255.  When *byteorder* is
    ``'little'`` the least-significant byte comes first; any other value
    is treated as big-endian (most-significant byte first).
    """
    # Build the bytes least-significant first, then flip for big-endian.
    octets = [(ch >> shift) & 0xff for shift in (0, 8, 16, 24)]
    if byteorder != 'little':
        octets.reverse()
    return octets
370+
371+
def PyUnicode_EncodeUTF32(s, size, errors, byteorder='little'):
    """Encode the string *s* as a list of UTF-32 byte values.

    Parameters:
        s: the string to encode; every character is emitted as one
            4-byte unit via ``STORECHAR32``.
        size: length of *s*; only used to detect empty input.
        errors: accepted for signature compatibility; this function does
            not consult it.
        byteorder: ``'little'`` or ``'big'`` to force that byte order with
            no BOM, or ``'native'`` to emit a BOM (U+FEFF) followed by the
            text, both in ``sys.byteorder``.  Any other value encodes in
            ``sys.byteorder`` without a BOM.

    Returns:
        A list of ints (0..255).  For empty input this is just the BOM
        when *byteorder* is ``'native'`` and an empty list otherwise.
    """
    p = []
    bom = sys.byteorder

    if byteorder == 'native':
        # Native encoding is self-describing: lead with the BOM.
        p += STORECHAR32(0xFEFF, bom)
    elif byteorder in ('little', 'big'):
        bom = byteorder

    if size == 0:
        # Return what has been written so far so that the BOM of an empty
        # 'native' encoding is not thrown away.
        return p

    for c in s:
        # UTF-32 needs no surrogate pairs: one code point, one 4-byte unit.
        p += STORECHAR32(ord(c), bom)

    return p
396+
397+
def utf_32_encode(obj, errors='strict'):
    """Encode *obj* as UTF-32 using the ``'native'`` byte-order mode.

    Delegates to ``PyUnicode_EncodeUTF32`` and converts its list of byte
    values to ``bytes``.  Returns ``(encoded_bytes, len(obj))``.
    """
    length = len(obj)
    encoded = bytes(PyUnicode_EncodeUTF32(obj, length, errors, 'native'))
    return encoded, length
403+
404+
def utf_32_le_encode(obj, errors='strict'):
    """Encode *obj* as little-endian UTF-32.

    Delegates to ``PyUnicode_EncodeUTF32`` with byte order ``'little'`` and
    converts its list of byte values to ``bytes``.
    Returns ``(encoded_bytes, len(obj))``.
    """
    length = len(obj)
    encoded = bytes(PyUnicode_EncodeUTF32(obj, length, errors, 'little'))
    return encoded, length
410+
411+
def utf_32_be_encode(obj, errors='strict'):
    """Encode *obj* as big-endian UTF-32.

    Delegates to ``PyUnicode_EncodeUTF32`` with byte order ``'big'`` and
    converts its list of byte values to ``bytes``.
    Returns ``(encoded_bytes, len(obj))``.
    """
    length = len(obj)
    encoded = bytes(PyUnicode_EncodeUTF32(obj, length, errors, 'big'))
    return encoded, length
417+
418+
def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final=0):
    """Decode UTF-32 byte values into a list of one-character strings.

    Parameters:
        data: bytes-like object (or sequence of ints 0..255) to decode.
        size: number of leading bytes of *data* to consider.
        errors: ``'strict'`` raises on malformed input, ``'replace'``
            substitutes U+FFFD, ``'surrogatepass'`` lets surrogate code
            points through; any other value (e.g. ``'ignore'``) silently
            drops the malformed unit.
        byteorder: ``'little'`` for little-endian; anything else is
            treated as big-endian.
        final: when false, a trailing incomplete 4-byte unit is left
            unconsumed for a later call; when true it is malformed input.

    Returns:
        ``(chars, consumed, 0)`` where *chars* is the list of decoded
        characters and *consumed* the number of bytes processed.

    Raises:
        UnicodeDecodeError: in ``'strict'`` mode for a code point above
            0x10FFFF, a surrogate code point, or truncated final data.
    """
    if size == 0:
        return [], 0, 0

    order = 'little' if byteorder == 'little' else 'big'
    # Only whole 4-byte units can be decoded; the remainder is dealt with
    # after the loop.
    usable = size - (size % 4)
    result = []

    for pos in range(0, usable, 4):
        ch = int.from_bytes(data[pos:pos + 4], order)

        if ch > 0x10FFFF:
            reason = 'codepoint not in range(0x110000)'
        elif 0xD800 <= ch <= 0xDFFF and errors != 'surrogatepass':
            # Surrogates are not valid scalar values in UTF-32.
            reason = 'code point in surrogate code point range(0xd800, 0xe000)'
        else:
            result.append(chr(ch))
            continue

        if errors == 'strict':
            raise UnicodeDecodeError('utf-32', bytes(data), pos, pos + 4,
                                     reason)
        if errors == 'replace':
            result.append('\ufffd')
        # Any other handler: drop the malformed unit.

    consumed = usable
    if usable != size and final:
        # No more data is coming, so the dangling bytes are an error.
        if errors == 'strict':
            raise UnicodeDecodeError('utf-32', bytes(data), usable, size,
                                     'truncated data')
        if errors == 'replace':
            result.append('\ufffd')
            consumed = size
        elif errors == 'ignore':
            consumed = size

    return result, consumed, 0
463+
464+
def utf_32_decode(data, errors='strict', final=0):
    """Decode UTF-32 *data*, honouring a leading byte-order mark.

    If *data* starts with the little-endian BOM (``FF FE 00 00``) or the
    big-endian BOM (``00 00 FE FF``), the BOM is stripped and the rest is
    decoded in that byte order.  Without a BOM the data is decoded in the
    platform's native byte order (``sys.byteorder``).

    Parameters:
        data: the encoded bytes.
        errors: error-handling mode forwarded to
            ``PyUnicode_DecodeUTF32Stateful``.
        final: forwarded to ``PyUnicode_DecodeUTF32Stateful``.

    Returns:
        ``(text, consumed)`` where *consumed* counts bytes of *data*,
        including the BOM when one was present.
    """
    # No BOM: fall back to the native byte order.
    byteorder = 'little' if sys.byteorder == 'little' else 'big'
    bom_len = 0

    if len(data) >= 4:
        head = data[0:4]
        if head == b'\xff\xfe\x00\x00':
            byteorder = 'little'
            bom_len = 4
        elif head == b'\x00\x00\xfe\xff':
            byteorder = 'big'
            bom_len = 4

    payload = data[bom_len:] if bom_len else data
    res, consumed, _ = PyUnicode_DecodeUTF32Stateful(
        payload, len(payload), errors, byteorder, final)
    return ''.join(res), consumed + bom_len
485+
486+
def utf_32_le_decode(data, errors='strict', final=0):
    """Decode *data* as little-endian UTF-32; no BOM detection is done here.

    Delegates to ``PyUnicode_DecodeUTF32Stateful`` with byte order
    ``'little'`` and joins the decoded characters.
    Returns ``(text, consumed)``.
    """
    decoded = PyUnicode_DecodeUTF32Stateful(
        data, len(data), errors, 'little', final)
    chars, consumed = decoded[0], decoded[1]
    return ''.join(chars), consumed
492+
493+
def utf_32_be_decode(data, errors='strict', final=0):
    """Decode *data* as big-endian UTF-32; no BOM detection is done here.

    Delegates to ``PyUnicode_DecodeUTF32Stateful`` with byte order
    ``'big'`` and joins the decoded characters.
    Returns ``(text, consumed)``.
    """
    decoded = PyUnicode_DecodeUTF32Stateful(
        data, len(data), errors, 'big', final)
    chars, consumed = decoded[0], decoded[1]
    return ''.join(chars), consumed
360499
361500
362501# ----------------------------------------------------------------------
@@ -677,8 +816,8 @@ def PyUnicode_AsASCIIString(unistr):
677816
678817 if not type (unistr ) == str :
679818 raise TypeError
680- return PyUnicode_EncodeASCII (str ( unistr ) ,
681- len (str ),
819+ return PyUnicode_EncodeASCII (unistr ,
820+ len (unistr ),
682821 None )
683822
684823def PyUnicode_DecodeUTF16Stateful (s , size , errors , byteorder = 'native' , final = True ):
@@ -815,7 +954,7 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
815954 p += STORECHAR (0xFEFF , bom )
816955
817956 if (size == 0 ):
818- return ""
957+ return []
819958
820959 if (byteorder == 'little' ):
821960 bom = 'little'
@@ -1084,7 +1223,7 @@ def PyUnicode_EncodeRawUnicodeEscape(s, size):
10841223def charmapencode_output (c , mapping ):
10851224
10861225 rep = mapping [c ]
1087- if isinstance (rep , int ) or isinstance ( rep , int ) :
1226+ if isinstance (rep , int ):
10881227 if rep < 256 :
10891228 return [rep ]
10901229 else :
0 commit comments