@@ -85,16 +85,6 @@ def escape_encode( obj, errors='strict'):
8585 v = s [1 :- 1 ]
8686 return v , len (v )
8787
def utf_8_decode(data, errors='strict', final=False):
    """Decode UTF-8 encoded *data* into a string.

    data   -- the encoded input; its full length is handed to the decoder.
    errors -- name of the error handling scheme, passed through unchanged.
    final  -- passed through to PyUnicode_DecodeUTF8Stateful; when false,
              an incomplete trailing sequence is left unconsumed.

    Returns a tuple (decoded string, number of input items consumed).
    """
    # The stateful decoder reports how far it got; no local bookkeeping
    # of the consumed count is needed before the call.
    chars, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
    # The decoder hands back a sequence of string pieces; glue them together.
    return ''.join(chars), consumed
97-
9888def raw_unicode_escape_decode ( data , errors = 'strict' ):
9989 """None
10090 """
@@ -324,13 +314,6 @@ def raw_unicode_escape_encode( obj, errors='strict'):
324314 res = bytes (res )
325315 return res , len (res )
326316
def utf_8_encode(obj, errors='strict'):
    """Encode the string *obj* as UTF-8.

    obj    -- the text to encode.
    errors -- name of the error handling scheme, passed through unchanged
              to PyUnicode_EncodeUTF8.

    Returns a tuple (encoded bytes, length of *obj* consumed).
    """
    encoded = bytes(PyUnicode_EncodeUTF8(obj, len(obj), errors))
    # The second item is the amount of *input* consumed, not the size of
    # the output: for non-ASCII text the two differ.
    return encoded, len(obj)
333-
334317def utf_16_le_encode ( obj , errors = 'strict' ):
335318 """None
336319 """
@@ -882,240 +865,6 @@ def unicode_call_errorhandler(errors, encoding,
882865 else :
883866 raise TypeError ("encoding error handler must return (unicode, int) tuple, not %s" % repr (res ))
884867
def PyUnicode_DecodeUTF8(s, size, errors):
    """Decode the first *size* items of *s* as UTF-8.

    Delegates to PyUnicode_DecodeUTF8Stateful with ``final`` fixed to
    False and returns that function's result unchanged.
    """
    # NOTE(review): with final=False the stateful decoder stops silently
    # at an incomplete trailing sequence instead of reporting it --
    # confirm this is what one-shot callers expect.
    result = PyUnicode_DecodeUTF8Stateful(s, size, errors, final=False)
    return result
887-
## Map UTF-8 encoded prefix byte to sequence length. Zero means
## illegal prefix. See RFC 2279 for details.
utf8_code_length = (
    [1] * 0x80      # 0x00-0x7F: one-byte sequences
    + [0] * 0x40    # 0x80-0xBF: not valid as a prefix byte
    + [2] * 0x20    # 0xC0-0xDF
    + [3] * 0x10    # 0xE0-0xEF
    + [4] * 0x08    # 0xF0-0xF7
    + [5] * 0x04    # 0xF8-0xFB
    + [6] * 0x02    # 0xFC-0xFD
    + [0] * 0x02    # 0xFE-0xFF: illegal
)


def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
    """Decode the first *size* items of *s* as UTF-8.

    s      -- indexable input whose items are integers (each item is
              compared against 0x80 and used as an index into
              utf8_code_length), e.g. a bytes object.
    size   -- number of items of *s* to examine.
    errors -- error handling scheme, forwarded to
              unicode_call_errorhandler for every malformed sequence.
    final  -- when false, decoding stops in front of a sequence that is
              cut off by the end of the input; when true, such a
              sequence is reported through the error handler.

    Returns a tuple (pieces, pos): *pieces* is a list of decoded string
    pieces ('' when size is 0) and *pos* is the index at which decoding
    stopped.
    """
    if size == 0:
        return '', 0

    p = []

    def _recover(errmsg, startinpos, endinpos):
        # Report s[startinpos:endinpos] to the error handler, append its
        # replacement text and return the position to resume from.
        res = unicode_call_errorhandler(
            errors, "utf8", errmsg,
            s, startinpos, endinpos)
        p.extend(res[0])
        return res[1]

    pos = 0
    while pos < size:
        ch = s[pos]
        if ch < 0x80:
            # Plain ASCII byte.
            p += chr(ch)
            pos += 1
            continue

        n = utf8_code_length[ch]
        startinpos = pos
        if startinpos + n > size:
            if not final:
                # More data may follow: leave the partial sequence alone.
                break
            pos = _recover("unexpected end of data", startinpos, size)
            # Restart from the position chosen by the handler. Without
            # this the code below would keep using the stale length n
            # and read past the end of the input.
            continue

        if n == 0:
            pos = _recover("unexpected code byte", startinpos, startinpos + 1)
        elif n == 1:
            # Bytes below 0x80 were handled above, so this is unexpected.
            pos = _recover("internal error", startinpos, startinpos + 1)
        elif n == 2:
            if (s[pos + 1] & 0xc0) != 0x80:
                pos = _recover("invalid data", startinpos, startinpos + 2)
            else:
                c = ((s[pos] & 0x1f) << 6) + (s[pos + 1] & 0x3f)
                if c < 0x80:
                    # Overlong form of a one-byte value.
                    pos = _recover("illegal encoding", startinpos, startinpos + 2)
                else:
                    p += chr(c)
                    pos += n
        elif n == 3:
            if ((s[pos + 1] & 0xc0) != 0x80 or
                    (s[pos + 2] & 0xc0) != 0x80):
                pos = _recover("invalid data", startinpos, startinpos + 3)
            else:
                c = (((s[pos] & 0x0f) << 12) +
                     ((s[pos + 1] & 0x3f) << 6) +
                     (s[pos + 2] & 0x3f))
                ## Note: UTF-8 encodings of surrogates are considered
                ## legal UTF-8 sequences;
                ##
                ## XXX For wide builds (UCS-4) we should probably try
                ##     to recombine the surrogates into a single code
                ##     unit.
                if c < 0x0800:
                    # Overlong form of a value that fits in two bytes.
                    pos = _recover("illegal encoding", startinpos, startinpos + 3)
                else:
                    p += chr(c)
                    pos += n
        elif n == 4:
            if ((s[pos + 1] & 0xc0) != 0x80 or
                    (s[pos + 2] & 0xc0) != 0x80 or
                    (s[pos + 3] & 0xc0) != 0x80):
                pos = _recover("invalid data", startinpos, startinpos + 4)
            else:
                c = (((s[pos + 0] & 0x7) << 18) +
                     ((s[pos + 1] & 0x3f) << 12) +
                     ((s[pos + 2] & 0x3f) << 6) +
                     (s[pos + 3] & 0x3f))
                # 0x10000 is the minimum value allowed for a 4 byte
                # encoding, 0x10ffff the maximum value allowed for UTF-16.
                if c < 0x10000 or c > 0x10ffff:
                    pos = _recover("illegal encoding", startinpos, startinpos + 4)
                elif c <= sys.maxunicode:
                    # Representable as a single code point; the bound is
                    # inclusive so that sys.maxunicode itself qualifies.
                    p += chr(c)
                    pos += n
                else:
                    ## compute and append the two surrogates:
                    ## translate from 10000..10FFFF to 0..FFFF
                    c -= 0x10000
                    # high surrogate = top 10 bits added to D800
                    p += chr(0xD800 + (c >> 10))
                    # low surrogate = bottom 10 bits added to DC00
                    p += chr(0xDC00 + (c & 0x03FF))
                    pos += n
        else:
            ## Other sizes are only needed for UCS-4
            pos = _recover("unsupported Unicode code range",
                           startinpos, startinpos + n)

    return p, pos
1071-
def PyUnicode_EncodeUTF8(s, size, errors):
    """Encode the first *size* characters of *s* as UTF-8.

    s      -- the text to encode; each item must be acceptable to ord().
    size   -- number of characters of *s* to encode (must be >= 0).
    errors -- accepted for interface symmetry; this function never
              consults it.

    A high surrogate directly followed by a low surrogate is combined
    into one code point and emitted as four bytes. Any other surrogate
    is emitted as a plain three-byte sequence.

    Returns the encoded data as a bytearray.
    """
    assert (size >= 0)
    out = bytearray()
    idx = 0
    while idx < size:
        code = ord(s[idx])
        idx += 1

        if code < 0x80:
            # ASCII: one byte.
            out.append(code)
            continue

        if code < 0x0800:
            # Two-byte sequence.
            out.append(0xc0 | (code >> 6))
            out.append(0x80 | (code & 0x3f))
            continue

        if code >= 0x10000:
            # Outside the BMP: four bytes.
            out += encodeUCS4(code)
            continue

        # High surrogate with at least one more character available:
        # try to pair it with a following low surrogate.
        if 0xD800 <= code <= 0xDBFF and idx != size:
            trail = ord(s[idx])
            if 0xDC00 <= trail <= 0xDFFF:
                combined = ((code - 0xD800) << 10 | (trail - 0xDC00)) + 0x10000
                idx += 1
                out += encodeUCS4(combined)
                continue

        # Remaining BMP code points, isolated surrogates included:
        # three bytes.
        out.append(0xe0 | (code >> 12))
        out.append(0x80 | ((code >> 6) & 0x3f))
        out.append(0x80 | (code & 0x3f))
    return out
1109-
def encodeUCS4(ch):
    """Return the four-byte UTF-8 sequence for the integer code point *ch*.

    The result is a bytearray holding one lead byte (0xf0 plus the bits
    above bit 17) followed by three continuation bytes of six bits each.
    """
    lead = 0xf0 | (ch >> 18)
    trail_high = 0x80 | ((ch >> 12) & 0x3f)
    trail_mid = 0x80 | ((ch >> 6) & 0x3f)
    trail_low = 0x80 | (ch & 0x3f)
    return bytearray((lead, trail_high, trail_mid, trail_low))
1118-
1119868#/* --- Latin-1 Codec ------------------------------------------------------ */
1120869
1121870def PyUnicode_DecodeLatin1 (s , size , errors ):
0 commit comments