Skip to content

Commit c8cc931

Browse files
committed
Update PyUnicode_DecodeUTF8 from RFC 2279 to RFC 3629.
1) #8271: when a byte sequence is invalid, only the start byte and all the valid continuation bytes are now replaced by U+FFFD, instead of replacing the number of bytes specified by the start byte. See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (pages 94-95); 2) 5- and 6-byte-long UTF-8 sequences are now considered invalid (no changes in behavior); 3) Add code and tests to reject surrogates (U+D800-U+DFFF) as defined in RFC 3629, but leave it commented out since it's not backward compatible; 4) Change the error messages "unexpected code byte" to "invalid start byte" and "invalid data" to "invalid continuation byte"; 5) Add an extensive set of tests in test_unicode; 6) Fix test_codeccallbacks because it was failing after this change.
1 parent cab5c5c commit c8cc931

3 files changed

Lines changed: 226 additions & 73 deletions

File tree

Lib/test/test_codeccallbacks.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -153,28 +153,30 @@ def test_backslashescape(self):
153153
sout += "\\U%08x" % sys.maxunicode
154154
self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
155155

156-
def test_decoderelaxedutf8(self):
157-
# This is the test for a decoding callback handler,
158-
# that relaxes the UTF-8 minimal encoding restriction.
159-
# A null byte that is encoded as "\xc0\x80" will be
160-
# decoded as a null byte. All other illegal sequences
161-
# will be handled strictly.
156+
def test_decoding_callbacks(self):
157+
# This is a test for a decoding callback handler
158+
# that allows the decoding of the invalid sequence
159+
# "\xc0\x80" and returns "\x00" instead of raising an error.
160+
# All other illegal sequences will be handled strictly.
162161
def relaxedutf8(exc):
163162
if not isinstance(exc, UnicodeDecodeError):
164163
raise TypeError("don't know how to handle %r" % exc)
165-
if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
164+
if exc.object[exc.start:exc.start+2] == "\xc0\x80":
166165
return (u"\x00", exc.start+2) # retry after two bytes
167166
else:
168167
raise exc
169168

170-
codecs.register_error(
171-
"test.relaxedutf8", relaxedutf8)
169+
codecs.register_error("test.relaxedutf8", relaxedutf8)
172170

171+
# all the "\xc0\x80" will be decoded to "\x00"
173172
sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
174173
sout = u"a\x00b\x00c\xfc\x00\x00"
175174
self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
175+
176+
# "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
176177
sin = "\xc0\x80\xc0\x81"
177-
self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8")
178+
self.assertRaises(UnicodeDecodeError, sin.decode,
179+
"utf-8", "test.relaxedutf8")
178180

179181
def test_charmapencode(self):
180182
# For charmap encodings the replacement string will be

Lib/test/test_unicode.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,164 @@ def test_codecs_utf8(self):
600600
# * strict decoding testing for all of the
601601
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
602602

603+
def test_utf8_decode_valid_sequences(self):
604+
sequences = [
605+
# single byte
606+
('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
607+
# 2 bytes
608+
('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
609+
# 3 bytes
610+
('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
611+
('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
612+
# 4 bytes
613+
('\xF0\x90\x80\x80', u'\U00010000'),
614+
('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
615+
]
616+
for seq, res in sequences:
617+
self.assertEqual(seq.decode('utf-8'), res)
618+
619+
for ch in map(unichr, range(0, sys.maxunicode)):
620+
self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
621+
622+
def test_utf8_decode_invalid_sequences(self):
623+
# continuation bytes in a sequence of 2, 3, or 4 bytes
624+
continuation_bytes = map(chr, range(0x80, 0xC0))
625+
# start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
626+
invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
627+
# start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
628+
invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
629+
invalid_start_bytes = (
630+
continuation_bytes + invalid_2B_seq_start_bytes +
631+
invalid_4B_seq_start_bytes + map(chr, range(0xF7, 0x100))
632+
)
633+
634+
for byte in invalid_start_bytes:
635+
self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
636+
637+
for sb in invalid_2B_seq_start_bytes:
638+
for cb in continuation_bytes:
639+
self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
640+
641+
for sb in invalid_4B_seq_start_bytes:
642+
for cb1 in continuation_bytes[:3]:
643+
for cb3 in continuation_bytes[:3]:
644+
self.assertRaises(UnicodeDecodeError,
645+
(sb+cb1+'\x80'+cb3).decode, 'utf-8')
646+
647+
for cb in map(chr, range(0x80, 0xA0)):
648+
self.assertRaises(UnicodeDecodeError,
649+
('\xE0'+cb+'\x80').decode, 'utf-8')
650+
self.assertRaises(UnicodeDecodeError,
651+
('\xE0'+cb+'\xBF').decode, 'utf-8')
652+
# XXX: surrogates shouldn't be valid UTF-8!
653+
# see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
654+
# (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
655+
#for cb in map(chr, range(0xA0, 0xC0)):
656+
#sys.__stdout__.write('\\xED\\x%02x\\x80\n' % ord(cb))
657+
#self.assertRaises(UnicodeDecodeError,
658+
#('\xED'+cb+'\x80').decode, 'utf-8')
659+
#self.assertRaises(UnicodeDecodeError,
660+
#('\xED'+cb+'\xBF').decode, 'utf-8')
661+
for cb in map(chr, range(0x80, 0x90)):
662+
self.assertRaises(UnicodeDecodeError,
663+
('\xF0'+cb+'\x80\x80').decode, 'utf-8')
664+
self.assertRaises(UnicodeDecodeError,
665+
('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
666+
for cb in map(chr, range(0x90, 0xC0)):
667+
self.assertRaises(UnicodeDecodeError,
668+
('\xF4'+cb+'\x80\x80').decode, 'utf-8')
669+
self.assertRaises(UnicodeDecodeError,
670+
('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
671+
672+
def test_issue8271(self):
673+
# Issue #8271: when a byte sequence is invalid, only the start byte
674+
# and all the valid continuation bytes should be replaced by U+FFFD,
675+
# not the number of bytes specified by the start byte.
676+
# See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
677+
# table 3-8, Row 2) for more information about the algorithm used.
678+
FFFD = u'\ufffd'
679+
sequences = [
680+
# invalid start bytes
681+
('\x80', FFFD), # continuation byte
682+
('\x80\x80', FFFD*2), # 2 continuation bytes
683+
('\xc0', FFFD),
684+
('\xc0\xc0', FFFD*2),
685+
('\xc1', FFFD),
686+
('\xc1\xc0', FFFD*2),
687+
('\xc0\xc1', FFFD*2),
688+
# with start byte of a 2-byte sequence
689+
('\xc2', FFFD), # only the start byte
690+
('\xc2\xc2', FFFD*2), # 2 start bytes
691+
('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
692+
('\xc2\x41', FFFD+'A'), # invalid continuation byte
693+
# with start byte of a 3-byte sequence
694+
('\xe1', FFFD), # only the start byte
695+
('\xe1\xe1', FFFD*2), # 2 start bytes
696+
('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
697+
('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
698+
('\xe1\x80', FFFD), # only 1 continuation byte
699+
('\xe1\x41', FFFD+'A'), # invalid continuation byte
700+
('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
701+
('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
702+
('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
703+
('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
704+
('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
705+
# with start byte of a 4-byte sequence
706+
('\xf1', FFFD), # only the start byte
707+
('\xf1\xf1', FFFD*2), # 2 start bytes
708+
('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
709+
('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
710+
('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
711+
('\xf1\x80', FFFD), # only 1 continuation byte
712+
('\xf1\x80\x80', FFFD), # only 2 continuation bytes
713+
('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
714+
('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
715+
('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
716+
('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
717+
('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
718+
('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 1 invalid cb, 1 valid and 1 invalid
719+
('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
720+
('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
721+
('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
722+
('\xf1\xf1\x80\x41', FFFD*2+'A'),
723+
('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
724+
# with invalid start byte of a 4-byte sequence (rfc2279)
725+
('\xf5', FFFD), # only the start byte
726+
('\xf5\xf5', FFFD*2), # 2 start bytes
727+
('\xf5\x80', FFFD*2), # only 1 continuation byte
728+
('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
729+
('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
730+
('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
731+
('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
732+
('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
733+
# with invalid start byte of a 5-byte sequence (rfc2279)
734+
('\xf8', FFFD), # only the start byte
735+
('\xf8\xf8', FFFD*2), # 2 start bytes
736+
('\xf8\x80', FFFD*2), # only one continuation byte
737+
('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
738+
('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
739+
# with invalid start byte of a 6-byte sequence (rfc2279)
740+
('\xfc', FFFD), # only the start byte
741+
('\xfc\xfc', FFFD*2), # 2 start bytes
742+
('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
743+
('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 5 continuation bytes (6 bytes total)
744+
# invalid start byte
745+
('\xfe', FFFD),
746+
('\xfe\x80\x80', FFFD*3),
747+
# other sequences
748+
('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
749+
('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
750+
('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
751+
('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
752+
u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
753+
]
754+
for n, (seq, res) in enumerate(sequences):
755+
self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
756+
self.assertEqual(seq.decode('utf-8', 'replace'), res)
757+
self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
758+
self.assertEqual(seq.decode('utf-8', 'ignore'),
759+
res.replace(u'\uFFFD', ''))
760+
603761
def test_codecs_idna(self):
604762
# Test whether trailing dot is preserved
605763
self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")

Objects/unicodeobject.c

Lines changed: 56 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,24 +1863,24 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
18631863

18641864
static
18651865
char utf8_code_length[256] = {
1866-
/* Map UTF-8 encoded prefix byte to sequence length. zero means
1867-
illegal prefix. see RFC 2279 for details */
1866+
/* Map UTF-8 encoded prefix byte to sequence length. Zero means
1867+
illegal prefix. See RFC 3629 for details */
1868+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
18681869
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1870+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18691871
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18701872
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18711873
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18721874
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1873-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1874-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1875-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1876-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1875+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
1876+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
18771877
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18781878
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1879-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1880-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1881-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1882-
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1883-
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1879+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
1880+
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
1881+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
1882+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
1883+
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
18841884
};
18851885

18861886
PyObject *PyUnicode_DecodeUTF8(const char *s,
@@ -1897,6 +1897,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
18971897
{
18981898
const char *starts = s;
18991899
int n;
1900+
int k;
19001901
Py_ssize_t startinpos;
19011902
Py_ssize_t endinpos;
19021903
Py_ssize_t outpos;
@@ -1939,15 +1940,17 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
19391940
else {
19401941
errmsg = "unexpected end of data";
19411942
startinpos = s-starts;
1942-
endinpos = size;
1943+
endinpos = startinpos+1;
1944+
for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1945+
endinpos++;
19431946
goto utf8Error;
19441947
}
19451948
}
19461949

19471950
switch (n) {
19481951

19491952
case 0:
1950-
errmsg = "unexpected code byte";
1953+
errmsg = "invalid start byte";
19511954
startinpos = s-starts;
19521955
endinpos = startinpos+1;
19531956
goto utf8Error;
@@ -1960,70 +1963,67 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
19601963

19611964
case 2:
19621965
if ((s[1] & 0xc0) != 0x80) {
1963-
errmsg = "invalid data";
1966+
errmsg = "invalid continuation byte";
19641967
startinpos = s-starts;
1965-
endinpos = startinpos+2;
1968+
endinpos = startinpos + 1;
19661969
goto utf8Error;
19671970
}
19681971
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1969-
if (ch < 0x80) {
1970-
startinpos = s-starts;
1971-
endinpos = startinpos+2;
1972-
errmsg = "illegal encoding";
1973-
goto utf8Error;
1974-
}
1975-
else
1976-
*p++ = (Py_UNICODE)ch;
1972+
assert ((ch > 0x007F) && (ch <= 0x07FF));
1973+
*p++ = (Py_UNICODE)ch;
19771974
break;
19781975

19791976
case 3:
1977+
/* XXX: surrogates shouldn't be valid UTF-8!
1978+
see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979+
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980+
Uncomment the 2 lines below to make them invalid,
1981+
codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
19801982
if ((s[1] & 0xc0) != 0x80 ||
1981-
(s[2] & 0xc0) != 0x80) {
1982-
errmsg = "invalid data";
1983+
(s[2] & 0xc0) != 0x80 ||
1984+
((unsigned char)s[0] == 0xE0 &&
1985+
(unsigned char)s[1] < 0xA0)/* ||
1986+
((unsigned char)s[0] == 0xED &&
1987+
(unsigned char)s[1] > 0x9F)*/) {
1988+
errmsg = "invalid continuation byte";
19831989
startinpos = s-starts;
1984-
endinpos = startinpos+3;
1990+
endinpos = startinpos + 1;
1991+
1992+
/* if s[1] first two bits are 1 and 0, then the invalid
1993+
continuation byte is s[2], so increment endinpos by 1,
1994+
if not, s[1] is invalid and endinpos doesn't need to
1995+
be incremented. */
1996+
if ((s[1] & 0xC0) == 0x80)
1997+
endinpos++;
19851998
goto utf8Error;
19861999
}
19872000
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1988-
if (ch < 0x0800) {
1989-
/* Note: UTF-8 encodings of surrogates are considered
1990-
legal UTF-8 sequences;
1991-
1992-
XXX For wide builds (UCS-4) we should probably try
1993-
to recombine the surrogates into a single code
1994-
unit.
1995-
*/
1996-
errmsg = "illegal encoding";
1997-
startinpos = s-starts;
1998-
endinpos = startinpos+3;
1999-
goto utf8Error;
2000-
}
2001-
else
2002-
*p++ = (Py_UNICODE)ch;
2001+
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2002+
*p++ = (Py_UNICODE)ch;
20032003
break;
20042004

20052005
case 4:
20062006
if ((s[1] & 0xc0) != 0x80 ||
20072007
(s[2] & 0xc0) != 0x80 ||
2008-
(s[3] & 0xc0) != 0x80) {
2009-
errmsg = "invalid data";
2008+
(s[3] & 0xc0) != 0x80 ||
2009+
((unsigned char)s[0] == 0xF0 &&
2010+
(unsigned char)s[1] < 0x90) ||
2011+
((unsigned char)s[0] == 0xF4 &&
2012+
(unsigned char)s[1] > 0x8F)) {
2013+
errmsg = "invalid continuation byte";
20102014
startinpos = s-starts;
2011-
endinpos = startinpos+4;
2015+
endinpos = startinpos + 1;
2016+
if ((s[1] & 0xC0) == 0x80) {
2017+
endinpos++;
2018+
if ((s[2] & 0xC0) == 0x80)
2019+
endinpos++;
2020+
}
20122021
goto utf8Error;
20132022
}
20142023
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2015-
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2016-
/* validate and convert to UTF-16 */
2017-
if ((ch < 0x10000) /* minimum value allowed for 4
2018-
byte encoding */
2019-
|| (ch > 0x10ffff)) /* maximum value allowed for
2020-
UTF-16 */
2021-
{
2022-
errmsg = "illegal encoding";
2023-
startinpos = s-starts;
2024-
endinpos = startinpos+4;
2025-
goto utf8Error;
2026-
}
2024+
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2025+
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2026+
20272027
#ifdef Py_UNICODE_WIDE
20282028
*p++ = (Py_UNICODE)ch;
20292029
#else
@@ -2039,13 +2039,6 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
20392039
*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
20402040
#endif
20412041
break;
2042-
2043-
default:
2044-
/* Other sizes are only needed for UCS-4 */
2045-
errmsg = "unsupported Unicode code range";
2046-
startinpos = s-starts;
2047-
endinpos = startinpos+n;
2048-
goto utf8Error;
20492042
}
20502043
s += n;
20512044
continue;

0 commit comments

Comments
 (0)