@@ -1349,7 +1349,7 @@ def test_issue8271(self):
13491349 # with start byte of a 2-byte sequence
13501350 (b'\xc2 ' , FFFD ), # only the start byte
13511351 (b'\xc2 \xc2 ' , FFFD * 2 ), # 2 start bytes
1352- (b'\xc2 \xc2 \xc2 ' , FFFD * 3 ), # 2 start bytes
1352+ (b'\xc2 \xc2 \xc2 ' , FFFD * 3 ), # 3 start bytes
13531353 (b'\xc2 \x41 ' , FFFD + 'A' ), # invalid continuation byte
13541354 # with start byte of a 3-byte sequence
13551355 (b'\xe1 ' , FFFD ), # only the start byte
@@ -1419,6 +1419,226 @@ def test_issue8271(self):
14191419 self .assertEqual (seq .decode ('utf-8' , 'ignore' ),
14201420 res .replace ('\uFFFD ' , '' ))
14211421
def to_bytestring(self, seq):
    """
    Build a bytes object from a string of whitespace-separated hex values.

    Each token of seq is parsed as a base-16 integer and becomes one byte
    of the result, e.g. 'C2 41' gives the two bytes 0xC2 and 0x41.
    """
    hex_tokens = seq.split()
    byte_values = [int(token, 16) for token in hex_tokens]
    return bytes(byte_values)
1424+
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check how an invalid UTF-8 byte sequence is decoded by each handler.

    seq is the invalid bytes object, res is the string expected when
    errors='replace' is used, and err is a substring expected in the
    message of the UnicodeDecodeError raised by strict decoding.

    The checks are: decoding with the default (strict) handler raises
    UnicodeDecodeError whose message contains err; decoding with 'replace'
    returns res; decoding with 'ignore' returns res with every U+FFFD
    removed.  The 'replace' and 'ignore' checks are repeated with seq
    placed between b'aaaa' and b'bbbb', to verify that the surrounding
    valid data is left intact.
    """
    with self.assertRaises(UnicodeDecodeError) as cm:
        seq.decode('utf-8')
    exc = cm.exception

    self.assertIn(err, str(exc))

    padded = b'aaaa' + seq + b'bbbb'
    self.assertEqual(seq.decode('utf-8', 'replace'), res)
    self.assertEqual(padded.decode('utf-8', 'replace'),
                     'aaaa' + res + 'bbbb')

    # The expected 'ignore' output is the 'replace' output without the
    # U+FFFD markers; the literal must be exactly one character long or
    # the markers would not be stripped.
    ignored = res.replace('\ufffd', '')
    self.assertEqual(seq.decode('utf-8', 'ignore'), ignored)
    self.assertEqual(padded.decode('utf-8', 'ignore'),
                     'aaaa' + ignored + 'bbbb')
1443+
def test_invalid_start_byte(self):
    """
    Test that an 'invalid start byte' error is raised when the first byte
    is neither in the ASCII range nor a valid start byte of a 2-, 3-, or
    4-bytes sequence. The invalid start byte is replaced with a single
    U+FFFD when errors='replace'.
    E.g. <80> is a continuation byte and can appear only after a start byte.
    """
    FFFD = '\ufffd'
    # Each byte is decoded on its own: 80..BF are continuation bytes,
    # C0/C1 and F5..FF never appear in well-formed UTF-8.
    for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
        self.assertCorrectUTF8Decoding(bytes([byte]), FFFD,
                                       'invalid start byte')
1456+
def test_unexpected_end_of_data(self):
    """
    Test that an 'unexpected end of data' error is raised when the string
    ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
    enough continuation bytes.  The incomplete sequence is replaced with a
    single U+FFFD when errors='replace'.
    E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
    sequence, but it's followed by only 2 valid continuation bytes and the
    last continuation byte is missing.
    Note: the continuation bytes must be all valid, if one of them is
    invalid another error will be raised.
    """
    sequences = [
        # truncated 2-bytes sequences
        'C2', 'DF',
        # truncated 3-bytes sequences
        'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
        'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
        # truncated 4-bytes sequences
        'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
        'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
        'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
        'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
    ]
    FFFD = '\ufffd'
    for seq in sequences:
        # the whole truncated sequence collapses into one replacement char
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), FFFD,
                                       'unexpected end of data')
1482+
def test_invalid_cb_for_2bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte of a 2-bytes sequence is invalid.  The start byte
    is replaced by a single U+FFFD and the second byte is handled
    separately when errors='replace'.
    E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
    sequence, but 41 is not a valid continuation byte because it's the
    ASCII letter 'A'.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Each pair is (hex bytes, expected result with errors='replace').
    # An ASCII second byte is decoded as itself; a non-ASCII one is
    # replaced with a second U+FFFD.
    sequences = [
        ('C2 00', FFFD + '\x00'), ('C2 7F', FFFD + '\x7f'),
        ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
        ('DF 00', FFFD + '\x00'), ('DF 7F', FFFD + '\x7f'),
        ('DF C0', FFFDx2), ('DF FF', FFFDx2),
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1504+
def test_invalid_cb_for_3bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 3-bytes sequence are invalid.  When
    errors='replace', if the first continuation byte is valid, the first
    two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
    third byte is handled separately, otherwise only the start byte is
    replaced with a U+FFFD and the other continuation bytes are handled
    separately.
    E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is E0 or ED, the valid ranges for the first
    continuation byte are limited to A0..BF and 80..9F respectively.
    Python 2 used to consider all the bytes in range 80..BF valid when the
    start byte was ED.  This is fixed in Python 3.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Each pair is (hex bytes, expected result with errors='replace').
    sequences = [
        # start byte E0: first cb restricted to A0..BF
        ('E0 00', FFFD + '\x00'), ('E0 7F', FFFD + '\x7f'),
        ('E0 80', FFFDx2), ('E0 9F', FFFDx2),
        ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
        ('E0 A0 00', FFFD + '\x00'), ('E0 A0 7F', FFFD + '\x7f'),
        ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
        ('E0 BF 00', FFFD + '\x00'), ('E0 BF 7F', FFFD + '\x7f'),
        ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2),
        # start byte E1
        ('E1 00', FFFD + '\x00'), ('E1 7F', FFFD + '\x7f'),
        ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
        ('E1 80 00', FFFD + '\x00'), ('E1 80 7F', FFFD + '\x7f'),
        ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
        ('E1 BF 00', FFFD + '\x00'), ('E1 BF 7F', FFFD + '\x7f'),
        ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2),
        # start byte EC
        ('EC 00', FFFD + '\x00'), ('EC 7F', FFFD + '\x7f'),
        ('EC C0', FFFDx2), ('EC FF', FFFDx2),
        ('EC 80 00', FFFD + '\x00'), ('EC 80 7F', FFFD + '\x7f'),
        ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
        ('EC BF 00', FFFD + '\x00'), ('EC BF 7F', FFFD + '\x7f'),
        ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2),
        # start byte ED: first cb restricted to 80..9F
        ('ED 00', FFFD + '\x00'), ('ED 7F', FFFD + '\x7f'),
        ('ED A0', FFFDx2), ('ED BF', FFFDx2),  # see note in the docstring
        ('ED C0', FFFDx2), ('ED FF', FFFDx2),
        ('ED 80 00', FFFD + '\x00'), ('ED 80 7F', FFFD + '\x7f'),
        ('ED 80 C0', FFFDx2), ('ED 80 FF', FFFDx2),
        ('ED 9F 00', FFFD + '\x00'), ('ED 9F 7F', FFFD + '\x7f'),
        ('ED 9F C0', FFFDx2), ('ED 9F FF', FFFDx2),
        # start byte EE
        ('EE 00', FFFD + '\x00'), ('EE 7F', FFFD + '\x7f'),
        ('EE C0', FFFDx2), ('EE FF', FFFDx2),
        ('EE 80 00', FFFD + '\x00'), ('EE 80 7F', FFFD + '\x7f'),
        ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
        ('EE BF 00', FFFD + '\x00'), ('EE BF 7F', FFFD + '\x7f'),
        ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2),
        # start byte EF
        ('EF 00', FFFD + '\x00'), ('EF 7F', FFFD + '\x7f'),
        ('EF C0', FFFDx2), ('EF FF', FFFDx2),
        ('EF 80 00', FFFD + '\x00'), ('EF 80 7F', FFFD + '\x7f'),
        ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
        ('EF BF 00', FFFD + '\x00'), ('EF BF 7F', FFFD + '\x7f'),
        ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1562+
def test_invalid_cb_for_4bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 4-bytes sequence are invalid.  When
    errors='replace', the start byte and all the following valid
    continuation bytes are replaced with a single U+FFFD, and all the bytes
    starting from the first invalid continuation byte (included) are
    handled separately.
    E.g. in the sequence <F1 80 80 41>, F1 is the start byte of a 4-bytes
    sequence, 80 80 are valid continuation bytes, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is F0 or F4, the valid ranges for the first
    continuation byte are limited to 90..BF and 80..8F respectively.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Each pair is (hex bytes, expected result with errors='replace').
    sequences = [
        # start byte F0: first cb restricted to 90..BF
        ('F0 00', FFFD + '\x00'), ('F0 7F', FFFD + '\x7f'),
        ('F0 80', FFFDx2), ('F0 8F', FFFDx2),
        ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
        ('F0 90 00', FFFD + '\x00'), ('F0 90 7F', FFFD + '\x7f'),
        ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
        ('F0 BF 00', FFFD + '\x00'), ('F0 BF 7F', FFFD + '\x7f'),
        ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
        ('F0 90 80 00', FFFD + '\x00'), ('F0 90 80 7F', FFFD + '\x7f'),
        ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
        ('F0 90 BF 00', FFFD + '\x00'), ('F0 90 BF 7F', FFFD + '\x7f'),
        ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
        ('F0 BF 80 00', FFFD + '\x00'), ('F0 BF 80 7F', FFFD + '\x7f'),
        ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
        ('F0 BF BF 00', FFFD + '\x00'), ('F0 BF BF 7F', FFFD + '\x7f'),
        ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
        # start byte F1
        ('F1 00', FFFD + '\x00'), ('F1 7F', FFFD + '\x7f'),
        ('F1 C0', FFFDx2), ('F1 FF', FFFDx2),
        ('F1 80 00', FFFD + '\x00'), ('F1 80 7F', FFFD + '\x7f'),
        ('F1 80 C0', FFFDx2), ('F1 80 FF', FFFDx2),
        ('F1 BF 00', FFFD + '\x00'), ('F1 BF 7F', FFFD + '\x7f'),
        ('F1 BF C0', FFFDx2), ('F1 BF FF', FFFDx2),
        ('F1 80 80 00', FFFD + '\x00'), ('F1 80 80 7F', FFFD + '\x7f'),
        ('F1 80 80 C0', FFFDx2), ('F1 80 80 FF', FFFDx2),
        ('F1 80 BF 00', FFFD + '\x00'), ('F1 80 BF 7F', FFFD + '\x7f'),
        ('F1 80 BF C0', FFFDx2), ('F1 80 BF FF', FFFDx2),
        ('F1 BF 80 00', FFFD + '\x00'), ('F1 BF 80 7F', FFFD + '\x7f'),
        ('F1 BF 80 C0', FFFDx2), ('F1 BF 80 FF', FFFDx2),
        ('F1 BF BF 00', FFFD + '\x00'), ('F1 BF BF 7F', FFFD + '\x7f'),
        ('F1 BF BF C0', FFFDx2), ('F1 BF BF FF', FFFDx2),
        # start byte F3
        ('F3 00', FFFD + '\x00'), ('F3 7F', FFFD + '\x7f'),
        ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
        ('F3 80 00', FFFD + '\x00'), ('F3 80 7F', FFFD + '\x7f'),
        ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
        ('F3 BF 00', FFFD + '\x00'), ('F3 BF 7F', FFFD + '\x7f'),
        ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
        ('F3 80 80 00', FFFD + '\x00'), ('F3 80 80 7F', FFFD + '\x7f'),
        ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
        ('F3 80 BF 00', FFFD + '\x00'), ('F3 80 BF 7F', FFFD + '\x7f'),
        ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
        ('F3 BF 80 00', FFFD + '\x00'), ('F3 BF 80 7F', FFFD + '\x7f'),
        ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
        ('F3 BF BF 00', FFFD + '\x00'), ('F3 BF BF 7F', FFFD + '\x7f'),
        ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
        # start byte F4: first cb restricted to 80..8F
        ('F4 00', FFFD + '\x00'), ('F4 7F', FFFD + '\x7f'),
        ('F4 90', FFFDx2), ('F4 BF', FFFDx2),
        ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
        ('F4 80 00', FFFD + '\x00'), ('F4 80 7F', FFFD + '\x7f'),
        ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
        ('F4 8F 00', FFFD + '\x00'), ('F4 8F 7F', FFFD + '\x7f'),
        ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
        ('F4 80 80 00', FFFD + '\x00'), ('F4 80 80 7F', FFFD + '\x7f'),
        ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
        ('F4 80 BF 00', FFFD + '\x00'), ('F4 80 BF 7F', FFFD + '\x7f'),
        ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
        ('F4 8F 80 00', FFFD + '\x00'), ('F4 8F 80 7F', FFFD + '\x7f'),
        ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
        ('F4 8F BF 00', FFFD + '\x00'), ('F4 8F BF 7F', FFFD + '\x7f'),
        ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1641+
14221642 def test_codecs_idna (self ):
14231643 # Test whether trailing dot is preserved
14241644 self .assertEqual ("www.python.org." .encode ("idna" ), b"www.python.org." )
0 commit comments