Skip to content

Commit 7cb19d4

Browse files
author
walter.doerwald
committed
Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
git-svn-id: http://svn.python.org/projects/python/trunk@57146 6015fed2-1504-0410-9fe1-9d1591cc4771
1 parent d082850 commit 7cb19d4

File tree

12 files changed

+999
-2
lines changed

12 files changed

+999
-2
lines changed

Doc/c-api/concrete.rst

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs:
13011301
object. Error handling is "strict". Return *NULL* if an exception was raised
13021302
by the codec.
13031303

1304+
These are the UTF-32 codec APIs:
1305+
1306+
.. % --- UTF-32 Codecs ------------------------------------------------------ */
1307+
1308+
1309+
.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
1310+
1311+
Decode *size* bytes from a UTF-32 encoded buffer string and return the
1312+
corresponding Unicode object. *errors* (if non-*NULL*) defines the error
1313+
handling. It defaults to "strict".
1314+
1315+
If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
1316+
order::
1317+
1318+
*byteorder == -1: little endian
1319+
*byteorder == 0: native order
1320+
*byteorder == 1: big endian
1321+
1322+
and then switches if the first four bytes of the input data are a byte order mark
1323+
(BOM) and the specified byte order is native order. This BOM is not copied into
1324+
the resulting Unicode string. After completion, *\*byteorder* is set to the
1325+
current byte order at the end of input data.
1326+
1327+
In a narrow build, code points outside the BMP will be decoded as surrogate pairs.
1328+
1329+
If *byteorder* is *NULL*, the codec starts in native order mode.
1330+
1331+
Return *NULL* if an exception was raised by the codec.
1332+
1333+
.. versionadded:: 2.6
1334+
1335+
1336+
.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
1337+
1338+
If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
1339+
*consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
1340+
trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
1341+
by four) as an error. Those bytes will not be decoded and the number of bytes
1342+
that have been decoded will be stored in *consumed*.
1343+
1344+
.. versionadded:: 2.6
1345+
1346+
1347+
.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
1348+
1349+
Return a Python bytes object holding the UTF-32 encoded value of the Unicode
1350+
data in *s*. Output is written according to the
1351+
following byte order::
1352+
1353+
byteorder == -1: little endian
1354+
byteorder == 0: native byte order (writes a BOM mark)
1355+
byteorder == 1: big endian
1356+
1357+
If byteorder is ``0``, the output string will always start with the Unicode BOM
1358+
mark (U+FEFF). In the other two modes, no BOM mark is prepended.
1359+
1360+
If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
1361+
as a single codepoint.
1362+
1363+
Return *NULL* if an exception was raised by the codec.
1364+
1365+
.. versionadded:: 2.6
1366+
1367+
1368+
.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
1369+
1370+
Return a Python string using the UTF-32 encoding in native byte order. The
1371+
string always starts with a BOM mark. Error handling is "strict". Return
1372+
*NULL* if an exception was raised by the codec.
1373+
1374+
.. versionadded:: 2.6
1375+
1376+
13041377
These are the UTF-16 codec APIs:
13051378

13061379
.. % --- UTF-16 Codecs ------------------------------------------------------ */

Doc/library/codecs.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,12 @@ particular, the following variants typically exist:
10451045
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
10461046
| | s_jisx0213 | |
10471047
+-----------------+--------------------------------+--------------------------------+
1048+
| utf_32 | U32, utf32 | all languages |
1049+
+-----------------+--------------------------------+--------------------------------+
1050+
| utf_32_be | UTF-32BE | all languages |
1051+
+-----------------+--------------------------------+--------------------------------+
1052+
| utf_32_le | UTF-32LE | all languages |
1053+
+-----------------+--------------------------------+--------------------------------+
10481054
| utf_16 | U16, utf16 | all languages |
10491055
+-----------------+--------------------------------+--------------------------------+
10501056
| utf_16_be | UTF-16BE | all languages (BMP only) |

Include/unicodeobject.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
145145
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
146146
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
147147
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
148+
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
148149
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
149150
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
150151
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@@ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
159160
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
160161
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
161162
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
163+
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
164+
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
162165
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
163166
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
164167
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@@ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
170173
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
171174
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
172175
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
176+
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
173177
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
174178
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
175179
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@@ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
223227
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
224228
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
225229
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
230+
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
226231
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
227232
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
228233
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@@ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
237242
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
238243
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
239244
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
245+
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
246+
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
240247
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
241248
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
242249
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@@ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
248255
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
249256
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
250257
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
258+
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
251259
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
252260
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
253261
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@@ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
701709
const char *errors /* error handling */
702710
);
703711

712+
/* --- UTF-32 Codecs ------------------------------------------------------ */
713+
714+
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
715+
the corresponding Unicode object.
716+
717+
errors (if non-NULL) defines the error handling. It defaults
718+
to "strict".
719+
720+
If byteorder is non-NULL, the decoder starts decoding using the
721+
given byte order:
722+
723+
*byteorder == -1: little endian
724+
*byteorder == 0: native order
725+
*byteorder == 1: big endian
726+
727+
In native mode, the first four bytes of the stream are checked for a
728+
BOM mark. If found, the BOM mark is analysed, the byte order
729+
adjusted and the BOM skipped. In the other modes, no BOM mark
730+
interpretation is done. After completion, *byteorder is set to the
731+
current byte order at the end of input data.
732+
733+
If byteorder is NULL, the codec starts in native order mode.
734+
735+
*/
736+
737+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
738+
const char *string, /* UTF-32 encoded string */
739+
Py_ssize_t length, /* size of string */
740+
const char *errors, /* error handling */
741+
int *byteorder /* pointer to byteorder to use
742+
0=native;-1=LE,1=BE; updated on
743+
exit */
744+
);
745+
746+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
747+
const char *string, /* UTF-32 encoded string */
748+
Py_ssize_t length, /* size of string */
749+
const char *errors, /* error handling */
750+
int *byteorder, /* pointer to byteorder to use
751+
0=native;-1=LE,1=BE; updated on
752+
exit */
753+
Py_ssize_t *consumed /* bytes consumed */
754+
);
755+
756+
/* Returns a Python string using the UTF-32 encoding in native byte
757+
order. The string always starts with a BOM mark. */
758+
759+
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
760+
PyObject *unicode /* Unicode object */
761+
);
762+
763+
/* Returns a Python string object holding the UTF-32 encoded value of
764+
the Unicode data.
765+
766+
Output is written according to the following
767+
byte order:
768+
769+
byteorder == -1: little endian
770+
byteorder == 0: native byte order (writes a BOM mark)
771+
byteorder == 1: big endian
772+
773+
If byteorder is 0, the output string will always start with the
774+
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
775+
prepended.
776+
777+
*/
778+
779+
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
780+
const Py_UNICODE *data, /* Unicode char buffer */
781+
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
782+
const char *errors, /* error handling */
783+
int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
784+
);
785+
704786
/* --- UTF-16 Codecs ------------------------------------------------------ */
705787

706788
/* Decodes length bytes from a UTF-16 encoded buffer string and returns

Lib/encodings/aliases.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,16 @@
490490
'unicodelittleunmarked' : 'utf_16_le',
491491
'utf_16le' : 'utf_16_le',
492492

493+
# utf_32 codec
494+
'u32' : 'utf_32',
495+
'utf32' : 'utf_32',
496+
497+
# utf_32_be codec
498+
'utf_32be' : 'utf_32_be',
499+
500+
# utf_32_le codec
501+
'utf_32le' : 'utf_32_le',
502+
493503
# utf_7 codec
494504
'u7' : 'utf_7',
495505
'utf7' : 'utf_7',

Lib/encodings/utf_32.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
"""
2+
Python 'utf-32' Codec
3+
"""
4+
import codecs, sys
5+
6+
### Codec APIs
7+
8+
encode = codecs.utf_32_encode
9+
10+
def decode(input, errors='strict'):
11+
return codecs.utf_32_decode(input, errors, True)
12+
13+
class IncrementalEncoder(codecs.IncrementalEncoder):
14+
def __init__(self, errors='strict'):
15+
codecs.IncrementalEncoder.__init__(self, errors)
16+
self.encoder = None
17+
18+
def encode(self, input, final=False):
19+
if self.encoder is None:
20+
result = codecs.utf_32_encode(input, self.errors)[0]
21+
if sys.byteorder == 'little':
22+
self.encoder = codecs.utf_32_le_encode
23+
else:
24+
self.encoder = codecs.utf_32_be_encode
25+
return result
26+
return self.encoder(input, self.errors)[0]
27+
28+
def reset(self):
29+
codecs.IncrementalEncoder.reset(self)
30+
self.encoder = None
31+
32+
def getstate(self):
33+
# state info we return to the caller:
34+
# 0: stream is in natural order for this platform
35+
# 2: endianness hasn't been determined yet
36+
# (we're never writing in unnatural order)
37+
return (2 if self.encoder is None else 0)
38+
39+
def setstate(self, state):
40+
if state:
41+
self.encoder = None
42+
else:
43+
if sys.byteorder == 'little':
44+
self.encoder = codecs.utf_32_le_encode
45+
else:
46+
self.encoder = codecs.utf_32_be_encode
47+
48+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
49+
def __init__(self, errors='strict'):
50+
codecs.BufferedIncrementalDecoder.__init__(self, errors)
51+
self.decoder = None
52+
53+
def _buffer_decode(self, input, errors, final):
54+
if self.decoder is None:
55+
(output, consumed, byteorder) = \
56+
codecs.utf_32_ex_decode(input, errors, 0, final)
57+
if byteorder == -1:
58+
self.decoder = codecs.utf_32_le_decode
59+
elif byteorder == 1:
60+
self.decoder = codecs.utf_32_be_decode
61+
elif consumed >= 4:
62+
raise UnicodeError("UTF-32 stream does not start with BOM")
63+
return (output, consumed)
64+
return self.decoder(input, self.errors, final)
65+
66+
def reset(self):
67+
codecs.BufferedIncrementalDecoder.reset(self)
68+
self.decoder = None
69+
70+
def getstate(self):
71+
# additional state info from the base class must be None here,
72+
# as it isn't passed along to the caller
73+
state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
74+
# additional state info we pass to the caller:
75+
# 0: stream is in natural order for this platform
76+
# 1: stream is in unnatural order
77+
# 2: endianness hasn't been determined yet
78+
if self.decoder is None:
79+
return (state, 2)
80+
addstate = int((sys.byteorder == "big") !=
81+
(self.decoder is codecs.utf_32_be_decode))
82+
return (state, addstate)
83+
84+
def setstate(self, state):
85+
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
86+
codecs.BufferedIncrementalDecoder.setstate(self, state)
87+
state = state[1]
88+
if state == 0:
89+
self.decoder = (codecs.utf_32_be_decode
90+
if sys.byteorder == "big"
91+
else codecs.utf_32_le_decode)
92+
elif state == 1:
93+
self.decoder = (codecs.utf_32_le_decode
94+
if sys.byteorder == "big"
95+
else codecs.utf_32_be_decode)
96+
else:
97+
self.decoder = None
98+
99+
class StreamWriter(codecs.StreamWriter):
100+
def __init__(self, stream, errors='strict'):
101+
self.bom_written = False
102+
codecs.StreamWriter.__init__(self, stream, errors)
103+
104+
def encode(self, input, errors='strict'):
105+
self.bom_written = True
106+
result = codecs.utf_32_encode(input, errors)
107+
if sys.byteorder == 'little':
108+
self.encode = codecs.utf_32_le_encode
109+
else:
110+
self.encode = codecs.utf_32_be_encode
111+
return result
112+
113+
class StreamReader(codecs.StreamReader):
114+
115+
def reset(self):
116+
codecs.StreamReader.reset(self)
117+
try:
118+
del self.decode
119+
except AttributeError:
120+
pass
121+
122+
def decode(self, input, errors='strict'):
123+
(object, consumed, byteorder) = \
124+
codecs.utf_32_ex_decode(input, errors, 0, False)
125+
if byteorder == -1:
126+
self.decode = codecs.utf_32_le_decode
127+
elif byteorder == 1:
128+
self.decode = codecs.utf_32_be_decode
129+
elif consumed>=4:
130+
raise UnicodeError,"UTF-32 stream does not start with BOM"
131+
return (object, consumed)
132+
133+
### encodings module API
134+
135+
def getregentry():
136+
return codecs.CodecInfo(
137+
name='utf-32',
138+
encode=encode,
139+
decode=decode,
140+
incrementalencoder=IncrementalEncoder,
141+
incrementaldecoder=IncrementalDecoder,
142+
streamreader=StreamReader,
143+
streamwriter=StreamWriter,
144+
)

0 commit comments

Comments
 (0)