glix
diff --git a/‎Doc/c-api/concrete.rst‎
Lines changed: 73 additions & 0 deletions b/‎Doc/c-api/concrete.rst‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎Doc/library/codecs.rst‎
Lines changed: 6 additions & 0 deletions b/‎Doc/library/codecs.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎Include/unicodeobject.h‎
Lines changed: 82 additions & 0 deletions b/‎Include/unicodeobject.h‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎Lib/encodings/aliases.py‎
Lines changed: 10 additions & 0 deletions b/‎Lib/encodings/aliases.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Lib/encodings/utf_32.py‎
Lines changed: 144 additions & 0 deletions b/‎Lib/encodings/utf_32.py‎
Lines changed: 144 additions & 0 deletions
@@ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs:
    object.  Error handling is "strict".  Return *NULL* if an exception was raised
    by the codec.
 
+These are the UTF-32 codec APIs:
+
+.. % --- UTF-32 Codecs ------------------------------------------------------ */
+
+
+.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
+
+   Decode *size* bytes from a UTF-32 encoded buffer string and return the
+   corresponding Unicode object.  *errors* (if non-*NULL*) defines the error
+   handling. It defaults to "strict".
+
+   If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
+   order::
+
+      *byteorder == -1: little endian
+      *byteorder == 0:  native order
+      *byteorder == 1:  big endian
+
+   and then switches if the first four bytes of the input data are a byte order mark
+   (BOM) and the specified byte order is native order.  This BOM is not copied into
+   the resulting Unicode string.  After completion, *\*byteorder* is set to the
+   current byte order at the end of input data.
+
+   In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
+
+   If *byteorder* is *NULL*, the codec starts in native order mode.
+
+   Return *NULL* if an exception was raised by the codec.
+
+   .. versionadded:: 2.6
+
+
+.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
+
+   If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
+   *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
+   trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
+   by four) as an error. Those bytes will not be decoded and the number of bytes
+   that have been decoded will be stored in *consumed*.
+
+   .. versionadded:: 2.6
+
+
+.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
+
+   Return a Python string object holding the UTF-32 encoded value of the Unicode
+   data in *s*.  Output is written according to the byte order selected by
+   *byteorder*::
+
+      byteorder == -1: little endian
+      byteorder == 0:  native byte order (writes a BOM mark)
+      byteorder == 1:  big endian
+
+   If byteorder is ``0``, the output string will always start with the Unicode BOM
+   mark (U+FEFF). In the other two modes, no BOM mark is prepended.
+
+   If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
+   as a single codepoint.
+
+   Return *NULL* if an exception was raised by the codec.
+
+   .. versionadded:: 2.6
+
+
+.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
+
+   Return a Python string using the UTF-32 encoding in native byte order. The
+   string always starts with a BOM mark.  Error handling is "strict".  Return
+   *NULL* if an exception was raised by the codec.
+
+   .. versionadded:: 2.6
+
+
 These are the UTF-16 codec APIs:
 
 .. % --- UTF-16 Codecs ------------------------------------------------------ */
 
@@ -1045,6 +1045,12 @@ particular, the following variants typically exist:
 | shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       |
 |                 | s_jisx0213                     |                                |
 +-----------------+--------------------------------+--------------------------------+
+| utf_32          | U32, utf32                     | all languages                  |
++-----------------+--------------------------------+--------------------------------+
+| utf_32_be       | UTF-32BE                       | all languages                  |
++-----------------+--------------------------------+--------------------------------+
+| utf_32_le       | UTF-32LE                       | all languages                  |
++-----------------+--------------------------------+--------------------------------+
 | utf_16          | U16, utf16                     | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 | utf_16_be       | UTF-16BE                       | all languages (BMP only)       |
 
@@ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
+# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@@ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
+# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
+# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@@ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
+# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@@ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
+# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@@ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
+# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
+# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@@ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
+# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@@ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
     const char *errors		/* error handling */
     );
 
+/* --- UTF-32 Codecs ------------------------------------------------------ */
+
+/* Decodes length bytes from a UTF-32 encoded buffer string and returns
+   the corresponding Unicode object.
+
+   errors (if non-NULL) defines the error handling. It defaults
+   to "strict". 
+
+   If byteorder is non-NULL, the decoder starts decoding using the
+   given byte order:
+
+	*byteorder == -1: little endian
+	*byteorder == 0:  native order
+	*byteorder == 1:  big endian
+
+   In native mode, the first four bytes of the stream are checked for a
+   BOM mark. If found, the BOM mark is analysed, the byte order
+   adjusted and the BOM skipped.  In the other modes, no BOM mark
+   interpretation is done. After completion, *byteorder is set to the
+   current byte order at the end of input data.
+
+   If byteorder is NULL, the codec starts in native order mode.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
+    const char *string, 	/* UTF-32 encoded string */
+    Py_ssize_t length,	 	/* size of string */
+    const char *errors,		/* error handling */
+    int *byteorder		/* pointer to byteorder to use
+				   0=native;-1=LE,1=BE; updated on
+				   exit */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
+    const char *string, 	/* UTF-32 encoded string */
+    Py_ssize_t length,	 	/* size of string */
+    const char *errors,		/* error handling */
+    int *byteorder,		/* pointer to byteorder to use
+				   0=native;-1=LE,1=BE; updated on
+				   exit */
+    Py_ssize_t *consumed	/* bytes consumed */
+    );
+
+/* Returns a Python string using the UTF-32 encoding in native byte
+   order. The string always starts with a BOM mark.  */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
+    PyObject *unicode	 	/* Unicode object */
+    );
+
+/* Returns a Python string object holding the UTF-32 encoded value of
+   the Unicode data.
+
+   If byteorder is not 0, output is written according to the following
+   byte order:
+
+   byteorder == -1: little endian
+   byteorder == 0:  native byte order (writes a BOM mark)
+   byteorder == 1:  big endian
+
+   If byteorder is 0, the output string will always start with the
+   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+   prepended.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
+    const Py_UNICODE *data, 	/* Unicode char buffer */
+    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
+    const char *errors,		/* error handling */
+    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
+    );
+
 /* --- UTF-16 Codecs ------------------------------------------------------ */
 
 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 
@@ -490,6 +490,16 @@
     'unicodelittleunmarked' : 'utf_16_le',
     'utf_16le'           : 'utf_16_le',
 
+    # utf_32 codec
+    'u32'                : 'utf_32',
+    'utf32'              : 'utf_32',
+
+    # utf_32_be codec
+    'utf_32be'           : 'utf_32_be',
+
+    # utf_32_le codec
+    'utf_32le'           : 'utf_32_le',
+
     # utf_7 codec
     'u7'                 : 'utf_7',
     'utf7'               : 'utf_7',
 
@@ -0,0 +1,144 @@
+"""
+Python 'utf-32' Codec
+"""
+import codecs, sys
+
+### Codec APIs
+
+encode = codecs.utf_32_encode
+
+def decode(input, errors='strict'):
+    """Decode *input* with codecs.utf_32_decode in a single call.
+
+    The third argument to codecs.utf_32_decode is always True here, i.e.
+    *input* is treated as the final (complete) piece of data.  The value
+    returned by codecs.utf_32_decode is passed back unchanged.
+    """
+    final = True
+    return codecs.utf_32_decode(input, errors, final)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    """Incremental encoder for the 'utf-32' codec.
+
+    self.encoder is None until the first chunk has been encoded with
+    codecs.utf_32_encode; afterwards it holds the fixed-endianness encoder
+    matching sys.byteorder, which handles every later chunk.
+    """
+
+    def __init__(self, errors='strict'):
+        codecs.IncrementalEncoder.__init__(self, errors)
+        # None means: the next encode() call is the first one.
+        self.encoder = None
+
+    def encode(self, input, final=False):
+        """Encode *input* and return only the encoded data."""
+        if self.encoder is not None:
+            return self.encoder(input, self.errors)[0]
+        # First chunk: use the generic encoder, then pin the encoder that
+        # matches the platform byte order for all following chunks.
+        data = codecs.utf_32_encode(input, self.errors)[0]
+        self.encoder = (codecs.utf_32_le_encode
+                        if sys.byteorder == 'little'
+                        else codecs.utf_32_be_encode)
+        return data
+
+    def reset(self):
+        """Reset the base class state and forget the pinned encoder."""
+        codecs.IncrementalEncoder.reset(self)
+        self.encoder = None
+
+    def getstate(self):
+        """Return the encoder state as an integer.
+
+        0: stream is in natural order for this platform
+        2: endianness hasn't been determined yet
+        (output is never written in unnatural order)
+        """
+        if self.encoder is None:
+            return 2
+        return 0
+
+    def setstate(self, state):
+        """Restore a state previously returned by getstate()."""
+        if state:
+            # Any truthy state means "not determined yet".
+            self.encoder = None
+            return
+        self.encoder = (codecs.utf_32_le_encode
+                        if sys.byteorder == 'little'
+                        else codecs.utf_32_be_encode)
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    """Incremental decoder for the 'utf-32' codec.
+
+    self.decoder is None until codecs.utf_32_ex_decode has reported a byte
+    order for the stream; after that it holds the matching fixed-endianness
+    decoder, which handles all further input.
+    """
+
+    def __init__(self, errors='strict'):
+        codecs.BufferedIncrementalDecoder.__init__(self, errors)
+        self.decoder = None
+
+    def _buffer_decode(self, input, errors, final):
+        """Decode *input* and return an (output, consumed) pair."""
+        if self.decoder is not None:
+            return self.decoder(input, self.errors, final)
+        (text, used, order) = \
+            codecs.utf_32_ex_decode(input, errors, 0, final)
+        if order == -1:
+            self.decoder = codecs.utf_32_le_decode
+        elif order == 1:
+            self.decoder = codecs.utf_32_be_decode
+        elif used >= 4:
+            # At least four bytes were consumed without a byte order
+            # having been reported.
+            raise UnicodeError("UTF-32 stream does not start with BOM")
+        return (text, used)
+
+    def reset(self):
+        """Reset the base class state and forget the selected decoder."""
+        codecs.BufferedIncrementalDecoder.reset(self)
+        self.decoder = None
+
+    def getstate(self):
+        """Return (buffered input, byte order flag).
+
+        Only the first item of the base class state is kept; its
+        additional state info is not passed along to the caller.
+        The byte order flag is:
+        0: stream is in natural order for this platform
+        1: stream is in unnatural order
+        2: endianness hasn't been determined yet
+        """
+        buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
+        if self.decoder is None:
+            return (buffered, 2)
+        native_is_big = (sys.byteorder == "big")
+        stream_is_big = (self.decoder is codecs.utf_32_be_decode)
+        return (buffered, int(native_is_big != stream_is_big))
+
+    def setstate(self, state):
+        """Restore a state previously returned by getstate()."""
+        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+        codecs.BufferedIncrementalDecoder.setstate(self, state)
+        flag = state[1]
+        if sys.byteorder == "big":
+            natural = codecs.utf_32_be_decode
+            unnatural = codecs.utf_32_le_decode
+        else:
+            natural = codecs.utf_32_le_decode
+            unnatural = codecs.utf_32_be_decode
+        if flag == 0:
+            self.decoder = natural
+        elif flag == 1:
+            self.decoder = unnatural
+        else:
+            self.decoder = None
+
+class StreamWriter(codecs.StreamWriter):
+    """Stream writer for the 'utf-32' codec.
+
+    The first encode() call goes through codecs.utf_32_encode and then
+    replaces itself, on the instance, with the fixed-endianness encoder
+    matching sys.byteorder.  reset() undoes that replacement so a writer
+    that has been reset starts over with the generic encoder.
+    """
+
+    def __init__(self, stream, errors='strict'):
+        self.bom_written = False
+        codecs.StreamWriter.__init__(self, stream, errors)
+
+    def reset(self):
+        """Reset the writer so the next write starts a fresh stream."""
+        codecs.StreamWriter.reset(self)
+        self.bom_written = False
+        # Remove the instance-level override installed by encode() so the
+        # class-level encode() runs again; nothing to do if it never ran.
+        try:
+            del self.encode
+        except AttributeError:
+            pass
+
+    def encode(self, input, errors='strict'):
+        """Encode the first chunk and pin the encoder for later chunks.
+
+        Returns whatever codecs.utf_32_encode returns for *input*.
+        """
+        self.bom_written = True
+        result = codecs.utf_32_encode(input, errors)
+        # Shadow this method on the instance: subsequent calls go straight
+        # to the encoder for the platform's byte order.
+        if sys.byteorder == 'little':
+            self.encode = codecs.utf_32_le_encode
+        else:
+            self.encode = codecs.utf_32_be_encode
+        return result
+
+class StreamReader(codecs.StreamReader):
+    """Stream reader for the 'utf-32' codec.
+
+    decode() runs codecs.utf_32_ex_decode until a byte order is reported,
+    then replaces itself, on the instance, with the matching
+    fixed-endianness decoder.  reset() removes that replacement again.
+    """
+
+    def reset(self):
+        """Reset the base class state and restore the class-level decode()."""
+        codecs.StreamReader.reset(self)
+        # The override only exists once a byte order has been detected.
+        try:
+            del self.decode
+        except AttributeError:
+            pass
+
+    def decode(self, input, errors='strict'):
+        """Decode *input* and return an (output, consumed) pair.
+
+        Raises UnicodeError when at least four bytes were consumed but no
+        byte order was reported by codecs.utf_32_ex_decode.
+        """
+        (output, consumed, byteorder) = \
+            codecs.utf_32_ex_decode(input, errors, 0, False)
+        if byteorder == -1:
+            self.decode = codecs.utf_32_le_decode
+        elif byteorder == 1:
+            self.decode = codecs.utf_32_be_decode
+        elif consumed >= 4:
+            raise UnicodeError("UTF-32 stream does not start with BOM")
+        return (output, consumed)
+
+### encodings module API
+
+def getregentry():
+    """Return the codecs.CodecInfo entry for this module's 'utf-32' codec."""
+    components = dict(
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
+    return codecs.CodecInfo(name='utf-32', **components)