@@ -5085,32 +5085,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
50855085 const char * errors ,
50865086 int byteorder )
50875087{
5088- int kind ;
5089- void * data ;
5088+ enum PyUnicode_Kind kind ;
5089+ const void * data ;
50905090 Py_ssize_t len ;
50915091 PyObject * v ;
5092- unsigned char * p ;
5093- Py_ssize_t nsize , i ;
5094- /* Offsets from p for storing byte pairs in the right order. */
5092+ PY_UINT32_T * out ;
50955093#if PY_LITTLE_ENDIAN
5096- int iorder [] = { 0 , 1 , 2 , 3 } ;
5094+ int native_ordering = byteorder <= 0 ;
50975095#else
5098- int iorder [] = { 3 , 2 , 1 , 0 } ;
5096+ int native_ordering = byteorder >= 0 ;
50995097#endif
51005098 const char * encoding ;
5099+ Py_ssize_t nsize , pos ;
51015100 PyObject * errorHandler = NULL ;
51025101 PyObject * exc = NULL ;
51035102 PyObject * rep = NULL ;
51045103
5105- #define STORECHAR (CH ) \
5106- do { \
5107- p[iorder[3]] = ((CH) >> 24) & 0xff; \
5108- p[iorder[2]] = ((CH) >> 16) & 0xff; \
5109- p[iorder[1]] = ((CH) >> 8) & 0xff; \
5110- p[iorder[0]] = (CH) & 0xff; \
5111- p += 4; \
5112- } while(0)
5113-
51145104 if (!PyUnicode_Check (str )) {
51155105 PyErr_BadArgument ();
51165106 return NULL ;
@@ -5121,67 +5111,61 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51215111 data = PyUnicode_DATA (str );
51225112 len = PyUnicode_GET_LENGTH (str );
51235113
5124- nsize = len + (byteorder == 0 );
5125- if (nsize > PY_SSIZE_T_MAX / 4 )
5114+ if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0 ))
51265115 return PyErr_NoMemory ();
5116+ nsize = len + (byteorder == 0 );
51275117 v = PyBytes_FromStringAndSize (NULL , nsize * 4 );
51285118 if (v == NULL )
51295119 return NULL ;
51305120
5131- p = (unsigned char * )PyBytes_AS_STRING (v );
5121+ /* output buffer is 4-byte aligned */
5122+ assert (_Py_IS_ALIGNED (PyBytes_AS_STRING (v ), 4 ));
5123+ out = (PY_UINT32_T * )PyBytes_AS_STRING (v );
51325124 if (byteorder == 0 )
5133- STORECHAR ( 0xFEFF ) ;
5125+ * out ++ = 0xFEFF ;
51345126 if (len == 0 )
5135- return v ;
5127+ goto done ;
51365128
5137- if (byteorder == -1 ) {
5138- /* force LE */
5139- iorder [0 ] = 0 ;
5140- iorder [1 ] = 1 ;
5141- iorder [2 ] = 2 ;
5142- iorder [3 ] = 3 ;
5129+ if (byteorder == -1 )
51435130 encoding = "utf-32-le" ;
5144- }
5145- else if (byteorder == 1 ) {
5146- /* force BE */
5147- iorder [0 ] = 3 ;
5148- iorder [1 ] = 2 ;
5149- iorder [2 ] = 1 ;
5150- iorder [3 ] = 0 ;
5131+ else if (byteorder == 1 )
51515132 encoding = "utf-32-be" ;
5152- }
51535133 else
51545134 encoding = "utf-32" ;
51555135
51565136 if (kind == PyUnicode_1BYTE_KIND ) {
5157- for (i = 0 ; i < len ; i ++ )
5158- STORECHAR (PyUnicode_READ (kind , data , i ));
5159- return v ;
5137+ ucs1lib_utf32_encode ((const Py_UCS1 * )data , len , & out , native_ordering );
5138+ goto done ;
51605139 }
51615140
5162- for (i = 0 ; i < len ;) {
5141+ pos = 0 ;
5142+ while (pos < len ) {
51635143 Py_ssize_t repsize , moreunits ;
5164- Py_UCS4 ch = PyUnicode_READ (kind , data , i );
5165- i ++ ;
5166- assert (ch <= MAX_UNICODE );
5167- if (!Py_UNICODE_IS_SURROGATE (ch )) {
5168- STORECHAR (ch );
5169- continue ;
5144+
5145+ if (kind == PyUnicode_2BYTE_KIND ) {
5146+ pos += ucs2lib_utf32_encode ((const Py_UCS2 * )data + pos , len - pos ,
5147+ & out , native_ordering );
51705148 }
5149+ else {
5150+ assert (kind == PyUnicode_4BYTE_KIND );
5151+ pos += ucs4lib_utf32_encode ((const Py_UCS4 * )data + pos , len - pos ,
5152+ & out , native_ordering );
5153+ }
5154+ if (pos == len )
5155+ break ;
51715156
51725157 rep = unicode_encode_call_errorhandler (
51735158 errors , & errorHandler ,
51745159 encoding , "surrogates not allowed" ,
5175- str , & exc , i - 1 , i , & i );
5176-
5160+ str , & exc , pos , pos + 1 , & pos );
51775161 if (!rep )
51785162 goto error ;
51795163
51805164 if (PyBytes_Check (rep )) {
51815165 repsize = PyBytes_GET_SIZE (rep );
51825166 if (repsize & 3 ) {
51835167 raise_encode_exception (& exc , encoding ,
5184- str , i - 1 , i ,
5168+ str , pos - 1 , pos ,
51855169 "surrogates not allowed" );
51865170 goto error ;
51875171 }
@@ -5194,15 +5178,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51945178 moreunits = repsize = PyUnicode_GET_LENGTH (rep );
51955179 if (!PyUnicode_IS_ASCII (rep )) {
51965180 raise_encode_exception (& exc , encoding ,
5197- str , i - 1 , i ,
5181+ str , pos - 1 , pos ,
51985182 "surrogates not allowed" );
51995183 goto error ;
52005184 }
52015185 }
52025186
52035187 /* four bytes are reserved for each surrogate */
52045188 if (moreunits > 1 ) {
5205- Py_ssize_t outpos = p - (unsigned char * ) PyBytes_AS_STRING (v );
5189+ Py_ssize_t outpos = out - (PY_UINT32_T * ) PyBytes_AS_STRING (v );
52065190 Py_ssize_t morebytes = 4 * (moreunits - 1 );
52075191 if (PyBytes_GET_SIZE (v ) > PY_SSIZE_T_MAX - morebytes ) {
52085192 /* integer overflow */
@@ -5211,20 +5195,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
52115195 }
52125196 if (_PyBytes_Resize (& v , PyBytes_GET_SIZE (v ) + morebytes ) < 0 )
52135197 goto error ;
5214- p = (unsigned char * ) PyBytes_AS_STRING (v ) + outpos ;
5198+ out = (PY_UINT32_T * ) PyBytes_AS_STRING (v ) + outpos ;
52155199 }
52165200
52175201 if (PyBytes_Check (rep )) {
5218- Py_MEMCPY (p , PyBytes_AS_STRING (rep ), repsize );
5219- p += repsize ;
5202+ Py_MEMCPY (out , PyBytes_AS_STRING (rep ), repsize );
5203+ out += moreunits ;
52205204 } else /* rep is unicode */ {
5221- const Py_UCS1 * repdata ;
52225205 assert (PyUnicode_KIND (rep ) == PyUnicode_1BYTE_KIND );
5223- repdata = PyUnicode_1BYTE_DATA (rep );
5224- while (repsize -- ) {
5225- Py_UCS4 ch = * repdata ++ ;
5226- STORECHAR (ch );
5227- }
5206+ ucs1lib_utf32_encode (PyUnicode_1BYTE_DATA (rep ), repsize ,
5207+ & out , native_ordering );
52285208 }
52295209
52305210 Py_CLEAR (rep );
@@ -5233,19 +5213,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
52335213 /* Cut back to size actually needed. This is necessary, for example,
52345214 when encoding a string containing isolated surrogates and the 'ignore'
52355215 handler is used. */
5236- nsize = p - (unsigned char * ) PyBytes_AS_STRING (v );
5216+ nsize = ( unsigned char * ) out - (unsigned char * ) PyBytes_AS_STRING (v );
52375217 if (nsize != PyBytes_GET_SIZE (v ))
52385218 _PyBytes_Resize (& v , nsize );
52395219 Py_XDECREF (errorHandler );
52405220 Py_XDECREF (exc );
5221+ done :
52415222 return v ;
52425223 error :
52435224 Py_XDECREF (rep );
52445225 Py_XDECREF (errorHandler );
52455226 Py_XDECREF (exc );
52465227 Py_XDECREF (v );
52475228 return NULL ;
5248- #undef STORECHAR
52495229}
52505230
52515231PyObject *
0 commit comments