@@ -5085,22 +5085,32 @@ _PyUnicode_EncodeUTF32(PyObject *str,
50855085 const char * errors ,
50865086 int byteorder )
50875087{
5088- enum PyUnicode_Kind kind ;
5089- const void * data ;
5088+ int kind ;
5089+ void * data ;
50905090 Py_ssize_t len ;
50915091 PyObject * v ;
5092- PY_UINT32_T * out ;
5092+ unsigned char * p ;
5093+ Py_ssize_t nsize , i ;
5094+ /* Offsets from p for storing byte pairs in the right order. */
50935095#if PY_LITTLE_ENDIAN
5094- int native_ordering = byteorder <= 0 ;
5096+ int iorder [] = { 0 , 1 , 2 , 3 } ;
50955097#else
5096- int native_ordering = byteorder >= 0 ;
5098+ int iorder [] = { 3 , 2 , 1 , 0 } ;
50975099#endif
50985100 const char * encoding ;
5099- Py_ssize_t nsize , pos ;
51005101 PyObject * errorHandler = NULL ;
51015102 PyObject * exc = NULL ;
51025103 PyObject * rep = NULL ;
51035104
5105+ #define STORECHAR (CH ) \
5106+ do { \
5107+ p[iorder[3]] = ((CH) >> 24) & 0xff; \
5108+ p[iorder[2]] = ((CH) >> 16) & 0xff; \
5109+ p[iorder[1]] = ((CH) >> 8) & 0xff; \
5110+ p[iorder[0]] = (CH) & 0xff; \
5111+ p += 4; \
5112+ } while(0)
5113+
51045114 if (!PyUnicode_Check (str )) {
51055115 PyErr_BadArgument ();
51065116 return NULL ;
@@ -5111,61 +5121,67 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51115121 data = PyUnicode_DATA (str );
51125122 len = PyUnicode_GET_LENGTH (str );
51135123
5114- if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0 ))
5115- return PyErr_NoMemory ();
51165124 nsize = len + (byteorder == 0 );
5125+ if (nsize > PY_SSIZE_T_MAX / 4 )
5126+ return PyErr_NoMemory ();
51175127 v = PyBytes_FromStringAndSize (NULL , nsize * 4 );
51185128 if (v == NULL )
51195129 return NULL ;
51205130
5121- /* output buffer is 4-bytes aligned */
5122- assert (_Py_IS_ALIGNED (PyBytes_AS_STRING (v ), 4 ));
5123- out = (PY_UINT32_T * )PyBytes_AS_STRING (v );
5131+ p = (unsigned char * )PyBytes_AS_STRING (v );
51245132 if (byteorder == 0 )
5125- * out ++ = 0xFEFF ;
5133+ STORECHAR ( 0xFEFF ) ;
51265134 if (len == 0 )
5127- goto done ;
5135+ return v ;
51285136
5129- if (byteorder == -1 )
5137+ if (byteorder == -1 ) {
5138+ /* force LE */
5139+ iorder [0 ] = 0 ;
5140+ iorder [1 ] = 1 ;
5141+ iorder [2 ] = 2 ;
5142+ iorder [3 ] = 3 ;
51305143 encoding = "utf-32-le" ;
5131- else if (byteorder == 1 )
5144+ }
5145+ else if (byteorder == 1 ) {
5146+ /* force BE */
5147+ iorder [0 ] = 3 ;
5148+ iorder [1 ] = 2 ;
5149+ iorder [2 ] = 1 ;
5150+ iorder [3 ] = 0 ;
51325151 encoding = "utf-32-be" ;
5152+ }
51335153 else
51345154 encoding = "utf-32" ;
51355155
51365156 if (kind == PyUnicode_1BYTE_KIND ) {
5137- ucs1lib_utf32_encode ((const Py_UCS1 * )data , len , & out , native_ordering );
5138- goto done ;
5157+ for (i = 0 ; i < len ; i ++ )
5158+ STORECHAR (PyUnicode_READ (kind , data , i ));
5159+ return v ;
51395160 }
51405161
5141- pos = 0 ;
5142- while (pos < len ) {
5162+ for (i = 0 ; i < len ;) {
51435163 Py_ssize_t repsize , moreunits ;
5144-
5145- if (kind == PyUnicode_2BYTE_KIND ) {
5146- pos += ucs2lib_utf32_encode ((const Py_UCS2 * )data + pos , len - pos ,
5147- & out , native_ordering );
5148- }
5149- else {
5150- assert (kind == PyUnicode_4BYTE_KIND );
5151- pos += ucs4lib_utf32_encode ((const Py_UCS4 * )data + pos , len - pos ,
5152- & out , native_ordering );
5164+ Py_UCS4 ch = PyUnicode_READ (kind , data , i );
5165+ i ++ ;
5166+ assert (ch <= MAX_UNICODE );
5167+ if (!Py_UNICODE_IS_SURROGATE (ch )) {
5168+ STORECHAR (ch );
5169+ continue ;
51535170 }
5154- if (pos == len )
5155- break ;
51565171
51575172 rep = unicode_encode_call_errorhandler (
51585173 errors , & errorHandler ,
51595174 encoding , "surrogates not allowed" ,
5160- str , & exc , pos , pos + 1 , & pos );
5175+ str , & exc , i - 1 , i , & i );
5176+
51615177 if (!rep )
51625178 goto error ;
51635179
51645180 if (PyBytes_Check (rep )) {
51655181 repsize = PyBytes_GET_SIZE (rep );
51665182 if (repsize & 3 ) {
51675183 raise_encode_exception (& exc , encoding ,
5168- str , pos - 1 , pos ,
5184+ str , i - 1 , i ,
51695185 "surrogates not allowed" );
51705186 goto error ;
51715187 }
@@ -5178,15 +5194,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51785194 moreunits = repsize = PyUnicode_GET_LENGTH (rep );
51795195 if (!PyUnicode_IS_ASCII (rep )) {
51805196 raise_encode_exception (& exc , encoding ,
5181- str , pos - 1 , pos ,
5197+ str , i - 1 , i ,
51825198 "surrogates not allowed" );
51835199 goto error ;
51845200 }
51855201 }
51865202
51875203 /* four bytes are reserved for each surrogate */
51885204 if (moreunits > 1 ) {
5189- Py_ssize_t outpos = out - (PY_UINT32_T * ) PyBytes_AS_STRING (v );
5205+ Py_ssize_t outpos = p - (unsigned char * ) PyBytes_AS_STRING (v );
51905206 Py_ssize_t morebytes = 4 * (moreunits - 1 );
51915207 if (PyBytes_GET_SIZE (v ) > PY_SSIZE_T_MAX - morebytes ) {
51925208 /* integer overflow */
@@ -5195,16 +5211,20 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51955211 }
51965212 if (_PyBytes_Resize (& v , PyBytes_GET_SIZE (v ) + morebytes ) < 0 )
51975213 goto error ;
5198- out = (PY_UINT32_T * ) PyBytes_AS_STRING (v ) + outpos ;
5214+ p = (unsigned char * ) PyBytes_AS_STRING (v ) + outpos ;
51995215 }
52005216
52015217 if (PyBytes_Check (rep )) {
5202- Py_MEMCPY (out , PyBytes_AS_STRING (rep ), repsize );
5203- out += moreunits ;
5218+ Py_MEMCPY (p , PyBytes_AS_STRING (rep ), repsize );
5219+ p += repsize ;
52045220 } else /* rep is unicode */ {
5221+ const Py_UCS1 * repdata ;
52055222 assert (PyUnicode_KIND (rep ) == PyUnicode_1BYTE_KIND );
5206- ucs1lib_utf32_encode (PyUnicode_1BYTE_DATA (rep ), repsize ,
5207- & out , native_ordering );
5223+ repdata = PyUnicode_1BYTE_DATA (rep );
5224+ while (repsize -- ) {
5225+ Py_UCS4 ch = * repdata ++ ;
5226+ STORECHAR (ch );
5227+ }
52085228 }
52095229
52105230 Py_CLEAR (rep );
@@ -5213,19 +5233,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
52135233 /* Cut back to size actually needed. This is necessary for, for example,
52145234 encoding of a string containing isolated surrogates and the 'ignore'
52155235 handler is used. */
5216- nsize = ( unsigned char * ) out - (unsigned char * ) PyBytes_AS_STRING (v );
5236+ nsize = p - (unsigned char * ) PyBytes_AS_STRING (v );
52175237 if (nsize != PyBytes_GET_SIZE (v ))
52185238 _PyBytes_Resize (& v , nsize );
52195239 Py_XDECREF (errorHandler );
52205240 Py_XDECREF (exc );
5221- done :
52225241 return v ;
52235242 error :
52245243 Py_XDECREF (rep );
52255244 Py_XDECREF (errorHandler );
52265245 Py_XDECREF (exc );
52275246 Py_XDECREF (v );
52285247 return NULL ;
5248+ #undef STORECHAR
52295249}
52305250
52315251PyObject *
0 commit comments