@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
28202820
28212821/* --- MBCS codecs for Windows -------------------------------------------- */
28222822
2823- PyObject * PyUnicode_DecodeMBCS (const char * s ,
2824- Py_ssize_t size ,
2825- const char * errors )
2823+ #if SIZEOF_INT < SIZEOF_SSIZE_T
2824+ #define NEED_RETRY
2825+ #endif
2826+
2827+ /* XXX This code is limited to "true" double-byte encodings, as
2828+ a) it assumes an incomplete character consists of a single byte, and
2829+ b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2830+ encodings, see IsDBCSLeadByteEx documentation. */
2831+
/* Tell whether the byte at s[offset] begins a double-byte character.
 *
 * s      -- start of the MBCS buffer
 * offset -- index of the byte to examine
 *
 * Returns 1 if that byte is a DBCS lead byte which is not itself the
 * second byte of the character before it, 0 otherwise.
 */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    /* A byte outside the lead-byte range can never start a pair. */
    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);

    /* CharPrev handed back cur itself: nothing precedes this byte. */
    if (before == cur)
        return 1;

    /* The character before is not lead-byte valued, so cur cannot be
       its trail byte. */
    if (!IsDBCSLeadByte(*before))
        return 1;

    /* The previous character spans two bytes, so cur starts a new one;
       otherwise cur is the trail byte of 'before'. */
    return (cur - before == 2);
}
2842+
2843+ /*
2844+ * Decode MBCS string into unicode object. If 'final' is set, converts
2845+ * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2846+ */
2847+ static int decode_mbcs (PyUnicodeObject * * v ,
2848+ const char * s , /* MBCS string */
2849+ int size , /* sizeof MBCS string */
2850+ int final )
28262851{
2827- PyUnicodeObject * v ;
28282852 Py_UNICODE * p ;
2829- DWORD usize ;
2853+ Py_ssize_t n = 0 ;
2854+ int usize = 0 ;
2855+
2856+ assert (size >= 0 );
2857+
2858+ /* Skip trailing lead-byte unless 'final' is set */
2859+ if (!final && size >= 1 && is_dbcs_lead_byte (s , size - 1 ))
2860+ -- size ;
28302861
28312862 /* First get the size of the result */
2832- assert (size < INT_MAX );
2833- usize = MultiByteToWideChar (CP_ACP , 0 , s , (int )size , NULL , 0 );
2834- if (size > 0 && usize == 0 )
2835- return PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2863+ if (size > 0 ) {
2864+ usize = MultiByteToWideChar (CP_ACP , 0 , s , size , NULL , 0 );
2865+ if (usize == 0 ) {
2866+ PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2867+ return -1 ;
2868+ }
2869+ }
28362870
2837- v = _PyUnicode_New (usize );
2838- if (v == NULL )
2839- return NULL ;
2840- if (usize == 0 )
2841- return (PyObject * )v ;
2842- p = PyUnicode_AS_UNICODE (v );
2843- if (0 == MultiByteToWideChar (CP_ACP , 0 , s , (int )size , p , usize )) {
2844- Py_DECREF (v );
2845- return PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2871+ if (* v == NULL ) {
2872+ /* Create unicode object */
2873+ * v = _PyUnicode_New (usize );
2874+ if (* v == NULL )
2875+ return -1 ;
2876+ }
2877+ else {
2878+ /* Extend unicode object */
2879+ n = PyUnicode_GET_SIZE (* v );
2880+ if (_PyUnicode_Resize (v , n + usize ) < 0 )
2881+ return -1 ;
2882+ }
2883+
2884+ /* Do the conversion */
2885+ if (size > 0 ) {
2886+ p = PyUnicode_AS_UNICODE (* v ) + n ;
2887+ if (0 == MultiByteToWideChar (CP_ACP , 0 , s , size , p , usize )) {
2888+ PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2889+ return -1 ;
2890+ }
2891+ }
2892+
2893+ return size ;
2894+ }
2895+
2896+ PyObject * PyUnicode_DecodeMBCSStateful (const char * s ,
2897+ Py_ssize_t size ,
2898+ const char * errors ,
2899+ Py_ssize_t * consumed )
2900+ {
2901+ PyUnicodeObject * v = NULL ;
2902+ int done ;
2903+
2904+ if (consumed )
2905+ * consumed = 0 ;
2906+
2907+ #ifdef NEED_RETRY
2908+ retry :
2909+ if (size > INT_MAX )
2910+ done = decode_mbcs (& v , s , INT_MAX , 0 );
2911+ else
2912+ #endif
2913+ done = decode_mbcs (& v , s , (int )size , !consumed );
2914+
2915+ if (done < 0 ) {
2916+ Py_XDECREF (v );
2917+ return NULL ;
2918+ }
2919+
2920+ if (consumed )
2921+ * consumed += done ;
2922+
2923+ #ifdef NEED_RETRY
2924+ if (size > INT_MAX ) {
2925+ s += done ;
2926+ size -= done ;
2927+ goto retry ;
28462928 }
2929+ #endif
28472930
28482931 return (PyObject * )v ;
28492932}
28502933
2851- PyObject * PyUnicode_EncodeMBCS (const Py_UNICODE * p ,
2934+ PyObject * PyUnicode_DecodeMBCS (const char * s ,
28522935 Py_ssize_t size ,
28532936 const char * errors )
28542937{
2855- PyObject * repr ;
2856- char * s ;
2857- DWORD mbcssize ;
2938+ return PyUnicode_DecodeMBCSStateful (s , size , errors , NULL );
2939+ }
28582940
2859- /* If there are no characters, bail now! */
2860- if (size == 0 )
2861- return PyString_FromString ("" );
2941+ /*
2942+ * Convert unicode into string object (MBCS).
2943+ * Returns 0 if succeed, -1 otherwise.
2944+ */
2945+ static int encode_mbcs (PyObject * * repr ,
2946+ const Py_UNICODE * p , /* unicode */
2947+ int size ) /* size of unicode */
2948+ {
2949+ int mbcssize = 0 ;
2950+ Py_ssize_t n = 0 ;
2951+
2952+ assert (size >= 0 );
28622953
28632954 /* First get the size of the result */
2864- assert (size < INT_MAX );
2865- mbcssize = WideCharToMultiByte (CP_ACP , 0 , p , (int )size , NULL , 0 , NULL , NULL );
2866- if (mbcssize == 0 )
2867- return PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2955+ if (size > 0 ) {
2956+ mbcssize = WideCharToMultiByte (CP_ACP , 0 , p , size , NULL , 0 , NULL , NULL );
2957+ if (mbcssize == 0 ) {
2958+ PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2959+ return -1 ;
2960+ }
2961+ }
28682962
2869- repr = PyString_FromStringAndSize (NULL , mbcssize );
2870- if (repr == NULL )
2871- return NULL ;
2872- if (mbcssize == 0 )
2873- return repr ;
2963+ if (* repr == NULL ) {
2964+ /* Create string object */
2965+ * repr = PyString_FromStringAndSize (NULL , mbcssize );
2966+ if (* repr == NULL )
2967+ return -1 ;
2968+ }
2969+ else {
2970+ /* Extend string object */
2971+ n = PyString_Size (* repr );
2972+ if (_PyString_Resize (repr , n + mbcssize ) < 0 )
2973+ return -1 ;
2974+ }
28742975
28752976 /* Do the conversion */
2876- s = PyString_AS_STRING (repr );
2877- assert (size < INT_MAX );
2878- if (0 == WideCharToMultiByte (CP_ACP , 0 , p , (int )size , s , mbcssize , NULL , NULL )) {
2879- Py_DECREF (repr );
2880- return PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2977+ if (size > 0 ) {
2978+ char * s = PyString_AS_STRING (* repr ) + n ;
2979+ if (0 == WideCharToMultiByte (CP_ACP , 0 , p , size , s , mbcssize , NULL , NULL )) {
2980+ PyErr_SetFromWindowsErrWithFilename (0 , NULL );
2981+ return -1 ;
2982+ }
28812983 }
2984+
2985+ return 0 ;
2986+ }
2987+
2988+ PyObject * PyUnicode_EncodeMBCS (const Py_UNICODE * p ,
2989+ Py_ssize_t size ,
2990+ const char * errors )
2991+ {
2992+ PyObject * repr = NULL ;
2993+ int ret ;
2994+
2995+ #ifdef NEED_RETRY
2996+ retry :
2997+ if (size > INT_MAX )
2998+ ret = encode_mbcs (& repr , p , INT_MAX );
2999+ else
3000+ #endif
3001+ ret = encode_mbcs (& repr , p , (int )size );
3002+
3003+ if (ret < 0 ) {
3004+ Py_XDECREF (repr );
3005+ return NULL ;
3006+ }
3007+
3008+ #ifdef NEED_RETRY
3009+ if (size > INT_MAX ) {
3010+ p += INT_MAX ;
3011+ size -= INT_MAX ;
3012+ goto retry ;
3013+ }
3014+ #endif
3015+
28823016 return repr ;
28833017}
28843018
@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
28933027 NULL );
28943028}
28953029
3030+ #undef NEED_RETRY
3031+
28963032#endif /* MS_WINDOWS */
28973033
28983034/* --- Character Mapping Codec -------------------------------------------- */
0 commit comments