@@ -305,28 +305,29 @@ typedef enum {
305305 _Py_ERROR_UNKNOWN = 0 ,
306306 _Py_ERROR_STRICT ,
307307 _Py_ERROR_SURROGATEESCAPE ,
308- _Py_ERROR_SURROGATEPASS ,
309308 _Py_ERROR_REPLACE ,
310309 _Py_ERROR_IGNORE ,
310+ _Py_ERROR_BACKSLASHREPLACE ,
311+ _Py_ERROR_SURROGATEPASS ,
311312 _Py_ERROR_XMLCHARREFREPLACE ,
312313 _Py_ERROR_OTHER
313314} _Py_error_handler ;
314315
315316static _Py_error_handler
316317get_error_handler (const char * errors )
317318{
318- if (errors == NULL )
319- return _Py_ERROR_STRICT ;
320- if (strcmp (errors , "strict" ) == 0 )
319+ if (errors == NULL || strcmp (errors , "strict" ) == 0 )
321320 return _Py_ERROR_STRICT ;
322321 if (strcmp (errors , "surrogateescape" ) == 0 )
323322 return _Py_ERROR_SURROGATEESCAPE ;
324- if (strcmp (errors , "surrogatepass" ) == 0 )
325- return _Py_ERROR_SURROGATEPASS ;
326- if (strcmp (errors , "ignore" ) == 0 )
327- return _Py_ERROR_IGNORE ;
328323 if (strcmp (errors , "replace" ) == 0 )
329324 return _Py_ERROR_REPLACE ;
325+ if (strcmp (errors , "ignore" ) == 0 )
326+ return _Py_ERROR_IGNORE ;
327+ if (strcmp (errors , "backslashreplace" ) == 0 )
328+ return _Py_ERROR_BACKSLASHREPLACE ;
329+ if (strcmp (errors , "surrogatepass" ) == 0 )
330+ return _Py_ERROR_SURROGATEPASS ;
330331 if (strcmp (errors , "xmlcharrefreplace" ) == 0 )
331332 return _Py_ERROR_XMLCHARREFREPLACE ;
332333 return _Py_ERROR_OTHER ;
@@ -771,6 +772,126 @@ unicode_result_unchanged(PyObject *unicode)
771772 return _PyUnicode_Copy (unicode );
772773}
773774
775+ /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
776+ ASCII, Latin1, UTF-8, etc. */
777+ static char *
778+ backslashreplace (_PyBytesWriter * writer , Py_ssize_t prealloc_per_char ,
779+ char * str ,
780+ PyObject * unicode , Py_ssize_t collstart , Py_ssize_t collend )
781+ {
782+ Py_ssize_t size , i , prealloc ;
783+ Py_UCS4 ch ;
784+ enum PyUnicode_Kind kind ;
785+ void * data ;
786+
787+ assert (PyUnicode_IS_READY (unicode ));
788+ kind = PyUnicode_KIND (unicode );
789+ data = PyUnicode_DATA (unicode );
790+
791+ size = 0 ;
792+ /* determine replacement size */
793+ for (i = collstart ; i < collend ; ++ i ) {
794+ Py_ssize_t incr ;
795+
796+ ch = PyUnicode_READ (kind , data , i );
797+ if (ch < 0x100 )
798+ incr = 2 + 2 ;
799+ else if (ch < 0x10000 )
800+ incr = 2 + 4 ;
801+ else {
802+ assert (ch <= MAX_UNICODE );
803+ incr = 2 + 6 ;
804+ }
805+ if (size > PY_SSIZE_T_MAX - incr ) {
806+ PyErr_SetString (PyExc_OverflowError ,
807+ "encoded result is too long for a Python string" );
808+ return NULL ;
809+ }
810+ size += incr ;
811+ }
812+
813+ prealloc = prealloc_per_char * (collend - collstart );
814+ if (size > prealloc ) {
815+ str = _PyBytesWriter_Prepare (writer , str , size - prealloc );
816+ if (str == NULL )
817+ return NULL ;
818+ }
819+
820+ /* generate replacement */
821+ for (i = collstart ; i < collend ; ++ i ) {
822+ ch = PyUnicode_READ (kind , data , i );
823+ if (ch < 0x100 )
824+ str += sprintf (str , "\\x%02x" , ch );
825+ else if (ch < 0x10000 )
826+ str += sprintf (str , "\\u%04x" , ch );
827+ else {
828+ assert (ch <= MAX_UNICODE );
829+ str += sprintf (str , "\\U%08x" , ch );
830+ }
831+ }
832+ return str ;
833+ }
834+
835+ /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
836+ ASCII, Latin1, UTF-8, etc. */
837+ static char *
838+ xmlcharrefreplace (_PyBytesWriter * writer , Py_ssize_t prealloc_per_char ,
839+ char * str ,
840+ PyObject * unicode , Py_ssize_t collstart , Py_ssize_t collend )
841+ {
842+ Py_ssize_t size , i , prealloc ;
843+ Py_UCS4 ch ;
844+ enum PyUnicode_Kind kind ;
845+ void * data ;
846+
847+ assert (PyUnicode_IS_READY (unicode ));
848+ kind = PyUnicode_KIND (unicode );
849+ data = PyUnicode_DATA (unicode );
850+
851+ size = 0 ;
852+ /* determine replacement size */
853+ for (i = collstart ; i < collend ; ++ i ) {
854+ Py_ssize_t incr ;
855+
856+ ch = PyUnicode_READ (kind , data , i );
857+ if (ch < 10 )
858+ incr = 2 + 1 + 1 ;
859+ else if (ch < 100 )
860+ incr = 2 + 2 + 1 ;
861+ else if (ch < 1000 )
862+ incr = 2 + 3 + 1 ;
863+ else if (ch < 10000 )
864+ incr = 2 + 4 + 1 ;
865+ else if (ch < 100000 )
866+ incr = 2 + 5 + 1 ;
867+ else if (ch < 1000000 )
868+ incr = 2 + 6 + 1 ;
869+ else {
870+ assert (ch <= MAX_UNICODE );
871+ incr = 2 + 7 + 1 ;
872+ }
873+ if (size > PY_SSIZE_T_MAX - incr ) {
874+ PyErr_SetString (PyExc_OverflowError ,
875+ "encoded result is too long for a Python string" );
876+ return NULL ;
877+ }
878+ size += incr ;
879+ }
880+
881+ prealloc = prealloc_per_char * (collend - collstart );
882+ if (size > prealloc ) {
883+ str = _PyBytesWriter_Prepare (writer , str , size - prealloc );
884+ if (str == NULL )
885+ return NULL ;
886+ }
887+
888+ /* generate replacement */
889+ for (i = collstart ; i < collend ; ++ i ) {
890+ str += sprintf (str , "&#%d;" , PyUnicode_READ (kind , data , i ));
891+ }
892+ return str ;
893+ }
894+
774895/* --- Bloom Filters ----------------------------------------------------- */
775896
776897/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6713,7 +6834,6 @@ unicode_encode_ucs1(PyObject *unicode,
67136834 ++ pos ;
67146835 }
67156836 else {
6716- Py_ssize_t requiredsize ;
67176837 PyObject * repunicode ;
67186838 Py_ssize_t repsize , newpos , i ;
67196839 /* startpos for collecting unencodable chars */
@@ -6744,42 +6864,19 @@ unicode_encode_ucs1(PyObject *unicode,
67446864 pos = collend ;
67456865 break ;
67466866
6747- case _Py_ERROR_XMLCHARREFREPLACE :
6748- requiredsize = 0 ;
6749- /* determine replacement size */
6750- for (i = collstart ; i < collend ; ++ i ) {
6751- Py_ssize_t incr ;
6752-
6753- ch = PyUnicode_READ (kind , data , i );
6754- if (ch < 10 )
6755- incr = 2 + 1 + 1 ;
6756- else if (ch < 100 )
6757- incr = 2 + 2 + 1 ;
6758- else if (ch < 1000 )
6759- incr = 2 + 3 + 1 ;
6760- else if (ch < 10000 )
6761- incr = 2 + 4 + 1 ;
6762- else if (ch < 100000 )
6763- incr = 2 + 5 + 1 ;
6764- else if (ch < 1000000 )
6765- incr = 2 + 6 + 1 ;
6766- else {
6767- assert (ch <= MAX_UNICODE );
6768- incr = 2 + 7 + 1 ;
6769- }
6770- if (requiredsize > PY_SSIZE_T_MAX - incr )
6771- goto overflow ;
6772- requiredsize += incr ;
6773- }
6774-
6775- str = _PyBytesWriter_Prepare (& writer , str , requiredsize - 1 );
6867+ case _Py_ERROR_BACKSLASHREPLACE :
6868+ str = backslashreplace (& writer , 1 , str ,
6869+ unicode , collstart , collend );
67766870 if (str == NULL )
67776871 goto onError ;
6872+ pos = collend ;
6873+ break ;
67786874
6779- /* generate replacement */
6780- for (i = collstart ; i < collend ; ++ i ) {
6781- str += sprintf (str , "&#%d;" , PyUnicode_READ (kind , data , i ));
6782- }
6875+ case _Py_ERROR_XMLCHARREFREPLACE :
6876+ str = xmlcharrefreplace (& writer , 1 , str ,
6877+ unicode , collstart , collend );
6878+ if (str == NULL )
6879+ goto onError ;
67836880 pos = collend ;
67846881 break ;
67856882
@@ -6810,9 +6907,11 @@ unicode_encode_ucs1(PyObject *unicode,
68106907 if (PyBytes_Check (repunicode )) {
68116908 /* Directly copy bytes result to output. */
68126909 repsize = PyBytes_Size (repunicode );
6813- str = _PyBytesWriter_Prepare (& writer , str , repsize - 1 );
6814- if (str == NULL )
6815- goto onError ;
6910+ if (repsize > 1 ) {
6911+ str = _PyBytesWriter_Prepare (& writer , str , repsize - 1 );
6912+ if (str == NULL )
6913+ goto onError ;
6914+ }
68166915 memcpy (str , PyBytes_AsString (repunicode ), repsize );
68176916 str += repsize ;
68186917 pos = newpos ;
@@ -6856,10 +6955,6 @@ unicode_encode_ucs1(PyObject *unicode,
68566955 Py_XDECREF (exc );
68576956 return _PyBytesWriter_Finish (& writer , str );
68586957
6859- overflow :
6860- PyErr_SetString (PyExc_OverflowError ,
6861- "encoded result is too long for a Python string" );
6862-
68636958 onError :
68646959 _PyBytesWriter_Dealloc (& writer );
68656960 Py_XDECREF (error_handler_obj );
0 commit comments