Skip to content

Commit e7bf86c

Browse files
committed
Optimize backslashreplace error handler
Issue python#25318: Optimize the backslashreplace and xmlcharrefreplace error handlers in the UTF-8 encoder. Also optimize the backslashreplace error handler for the ASCII and Latin1 encoders. These error handlers now use the new _PyBytesWriter API inside the encoders, which avoids creating an exception and calling the slow implementation of the error handler.
1 parent fdfbf78 commit e7bf86c

File tree

2 files changed

+160
-51
lines changed

2 files changed

+160
-51
lines changed

Objects/stringlib/codecs.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
334334
i += (endpos - startpos - 1);
335335
break;
336336

337-
338337
case _Py_ERROR_SURROGATEPASS:
339338
for (k=startpos; k<endpos; k++) {
340339
ch = data[k];
@@ -345,6 +344,22 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
345344
i += (endpos - startpos - 1);
346345
break;
347346

347+
case _Py_ERROR_BACKSLASHREPLACE:
348+
p = backslashreplace(&writer, max_char_size, p,
349+
unicode, startpos, endpos);
350+
if (p == NULL)
351+
goto error;
352+
i += (endpos - startpos - 1);
353+
break;
354+
355+
case _Py_ERROR_XMLCHARREFREPLACE:
356+
p = xmlcharrefreplace(&writer, max_char_size, p,
357+
unicode, startpos, endpos);
358+
if (p == NULL)
359+
goto error;
360+
i += (endpos - startpos - 1);
361+
break;
362+
348363
case _Py_ERROR_SURROGATEESCAPE:
349364
for (k=startpos; k<endpos; k++) {
350365
ch = data[k];
@@ -359,7 +374,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
359374
startpos = k;
360375
assert(startpos < endpos);
361376
/* fall through the default handler */
362-
363377
default:
364378
rep = unicode_encode_call_errorhandler(
365379
errors, &error_handler_obj, "utf-8", "surrogates not allowed",

Objects/unicodeobject.c

Lines changed: 144 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -305,28 +305,29 @@ typedef enum {
305305
_Py_ERROR_UNKNOWN=0,
306306
_Py_ERROR_STRICT,
307307
_Py_ERROR_SURROGATEESCAPE,
308-
_Py_ERROR_SURROGATEPASS,
309308
_Py_ERROR_REPLACE,
310309
_Py_ERROR_IGNORE,
310+
_Py_ERROR_BACKSLASHREPLACE,
311+
_Py_ERROR_SURROGATEPASS,
311312
_Py_ERROR_XMLCHARREFREPLACE,
312313
_Py_ERROR_OTHER
313314
} _Py_error_handler;
314315

315316
static _Py_error_handler
316317
get_error_handler(const char *errors)
317318
{
318-
if (errors == NULL)
319-
return _Py_ERROR_STRICT;
320-
if (strcmp(errors, "strict") == 0)
319+
if (errors == NULL || strcmp(errors, "strict") == 0)
321320
return _Py_ERROR_STRICT;
322321
if (strcmp(errors, "surrogateescape") == 0)
323322
return _Py_ERROR_SURROGATEESCAPE;
324-
if (strcmp(errors, "surrogatepass") == 0)
325-
return _Py_ERROR_SURROGATEPASS;
326-
if (strcmp(errors, "ignore") == 0)
327-
return _Py_ERROR_IGNORE;
328323
if (strcmp(errors, "replace") == 0)
329324
return _Py_ERROR_REPLACE;
325+
if (strcmp(errors, "ignore") == 0)
326+
return _Py_ERROR_IGNORE;
327+
if (strcmp(errors, "backslashreplace") == 0)
328+
return _Py_ERROR_BACKSLASHREPLACE;
329+
if (strcmp(errors, "surrogatepass") == 0)
330+
return _Py_ERROR_SURROGATEPASS;
330331
if (strcmp(errors, "xmlcharrefreplace") == 0)
331332
return _Py_ERROR_XMLCHARREFREPLACE;
332333
return _Py_ERROR_OTHER;
@@ -771,6 +772,126 @@ unicode_result_unchanged(PyObject *unicode)
771772
return _PyUnicode_Copy(unicode);
772773
}
773774

775+
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
776+
ASCII, Latin1, UTF-8, etc. */
777+
static char*
778+
backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
779+
char *str,
780+
PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781+
{
782+
Py_ssize_t size, i, prealloc;
783+
Py_UCS4 ch;
784+
enum PyUnicode_Kind kind;
785+
void *data;
786+
787+
assert(PyUnicode_IS_READY(unicode));
788+
kind = PyUnicode_KIND(unicode);
789+
data = PyUnicode_DATA(unicode);
790+
791+
size = 0;
792+
/* determine replacement size */
793+
for (i = collstart; i < collend; ++i) {
794+
Py_ssize_t incr;
795+
796+
ch = PyUnicode_READ(kind, data, i);
797+
if (ch < 0x100)
798+
incr = 2+2;
799+
else if (ch < 0x10000)
800+
incr = 2+4;
801+
else {
802+
assert(ch <= MAX_UNICODE);
803+
incr = 2+6;
804+
}
805+
if (size > PY_SSIZE_T_MAX - incr) {
806+
PyErr_SetString(PyExc_OverflowError,
807+
"encoded result is too long for a Python string");
808+
return NULL;
809+
}
810+
size += incr;
811+
}
812+
813+
prealloc = prealloc_per_char * (collend - collstart);
814+
if (size > prealloc) {
815+
str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
816+
if (str == NULL)
817+
return NULL;
818+
}
819+
820+
/* generate replacement */
821+
for (i = collstart; i < collend; ++i) {
822+
ch = PyUnicode_READ(kind, data, i);
823+
if (ch < 0x100)
824+
str += sprintf(str, "\\x%02x", ch);
825+
else if (ch < 0x10000)
826+
str += sprintf(str, "\\u%04x", ch);
827+
else {
828+
assert(ch <= MAX_UNICODE);
829+
str += sprintf(str, "\\U%08x", ch);
830+
}
831+
}
832+
return str;
833+
}
834+
835+
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
836+
ASCII, Latin1, UTF-8, etc. */
837+
static char*
838+
xmlcharrefreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
839+
char *str,
840+
PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
841+
{
842+
Py_ssize_t size, i, prealloc;
843+
Py_UCS4 ch;
844+
enum PyUnicode_Kind kind;
845+
void *data;
846+
847+
assert(PyUnicode_IS_READY(unicode));
848+
kind = PyUnicode_KIND(unicode);
849+
data = PyUnicode_DATA(unicode);
850+
851+
size = 0;
852+
/* determine replacement size */
853+
for (i = collstart; i < collend; ++i) {
854+
Py_ssize_t incr;
855+
856+
ch = PyUnicode_READ(kind, data, i);
857+
if (ch < 10)
858+
incr = 2+1+1;
859+
else if (ch < 100)
860+
incr = 2+2+1;
861+
else if (ch < 1000)
862+
incr = 2+3+1;
863+
else if (ch < 10000)
864+
incr = 2+4+1;
865+
else if (ch < 100000)
866+
incr = 2+5+1;
867+
else if (ch < 1000000)
868+
incr = 2+6+1;
869+
else {
870+
assert(ch <= MAX_UNICODE);
871+
incr = 2+7+1;
872+
}
873+
if (size > PY_SSIZE_T_MAX - incr) {
874+
PyErr_SetString(PyExc_OverflowError,
875+
"encoded result is too long for a Python string");
876+
return NULL;
877+
}
878+
size += incr;
879+
}
880+
881+
prealloc = prealloc_per_char * (collend - collstart);
882+
if (size > prealloc) {
883+
str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
884+
if (str == NULL)
885+
return NULL;
886+
}
887+
888+
/* generate replacement */
889+
for (i = collstart; i < collend; ++i) {
890+
str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
891+
}
892+
return str;
893+
}
894+
774895
/* --- Bloom Filters ----------------------------------------------------- */
775896

776897
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6713,7 +6834,6 @@ unicode_encode_ucs1(PyObject *unicode,
67136834
++pos;
67146835
}
67156836
else {
6716-
Py_ssize_t requiredsize;
67176837
PyObject *repunicode;
67186838
Py_ssize_t repsize, newpos, i;
67196839
/* startpos for collecting unencodable chars */
@@ -6744,42 +6864,19 @@ unicode_encode_ucs1(PyObject *unicode,
67446864
pos = collend;
67456865
break;
67466866

6747-
case _Py_ERROR_XMLCHARREFREPLACE:
6748-
requiredsize = 0;
6749-
/* determine replacement size */
6750-
for (i = collstart; i < collend; ++i) {
6751-
Py_ssize_t incr;
6752-
6753-
ch = PyUnicode_READ(kind, data, i);
6754-
if (ch < 10)
6755-
incr = 2+1+1;
6756-
else if (ch < 100)
6757-
incr = 2+2+1;
6758-
else if (ch < 1000)
6759-
incr = 2+3+1;
6760-
else if (ch < 10000)
6761-
incr = 2+4+1;
6762-
else if (ch < 100000)
6763-
incr = 2+5+1;
6764-
else if (ch < 1000000)
6765-
incr = 2+6+1;
6766-
else {
6767-
assert(ch <= MAX_UNICODE);
6768-
incr = 2+7+1;
6769-
}
6770-
if (requiredsize > PY_SSIZE_T_MAX - incr)
6771-
goto overflow;
6772-
requiredsize += incr;
6773-
}
6774-
6775-
str = _PyBytesWriter_Prepare(&writer, str, requiredsize-1);
6867+
case _Py_ERROR_BACKSLASHREPLACE:
6868+
str = backslashreplace(&writer, 1, str,
6869+
unicode, collstart, collend);
67766870
if (str == NULL)
67776871
goto onError;
6872+
pos = collend;
6873+
break;
67786874

6779-
/* generate replacement */
6780-
for (i = collstart; i < collend; ++i) {
6781-
str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6782-
}
6875+
case _Py_ERROR_XMLCHARREFREPLACE:
6876+
str = xmlcharrefreplace(&writer, 1, str,
6877+
unicode, collstart, collend);
6878+
if (str == NULL)
6879+
goto onError;
67836880
pos = collend;
67846881
break;
67856882

@@ -6810,9 +6907,11 @@ unicode_encode_ucs1(PyObject *unicode,
68106907
if (PyBytes_Check(repunicode)) {
68116908
/* Directly copy bytes result to output. */
68126909
repsize = PyBytes_Size(repunicode);
6813-
str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
6814-
if (str == NULL)
6815-
goto onError;
6910+
if (repsize > 1) {
6911+
str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
6912+
if (str == NULL)
6913+
goto onError;
6914+
}
68166915
memcpy(str, PyBytes_AsString(repunicode), repsize);
68176916
str += repsize;
68186917
pos = newpos;
@@ -6856,10 +6955,6 @@ unicode_encode_ucs1(PyObject *unicode,
68566955
Py_XDECREF(exc);
68576956
return _PyBytesWriter_Finish(&writer, str);
68586957

6859-
overflow:
6860-
PyErr_SetString(PyExc_OverflowError,
6861-
"encoded result is too long for a Python string");
6862-
68636958
onError:
68646959
_PyBytesWriter_Dealloc(&writer);
68656960
Py_XDECREF(error_handler_obj);

0 commit comments

Comments
 (0)