Skip to content

Commit 5c0ba36

Browse files
committed
Fix massive slowdown in string formatting with the % operator
1 parent: 438818b; commit: 5c0ba36

1 file changed

Lines changed: 127 additions & 114 deletions

File tree

Objects/unicodeobject.c

Lines changed: 127 additions & 114 deletions
Original file line number | Diff line number | Diff line change
@@ -12693,17 +12693,13 @@ formatlong(PyObject *val, int flags, int prec, int type)
1269312693
return result;
1269412694
}
1269512695

12696-
static int
12697-
formatchar(Py_UCS4 *buf,
12698-
size_t buflen,
12699-
PyObject *v)
12696+
static Py_UCS4
12697+
formatchar(PyObject *v)
1270012698
{
1270112699
/* presume that the buffer is at least 3 characters long */
1270212700
if (PyUnicode_Check(v)) {
1270312701
if (PyUnicode_GET_LENGTH(v) == 1) {
12704-
buf[0] = PyUnicode_READ_CHAR(v, 0);
12705-
buf[1] = '\0';
12706-
return 1;
12702+
return PyUnicode_READ_CHAR(v, 0);
1270712703
}
1270812704
goto onError;
1270912705
}
@@ -12717,38 +12713,45 @@ formatchar(Py_UCS4 *buf,
1271712713
if (x < 0 || x > 0x10ffff) {
1271812714
PyErr_SetString(PyExc_OverflowError,
1271912715
"%c arg not in range(0x110000)");
12720-
return -1;
12716+
return (Py_UCS4) -1;
1272112717
}
1272212718

12723-
buf[0] = (Py_UCS4) x;
12724-
buf[1] = '\0';
12725-
return 1;
12719+
return (Py_UCS4) x;
1272612720
}
1272712721

1272812722
onError:
1272912723
PyErr_SetString(PyExc_TypeError,
1273012724
"%c requires int or char");
12731-
return -1;
12725+
return (Py_UCS4) -1;
1273212726
}
1273312727

12734-
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
12735-
FORMATBUFLEN is the length of the buffer in which chars are formatted.
12736-
*/
12737-
#define FORMATBUFLEN (size_t)10
12738-
1273912728
PyObject *
1274012729
PyUnicode_Format(PyObject *format, PyObject *args)
1274112730
{
1274212731
void *fmt;
1274312732
int fmtkind;
1274412733
PyObject *result;
12745-
Py_UCS4 *res, *res0;
12746-
Py_UCS4 max;
1274712734
int kind;
12748-
Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12735+
int r;
12736+
Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
1274912737
int args_owned = 0;
1275012738
PyObject *dict = NULL;
12739+
PyObject *temp = NULL;
12740+
PyObject *second = NULL;
1275112741
PyUnicodeObject *uformat;
12742+
_PyAccu acc;
12743+
static PyObject *plus, *minus, *blank, *zero, *percent;
12744+
12745+
if (!plus && !(plus = get_latin1_char('+')))
12746+
return NULL;
12747+
if (!minus && !(minus = get_latin1_char('-')))
12748+
return NULL;
12749+
if (!blank && !(blank = get_latin1_char(' ')))
12750+
return NULL;
12751+
if (!zero && !(zero = get_latin1_char('0')))
12752+
return NULL;
12753+
if (!percent && !(percent = get_latin1_char('%')))
12754+
return NULL;
1275212755

1275312756
if (format == NULL || args == NULL) {
1275412757
PyErr_BadInternalCall();
@@ -12757,18 +12760,13 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1275712760
uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
1275812761
if (uformat == NULL || PyUnicode_READY(uformat) == -1)
1275912762
return NULL;
12763+
if (_PyAccu_Init(&acc))
12764+
goto onError;
1276012765
fmt = PyUnicode_DATA(uformat);
1276112766
fmtkind = PyUnicode_KIND(uformat);
1276212767
fmtcnt = PyUnicode_GET_LENGTH(uformat);
1276312768
fmtpos = 0;
1276412769

12765-
reslen = rescnt = fmtcnt + 100;
12766-
res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12767-
if (res0 == NULL) {
12768-
PyErr_NoMemory();
12769-
goto onError;
12770-
}
12771-
1277212770
if (PyTuple_Check(args)) {
1277312771
arglen = PyTuple_Size(args);
1277412772
argidx = 0;
@@ -12783,34 +12781,34 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1278312781

1278412782
while (--fmtcnt >= 0) {
1278512783
if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12786-
if (--rescnt < 0) {
12787-
rescnt = fmtcnt + 100;
12788-
reslen += rescnt;
12789-
res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12790-
if (res0 == NULL){
12791-
PyErr_NoMemory();
12792-
goto onError;
12793-
}
12794-
res = res0 + reslen - rescnt;
12795-
--rescnt;
12796-
}
12797-
*res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12784+
PyObject *nonfmt;
12785+
Py_ssize_t nonfmtpos;
12786+
nonfmtpos = fmtpos++;
12787+
while (fmtcnt >= 0 &&
12788+
PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12789+
fmtpos++;
12790+
fmtcnt--;
12791+
}
12792+
nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12793+
if (nonfmt == NULL)
12794+
goto onError;
12795+
r = _PyAccu_Accumulate(&acc, nonfmt);
12796+
Py_DECREF(nonfmt);
12797+
if (r)
12798+
goto onError;
1279812799
}
1279912800
else {
1280012801
/* Got a format specifier */
1280112802
int flags = 0;
1280212803
Py_ssize_t width = -1;
1280312804
int prec = -1;
1280412805
Py_UCS4 c = '\0';
12805-
Py_UCS4 fill;
12806+
Py_UCS4 fill, sign;
1280612807
int isnumok;
1280712808
PyObject *v = NULL;
12808-
PyObject *temp = NULL;
12809-
void *pbuf;
12810-
Py_ssize_t pindex;
12811-
Py_UNICODE sign;
12812-
Py_ssize_t len, len1;
12813-
Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12809+
void *pbuf = NULL;
12810+
Py_ssize_t pindex, len;
12811+
PyObject *signobj = NULL, *fillobj = NULL;
1281412812

1281512813
fmtpos++;
1281612814
if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
@@ -12955,15 +12953,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1295512953
}
1295612954
sign = 0;
1295712955
fill = ' ';
12956+
fillobj = blank;
1295812957
switch (c) {
1295912958

1296012959
case '%':
12961-
pbuf = formatbuf;
12962-
kind = PyUnicode_4BYTE_KIND;
12963-
/* presume that buffer length is at least 1 */
12964-
PyUnicode_WRITE(kind, pbuf, 0, '%');
12965-
len = 1;
12966-
break;
12960+
_PyAccu_Accumulate(&acc, percent);
12961+
continue;
1296712962

1296812963
case 's':
1296912964
case 'r':
@@ -13045,8 +13040,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1304513040
"not %.200s", (char)c, Py_TYPE(v)->tp_name);
1304613041
goto onError;
1304713042
}
13048-
if (flags & F_ZERO)
13043+
if (flags & F_ZERO) {
1304913044
fill = '0';
13045+
fillobj = zero;
13046+
}
1305013047
break;
1305113048

1305213049
case 'e':
@@ -13066,17 +13063,25 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1306613063
kind = PyUnicode_KIND(temp);
1306713064
len = PyUnicode_GET_LENGTH(temp);
1306813065
sign = 1;
13069-
if (flags & F_ZERO)
13066+
if (flags & F_ZERO) {
1307013067
fill = '0';
13068+
fillobj = zero;
13069+
}
1307113070
break;
1307213071

1307313072
case 'c':
13074-
pbuf = formatbuf;
13075-
kind = PyUnicode_4BYTE_KIND;
13076-
len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
13077-
if (len < 0)
13073+
{
13074+
Py_UCS4 ch = formatchar(v);
13075+
if (ch == (Py_UCS4) -1)
13076+
goto onError;
13077+
temp = _PyUnicode_FromUCS4(&ch, 1);
13078+
if (temp == NULL)
1307813079
goto onError;
13080+
pbuf = PyUnicode_DATA(temp);
13081+
kind = PyUnicode_KIND(temp);
13082+
len = PyUnicode_GET_LENGTH(temp);
1307913083
break;
13084+
}
1308013085

1308113086
default:
1308213087
PyErr_Format(PyExc_ValueError,
@@ -13090,90 +13095,105 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1309013095
/* pbuf is initialized here. */
1309113096
pindex = 0;
1309213097
if (sign) {
13093-
if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13094-
PyUnicode_READ(kind, pbuf, pindex) == '+') {
13095-
sign = PyUnicode_READ(kind, pbuf, pindex++);
13098+
if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13099+
signobj = minus;
1309613100
len--;
13101+
pindex++;
13102+
}
13103+
else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13104+
signobj = plus;
13105+
len--;
13106+
pindex++;
1309713107
}
1309813108
else if (flags & F_SIGN)
13099-
sign = '+';
13109+
signobj = plus;
1310013110
else if (flags & F_BLANK)
13101-
sign = ' ';
13111+
signobj = blank;
1310213112
else
1310313113
sign = 0;
1310413114
}
1310513115
if (width < len)
1310613116
width = len;
13107-
if (rescnt - (sign != 0) < width) {
13108-
reslen -= rescnt;
13109-
rescnt = width + fmtcnt + 100;
13110-
reslen += rescnt;
13111-
if (reslen < 0) {
13112-
Py_XDECREF(temp);
13113-
PyErr_NoMemory();
13114-
goto onError;
13115-
}
13116-
res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13117-
if (res0 == 0) {
13118-
PyErr_NoMemory();
13119-
Py_XDECREF(temp);
13120-
goto onError;
13121-
}
13122-
res = res0 + reslen - rescnt;
13123-
}
1312413117
if (sign) {
13125-
if (fill != ' ')
13126-
*res++ = sign;
13127-
rescnt--;
13118+
if (fill != ' ') {
13119+
assert(signobj != NULL);
13120+
if (_PyAccu_Accumulate(&acc, signobj))
13121+
goto onError;
13122+
}
1312813123
if (width > len)
1312913124
width--;
1313013125
}
1313113126
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
1313213127
assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13133-
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13128+
assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
1313413129
if (fill != ' ') {
13135-
*res++ = PyUnicode_READ(kind, pbuf, pindex++);
13136-
*res++ = PyUnicode_READ(kind, pbuf, pindex++);
13130+
second = get_latin1_char(
13131+
PyUnicode_READ(kind, pbuf, pindex + 1));
13132+
pindex += 2;
13133+
if (second == NULL ||
13134+
_PyAccu_Accumulate(&acc, zero) ||
13135+
_PyAccu_Accumulate(&acc, second))
13136+
goto onError;
13137+
Py_CLEAR(second);
1313713138
}
13138-
rescnt -= 2;
1313913139
width -= 2;
1314013140
if (width < 0)
1314113141
width = 0;
1314213142
len -= 2;
1314313143
}
1314413144
if (width > len && !(flags & F_LJUST)) {
13145+
assert(fillobj != NULL);
1314513146
do {
13146-
--rescnt;
13147-
*res++ = fill;
13147+
if (_PyAccu_Accumulate(&acc, fillobj))
13148+
goto onError;
1314813149
} while (--width > len);
1314913150
}
1315013151
if (fill == ' ') {
13151-
if (sign)
13152-
*res++ = sign;
13152+
if (sign) {
13153+
assert(signobj != NULL);
13154+
if (_PyAccu_Accumulate(&acc, signobj))
13155+
goto onError;
13156+
}
1315313157
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
1315413158
assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
1315513159
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13156-
*res++ = PyUnicode_READ(kind, pbuf, pindex++);
13157-
*res++ = PyUnicode_READ(kind, pbuf, pindex++);
13160+
second = get_latin1_char(
13161+
PyUnicode_READ(kind, pbuf, pindex + 1));
13162+
pindex += 2;
13163+
if (second == NULL ||
13164+
_PyAccu_Accumulate(&acc, zero) ||
13165+
_PyAccu_Accumulate(&acc, second))
13166+
goto onError;
13167+
Py_CLEAR(second);
1315813168
}
1315913169
}
1316013170
/* Copy all characters, preserving len */
13161-
len1 = len;
13162-
while (len1--) {
13163-
*res++ = PyUnicode_READ(kind, pbuf, pindex++);
13164-
rescnt--;
13171+
if (temp != NULL) {
13172+
assert(pbuf == PyUnicode_DATA(temp));
13173+
v = PyUnicode_Substring(temp, pindex, pindex + len);
13174+
}
13175+
else {
13176+
const char *p = (const char *) pbuf;
13177+
assert(pbuf != NULL);
13178+
p = p + PyUnicode_KIND_SIZE(kind, pindex);
13179+
v = PyUnicode_FromKindAndData(kind, p, len);
1316513180
}
13181+
if (v == NULL)
13182+
goto onError;
13183+
r = _PyAccu_Accumulate(&acc, v);
13184+
Py_DECREF(v);
13185+
if (r)
13186+
goto onError;
1316613187
while (--width >= len) {
13167-
--rescnt;
13168-
*res++ = ' ';
13188+
if (_PyAccu_Accumulate(&acc, blank))
13189+
goto onError;
1316913190
}
1317013191
if (dict && (argidx < arglen) && c != '%') {
1317113192
PyErr_SetString(PyExc_TypeError,
1317213193
"not all arguments converted during string formatting");
13173-
Py_XDECREF(temp);
1317413194
goto onError;
1317513195
}
13176-
Py_XDECREF(temp);
13196+
Py_CLEAR(temp);
1317713197
} /* '%' */
1317813198
} /* until end */
1317913199
if (argidx < arglen && !dict) {
@@ -13182,27 +13202,20 @@ PyUnicode_Format(PyObject *format, PyObject *args)
1318213202
goto onError;
1318313203
}
1318413204

13185-
13186-
for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13187-
if (*res > max)
13188-
max = *res;
13189-
result = PyUnicode_New(reslen - rescnt, max);
13190-
if (!result)
13191-
goto onError;
13192-
kind = PyUnicode_KIND(result);
13193-
for (res = res0; res < res0+reslen-rescnt; res++)
13194-
PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13195-
PyMem_Free(res0);
13205+
result = _PyAccu_Finish(&acc);
1319613206
if (args_owned) {
1319713207
Py_DECREF(args);
1319813208
}
1319913209
Py_DECREF(uformat);
13200-
assert(_PyUnicode_CheckConsistency(result, 1));
13210+
Py_XDECREF(temp);
13211+
Py_XDECREF(second);
1320113212
return (PyObject *)result;
1320213213

1320313214
onError:
13204-
PyMem_Free(res0);
1320513215
Py_DECREF(uformat);
13216+
Py_XDECREF(temp);
13217+
Py_XDECREF(second);
13218+
_PyAccu_Destroy(&acc);
1320613219
if (args_owned) {
1320713220
Py_DECREF(args);
1320813221
}

0 commit comments

Comments
 (0)