Skip to content

Commit e2cef88

Browse files
Issue python#16061: Speed up str.replace() for replacing 1-character strings.
1 parent a707f29 commit e2cef88

File tree

7 files changed

+102
-26
lines changed

7 files changed

+102
-26
lines changed

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,7 @@ UNICODE_DEPS = \
726726
$(srcdir)/Objects/stringlib/find_max_char.h \
727727
$(srcdir)/Objects/stringlib/localeutil.h \
728728
$(srcdir)/Objects/stringlib/partition.h \
729+
$(srcdir)/Objects/stringlib/replace.h \
729730
$(srcdir)/Objects/stringlib/split.h \
730731
$(srcdir)/Objects/stringlib/ucs1lib.h \
731732
$(srcdir)/Objects/stringlib/ucs2lib.h \

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ What's New in Python 3.4.0 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #16061: Speed up str.replace() for replacing 1-character strings.
14+
1315
- Issue #17715: Fix segmentation fault from raising an exception in a __trunc__
1416
method.
1517

Objects/stringlib/replace.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/* stringlib: replace implementation */
2+
3+
#ifndef STRINGLIB_FASTSEARCH_H
4+
#error must include "stringlib/fastsearch.h" before including this module
5+
#endif
6+
7+
Py_LOCAL_INLINE(void)
8+
STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end,
9+
Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10+
{
11+
*s = u2;
12+
while (--maxcount && ++s != end) {
13+
/* Find the next character to be replaced.
14+
15+
If it occurs often, it is faster to scan for it using an inline
16+
loop. If it occurs seldom, it is faster to scan for it using a
17+
function call; the overhead of the function call is amortized
18+
across the many characters that call covers. We start with an
19+
inline loop and use a heuristic to determine whether to fall back
20+
to a function call. */
21+
if (*s != u1) {
22+
int attempts = 10;
23+
/* search u1 in a dummy loop */
24+
while (1) {
25+
if (++s == end)
26+
return;
27+
if (*s == u1)
28+
break;
29+
if (!--attempts) {
30+
/* if u1 was not found for attempts iterations,
31+
use FASTSEARCH() or memchr() */
32+
#if STRINGLIB_SIZEOF_CHAR == 1
33+
s++;
34+
s = memchr(s, u1, end - s);
35+
if (s == NULL)
36+
return;
37+
#else
38+
Py_ssize_t i;
39+
STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1;
40+
s++;
41+
i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH);
42+
if (i < 0)
43+
return;
44+
s += i;
45+
#endif
46+
/* restart the dummy loop */
47+
break;
48+
}
49+
}
50+
}
51+
*s = u2;
52+
}
53+
}

Objects/unicodeobject.c

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
605605
#include "stringlib/split.h"
606606
#include "stringlib/count.h"
607607
#include "stringlib/find.h"
608+
#include "stringlib/replace.h"
608609
#include "stringlib/find_max_char.h"
609610
#include "stringlib/localeutil.h"
610611
#include "stringlib/undef.h"
@@ -615,6 +616,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
615616
#include "stringlib/split.h"
616617
#include "stringlib/count.h"
617618
#include "stringlib/find.h"
619+
#include "stringlib/replace.h"
618620
#include "stringlib/find_max_char.h"
619621
#include "stringlib/localeutil.h"
620622
#include "stringlib/undef.h"
@@ -625,6 +627,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
625627
#include "stringlib/split.h"
626628
#include "stringlib/count.h"
627629
#include "stringlib/find.h"
630+
#include "stringlib/replace.h"
628631
#include "stringlib/find_max_char.h"
629632
#include "stringlib/localeutil.h"
630633
#include "stringlib/undef.h"
@@ -9927,6 +9930,31 @@ anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
99279930
return 0;
99289931
}
99299932

9933+
static void
9934+
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
9935+
Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
9936+
{
9937+
int kind = PyUnicode_KIND(u);
9938+
void *data = PyUnicode_DATA(u);
9939+
Py_ssize_t len = PyUnicode_GET_LENGTH(u);
9940+
if (kind == PyUnicode_1BYTE_KIND) {
9941+
ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
9942+
(Py_UCS1 *)data + len,
9943+
u1, u2, maxcount);
9944+
}
9945+
else if (kind == PyUnicode_2BYTE_KIND) {
9946+
ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
9947+
(Py_UCS2 *)data + len,
9948+
u1, u2, maxcount);
9949+
}
9950+
else {
9951+
assert(kind == PyUnicode_4BYTE_KIND);
9952+
ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
9953+
(Py_UCS4 *)data + len,
9954+
u1, u2, maxcount);
9955+
}
9956+
}
9957+
99309958
static PyObject *
99319959
replace(PyObject *self, PyObject *str1,
99329960
PyObject *str2, Py_ssize_t maxcount)
@@ -9943,7 +9971,7 @@ replace(PyObject *self, PyObject *str1,
99439971
Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
99449972
Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
99459973
int mayshrink;
9946-
Py_UCS4 maxchar, maxchar_str2;
9974+
Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
99479975

99489976
if (maxcount < 0)
99499977
maxcount = PY_SSIZE_T_MAX;
@@ -9952,15 +9980,16 @@ replace(PyObject *self, PyObject *str1,
99529980

99539981
if (str1 == str2)
99549982
goto nothing;
9955-
if (skind < kind1)
9956-
/* substring too wide to be present */
9957-
goto nothing;
99589983

99599984
maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9985+
maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
9986+
if (maxchar < maxchar_str1)
9987+
/* substring too wide to be present */
9988+
goto nothing;
99609989
maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
99619990
/* Replacing str1 with str2 may cause a maxchar reduction in the
99629991
result string. */
9963-
mayshrink = (maxchar_str2 < maxchar);
9992+
mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
99649993
maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
99659994

99669995
if (len1 == len2) {
@@ -9970,36 +9999,19 @@ replace(PyObject *self, PyObject *str1,
99709999
if (len1 == 1) {
997110000
/* replace characters */
997210001
Py_UCS4 u1, u2;
9973-
int rkind;
9974-
Py_ssize_t index, pos;
9975-
char *src, *rbuf;
10002+
Py_ssize_t pos;
997610003

997710004
u1 = PyUnicode_READ(kind1, buf1, 0);
9978-
pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10005+
pos = findchar(sbuf, skind, slen, u1, 1);
997910006
if (pos < 0)
998010007
goto nothing;
998110008
u2 = PyUnicode_READ(kind2, buf2, 0);
998210009
u = PyUnicode_New(slen, maxchar);
998310010
if (!u)
998410011
goto error;
9985-
_PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9986-
rkind = PyUnicode_KIND(u);
9987-
rbuf = PyUnicode_DATA(u);
998810012

9989-
PyUnicode_WRITE(rkind, rbuf, pos, u2);
9990-
index = 0;
9991-
src = sbuf;
9992-
while (--maxcount)
9993-
{
9994-
pos++;
9995-
src += pos * PyUnicode_KIND(self);
9996-
slen -= pos;
9997-
index += pos;
9998-
pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9999-
if (pos < 0)
10000-
break;
10001-
PyUnicode_WRITE(rkind, rbuf, index + pos, u2);
10002-
}
10013+
_PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10014+
replace_1char_inplace(u, pos, u1, u2, maxcount);
1000310015
}
1000410016
else {
1000510017
int rkind = skind;

PC/VS9.0/pythoncore.vcproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,10 @@
15861586
RelativePath="..\..\Objects\rangeobject.c"
15871587
>
15881588
</File>
1589+
<File
1590+
RelativePath="..\..\Objects\stringlib\replace.h"
1591+
>
1592+
</File>
15891593
<File
15901594
RelativePath="..\..\Objects\setobject.c"
15911595
>

PCbuild/pythoncore.vcxproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@
475475
<ClInclude Include="..\Objects\stringlib\fastsearch.h" />
476476
<ClInclude Include="..\Objects\stringlib\find.h" />
477477
<ClInclude Include="..\Objects\stringlib\partition.h" />
478+
<ClInclude Include="..\Objects\stringlib\replace.h" />
478479
<ClInclude Include="..\Objects\stringlib\split.h" />
479480
<ClInclude Include="..\Objects\unicodetype_db.h" />
480481
<ClInclude Include="..\Parser\parser.h" />

PCbuild/pythoncore.vcxproj.filters

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,9 @@
378378
<ClInclude Include="..\Objects\stringlib\partition.h">
379379
<Filter>Objects</Filter>
380380
</ClInclude>
381+
<ClInclude Include="..\Objects\stringlib\replace.h">
382+
<Filter>Objects</Filter>
383+
</ClInclude>
381384
<ClInclude Include="..\Objects\stringlib\split.h">
382385
<Filter>Objects</Filter>
383386
</ClInclude>

0 commit comments

Comments
 (0)