Skip to content

Commit 583a939

Browse files
Issue python#15027: Rewrite the UTF-32 encoder. It is now 1.6x to 3.5x faster.
1 parent 41adc26 commit 583a939

File tree

4 files changed

+133
-62
lines changed

4 files changed

+133
-62
lines changed

Doc/whatsnew/3.4.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1213,7 +1213,9 @@ Other Improvements
12131213
Significant Optimizations
12141214
=========================
12151215

1216-
* The UTF-32 decoder is now 3x to 4x faster.
1216+
* The UTF-32 decoder is now 3x to 4x faster. The UTF-32 encoder is now 1.6x
1217+
to 3.5x faster. (Contributed by Serhiy Storchaka in :issue:`14625` and
1218+
:issue:`15027`.)
12171219

12181220
* The cost of hash collisions for sets is now reduced. Each hash table
12191221
probe now checks a series of consecutive, adjacent key/hash pairs before

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Release date: 2014-01-05
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #15027: Rewrite the UTF-32 encoder. It is now 1.6x to 3.5x faster.
14+
1315
- Issue #17432: Drop UCS2 from names of Unicode functions in python3.def.
1416

1517
- Issue #19526: Exclude all new API from the stable ABI. Exceptions can be

Objects/stringlib/codecs.h

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,93 @@ STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
718718
return len - (end - in + 1);
719719
#endif
720720
}
721+
722+
#if STRINGLIB_SIZEOF_CHAR == 1
723+
# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
724+
#elif STRINGLIB_SIZEOF_CHAR == 2
725+
# define SWAB4(CH, tmp) (tmp = (CH), \
726+
((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
727+
/* high bytes are zero */
728+
#else
729+
# define SWAB4(CH, tmp) (tmp = (CH), \
730+
tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
731+
((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
732+
#endif
733+
Py_LOCAL_INLINE(Py_ssize_t)
734+
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
735+
Py_ssize_t len,
736+
PY_UINT32_T **outptr,
737+
int native_ordering)
738+
{
739+
PY_UINT32_T *out = *outptr;
740+
const STRINGLIB_CHAR *end = in + len;
741+
if (native_ordering) {
742+
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
743+
while (in < unrolled_end) {
744+
#if STRINGLIB_SIZEOF_CHAR > 1
745+
/* check if any character is a surrogate character */
746+
if (((in[0] ^ 0xd800) &
747+
(in[1] ^ 0xd800) &
748+
(in[2] ^ 0xd800) &
749+
(in[3] ^ 0xd800) & 0xf800) == 0)
750+
break;
751+
#endif
752+
out[0] = in[0];
753+
out[1] = in[1];
754+
out[2] = in[2];
755+
out[3] = in[3];
756+
in += 4; out += 4;
757+
}
758+
while (in < end) {
759+
Py_UCS4 ch;
760+
ch = *in++;
761+
#if STRINGLIB_SIZEOF_CHAR > 1
762+
if (Py_UNICODE_IS_SURROGATE(ch)) {
763+
/* reject surrogate characters (U+D800-U+DFFF) */
764+
goto fail;
765+
}
766+
#endif
767+
*out++ = ch;
768+
}
769+
} else {
770+
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
771+
while (in < unrolled_end) {
772+
#if STRINGLIB_SIZEOF_CHAR > 1
773+
Py_UCS4 ch1, ch2, ch3, ch4;
774+
/* check if any character is a surrogate character */
775+
if (((in[0] ^ 0xd800) &
776+
(in[1] ^ 0xd800) &
777+
(in[2] ^ 0xd800) &
778+
(in[3] ^ 0xd800) & 0xf800) == 0)
779+
break;
780+
#endif
781+
out[0] = SWAB4(in[0], ch1);
782+
out[1] = SWAB4(in[1], ch2);
783+
out[2] = SWAB4(in[2], ch3);
784+
out[3] = SWAB4(in[3], ch4);
785+
in += 4; out += 4;
786+
}
787+
while (in < end) {
788+
Py_UCS4 ch = *in++;
789+
#if STRINGLIB_SIZEOF_CHAR > 1
790+
if (Py_UNICODE_IS_SURROGATE(ch)) {
791+
/* reject surrogate characters (U+D800-U+DFFF) */
792+
goto fail;
793+
}
794+
#endif
795+
*out++ = SWAB4(ch, ch);
796+
}
797+
}
798+
*outptr = out;
799+
return len;
800+
#if STRINGLIB_SIZEOF_CHAR > 1
801+
fail:
802+
*outptr = out;
803+
return len - (end - in + 1);
804+
#endif
805+
}
806+
#undef SWAB4
807+
721808
#endif
722809

723810
#endif /* STRINGLIB_IS_UNICODE */

Objects/unicodeobject.c

Lines changed: 41 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -5085,32 +5085,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
50855085
const char *errors,
50865086
int byteorder)
50875087
{
5088-
int kind;
5089-
void *data;
5088+
enum PyUnicode_Kind kind;
5089+
const void *data;
50905090
Py_ssize_t len;
50915091
PyObject *v;
5092-
unsigned char *p;
5093-
Py_ssize_t nsize, i;
5094-
/* Offsets from p for storing byte pairs in the right order. */
5092+
PY_UINT32_T *out;
50955093
#if PY_LITTLE_ENDIAN
5096-
int iorder[] = {0, 1, 2, 3};
5094+
int native_ordering = byteorder <= 0;
50975095
#else
5098-
int iorder[] = {3, 2, 1, 0};
5096+
int native_ordering = byteorder >= 0;
50995097
#endif
51005098
const char *encoding;
5099+
Py_ssize_t nsize, pos;
51015100
PyObject *errorHandler = NULL;
51025101
PyObject *exc = NULL;
51035102
PyObject *rep = NULL;
51045103

5105-
#define STORECHAR(CH) \
5106-
do { \
5107-
p[iorder[3]] = ((CH) >> 24) & 0xff; \
5108-
p[iorder[2]] = ((CH) >> 16) & 0xff; \
5109-
p[iorder[1]] = ((CH) >> 8) & 0xff; \
5110-
p[iorder[0]] = (CH) & 0xff; \
5111-
p += 4; \
5112-
} while(0)
5113-
51145104
if (!PyUnicode_Check(str)) {
51155105
PyErr_BadArgument();
51165106
return NULL;
@@ -5121,67 +5111,61 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51215111
data = PyUnicode_DATA(str);
51225112
len = PyUnicode_GET_LENGTH(str);
51235113

5124-
nsize = len + (byteorder == 0);
5125-
if (nsize > PY_SSIZE_T_MAX / 4)
5114+
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
51265115
return PyErr_NoMemory();
5116+
nsize = len + (byteorder == 0);
51275117
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
51285118
if (v == NULL)
51295119
return NULL;
51305120

5131-
p = (unsigned char *)PyBytes_AS_STRING(v);
5121+
/* output buffer is 4-byte aligned */
5122+
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5123+
out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
51325124
if (byteorder == 0)
5133-
STORECHAR(0xFEFF);
5125+
*out++ = 0xFEFF;
51345126
if (len == 0)
5135-
return v;
5127+
goto done;
51365128

5137-
if (byteorder == -1) {
5138-
/* force LE */
5139-
iorder[0] = 0;
5140-
iorder[1] = 1;
5141-
iorder[2] = 2;
5142-
iorder[3] = 3;
5129+
if (byteorder == -1)
51435130
encoding = "utf-32-le";
5144-
}
5145-
else if (byteorder == 1) {
5146-
/* force BE */
5147-
iorder[0] = 3;
5148-
iorder[1] = 2;
5149-
iorder[2] = 1;
5150-
iorder[3] = 0;
5131+
else if (byteorder == 1)
51515132
encoding = "utf-32-be";
5152-
}
51535133
else
51545134
encoding = "utf-32";
51555135

51565136
if (kind == PyUnicode_1BYTE_KIND) {
5157-
for (i = 0; i < len; i++)
5158-
STORECHAR(PyUnicode_READ(kind, data, i));
5159-
return v;
5137+
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5138+
goto done;
51605139
}
51615140

5162-
for (i = 0; i < len;) {
5141+
pos = 0;
5142+
while (pos < len) {
51635143
Py_ssize_t repsize, moreunits;
5164-
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5165-
i++;
5166-
assert(ch <= MAX_UNICODE);
5167-
if (!Py_UNICODE_IS_SURROGATE(ch)) {
5168-
STORECHAR(ch);
5169-
continue;
5144+
5145+
if (kind == PyUnicode_2BYTE_KIND) {
5146+
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5147+
&out, native_ordering);
51705148
}
5149+
else {
5150+
assert(kind == PyUnicode_4BYTE_KIND);
5151+
pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5152+
&out, native_ordering);
5153+
}
5154+
if (pos == len)
5155+
break;
51715156

51725157
rep = unicode_encode_call_errorhandler(
51735158
errors, &errorHandler,
51745159
encoding, "surrogates not allowed",
5175-
str, &exc, i-1, i, &i);
5176-
5160+
str, &exc, pos, pos + 1, &pos);
51775161
if (!rep)
51785162
goto error;
51795163

51805164
if (PyBytes_Check(rep)) {
51815165
repsize = PyBytes_GET_SIZE(rep);
51825166
if (repsize & 3) {
51835167
raise_encode_exception(&exc, encoding,
5184-
str, i - 1, i,
5168+
str, pos - 1, pos,
51855169
"surrogates not allowed");
51865170
goto error;
51875171
}
@@ -5194,15 +5178,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51945178
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
51955179
if (!PyUnicode_IS_ASCII(rep)) {
51965180
raise_encode_exception(&exc, encoding,
5197-
str, i - 1, i,
5181+
str, pos - 1, pos,
51985182
"surrogates not allowed");
51995183
goto error;
52005184
}
52015185
}
52025186

52035187
/* four bytes are reserved for each surrogate */
52045188
if (moreunits > 1) {
5205-
Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5189+
Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
52065190
Py_ssize_t morebytes = 4 * (moreunits - 1);
52075191
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
52085192
/* integer overflow */
@@ -5211,20 +5195,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
52115195
}
52125196
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
52135197
goto error;
5214-
p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5198+
out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
52155199
}
52165200

52175201
if (PyBytes_Check(rep)) {
5218-
Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5219-
p += repsize;
5202+
Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5203+
out += moreunits;
52205204
} else /* rep is unicode */ {
5221-
const Py_UCS1 *repdata;
52225205
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5223-
repdata = PyUnicode_1BYTE_DATA(rep);
5224-
while (repsize--) {
5225-
Py_UCS4 ch = *repdata++;
5226-
STORECHAR(ch);
5227-
}
5206+
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5207+
&out, native_ordering);
52285208
}
52295209

52305210
Py_CLEAR(rep);
@@ -5233,19 +5213,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
52335213
/* Cut back to size actually needed. This is necessary when, for example,
52345214
a string containing isolated surrogates is encoded and the 'ignore'
52355215
handler is used. */
5236-
nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5216+
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
52375217
if (nsize != PyBytes_GET_SIZE(v))
52385218
_PyBytes_Resize(&v, nsize);
52395219
Py_XDECREF(errorHandler);
52405220
Py_XDECREF(exc);
5221+
done:
52415222
return v;
52425223
error:
52435224
Py_XDECREF(rep);
52445225
Py_XDECREF(errorHandler);
52455226
Py_XDECREF(exc);
52465227
Py_XDECREF(v);
52475228
return NULL;
5248-
#undef STORECHAR
52495229
}
52505230

52515231
PyObject *

0 commit comments

Comments
 (0)