Skip to content

Commit 2bf8993

Browse files
committed
Optimize bytes.fromhex() and bytearray.fromhex()
Issue python#25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster.

Changes:

* Use a fast path working on a char* string for ASCII strings
* Use a slow path for non-ASCII strings
* Replace the slow hex_digit_to_int() function with an O(1) lookup in the precomputed _PyLong_DigitValue table
* Use the _PyBytesWriter API to handle the buffer
* Add unit tests to check the error position in error messages
1 parent ebcf9ed commit 2bf8993

File tree

7 files changed

+101
-95
lines changed

7 files changed

+101
-95
lines changed

Doc/whatsnew/3.6.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ Optimizations
161161
* ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
162162
Victor Stinner in :issue:`25399`).
163163

164+
* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
165+
between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
166+
164167

165168
Build and C API Changes
166169
=======================

Include/bytesobject.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
6767
Py_ssize_t format_len,
6868
PyObject *args,
6969
int use_bytearray);
70+
PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
71+
PyObject *string,
72+
int use_bytearray);
7073
#endif
7174
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
7275
const char *, Py_ssize_t,

Include/longobject.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
6565
# error "void* different in size from int, long and long long"
6666
#endif /* SIZEOF_VOID_P */
6767

68-
/* Used by Python/mystrtoul.c. */
68+
/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
6969
#ifndef Py_LIMITED_API
7070
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
7171
#endif

Lib/test/test_bytes.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,20 @@ def test_fromhex(self):
301301
self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
302302
self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34')
303303

304+
for data, pos in (
305+
# invalid first hexadecimal character
306+
('12 x4 56', 3),
307+
# invalid second hexadecimal character
308+
('12 3x 56', 4),
309+
# two invalid hexadecimal characters
310+
('12 xy 56', 3),
311+
# test non-ASCII string
312+
('12 3\xff 56', 4),
313+
):
314+
with self.assertRaises(ValueError) as cm:
315+
self.type2test.fromhex(data)
316+
self.assertIn('at position %s' % pos, str(cm.exception))
317+
304318
def test_hex(self):
305319
self.assertRaises(TypeError, self.type2test.hex)
306320
self.assertRaises(TypeError, self.type2test.hex, 1)

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
14+
between 2x and 3.5x faster.
15+
1316
- Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
1417
API. Formatting is now between 2.5 and 5 times faster.
1518

Objects/bytearrayobject.c

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2823,48 +2823,7 @@ static PyObject *
28232823
bytearray_fromhex_impl(PyObject*cls, PyObject *string)
28242824
/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
28252825
{
2826-
PyObject *newbytes;
2827-
char *buf;
2828-
Py_ssize_t hexlen, byteslen, i, j;
2829-
int top, bot;
2830-
void *data;
2831-
unsigned int kind;
2832-
2833-
assert(PyUnicode_Check(string));
2834-
if (PyUnicode_READY(string))
2835-
return NULL;
2836-
kind = PyUnicode_KIND(string);
2837-
data = PyUnicode_DATA(string);
2838-
hexlen = PyUnicode_GET_LENGTH(string);
2839-
2840-
byteslen = hexlen/2; /* This overestimates if there are spaces */
2841-
newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
2842-
if (!newbytes)
2843-
return NULL;
2844-
buf = PyByteArray_AS_STRING(newbytes);
2845-
for (i = j = 0; i < hexlen; i += 2) {
2846-
/* skip over spaces in the input */
2847-
while (PyUnicode_READ(kind, data, i) == ' ')
2848-
i++;
2849-
if (i >= hexlen)
2850-
break;
2851-
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
2852-
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
2853-
if (top == -1 || bot == -1) {
2854-
PyErr_Format(PyExc_ValueError,
2855-
"non-hexadecimal number found in "
2856-
"fromhex() arg at position %zd", i);
2857-
goto error;
2858-
}
2859-
buf[j++] = (top << 4) + bot;
2860-
}
2861-
if (PyByteArray_Resize(newbytes, j) < 0)
2862-
goto error;
2863-
return newbytes;
2864-
2865-
error:
2866-
Py_DECREF(newbytes);
2867-
return NULL;
2826+
return _PyBytes_FromHex(string, 1);
28682827
}
28692828

28702829
PyDoc_STRVAR(hex__doc__,

Objects/bytesobject.c

Lines changed: 76 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
3030
*/
3131
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
3232

33+
/* Forward declaration */
34+
Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
35+
char *str);
36+
3337
/*
3438
For PyBytes_FromString(), the parameter `str' points to a null-terminated
3539
string containing exactly `size' bytes.
@@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
30783082
);
30793083
}
30803084

3081-
static int
3082-
hex_digit_to_int(Py_UCS4 c)
3083-
{
3084-
if (c >= 128)
3085-
return -1;
3086-
if (Py_ISDIGIT(c))
3087-
return c - '0';
3088-
else {
3089-
if (Py_ISUPPER(c))
3090-
c = Py_TOLOWER(c);
3091-
if (c >= 'a' && c <= 'f')
3092-
return c - 'a' + 10;
3093-
}
3094-
return -1;
3095-
}
3096-
30973085
/*[clinic input]
30983086
@classmethod
30993087
bytes.fromhex
@@ -3111,47 +3099,83 @@ static PyObject *
31113099
bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
31123100
/*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
31133101
{
3114-
PyObject *newstring;
3102+
return _PyBytes_FromHex(string, 0);
3103+
}
3104+
3105+
PyObject*
3106+
_PyBytes_FromHex(PyObject *string, int use_bytearray)
3107+
{
31153108
char *buf;
3116-
Py_ssize_t hexlen, byteslen, i, j;
3117-
int top, bot;
3118-
void *data;
3119-
unsigned int kind;
3109+
Py_ssize_t hexlen, invalid_char;
3110+
unsigned int top, bot;
3111+
Py_UCS1 *str, *end;
3112+
_PyBytesWriter writer;
3113+
3114+
_PyBytesWriter_Init(&writer);
3115+
writer.use_bytearray = use_bytearray;
31203116

31213117
assert(PyUnicode_Check(string));
31223118
if (PyUnicode_READY(string))
31233119
return NULL;
3124-
kind = PyUnicode_KIND(string);
3125-
data = PyUnicode_DATA(string);
31263120
hexlen = PyUnicode_GET_LENGTH(string);
31273121

3128-
byteslen = hexlen/2; /* This overestimates if there are spaces */
3129-
newstring = PyBytes_FromStringAndSize(NULL, byteslen);
3130-
if (!newstring)
3122+
if (!PyUnicode_IS_ASCII(string)) {
3123+
void *data = PyUnicode_DATA(string);
3124+
unsigned int kind = PyUnicode_KIND(string);
3125+
Py_ssize_t i;
3126+
3127+
/* search for the first non-ASCII character */
3128+
for (i = 0; i < hexlen; i++) {
3129+
if (PyUnicode_READ(kind, data, i) >= 128)
3130+
break;
3131+
}
3132+
invalid_char = i;
3133+
goto error;
3134+
}
3135+
3136+
assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
3137+
str = PyUnicode_1BYTE_DATA(string);
3138+
3139+
/* This overestimates if there are spaces */
3140+
buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
3141+
if (buf == NULL)
31313142
return NULL;
3132-
buf = PyBytes_AS_STRING(newstring);
3133-
for (i = j = 0; i < hexlen; i += 2) {
3143+
3144+
end = str + hexlen;
3145+
while (str < end) {
31343146
/* skip over spaces in the input */
3135-
while (PyUnicode_READ(kind, data, i) == ' ')
3136-
i++;
3137-
if (i >= hexlen)
3138-
break;
3139-
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
3140-
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
3141-
if (top == -1 || bot == -1) {
3142-
PyErr_Format(PyExc_ValueError,
3143-
"non-hexadecimal number found in "
3144-
"fromhex() arg at position %zd", i);
3147+
if (*str == ' ') {
3148+
do {
3149+
str++;
3150+
} while (*str == ' ');
3151+
if (str >= end)
3152+
break;
3153+
}
3154+
3155+
top = _PyLong_DigitValue[*str];
3156+
if (top >= 16) {
3157+
invalid_char = str - PyUnicode_1BYTE_DATA(string);
31453158
goto error;
31463159
}
3147-
buf[j++] = (top << 4) + bot;
3160+
str++;
3161+
3162+
bot = _PyLong_DigitValue[*str];
3163+
if (bot >= 16) {
3164+
invalid_char = str - PyUnicode_1BYTE_DATA(string);
3165+
goto error;
3166+
}
3167+
str++;
3168+
3169+
*buf++ = (unsigned char)((top << 4) + bot);
31483170
}
3149-
if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
3150-
goto error;
3151-
return newstring;
3171+
3172+
return _PyBytesWriter_Finish(&writer, buf);
31523173

31533174
error:
3154-
Py_XDECREF(newstring);
3175+
PyErr_Format(PyExc_ValueError,
3176+
"non-hexadecimal number found in "
3177+
"fromhex() arg at position %zd", invalid_char);
3178+
_PyBytesWriter_Dealloc(&writer);
31553179
return NULL;
31563180
}
31573181

@@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
38883912
}
38893913

38903914
Py_LOCAL_INLINE(Py_ssize_t)
3891-
_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str)
3915+
_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
38923916
{
38933917
char *start = _PyBytesWriter_AsString(writer);
38943918
assert(str != NULL);
@@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
39633987
allocated += allocated / OVERALLOCATE_FACTOR;
39643988
}
39653989

3966-
pos = _PyBytesWriter_GetPos(writer, str);
3990+
pos = _PyBytesWriter_GetSize(writer, str);
39673991
if (!writer->use_small_buffer) {
39683992
if (writer->use_bytearray) {
39693993
if (PyByteArray_Resize(writer->buffer, allocated))
@@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
40414065
PyObject *
40424066
_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
40434067
{
4044-
Py_ssize_t pos;
4068+
Py_ssize_t size;
40454069
PyObject *result;
40464070

40474071
_PyBytesWriter_CheckConsistency(writer, str);
40484072

4049-
pos = _PyBytesWriter_GetPos(writer, str);
4050-
if (pos == 0 && !writer->use_bytearray) {
4073+
size = _PyBytesWriter_GetSize(writer, str);
4074+
if (size == 0 && !writer->use_bytearray) {
40514075
Py_CLEAR(writer->buffer);
40524076
/* Get the empty byte string singleton */
40534077
result = PyBytes_FromStringAndSize(NULL, 0);
40544078
}
40554079
else if (writer->use_small_buffer) {
4056-
result = PyBytes_FromStringAndSize(writer->small_buffer, pos);
4080+
result = PyBytes_FromStringAndSize(writer->small_buffer, size);
40574081
}
40584082
else {
40594083
result = writer->buffer;
40604084
writer->buffer = NULL;
40614085

4062-
if (pos != writer->allocated) {
4086+
if (size != writer->allocated) {
40634087
if (writer->use_bytearray) {
4064-
if (PyByteArray_Resize(result, pos)) {
4088+
if (PyByteArray_Resize(result, size)) {
40654089
Py_DECREF(result);
40664090
return NULL;
40674091
}
40684092
}
40694093
else {
4070-
if (_PyBytes_Resize(&result, pos)) {
4094+
if (_PyBytes_Resize(&result, size)) {
40714095
assert(result == NULL);
40724096
return NULL;
40734097
}

0 commit comments

Comments
 (0)