Skip to content

Commit bea5706

Browse files
authored
bpo-32677: Optimize str.isascii() (GH-5356)
1 parent ea8fc52 commit bea5706

File tree

2 files changed

+43
-4
lines changed

2 files changed

+43
-4
lines changed

Lib/test/string_tests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,13 @@ def test_isascii(self):
916916
self.checkequal(True, '\x00\x7f', 'isascii')
917917
self.checkequal(False, '\x80', 'isascii')
918918
self.checkequal(False, '\xe9', 'isascii')
919+
# bytes.isascii() and bytearray.isascii() have an optimization that
920+
# checks 4 or 8 bytes at once, so test several alignments.
921+
for p in range(8):
922+
self.checkequal(True, ' '*p + '\x7f', 'isascii')
923+
self.checkequal(False, ' '*p + '\x80', 'isascii')
924+
self.checkequal(True, ' '*p + '\x7f' + ' '*8, 'isascii')
925+
self.checkequal(False, ' '*p + '\x80' + ' '*8, 'isascii')
919926

920927
def test_isdigit(self):
921928
self.checkequal(False, '', 'isdigit')

Objects/bytes_methods.c

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,19 +98,51 @@ PyDoc_STRVAR_shared(_Py_isascii__doc__,
9898
Return True if B is empty or all characters in B are ASCII,\n\
9999
False otherwise.");
100100

101+
// Optimization is copied from ascii_decode in unicodeobject.c
102+
/* Mask to quickly check whether a C 'long' contains a
103+
non-ASCII, UTF8-encoded char. */
104+
#if (SIZEOF_LONG == 8)
105+
# define ASCII_CHAR_MASK 0x8080808080808080UL
106+
#elif (SIZEOF_LONG == 4)
107+
# define ASCII_CHAR_MASK 0x80808080UL
108+
#else
109+
# error C 'long' size should be either 4 or 8!
110+
#endif
111+
101112
PyObject*
102113
_Py_bytes_isascii(const char *cptr, Py_ssize_t len)
103114
{
104-
const unsigned char *p = (unsigned char *) cptr;
105-
const unsigned char *e = p + len;
106-
for (; p < e; p++) {
107-
if (*p >= 128) {
115+
const char *p = cptr;
116+
const char *end = p + len;
117+
const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
118+
119+
while (p < end) {
120+
/* Fast path, see STRINGLIB(utf8_decode) in stringlib/codecs.h
121+
for an explanation. */
122+
if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
123+
/* Help register allocation */
124+
const char *_p = p;
125+
while (_p < aligned_end) {
126+
unsigned long value = *(unsigned long *) _p;
127+
if (value & ASCII_CHAR_MASK) {
128+
Py_RETURN_FALSE;
129+
}
130+
_p += SIZEOF_LONG;
131+
}
132+
p = _p;
133+
if (_p == end)
134+
break;
135+
}
136+
if ((unsigned char)*p & 0x80) {
108137
Py_RETURN_FALSE;
109138
}
139+
p++;
110140
}
111141
Py_RETURN_TRUE;
112142
}
113143

144+
#undef ASCII_CHAR_MASK
145+
114146

115147
PyDoc_STRVAR_shared(_Py_isdigit__doc__,
116148
"B.isdigit() -> bool\n\

0 commit comments

Comments
 (0)