@@ -20,23 +20,46 @@ Py_LOCAL_INLINE(Py_UCS4)
2020STRINGLIB (find_max_char )(const STRINGLIB_CHAR * begin , const STRINGLIB_CHAR * end )
2121{
2222 const unsigned char * p = (const unsigned char * ) begin ;
23+ const unsigned char * _begin = (const unsigned char * )begin ;
24+ const unsigned char * aligned_start = (const unsigned char * )(
25+ ((intptr_t )_begin + (SIZEOF_SIZE_T - 1 )) & ~(SIZEOF_SIZE_T - 1 ));
26+ const unsigned char * _end = (const unsigned char * )end ;
27+ const size_t * aligned_end = (const size_t * )((intptr_t )_end & ~(SIZEOF_SIZE_T - 1 ));
28+ const size_t * unrolled_end = aligned_end - 3 ;
29+ unsigned char accumulator = 0 ;
30+ /* Do not test each character individually, bit use bitwise OR and test
31+ all characters at once. */
32+ while (p < _end && p < aligned_start ) {
33+ accumulator |= * p ;
34+ p += 1 ;
35+ }
36+ if (accumulator & 0x80 ) {
37+ return 255 ;
38+ } else if (p == end ) {
39+ return 127 ;
40+ }
2341
24- while (p < end ) {
25- if (_Py_IS_ALIGNED (p , ALIGNOF_SIZE_T )) {
26- /* Help register allocation */
27- const unsigned char * _p = p ;
28- while (_p + SIZEOF_SIZE_T <= end ) {
29- size_t value = * (const size_t * ) _p ;
30- if (value & UCS1_ASCII_CHAR_MASK )
31- return 255 ;
32- _p += SIZEOF_SIZE_T ;
33- }
34- p = _p ;
35- if (p == end )
36- break ;
37- }
38- if (* p ++ & 0x80 )
42+ /* On 64-bit platforms with 128-bit vectors (x86-64, arm64) the
43+ compiler can load 4 size_t values into two 16-byte vectors and do a
44+ vector bitwise OR. */
45+ const size_t * _p = (const size_t * )p ;
46+ while (_p < unrolled_end ) {
47+ size_t value = _p [0 ] | _p [1 ] | _p [2 ] | _p [3 ];
48+ if (value & UCS1_ASCII_CHAR_MASK ) {
3949 return 255 ;
50+ }
51+ _p += 4 ;
52+ }
53+ size_t value = 0 ;
54+ while (_p < aligned_end ) {
55+ value |= * _p ;
56+ }
57+ p = (const unsigned char * )_p ;
58+ while (p < _end ) {
59+ value |= * p ;
60+ }
61+ if (value & UCS1_ASCII_CHAR_MASK ) {
62+ return 255 ;
4063 }
4164 return 127 ;
4265}
0 commit comments