@@ -89,65 +89,63 @@ really_inline void find_whitespace_and_structurals(simd_input<ARCHITECTURE> in,
8989// base_ptr[base] incrementing base as we go
9090// will potentially store extra values beyond end of valid bits, so base_ptr
9191// needs to be large enough to handle this
// flatten_bits, shown as a diff: lines tagged `-` are the previous body,
// lines tagged `+` are its replacement, untagged lines are common to both.
// For every set bit in `bits` one value, (idx - 64) + trailing_zeroes(bits),
// is stored to the output, and _blsr_u64 then clears the lowest set bit.
// (trailing_zeroes is defined elsewhere; presumably it yields the position of
// the lowest set bit -- confirm against its definition.)
// Old version: output starts at base_ptr[base] and `base` is advanced by cnt.
// New version: base_ptr is a reference to the caller's pointer and is itself
// advanced by cnt; the unrolled stores use fixed offsets 0..15.
// Both versions always store 8 slots (16 when cnt > 8), so up to 7 slots after
// the last valid entry receive meaningless values.
92- really_inline void flatten_bits (uint32_t *base_ptr, uint32_t &base , uint32_t idx, uint64_t bits) {
92+ really_inline void flatten_bits (uint32_t *&base_ptr , uint32_t idx, uint64_t bits) {
9393 // In some instances, the next branch is expensive because it is mispredicted.
9494 // Unfortunately, in other cases,
9595 // it helps tremendously.
9696 if (bits == 0 )
9797 return ;
9898 uint32_t cnt = _mm_popcnt_u64 (bits);
99- uint32_t next_base = base + cnt;
10099 idx -= 64 ;
101- base_ptr += base;
102100 {
103- base_ptr[0 ] = idx + trailing_zeroes (bits);
104- bits = _blsr_u64 (bits);
105- base_ptr[1 ] = idx + trailing_zeroes (bits);
106- bits = _blsr_u64 (bits);
107- base_ptr[2 ] = idx + trailing_zeroes (bits);
108- bits = _blsr_u64 (bits);
109- base_ptr[3 ] = idx + trailing_zeroes (bits);
110- bits = _blsr_u64 (bits);
111- base_ptr[4 ] = idx + trailing_zeroes (bits);
112- bits = _blsr_u64 (bits);
113- base_ptr[5 ] = idx + trailing_zeroes (bits);
114- bits = _blsr_u64 (bits);
115- base_ptr[6 ] = idx + trailing_zeroes (bits);
116- bits = _blsr_u64 (bits);
117- base_ptr[7 ] = idx + trailing_zeroes (bits);
118- bits = _blsr_u64 (bits);
119- base_ptr += 8 ;
101+ base_ptr[0 ] = idx + trailing_zeroes (bits);
102+ bits = _blsr_u64 (bits);
103+ base_ptr[1 ] = idx + trailing_zeroes (bits);
104+ bits = _blsr_u64 (bits);
105+ base_ptr[2 ] = idx + trailing_zeroes (bits);
106+ bits = _blsr_u64 (bits);
107+ base_ptr[3 ] = idx + trailing_zeroes (bits);
108+ bits = _blsr_u64 (bits);
109+ base_ptr[4 ] = idx + trailing_zeroes (bits);
110+ bits = _blsr_u64 (bits);
111+ base_ptr[5 ] = idx + trailing_zeroes (bits);
112+ bits = _blsr_u64 (bits);
113+ base_ptr[6 ] = idx + trailing_zeroes (bits);
114+ bits = _blsr_u64 (bits);
115+ base_ptr[7 ] = idx + trailing_zeroes (bits);
116+ bits = _blsr_u64 (bits);
120117 }
121118 // We hope that the next branch is easily predicted.
122119 if (cnt > 8 ) {
123- base_ptr[0 ] = idx + trailing_zeroes (bits);
124- bits = _blsr_u64 (bits);
125- base_ptr[1 ] = idx + trailing_zeroes (bits);
126- bits = _blsr_u64 (bits);
127- base_ptr[2 ] = idx + trailing_zeroes (bits);
128- bits = _blsr_u64 (bits);
129- base_ptr[3 ] = idx + trailing_zeroes (bits);
130- bits = _blsr_u64 (bits);
131- base_ptr[4 ] = idx + trailing_zeroes (bits);
132- bits = _blsr_u64 (bits);
133- base_ptr[5 ] = idx + trailing_zeroes (bits);
134- bits = _blsr_u64 (bits);
135- base_ptr[6 ] = idx + trailing_zeroes (bits);
136- bits = _blsr_u64 (bits);
137- base_ptr[7 ] = idx + trailing_zeroes (bits);
138- bits = _blsr_u64 (bits);
139- base_ptr += 8 ;
120+ base_ptr[8 ] = idx + trailing_zeroes (bits);
121+ bits = _blsr_u64 (bits);
122+ base_ptr[9 ] = idx + trailing_zeroes (bits);
123+ bits = _blsr_u64 (bits);
124+ base_ptr[10 ] = idx + trailing_zeroes (bits);
125+ bits = _blsr_u64 (bits);
126+ base_ptr[11 ] = idx + trailing_zeroes (bits);
127+ bits = _blsr_u64 (bits);
128+ base_ptr[12 ] = idx + trailing_zeroes (bits);
129+ bits = _blsr_u64 (bits);
130+ base_ptr[13 ] = idx + trailing_zeroes (bits);
131+ bits = _blsr_u64 (bits);
132+ base_ptr[14 ] = idx + trailing_zeroes (bits);
133+ bits = _blsr_u64 (bits);
134+ base_ptr[15 ] = idx + trailing_zeroes (bits);
135+ bits = _blsr_u64 (bits);
140136 }
141- if (cnt > 16 ) { // unlucky: we rarely get here
142- // since it means having one structural or pseudo-structural element
143- // every 4 characters (possible with inputs like "","","",...).
144- do {
145- base_ptr[0 ] = idx + trailing_zeroes (bits);
146- bits = _blsr_u64 (bits);
147- base_ptr++;
148- } while (bits != 0 );
137+ if (cnt > 16 ) {
138+ // unlucky: this loop will rarely ever trigger
139+ // since it means having one structural or pseudo-structural element
140+ // every 4 characters (possible with inputs like "","","",...).
141+ uint32_t i = 16 ;
142+ do {
143+ base_ptr[i] = idx + trailing_zeroes (bits);
144+ bits = _blsr_u64 (bits);
145+ i++;
146+ } while (i < cnt);
149147 }
150- base = next_base ;
148+ base_ptr += cnt ;
151149}
152150
153151#include " generic/stage1_find_marks.h"
0 commit comments