#ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
#define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H

// This file provides the same function as
// stage1_find_marks_flatten.h, but uses Intel intrinsics.
// This should provide better performance on Visual Studio
// and other compilers that do a conservative optimization.

#include "simdjson/common_defs.h"
#include "simdjson/portability.h"

12+ TARGET_HASWELL
13+ namespace simdjson {
14+ namespace haswell {
15+
16+ // flatten out values in 'bits' assuming that they are are to have values of idx
17+ // plus their position in the bitvector, and store these indexes at
18+ // base_ptr[base] incrementing base as we go
19+ // will potentially store extra values beyond end of valid bits, so base_ptr
20+ // needs to be large enough to handle this
21+ really_inline void flatten_bits (uint32_t *base_ptr, uint32_t &base,
22+ uint32_t idx, uint64_t bits) {
23+ // In some instances, the next branch is expensive because it is mispredicted.
24+ // Unfortunately, in other cases,
25+ // it helps tremendously.
26+ if (bits == 0 ) return ;
27+ uint32_t cnt = _popcnt64 (bits);
28+ uint32_t next_base = base + cnt;
29+ idx -= 64 ;
30+ base_ptr += base;
31+ {
32+ base_ptr[0 ] = idx + _mm_tzcnt_64 (bits);
33+ bits = _blsr_u64 (bits);
34+ base_ptr[1 ] = idx + _mm_tzcnt_64 (bits);
35+ bits = _blsr_u64 (bits);
36+ base_ptr[2 ] = idx + _mm_tzcnt_64 (bits);
37+ bits = _blsr_u64 (bits);
38+ base_ptr[3 ] = idx + _mm_tzcnt_64 (bits);
39+ bits = _blsr_u64 (bits);
40+ base_ptr[4 ] = idx + _mm_tzcnt_64 (bits);
41+ bits = _blsr_u64 (bits);
42+ base_ptr[5 ] = idx + _mm_tzcnt_64 (bits);
43+ bits = _blsr_u64 (bits);
44+ base_ptr[6 ] = idx + _mm_tzcnt_64 (bits);
45+ bits = _blsr_u64 (bits);
46+ base_ptr[7 ] = idx + _mm_tzcnt_64 (bits);
47+ bits = _blsr_u64 (bits);
48+ base_ptr += 8 ;
49+ }
50+ // We hope that the next branch is easily predicted.
51+ if (cnt > 8 ) {
52+ base_ptr[0 ] = idx + _mm_tzcnt_64 (bits);
53+ bits = _blsr_u64 (bits);
54+ base_ptr[1 ] = idx + _mm_tzcnt_64 (bits);
55+ bits = _blsr_u64 (bits);
56+ base_ptr[2 ] = idx + _mm_tzcnt_64 (bits);
57+ bits = _blsr_u64 (bits);
58+ base_ptr[3 ] = idx + _mm_tzcnt_64 (bits);
59+ bits = _blsr_u64 (bits);
60+ base_ptr[4 ] = idx + _mm_tzcnt_64 (bits);
61+ bits = _blsr_u64 (bits);
62+ base_ptr[5 ] = idx + _mm_tzcnt_64 (bits);
63+ bits = _blsr_u64 (bits);
64+ base_ptr[6 ] = idx + _mm_tzcnt_64 (bits);
65+ bits = _blsr_u64 (bits);
66+ base_ptr[7 ] = idx + _mm_tzcnt_64 (bits);
67+ bits = _blsr_u64 (bits);
68+ base_ptr += 8 ;
69+ }
70+ if (cnt > 16 ) { // unluckly: we rarely get here
71+ // since it means having one structural or pseudo-structral element
72+ // every 4 characters (possible with inputs like "","","",...).
73+ do {
74+ base_ptr[0 ] = idx + _mm_tzcnt_64 (bits);
75+ bits = _blsr_u64 (bits);
76+ base_ptr++;
77+ } while (bits != 0 );
78+ }
79+ base = next_base;
80+ }
81+ } // haswell
82+ } // simdjson
83+ UNTARGET_REGION


#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H