forked from simdjson/simdjson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbitmask.h
More file actions
43 lines (37 loc) · 1.26 KB
/
bitmask.h
File metadata and controls
43 lines (37 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#ifndef SIMDJSON_ARM64_BITMASK_H
#define SIMDJSON_ARM64_BITMASK_H
#include "simdjson.h"
#include "arm64/intrinsics.h"
namespace simdjson {
namespace arm64 {
//
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
//
// For example, prefix_xor(00100100) == 00011100
//
really_inline uint64_t prefix_xor(uint64_t bitmask) {
/////////////
// We could do this with PMULL, but it is apparently slow.
//
//#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
//return vmull_p64(-1ULL, bitmask);
//#else
// Analysis by @sebpop:
// When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out
// in between other vector code, so effectively the extra cycles of the sequence do not matter
// because the GPR units are idle otherwise and the critical path is on the FP side.
// Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 )
// and FP->GPR (2 cycles on N1 and 5 cycles on A72.)
///////////
bitmask ^= bitmask << 1;
bitmask ^= bitmask << 2;
bitmask ^= bitmask << 4;
bitmask ^= bitmask << 8;
bitmask ^= bitmask << 16;
bitmask ^= bitmask << 32;
return bitmask;
}
} // namespace arm64
} // namespace simdjson
UNTARGET_REGION
#endif