 #ifdef IS_ARM64
 namespace simdjson {

-template <>
-struct simd_input<Architecture::ARM64> {
-  uint8x16_t i0;
-  uint8x16_t i1;
-  uint8x16_t i2;
-  uint8x16_t i3;
-};
-
-template <>
-really_inline simd_input<Architecture::ARM64>
-fill_input<Architecture::ARM64>(const uint8_t *ptr) {
-  struct simd_input<Architecture::ARM64> in;
-  in.i0 = vld1q_u8(ptr + 0);
-  in.i1 = vld1q_u8(ptr + 16);
-  in.i2 = vld1q_u8(ptr + 32);
-  in.i3 = vld1q_u8(ptr + 48);
-  return in;
-}
-
 really_inline uint16_t neon_movemask(uint8x16_t input) {
   const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
   uint8x16_t minput = vandq_u8(input, bit_mask);
   uint8x16_t tmp = vpaddq_u8(minput, minput);
   tmp = vpaddq_u8(tmp, tmp);
@@ -38,7 +19,7 @@ really_inline uint16_t neon_movemask(uint8x16_t input) {
 really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
                                           uint8x16_t p2, uint8x16_t p3) {
   const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
   uint8x16_t t0 = vandq_u8(p0, bit_mask);
   uint8x16_t t1 = vandq_u8(p1, bit_mask);
   uint8x16_t t2 = vandq_u8(p2, bit_mask);
@@ -51,26 +32,38 @@ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
 }

 template <>
-really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
-    simd_input<Architecture::ARM64> in, uint8_t m) {
-  const uint8x16_t mask = vmovq_n_u8(m);
-  uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
-  uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
-  uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
-  uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
-  return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
-}
+struct simd_input<Architecture::ARM64> {
+  uint8x16_t i0;
+  uint8x16_t i1;
+  uint8x16_t i2;
+  uint8x16_t i3;

-template <>
-really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
-    simd_input<Architecture::ARM64> in, uint8_t m) {
-  const uint8x16_t mask = vmovq_n_u8(m);
-  uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
-  uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
-  uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
-  uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
-  return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
-}
+  really_inline simd_input(const uint8_t *ptr) {
+    this->i0 = vld1q_u8(ptr + 0);
+    this->i1 = vld1q_u8(ptr + 16);
+    this->i2 = vld1q_u8(ptr + 32);
+    this->i3 = vld1q_u8(ptr + 48);
+  }
+
+  really_inline uint64_t eq(uint8_t m) {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
+    uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
+    uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
+    uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
+    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+  }
+
+  really_inline uint64_t lteq(uint8_t m) {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
+    uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
+    uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
+    uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
+    return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+  }
+
+}; // struct simd_input

 } // namespace simdjson

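As a rough illustration of what this diff changes for callers (not part of the commit itself): the old free functions fill_input, cmp_mask_against_input, and unsigned_lteq_against_input become the constructor, eq(), and lteq() members of simd_input<Architecture::ARM64>. A minimal sketch, assuming the definitions above are in scope, that buf points to at least 64 readable bytes, and that the helper name classify_chunk and its output names are purely illustrative:

    // Hypothetical caller classifying one 64-byte chunk of input.
    really_inline void classify_chunk(const uint8_t *buf,
                                      uint64_t &quote_mask, uint64_t &ws_mask) {
      // Old style:
      //   auto in = fill_input<Architecture::ARM64>(buf);
      //   quote_mask = cmp_mask_against_input<Architecture::ARM64>(in, '"');
      //   ws_mask    = unsigned_lteq_against_input<Architecture::ARM64>(in, ' ');
      // New style: the constructor performs the four vld1q_u8 loads.
      simd_input<Architecture::ARM64> in(buf);
      quote_mask = in.eq('"');  // bit i set if buf[i] == '"'
      ws_mask = in.lteq(' ');   // bit i set if buf[i] <= 0x20
    }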