Skip to content

Commit 8f01cec

Browse files
committed
Move simd_input and associated functions to their own header
1 parent 2ca574d commit 8f01cec

8 files changed

Lines changed: 226 additions & 160 deletions

include/simdjson/simd_input.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
#ifndef SIMDJSON_SIMD_INPUT_H
#define SIMDJSON_SIMD_INPUT_H

#include "simdjson/common_defs.h"
#include "simdjson/portability.h"
#include "simdjson/simdjson.h"
#include <cassert>

namespace simdjson {

// 64 bytes of input, held in whatever register layout the target
// architecture prefers; each ISA provides its own specialization
// (see simd_input_arm64.h / simd_input_haswell.h / simd_input_westmere.h).
template <Architecture> struct simd_input;

// a straightforward comparison of a mask against input: one result bit per
// input byte, set when that byte equals m.
template <Architecture T>
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);

// Load 64 bytes starting at ptr into a simd_input.
template <Architecture T> simd_input<T> fill_input(const uint8_t *ptr);

// find all values less than or equal than the content of maxval (using unsigned
// arithmetic)
template <Architecture T>
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);

} // namespace simdjson

#endif
include/simdjson/simd_input_arm64.h

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
#ifndef SIMDJSON_SIMD_INPUT_ARM64_H
#define SIMDJSON_SIMD_INPUT_ARM64_H

#include "simdjson/simd_input.h"

#ifdef IS_ARM64
namespace simdjson {

// 64 bytes of input held in four 16-byte NEON registers.
template <>
struct simd_input<Architecture::ARM64> {
  uint8x16_t i0;
  uint8x16_t i1;
  uint8x16_t i2;
  uint8x16_t i3;
};

// Load 64 bytes from ptr into four NEON registers.
template <>
really_inline simd_input<Architecture::ARM64>
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
  struct simd_input<Architecture::ARM64> result;
  result.i0 = vld1q_u8(ptr);
  result.i1 = vld1q_u8(ptr + 16);
  result.i2 = vld1q_u8(ptr + 32);
  result.i3 = vld1q_u8(ptr + 48);
  return result;
}

// NEON has no movemask instruction; emulate it by selecting one bit per
// lane (bit i for lane i within each 8-lane half) and pairwise-summing the
// lanes down to a single 16-bit value.
really_inline uint16_t neon_movemask(uint8x16_t input) {
  const uint8x16_t bit_mask = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t selected = vandq_u8(input, bit_mask);
  uint8x16_t folded = vpaddq_u8(selected, selected);
  folded = vpaddq_u8(folded, folded);
  folded = vpaddq_u8(folded, folded);
  return vgetq_lane_u16(vreinterpretq_u16_u8(folded), 0);
}

// Movemask four registers at once into one 64-bit result (p0 supplies the
// low 16 bits, p3 the high 16); the pairwise-add tree keeps the lanes in
// order so no final shuffle is needed.
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
                                          uint8x16_t p2, uint8x16_t p3) {
  const uint8x16_t bit_mask = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t masked0 = vandq_u8(p0, bit_mask);
  uint8x16_t masked1 = vandq_u8(p1, bit_mask);
  uint8x16_t masked2 = vandq_u8(p2, bit_mask);
  uint8x16_t masked3 = vandq_u8(p3, bit_mask);
  uint8x16_t pair01 = vpaddq_u8(masked0, masked1);
  uint8x16_t pair23 = vpaddq_u8(masked2, masked3);
  pair01 = vpaddq_u8(pair01, pair23);
  pair01 = vpaddq_u8(pair01, pair01);
  return vgetq_lane_u64(vreinterpretq_u64_u8(pair01), 0);
}

// One result bit per input byte, set when the byte equals m.
template <>
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
    simd_input<Architecture::ARM64> in, uint8_t m) {
  const uint8x16_t needle = vmovq_n_u8(m);
  uint8x16_t eq0 = vceqq_u8(in.i0, needle);
  uint8x16_t eq1 = vceqq_u8(in.i1, needle);
  uint8x16_t eq2 = vceqq_u8(in.i2, needle);
  uint8x16_t eq3 = vceqq_u8(in.i3, needle);
  return neon_movemask_bulk(eq0, eq1, eq2, eq3);
}

// One result bit per input byte, set when the byte is <= m (unsigned).
template <>
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
    simd_input<Architecture::ARM64> in, uint8_t m) {
  const uint8x16_t bound = vmovq_n_u8(m);
  uint8x16_t le0 = vcleq_u8(in.i0, bound);
  uint8x16_t le1 = vcleq_u8(in.i1, bound);
  uint8x16_t le2 = vcleq_u8(in.i2, bound);
  uint8x16_t le3 = vcleq_u8(in.i3, bound);
  return neon_movemask_bulk(le0, le1, le2, le3);
}

} // namespace simdjson
#endif // IS_ARM64
#endif // SIMDJSON_SIMD_INPUT_ARM64_H
include/simdjson/simd_input_haswell.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
#ifndef SIMDJSON_SIMD_INPUT_HASWELL_H
#define SIMDJSON_SIMD_INPUT_HASWELL_H

#include "simdjson/simd_input.h"

#ifdef IS_X86_64

TARGET_HASWELL
namespace simdjson {

// 64 bytes of input held in two 32-byte AVX2 registers.
template <>
struct simd_input<Architecture::HASWELL> {
  __m256i lo;
  __m256i hi;
};

// Load 64 bytes from ptr into two AVX2 registers (unaligned loads).
template <>
really_inline simd_input<Architecture::HASWELL>
fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
  struct simd_input<Architecture::HASWELL> in;
  in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
  in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
  return in;
}

// One result bit per input byte, set when the byte equals m.
template <>
really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
    simd_input<Architecture::HASWELL> in, uint8_t m) {
  const __m256i mask = _mm256_set1_epi8(m);
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
  // _mm256_movemask_epi8 returns int: cast through uint32_t so widening to
  // uint64_t zero-extends instead of sign-extending when bit 31 is set.
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
  __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
  // Same cast here for uniformity; without it the result was only correct
  // because the << 32 happened to discard the sign-extended high bits.
  uint64_t res_1 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_1));
  return res_0 | (res_1 << 32);
}

// One result bit per input byte, set when the byte is <= m (unsigned):
// max(m, x) == m exactly when x <= m.
template <>
really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
    simd_input<Architecture::HASWELL> in, uint8_t m) {
  const __m256i maxval = _mm256_set1_epi8(m);
  __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
  uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
  __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
  // Cast for the same sign-extension reason as in cmp_mask_against_input.
  uint64_t res_1 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_1));
  return res_0 | (res_1 << 32);
}

} // namespace simdjson
UNTARGET_REGION

#endif // IS_X86_64
#endif // SIMDJSON_SIMD_INPUT_HASWELL_H
include/simdjson/simd_input_westmere.h

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
#ifndef SIMDJSON_SIMD_INPUT_WESTMERE_H
#define SIMDJSON_SIMD_INPUT_WESTMERE_H

#include "simdjson/simd_input.h"

#ifdef IS_X86_64

TARGET_WESTMERE
namespace simdjson {

// 64 bytes of input held in four 16-byte SSE registers.
template <>
struct simd_input<Architecture::WESTMERE> {
  __m128i v0;
  __m128i v1;
  __m128i v2;
  __m128i v3;
};

// Load 64 bytes from ptr into four SSE registers (unaligned loads).
template <>
really_inline simd_input<Architecture::WESTMERE>
fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
  struct simd_input<Architecture::WESTMERE> result;
  result.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
  result.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
  result.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
  result.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
  return result;
}

// One result bit per input byte, set when the byte equals m.
// Each 128-bit movemask yields 16 bits, shifted into its slot of the
// 64-bit result (v0 = low 16 bits, v3 = high 16 bits).
template <>
really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
    simd_input<Architecture::WESTMERE> in, uint8_t m) {
  const __m128i needle = _mm_set1_epi8(m);
  uint64_t bits0 = _mm_movemask_epi8(_mm_cmpeq_epi8(in.v0, needle));
  uint64_t bits1 = _mm_movemask_epi8(_mm_cmpeq_epi8(in.v1, needle));
  uint64_t bits2 = _mm_movemask_epi8(_mm_cmpeq_epi8(in.v2, needle));
  uint64_t bits3 = _mm_movemask_epi8(_mm_cmpeq_epi8(in.v3, needle));
  return bits0 | (bits1 << 16) | (bits2 << 32) | (bits3 << 48);
}

// One result bit per input byte, set when the byte is <= m (unsigned):
// max(m, x) == m exactly when x <= m.
template <>
really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
    simd_input<Architecture::WESTMERE> in, uint8_t m) {
  const __m128i bound = _mm_set1_epi8(m);
  uint64_t bits0 =
      _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_max_epu8(bound, in.v0), bound));
  uint64_t bits1 =
      _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_max_epu8(bound, in.v1), bound));
  uint64_t bits2 =
      _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_max_epu8(bound, in.v2), bound));
  uint64_t bits3 =
      _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_max_epu8(bound, in.v3), bound));
  return bits0 | (bits1 << 16) | (bits2 << 32) | (bits3 << 48);
}

} // namespace simdjson
UNTARGET_REGION

#endif // IS_X86_64
#endif // SIMDJSON_SIMD_INPUT_WESTMERE_H

include/simdjson/stage1_find_marks.h

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
#include "simdjson/parsedjson.h"
66
#include "simdjson/portability.h"
77
#include "simdjson/simdjson.h"
8+
#include "simdjson/simd_input.h"
89
#include <cassert>
910

1011
namespace simdjson {
1112

12-
template <Architecture> struct simd_input;
13-
1413
template <Architecture> uint64_t compute_quote_mask(uint64_t quote_bits);
1514

1615
namespace {
@@ -36,17 +35,6 @@ void check_utf8(simd_input<T> in, utf8_checking_state<T> &state);
3635
template <Architecture T>
3736
ErrorValues check_utf8_errors(utf8_checking_state<T> &state);
3837

39-
// a straightforward comparison of a mask against input.
40-
template <Architecture T>
41-
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
42-
43-
template <Architecture T> simd_input<T> fill_input(const uint8_t *ptr);
44-
45-
// find all values less than or equal than the content of maxval (using unsigned
46-
// arithmetic)
47-
template <Architecture T>
48-
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
49-
5038
template <Architecture T>
5139
really_inline uint64_t find_odd_backslash_sequences(
5240
simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash);

include/simdjson/stage1_find_marks_arm64.h

Lines changed: 3 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,12 @@
11
#ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
22
#define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
33

4+
#include "simdjson/simd_input_arm64.h"
45
#include "simdjson/simdutf8check_arm64.h"
56
#include "simdjson/stage1_find_marks.h"
67

78
#ifdef IS_ARM64
89
namespace simdjson {
9-
template <> struct simd_input<Architecture::ARM64> {
10-
uint8x16_t i0;
11-
uint8x16_t i1;
12-
uint8x16_t i2;
13-
uint8x16_t i3;
14-
};
15-
16-
template <>
17-
really_inline simd_input<Architecture::ARM64>
18-
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
19-
struct simd_input<Architecture::ARM64> in;
20-
in.i0 = vld1q_u8(ptr + 0);
21-
in.i1 = vld1q_u8(ptr + 16);
22-
in.i2 = vld1q_u8(ptr + 32);
23-
in.i3 = vld1q_u8(ptr + 48);
24-
return in;
25-
}
26-
27-
really_inline uint16_t neon_movemask(uint8x16_t input) {
28-
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
29-
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
30-
uint8x16_t minput = vandq_u8(input, bit_mask);
31-
uint8x16_t tmp = vpaddq_u8(minput, minput);
32-
tmp = vpaddq_u8(tmp, tmp);
33-
tmp = vpaddq_u8(tmp, tmp);
34-
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
35-
}
36-
37-
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
38-
uint8x16_t p2, uint8x16_t p3) {
39-
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
40-
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
41-
uint8x16_t t0 = vandq_u8(p0, bit_mask);
42-
uint8x16_t t1 = vandq_u8(p1, bit_mask);
43-
uint8x16_t t2 = vandq_u8(p2, bit_mask);
44-
uint8x16_t t3 = vandq_u8(p3, bit_mask);
45-
uint8x16_t sum0 = vpaddq_u8(t0, t1);
46-
uint8x16_t sum1 = vpaddq_u8(t2, t3);
47-
sum0 = vpaddq_u8(sum0, sum1);
48-
sum0 = vpaddq_u8(sum0, sum0);
49-
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
50-
}
5110

5211
template <>
5312
really_inline uint64_t
@@ -59,7 +18,8 @@ compute_quote_mask<Architecture::ARM64>(uint64_t quote_bits) {
5918
#endif
6019
}
6120

62-
template <> struct utf8_checking_state<Architecture::ARM64> {
21+
template <>
22+
struct utf8_checking_state<Architecture::ARM64> {
6323
int8x16_t has_error{};
6424
processed_utf_bytes previous{};
6525
};
@@ -115,28 +75,6 @@ really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
11575
: simdjson::SUCCESS;
11676
}
11777

118-
template <>
119-
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
120-
simd_input<Architecture::ARM64> in, uint8_t m) {
121-
const uint8x16_t mask = vmovq_n_u8(m);
122-
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
123-
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
124-
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
125-
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
126-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
127-
}
128-
129-
template <>
130-
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
131-
simd_input<Architecture::ARM64> in, uint8_t m) {
132-
const uint8x16_t mask = vmovq_n_u8(m);
133-
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
134-
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
135-
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
136-
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
137-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
138-
}
139-
14078
template <>
14179
really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
14280
simd_input<Architecture::ARM64> in, uint64_t &whitespace,

0 commit comments

Comments
 (0)