Skip to content

Commit aa15917

Browse files
committed
Use methods instead of functions for simd_input
1 parent 85fb37b commit aa15917

5 files changed

Lines changed: 103 additions & 130 deletions

File tree

include/simdjson/simd_input.h

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,14 @@
88

99
namespace simdjson {
1010

11-
template <Architecture> struct simd_input;
12-
13-
template <Architecture T>
14-
simd_input<T> fill_input(const uint8_t *ptr);
15-
16-
// a straightforward comparison of a mask against input.
17-
template <Architecture T>
18-
uint64_t cmp_mask_against_input(simd_input<T> in, uint8_t m);
19-
20-
// find all values less than or equal than the content of maxval (using unsigned
21-
// arithmetic)
22-
template <Architecture T>
23-
uint64_t unsigned_lteq_against_input(simd_input<T> in, uint8_t m);
11+
template <Architecture>
12+
struct simd_input {
13+
simd_input(const uint8_t *ptr);
14+
// a straightforward comparison of a mask against input.
15+
uint64_t eq(uint8_t m);
16+
// find all values less than or equal than the content of maxval (using unsigned arithmetic)
17+
uint64_t lteq(uint8_t m);
18+
}; // struct simd_input
2419

2520
} // namespace simdjson
2621

include/simdjson/simd_input_arm64.h

Lines changed: 33 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,9 @@
66
#ifdef IS_ARM64
77
namespace simdjson {
88

9-
template <>
10-
struct simd_input<Architecture::ARM64> {
11-
uint8x16_t i0;
12-
uint8x16_t i1;
13-
uint8x16_t i2;
14-
uint8x16_t i3;
15-
};
16-
17-
template <>
18-
really_inline simd_input<Architecture::ARM64>
19-
fill_input<Architecture::ARM64>(const uint8_t *ptr) {
20-
struct simd_input<Architecture::ARM64> in;
21-
in.i0 = vld1q_u8(ptr + 0);
22-
in.i1 = vld1q_u8(ptr + 16);
23-
in.i2 = vld1q_u8(ptr + 32);
24-
in.i3 = vld1q_u8(ptr + 48);
25-
return in;
26-
}
27-
289
really_inline uint16_t neon_movemask(uint8x16_t input) {
2910
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
30-
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
11+
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
3112
uint8x16_t minput = vandq_u8(input, bit_mask);
3213
uint8x16_t tmp = vpaddq_u8(minput, minput);
3314
tmp = vpaddq_u8(tmp, tmp);
@@ -38,7 +19,7 @@ really_inline uint16_t neon_movemask(uint8x16_t input) {
3819
really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
3920
uint8x16_t p2, uint8x16_t p3) {
4021
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
41-
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
22+
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
4223
uint8x16_t t0 = vandq_u8(p0, bit_mask);
4324
uint8x16_t t1 = vandq_u8(p1, bit_mask);
4425
uint8x16_t t2 = vandq_u8(p2, bit_mask);
@@ -51,26 +32,38 @@ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
5132
}
5233

5334
template <>
54-
really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
55-
simd_input<Architecture::ARM64> in, uint8_t m) {
56-
const uint8x16_t mask = vmovq_n_u8(m);
57-
uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
58-
uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
59-
uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
60-
uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
61-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
62-
}
35+
struct simd_input<Architecture::ARM64> {
36+
uint8x16_t i0;
37+
uint8x16_t i1;
38+
uint8x16_t i2;
39+
uint8x16_t i3;
6340

64-
template <>
65-
really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
66-
simd_input<Architecture::ARM64> in, uint8_t m) {
67-
const uint8x16_t mask = vmovq_n_u8(m);
68-
uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
69-
uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
70-
uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
71-
uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
72-
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
73-
}
41+
really_inline simd_input(const uint8_t *ptr) {
42+
this->i0 = vld1q_u8(ptr + 0);
43+
this->i1 = vld1q_u8(ptr + 16);
44+
this->i2 = vld1q_u8(ptr + 32);
45+
this->i3 = vld1q_u8(ptr + 48);
46+
}
47+
48+
really_inline uint64_t eq(uint8_t m) {
49+
const uint8x16_t mask = vmovq_n_u8(m);
50+
uint8x16_t cmp_res_0 = vceqq_u8(this->i0, mask);
51+
uint8x16_t cmp_res_1 = vceqq_u8(this->i1, mask);
52+
uint8x16_t cmp_res_2 = vceqq_u8(this->i2, mask);
53+
uint8x16_t cmp_res_3 = vceqq_u8(this->i3, mask);
54+
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
55+
}
56+
57+
really_inline uint64_t lteq(uint8_t m) {
58+
const uint8x16_t mask = vmovq_n_u8(m);
59+
uint8x16_t cmp_res_0 = vcleq_u8(this->i0, mask);
60+
uint8x16_t cmp_res_1 = vcleq_u8(this->i1, mask);
61+
uint8x16_t cmp_res_2 = vcleq_u8(this->i2, mask);
62+
uint8x16_t cmp_res_3 = vcleq_u8(this->i3, mask);
63+
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
64+
}
65+
66+
}; // struct simd_input
7467

7568
} // namespace simdjson
7669

include/simdjson/simd_input_haswell.h

Lines changed: 24 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,31 @@ template <>
1212
struct simd_input<Architecture::HASWELL> {
1313
__m256i lo;
1414
__m256i hi;
15-
};
1615

17-
template <>
18-
really_inline simd_input<Architecture::HASWELL>
19-
fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
20-
struct simd_input<Architecture::HASWELL> in;
21-
in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
22-
in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
23-
return in;
24-
}
25-
26-
template <>
27-
really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
28-
simd_input<Architecture::HASWELL> in, uint8_t m) {
29-
const __m256i mask = _mm256_set1_epi8(m);
30-
__m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
31-
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
32-
__m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
33-
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
34-
return res_0 | (res_1 << 32);
35-
}
36-
37-
template <>
38-
really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
39-
simd_input<Architecture::HASWELL> in, uint8_t m) {
40-
const __m256i maxval = _mm256_set1_epi8(m);
41-
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
42-
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
43-
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
44-
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
45-
return res_0 | (res_1 << 32);
46-
}
16+
really_inline simd_input(const uint8_t *ptr) {
17+
this->lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
18+
this->hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
19+
}
20+
21+
really_inline uint64_t eq(uint8_t m) {
22+
const __m256i mask = _mm256_set1_epi8(m);
23+
__m256i cmp_res_0 = _mm256_cmpeq_epi8(this->lo, mask);
24+
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
25+
__m256i cmp_res_1 = _mm256_cmpeq_epi8(this->hi, mask);
26+
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
27+
return res_0 | (res_1 << 32);
28+
}
29+
30+
really_inline uint64_t lteq(uint8_t m) {
31+
const __m256i maxval = _mm256_set1_epi8(m);
32+
__m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->lo), maxval);
33+
uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
34+
__m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, this->hi), maxval);
35+
uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
36+
return res_0 | (res_1 << 32);
37+
}
38+
39+
}; // struct simd_input
4740

4841
} // namespace simdjson
4942
UNTARGET_REGION

include/simdjson/simd_input_westmere.h

Lines changed: 34 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -14,48 +14,41 @@ struct simd_input<Architecture::WESTMERE> {
1414
__m128i v1;
1515
__m128i v2;
1616
__m128i v3;
17-
};
1817

19-
template <>
20-
really_inline simd_input<Architecture::WESTMERE>
21-
fill_input<Architecture::WESTMERE>(const uint8_t *ptr) {
22-
struct simd_input<Architecture::WESTMERE> in;
23-
in.v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
24-
in.v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
25-
in.v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
26-
in.v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
27-
return in;
28-
}
29-
30-
template <>
31-
really_inline uint64_t cmp_mask_against_input<Architecture::WESTMERE>(
32-
simd_input<Architecture::WESTMERE> in, uint8_t m) {
33-
const __m128i mask = _mm_set1_epi8(m);
34-
__m128i cmp_res_0 = _mm_cmpeq_epi8(in.v0, mask);
35-
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
36-
__m128i cmp_res_1 = _mm_cmpeq_epi8(in.v1, mask);
37-
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
38-
__m128i cmp_res_2 = _mm_cmpeq_epi8(in.v2, mask);
39-
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
40-
__m128i cmp_res_3 = _mm_cmpeq_epi8(in.v3, mask);
41-
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
42-
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
43-
}
44-
45-
template <>
46-
really_inline uint64_t unsigned_lteq_against_input<Architecture::WESTMERE>(
47-
simd_input<Architecture::WESTMERE> in, uint8_t m) {
48-
const __m128i maxval = _mm_set1_epi8(m);
49-
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v0), maxval);
50-
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
51-
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v1), maxval);
52-
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
53-
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v2), maxval);
54-
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
55-
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, in.v3), maxval);
56-
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
57-
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
58-
}
18+
really_inline simd_input(const uint8_t *ptr) {
19+
this->v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
20+
this->v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
21+
this->v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
22+
this->v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
23+
}
24+
25+
really_inline uint64_t eq(uint8_t m) {
26+
const __m128i mask = _mm_set1_epi8(m);
27+
__m128i cmp_res_0 = _mm_cmpeq_epi8(this->v0, mask);
28+
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
29+
__m128i cmp_res_1 = _mm_cmpeq_epi8(this->v1, mask);
30+
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
31+
__m128i cmp_res_2 = _mm_cmpeq_epi8(this->v2, mask);
32+
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
33+
__m128i cmp_res_3 = _mm_cmpeq_epi8(this->v3, mask);
34+
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
35+
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
36+
}
37+
38+
really_inline uint64_t lteq(uint8_t m) {
39+
const __m128i maxval = _mm_set1_epi8(m);
40+
__m128i cmp_res_0 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v0), maxval);
41+
uint64_t res_0 = _mm_movemask_epi8(cmp_res_0);
42+
__m128i cmp_res_1 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v1), maxval);
43+
uint64_t res_1 = _mm_movemask_epi8(cmp_res_1);
44+
__m128i cmp_res_2 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v2), maxval);
45+
uint64_t res_2 = _mm_movemask_epi8(cmp_res_2);
46+
__m128i cmp_res_3 = _mm_cmpeq_epi8(_mm_max_epu8(maxval, this->v3), maxval);
47+
uint64_t res_3 = _mm_movemask_epi8(cmp_res_3);
48+
return res_0 | (res_1 << 16) | (res_2 << 32) | (res_3 << 48);
49+
}
50+
51+
}; // struct simd_input
5952

6053
} // namespace simdjson
6154
UNTARGET_REGION

include/simdjson/stage1_find_marks_common.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ really_inline uint64_t find_odd_backslash_sequences<TARGETED_ARCHITECTURE>(
2424
uint64_t &prev_iter_ends_odd_backslash) {
2525
const uint64_t even_bits = 0x5555555555555555ULL;
2626
const uint64_t odd_bits = ~even_bits;
27-
uint64_t bs_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '\\');
27+
uint64_t bs_bits = in.eq('\\');
2828
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
2929
/* flip lowest if we have an odd-length run at the end of the prior
3030
* iteration */
@@ -71,7 +71,7 @@ really_inline uint64_t find_quote_mask_and_bits<TARGETED_ARCHITECTURE>(
7171
simd_input<TARGETED_ARCHITECTURE> in, uint64_t odd_ends,
7272
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
7373
uint64_t &error_mask) {
74-
quote_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '"');
74+
quote_bits = in.eq('"');
7575
quote_bits = quote_bits & ~odd_ends;
7676
uint64_t quote_mask = compute_quote_mask<TARGETED_ARCHITECTURE>(quote_bits);
7777
quote_mask ^= prev_iter_inside_quote;
@@ -80,8 +80,7 @@ really_inline uint64_t find_quote_mask_and_bits<TARGETED_ARCHITECTURE>(
8080
* quotation mark, reverse solidus, and the control characters (U+0000
8181
* through U+001F).
8282
* https://tools.ietf.org/html/rfc8259 */
83-
uint64_t unescaped =
84-
unsigned_lteq_against_input<TARGETED_ARCHITECTURE>(in, 0x1F);
83+
uint64_t unescaped = in.lteq(0x1F);
8584
error_mask |= quote_mask & unescaped;
8685
/* right shift of a signed value expected to be well-defined and standard
8786
* compliant as of C++20,
@@ -98,7 +97,7 @@ really_inline void find_structural_bits_64(
9897
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
9998
uint64_t &error_mask,
10099
utf8_checking_state<TARGETED_ARCHITECTURE> &utf8_state) {
101-
simd_input<TARGETED_ARCHITECTURE> in = fill_input<TARGETED_ARCHITECTURE>(buf);
100+
simd_input<TARGETED_ARCHITECTURE> in(buf);
102101
check_utf8<TARGETED_ARCHITECTURE>(in, utf8_state);
103102
/* detect odd sequences of backslashes */
104103
uint64_t odd_ends = find_odd_backslash_sequences<TARGETED_ARCHITECTURE>(

0 commit comments

Comments
 (0)