Skip to content

Commit a53d950

Browse files
authored
Intrinsic-based flatten (simdjson#234)
* Providing a flatten function with intrinsics (for Visual Studio).
1 parent f76ee5e commit a53d950

File tree

7 files changed

+97
-10
lines changed

7 files changed

+97
-10
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck pointercheck
6262
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
6363
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
6464

65-
HEADERS= include/simdjson/simdutf8check_haswell.h include/simdjson/simdutf8check_westmere.h include/simdjson/simdutf8check_arm64.h include/simdjson/stringparsing.h include/simdjson/stringparsing_arm64.h include/simdjson/stringparsing_haswell.h include/simdjson/stringparsing_macros.h include/simdjson/stringparsing_westmere.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage1_find_marks_arm64.h include/simdjson/stage1_find_marks_haswell.h include/simdjson/stage1_find_marks_westmere.h include/simdjson/stage1_find_marks_macros.h include/simdjson/stage2_build_tape.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h
65+
HEADERS= include/simdjson/simdutf8check_haswell.h include/simdjson/simdutf8check_westmere.h include/simdjson/simdutf8check_arm64.h include/simdjson/stringparsing.h include/simdjson/stringparsing_arm64.h include/simdjson/stringparsing_haswell.h include/simdjson/stringparsing_macros.h include/simdjson/stringparsing_westmere.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage1_find_marks_arm64.h include/simdjson/stage1_find_marks_haswell.h include/simdjson/stage1_find_marks_westmere.h include/simdjson/stage1_find_marks_macros.h include/simdjson/stage2_build_tape.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h include/simdjson/stage1_find_marks_flatten.h include/simdjson/stage1_find_marks_flatten_haswell.h
6666
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp
6767
MINIFIERHEADERS=include/simdjson/jsonminifier.h include/simdjson/simdprune_tables.h
6868
MINIFIERLIBFILES=src/jsonminifier.cpp

include/simdjson/portability.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ static inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *re
9999

100100
/* result might be undefined when input_num is zero */
101101
static inline int trailingzeroes(uint64_t input_num) {
102-
#ifdef __BMI2__
102+
#ifdef __BMI__// tzcnt is BMI1
103103
return _tzcnt_u64(input_num);
104104
#else
105105
return __builtin_ctzll(input_num);
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
2+
#define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
3+
4+
// This file provides the same function as
5+
// stage1_find_marks_flatten.h, but uses Intel intrinsics.
6+
// This should provide better performance on Visual Studio
7+
// and other compilers that do a conservative optimization.
8+
9+
#include "simdjson/common_defs.h"
10+
#include "simdjson/portability.h"
11+
12+
TARGET_HASWELL
13+
namespace simdjson {
14+
namespace haswell {
15+
16+
// flatten out values in 'bits' assuming that they are are to have values of idx
17+
// plus their position in the bitvector, and store these indexes at
18+
// base_ptr[base] incrementing base as we go
19+
// will potentially store extra values beyond end of valid bits, so base_ptr
20+
// needs to be large enough to handle this
21+
really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
22+
uint32_t idx, uint64_t bits) {
23+
// In some instances, the next branch is expensive because it is mispredicted.
24+
// Unfortunately, in other cases,
25+
// it helps tremendously.
26+
if(bits == 0) return;
27+
uint32_t cnt = _popcnt64(bits);
28+
uint32_t next_base = base + cnt;
29+
idx -= 64;
30+
base_ptr += base;
31+
{
32+
base_ptr[0] = idx + _mm_tzcnt_64(bits);
33+
bits = _blsr_u64(bits);
34+
base_ptr[1] = idx + _mm_tzcnt_64(bits);
35+
bits = _blsr_u64(bits);
36+
base_ptr[2] = idx + _mm_tzcnt_64(bits);
37+
bits = _blsr_u64(bits);
38+
base_ptr[3] = idx + _mm_tzcnt_64(bits);
39+
bits = _blsr_u64(bits);
40+
base_ptr[4] = idx + _mm_tzcnt_64(bits);
41+
bits = _blsr_u64(bits);
42+
base_ptr[5] = idx + _mm_tzcnt_64(bits);
43+
bits = _blsr_u64(bits);
44+
base_ptr[6] = idx + _mm_tzcnt_64(bits);
45+
bits = _blsr_u64(bits);
46+
base_ptr[7] = idx + _mm_tzcnt_64(bits);
47+
bits = _blsr_u64(bits);
48+
base_ptr += 8;
49+
}
50+
// We hope that the next branch is easily predicted.
51+
if (cnt > 8) {
52+
base_ptr[0] = idx + _mm_tzcnt_64(bits);
53+
bits = _blsr_u64(bits);
54+
base_ptr[1] = idx + _mm_tzcnt_64(bits);
55+
bits = _blsr_u64(bits);
56+
base_ptr[2] = idx + _mm_tzcnt_64(bits);
57+
bits = _blsr_u64(bits);
58+
base_ptr[3] = idx + _mm_tzcnt_64(bits);
59+
bits = _blsr_u64(bits);
60+
base_ptr[4] = idx + _mm_tzcnt_64(bits);
61+
bits = _blsr_u64(bits);
62+
base_ptr[5] = idx + _mm_tzcnt_64(bits);
63+
bits = _blsr_u64(bits);
64+
base_ptr[6] = idx + _mm_tzcnt_64(bits);
65+
bits = _blsr_u64(bits);
66+
base_ptr[7] = idx + _mm_tzcnt_64(bits);
67+
bits = _blsr_u64(bits);
68+
base_ptr += 8;
69+
}
70+
if (cnt > 16) { // unluckly: we rarely get here
71+
// since it means having one structural or pseudo-structral element
72+
// every 4 characters (possible with inputs like "","","",...).
73+
do {
74+
base_ptr[0] = idx + _mm_tzcnt_64(bits);
75+
bits = _blsr_u64(bits);
76+
base_ptr++;
77+
} while(bits != 0);
78+
}
79+
base = next_base;
80+
}
81+
} // haswell
82+
} // simdjson
83+
UNTARGET_REGION
84+
85+
86+
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H

include/simdjson/stage1_find_marks_haswell.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include "simdjson/stage1_find_marks.h"
55
#include "simdjson/stage1_find_marks_macros.h"
6-
#include "simdjson/stage1_find_marks_flatten.h"
6+
#include "simdjson/stage1_find_marks_flatten_haswell.h"
77
#include "simdjson/simdutf8check_haswell.h"
88

99
#ifdef IS_X86_64

include/simdjson/stage1_find_marks_macros.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
// We need to compile that code for multiple architectures. However, target attributes can be used
8888
// only once by function definition. Huge macro seemed better than huge code duplication.
8989
// FIND_STRUCTURAL_BITS(architecture T, const uint8_t *buf, size_t len, ParsedJson &pj)
90-
#define FIND_STRUCTURAL_BITS(T, buf, len, pj) { \
90+
#define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) { \
9191
if (len > pj.bytecapacity) { \
9292
std::cerr << "Your ParsedJson object only supports documents up to " \
9393
<< pj.bytecapacity << " bytes but you are trying to process " << len \
@@ -141,7 +141,7 @@
141141
\
142142
/* take the previous iterations structural bits, not our current iteration, */ \
143143
/* and flatten */ \
144-
flatten_bits(base_ptr, base, idx, structurals); \
144+
flat(base_ptr, base, idx, structurals); \
145145
\
146146
uint64_t whitespace; \
147147
find_whitespace_and_structurals<T>(in, whitespace, structurals); \
@@ -175,7 +175,7 @@
175175
\
176176
/* take the previous iterations structural bits, not our current iteration, */ \
177177
/* and flatten */ \
178-
flatten_bits(base_ptr, base, idx, structurals); \
178+
flat(base_ptr, base, idx, structurals); \
179179
\
180180
uint64_t whitespace; \
181181
find_whitespace_and_structurals<T>(in, whitespace, structurals); \
@@ -192,7 +192,7 @@
192192
} \
193193
\
194194
/* finally, flatten out the remaining structurals from the last iteration */ \
195-
flatten_bits(base_ptr, base, idx, structurals); \
195+
flat(base_ptr, base, idx, structurals); \
196196
\
197197
pj.n_structural_indexes = base; \
198198
/* a valid JSON file cannot have zero structural indexes - we should have */ \

jsonchecker/pass20.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1.2e000000010

src/stage1_find_marks.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ TARGET_HASWELL
99
namespace simdjson {
1010
template<>
1111
int find_structural_bits<architecture::haswell>(const uint8_t *buf, size_t len, ParsedJson &pj) {
12-
FIND_STRUCTURAL_BITS(architecture::haswell, buf, len, pj);
12+
FIND_STRUCTURAL_BITS(architecture::haswell, buf, len, pj, simdjson::haswell::flatten_bits);
1313
}
1414
} // simdjson
1515
UNTARGET_REGION
@@ -18,7 +18,7 @@ TARGET_WESTMERE
1818
namespace simdjson {
1919
template<>
2020
int find_structural_bits<architecture::westmere>(const uint8_t *buf, size_t len, ParsedJson &pj) {
21-
FIND_STRUCTURAL_BITS(architecture::westmere, buf, len, pj);
21+
FIND_STRUCTURAL_BITS(architecture::westmere, buf, len, pj, simdjson::flatten_bits);
2222
}
2323
} // simdjson
2424
UNTARGET_REGION
@@ -31,7 +31,7 @@ UNTARGET_REGION
3131
namespace simdjson {
3232
template<>
3333
int find_structural_bits<architecture::arm64>(const uint8_t *buf, size_t len, ParsedJson &pj) {
34-
FIND_STRUCTURAL_BITS(architecture::arm64, buf, len, pj);
34+
FIND_STRUCTURAL_BITS(architecture::arm64, buf, len, pj, simdjson::flatten_bits);
3535
}
3636
}
3737
#endif

0 commit comments

Comments
 (0)