Skip to content

Commit 2a24567

Browse files
authored
Replace macros by include files (simdjson#236) (simdjson#248)
* stage1 compiles without macros * cleaning * amalgation is weird but works * macros are removed from stringparsing * amalgation fixed * Huge macros are removed. * clang-format
1 parent bd9628d commit 2a24567

17 files changed

+900
-998
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck pointercheck
6262
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
6363
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
6464

65-
HEADERS= include/simdjson/simdutf8check_haswell.h include/simdjson/simdutf8check_westmere.h include/simdjson/simdutf8check_arm64.h include/simdjson/stringparsing.h include/simdjson/stringparsing_arm64.h include/simdjson/stringparsing_haswell.h include/simdjson/stringparsing_macros.h include/simdjson/stringparsing_westmere.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage1_find_marks_arm64.h include/simdjson/stage1_find_marks_haswell.h include/simdjson/stage1_find_marks_westmere.h include/simdjson/stage1_find_marks_macros.h include/simdjson/stage2_build_tape.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h include/simdjson/stage1_find_marks_flatten.h include/simdjson/stage1_find_marks_flatten_haswell.h
65+
HEADERS= include/simdjson/simdutf8check_haswell.h include/simdjson/simdutf8check_westmere.h include/simdjson/simdutf8check_arm64.h include/simdjson/stringparsing.h include/simdjson/stringparsing_arm64.h include/simdjson/stringparsing_haswell.h include/simdjson/stringparsing_westmere.h include/simdjson/numberparsing.h include/simdjson/jsonparser.h include/simdjson/common_defs.h include/simdjson/jsonioutil.h benchmark/benchmark.h benchmark/linux/linux-perf-events.h include/simdjson/parsedjson.h include/simdjson/stage1_find_marks.h include/simdjson/stage1_find_marks_arm64.h include/simdjson/stage1_find_marks_haswell.h include/simdjson/stage1_find_marks_westmere.h include/simdjson/stage2_build_tape.h include/simdjson/jsoncharutils.h include/simdjson/jsonformatutils.h include/simdjson/stage1_find_marks_flatten.h include/simdjson/stage1_find_marks_flatten_haswell.h
6666
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp
6767
MINIFIERHEADERS=include/simdjson/jsonminifier.h include/simdjson/simdprune_tables.h
6868
MINIFIERLIBFILES=src/jsonminifier.cpp

amalgamation.sh

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,10 @@ $SCRIPTPATH/include/simdjson/parsedjson.h
4343
$SCRIPTPATH/include/simdjson/stage1_find_marks.h
4444
$SCRIPTPATH/include/simdjson/stage1_find_marks_flatten.h
4545
$SCRIPTPATH/include/simdjson/stage1_find_marks_flatten_haswell.h
46-
$SCRIPTPATH/include/simdjson/stage1_find_marks_macros.h
4746
$SCRIPTPATH/include/simdjson/stage1_find_marks_westmere.h
4847
$SCRIPTPATH/include/simdjson/stage1_find_marks_haswell.h
4948
$SCRIPTPATH/include/simdjson/stage1_find_marks_arm64.h
5049
$SCRIPTPATH/include/simdjson/stringparsing.h
51-
$SCRIPTPATH/include/simdjson/stringparsing_macros.h
5250
$SCRIPTPATH/include/simdjson/stringparsing_westmere.h
5351
$SCRIPTPATH/include/simdjson/stringparsing_haswell.h
5452
$SCRIPTPATH/include/simdjson/stringparsing_arm64.h
@@ -63,20 +61,28 @@ for i in ${ALLCHEADERS} ${ALLCFILES}; do
6361
exit 127
6462
done
6563

66-
67-
function stripinc()
68-
{
69-
sed -e '/# *include *"/d' -e '/# *include *<simdjson\//d'
70-
}
7164
function dofile()
7265
{
66+
# Last lines are always ignored. Files should end by an empty lines.
7367
RELFILE=${1#"$SCRIPTPATH/"}
7468
echo "/* begin file $RELFILE */"
7569
# echo "#line 8 \"$1\"" ## redefining the line/file is not nearly as useful as it sounds for debugging. It breaks IDEs.
76-
stripinc < $1
70+
while IFS= read -r line
71+
do
72+
if [[ "${line}" == '#include "simdjson'* ]]; then
73+
# we paste the contents of simdjson header files with names ending by _common.h
74+
# we ignore every other simdjson headers
75+
if [[ "${line}" == '#include "simdjson/'*'_common.h"'* ]]; then
76+
file=$(echo $line| cut -d'"' -f 2)
77+
echo "$(<include/$file)" # we assume those files are always in include/
78+
fi
79+
else
80+
# Otherwise we simply copy the line
81+
echo "$line"
82+
fi;
83+
done < "$1"
7784
echo "/* end file $RELFILE */"
7885
}
79-
8086
timestamp=$(date)
8187
echo "Creating ${AMAL_H}..."
8288
echo "/* auto-generated on ${timestamp}. Do not edit! */" > "${AMAL_H}"

include/simdjson/jsonformatutils.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,5 +198,4 @@ static inline void print_with_escapes(const char *src, std::ostream &os,
198198
}
199199
} // namespace simdjson
200200

201-
#
202201
#endif

include/simdjson/stage1_find_marks_arm64.h

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include "simdjson/simdutf8check_arm64.h"
55
#include "simdjson/stage1_find_marks.h"
66
#include "simdjson/stage1_find_marks_flatten.h"
7-
#include "simdjson/stage1_find_marks_macros.h"
87

98
#ifdef IS_ARM64
109
namespace simdjson {
@@ -139,23 +138,6 @@ really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
139138
return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
140139
}
141140

142-
template <>
143-
really_inline uint64_t find_odd_backslash_sequences<Architecture::ARM64>(
144-
simd_input<Architecture::ARM64> in,
145-
uint64_t &prev_iter_ends_odd_backslash) {
146-
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in,
147-
prev_iter_ends_odd_backslash);
148-
}
149-
150-
template <>
151-
really_inline uint64_t find_quote_mask_and_bits<Architecture::ARM64>(
152-
simd_input<Architecture::ARM64> in, uint64_t odd_ends,
153-
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
154-
uint64_t &error_mask) {
155-
FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends,
156-
prev_iter_inside_quote, quote_bits, error_mask)
157-
}
158-
159141
template <>
160142
really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
161143
simd_input<Architecture::ARM64> in, uint64_t &whitespace,
@@ -207,4 +189,4 @@ really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
207189
} // namespace simdjson
208190

209191
#endif // IS_ARM64
210-
#endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
192+
#endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
// This file contains the common code every implementation uses in stage1
2+
// It is intended to be included multiple times and compiled multiple times
3+
// We assume the file in which it is include already includes
4+
// "simdjson/stage1_find_marks.h" (this simplifies amalgation)
5+
6+
#ifdef TARGETED_ARCHITECTURE
7+
8+
// return a bitvector indicating where we have characters that end an odd-length
9+
// sequence of backslashes (and thus change the behavior of the next character
10+
// to follow). A even-length sequence of backslashes, and, for that matter, the
11+
// largest even-length prefix of our odd-length sequence of backslashes, simply
12+
// modify the behavior of the backslashes themselves.
13+
// We also update the prev_iter_ends_odd_backslash reference parameter to
14+
// indicate whether we end an iteration on an odd-length sequence of
15+
// backslashes, which modifies our subsequent search for odd-length
16+
// sequences of backslashes in an obvious way.
17+
template <>
18+
really_inline uint64_t find_odd_backslash_sequences<TARGETED_ARCHITECTURE>(
19+
simd_input<TARGETED_ARCHITECTURE> in,
20+
uint64_t &prev_iter_ends_odd_backslash) {
21+
const uint64_t even_bits = 0x5555555555555555ULL;
22+
const uint64_t odd_bits = ~even_bits;
23+
uint64_t bs_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '\\');
24+
uint64_t start_edges = bs_bits & ~(bs_bits << 1);
25+
/* flip lowest if we have an odd-length run at the end of the prior
26+
* iteration */
27+
uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
28+
uint64_t even_starts = start_edges & even_start_mask;
29+
uint64_t odd_starts = start_edges & ~even_start_mask;
30+
uint64_t even_carries = bs_bits + even_starts;
31+
32+
uint64_t odd_carries;
33+
/* must record the carry-out of our odd-carries out of bit 63; this
34+
* indicates whether the sense of any edge going to the next iteration
35+
* should be flipped */
36+
bool iter_ends_odd_backslash =
37+
add_overflow(bs_bits, odd_starts, &odd_carries);
38+
39+
odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a
40+
* potential end if we had an
41+
* odd-numbered run at the
42+
* end of the previous
43+
* iteration */
44+
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
45+
uint64_t even_carry_ends = even_carries & ~bs_bits;
46+
uint64_t odd_carry_ends = odd_carries & ~bs_bits;
47+
uint64_t even_start_odd_end = even_carry_ends & odd_bits;
48+
uint64_t odd_start_even_end = odd_carry_ends & even_bits;
49+
uint64_t odd_ends = even_start_odd_end | odd_start_even_end;
50+
return odd_ends;
51+
}
52+
53+
// return both the quote mask (which is a half-open mask that covers the first
54+
// quote
55+
// in an unescaped quote pair and everything in the quote pair) and the quote
56+
// bits, which are the simple
57+
// unescaped quoted bits. We also update the prev_iter_inside_quote value to
58+
// tell the next iteration
59+
// whether we finished the final iteration inside a quote pair; if so, this
60+
// inverts our behavior of
61+
// whether we're inside quotes for the next iteration.
62+
// Note that we don't do any error checking to see if we have backslash
63+
// sequences outside quotes; these
64+
// backslash sequences (of any length) will be detected elsewhere.
65+
template <>
66+
really_inline uint64_t find_quote_mask_and_bits<TARGETED_ARCHITECTURE>(
67+
simd_input<TARGETED_ARCHITECTURE> in, uint64_t odd_ends,
68+
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
69+
uint64_t &error_mask) {
70+
quote_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '"');
71+
quote_bits = quote_bits & ~odd_ends;
72+
uint64_t quote_mask = compute_quote_mask<TARGETED_ARCHITECTURE>(quote_bits);
73+
quote_mask ^= prev_iter_inside_quote;
74+
/* All Unicode characters may be placed within the
75+
* quotation marks, except for the characters that MUST be escaped:
76+
* quotation mark, reverse solidus, and the control characters (U+0000
77+
* through U+001F).
78+
* https://tools.ietf.org/html/rfc8259 */
79+
uint64_t unescaped =
80+
unsigned_lteq_against_input<TARGETED_ARCHITECTURE>(in, 0x1F);
81+
error_mask |= quote_mask & unescaped;
82+
/* right shift of a signed value expected to be well-defined and standard
83+
* compliant as of C++20,
84+
* John Regher from Utah U. says this is fine code */
85+
prev_iter_inside_quote =
86+
static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
87+
return quote_mask;
88+
}
89+
90+
// Find structural bits in a 64-byte chunk.
91+
void find_structural_bits_64(
92+
const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base,
93+
uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote,
94+
uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals,
95+
uint64_t &error_mask,
96+
utf8_checking_state<TARGETED_ARCHITECTURE> &utf8_state) {
97+
simd_input<TARGETED_ARCHITECTURE> in = fill_input<TARGETED_ARCHITECTURE>(buf);
98+
check_utf8<TARGETED_ARCHITECTURE>(in, utf8_state);
99+
/* detect odd sequences of backslashes */
100+
uint64_t odd_ends = find_odd_backslash_sequences<TARGETED_ARCHITECTURE>(
101+
in, prev_iter_ends_odd_backslash);
102+
103+
/* detect insides of quote pairs ("quote_mask") and also our quote_bits
104+
* themselves */
105+
uint64_t quote_bits;
106+
uint64_t quote_mask = find_quote_mask_and_bits<TARGETED_ARCHITECTURE>(
107+
in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask);
108+
109+
/* take the previous iterations structural bits, not our current
110+
* iteration,
111+
* and flatten */
112+
#ifdef IS_X86_64
113+
if (TARGETED_ARCHITECTURE == Architecture::HASWELL)
114+
simdjson::haswell::flatten_bits(base_ptr, base, idx, structurals);
115+
else
116+
#endif
117+
simdjson::flatten_bits(base_ptr, base, idx, structurals);
118+
119+
uint64_t whitespace;
120+
find_whitespace_and_structurals<TARGETED_ARCHITECTURE>(in, whitespace,
121+
structurals);
122+
123+
/* fixup structurals to reflect quotes and add pseudo-structural
124+
* characters */
125+
structurals = finalize_structurals(structurals, whitespace, quote_mask,
126+
quote_bits, prev_iter_ends_pseudo_pred);
127+
}
128+
129+
template <>
130+
int find_structural_bits<TARGETED_ARCHITECTURE>(const uint8_t *buf, size_t len,
131+
ParsedJson &pj) {
132+
if (len > pj.byte_capacity) {
133+
std::cerr << "Your ParsedJson object only supports documents up to "
134+
<< pj.byte_capacity << " bytes but you are trying to process "
135+
<< len << " bytes" << std::endl;
136+
return simdjson::CAPACITY;
137+
}
138+
uint32_t *base_ptr = pj.structural_indexes;
139+
uint32_t base = 0;
140+
utf8_checking_state<TARGETED_ARCHITECTURE> utf8_state;
141+
142+
/* we have padded the input out to 64 byte multiple with the remainder
143+
* being zeros persistent state across loop does the last iteration end
144+
* with an odd-length sequence of backslashes? */
145+
146+
/* either 0 or 1, but a 64-bit value */
147+
uint64_t prev_iter_ends_odd_backslash = 0ULL;
148+
/* does the previous iteration end inside a double-quote pair? */
149+
uint64_t prev_iter_inside_quote =
150+
0ULL; /* either all zeros or all ones
151+
* does the previous iteration end on something that is a
152+
* predecessor of a pseudo-structural character - i.e.
153+
* whitespace or a structural character effectively the very
154+
* first char is considered to follow "whitespace" for the
155+
* purposes of pseudo-structural character detection so we
156+
* initialize to 1 */
157+
uint64_t prev_iter_ends_pseudo_pred = 1ULL;
158+
159+
/* structurals are persistent state across loop as we flatten them on the
160+
* subsequent iteration into our array pointed to be base_ptr.
161+
* This is harmless on the first iteration as structurals==0
162+
* and is done for performance reasons; we can hide some of the latency of
163+
* the
164+
* expensive carryless multiply in the previous step with this work */
165+
uint64_t structurals = 0;
166+
167+
size_t lenminus64 = len < 64 ? 0 : len - 64;
168+
size_t idx = 0;
169+
uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII
170+
code points < 0x20) */
171+
172+
for (; idx < lenminus64; idx += 64) {
173+
find_structural_bits_64(&buf[idx], idx, base_ptr, base,
174+
prev_iter_ends_odd_backslash,
175+
prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
176+
structurals, error_mask, utf8_state);
177+
}
178+
/* If we have a final chunk of less than 64 bytes, pad it to 64 with
179+
* spaces before processing it (otherwise, we risk invalidating the UTF-8
180+
* checks). */
181+
if (idx < len) {
182+
uint8_t tmp_buf[64];
183+
memset(tmp_buf, 0x20, 64);
184+
memcpy(tmp_buf, buf + idx, len - idx);
185+
find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base,
186+
prev_iter_ends_odd_backslash,
187+
prev_iter_inside_quote, prev_iter_ends_pseudo_pred,
188+
structurals, error_mask, utf8_state);
189+
idx += 64;
190+
}
191+
192+
/* is last string quote closed? */
193+
if (prev_iter_inside_quote) {
194+
return simdjson::UNCLOSED_STRING;
195+
}
196+
197+
/* finally, flatten out the remaining structurals from the last iteration
198+
*/
199+
#ifdef IS_X86_64
200+
if (TARGETED_ARCHITECTURE == Architecture::HASWELL)
201+
simdjson::haswell::flatten_bits(base_ptr, base, idx, structurals);
202+
else
203+
#endif
204+
simdjson::flatten_bits(base_ptr, base, idx, structurals);
205+
206+
pj.n_structural_indexes = base;
207+
/* a valid JSON file cannot have zero structural indexes - we should have
208+
* found something */
209+
if (pj.n_structural_indexes == 0u) {
210+
return simdjson::EMPTY;
211+
}
212+
if (base_ptr[pj.n_structural_indexes - 1] > len) {
213+
return simdjson::UNEXPECTED_ERROR;
214+
}
215+
if (len != base_ptr[pj.n_structural_indexes - 1]) {
216+
/* the string might not be NULL terminated, but we add a virtual NULL
217+
* ending
218+
* character. */
219+
base_ptr[pj.n_structural_indexes++] = len;
220+
}
221+
/* make it safe to dereference one beyond this array */
222+
base_ptr[pj.n_structural_indexes] = 0;
223+
if (error_mask) {
224+
return simdjson::UNESCAPED_CHARS;
225+
}
226+
return check_utf8_errors<TARGETED_ARCHITECTURE>(utf8_state);
227+
}
228+
229+
#endif // TARGETED_ARCHITECTURE

include/simdjson/stage1_find_marks_flatten.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,4 @@ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
9090
#endif // SIMDJSON_NAIVE_FLATTEN
9191
} // namespace simdjson
9292

93-
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
93+
#endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H

include/simdjson/stage1_find_marks_haswell.h

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include "simdjson/simdutf8check_haswell.h"
55
#include "simdjson/stage1_find_marks.h"
66
#include "simdjson/stage1_find_marks_flatten_haswell.h"
7-
#include "simdjson/stage1_find_marks_macros.h"
87

98
#ifdef IS_X86_64
109

@@ -97,23 +96,6 @@ really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
9796
return res_0 | (res_1 << 32);
9897
}
9998

100-
template <>
101-
really_inline uint64_t find_odd_backslash_sequences<Architecture::HASWELL>(
102-
simd_input<Architecture::HASWELL> in,
103-
uint64_t &prev_iter_ends_odd_backslash) {
104-
FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in,
105-
prev_iter_ends_odd_backslash);
106-
}
107-
108-
template <>
109-
really_inline uint64_t find_quote_mask_and_bits<Architecture::HASWELL>(
110-
simd_input<Architecture::HASWELL> in, uint64_t odd_ends,
111-
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
112-
uint64_t &error_mask) {
113-
FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends,
114-
prev_iter_inside_quote, quote_bits, error_mask)
115-
}
116-
11799
template <>
118100
really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
119101
simd_input<Architecture::HASWELL> in, uint64_t &whitespace,
@@ -207,4 +189,4 @@ really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
207189
UNTARGET_REGION
208190

209191
#endif // IS_X86_64
210-
#endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
192+
#endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H

0 commit comments

Comments
 (0)