Skip to content

Commit f1744f5

Browse files
committed
Break out string/structural scanning from tokenizer
1 parent 5750a17 commit f1744f5

File tree

10 files changed

+500
-447
lines changed

10 files changed

+500
-447
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ endif # ifeq ($(SANITIZE),1)
5353
endif # ifeq ($(MEMSANITIZE),1)
5454

5555
# Headers and sources
56-
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
56+
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/json_scanner.h src/generic/json_string_scanner.h src/generic/json_structural_indexer.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
5757
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
5858
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
5959
SRCHEADERS_FALLBACK= src/fallback/bitmanipulation.h src/fallback/implementation.h src/fallback/numberparsing.h src/fallback/stage1_find_marks.h src/fallback/stage2_build_tape.h src/fallback/stringparsing.h

src/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ set(SIMDJSON_SRC_HEADERS
4646
fallback/stage1_find_marks.h
4747
fallback/stage2_build_tape.h
4848
generic/atomparsing.h
49+
generic/json_scanner.h
50+
generic/json_string_scanner.h
51+
generic/json_structural_indexer.h
4952
generic/numberparsing.h
50-
generic/stage1_find_marks.h
5153
generic/stage2_build_tape.h
5254
generic/stage2_streaming_build_tape.h
5355
generic/stringparsing.h

src/arm64/stage1_find_marks.h

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,18 @@ namespace simdjson::arm64 {
1111

1212
using namespace simd;
1313

14-
really_inline void find_whitespace_and_operators(
15-
const simd::simd8x64<uint8_t> in,
16-
uint64_t &whitespace, uint64_t &op) {
14+
struct json_character_block {
15+
static really_inline json_character_block classify(const simd::simd8x64<uint8_t> in);
1716

17+
really_inline uint64_t whitespace() const { return _whitespace; }
18+
really_inline uint64_t op() const { return _op; }
19+
really_inline uint64_t scalar() { return ~(op() | whitespace()); }
20+
21+
uint64_t _whitespace;
22+
uint64_t _op;
23+
};
24+
25+
really_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t> in) {
1826
auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
1927
auto nib_lo = chunk & 0xf;
2028
auto nib_hi = chunk.shr<4>();
@@ -23,8 +31,9 @@ really_inline void find_whitespace_and_operators(
2331
return shuf_lo & shuf_hi;
2432
});
2533

26-
op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
27-
whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
34+
uint64_t op = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x7); }).to_bitmask();
35+
uint64_t whitespace = v.map([&](simd8<uint8_t> _v) { return _v.any_bits_set(0x18); }).to_bitmask();
36+
return { whitespace, op };
2837
}
2938

3039
really_inline bool is_ascii(simd8x64<uint8_t> input) {
@@ -45,10 +54,12 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
4554
}
4655

4756
#include "generic/utf8_lookup2_algorithm.h"
48-
#include "generic/stage1_find_marks.h"
57+
#include "generic/json_string_scanner.h"
58+
#include "generic/json_scanner.h"
59+
#include "generic/json_structural_indexer.h"
4960

5061
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, document::parser &parser, bool streaming) const noexcept {
51-
return arm64::stage1::find_structural_bits<64>(buf, len, parser, streaming);
62+
return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
5263
}
5364

5465
} // namespace simdjson::arm64

src/generic/json_scanner.h

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
namespace stage1 {
2+
3+
/**
4+
* A block of scanned json, with information on operators and scalars.
5+
*/
6+
struct json_block {
7+
public:
8+
// the start of structurals that are not inside strings
9+
really_inline uint64_t structural_start() { return potential_structural_start() & ~_string.string_tail(); }
10+
11+
// operators plus scalar starts like 123, true and "abc"
12+
really_inline uint64_t potential_structural_start() { return _characters.op() | potential_scalar_start(); }
13+
// the start of non-operator runs, like 123, true and "abc"
14+
really_inline uint64_t potential_scalar_start() { return _characters.scalar() & ~follows_potential_scalar(); }
15+
// whether the given character is immediately after a non-operator like 123, true or "
16+
really_inline uint64_t follows_potential_scalar() { return _follows_potential_scalar; }
17+
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
18+
really_inline uint64_t non_quote_inside_string(uint64_t mask) { return _string.non_quote_inside_string(mask); }
19+
// string and escape characters
20+
json_string_block _string;
21+
// whitespace, operators, scalars
22+
json_character_block _characters;
23+
// whether the previous character was a scalar
24+
uint64_t _follows_potential_scalar;
25+
};
26+
27+
/**
28+
* Scans JSON for important bits: operators, strings, and scalars.
29+
*
30+
* The scanner starts by calculating two distinct things:
31+
* - string characters (taking \" into account)
32+
* - operators ([]{},:) and scalars (runs of non-operators like 123, true and "abc")
33+
*
34+
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
35+
* in particular, the operator/scalar bit will find plenty of things that are actually part of
36+
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
37+
* part of a string.
38+
*/
39+
class json_scanner {
40+
public:
41+
really_inline json_block next(const simd::simd8x64<uint8_t> in);
42+
really_inline error_code finish(bool streaming);
43+
44+
private:
45+
// Whether the last character of the previous iteration is part of a scalar token
46+
// (anything except whitespace or an operator).
47+
uint64_t prev_scalar = 0ULL;
48+
json_string_scanner string_scanner;
49+
};
50+
51+
52+
//
53+
// Check if the current character immediately follows a matching character.
54+
//
55+
// For example, this checks for quotes with backslashes in front of them:
56+
//
57+
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
58+
//
59+
really_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
60+
const uint64_t result = match << 1 | overflow;
61+
overflow = match >> 63;
62+
return result;
63+
}
64+
65+
//
66+
// Check if the current character follows a matching character, with possible "filler" between.
67+
// For example, this checks for empty curly braces, e.g.
68+
//
69+
// in.eq('}') & follows(in.eq('['), in.eq(' '), prev_empty_array) // { <whitespace>* }
70+
//
71+
really_inline uint64_t follows(const uint64_t match, const uint64_t filler, uint64_t &overflow) {
72+
uint64_t follows_match = follows(match, overflow);
73+
uint64_t result;
74+
overflow |= uint64_t(add_overflow(follows_match, filler, &result));
75+
return result;
76+
}
77+
78+
really_inline json_block json_scanner::next(const simd::simd8x64<uint8_t> in) {
79+
json_string_block strings = string_scanner.next(in);
80+
json_character_block characters = json_character_block::classify(in);
81+
uint64_t follows_scalar = follows(characters.scalar(), prev_scalar);
82+
return {
83+
strings,
84+
characters,
85+
follows_scalar
86+
};
87+
}
88+
89+
really_inline error_code json_scanner::finish(bool streaming) {
90+
return string_scanner.finish(streaming);
91+
}
92+
93+
} // namespace stage1

src/generic/json_string_scanner.h

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
namespace stage1 {
2+
3+
struct json_string_block {
4+
// Escaped characters (characters following an escape() character)
5+
really_inline uint64_t escaped() const { return _escaped; }
6+
// Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
7+
really_inline uint64_t escape() const { return _backslash & ~_escaped; }
8+
// Real (non-backslashed) quotes
9+
really_inline uint64_t quote() const { return _quote; }
10+
// Start quotes of strings
11+
really_inline uint64_t string_end() const { return _quote & _in_string; }
12+
// End quotes of strings
13+
really_inline uint64_t string_start() const { return _quote & ~_in_string; }
14+
// Only characters inside the string (not including the quotes)
15+
really_inline uint64_t string_content() const { return _in_string & ~_quote; }
16+
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
17+
really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return _in_string & mask; }
18+
// Tail of string (everything except the start quote)
19+
really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
20+
21+
// backslash characters
22+
uint64_t _backslash;
23+
// escaped characters (backslashed--does not include the hex characters after \u)
24+
uint64_t _escaped;
25+
// real quotes (non-backslashed ones)
26+
uint64_t _quote;
27+
// string characters (includes start quote but not end quote)
28+
uint64_t _in_string;
29+
};
30+
31+
// Scans blocks for string characters, storing the state necessary to do so
32+
class json_string_scanner {
33+
public:
34+
really_inline json_string_block next(const simd::simd8x64<uint8_t> in);
35+
really_inline error_code finish(bool streaming);
36+
37+
private:
38+
really_inline uint64_t find_escaped(uint64_t escape);
39+
40+
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
41+
uint64_t prev_in_string = 0ULL;
42+
// Whether the first character of the next iteration is escaped.
43+
uint64_t prev_escaped = 0ULL;
44+
};
45+
46+
//
47+
// Finds escaped characters (characters following \).
48+
//
49+
// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
50+
//
51+
// Does this by:
52+
// - Shift the escape mask to get potentially escaped characters (characters after backslashes).
53+
// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
54+
// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
55+
//
56+
// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
57+
// escape sequences, filters out the ones that start on even bits, and adds that to the mask of
58+
// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
59+
// the start bit causes a carry), and leaves even-bit sequences alone.
60+
//
61+
// Example:
62+
//
63+
// text | \\\ | \\\"\\\" \\\" \\"\\" |
64+
// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
65+
// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
66+
// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
67+
// invert_mask | | cxxx c xx c| even_seq << 1
68+
// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
69+
// escaped | x | x x x x x x x x |
70+
// desired | x | x x x x x x x x |
71+
// text | \\\ | \\\"\\\" \\\" \\"\\" |
72+
//
73+
really_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
74+
// If there was overflow, pretend the first character isn't a backslash
75+
backslash &= ~prev_escaped;
76+
uint64_t follows_escape = backslash << 1 | prev_escaped;
77+
78+
// Get sequences starting on even bits by clearing out the odd series using +
79+
const uint64_t even_bits = 0x5555555555555555ULL;
80+
uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
81+
uint64_t sequences_starting_on_even_bits;
82+
prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
83+
uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes.
84+
85+
// Mask every other backslashed character as an escaped character
86+
// Flip the mask for sequences that start on even bits, to correct them
87+
return (even_bits ^ invert_mask) & follows_escape;
88+
}
89+
90+
//
91+
// Return a mask of all string characters plus end quotes.
92+
//
93+
// prev_escaped is overflow saying whether the next character is escaped.
94+
// prev_in_string is overflow saying whether we're still in a string.
95+
//
96+
// Backslash sequences outside of quotes will be detected in stage 2.
97+
//
98+
really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t> in) {
99+
const uint64_t backslash = in.eq('\\');
100+
const uint64_t escaped = find_escaped(backslash);
101+
const uint64_t quote = in.eq('"') & ~escaped;
102+
// prefix_xor flips on bits inside the string (and flips off the end quote).
103+
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
104+
// (characters inside strings are outside, and characters outside strings are inside).
105+
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
106+
// right shift of a signed value expected to be well-defined and standard
107+
// compliant as of C++20, John Regher from Utah U. says this is fine code
108+
prev_in_string = static_cast<uint64_t>(static_cast<int64_t>(in_string) >> 63);
109+
// Use ^ to turn the beginning quote off, and the end quote on.
110+
return {
111+
backslash,
112+
escaped,
113+
quote,
114+
in_string
115+
};
116+
}
117+
118+
really_inline error_code json_string_scanner::finish(bool streaming) {
119+
if (prev_in_string and (not streaming)) {
120+
return UNCLOSED_STRING;
121+
}
122+
return SUCCESS;
123+
}
124+
125+
} // namespace stage1

0 commit comments

Comments
 (0)