1+ namespace stage1 {
2+
3+ struct json_string_block {
4+ // Escaped characters (characters following an escape() character)
5+ really_inline uint64_t escaped () const { return _escaped; }
6+ // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \)
7+ really_inline uint64_t escape () const { return _backslash & ~_escaped; }
8+ // Real (non-backslashed) quotes
9+ really_inline uint64_t quote () const { return _quote; }
10+ // Start quotes of strings
11+ really_inline uint64_t string_end () const { return _quote & _in_string; }
12+ // End quotes of strings
13+ really_inline uint64_t string_start () const { return _quote & ~_in_string; }
14+ // Only characters inside the string (not including the quotes)
15+ really_inline uint64_t string_content () const { return _in_string & ~_quote; }
16+ // Return a mask of whether the given characters are inside a string (only works on non-quotes)
17+ really_inline uint64_t non_quote_inside_string (uint64_t mask) const { return _in_string & mask; }
18+ // Tail of string (everything except the start quote)
19+ really_inline uint64_t string_tail () const { return _in_string ^ _quote; }
20+
21+ // backslash characters
22+ uint64_t _backslash;
23+ // escaped characters (backslashed--does not include the hex characters after \u)
24+ uint64_t _escaped;
25+ // real quotes (non-backslashed ones)
26+ uint64_t _quote;
27+ // string characters (includes start quote but not end quote)
28+ uint64_t _in_string;
29+ };
30+
31+ // Scans blocks for string characters, storing the state necessary to do so
32+ class json_string_scanner {
33+ public:
34+ really_inline json_string_block next (const simd::simd8x64<uint8_t > in);
35+ really_inline error_code finish (bool streaming);
36+
37+ private:
38+ really_inline uint64_t find_escaped (uint64_t escape);
39+
40+ // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
41+ uint64_t prev_in_string = 0ULL ;
42+ // Whether the first character of the next iteration is escaped.
43+ uint64_t prev_escaped = 0ULL ;
44+ };
45+
46+ //
47+ // Finds escaped characters (characters following \).
48+ //
49+ // Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively).
50+ //
51+ // Does this by:
52+ // - Shift the escape mask to get potentially escaped characters (characters after backslashes).
53+ // - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not)
54+ // - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not)
55+ //
56+ // To distinguish between escaped sequences starting on even/odd bits, it finds the start of all
57+ // escape sequences, filters out the ones that start on even bits, and adds that to the mask of
58+ // escape sequences. This causes the addition to clear out the sequences starting on odd bits (since
59+ // the start bit causes a carry), and leaves even-bit sequences alone.
60+ //
61+ // Example:
62+ //
63+ // text | \\\ | \\\"\\\" \\\" \\"\\" |
64+ // escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape
65+ // odd_starts | x | x x x | escape & ~even_bits & ~follows_escape
66+ // even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later
67+ // invert_mask | | cxxx c xx c| even_seq << 1
68+ // follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit
69+ // escaped | x | x x x x x x x x |
70+ // desired | x | x x x x x x x x |
71+ // text | \\\ | \\\"\\\" \\\" \\"\\" |
72+ //
73+ really_inline uint64_t json_string_scanner::find_escaped (uint64_t backslash) {
74+ // If there was overflow, pretend the first character isn't a backslash
75+ backslash &= ~prev_escaped;
76+ uint64_t follows_escape = backslash << 1 | prev_escaped;
77+
78+ // Get sequences starting on even bits by clearing out the odd series using +
79+ const uint64_t even_bits = 0x5555555555555555ULL ;
80+ uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
81+ uint64_t sequences_starting_on_even_bits;
82+ prev_escaped = add_overflow (odd_sequence_starts, backslash, &sequences_starting_on_even_bits);
83+ uint64_t invert_mask = sequences_starting_on_even_bits << 1 ; // The mask we want to return is the *escaped* bits, not escapes.
84+
85+ // Mask every other backslashed character as an escaped character
86+ // Flip the mask for sequences that start on even bits, to correct them
87+ return (even_bits ^ invert_mask) & follows_escape;
88+ }
89+
90+ //
// Classifies one block of input and returns its string bitmasks (backslashes, escaped characters, real quotes, and in-string characters) as a json_string_block.
92+ //
93+ // prev_escaped is overflow saying whether the next character is escaped.
94+ // prev_in_string is overflow saying whether we're still in a string.
95+ //
96+ // Backslash sequences outside of quotes will be detected in stage 2.
97+ //
98+ really_inline json_string_block json_string_scanner::next (const simd::simd8x64<uint8_t > in) {
99+ const uint64_t backslash = in.eq (' \\ ' );
100+ const uint64_t escaped = find_escaped (backslash);
101+ const uint64_t quote = in.eq (' "' ) & ~escaped;
102+ // prefix_xor flips on bits inside the string (and flips off the end quote).
103+ // Then we xor with prev_in_string: if we were in a string already, its effect is flipped
104+ // (characters inside strings are outside, and characters outside strings are inside).
105+ const uint64_t in_string = prefix_xor (quote) ^ prev_in_string;
106+ // right shift of a signed value expected to be well-defined and standard
107+ // compliant as of C++20, John Regher from Utah U. says this is fine code
108+ prev_in_string = static_cast <uint64_t >(static_cast <int64_t >(in_string) >> 63 );
109+ // Use ^ to turn the beginning quote off, and the end quote on.
110+ return {
111+ backslash,
112+ escaped,
113+ quote,
114+ in_string
115+ };
116+ }
117+
118+ really_inline error_code json_string_scanner::finish (bool streaming) {
119+ if (prev_in_string and (not streaming)) {
120+ return UNCLOSED_STRING;
121+ }
122+ return SUCCESS;
123+ }
124+
125+ } // namespace stage1
0 commit comments