@@ -81,37 +81,6 @@ really_inline ErrorValues detect_errors_on_eof(
8181 return SUCCESS;
8282}
8383
84- //
85- // Return a mask of all string characters plus end quotes.
86- //
87- // prev_escaped is overflow saying whether the next character is escaped.
88- // prev_in_string is overflow saying whether we're still in a string.
89- //
90- // Backslash sequences outside of quotes will be detected in stage 2.
91- //
92- really_inline uint64_t find_strings (const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
93- const uint64_t backslash = in.eq (' \\ ' );
94- const uint64_t escaped = follows_odd_sequence_of (backslash, prev_escaped);
95- const uint64_t quote = in.eq (' "' ) & ~escaped;
96- // compute_quote_mask returns start quote plus string contents.
97- const uint64_t in_string = compute_quote_mask (quote) ^ prev_in_string;
98- /* right shift of a signed value expected to be well-defined and standard
99- * compliant as of C++20,
100- * John Regher from Utah U. says this is fine code */
101- prev_in_string = static_cast <uint64_t >(static_cast <int64_t >(in_string) >> 63 );
102- // Use ^ to turn the beginning quote off, and the end quote on.
103- return in_string ^ quote;
104- }
105-
106- really_inline uint64_t invalid_string_bytes (const uint64_t unescaped, const uint64_t quote_mask) {
107- /* All Unicode characters may be placed within the
108- * quotation marks, except for the characters that MUST be escaped:
109- * quotation mark, reverse solidus, and the control characters (U+0000
110- * through U+001F).
111- * https://tools.ietf.org/html/rfc8259 */
112- return quote_mask & unescaped;
113- }
114-
11584//
11685// Determine which characters are *structural*:
11786// - braces: [] and {}
@@ -128,11 +97,7 @@ really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint
12897// contents of a string the same as content outside. Errors and structurals inside the string or on
12998// the trailing quote will need to be removed later when the correct string information is known.
13099//
131- really_inline uint64_t find_potential_structurals (const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
132- // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
133- uint64_t whitespace, op;
134- find_whitespace_and_operators (in, whitespace, op);
135-
100+ really_inline uint64_t find_potential_structurals (uint64_t whitespace, uint64_t op, uint64_t &prev_primitive) {
136101 // Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
137102 // Everything except whitespace, braces, colon and comma.
138103 const uint64_t primitive = ~(op | whitespace);
@@ -143,7 +108,61 @@ really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE>
143108 return op | start_primitive;
144109}
145110
146- static const size_t STEP_SIZE = 128 ;
111+ // All independent tasks that scan the SIMD input go here.
112+ really_inline void stage2_scan_input (
113+ const simd_input<ARCHITECTURE> in,
114+ uint64_t &prev_escaped, uint64_t &prev_primitive, utf8_checker<ARCHITECTURE> utf8_state,
115+ uint64_t "e, uint64_t &potential_structurals, uint64_t &invalid_string_bytes
116+ ) {
117+ // Finding real quotes, and finding potential structurals, are interleaved here.
118+ // First we read the input we need from SIMD for them ...
119+ uint64_t backslash = in.eq (' \\ ' );
120+ uint64_t whitespace, op;
121+ find_whitespace_and_operators (in, whitespace, op);
122+
123+ // Then we do the actual work. follows_odd_sequence_of takes longer than find_potential_structurals.
124+ const uint64_t escaped = follows_odd_sequence_of (backslash, prev_escaped);
125+ potential_structurals = find_potential_structurals (whitespace, op, prev_primitive);
126+ uint64_t raw_quote = in.eq (' "' );
127+ quote = raw_quote & ~escaped;
128+ invalid_string_bytes = in.lteq (0x1F );
129+
130+ // The UTF-8 scan is pretty much independent of anything else, so we stick it at the end here to
131+ // soak up some CPU.
132+ utf8_state.check_next_input (in);
133+ }
134+
135+ //
136+ // Return a mask of all string characters plus end quotes.
137+ //
138+ // prev_escaped is overflow saying whether the next character is escaped.
139+ // prev_in_string is overflow saying whether we're still in a string.
140+ //
141+ // Backslash sequences outside of quotes will be detected in stage 2.
142+ //
143+ really_inline uint64_t find_strings (uint64_t quote, uint64_t &prev_in_string) {
144+ // compute_quote_mask returns start quote plus string contents.
145+ const uint64_t in_string = compute_quote_mask (quote) ^ prev_in_string;
146+ /* right shift of a signed value expected to be well-defined and standard
147+ * compliant as of C++20,
148+ * John Regher from Utah U. says this is fine code */
149+ prev_in_string = static_cast <uint64_t >(static_cast <int64_t >(in_string) >> 63 );
150+ // Use ^ to turn the beginning quote off, and the end quote on (i.e. everything except the
151+ // starting quote is the string's value).
152+ return in_string ^ quote;
153+ }
154+
155+ really_inline void stage3_find_strings (
156+ uint64_t quote,
157+ uint64_t &prev_in_string,
158+ uint64_t &string,
159+ const uint64_t potential_structurals, uint64_t &potential_structurals_out, const uint64_t invalid_string_bytes, uint64_t &invalid_string_bytes_out
160+ ) {
161+ string = find_strings (quote, prev_in_string);
162+ potential_structurals_out = potential_structurals;
163+ invalid_string_bytes_out = invalid_string_bytes;
164+ }
165+
147166
148167//
149168// Find the important bits of JSON in a 128-byte chunk, and add them to :
@@ -166,67 +185,39 @@ static const size_t STEP_SIZE = 128;
166185// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
167186// workout.
168187//
169- really_inline void find_structural_bits_start (
170- const uint8_t *buf,
188+ really_inline bool stage1_read_input (
189+ const uint8_t *&buf,
190+ const uint8_t *buf_end,
171191 simd_input<ARCHITECTURE> &in
172192) {
173- in = simd_input<ARCHITECTURE>(buf);
174- }
193+ // If we have a full input, load it up and return it.
194+ const uint8_t *next_buf = buf+64 ;
195+ if (likely (next_buf < buf_end)) {
196+ in = simd_input<ARCHITECTURE>(buf);
197+ buf = next_buf;
198+ return true ;
199+ }
175200
176- really_inline void find_structural_bits_middle (
177- simd_input<ARCHITECTURE> in,
178- uint64_t &prev_escaped, uint64_t &prev_in_string, uint64_t &prev_primitive,
179- uint64_t &string, uint64_t &structurals
180- ) {
181- string = find_strings (in, prev_escaped, prev_in_string);
182- structurals = find_potential_structurals (in, prev_primitive);
201+ // If we have a *partial* (< 64 byte) input, load it up and return it.
202+ if (likely (buf_end > buf)) {
203+ uint8_t tmp_buf[64 ];
204+ memset (tmp_buf, 0x20 , 64 );
205+ memcpy (tmp_buf, buf, buf_end-buf);
206+ in = simd_input<ARCHITECTURE>(tmp_buf);
207+ buf = next_buf;
208+ return true ;
209+ }
210+
211+ return false ;
183212}
184213
185- really_inline void find_structural_bits_end (
186- simd_input<ARCHITECTURE> in, uint64_t idx, uint64_t string, uint64_t structurals ,
187- uint32_t *&base_ptr, uint64_t &prev_structurals, utf8_checker<ARCHITECTURE> &utf8_state ,
214+ really_inline void stage4_output_structurals (
215+ const uint64_t string, const uint64_t structurals, const uint64_t invalid_string_bytes ,
216+ uint32_t *&base_ptr, size_t &idx ,
188217 uint64_t &unescaped_chars_error
189218) {
190- uint64_t unescaped = in.lteq (0x1F );
191- utf8_state.check_next_input (in);
192- flatten_bits (base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
193- prev_structurals = structurals & ~string;
194- unescaped_chars_error |= unescaped & string;
195- idx += 64 ;
196- }
197-
198- really_inline void find_structural_bits_128 (
199- const uint8_t *buf, size_t &idx, uint32_t *&base_ptr,
200- uint64_t &prev_escaped, uint64_t &prev_in_string,
201- uint64_t &prev_primitive,
202- uint64_t &prev_structurals,
203- uint64_t &unescaped_chars_error,
204- utf8_checker<ARCHITECTURE> &utf8_state) {
205- //
206- // Load up all 128 bytes into SIMD registers
207- //
208- simd_input<ARCHITECTURE> in_1, in_2;
209- find_structural_bits_start (buf, in_1);
210- find_structural_bits_start (buf+64 , in_2);
211-
212- //
213- // Find the strings and potential structurals (operators / primitives).
214- //
215- // This will include false structurals that are *inside* strings--we'll filter strings out
216- // before we return.
217- //
218- uint64_t string_1, structurals_1, string_2, structurals_2;
219- find_structural_bits_middle (in_1, prev_escaped, prev_in_string, prev_primitive, string_1, structurals_1);
220- find_structural_bits_middle (in_2, prev_escaped, prev_in_string, prev_primitive, string_2, structurals_2);
221-
222- //
223- // Do miscellaneous work while the processor is busy calculating strings and structurals.
224- //
225- // After that, weed out structurals that are inside strings and find invalid string characters.
226- //
227- find_structural_bits_end (in_1, idx, string_1, structurals_1, base_ptr, prev_structurals, utf8_state, unescaped_chars_error);
228- find_structural_bits_end (in_2, idx+64 , string_2, structurals_2, base_ptr, prev_structurals, utf8_state, unescaped_chars_error);
229- idx += 128 ;
219+ flatten_bits (base_ptr, idx, structurals & ~string);
220+ unescaped_chars_error |= invalid_string_bytes & string;
230221}
231222
232223int find_structural_bits (const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
@@ -236,49 +227,56 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
236227 << len << " bytes" << std::endl;
237228 return simdjson::CAPACITY;
238229 }
239- if (unlikely (len == 0 )) {
240- return simdjson::EMPTY;
241- }
242- uint32_t *base_ptr = pj.structural_indexes ;
243- utf8_checker<ARCHITECTURE> utf8_state;
244230
231+ // Stage 1 state and output
232+ const uint8_t *buf_end = buf + len;
233+ simd_input<ARCHITECTURE> in;
234+
235+ // Stage 2 state and output
236+ uint64_t quote, potential_structurals, invalid_string_bytes;
245237 // Whether the first character of the next iteration is escaped.
246- uint64_t prev_escaped = 0ULL ;
247- // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
248- uint64_t prev_in_string = 0ULL ;
238+ uint64_t prev_escaped = 0 ;
249239 // Whether the last character of the previous iteration is a primitive value character
250240 // (anything except whitespace, braces, comma or colon).
251- uint64_t prev_primitive = 0ULL ;
252- // Mask of structural characters from the last iteration.
253- // Kept around for performance reasons, so we can call flatten_bits to soak up some unused
254- // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
255- uint64_t structurals = 0 ;
241+ uint64_t prev_primitive = 0 ;
242+ utf8_checker<ARCHITECTURE> utf8_state;
243+
244+ // Stage 3 state and output
245+ uint64_t prev_in_string = 0 ;
246+ uint64_t string, potential_structurals2, invalid_string_bytes2;
256247
257- size_t last_buf_size = (len % STEP_SIZE == 0 ) ? STEP_SIZE : (len % STEP_SIZE);
258- const uint8_t *last_buf = buf + len - last_buf_size ;
259- size_t idx = 0 ;
248+ // Stage 4 state and output
249+ uint32_t *base_ptr = pj. structural_indexes ;
250+ uint64_t idx = 0 ;
260251 // Errors with unescaped characters in strings (ASCII codepoints < 0x20)
261252 uint64_t unescaped_chars_error = 0 ;
262253
263- while (buf < last_buf) {
264- find_structural_bits_128 (buf, idx, base_ptr,
265- prev_escaped, prev_in_string, prev_primitive,
266- structurals, unescaped_chars_error, utf8_state);
267- buf += 128 ;
268- }
254+ // Read the first input (if any)
255+ if (stage1_read_input (buf, buf_end, in)) {
256+ // Stage 2 is ready. Push it so stage 3 is ready.
257+ stage2_scan_input (in, prev_escaped, prev_primitive, utf8_state, quote, potential_structurals, invalid_string_bytes);
269258
270- /* If we have a final chunk of less than 64 bytes, pad it to 64 with
271- * spaces before processing it (otherwise, we risk invalidating the UTF-8
272- * checks). */
273- uint8_t tmp_buf[STEP_SIZE];
274- memset (tmp_buf, 0x20 , STEP_SIZE);
275- memcpy (tmp_buf, last_buf, last_buf_size);
276- find_structural_bits_128 (&tmp_buf[0 ], idx, base_ptr,
277- prev_escaped, prev_in_string, prev_primitive,
278- structurals, unescaped_chars_error, utf8_state);
259+ // Read the second input (if any)
260+ if (stage1_read_input (buf, buf_end, in)) {
261+ // stages 2 and 3 are ready. Push them so 3 and 4 are ready.
262+ stage3_find_strings (quote, prev_in_string, string, potential_structurals, potential_structurals2, invalid_string_bytes, invalid_string_bytes2);
263+ stage2_scan_input (in, prev_escaped, prev_primitive, utf8_state, quote, potential_structurals, invalid_string_bytes);
279264
280- /* finally, flatten out the remaining structurals from the last iteration */
281- flatten_bits (base_ptr, idx, structurals);
265+ // Read remaining inputs
266+ while (stage1_read_input (buf, buf_end, in)) {
267+ // stages 2, 3 and 4 are ready. Push them so 3 and 4 are ready.
268+ stage4_output_structurals (string, potential_structurals2, invalid_string_bytes2, base_ptr, idx, unescaped_chars_error);
269+ stage3_find_strings (quote, prev_in_string, string, potential_structurals, potential_structurals2, invalid_string_bytes, invalid_string_bytes2);
270+ stage2_scan_input (in, prev_escaped, prev_primitive, utf8_state, quote, potential_structurals, invalid_string_bytes);
271+ }
272+ // stages 3 and 4 are ready. Push 4 so only 3 is ready.
273+ stage4_output_structurals (string, potential_structurals2, invalid_string_bytes2, base_ptr, idx, unescaped_chars_error);
274+ }
275+
276+ // stage 3 is ready. Finish it.
277+ stage3_find_strings (quote, prev_in_string, string, potential_structurals, potential_structurals2, invalid_string_bytes, invalid_string_bytes2);
278+ stage4_output_structurals (string, potential_structurals2, invalid_string_bytes2, base_ptr, idx, unescaped_chars_error);
279+ }
282280
283281 simdjson::ErrorValues error = detect_errors_on_eof (unescaped_chars_error, prev_in_string);
284282 if (unlikely (error != simdjson::SUCCESS)) {
0 commit comments