Skip to content

Commit 1d7e54f

Browse files
authored
Merge pull request simdjson#1089 from simdjson/jkeiser/stage2misc
Special case empty objects and arrays
2 parents 77c8581 + 9c33093 commit 1d7e54f

File tree

2 files changed

+105
-79
lines changed

2 files changed

+105
-79
lines changed

src/generic/stage2/structural_iterator.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,8 @@ class structural_iterator {
3434
return parser.len - *current_structural;
3535
}
3636

37-
really_inline bool past_end(uint32_t n_structural_indexes) {
38-
return current_structural >= &parser.structural_indexes[n_structural_indexes];
39-
}
40-
really_inline bool at_end(uint32_t n_structural_indexes) {
41-
return current_structural == &parser.structural_indexes[n_structural_indexes];
37+
really_inline bool at_end() {
38+
return current_structural == &parser.structural_indexes[parser.n_structural_indexes];
4239
}
4340
really_inline bool at_beginning() {
4441
return current_structural == parser.structural_indexes.get();

src/generic/stage2/structural_parser.h

Lines changed: 103 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -27,44 +27,49 @@ struct structural_parser : structural_iterator {
2727
current_string_buf_loc{parser.doc->string_buf.get()} {
2828
}
2929

30-
WARN_UNUSED really_inline error_code start_scope(bool parent_is_array) {
30+
WARN_UNUSED really_inline error_code start_scope(bool is_array) {
31+
depth++;
32+
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
3133
parser.containing_scope[depth].tape_index = next_tape_index();
3234
parser.containing_scope[depth].count = 0;
3335
tape.skip(); // We don't actually *write* the start element until the end.
34-
parser.is_array[depth] = parent_is_array;
35-
depth++;
36-
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
36+
parser.is_array[depth] = is_array;
3737
return SUCCESS;
3838
}
3939

4040
WARN_UNUSED really_inline error_code start_document() {
4141
log_start_value("document");
42-
return start_scope(false);
42+
parser.containing_scope[depth].tape_index = next_tape_index();
43+
parser.containing_scope[depth].count = 0;
44+
tape.skip(); // We don't actually *write* the start element until the end.
45+
parser.is_array[depth] = false;
46+
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
47+
return SUCCESS;
4348
}
4449

45-
WARN_UNUSED really_inline error_code start_object(bool parent_is_array) {
50+
WARN_UNUSED really_inline error_code start_object() {
4651
log_start_value("object");
47-
return start_scope(parent_is_array);
52+
return start_scope(false);
4853
}
4954

50-
WARN_UNUSED really_inline error_code start_array(bool parent_is_array) {
55+
WARN_UNUSED really_inline error_code start_array() {
5156
log_start_value("array");
52-
return start_scope(parent_is_array);
57+
return start_scope(true);
5358
}
5459

5560
// this function is responsible for annotating the start of the scope
5661
really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
57-
depth--;
58-
// write our doc->tape location to the header scope
59-
// The root scope gets written *at* the previous location.
60-
tape.append(parser.containing_scope[depth].tape_index, end);
62+
// SIMDJSON_ASSUME(depth > 0);
63+
// Write the ending tape element, pointing at the start location
64+
const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
65+
tape.append(start_tape_index, end);
66+
// Write the start tape element, pointing at the end location (and including count)
6167
// count can overflow if it exceeds 24 bits... so we saturate
6268
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
63-
const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
6469
const uint32_t count = parser.containing_scope[depth].count;
6570
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
66-
// This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index]
6771
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
72+
depth--;
6873
}
6974

7075
really_inline uint32_t next_tape_index() {
@@ -81,15 +86,38 @@ struct structural_parser : structural_iterator {
8186
}
8287
really_inline void end_document() {
8388
log_end_value("document");
84-
end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
89+
constexpr uint32_t start_tape_index = 0;
90+
tape.append(start_tape_index, internal::tape_type::ROOT);
91+
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT);
92+
}
93+
94+
really_inline void empty_container(internal::tape_type start, internal::tape_type end) {
95+
auto start_index = next_tape_index();
96+
tape.append(start_index+2, start);
97+
tape.append(start_index, end);
98+
}
99+
WARN_UNUSED really_inline bool empty_object() {
100+
if (peek_next_char() == '}') {
101+
advance_char();
102+
log_value("empty object");
103+
empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
104+
return true;
105+
}
106+
return false;
107+
}
108+
WARN_UNUSED really_inline bool empty_array() {
109+
if (peek_next_char() == ']') {
110+
advance_char();
111+
log_value("empty array");
112+
empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
113+
return true;
114+
}
115+
return false;
85116
}
86117

87118
// increment_count increments the count of keys in an object or values in an array.
88-
// Note that if you are at the level of the values or elements, the count
89-
// must be increment in the preceding depth (depth-1) where the array or
90-
// the object resides.
91119
really_inline void increment_count() {
92-
parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
120+
parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1
93121
}
94122

95123
really_inline uint8_t *on_start_string() noexcept {
@@ -199,6 +227,16 @@ struct structural_parser : structural_iterator {
199227
return SUCCESS;
200228
}
201229

230+
WARN_UNUSED really_inline error_code start() {
231+
logger::log_start();
232+
233+
// If there are no structurals left, return EMPTY
234+
if (at_end()) { return EMPTY; }
235+
236+
// Push the root scope (there is always at least one scope)
237+
return start_document();
238+
}
239+
202240
WARN_UNUSED really_inline error_code finish() {
203241
end_document();
204242
parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
@@ -211,29 +249,10 @@ struct structural_parser : structural_iterator {
211249
return SUCCESS;
212250
}
213251

214-
really_inline void init() {
215-
log_start();
216-
}
217-
218-
WARN_UNUSED really_inline error_code start() {
219-
// If there are no structurals left, return EMPTY
220-
if (at_end(parser.n_structural_indexes)) {
221-
return EMPTY;
222-
}
223-
224-
init();
225-
// Push the root scope (there is always at least one scope)
226-
return start_document();
227-
}
228-
229252
really_inline void log_value(const char *type) {
230253
logger::log_line(*this, "", type, "");
231254
}
232255

233-
static really_inline void log_start() {
234-
logger::log_start();
235-
}
236-
237256
really_inline void log_start_value(const char *type) {
238257
logger::log_line(*this, "+", type, "");
239258
if (logger::LOG_ENABLED) { logger::log_depth++; }
@@ -261,11 +280,14 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
261280
// Read first value
262281
//
263282
switch (parser.current_char()) {
264-
case '{':
265-
SIMDJSON_TRY( parser.start_object(false) );
283+
case '{': {
284+
if (parser.empty_object()) { goto document_end; }
285+
SIMDJSON_TRY( parser.start_object() );
266286
goto object_begin;
267-
case '[':
268-
SIMDJSON_TRY( parser.start_array(false) );
287+
}
288+
case '[': {
289+
if (parser.empty_array()) { goto document_end; }
290+
SIMDJSON_TRY( parser.start_array() );
269291
// Make sure the outer array is closed before continuing; otherwise, there are ways we could get
270292
// into memory corruption. See https://github.com/simdjson/simdjson/issues/906
271293
if (!STREAMING) {
@@ -274,14 +296,15 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
274296
}
275297
}
276298
goto array_begin;
277-
case '"': SIMDJSON_TRY( parser.parse_string() ); goto finish;
278-
case 't': SIMDJSON_TRY( parser.parse_root_true_atom() ); goto finish;
279-
case 'f': SIMDJSON_TRY( parser.parse_root_false_atom() ); goto finish;
280-
case 'n': SIMDJSON_TRY( parser.parse_root_null_atom() ); goto finish;
299+
}
300+
case '"': SIMDJSON_TRY( parser.parse_string() ); goto document_end;
301+
case 't': SIMDJSON_TRY( parser.parse_root_true_atom() ); goto document_end;
302+
case 'f': SIMDJSON_TRY( parser.parse_root_false_atom() ); goto document_end;
303+
case 'n': SIMDJSON_TRY( parser.parse_root_null_atom() ); goto document_end;
281304
case '-':
282305
case '0': case '1': case '2': case '3': case '4':
283306
case '5': case '6': case '7': case '8': case '9':
284-
SIMDJSON_TRY( parser.parse_root_number() ); goto finish;
307+
SIMDJSON_TRY( parser.parse_root_number() ); goto document_end;
285308
default:
286309
parser.log_error("Document starts with a non-value character");
287310
return TAPE_ERROR;
@@ -291,25 +314,27 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
291314
// Object parser states
292315
//
293316
object_begin:
294-
switch (parser.advance_char()) {
295-
case '"': {
296-
parser.increment_count();
297-
SIMDJSON_TRY( parser.parse_string(true) );
298-
goto object_key_state;
299-
}
300-
case '}':
301-
parser.end_object();
302-
goto scope_end;
303-
default:
317+
if (parser.advance_char() != '"') {
304318
parser.log_error("Object does not start with a key");
305319
return TAPE_ERROR;
306320
}
321+
parser.increment_count();
322+
SIMDJSON_TRY( parser.parse_string(true) );
323+
goto object_field;
307324

308-
object_key_state:
325+
object_field:
309326
if (unlikely( parser.advance_char() != ':' )) { parser.log_error("Missing colon after key in object"); return TAPE_ERROR; }
310327
switch (parser.advance_char()) {
311-
case '{': SIMDJSON_TRY( parser.start_object(false) ); goto object_begin;
312-
case '[': SIMDJSON_TRY( parser.start_array(false) ); goto array_begin;
328+
case '{': {
329+
if (parser.empty_object()) { break; };
330+
SIMDJSON_TRY( parser.start_object() );
331+
goto object_begin;
332+
}
333+
case '[': {
334+
if (parser.empty_array()) { break; };
335+
SIMDJSON_TRY( parser.start_array() );
336+
goto array_begin;
337+
}
313338
case '"': SIMDJSON_TRY( parser.parse_string() ); break;
314339
case 't': SIMDJSON_TRY( parser.parse_true_atom() ); break;
315340
case 'f': SIMDJSON_TRY( parser.parse_false_atom() ); break;
@@ -329,7 +354,7 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
329354
parser.increment_count();
330355
if (unlikely( parser.advance_char() != '"' )) { parser.log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
331356
SIMDJSON_TRY( parser.parse_string(true) );
332-
goto object_key_state;
357+
goto object_field;
333358
case '}':
334359
parser.end_object();
335360
goto scope_end;
@@ -339,25 +364,28 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
339364
}
340365

341366
scope_end:
342-
if (parser.depth == 1) { goto finish; }
367+
if (parser.depth == 0) { goto document_end; }
343368
if (parser.parser.is_array[parser.depth]) { goto array_continue; }
344369
goto object_continue;
345370

346371
//
347372
// Array parser states
348373
//
349374
array_begin:
350-
if (parser.peek_next_char() == ']') {
351-
parser.advance_char();
352-
parser.end_array();
353-
goto scope_end;
354-
}
355375
parser.increment_count();
356376

357-
main_array_switch:
377+
array_value:
358378
switch (parser.advance_char()) {
359-
case '{': SIMDJSON_TRY( parser.start_object(true) ); goto object_begin;
360-
case '[': SIMDJSON_TRY( parser.start_array(true) ); goto array_begin;
379+
case '{': {
380+
if (parser.empty_object()) { break; };
381+
SIMDJSON_TRY( parser.start_object() );
382+
goto object_begin;
383+
}
384+
case '[': {
385+
if (parser.empty_array()) { break; };
386+
SIMDJSON_TRY( parser.start_array() );
387+
goto array_begin;
388+
}
361389
case '"': SIMDJSON_TRY( parser.parse_string() ); break;
362390
case 't': SIMDJSON_TRY( parser.parse_true_atom() ); break;
363391
case 'f': SIMDJSON_TRY( parser.parse_false_atom() ); break;
@@ -375,7 +403,7 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
375403
switch (parser.advance_char()) {
376404
case ',':
377405
parser.increment_count();
378-
goto main_array_switch;
406+
goto array_value;
379407
case ']':
380408
parser.end_array();
381409
goto scope_end;
@@ -384,9 +412,10 @@ WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_impleme
384412
return TAPE_ERROR;
385413
}
386414

387-
finish:
415+
document_end:
388416
return parser.finish();
389-
}
417+
418+
} // parse_structurals()
390419

391420
} // namespace stage2
392421
} // namespace SIMDJSON_IMPLEMENTATION

0 commit comments

Comments
 (0)