Skip to content

Commit a0b1642

Browse files
authored
Merge pull request simdjson#1090 from simdjson/jkeiser/sax
Split stage 2 in SAX fashion
2 parents 1d7e54f + 5dd6259 commit a0b1642

8 files changed

Lines changed: 427 additions & 381 deletions

File tree

src/arm64/dom_parser_implementation.cpp

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,7 @@ really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, c
112112
#include "arm64/stringparsing.h"
113113
#include "arm64/numberparsing.h"
114114
#include "generic/stage2/structural_parser.h"
115+
#include "generic/stage2/tape_builder.h"
115116

116117
//
117118
// Implementation-specific overrides
@@ -144,19 +145,15 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons
144145
}
145146

146147
WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
147-
if (auto error = stage2::parse_structurals<false>(*this, _doc)) { return error; }
148-
149-
// If we didn't make it to the end, it's an error
150-
if ( next_structural_index != n_structural_indexes ) {
151-
logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
152-
return TAPE_ERROR;
153-
}
154-
155-
return SUCCESS;
148+
doc = &_doc;
149+
stage2::tape_builder builder(*doc);
150+
return stage2::structural_parser::parse<false>(*this, builder);
156151
}
157152

158153
WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
159-
return stage2::parse_structurals<true>(*this, _doc);
154+
doc = &_doc;
155+
stage2::tape_builder builder(_doc);
156+
return stage2::structural_parser::parse<true>(*this, builder);
160157
}
161158

162159
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {

src/fallback/dom_parser_implementation.cpp

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -316,24 +316,21 @@ WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) cons
316316
#include "fallback/stringparsing.h"
317317
#include "fallback/numberparsing.h"
318318
#include "generic/stage2/structural_parser.h"
319+
#include "generic/stage2/tape_builder.h"
319320

320321
namespace {
321322
namespace SIMDJSON_IMPLEMENTATION {
322323

323324
WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
324-
if (auto error = stage2::parse_structurals<false>(*this, _doc)) { return error; }
325-
326-
// If we didn't make it to the end, it's an error
327-
if ( next_structural_index != n_structural_indexes ) {
328-
logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
329-
return TAPE_ERROR;
330-
}
331-
332-
return SUCCESS;
325+
doc = &_doc;
326+
stage2::tape_builder builder(*doc);
327+
return stage2::structural_parser::parse<false>(*this, builder);
333328
}
334329

335330
WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
336-
return stage2::parse_structurals<true>(*this, _doc);
331+
doc = &_doc;
332+
stage2::tape_builder builder(_doc);
333+
return stage2::structural_parser::parse<true>(*this, builder);
337334
}
338335

339336
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {

src/generic/stage2/logger.h

Lines changed: 26 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,10 @@ namespace logger {
77
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
88

99
static constexpr const bool LOG_ENABLED = false;
10-
static constexpr const int LOG_EVENT_LEN = 30;
11-
static constexpr const int LOG_BUFFER_LEN = 20;
12-
static constexpr const int LOG_DETAIL_LEN = 50;
13-
static constexpr const int LOG_INDEX_LEN = 10;
10+
static constexpr const int LOG_EVENT_LEN = 20;
11+
static constexpr const int LOG_BUFFER_LEN = 10;
12+
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
13+
static constexpr const int LOG_INDEX_LEN = 5;
1414

1515
static int log_depth; // Not threadsafe. Log only.
1616

@@ -28,8 +28,8 @@ namespace logger {
2828
if (LOG_ENABLED) {
2929
log_depth = 0;
3030
printf("\n");
31-
printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
32-
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
31+
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
32+
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
3333
}
3434
}
3535

@@ -44,22 +44,35 @@ namespace logger {
4444
static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
4545
if (LOG_ENABLED) {
4646
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
47+
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
48+
auto next_index = structurals.next_structural;
49+
auto current = current_index ? &structurals.buf[*current_index] : (const uint8_t*)" ";
50+
auto next = &structurals.buf[*next_index];
4751
{
4852
// Print the next N characters in the buffer.
4953
printf("| ");
5054
// Otherwise, print the characters starting from the buffer position.
5155
// Print spaces for unprintable or newline characters.
5256
for (int i=0;i<LOG_BUFFER_LEN;i++) {
53-
printf("%c", printable_char(structurals.current()[i]));
57+
printf("%c", printable_char(current[i]));
5458
}
5559
printf(" ");
60+
// Print the next N characters in the buffer.
61+
printf("| ");
62+
// Otherwise, print the characters starting from the buffer position.
63+
// Print spaces for unprintable or newline characters.
64+
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
65+
printf("%c", printable_char(next[i]));
66+
}
67+
printf(" ");
68+
}
69+
if (current_index) {
70+
printf("| %*u ", LOG_INDEX_LEN, *current_index);
71+
} else {
72+
printf("| %-*s ", LOG_INDEX_LEN, "");
5673
}
57-
printf("| %c ", printable_char(structurals.current_char()));
58-
printf("| %c ", printable_char(structurals.peek_next_char()));
59-
printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
60-
printf("| %5u ", structurals.next_tape_index());
61-
printf("| %-*s ", LOG_DETAIL_LEN, detail);
62-
printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
74+
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
75+
printf("| %-s ", detail);
6376
printf("|\n");
6477
}
6578
}

src/generic/stage2/structural_iterator.h

Lines changed: 19 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -5,40 +5,45 @@ namespace stage2 {
55
class structural_iterator {
66
public:
77
const uint8_t* const buf;
8-
uint32_t *current_structural;
9-
dom_parser_implementation &parser;
8+
uint32_t *next_structural;
9+
dom_parser_implementation &dom_parser;
1010

1111
// Start a structural
12-
really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
13-
: buf{_parser.buf},
14-
current_structural{&_parser.structural_indexes[start_structural_index]},
15-
parser{_parser} {
12+
really_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
13+
: buf{_dom_parser.buf},
14+
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
15+
dom_parser{_dom_parser} {
1616
}
1717
// Get the buffer position of the current structural character
1818
really_inline const uint8_t* current() {
19-
return &buf[*current_structural];
19+
return &buf[*(next_structural-1)];
2020
}
2121
// Get the current structural character
2222
really_inline char current_char() {
23-
return buf[*current_structural];
23+
return buf[*(next_structural-1)];
2424
}
2525
// Get the next structural character without advancing
2626
really_inline char peek_next_char() {
27-
return buf[*(current_structural+1)];
27+
return buf[*next_structural];
28+
}
29+
really_inline const uint8_t* peek() {
30+
return &buf[*next_structural];
31+
}
32+
really_inline const uint8_t* advance() {
33+
return &buf[*(next_structural++)];
2834
}
2935
really_inline char advance_char() {
30-
current_structural++;
31-
return buf[*current_structural];
36+
return buf[*(next_structural++)];
3237
}
3338
really_inline size_t remaining_len() {
34-
return parser.len - *current_structural;
39+
return dom_parser.len - *(next_structural-1);
3540
}
3641

3742
really_inline bool at_end() {
38-
return current_structural == &parser.structural_indexes[parser.n_structural_indexes];
43+
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
3944
}
4045
really_inline bool at_beginning() {
41-
return current_structural == parser.structural_indexes.get();
46+
return next_structural == dom_parser.structural_indexes.get();
4247
}
4348
};
4449

0 commit comments

Comments
 (0)