|
3 | 3 | // We assume the file in which it is included already includes |
4 | 4 | // "simdjson/stage2.h" (this simplifies amalgamation) |
5 | 5 |
|
6 | | -#include "generic/stage2/tape_writer.h" |
7 | 6 | #include "generic/stage2/logger.h" |
8 | | -#include "generic/stage2/atomparsing.h" |
9 | 7 | #include "generic/stage2/structural_iterator.h" |
10 | 8 |
|
11 | 9 | namespace { // Make everything here private |
12 | 10 | namespace SIMDJSON_IMPLEMENTATION { |
13 | 11 | namespace stage2 { |
14 | 12 |
|
| 13 | +#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } |
| 14 | + |
| 15 | +template<typename T> |
15 | 16 | struct structural_parser : structural_iterator { |
16 | | - /** Lets you append to the tape */ |
17 | | - tape_writer tape; |
18 | | - /** Next write location in the string buf for stage 2 parsing */ |
19 | | - uint8_t *current_string_buf_loc; |
| 17 | + /** Receiver that actually parses the strings and builds the tape */ |
| 18 | + T builder; |
20 | 19 | /** Current depth (nested objects and arrays) */ |
21 | 20 | uint32_t depth{0}; |
22 | 21 |
|
23 | 22 | // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations |
24 | 23 | really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) |
25 | 24 | : structural_iterator(_parser, start_structural_index), |
26 | | - tape{parser.doc->tape.get()}, |
27 | | - current_string_buf_loc{parser.doc->string_buf.get()} { |
28 | | - } |
29 | | - |
30 | | - WARN_UNUSED really_inline error_code start_scope(bool is_array) { |
31 | | - depth++; |
32 | | - if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } |
33 | | - parser.containing_scope[depth].tape_index = next_tape_index(); |
34 | | - parser.containing_scope[depth].count = 0; |
35 | | - tape.skip(); // We don't actually *write* the start element until the end. |
36 | | - parser.is_array[depth] = is_array; |
37 | | - return SUCCESS; |
| 25 | + builder{parser.doc->tape.get(), parser.doc->string_buf.get()} { |
38 | 26 | } |
39 | 27 |
|
40 | 28 | WARN_UNUSED really_inline error_code start_document() { |
41 | | - log_start_value("document"); |
42 | | - parser.containing_scope[depth].tape_index = next_tape_index(); |
43 | | - parser.containing_scope[depth].count = 0; |
44 | | - tape.skip(); // We don't actually *write* the start element until the end. |
| 29 | + builder.start_document(*this); |
45 | 30 | parser.is_array[depth] = false; |
46 | 31 | return SUCCESS; |
47 | 32 | } |
48 | | - |
49 | 33 | WARN_UNUSED really_inline error_code start_object() { |
50 | | - log_start_value("object"); |
51 | | - return start_scope(false); |
| 34 | + depth++; |
| 35 | + if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } |
| 36 | + builder.start_object(*this); |
| 37 | + parser.is_array[depth] = false; |
| 38 | + return SUCCESS; |
52 | 39 | } |
53 | | - |
54 | 40 | WARN_UNUSED really_inline error_code start_array() { |
55 | | - log_start_value("array"); |
56 | | - return start_scope(true); |
57 | | - } |
58 | | - |
59 | | - // this function is responsible for annotating the start of the scope |
60 | | - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { |
61 | | - // SIMDJSON_ASSUME(depth > 0); |
62 | | - // Write the ending tape element, pointing at the start location |
63 | | - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; |
64 | | - tape.append(start_tape_index, end); |
65 | | - // Write the start tape element, pointing at the end location (and including count) |
66 | | - // count can overflow if it exceeds 24 bits... so we saturate |
67 | | - // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). |
68 | | - const uint32_t count = parser.containing_scope[depth].count; |
69 | | - const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; |
70 | | - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); |
71 | | - depth--; |
72 | | - } |
73 | | - |
74 | | - really_inline uint32_t next_tape_index() { |
75 | | - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); |
| 41 | + depth++; |
| 42 | + if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } |
| 43 | + builder.start_array(*this); |
| 44 | + parser.is_array[depth] = true; |
| 45 | + return SUCCESS; |
76 | 46 | } |
77 | | - |
78 | 47 | really_inline void end_object() { |
79 | | - log_end_value("object"); |
80 | | - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); |
| 48 | + builder.end_object(*this); |
| 49 | + depth--; |
81 | 50 | } |
82 | 51 | really_inline void end_array() { |
83 | | - log_end_value("array"); |
84 | | - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); |
| 52 | + builder.end_array(*this); |
| 53 | + depth--; |
85 | 54 | } |
86 | 55 | really_inline void end_document() { |
87 | | - log_end_value("document"); |
88 | | - constexpr uint32_t start_tape_index = 0; |
89 | | - tape.append(start_tape_index, internal::tape_type::ROOT); |
90 | | - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT); |
| 56 | + builder.end_document(*this); |
91 | 57 | } |
92 | 58 |
|
93 | | - really_inline void empty_container(internal::tape_type start, internal::tape_type end) { |
94 | | - auto start_index = next_tape_index(); |
95 | | - tape.append(start_index+2, start); |
96 | | - tape.append(start_index, end); |
97 | | - } |
98 | 59 | WARN_UNUSED really_inline bool empty_object() { |
99 | 60 | if (peek_next_char() == '}') { |
100 | 61 | advance_char(); |
101 | | - log_value("empty object"); |
102 | | - empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); |
| 62 | + builder.empty_object(*this); |
103 | 63 | return true; |
104 | 64 | } |
105 | 65 | return false; |
106 | 66 | } |
107 | 67 | WARN_UNUSED really_inline bool empty_array() { |
108 | 68 | if (peek_next_char() == ']') { |
109 | 69 | advance_char(); |
110 | | - log_value("empty array"); |
111 | | - empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); |
| 70 | + builder.empty_array(*this); |
112 | 71 | return true; |
113 | 72 | } |
114 | 73 | return false; |
115 | 74 | } |
116 | 75 |
|
117 | | - // increment_count increments the count of keys in an object or values in an array. |
118 | 76 | really_inline void increment_count() { |
119 | | - parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1 |
120 | | - } |
121 | | - |
122 | | - really_inline uint8_t *on_start_string() noexcept { |
123 | | - // we advance the point, accounting for the fact that we have a NULL termination |
124 | | - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); |
125 | | - return current_string_buf_loc + sizeof(uint32_t); |
126 | | - } |
127 | | - |
128 | | - really_inline void on_end_string(uint8_t *dst) noexcept { |
129 | | - uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); |
130 | | - // TODO check for overflow in case someone has a crazy string (>=4GB?) |
131 | | - // But only add the overflow check when the document itself exceeds 4GB |
132 | | - // Currently unneeded because we refuse to parse docs larger or equal to 4GB. |
133 | | - memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); |
134 | | - // NULL termination is still handy if you expect all your strings to |
135 | | - // be NULL terminated? It comes at a small cost |
136 | | - *dst = 0; |
137 | | - current_string_buf_loc = dst + 1; |
| 77 | + builder.increment_count(*this); |
138 | 78 | } |
139 | 79 |
|
140 | 80 | WARN_UNUSED really_inline error_code parse_key(const uint8_t *key) { |
141 | | - return parse_string(key, true); |
142 | | - } |
143 | | - WARN_UNUSED really_inline error_code parse_string(const uint8_t *value, bool key = false) { |
144 | | - log_value(key ? "key" : "string"); |
145 | | - uint8_t *dst = on_start_string(); |
146 | | - dst = stringparsing::parse_string(value, dst); |
147 | | - if (dst == nullptr) { |
148 | | - log_error("Invalid escape in string"); |
149 | | - return STRING_ERROR; |
150 | | - } |
151 | | - on_end_string(dst); |
152 | | - return SUCCESS; |
| 81 | + return builder.parse_key(*this, key); |
| 82 | + } |
| 83 | + WARN_UNUSED really_inline error_code parse_string(const uint8_t *value) { |
| 84 | + return builder.parse_string(*this, value); |
153 | 85 | } |
154 | | - |
155 | 86 | WARN_UNUSED really_inline error_code parse_number(const uint8_t *value) { |
156 | | - log_value("number"); |
157 | | - if (!numberparsing::parse_number(value, tape)) { log_error("Invalid number"); return NUMBER_ERROR; } |
158 | | - return SUCCESS; |
| 87 | + return builder.parse_number(*this, value); |
159 | 88 | } |
160 | | - |
161 | | - really_inline error_code parse_root_number(const uint8_t *value) { |
162 | | - // |
163 | | - // We need to make a copy to make sure that the string is space terminated. |
164 | | - // This is not about padding the input, which should already padded up |
165 | | - // to len + SIMDJSON_PADDING. However, we have no control at this stage |
166 | | - // on how the padding was done. What if the input string was padded with nulls? |
167 | | - // It is quite common for an input string to have an extra null character (C string). |
168 | | - // We do not want to allow 9\0 (where \0 is the null character) inside a JSON |
169 | | - // document, but the string "9\0" by itself is fine. So we make a copy and |
170 | | - // pad the input with spaces when we know that there is just one input element. |
171 | | - // This copy is relatively expensive, but it will almost never be called in |
172 | | - // practice unless you are in the strange scenario where you have many JSON |
173 | | - // documents made of single atoms. |
174 | | - // |
175 | | - uint8_t *copy = static_cast<uint8_t *>(malloc(remaining_len() + SIMDJSON_PADDING)); |
176 | | - if (copy == nullptr) { |
177 | | - return MEMALLOC; |
178 | | - } |
179 | | - memcpy(copy, value, remaining_len()); |
180 | | - memset(copy + remaining_len(), ' ', SIMDJSON_PADDING); |
181 | | - error_code error = parse_number(copy); |
182 | | - free(copy); |
183 | | - return error; |
| 89 | + WARN_UNUSED really_inline error_code parse_root_number(const uint8_t *value) { |
| 90 | + return builder.parse_root_number(*this, value); |
184 | 91 | } |
185 | | - |
186 | 92 | WARN_UNUSED really_inline error_code parse_true_atom(const uint8_t *value) { |
187 | | - log_value("true"); |
188 | | - if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } |
189 | | - tape.append(0, internal::tape_type::TRUE_VALUE); |
190 | | - return SUCCESS; |
| 93 | + return builder.parse_true_atom(*this, value); |
191 | 94 | } |
192 | | - |
193 | 95 | WARN_UNUSED really_inline error_code parse_root_true_atom(const uint8_t *value) { |
194 | | - log_value("true"); |
195 | | - if (!atomparsing::is_valid_true_atom(value, remaining_len())) { return T_ATOM_ERROR; } |
196 | | - tape.append(0, internal::tape_type::TRUE_VALUE); |
197 | | - return SUCCESS; |
| 96 | + return builder.parse_root_true_atom(*this, value); |
198 | 97 | } |
199 | | - |
200 | 98 | WARN_UNUSED really_inline error_code parse_false_atom(const uint8_t *value) { |
201 | | - log_value("false"); |
202 | | - if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } |
203 | | - tape.append(0, internal::tape_type::FALSE_VALUE); |
204 | | - return SUCCESS; |
| 99 | + return builder.parse_false_atom(*this, value); |
205 | 100 | } |
206 | | - |
207 | 101 | WARN_UNUSED really_inline error_code parse_root_false_atom(const uint8_t *value) { |
208 | | - log_value("false"); |
209 | | - if (!atomparsing::is_valid_false_atom(value, remaining_len())) { return F_ATOM_ERROR; } |
210 | | - tape.append(0, internal::tape_type::FALSE_VALUE); |
211 | | - return SUCCESS; |
| 102 | + return builder.parse_root_false_atom(*this, value); |
212 | 103 | } |
213 | | - |
214 | 104 | WARN_UNUSED really_inline error_code parse_null_atom(const uint8_t *value) { |
215 | | - log_value("null"); |
216 | | - if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } |
217 | | - tape.append(0, internal::tape_type::NULL_VALUE); |
218 | | - return SUCCESS; |
| 105 | + return builder.parse_null_atom(*this, value); |
219 | 106 | } |
220 | | - |
221 | 107 | WARN_UNUSED really_inline error_code parse_root_null_atom(const uint8_t *value) { |
222 | | - log_value("null"); |
223 | | - if (!atomparsing::is_valid_null_atom(value, remaining_len())) { return N_ATOM_ERROR; } |
224 | | - tape.append(0, internal::tape_type::NULL_VALUE); |
225 | | - return SUCCESS; |
| 108 | + return builder.parse_root_null_atom(*this, value); |
226 | 109 | } |
227 | 110 |
|
228 | 111 | WARN_UNUSED really_inline error_code start() { |
@@ -266,12 +149,20 @@ struct structural_parser : structural_iterator { |
266 | 149 | } |
267 | 150 | }; // struct structural_parser |
268 | 151 |
|
269 | | -#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } |
| 152 | +} // namespace stage2 |
| 153 | +} // namespace SIMDJSON_IMPLEMENTATION |
| 154 | +} // unnamed namespace |
| 155 | + |
| 156 | +#include "generic/stage2/tape_builder.h" |
| 157 | + |
| 158 | +namespace { // Make everything here private |
| 159 | +namespace SIMDJSON_IMPLEMENTATION { |
| 160 | +namespace stage2 { |
270 | 161 |
|
271 | 162 | template<bool STREAMING> |
272 | 163 | WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { |
273 | 164 | dom_parser.doc = &doc; |
274 | | - stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); |
| 165 | + stage2::structural_parser<stage2::tape_builder> parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); |
275 | 166 | SIMDJSON_TRY( parser.start() ); |
276 | 167 |
|
277 | 168 | // |
|
0 commit comments