Skip to content

Commit 7de7ce5

Browse files
committed
Move document stream state to implementation
1 parent 383e8c7 commit 7de7ce5

File tree

1 file changed

+89
-0
lines changed

1 file changed

+89
-0
lines changed

src/generic/stage1/json_structural_indexer.h

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ class json_structural_indexer {
7373
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
7474
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
7575
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
76+
static really_inline uint32_t find_next_document_index(dom_parser_implementation &parser);
77+
static really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len);
7678

7779
json_scanner scanner{};
7880
utf8_checker checker{};
@@ -195,4 +197,91 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
195197
return checker.errors();
196198
}
197199

200+
/**
201+
* This algorithm is used to quickly identify the last structural position that
202+
* makes up a complete document.
203+
*
204+
* It does this by going backwards and finding the last *document boundary* (a
205+
* place where one value follows another without a comma between them). If the
206+
* last document (the characters after the boundary) has an equal number of
207+
* start and end brackets, it is considered complete.
208+
*
209+
* Simply put, we iterate over the structural characters, starting from
210+
* the end. We consider that we found the end of a JSON document when the
211+
* first element of the pair is NOT one of these characters: '{' '[' ';' ','
212+
* and when the second element is NOT one of these characters: '}' '}' ';' ','.
213+
*
214+
* This simple comparison works most of the time, but it does not cover cases
215+
* where the batch's structural indexes contain a perfect amount of documents.
216+
* In such a case, we do not have access to the structural index which follows
217+
* the last document, therefore, we do not have access to the second element in
218+
* the pair, and means that we cannot identify the last document. To fix this
219+
* issue, we keep a count of the open and closed curly/square braces we found
220+
* while searching for the pair. When we find a pair AND the count of open and
221+
* closed curly/square braces is the same, we know that we just passed a
222+
* complete
223+
* document, therefore the last json buffer location is the end of the batch
224+
*/
225+
really_inline uint32_t json_structural_indexer::find_next_document_index(dom_parser_implementation &parser) {
226+
// TODO don't count separately, just figure out depth
227+
auto arr_cnt = 0;
228+
auto obj_cnt = 0;
229+
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
230+
auto idxb = parser.structural_indexes[i];
231+
switch (parser.buf[idxb]) {
232+
case ':':
233+
case ',':
234+
continue;
235+
case '}':
236+
obj_cnt--;
237+
continue;
238+
case ']':
239+
arr_cnt--;
240+
continue;
241+
case '{':
242+
obj_cnt++;
243+
break;
244+
case '[':
245+
arr_cnt++;
246+
break;
247+
}
248+
auto idxa = parser.structural_indexes[i - 1];
249+
switch (parser.buf[idxa]) {
250+
case '{':
251+
case '[':
252+
case ':':
253+
case ',':
254+
continue;
255+
}
256+
// Last document is complete, so the next document will appear after!
257+
if (!arr_cnt && !obj_cnt) {
258+
return parser.n_structural_indexes;
259+
}
260+
// Last document is incomplete; mark the document at i + 1 as the next one
261+
return i;
262+
}
263+
return 0;
264+
}
265+
266+
// Skip the last character if it is partial
267+
really_inline size_t json_structural_indexer::trim_partial_utf8(const uint8_t *buf, size_t len) {
268+
if (unlikely(len < 3)) {
269+
switch (len) {
270+
case 2:
271+
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
272+
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
273+
return len;
274+
case 1:
275+
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
276+
return len;
277+
case 0:
278+
return len;
279+
}
280+
}
281+
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
282+
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
283+
if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
284+
return len;
285+
}
286+
198287
} // namespace stage1

0 commit comments

Comments
 (0)