@@ -73,6 +73,8 @@ class json_structural_indexer {
7373 really_inline void step (const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept ;
7474 really_inline void next (simd::simd8x64<uint8_t > in, json_block block, size_t idx);
7575 really_inline error_code finish (dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
76+ static really_inline uint32_t find_next_document_index (dom_parser_implementation &parser);
77+ static really_inline size_t trim_partial_utf8 (const uint8_t *buf, size_t len);
7678
7779 json_scanner scanner{};
7880 utf8_checker checker{};
@@ -195,4 +197,91 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
195197 return checker.errors ();
196198}
197199
200+ /* *
201+ * This algorithm is used to quickly identify the last structural position that
202+ * makes up a complete document.
203+ *
204+ * It does this by going backwards and finding the last *document boundary* (a
205+ * place where one value follows another without a comma between them). If the
206+ * last document (the characters after the boundary) has an equal number of
207+ * start and end brackets, it is considered complete.
208+ *
209+ * Simply put, we iterate over the structural characters, starting from
210+ * the end. We consider that we found the end of a JSON document when the
211+ * first element of the pair is NOT one of these characters: '{' '[' ';' ','
212+ * and when the second element is NOT one of these characters: '}' '}' ';' ','.
213+ *
214+ * This simple comparison works most of the time, but it does not cover cases
215+ * where the batch's structural indexes contain a perfect amount of documents.
216+ * In such a case, we do not have access to the structural index which follows
217+ * the last document, therefore, we do not have access to the second element in
218+ * the pair, and means that we cannot identify the last document. To fix this
219+ * issue, we keep a count of the open and closed curly/square braces we found
220+ * while searching for the pair. When we find a pair AND the count of open and
221+ * closed curly/square braces is the same, we know that we just passed a
222+ * complete
223+ * document, therefore the last json buffer location is the end of the batch
224+ */
225+ really_inline uint32_t json_structural_indexer::find_next_document_index (dom_parser_implementation &parser) {
226+ // TODO don't count separately, just figure out depth
227+ auto arr_cnt = 0 ;
228+ auto obj_cnt = 0 ;
229+ for (auto i = parser.n_structural_indexes - 1 ; i > 0 ; i--) {
230+ auto idxb = parser.structural_indexes [i];
231+ switch (parser.buf [idxb]) {
232+ case ' :' :
233+ case ' ,' :
234+ continue ;
235+ case ' }' :
236+ obj_cnt--;
237+ continue ;
238+ case ' ]' :
239+ arr_cnt--;
240+ continue ;
241+ case ' {' :
242+ obj_cnt++;
243+ break ;
244+ case ' [' :
245+ arr_cnt++;
246+ break ;
247+ }
248+ auto idxa = parser.structural_indexes [i - 1 ];
249+ switch (parser.buf [idxa]) {
250+ case ' {' :
251+ case ' [' :
252+ case ' :' :
253+ case ' ,' :
254+ continue ;
255+ }
256+ // Last document is complete, so the next document will appear after!
257+ if (!arr_cnt && !obj_cnt) {
258+ return parser.n_structural_indexes ;
259+ }
260+ // Last document is incomplete; mark the document at i + 1 as the next one
261+ return i;
262+ }
263+ return 0 ;
264+ }
265+
266+ // Skip the last character if it is partial
267+ really_inline size_t json_structural_indexer::trim_partial_utf8 (const uint8_t *buf, size_t len) {
268+ if (unlikely (len < 3 )) {
269+ switch (len) {
270+ case 2 :
271+ if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
272+ if (buf[len-2 ] >= 0b11100000 ) { return len-2 ; } // 3- and 4-byte characters with only 2 bytes left
273+ return len;
274+ case 1 :
275+ if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
276+ return len;
277+ case 0 :
278+ return len;
279+ }
280+ }
281+ if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
282+ if (buf[len-2 ] >= 0b11100000 ) { return len-2 ; } // 3- and 4-byte characters with only 1 byte left
283+ if (buf[len-3 ] >= 0b11110000 ) { return len-3 ; } // 4-byte characters with only 3 bytes left
284+ return len;
285+ }
286+
198287} // namespace stage1
0 commit comments