@@ -73,8 +73,6 @@ class json_structural_indexer {
7373 really_inline void step (const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept ;
7474 really_inline void next (simd::simd8x64<uint8_t > in, json_block block, size_t idx);
7575 really_inline error_code finish (dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
76- static really_inline uint32_t find_next_document_index (dom_parser_implementation &parser);
77- static really_inline size_t trim_partial_utf8 (const uint8_t *buf, size_t len);
7876
7977 json_scanner scanner{};
8078 utf8_checker checker{};
@@ -197,91 +195,4 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
197195 return checker.errors ();
198196}
199197
200- /* *
201- * This algorithm is used to quickly identify the last structural position that
202- * makes up a complete document.
203- *
204- * It does this by going backwards and finding the last *document boundary* (a
205- * place where one value follows another without a comma between them). If the
206- * last document (the characters after the boundary) has an equal number of
207- * start and end brackets, it is considered complete.
208- *
209- * Simply put, we iterate over the structural characters, starting from
210- * the end. We consider that we found the end of a JSON document when the
211- * first element of the pair is NOT one of these characters: '{' '[' ';' ','
212- * and when the second element is NOT one of these characters: '}' '}' ';' ','.
213- *
214- * This simple comparison works most of the time, but it does not cover cases
215- * where the batch's structural indexes contain a perfect amount of documents.
216- * In such a case, we do not have access to the structural index which follows
217- * the last document, therefore, we do not have access to the second element in
218- * the pair, and means that we cannot identify the last document. To fix this
219- * issue, we keep a count of the open and closed curly/square braces we found
220- * while searching for the pair. When we find a pair AND the count of open and
221- * closed curly/square braces is the same, we know that we just passed a
222- * complete
223- * document, therefore the last json buffer location is the end of the batch
224- */
225- really_inline uint32_t json_structural_indexer::find_next_document_index (dom_parser_implementation &parser) {
226- // TODO don't count separately, just figure out depth
227- auto arr_cnt = 0 ;
228- auto obj_cnt = 0 ;
229- for (auto i = parser.n_structural_indexes - 1 ; i > 0 ; i--) {
230- auto idxb = parser.structural_indexes [i];
231- switch (parser.buf [idxb]) {
232- case ' :' :
233- case ' ,' :
234- continue ;
235- case ' }' :
236- obj_cnt--;
237- continue ;
238- case ' ]' :
239- arr_cnt--;
240- continue ;
241- case ' {' :
242- obj_cnt++;
243- break ;
244- case ' [' :
245- arr_cnt++;
246- break ;
247- }
248- auto idxa = parser.structural_indexes [i - 1 ];
249- switch (parser.buf [idxa]) {
250- case ' {' :
251- case ' [' :
252- case ' :' :
253- case ' ,' :
254- continue ;
255- }
256- // Last document is complete, so the next document will appear after!
257- if (!arr_cnt && !obj_cnt) {
258- return parser.n_structural_indexes ;
259- }
260- // Last document is incomplete; mark the document at i + 1 as the next one
261- return i;
262- }
263- return 0 ;
264- }
265-
266- // Skip the last character if it is partial
267- really_inline size_t json_structural_indexer::trim_partial_utf8 (const uint8_t *buf, size_t len) {
268- if (unlikely (len < 3 )) {
269- switch (len) {
270- case 2 :
271- if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
272- if (buf[len-2 ] >= 0b11100000 ) { return len-2 ; } // 3- and 4-byte characters with only 2 bytes left
273- return len;
274- case 1 :
275- if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
276- return len;
277- case 0 :
278- return len;
279- }
280- }
281- if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
282- if (buf[len-2 ] >= 0b11100000 ) { return len-2 ; } // 3- and 4-byte characters with only 1 byte left
283- if (buf[len-3 ] >= 0b11110000 ) { return len-3 ; } // 4-byte characters with only 3 bytes left
284- return len;
285- }
286-
287198} // namespace stage1
0 commit comments