Skip to content

Commit ea08e7d

Browse files
committed
Remove unused extra copy of find_next_document_index
1 parent 6f90f5d commit ea08e7d

1 file changed

Lines changed: 0 additions & 89 deletions

File tree

src/generic/stage1/json_structural_indexer.h

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ class json_structural_indexer {
7373
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
7474
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
7575
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
76-
static really_inline uint32_t find_next_document_index(dom_parser_implementation &parser);
77-
static really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len);
7876

7977
json_scanner scanner{};
8078
utf8_checker checker{};
@@ -197,91 +195,4 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
197195
return checker.errors();
198196
}
199197

200-
/**
201-
* This algorithm is used to quickly identify the last structural position that
202-
* makes up a complete document.
203-
*
204-
* It does this by going backwards and finding the last *document boundary* (a
205-
* place where one value follows another without a comma between them). If the
206-
* last document (the characters after the boundary) has an equal number of
207-
* start and end brackets, it is considered complete.
208-
*
209-
* Simply put, we iterate over the structural characters, starting from
210-
* the end. We consider that we found the end of a JSON document when the
211-
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
212-
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
213-
*
214-
* This simple comparison works most of the time, but it does not cover cases
215-
* where the batch's structural indexes end exactly on a document boundary.
216-
* In such a case, we do not have access to the structural index which follows
217-
* the last document, therefore, we do not have access to the second element in
218-
* the pair, which means that we cannot identify the last document. To fix this
219-
* issue, we keep a count of the open and closed curly/square braces we found
220-
* while searching for the pair. When we find a pair AND the count of open and
221-
* closed curly/square braces is the same, we know that we just passed a
222-
* complete
223-
* document, therefore the last json buffer location is the end of the batch
224-
*/
225-
really_inline uint32_t json_structural_indexer::find_next_document_index(dom_parser_implementation &parser) {
226-
// TODO don't count separately, just figure out depth
227-
auto arr_cnt = 0;
228-
auto obj_cnt = 0;
229-
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
230-
auto idxb = parser.structural_indexes[i];
231-
switch (parser.buf[idxb]) {
232-
case ':':
233-
case ',':
234-
continue;
235-
case '}':
236-
obj_cnt--;
237-
continue;
238-
case ']':
239-
arr_cnt--;
240-
continue;
241-
case '{':
242-
obj_cnt++;
243-
break;
244-
case '[':
245-
arr_cnt++;
246-
break;
247-
}
248-
auto idxa = parser.structural_indexes[i - 1];
249-
switch (parser.buf[idxa]) {
250-
case '{':
251-
case '[':
252-
case ':':
253-
case ',':
254-
continue;
255-
}
256-
// Last document is complete, so the next document will appear after!
257-
if (!arr_cnt && !obj_cnt) {
258-
return parser.n_structural_indexes;
259-
}
260-
// Last document is incomplete; the value at structural index i starts it, so report i as the next document
261-
return i;
262-
}
263-
return 0;
264-
}
265-
266-
// Skip the last character if it is partial
267-
really_inline size_t json_structural_indexer::trim_partial_utf8(const uint8_t *buf, size_t len) {
268-
if (unlikely(len < 3)) {
269-
switch (len) {
270-
case 2:
271-
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
272-
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
273-
return len;
274-
case 1:
275-
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
276-
return len;
277-
case 0:
278-
return len;
279-
}
280-
}
281-
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
282-
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
283-
if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
284-
return len;
285-
}
286-
287198
} // namespace stage1

0 commit comments

Comments
 (0)