|
1 | 1 | #ifndef SIMDJSON_STAGE1_FIND_MARKS_H |
2 | 2 | #define SIMDJSON_STAGE1_FIND_MARKS_H |
3 | 3 |
|
4 | | -#include "simdjson/common_defs.h" |
5 | 4 | #include "simdjson/parsedjson.h" |
6 | | -#include "simdjson/portability.h" |
7 | 5 | #include "simdjson/simdjson.h" |
8 | | -#include "simdjson/simd_input.h" |
9 | | -#include <cassert> |
10 | 6 |
|
11 | 7 | namespace simdjson { |
12 | 8 |
|
13 | | -template <Architecture> uint64_t compute_quote_mask(uint64_t quote_bits); |
14 | | - |
15 | | -namespace {
16 | | -// Portable fallback for when clmul is unavailable: returns the prefix XOR of quote_bits (result bit i is the XOR of input bits 0..i).
17 | | -[[maybe_unused]] uint64_t portable_compute_quote_mask(uint64_t quote_bits) {
18 | | - uint64_t quote_mask = quote_bits ^ (quote_bits << 1); // each bit now covers 2 input bits
19 | | - quote_mask = quote_mask ^ (quote_mask << 2); // 4 input bits
20 | | - quote_mask = quote_mask ^ (quote_mask << 4); // 8 input bits
21 | | - quote_mask = quote_mask ^ (quote_mask << 8); // 16 input bits
22 | | - quote_mask = quote_mask ^ (quote_mask << 16); // 32 input bits
23 | | - quote_mask = quote_mask ^ (quote_mask << 32); // all 64 input bits
24 | | - return quote_mask;
25 | | -}
26 | | -} // namespace
27 | | - |
28 | | -template <Architecture T> |
29 | | -really_inline uint64_t find_odd_backslash_sequences( |
30 | | - simd_input<T> in, uint64_t &prev_iter_ends_odd_backslash); |
31 | | - |
32 | | -template <Architecture T> |
33 | | -really_inline uint64_t find_quote_mask_and_bits( |
34 | | - simd_input<T> in, uint64_t odd_ends, uint64_t &prev_iter_inside_quote, |
35 | | - uint64_t &quote_bits, uint64_t &error_mask);
36 | | - |
37 | | -// do a 'shufti' to detect structural JSON characters |
38 | | -// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c |
39 | | -// these go into the first 3 buckets of the comparison (1/2/4) |
40 | | - |
41 | | -// we are also interested in the four whitespace characters |
42 | | -// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d |
43 | | -// these go into the next 2 buckets of the comparison (8/16) |
44 | | -template <Architecture T> |
45 | | -void find_whitespace_and_structurals(simd_input<T> in, uint64_t &whitespace, |
46 | | - uint64_t &structurals); |
47 | | - |
48 | | -// Returns an updated structural bit vector with quoted contents cleared out and
49 | | -// pseudo-structural characters added to the mask.
50 | | -// Updates prev_iter_ends_pseudo_pred, which records whether this
51 | | -// iteration ended on a whitespace or a structural character (which means that
52 | | -// the next iteration
53 | | -// can have a pseudo-structural character at its start).
54 | | -really_inline uint64_t finalize_structurals(
55 | | - uint64_t structurals, uint64_t whitespace, uint64_t quote_mask,
56 | | - uint64_t quote_bits, uint64_t &prev_iter_ends_pseudo_pred) {
57 | | - // mask off anything inside quotes
58 | | - structurals &= ~quote_mask;
59 | | - // add the real quote bits back into our bit_mask as well, so we can
60 | | - // quickly traverse the strings we've spent all this trouble gathering
61 | | - structurals |= quote_bits;
62 | | - // Now, establish "pseudo-structural characters". These are non-whitespace
63 | | - // characters that are (a) outside quotes and (b) have a predecessor that's
64 | | - // either whitespace or a structural character. This means that subsequent
65 | | - // passes will get a chance to encounter the first character of every string
66 | | - // of non-whitespace and, if we're parsing an atom like true/false/null or a
67 | | - // number we can stop at the first whitespace or structural character
68 | | - // following it.
69 | | -
70 | | - // a qualified predecessor is something that can happen 1 position before a
71 | | - // pseudo-structural character
72 | | - uint64_t pseudo_pred = structurals | whitespace;
73 | | -
74 | | - uint64_t shifted_pseudo_pred =
75 | | - (pseudo_pred << 1) | prev_iter_ends_pseudo_pred; // bit 0 is the carry-in supplied by the caller
76 | | - prev_iter_ends_pseudo_pred = pseudo_pred >> 63; // save the top bit as the next carry-in
77 | | - uint64_t pseudo_structurals =
78 | | - shifted_pseudo_pred & (~whitespace) & (~quote_mask);
79 | | - structurals |= pseudo_structurals;
80 | | -
81 | | - // Now we've used our close quotes all we need to, so let's switch them off.
82 | | - // They are off in the quote mask and on in quote bits.
83 | | - structurals &= ~(quote_bits & ~quote_mask);
84 | | - return structurals;
85 | | -}
86 | | - |
87 | 9 | template <Architecture T = Architecture::NATIVE> |
88 | | -int find_structural_bits(const uint8_t *buf, size_t len, |
89 | | - simdjson::ParsedJson &pj); |
| 10 | +int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj); |
90 | 11 |
|
91 | 12 | template <Architecture T = Architecture::NATIVE> // convenience overload for char buffers: casts buf to const uint8_t * and forwards
92 | | -int find_structural_bits(const char *buf, size_t len,
93 | | - simdjson::ParsedJson &pj) {
| 13 | +int find_structural_bits(const char *buf, size_t len, simdjson::ParsedJson &pj) {
94 | 14 | return find_structural_bits((const uint8_t *)buf, len, pj); // NOTE(review): T is not passed on, so the callee uses its own default template argument rather than this overload's T — confirm this is intended
95 | 15 | }
96 | 16 |
|
97 | | -// flatten out values in 'bits' assuming that they are to have values of idx
98 | | -// plus their position in the bitvector, and store these indexes at |
99 | | -// base_ptr[base] incrementing base as we go |
100 | | -// will potentially store extra values beyond end of valid bits, so base_ptr |
101 | | -// needs to be large enough to handle this |
102 | | -template <Architecture T = Architecture::NATIVE> |
103 | | -really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base, |
104 | | - uint32_t idx, uint64_t bits); |
105 | | - |
106 | | -} // namespace simdjson |
| 17 | +}; // namespace simdjson |
107 | 18 |
|
108 | 19 | #endif |
0 commit comments