|
82 | 82 | return quote_mask; \ |
83 | 83 | } \ |
84 | 84 |
|
| 85 | +// Find structural bits in a 64-byte chunk. |
| 86 | +// We need to compile this code for multiple architectures. However, a target attribute can be |
| 87 | +// applied only once per function definition. A huge macro seemed better than huge code duplication. |
| 88 | +// void FIND_STRUCTURAL_BITS_64(architecture T, |
| 89 | +// const uint8_t *buf, |
| 90 | +// size_t idx, |
| 91 | +// uint32_t *base_ptr, |
| 92 | +// uint32_t &base, |
| 93 | +// uint64_t &prev_iter_ends_odd_backslash, |
| 94 | +// uint64_t &prev_iter_inside_quote, |
| 95 | +// uint64_t &prev_iter_ends_pseudo_pred, |
| 96 | +// uint64_t &structurals, |
| 97 | +// uint64_t &error_mask, |
| 98 | +// utf8_checking_state<T> &utf8_state, flatten function) |
| 99 | +#define FIND_STRUCTURAL_BITS_64(T, \ |
| 100 | + buf, \ |
| 101 | + idx, \ |
| 102 | + base_ptr, \ |
| 103 | + base, \ |
| 104 | + prev_iter_ends_odd_backslash, \ |
| 105 | + prev_iter_inside_quote, \ |
| 106 | + prev_iter_ends_pseudo_pred, \ |
| 107 | + structurals, \ |
| 108 | + error_mask, \ |
| 109 | + utf8_state, \ |
| 110 | + flat \ |
| 111 | +) { \ |
| 112 | + simd_input<T> in = fill_input<T>(buf); \ |
| 113 | + check_utf8<T>(in, utf8_state); \ |
| 114 | + /* detect odd-length sequences of backslashes */ \ |
| 115 | + uint64_t odd_ends = find_odd_backslash_sequences<T>(in, prev_iter_ends_odd_backslash); \ |
| 116 | + \ |
| 117 | + /* detect the insides of quote pairs ("quote_mask") and also the quote_bits */ \ |
| 118 | + /* themselves */ \ |
| 119 | + uint64_t quote_bits; \ |
| 120 | + uint64_t quote_mask = find_quote_mask_and_bits<T>( \ |
| 121 | + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \ |
| 122 | + \ |
| 123 | + /* take the previous iteration's structural bits, not the current iteration's, */ \ |
| 124 | + /* and flatten them before structurals is overwritten below */ \ |
| 125 | + flat(base_ptr, base, idx, structurals); \ |
| 126 | + \ |
| 127 | + uint64_t whitespace; \ |
| 128 | + find_whitespace_and_structurals<T>(in, whitespace, structurals); \ |
| 129 | + \ |
| 130 | + /* fix up structurals to reflect quotes and add pseudo-structural characters */ \ |
| 131 | + structurals = finalize_structurals(structurals, whitespace, quote_mask, \ |
| 132 | + quote_bits, prev_iter_ends_pseudo_pred); \ |
| 133 | +} \ |
85 | 134 |
|
86 | 135 |
|
87 | 136 | // We need to compile this code for multiple architectures. However, a target attribute can be |
88 | 137 | // applied only once per function definition. A huge macro seemed better than huge code duplication. |
89 | | -// FIND_STRUCTURAL_BITS(architecture T, const uint8_t *buf, size_t len, ParsedJson &pj) |
| 138 | +// errorValues FIND_STRUCTURAL_BITS(architecture T, const uint8_t *buf, size_t len, ParsedJson &pj, flatten function) |
90 | 139 | #define FIND_STRUCTURAL_BITS(T, buf, len, pj, flat) { \ |
91 | 140 | if (len > pj.bytecapacity) { \ |
92 | 141 | std::cerr << "Your ParsedJson object only supports documents up to " \ |
|
96 | 145 | } \ |
97 | 146 | uint32_t *base_ptr = pj.structural_indexes; \ |
98 | 147 | uint32_t base = 0; \ |
99 | | - utf8_checking_state<T> state; \ |
| 148 | + utf8_checking_state<T> utf8_state; \ |
100 | 149 | \ |
101 | 150 | /* we have padded the input out to 64 byte multiple with the remainder being */ \ |
102 | 151 | /* zeros */ \ |
|
126 | 175 | uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII code points < 0x20) */ \ |
127 | 176 | \ |
128 | 177 | for (; idx < lenminus64; idx += 64) { \ |
129 | | - \ |
130 | | - simd_input<T> in = fill_input<T>(buf+idx); \ |
131 | | - check_utf8<T>(in, state); \ |
132 | | - /* detect odd sequences of backslashes */ \ |
133 | | - uint64_t odd_ends = find_odd_backslash_sequences<T>( \ |
134 | | - in, prev_iter_ends_odd_backslash); \ |
135 | | - \ |
136 | | - /* detect insides of quote pairs ("quote_mask") and also our quote_bits */ \ |
137 | | - /* themselves */ \ |
138 | | - uint64_t quote_bits; \ |
139 | | - uint64_t quote_mask = find_quote_mask_and_bits<T>( \ |
140 | | - in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \ |
141 | | - \ |
142 | | - /* take the previous iterations structural bits, not our current iteration, */ \ |
143 | | - /* and flatten */ \ |
144 | | - flat(base_ptr, base, idx, structurals); \ |
145 | | - \ |
146 | | - uint64_t whitespace; \ |
147 | | - find_whitespace_and_structurals<T>(in, whitespace, structurals); \ |
148 | | - \ |
149 | | - /* fixup structurals to reflect quotes and add pseudo-structural characters */ \ |
150 | | - structurals = finalize_structurals(structurals, whitespace, quote_mask, \ |
151 | | - quote_bits, prev_iter_ends_pseudo_pred); \ |
| 178 | + FIND_STRUCTURAL_BITS_64(T, &buf[idx], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ |
| 179 | + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ |
| 180 | + error_mask, utf8_state, flat); \ |
152 | 181 | } \ |
153 | | - \ |
154 | | - /*////////////// */ \ |
155 | | - /*/ we use a giant copy-paste which is ugly. */ \ |
156 | | - /*/ but otherwise the string needs to be properly padded or else we */ \ |
157 | | - /*/ risk invalidating the UTF-8 checks. */ \ |
158 | | - /*////////// */ \ |
| 182 | + /* If we have a final chunk of less than 64 bytes, pad it to 64 with spaces */ \ |
| 183 | + /* before processing it (otherwise, we risk invalidating the UTF-8 checks). */ \ |
159 | 184 | if (idx < len) { \ |
160 | 185 | uint8_t tmpbuf[64]; \ |
161 | 186 | memset(tmpbuf, 0x20, 64); \ |
162 | 187 | memcpy(tmpbuf, buf + idx, len - idx); \ |
163 | | - simd_input<T> in = fill_input<T>(tmpbuf); \ |
164 | | - check_utf8<T>(in, state); \ |
165 | | - \ |
166 | | - /* detect odd sequences of backslashes */ \ |
167 | | - uint64_t odd_ends = find_odd_backslash_sequences<T>( \ |
168 | | - in, prev_iter_ends_odd_backslash); \ |
169 | | - \ |
170 | | - /* detect insides of quote pairs ("quote_mask") and also our quote_bits */ \ |
171 | | - /* themselves */ \ |
172 | | - uint64_t quote_bits; \ |
173 | | - uint64_t quote_mask = find_quote_mask_and_bits<T>( \ |
174 | | - in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); \ |
175 | | - \ |
176 | | - /* take the previous iterations structural bits, not our current iteration, */ \ |
177 | | - /* and flatten */ \ |
178 | | - flat(base_ptr, base, idx, structurals); \ |
179 | | - \ |
180 | | - uint64_t whitespace; \ |
181 | | - find_whitespace_and_structurals<T>(in, whitespace, structurals); \ |
182 | | - \ |
183 | | - /* fixup structurals to reflect quotes and add pseudo-structural characters */ \ |
184 | | - structurals = finalize_structurals(structurals, whitespace, quote_mask, \ |
185 | | - quote_bits, prev_iter_ends_pseudo_pred); \ |
| 188 | + FIND_STRUCTURAL_BITS_64(T, &tmpbuf[0], idx, base_ptr, base, prev_iter_ends_odd_backslash, \ |
| 189 | + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, structurals, \ |
| 190 | + error_mask, utf8_state, flat); \ |
186 | 191 | idx += 64; \ |
187 | 192 | } \ |
188 | 193 | \ |
|
192 | 197 | } \ |
193 | 198 | \ |
194 | 199 | /* finally, flatten out the remaining structurals from the last iteration */ \ |
195 | | - flat(base_ptr, base, idx, structurals); \ |
| 200 | + flat(base_ptr, base, idx, structurals); \ |
196 | 201 | \ |
197 | 202 | pj.n_structural_indexes = base; \ |
198 | 203 | /* a valid JSON file cannot have zero structural indexes - we should have */ \ |
|
213 | 218 | if (error_mask) { \ |
214 | 219 | return simdjson::UNESCAPED_CHARS; \ |
215 | 220 | } \ |
216 | | - return check_utf8_errors<T>(state); \ |
| 221 | + return check_utf8_errors<T>(utf8_state); \ |
217 | 222 | } |
218 | 223 |
|
219 | 224 |
|
|
0 commit comments