|
| 1 | +// This file contains the common code every implementation uses in stage1 |
| 2 | +// It is intended to be included multiple times and compiled multiple times |
| 3 | +// We assume the file in which it is include already includes |
| 4 | +// "simdjson/stage1_find_marks.h" (this simplifies amalgation) |
| 5 | + |
| 6 | +#ifdef TARGETED_ARCHITECTURE |
| 7 | + |
| 8 | +// return a bitvector indicating where we have characters that end an odd-length |
| 9 | +// sequence of backslashes (and thus change the behavior of the next character |
| 10 | +// to follow). A even-length sequence of backslashes, and, for that matter, the |
| 11 | +// largest even-length prefix of our odd-length sequence of backslashes, simply |
| 12 | +// modify the behavior of the backslashes themselves. |
| 13 | +// We also update the prev_iter_ends_odd_backslash reference parameter to |
| 14 | +// indicate whether we end an iteration on an odd-length sequence of |
| 15 | +// backslashes, which modifies our subsequent search for odd-length |
| 16 | +// sequences of backslashes in an obvious way. |
| 17 | +template <> |
| 18 | +really_inline uint64_t find_odd_backslash_sequences<TARGETED_ARCHITECTURE>( |
| 19 | + simd_input<TARGETED_ARCHITECTURE> in, |
| 20 | + uint64_t &prev_iter_ends_odd_backslash) { |
| 21 | + const uint64_t even_bits = 0x5555555555555555ULL; |
| 22 | + const uint64_t odd_bits = ~even_bits; |
| 23 | + uint64_t bs_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '\\'); |
| 24 | + uint64_t start_edges = bs_bits & ~(bs_bits << 1); |
| 25 | + /* flip lowest if we have an odd-length run at the end of the prior |
| 26 | + * iteration */ |
| 27 | + uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; |
| 28 | + uint64_t even_starts = start_edges & even_start_mask; |
| 29 | + uint64_t odd_starts = start_edges & ~even_start_mask; |
| 30 | + uint64_t even_carries = bs_bits + even_starts; |
| 31 | + |
| 32 | + uint64_t odd_carries; |
| 33 | + /* must record the carry-out of our odd-carries out of bit 63; this |
| 34 | + * indicates whether the sense of any edge going to the next iteration |
| 35 | + * should be flipped */ |
| 36 | + bool iter_ends_odd_backslash = |
| 37 | + add_overflow(bs_bits, odd_starts, &odd_carries); |
| 38 | + |
| 39 | + odd_carries |= prev_iter_ends_odd_backslash; /* push in bit zero as a |
| 40 | + * potential end if we had an |
| 41 | + * odd-numbered run at the |
| 42 | + * end of the previous |
| 43 | + * iteration */ |
| 44 | + prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL; |
| 45 | + uint64_t even_carry_ends = even_carries & ~bs_bits; |
| 46 | + uint64_t odd_carry_ends = odd_carries & ~bs_bits; |
| 47 | + uint64_t even_start_odd_end = even_carry_ends & odd_bits; |
| 48 | + uint64_t odd_start_even_end = odd_carry_ends & even_bits; |
| 49 | + uint64_t odd_ends = even_start_odd_end | odd_start_even_end; |
| 50 | + return odd_ends; |
| 51 | +} |
| 52 | + |
| 53 | +// return both the quote mask (which is a half-open mask that covers the first |
| 54 | +// quote |
| 55 | +// in an unescaped quote pair and everything in the quote pair) and the quote |
| 56 | +// bits, which are the simple |
| 57 | +// unescaped quoted bits. We also update the prev_iter_inside_quote value to |
| 58 | +// tell the next iteration |
| 59 | +// whether we finished the final iteration inside a quote pair; if so, this |
| 60 | +// inverts our behavior of |
| 61 | +// whether we're inside quotes for the next iteration. |
| 62 | +// Note that we don't do any error checking to see if we have backslash |
| 63 | +// sequences outside quotes; these |
| 64 | +// backslash sequences (of any length) will be detected elsewhere. |
| 65 | +template <> |
| 66 | +really_inline uint64_t find_quote_mask_and_bits<TARGETED_ARCHITECTURE>( |
| 67 | + simd_input<TARGETED_ARCHITECTURE> in, uint64_t odd_ends, |
| 68 | + uint64_t &prev_iter_inside_quote, uint64_t "e_bits, |
| 69 | + uint64_t &error_mask) { |
| 70 | + quote_bits = cmp_mask_against_input<TARGETED_ARCHITECTURE>(in, '"'); |
| 71 | + quote_bits = quote_bits & ~odd_ends; |
| 72 | + uint64_t quote_mask = compute_quote_mask<TARGETED_ARCHITECTURE>(quote_bits); |
| 73 | + quote_mask ^= prev_iter_inside_quote; |
| 74 | + /* All Unicode characters may be placed within the |
| 75 | + * quotation marks, except for the characters that MUST be escaped: |
| 76 | + * quotation mark, reverse solidus, and the control characters (U+0000 |
| 77 | + * through U+001F). |
| 78 | + * https://tools.ietf.org/html/rfc8259 */ |
| 79 | + uint64_t unescaped = |
| 80 | + unsigned_lteq_against_input<TARGETED_ARCHITECTURE>(in, 0x1F); |
| 81 | + error_mask |= quote_mask & unescaped; |
| 82 | + /* right shift of a signed value expected to be well-defined and standard |
| 83 | + * compliant as of C++20, |
| 84 | + * John Regher from Utah U. says this is fine code */ |
| 85 | + prev_iter_inside_quote = |
| 86 | + static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63); |
| 87 | + return quote_mask; |
| 88 | +} |
| 89 | + |
| 90 | +// Find structural bits in a 64-byte chunk. |
| 91 | +void find_structural_bits_64( |
| 92 | + const uint8_t *buf, size_t idx, uint32_t *base_ptr, uint32_t &base, |
| 93 | + uint64_t &prev_iter_ends_odd_backslash, uint64_t &prev_iter_inside_quote, |
| 94 | + uint64_t &prev_iter_ends_pseudo_pred, uint64_t &structurals, |
| 95 | + uint64_t &error_mask, |
| 96 | + utf8_checking_state<TARGETED_ARCHITECTURE> &utf8_state) { |
| 97 | + simd_input<TARGETED_ARCHITECTURE> in = fill_input<TARGETED_ARCHITECTURE>(buf); |
| 98 | + check_utf8<TARGETED_ARCHITECTURE>(in, utf8_state); |
| 99 | + /* detect odd sequences of backslashes */ |
| 100 | + uint64_t odd_ends = find_odd_backslash_sequences<TARGETED_ARCHITECTURE>( |
| 101 | + in, prev_iter_ends_odd_backslash); |
| 102 | + |
| 103 | + /* detect insides of quote pairs ("quote_mask") and also our quote_bits |
| 104 | + * themselves */ |
| 105 | + uint64_t quote_bits; |
| 106 | + uint64_t quote_mask = find_quote_mask_and_bits<TARGETED_ARCHITECTURE>( |
| 107 | + in, odd_ends, prev_iter_inside_quote, quote_bits, error_mask); |
| 108 | + |
| 109 | + /* take the previous iterations structural bits, not our current |
| 110 | + * iteration, |
| 111 | + * and flatten */ |
| 112 | +#ifdef IS_X86_64 |
| 113 | + if (TARGETED_ARCHITECTURE == Architecture::HASWELL) |
| 114 | + simdjson::haswell::flatten_bits(base_ptr, base, idx, structurals); |
| 115 | + else |
| 116 | +#endif |
| 117 | + simdjson::flatten_bits(base_ptr, base, idx, structurals); |
| 118 | + |
| 119 | + uint64_t whitespace; |
| 120 | + find_whitespace_and_structurals<TARGETED_ARCHITECTURE>(in, whitespace, |
| 121 | + structurals); |
| 122 | + |
| 123 | + /* fixup structurals to reflect quotes and add pseudo-structural |
| 124 | + * characters */ |
| 125 | + structurals = finalize_structurals(structurals, whitespace, quote_mask, |
| 126 | + quote_bits, prev_iter_ends_pseudo_pred); |
| 127 | +} |
| 128 | + |
| 129 | +template <> |
| 130 | +int find_structural_bits<TARGETED_ARCHITECTURE>(const uint8_t *buf, size_t len, |
| 131 | + ParsedJson &pj) { |
| 132 | + if (len > pj.byte_capacity) { |
| 133 | + std::cerr << "Your ParsedJson object only supports documents up to " |
| 134 | + << pj.byte_capacity << " bytes but you are trying to process " |
| 135 | + << len << " bytes" << std::endl; |
| 136 | + return simdjson::CAPACITY; |
| 137 | + } |
| 138 | + uint32_t *base_ptr = pj.structural_indexes; |
| 139 | + uint32_t base = 0; |
| 140 | + utf8_checking_state<TARGETED_ARCHITECTURE> utf8_state; |
| 141 | + |
| 142 | + /* we have padded the input out to 64 byte multiple with the remainder |
| 143 | + * being zeros persistent state across loop does the last iteration end |
| 144 | + * with an odd-length sequence of backslashes? */ |
| 145 | + |
| 146 | + /* either 0 or 1, but a 64-bit value */ |
| 147 | + uint64_t prev_iter_ends_odd_backslash = 0ULL; |
| 148 | + /* does the previous iteration end inside a double-quote pair? */ |
| 149 | + uint64_t prev_iter_inside_quote = |
| 150 | + 0ULL; /* either all zeros or all ones |
| 151 | + * does the previous iteration end on something that is a |
| 152 | + * predecessor of a pseudo-structural character - i.e. |
| 153 | + * whitespace or a structural character effectively the very |
| 154 | + * first char is considered to follow "whitespace" for the |
| 155 | + * purposes of pseudo-structural character detection so we |
| 156 | + * initialize to 1 */ |
| 157 | + uint64_t prev_iter_ends_pseudo_pred = 1ULL; |
| 158 | + |
| 159 | + /* structurals are persistent state across loop as we flatten them on the |
| 160 | + * subsequent iteration into our array pointed to be base_ptr. |
| 161 | + * This is harmless on the first iteration as structurals==0 |
| 162 | + * and is done for performance reasons; we can hide some of the latency of |
| 163 | + * the |
| 164 | + * expensive carryless multiply in the previous step with this work */ |
| 165 | + uint64_t structurals = 0; |
| 166 | + |
| 167 | + size_t lenminus64 = len < 64 ? 0 : len - 64; |
| 168 | + size_t idx = 0; |
| 169 | + uint64_t error_mask = 0; /* for unescaped characters within strings (ASCII |
| 170 | + code points < 0x20) */ |
| 171 | + |
| 172 | + for (; idx < lenminus64; idx += 64) { |
| 173 | + find_structural_bits_64(&buf[idx], idx, base_ptr, base, |
| 174 | + prev_iter_ends_odd_backslash, |
| 175 | + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, |
| 176 | + structurals, error_mask, utf8_state); |
| 177 | + } |
| 178 | + /* If we have a final chunk of less than 64 bytes, pad it to 64 with |
| 179 | + * spaces before processing it (otherwise, we risk invalidating the UTF-8 |
| 180 | + * checks). */ |
| 181 | + if (idx < len) { |
| 182 | + uint8_t tmp_buf[64]; |
| 183 | + memset(tmp_buf, 0x20, 64); |
| 184 | + memcpy(tmp_buf, buf + idx, len - idx); |
| 185 | + find_structural_bits_64(&tmp_buf[0], idx, base_ptr, base, |
| 186 | + prev_iter_ends_odd_backslash, |
| 187 | + prev_iter_inside_quote, prev_iter_ends_pseudo_pred, |
| 188 | + structurals, error_mask, utf8_state); |
| 189 | + idx += 64; |
| 190 | + } |
| 191 | + |
| 192 | + /* is last string quote closed? */ |
| 193 | + if (prev_iter_inside_quote) { |
| 194 | + return simdjson::UNCLOSED_STRING; |
| 195 | + } |
| 196 | + |
| 197 | + /* finally, flatten out the remaining structurals from the last iteration |
| 198 | + */ |
| 199 | +#ifdef IS_X86_64 |
| 200 | + if (TARGETED_ARCHITECTURE == Architecture::HASWELL) |
| 201 | + simdjson::haswell::flatten_bits(base_ptr, base, idx, structurals); |
| 202 | + else |
| 203 | +#endif |
| 204 | + simdjson::flatten_bits(base_ptr, base, idx, structurals); |
| 205 | + |
| 206 | + pj.n_structural_indexes = base; |
| 207 | + /* a valid JSON file cannot have zero structural indexes - we should have |
| 208 | + * found something */ |
| 209 | + if (pj.n_structural_indexes == 0u) { |
| 210 | + return simdjson::EMPTY; |
| 211 | + } |
| 212 | + if (base_ptr[pj.n_structural_indexes - 1] > len) { |
| 213 | + return simdjson::UNEXPECTED_ERROR; |
| 214 | + } |
| 215 | + if (len != base_ptr[pj.n_structural_indexes - 1]) { |
| 216 | + /* the string might not be NULL terminated, but we add a virtual NULL |
| 217 | + * ending |
| 218 | + * character. */ |
| 219 | + base_ptr[pj.n_structural_indexes++] = len; |
| 220 | + } |
| 221 | + /* make it safe to dereference one beyond this array */ |
| 222 | + base_ptr[pj.n_structural_indexes] = 0; |
| 223 | + if (error_mask) { |
| 224 | + return simdjson::UNESCAPED_CHARS; |
| 225 | + } |
| 226 | + return check_utf8_errors<TARGETED_ARCHITECTURE>(utf8_state); |
| 227 | +} |
| 228 | + |
| 229 | +#endif // TARGETED_ARCHITECTURE |
0 commit comments