@@ -61,8 +61,9 @@ WARN_UNUSED
6161 // effectively the very first char is considered to follow "whitespace" for the
6262 // purposes of psuedo-structural character detection
6363 u64 prev_iter_ends_pseudo_pred = 1ULL ;
64-
65- for (size_t idx = 0 ; idx < len; idx += 64 ) {
64+ size_t lenminus64 = len + 1 < 64 ? 0 : len + 1 - 64 ; // len + 1 because of the NULL termination
65+ size_t idx = 0 ;
66+ for (; idx < lenminus64; idx += 64 ) {
6667 __builtin_prefetch (buf + idx + 128 );
6768#ifdef DEBUG
6869 cout << " Idx is " << idx << " \n " ;
@@ -249,21 +250,163 @@ WARN_UNUSED
249250 " final structurals and pseudo structurals after close quote removal" );
250251 *(u64 *)(pj.structurals + idx / 8 ) = structurals;
251252 }
253+
254+ // //////////////
255+ // / we use a giant copy-paste which is ugly.
256+ // / but otherwise the string needs to be properly padded or else we
257+ // / risk invalidating the UTF-8 checks.
258+ // //////////
259+ if (idx < len + 1 ) { // +1 due to NULL termination
260+ u8 tmpbuf[64 ];
261+ memset (tmpbuf,0x20 ,64 );
262+ memcpy (tmpbuf,buf+idx,len - idx + 1 );// +1 due to NULL termination
263+ m256 input_lo = _mm256_loadu_si256 ((const m256 *)(tmpbuf + 0 ));
264+ m256 input_hi = _mm256_loadu_si256 ((const m256 *)(tmpbuf + 32 ));
265+ #ifdef UTF8VALIDATE
266+ m256 highbit = _mm256_set1_epi8 (0x80 );
267+ if ((_mm256_testz_si256 (_mm256_or_si256 (input_lo, input_hi),highbit)) == 1 ) {
268+ // it is ascii, we just check continuation
269+ has_error = _mm256_or_si256 (
270+ _mm256_cmpgt_epi8 (previous.carried_continuations ,
271+ _mm256_setr_epi8 (9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
272+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
273+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 1 )),has_error);
274+
275+ } else {
276+ // it is not ascii so we have to do heavy work
277+ previous = avxcheckUTF8Bytes (input_lo, &previous, &has_error);
278+ previous = avxcheckUTF8Bytes (input_hi, &previous, &has_error);
279+ }
280+ #endif
281+ // //////////////////////////////////////////////////////////////////////////////////////////
282+ // Step 1: detect odd sequences of backslashes
283+ // //////////////////////////////////////////////////////////////////////////////////////////
284+
285+ u64 bs_bits =
286+ cmp_mask_against_input (input_lo, input_hi, _mm256_set1_epi8 (' \\ ' ));
287+ u64 start_edges = bs_bits & ~(bs_bits << 1 );
288+ // flip lowest if we have an odd-length run at the end of the prior
289+ // iteration
290+ u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
291+ u64 even_starts = start_edges & even_start_mask;
292+ u64 odd_starts = start_edges & ~even_start_mask;
293+ u64 even_carries = bs_bits + even_starts;
294+
295+ u64 odd_carries;
296+ // must record the carry-out of our odd-carries out of bit 63; this
297+ // indicates whether the sense of any edge going to the next iteration
298+ // should be flipped
299+ bool iter_ends_odd_backslash =
300+ __builtin_uaddll_overflow (bs_bits, odd_starts, &odd_carries);
301+
302+ odd_carries |=
303+ prev_iter_ends_odd_backslash; // push in bit zero as a potential end
304+ // if we had an odd-numbered run at the
305+ // end of the previous iteration
306+ prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL ;
307+ u64 even_carry_ends = even_carries & ~bs_bits;
308+ u64 odd_carry_ends = odd_carries & ~bs_bits;
309+ u64 even_start_odd_end = even_carry_ends & odd_bits;
310+ u64 odd_start_even_end = odd_carry_ends & even_bits;
311+ u64 odd_ends = even_start_odd_end | odd_start_even_end;
312+
313+ // //////////////////////////////////////////////////////////////////////////////////////////
314+ // Step 2: detect insides of quote pairs
315+ // //////////////////////////////////////////////////////////////////////////////////////////
316+
317+ u64 quote_bits =
318+ cmp_mask_against_input (input_lo, input_hi, _mm256_set1_epi8 (' "' ));
319+ quote_bits = quote_bits & ~odd_ends;
320+ u64 quote_mask = _mm_cvtsi128_si64 (_mm_clmulepi64_si128 (
321+ _mm_set_epi64x (0ULL , quote_bits), _mm_set1_epi8 (0xFF ), 0 ));
322+ quote_mask ^= prev_iter_inside_quote;
323+ prev_iter_inside_quote = (u64 )((s64)quote_mask >> 63 ); // right shift of a signed value expected to be well-defined and standard compliant as of C++20
324+
325+ // How do we build up a user traversable data structure
326+ // first, do a 'shufti' to detect structural JSON characters
327+ // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
328+ // these go into the first 3 buckets of the comparison (1/2/4)
329+
330+ // we are also interested in the four whitespace characters
331+ // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
332+ // these go into the next 2 buckets of the comparison (8/16)
333+ const m256 low_nibble_mask = _mm256_setr_epi8 (
334+ // 0 9 a b c d
335+ 16 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 8 , 12 , 1 , 2 , 9 , 0 , 0 , 16 , 0 , 0 , 0 , 0 , 0 , 0 ,
336+ 0 , 0 , 8 , 12 , 1 , 2 , 9 , 0 , 0 );
337+ const m256 high_nibble_mask = _mm256_setr_epi8 (
338+ // 0 2 3 5 7
339+ 8 , 0 , 18 , 4 , 0 , 1 , 0 , 1 , 0 , 0 , 0 , 3 , 2 , 1 , 0 , 0 , 8 , 0 , 18 , 4 , 0 , 1 , 0 ,
340+ 1 , 0 , 0 , 0 , 3 , 2 , 1 , 0 , 0 );
341+
342+ m256 structural_shufti_mask = _mm256_set1_epi8 (0x7 );
343+ m256 whitespace_shufti_mask = _mm256_set1_epi8 (0x18 );
344+
345+ m256 v_lo = _mm256_and_si256 (
346+ _mm256_shuffle_epi8 (low_nibble_mask, input_lo),
347+ _mm256_shuffle_epi8 (high_nibble_mask,
348+ _mm256_and_si256 (_mm256_srli_epi32 (input_lo, 4 ),
349+ _mm256_set1_epi8 (0x7f ))));
350+
351+ m256 v_hi = _mm256_and_si256 (
352+ _mm256_shuffle_epi8 (low_nibble_mask, input_hi),
353+ _mm256_shuffle_epi8 (high_nibble_mask,
354+ _mm256_and_si256 (_mm256_srli_epi32 (input_hi, 4 ),
355+ _mm256_set1_epi8 (0x7f ))));
356+ m256 tmp_lo = _mm256_cmpeq_epi8 (
357+ _mm256_and_si256 (v_lo, structural_shufti_mask), _mm256_set1_epi8 (0 ));
358+ m256 tmp_hi = _mm256_cmpeq_epi8 (
359+ _mm256_and_si256 (v_hi, structural_shufti_mask), _mm256_set1_epi8 (0 ));
360+
361+ u64 structural_res_0 = (u32 )_mm256_movemask_epi8 (tmp_lo);
362+ u64 structural_res_1 = _mm256_movemask_epi8 (tmp_hi);
363+ u64 structurals = ~(structural_res_0 | (structural_res_1 << 32 ));
364+
365+ // this additional mask and transfer is non-trivially expensive,
366+ // unfortunately
367+ m256 tmp_ws_lo = _mm256_cmpeq_epi8 (
368+ _mm256_and_si256 (v_lo, whitespace_shufti_mask), _mm256_set1_epi8 (0 ));
369+ m256 tmp_ws_hi = _mm256_cmpeq_epi8 (
370+ _mm256_and_si256 (v_hi, whitespace_shufti_mask), _mm256_set1_epi8 (0 ));
371+
372+ u64 ws_res_0 = (u32 )_mm256_movemask_epi8 (tmp_ws_lo);
373+ u64 ws_res_1 = _mm256_movemask_epi8 (tmp_ws_hi);
374+ u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32 ));
375+
376+
377+ // mask off anything inside quotes
378+ structurals &= ~quote_mask;
379+
380+ // add the real quote bits back into our bitmask as well, so we can
381+ // quickly traverse the strings we've spent all this trouble gathering
382+ structurals |= quote_bits;
383+
384+ // Now, establish "pseudo-structural characters". These are non-whitespace
385+ // characters that are (a) outside quotes and (b) have a predecessor that's
386+ // either whitespace or a structural character. This means that subsequent
387+ // passes will get a chance to encounter the first character of every string
388+ // of non-whitespace and, if we're parsing an atom like true/false/null or a
389+ // number we can stop at the first whitespace or structural character
390+ // following it.
391+
392+ // a qualified predecessor is something that can happen 1 position before an
393+ // psuedo-structural character
394+ u64 pseudo_pred = structurals | whitespace;
395+ u64 shifted_pseudo_pred = (pseudo_pred << 1 ) | prev_iter_ends_pseudo_pred;
396+ prev_iter_ends_pseudo_pred = pseudo_pred >> 63 ;
397+ u64 pseudo_structurals =
398+ shifted_pseudo_pred & (~whitespace) & (~quote_mask);
399+ structurals |= pseudo_structurals;
400+
401+ // now, we've used our close quotes all we need to. So let's switch them off
402+ // they will be off in the quote mask and on in quote bits.
403+ structurals &= ~(quote_bits & ~quote_mask);
404+ *(u64 *)(pj.structurals + idx / 8 ) = structurals;
405+ }
252406 if (buf[len] != ' \0 ' ) {
253407 std::cerr << " Your string should be NULL terminated." << std::endl;
254408 return false ;
255409 }
256- // we are going to zero out everything after len:
257- size_t count_last_64bits = len % 64 ;
258- if (count_last_64bits != 0 ) { // we have a "final" word where only count_last_64bits matter
259- u64 lastword = *(u64 *)(pj.structurals + len / 8 );
260- printf (" last word %zu \n " , lastword);
261- printf (" count_last_64bits%zu \n " , count_last_64bits);
262- lastword &= ( UINT64_C (1 ) << count_last_64bits) - 1 ;
263- *(u64 *)(pj.structurals + len / 8 ) = lastword;
264- }
265-
266- // pj.structural_indexes[pj.n_structural_indexes++] = len; // the final NULL is used as a pseudo-structural character
267410#ifdef UTF8VALIDATE
268411 return _mm256_testz_si256 (has_error, has_error);
269412#else
0 commit comments