1- #ifndef SIMDJSON_JSONCHARUTILS_H
2- #define SIMDJSON_JSONCHARUTILS_H
1+ #ifndef SIMDJSON_JSONCHARUTILS_TABLES_H
2+ #define SIMDJSON_JSONCHARUTILS_TABLES_H
33
44#include " simdjson.h"
55
@@ -34,12 +34,6 @@ const uint32_t structural_or_whitespace_or_null_negated[256] = {
3434 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
3535 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 };
3636
37- // return non-zero if not a structural, whitespace or null char
38- // zero otherwise
39- really_inline uint32_t is_not_structural_or_whitespace_or_null (uint8_t c) {
40- return structural_or_whitespace_or_null_negated[c];
41- }
42-
4337const uint32_t structural_or_whitespace_negated[256 ] = {
4438 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 1 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
4539 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
@@ -57,12 +51,6 @@ const uint32_t structural_or_whitespace_negated[256] = {
5751 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
5852 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 };
5953
60- // return non-zero if not a structural or whitespace char
61- // zero otherwise
62- really_inline uint32_t is_not_structural_or_whitespace (uint8_t c) {
63- return structural_or_whitespace_negated[c];
64- }
65-
6654const uint32_t structural_or_whitespace_or_null[256 ] = {
6755 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
6856 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 ,
@@ -76,10 +64,6 @@ const uint32_t structural_or_whitespace_or_null[256] = {
7664 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
7765 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
7866
79- really_inline uint32_t is_structural_or_whitespace_or_null (uint8_t c) {
80- return structural_or_whitespace_or_null[c];
81- }
82-
8367const uint32_t structural_or_whitespace[256 ] = {
8468 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
8569 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 ,
@@ -93,10 +77,6 @@ const uint32_t structural_or_whitespace[256] = {
9377 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
9478 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
9579
96- really_inline uint32_t is_structural_or_whitespace (uint8_t c) {
97- return structural_or_whitespace[c];
98- }
99-
10080const uint32_t digit_to_val32[886 ] = {
10181 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF ,
10282 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF ,
@@ -246,62 +226,6 @@ const uint32_t digit_to_val32[886] = {
246226 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF ,
247227 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF ,
248228 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF };
249- // returns a value with the high 16 bits set if not valid
250- // otherwise returns the conversion of the 4 hex digits at src into the bottom
251- // 16 bits of the 32-bit return register
252- //
253- // see
254- // https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
255- static inline uint32_t hex_to_u32_nocheck (
256- const uint8_t *src) { // strictly speaking, static inline is a C-ism
257- uint32_t v1 = digit_to_val32[630 + src[0 ]];
258- uint32_t v2 = digit_to_val32[420 + src[1 ]];
259- uint32_t v3 = digit_to_val32[210 + src[2 ]];
260- uint32_t v4 = digit_to_val32[0 + src[3 ]];
261- return v1 | v2 | v3 | v4;
262- }
263-
264- // given a code point cp, writes its UTF-8 encoding
265- // to c and returns the length in bytes; a returned
266- // length of zero means the code point is invalid
267- // (larger than 0x10FFFF)
268- //
269- // This can possibly be made faster using pdep
270- // and clz and table lookups, but JSON documents
271- // have few escaped code points, and the following
272- // function looks cheap.
273- //
274- // Note: we assume that surrogates are treated separately
275- //
276- inline size_t codepoint_to_utf8 (uint32_t cp, uint8_t *c) {
277- if (cp <= 0x7F ) {
278- c[0 ] = uint8_t (cp);
279- return 1 ; // ascii
280- }
281- if (cp <= 0x7FF ) {
282- c[0 ] = uint8_t ((cp >> 6 ) + 192 );
283- c[1 ] = uint8_t ((cp & 63 ) + 128 );
284- return 2 ; // universal plane
285- // Surrogates are treated elsewhere...
286- // } //else if (0xd800 <= cp && cp <= 0xdfff) {
287- // return 0; // surrogates // could put assert here
288- } else if (cp <= 0xFFFF ) {
289- c[0 ] = uint8_t ((cp >> 12 ) + 224 );
290- c[1 ] = uint8_t (((cp >> 6 ) & 63 ) + 128 );
291- c[2 ] = uint8_t ((cp & 63 ) + 128 );
292- return 3 ;
293- } else if (cp <= 0x10FFFF ) { // if you know you have a valid code point, this
294- // is not needed
295- c[0 ] = uint8_t ((cp >> 18 ) + 240 );
296- c[1 ] = uint8_t (((cp >> 12 ) & 63 ) + 128 );
297- c[2 ] = uint8_t (((cp >> 6 ) & 63 ) + 128 );
298- c[3 ] = uint8_t ((cp & 63 ) + 128 );
299- return 4 ;
300- }
301- // will return 0 when the code point was too large.
302- return 0 ; // bad r
303- }
304-
305229// //
306230// The following code is used in number parsing. It is not
307231// properly "char utils" stuff, but we move it here so that
@@ -317,42 +241,6 @@ struct value128 {
317241 uint64_t high;
318242};
319243
320- #ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm
321- // this is a slow emulation routine for 32-bit
322- //
323- static inline uint64_t __emulu (uint32_t x, uint32_t y) {
324- return x * (uint64_t )y;
325- }
326- static inline uint64_t _umul128 (uint64_t ab, uint64_t cd, uint64_t *hi) {
327- uint64_t ad = __emulu ((uint32_t )(ab >> 32 ), (uint32_t )cd);
328- uint64_t bd = __emulu ((uint32_t )ab, (uint32_t )cd);
329- uint64_t adbc = ad + __emulu ((uint32_t )ab, (uint32_t )(cd >> 32 ));
330- uint64_t adbc_carry = !!(adbc < ad);
331- uint64_t lo = bd + (adbc << 32 );
332- *hi = __emulu ((uint32_t )(ab >> 32 ), (uint32_t )(cd >> 32 )) + (adbc >> 32 ) +
333- (adbc_carry << 32 ) + !!(lo < bd);
334- return lo;
335- }
336- #endif
337-
338- really_inline value128 full_multiplication (uint64_t value1, uint64_t value2) {
339- value128 answer;
340- #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS)
341- #ifdef _M_ARM64
342- // ARM64 has native support for 64-bit multiplications, no need to emulate
343- answer.high = __umulh (value1, value2);
344- answer.low = value1 * value2;
345- #else
346- answer.low = _umul128 (value1, value2, &answer.high ); // _umul128 not available on ARM64
347- #endif // _M_ARM64
348- #else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS)
349- __uint128_t r = ((__uint128_t )value1) * value2;
350- answer.low = uint64_t (r);
351- answer.high = uint64_t (r >> 64 );
352- #endif
353- return answer;
354- }
355-
356244// Precomputed powers of ten from 10^0 to 10^22. These
357245// can be represented exactly using the double type.
358246static const double power_of_ten[] = {
@@ -1333,4 +1221,4 @@ const uint64_t mantissa_128[] = {
13331221
13341222} // namespace simdjson
13351223
1336- #endif // SIMDJSON_JSONCHARUTILS_H
1224+ #endif // SIMDJSON_JSONCHARUTILS_TABLES_H
0 commit comments