forked from simdjson/simdjson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstringparsing.h
More file actions
145 lines (133 loc) · 6.15 KB
/
stringparsing.h
File metadata and controls
145 lines (133 loc) · 6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// This file contains the common code every implementation uses
// It is intended to be included multiple times and compiled multiple times
// We assume the file in which it is include already includes
// "stringparsing.h" (this simplifies amalgation)
// begin copypasta
// These chars yield themselves: " \ /
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
// u not handled in this table as it's complex
static const uint8_t escape_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
// handle a unicode codepoint
// write appropriate values into dest
// src will advance 6 bytes or 12 bytes
// dest will advance a variable amount (return via pointer)
// return true if the unicode codepoint was valid
// We work in little-endian then swap at write time
WARN_UNUSED
really_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
uint8_t **dst_ptr) {
// hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
// conversion isn't valid; we defer the check for this to inside the
// multilingual plane check
uint32_t code_point = hex_to_u32_nocheck(*src_ptr + 2);
*src_ptr += 6;
// check for low surrogate for characters outside the Basic
// Multilingual Plane.
if (code_point >= 0xd800 && code_point < 0xdc00) {
if (((*src_ptr)[0] != '\\') || (*src_ptr)[1] != 'u') {
return false;
}
uint32_t code_point_2 = hex_to_u32_nocheck(*src_ptr + 2);
// if the first code point is invalid we will get here, as we will go past
// the check for being outside the Basic Multilingual plane. If we don't
// find a \u immediately afterwards we fail out anyhow, but if we do,
// this check catches both the case of the first code point being invalid
// or the second code point being invalid.
if ((code_point | code_point_2) >> 16) {
return false;
}
code_point =
(((code_point - 0xd800) << 10) | (code_point_2 - 0xdc00)) + 0x10000;
*src_ptr += 6;
}
size_t offset = codepoint_to_utf8(code_point, *dst_ptr);
*dst_ptr += offset;
return offset > 0;
}
WARN_UNUSED really_inline bool parse_string(UNUSED const uint8_t *buf,
UNUSED size_t len, ParsedJson &pj,
UNUSED const uint32_t depth,
UNUSED uint32_t offset) {
pj.write_tape(pj.current_string_buf_loc - pj.string_buf, '"');
const uint8_t *src = &buf[offset + 1]; /* we know that buf at offset is a " */
uint8_t *dst = pj.current_string_buf_loc + sizeof(uint32_t);
const uint8_t *const start_of_string = dst;
while (1) {
parse_string_helper helper = find_bs_bits_and_quote_bits(src, dst);
if (((helper.bs_bits - 1) & helper.quote_bits) != 0) {
/* we encountered quotes first. Move dst to point to quotes and exit
*/
/* find out where the quote is... */
auto quote_dist = trailing_zeroes(helper.quote_bits);
/* NULL termination is still handy if you expect all your strings to
* be NULL terminated? */
/* It comes at a small cost */
dst[quote_dist] = 0;
uint32_t str_length = (dst - start_of_string) + quote_dist;
memcpy(pj.current_string_buf_loc, &str_length, sizeof(str_length));
/*****************************
* Above, check for overflow in case someone has a crazy string
* (>=4GB?) _
* But only add the overflow check when the document itself exceeds
* 4GB
* Currently unneeded because we refuse to parse docs larger or equal
* to 4GB.
****************************/
/* we advance the point, accounting for the fact that we have a NULL
* termination */
pj.current_string_buf_loc = dst + quote_dist + 1;
return true;
}
if (((helper.quote_bits - 1) & helper.bs_bits) != 0) {
/* find out where the backspace is */
auto bs_dist = trailing_zeroes(helper.bs_bits);
uint8_t escape_char = src[bs_dist + 1];
/* we encountered backslash first. Handle backslash */
if (escape_char == 'u') {
/* move src/dst up to the start; they will be further adjusted
within the unicode codepoint handling code. */
src += bs_dist;
dst += bs_dist;
if (!handle_unicode_codepoint(&src, &dst)) {
return false;
}
} else {
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
* write bs_dist+1 characters to output
* note this may reach beyond the part of the buffer we've actually
* seen. I think this is ok */
uint8_t escape_result = escape_map[escape_char];
if (escape_result == 0u) {
return false; /* bogus escape value is an error */
}
dst[bs_dist] = escape_result;
src += bs_dist + 2;
dst += bs_dist + 1;
}
} else {
/* they are the same. Since they can't co-occur, it means we
* encountered neither. */
src += parse_string_helper::BYTES_PROCESSED;
dst += parse_string_helper::BYTES_PROCESSED;
}
}
/* can't be reached */
return true;
}