Skip to content

Commit 76c7066

Browse files
authored
Move stage 2 tape writing to ParsedJson (simdjson#477)
This is a first step to allowing alternate tape formats.
1 parent 0c8f2b9 commit 76c7066

File tree

7 files changed

+225
-158
lines changed

7 files changed

+225
-158
lines changed

include/simdjson/jsonparser.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ int json_parse_implementation(const uint8_t *buf, size_t len, ParsedJson &pj,
3535
if (reallocated) { // must free before we exit
3636
aligned_free((void *)buf);
3737
}
38-
pj.error_code = stage1_is_ok;
3938
return pj.error_code;
4039
}
4140
int res = unified_machine<T>(buf, len, pj);

include/simdjson/parsedjson.h

Lines changed: 108 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -60,49 +60,105 @@ class ParsedJson {
6060
WARN_UNUSED
6161
bool dump_raw_tape(std::ostream &os) const;
6262

63-
// all nodes are stored on the tape using a 64-bit word.
64-
//
65-
// strings, double and ints are stored as
66-
// a 64-bit word with a pointer to the actual value
67-
//
68-
//
69-
//
70-
// for objects or arrays, store [ or { at the beginning and } and ] at the
71-
// end. For the openings ([ or {), we annotate them with a reference to the
72-
// location on the tape of the end, and for the closings (} and ]), we
73-
// annotate them with a reference to the location of the opening
74-
//
75-
//
63+
really_inline ErrorValues on_error(ErrorValues new_error_code) {
64+
error_code = new_error_code;
65+
return new_error_code;
66+
}
67+
really_inline ErrorValues on_success(ErrorValues success_code) {
68+
error_code = success_code;
69+
valid = true;
70+
return success_code;
71+
}
72+
really_inline bool on_start_document(uint32_t depth) {
73+
containing_scope_offset[depth] = get_current_loc();
74+
write_tape(0, 'r');
75+
return true;
76+
}
77+
really_inline bool on_start_object(uint32_t depth) {
78+
containing_scope_offset[depth] = get_current_loc();
79+
write_tape(0, '{');
80+
return true;
81+
}
82+
really_inline bool on_start_array(uint32_t depth) {
83+
containing_scope_offset[depth] = get_current_loc();
84+
write_tape(0, '[');
85+
return true;
86+
}
87+
// TODO we're not checking this bool
88+
really_inline bool on_end_document(uint32_t depth) {
89+
// write our tape location to the header scope
90+
// The root scope gets written *at* the previous location.
91+
annotate_previous_loc(containing_scope_offset[depth], get_current_loc());
92+
write_tape(containing_scope_offset[depth], 'r');
93+
return true;
94+
}
95+
really_inline bool on_end_object(uint32_t depth) {
96+
// write our tape location to the header scope
97+
write_tape(containing_scope_offset[depth], '}');
98+
annotate_previous_loc(containing_scope_offset[depth], get_current_loc());
99+
return true;
100+
}
101+
really_inline bool on_end_array(uint32_t depth) {
102+
// write our tape location to the header scope
103+
write_tape(containing_scope_offset[depth], ']');
104+
annotate_previous_loc(containing_scope_offset[depth], get_current_loc());
105+
return true;
106+
}
76107

77-
// this should be considered a private function
78-
really_inline void write_tape(uint64_t val, uint8_t c) {
79-
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
108+
really_inline bool on_true_atom() {
109+
write_tape(0, 't');
110+
return true;
111+
}
112+
really_inline bool on_false_atom() {
113+
write_tape(0, 'f');
114+
return true;
115+
}
116+
really_inline bool on_null_atom() {
117+
write_tape(0, 'n');
118+
return true;
119+
}
120+
121+
really_inline uint8_t *on_start_string() {
122+
/* we advance the point, accounting for the fact that we have a NULL
123+
* termination */
124+
write_tape(current_string_buf_loc - string_buf.get(), '"');
125+
return current_string_buf_loc + sizeof(uint32_t);
126+
}
127+
128+
really_inline bool on_end_string(uint8_t *dst) {
129+
uint32_t str_length = dst - (current_string_buf_loc + sizeof(uint32_t));
130+
// TODO check for overflow in case someone has a crazy string (>=4GB?)
131+
// But only add the overflow check when the document itself exceeds 4GB
132+
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
133+
memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
134+
// NULL termination is still handy if you expect all your strings to
135+
// be NULL terminated? It comes at a small cost
136+
*dst = 0;
137+
current_string_buf_loc = dst + 1;
138+
return true;
80139
}
81140

82-
really_inline void write_tape_s64(int64_t i) {
141+
really_inline bool on_number_s64(int64_t value) {
83142
write_tape(0, 'l');
84-
std::memcpy(&tape[current_loc], &i, sizeof(i));
143+
std::memcpy(&tape[current_loc], &value, sizeof(value));
85144
++current_loc;
145+
return true;
86146
}
87-
88-
really_inline void write_tape_u64(uint64_t i) {
147+
really_inline bool on_number_u64(uint64_t value) {
89148
write_tape(0, 'u');
90-
tape[current_loc++] = i;
149+
tape[current_loc++] = value;
150+
return true;
91151
}
92-
93-
really_inline void write_tape_double(double d) {
152+
really_inline bool on_number_double(double value) {
94153
write_tape(0, 'd');
95-
static_assert(sizeof(d) == sizeof(tape[current_loc]), "mismatch size");
96-
memcpy(&tape[current_loc++], &d, sizeof(double));
154+
static_assert(sizeof(value) == sizeof(tape[current_loc]), "mismatch size");
155+
memcpy(&tape[current_loc++], &value, sizeof(double));
97156
// tape[current_loc++] = *((uint64_t *)&d);
157+
return true;
98158
}
99159

100160
really_inline uint32_t get_current_loc() const { return current_loc; }
101161

102-
really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
103-
tape[saved_loc] |= val;
104-
}
105-
106162
struct InvalidJSON : public std::exception {
107163
const char *what() const noexcept { return "JSON document is invalid"; }
108164
};
@@ -134,6 +190,29 @@ class ParsedJson {
134190
bool valid{false};
135191
int error_code{simdjson::UNINITIALIZED};
136192

193+
private:
194+
// all nodes are stored on the tape using a 64-bit word.
195+
//
196+
// strings, double and ints are stored as
197+
// a 64-bit word with a pointer to the actual value
198+
//
199+
//
200+
//
201+
// for objects or arrays, store [ or { at the beginning and } and ] at the
202+
// end. For the openings ([ or {), we annotate them with a reference to the
203+
// location on the tape of the end, and for the closings (} and ]), we
204+
// annotate them with a reference to the location of the opening
205+
//
206+
//
207+
208+
// this should be considered a private function
209+
really_inline void write_tape(uint64_t val, uint8_t c) {
210+
tape[current_loc++] = val | ((static_cast<uint64_t>(c)) << 56);
211+
}
212+
213+
really_inline void annotate_previous_loc(uint32_t saved_loc, uint64_t val) {
214+
tape[saved_loc] |= val;
215+
}
137216
};
138217

139218

src/generic/numberparsing.h

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
namespace numberparsing {
12

23
// Allowable floating-point values range
34
// std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
@@ -75,7 +76,7 @@ static const double power_of_ten[] = {
7576
1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303,
7677
1e304, 1e305, 1e306, 1e307, 1e308};
7778

78-
static inline bool is_integer(char c) {
79+
really_inline bool is_integer(char c) {
7980
return (c >= '0' && c <= '9');
8081
// this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
8182
}
@@ -104,7 +105,7 @@ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
104105
// check quickly whether the next 8 chars are made of digits
105106
// at a glance, it looks better than Mula's
106107
// http://0x80.pl/articles/swar-digits-validate.html
107-
static inline bool is_made_of_eight_digits_fast(const char *chars) {
108+
really_inline bool is_made_of_eight_digits_fast(const char *chars) {
108109
uint64_t val;
109110
// this can read up to 7 bytes beyond the buffer size, but we require
110111
// SIMDJSON_PADDING of padding
@@ -123,7 +124,7 @@ static inline bool is_made_of_eight_digits_fast(const char *chars) {
123124
//
124125
// This function computes base * 10 ^ (- negative_exponent ).
125126
// It is only ever going to be used when negative_exponent is tiny.
126-
static double subnormal_power10(double base, int64_t negative_exponent) {
127+
really_inline double subnormal_power10(double base, int64_t negative_exponent) {
127128
// avoid integer overflows in the pow expression, those values would
128129
// become zero anyway.
129130
if(negative_exponent < -1000) {
@@ -144,8 +145,8 @@ static double subnormal_power10(double base, int64_t negative_exponent) {
144145
//
145146
// Note: a redesign could avoid this function entirely.
146147
//
147-
static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
148-
const uint32_t offset, bool found_minus) {
148+
never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
149+
const uint32_t offset, bool found_minus) {
149150
const char *p = reinterpret_cast<const char *>(buf + offset);
150151
bool negative = false;
151152
if (found_minus) {
@@ -268,7 +269,7 @@ static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
268269
return false;
269270
}
270271
double d = negative ? -i : i;
271-
pj.write_tape_double(d);
272+
pj.on_number_double(d);
272273
#ifdef JSON_TEST_NUMBERS // for unit testing
273274
found_float(d, buf + offset);
274275
#endif
@@ -283,7 +284,7 @@ static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
283284
//
284285
// This function will almost never be called!!!
285286
//
286-
static never_inline bool parse_large_integer(const uint8_t *const buf,
287+
never_inline bool parse_large_integer(const uint8_t *const buf,
287288
ParsedJson &pj,
288289
const uint32_t offset,
289290
bool found_minus) {
@@ -333,14 +334,14 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
333334
// as a positive signed integer, but the negative version is
334335
// possible.
335336
constexpr int64_t signed_answer = INT64_MIN;
336-
pj.write_tape_s64(signed_answer);
337+
pj.on_number_s64(signed_answer);
337338
#ifdef JSON_TEST_NUMBERS // for unit testing
338339
found_integer(signed_answer, buf + offset);
339340
#endif
340341
} else {
341342
// we can negate safely
342343
int64_t signed_answer = -static_cast<int64_t>(i);
343-
pj.write_tape_s64(signed_answer);
344+
pj.on_number_s64(signed_answer);
344345
#ifdef JSON_TEST_NUMBERS // for unit testing
345346
found_integer(signed_answer, buf + offset);
346347
#endif
@@ -353,12 +354,12 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
353354
#ifdef JSON_TEST_NUMBERS // for unit testing
354355
found_integer(i, buf + offset);
355356
#endif
356-
pj.write_tape_s64(i);
357+
pj.on_number_s64(i);
357358
} else {
358359
#ifdef JSON_TEST_NUMBERS // for unit testing
359360
found_unsigned_integer(i, buf + offset);
360361
#endif
361-
pj.write_tape_u64(i);
362+
pj.on_number_u64(i);
362363
}
363364
}
364365
return is_structural_or_whitespace(*p);
@@ -373,12 +374,13 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
373374
// content and append a space before calling this function.
374375
//
375376
// Our objective is accurate parsing (ULP of 0 or 1) at high speed.
376-
static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
377-
const uint32_t offset,
378-
bool found_minus) {
377+
really_inline bool parse_number(const uint8_t *const buf,
378+
const uint32_t offset,
379+
bool found_minus,
380+
ParsedJson &pj) {
379381
#ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
380382
// useful to skip parsing
381-
pj.write_tape_s64(0); // always write zero
383+
pj.on_number_s64(0); // always write zero
382384
return true; // always succeeds
383385
#else
384386
const char *p = reinterpret_cast<const char *>(buf + offset);
@@ -535,7 +537,7 @@ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
535537
double factor = power_of_ten[power_index];
536538
factor = negative ? -factor : factor;
537539
double d = i * factor;
538-
pj.write_tape_double(d);
540+
pj.on_number_double(d);
539541
#ifdef JSON_TEST_NUMBERS // for unit testing
540542
found_float(d, buf + offset);
541543
#endif
@@ -546,7 +548,7 @@ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
546548
return parse_large_integer(buf, pj, offset, found_minus);
547549
}
548550
i = negative ? 0 - i : i;
549-
pj.write_tape_s64(i);
551+
pj.on_number_s64(i);
550552
#ifdef JSON_TEST_NUMBERS // for unit testing
551553
found_integer(i, buf + offset);
552554
#endif
@@ -555,3 +557,4 @@ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
555557
#endif // SIMDJSON_SKIPNUMBERPARSING
556558
}
557559

560+
} // namespace numberparsing

0 commit comments

Comments
 (0)