Skip to content

Commit 1aaad22

Browse files
committed
Simplify atom parsing
1 parent 81c86d7 commit 1aaad22

File tree

9 files changed

+89
-70
lines changed

9 files changed

+89
-70
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ endif # ifeq ($(SANITIZE),1)
5858
endif # ifeq ($(MEMSANITIZE),1)
5959

6060
# Headers and sources
61-
SRCHEADERS_GENERIC=src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
61+
SRCHEADERS_GENERIC=src/generic/atomparsing.h src/generic/numberparsing.h src/generic/stage1_find_marks.h src/generic/stage2_build_tape.h src/generic/stringparsing.h src/generic/stage2_streaming_build_tape.h src/generic/utf8_fastvalidate_algorithm.h src/generic/utf8_lookup_algorithm.h src/generic/utf8_lookup2_algorithm.h src/generic/utf8_range_algorithm.h src/generic/utf8_zwegner_algorithm.h
6262
SRCHEADERS_ARM64= src/arm64/bitmanipulation.h src/arm64/bitmask.h src/arm64/intrinsics.h src/arm64/numberparsing.h src/arm64/simd.h src/arm64/stage1_find_marks.h src/arm64/stage2_build_tape.h src/arm64/stringparsing.h
6363
SRCHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/haswell/intrinsics.h src/haswell/numberparsing.h src/haswell/simd.h src/haswell/stage1_find_marks.h src/haswell/stage2_build_tape.h src/haswell/stringparsing.h
6464
SRCHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/westmere/intrinsics.h src/westmere/numberparsing.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h

src/CMakeLists.txt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,43 +33,45 @@ set(SIMDJSON_SRC_HEADERS
3333
simdprune_tables.h
3434
stage1_find_marks.cpp
3535
stage2_build_tape.cpp
36-
arm64/bitmask.h
37-
arm64/simd.h
3836
arm64/bitmanipulation.h
37+
arm64/bitmask.h
3938
arm64/implementation.h
4039
arm64/intrinsics.h
40+
arm64/numberparsing.h
41+
arm64/simd.h
4142
arm64/stage1_find_marks.h
4243
arm64/stage2_build_tape.h
4344
arm64/stringparsing.h
44-
arm64/numberparsing.h
45+
generic/atomparsing.h
46+
generic/numberparsing.h
4547
generic/stage1_find_marks.h
4648
generic/stage2_build_tape.h
4749
generic/stage2_streaming_build_tape.h
4850
generic/stringparsing.h
49-
generic/numberparsing.h
5051
generic/utf8_fastvalidate_algorithm.h
5152
generic/utf8_lookup_algorithm.h
5253
generic/utf8_lookup2_algorithm.h
5354
generic/utf8_range_algorithm.h
5455
generic/utf8_zwegner_algorithm.h
55-
haswell/bitmask.h
5656
haswell/bitmanipulation.h
57+
haswell/bitmask.h
5758
haswell/implementation.h
5859
haswell/intrinsics.h
60+
haswell/numberparsing.h
5961
haswell/simd.h
6062
haswell/stage1_find_marks.h
6163
haswell/stage2_build_tape.h
6264
haswell/stringparsing.h
6365
document_parser_callbacks.h
6466
westmere/bitmanipulation.h
67+
westmere/bitmask.h
6568
westmere/implementation.h
6669
westmere/intrinsics.h
67-
westmere/bitmask.h
70+
westmere/numberparsing.h
6871
westmere/simd.h
6972
westmere/stage1_find_marks.h
7073
westmere/stage2_build_tape.h
7174
westmere/stringparsing.h
72-
westmere/numberparsing.h
7375
)
7476
set_source_files_properties(${SIMDJSON_SRC_HEADERS} PROPERTIES HEADER_FILE_ONLY TRUE)
7577

src/arm64/stage2_build_tape.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
namespace simdjson::arm64 {
1313

14+
#include "generic/atomparsing.h"
1415
#include "generic/stage2_build_tape.h"
1516
#include "generic/stage2_streaming_build_tape.h"
1617

src/generic/atomparsing.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
namespace atomparsing {
2+
3+
really_inline uint32_t string_to_uint32(const char* str) { return *reinterpret_cast<const uint32_t *>(str); }
4+
5+
WARN_UNUSED
6+
really_inline bool str4ncmp(const uint8_t *src, const char* atom) {
7+
uint32_t srcval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
8+
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
9+
std::memcpy(&srcval, src, sizeof(uint32_t));
10+
return srcval ^ string_to_uint32(atom);
11+
}
12+
13+
WARN_UNUSED
14+
really_inline bool is_valid_true_atom(const uint8_t *src) {
15+
return (str4ncmp(src, "true") | is_not_structural_or_whitespace(src[4])) == 0;
16+
}
17+
18+
WARN_UNUSED
19+
really_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
20+
if (len > 4) { return is_valid_true_atom(src); }
21+
else if (len == 4) { return !str4ncmp(src, "true"); }
22+
else { return false; }
23+
}
24+
25+
WARN_UNUSED
26+
really_inline bool is_valid_false_atom(const uint8_t *src) {
27+
return (str4ncmp(src+1, "alse") | is_not_structural_or_whitespace(src[5])) == 0;
28+
}
29+
30+
WARN_UNUSED
31+
really_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
32+
if (len > 5) { return is_valid_false_atom(src); }
33+
else if (len == 5) { return !str4ncmp(src+1, "alse"); }
34+
else { return false; }
35+
}
36+
37+
WARN_UNUSED
38+
really_inline bool is_valid_null_atom(const uint8_t *src) {
39+
return (str4ncmp(src, "null") | is_not_structural_or_whitespace(src[4])) == 0;
40+
}
41+
42+
WARN_UNUSED
43+
really_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
44+
if (len > 4) { return is_valid_null_atom(src); }
45+
else if (len == 4) { return !str4ncmp(src, "null"); }
46+
else { return false; }
47+
}
48+
49+
} // namespace atomparsing

src/generic/stage2_build_tape.h

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ class structural_iterator {
6363
really_inline const uint8_t* current() {
6464
return &buf[idx];
6565
}
66+
// Returns `len - idx`. `idx` is the offset current() uses to index into buf;
// `len` is presumably the total length of buf, which would make this the
// number of bytes left from the current position (confirm against the
// constructor).
really_inline size_t remaining_len() {
  const size_t bytes_left = len - idx;
  return bytes_left;
}
6669
template<typename F>
6770
really_inline bool with_space_terminated_copy(const F& f) {
6871
/**
@@ -172,18 +175,18 @@ struct structural_parser {
172175
return parse_number(structurals.current(), found_minus);
173176
}
174177

175-
WARN_UNUSED really_inline bool parse_atom(const uint8_t *src) {
178+
WARN_UNUSED really_inline bool parse_atom() {
176179
switch (structurals.current_char()) {
177180
case 't':
178-
if (!is_valid_true_atom(src)) { return true; }
181+
if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
179182
doc_parser.on_true_atom();
180183
break;
181184
case 'f':
182-
if (!is_valid_false_atom(src)) { return true; }
185+
if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
183186
doc_parser.on_false_atom();
184187
break;
185188
case 'n':
186-
if (!is_valid_null_atom(src)) { return true; }
189+
if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
187190
doc_parser.on_null_atom();
188191
break;
189192
default:
@@ -192,8 +195,24 @@ struct structural_parser {
192195
return false;
193196
}
194197

195-
WARN_UNUSED really_inline bool parse_atom() {
196-
return parse_atom(structurals.current());
198+
// Parses a true/false/null atom at the current structural position using the
// length-aware atomparsing checks, so no bytes beyond remaining_len() are
// assumed to follow the atom.
// Follows the error convention of parse_atom(): returns true on FAILURE
// (unknown leading character or invalid atom) and false on success, after
// notifying doc_parser of the atom that was found.
WARN_UNUSED really_inline bool parse_single_atom() {
  const uint8_t *atom = structurals.current();
  const size_t bytes_left = structurals.remaining_len();
  bool valid;
  switch (structurals.current_char()) {
    case 't':
      valid = atomparsing::is_valid_true_atom(atom, bytes_left);
      if (valid) { doc_parser.on_true_atom(); }
      break;
    case 'f':
      valid = atomparsing::is_valid_false_atom(atom, bytes_left);
      if (valid) { doc_parser.on_false_atom(); }
      break;
    case 'n':
      valid = atomparsing::is_valid_null_atom(atom, bytes_left);
      if (valid) { doc_parser.on_null_atom(); }
      break;
    default:
      valid = false;
      break;
  }
  return !valid;
}
198217

199218
WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
@@ -327,11 +346,7 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
327346
FAIL_IF( parser.parse_string() );
328347
goto finish;
329348
case 't': case 'f': case 'n':
330-
FAIL_IF(
331-
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
332-
return parser.parse_atom(&copy[idx]);
333-
})
334-
);
349+
FAIL_IF( parser.parse_single_atom() );
335350
goto finish;
336351
case '0': case '1': case '2': case '3': case '4':
337352
case '5': case '6': case '7': case '8': case '9':

src/generic/stage2_streaming_build_tape.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,7 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, do
5858
FAIL_IF( parser.parse_string() );
5959
goto finish;
6060
case 't': case 'f': case 'n':
61-
FAIL_IF(
62-
parser.structurals.with_space_terminated_copy([&](auto copy, auto idx) {
63-
return parser.parse_atom(&copy[idx]);
64-
})
65-
);
61+
FAIL_IF( parser.parse_single_atom() );
6662
goto finish;
6763
case '0': case '1': case '2': case '3': case '4':
6864
case '5': case '6': case '7': case '8': case '9':

src/haswell/stage2_build_tape.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
TARGET_HASWELL
1313
namespace simdjson::haswell {
1414

15+
#include "generic/atomparsing.h"
1516
#include "generic/stage2_build_tape.h"
1617
#include "generic/stage2_streaming_build_tape.h"
1718

src/stage2_build_tape.cpp

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,52 +6,6 @@
66

77
using namespace simdjson;
88

9-
WARN_UNUSED
10-
really_inline bool is_valid_true_atom(const uint8_t *loc) {
11-
uint32_t tv = *reinterpret_cast<const uint32_t *>("true");
12-
uint32_t error = 0;
13-
uint32_t
14-
locval; // we want to avoid unaligned 64-bit loads (undefined in C/C++)
15-
// this can read up to 3 bytes beyond the buffer size, but we require
16-
// SIMDJSON_PADDING of padding
17-
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
18-
std::memcpy(&locval, loc, sizeof(uint32_t));
19-
error = locval ^ tv;
20-
error |= is_not_structural_or_whitespace(loc[4]);
21-
return error == 0;
22-
}
23-
24-
WARN_UNUSED
25-
really_inline bool is_valid_false_atom(const uint8_t *loc) {
26-
// assume that loc starts with "f"
27-
uint32_t fv = *reinterpret_cast<const uint32_t *>("alse");
28-
uint32_t error = 0;
29-
uint32_t
30-
locval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
31-
// this can read up to 4 bytes beyond the buffer size, but we require
32-
// SIMDJSON_PADDING of padding
33-
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING);
34-
std::memcpy(&locval, loc + 1, sizeof(uint32_t));
35-
error = locval ^ fv;
36-
error |= is_not_structural_or_whitespace(loc[5]);
37-
return error == 0;
38-
}
39-
40-
WARN_UNUSED
41-
really_inline bool is_valid_null_atom(const uint8_t *loc) {
42-
uint32_t nv = *reinterpret_cast<const uint32_t *>("null");
43-
uint32_t error = 0;
44-
uint32_t
45-
locval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
46-
// this can read up to 2 bytes beyond the buffer size, but we require
47-
// SIMDJSON_PADDING of padding
48-
static_assert(sizeof(uint32_t) - 1 <= SIMDJSON_PADDING);
49-
std::memcpy(&locval, loc, sizeof(uint32_t));
50-
error = locval ^ nv;
51-
error |= is_not_structural_or_whitespace(loc[4]);
52-
return error == 0;
53-
}
54-
559
#ifdef JSON_TEST_STRINGS
5610
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
5711
const uint8_t *parsed_end);

src/westmere/stage2_build_tape.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
TARGET_WESTMERE
1313
namespace simdjson::westmere {
1414

15+
#include "generic/atomparsing.h"
1516
#include "generic/stage2_build_tape.h"
1617
#include "generic/stage2_streaming_build_tape.h"
1718

0 commit comments

Comments
 (0)