Skip to content

Commit 8e7d1a5

Browse files
authored
Separate document state from ParsedJson
This creates a "document" class with only user-facing document state (no parser internals).

- document: user-facing document state
- document::iterator: iterator (equivalent of ParsedJsonIterator)
- document::parser: parser state plus a "docked" document we parse into (equivalent of ParsedJson)

Usage:

```c++
auto doc = simdjson::document::parse(buf, len); // less efficient but simplest
```

```c++
simdjson::document::parser parser; // reusable parser
parser.allocate_capacity(len);
simdjson::document* doc = parser.parse(buf, len); // pointer to doc inside parser
doc = parser.parse(buf2, len); // reuses all buffers and overwrites doc; more efficient
```
1 parent c879b56 commit 8e7d1a5

36 files changed

+1773
-1422
lines changed

.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,21 @@ objs
5454

5555
# Build outputs (TODO build to a subdir so we can exclude that instead)
5656
/allparserscheckfile
57+
/allparsingcompetition
5758
/basictests
5859
/benchfeatures
5960
/benchmark/parse
6061
/benchmark/parse_stream
6162
/benchmark/perfdiff
6263
/benchmark/statisticalmodel
64+
/build/
65+
/build-ossfuzz-*/
66+
/build-plain-*/
67+
/corpus.zip
68+
/distinctuseridcompetition
6369
/fuzz/fuzz_dump
6470
/fuzz/fuzz_parser
71+
/get_corpus_benchmark
6572
/json2json
6673
/jsoncheck
6774
/jsoncheck_noavx
@@ -70,9 +77,18 @@ objs
7077
/jsonstats
7178
/integer_tests
7279
/libsimdjson.so*
80+
/minifiercompetition
7381
/minify
7482
/numberparsingcheck
83+
/ossfuzz-out
84+
/out
7585
/parse
86+
/parse_nonumberparsing
87+
/parse_nostringparsing
88+
/parse_noutf8validation
89+
/parse_stream
90+
/parseandstatcompetition
91+
/parsingcompetition
7692
/perfdiff
7793
/pointercheck
7894
/statisticalmodel

Makefile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@ LIBHEADERS_HASWELL= src/haswell/bitmanipulation.h src/haswell/bitmask.h src/h
7070
LIBHEADERS_WESTMERE=src/westmere/bitmanipulation.h src/westmere/bitmask.h src/westmere/intrinsics.h src/westmere/numberparsing.h src/westmere/simd.h src/westmere/stage1_find_marks.h src/westmere/stage2_build_tape.h src/westmere/stringparsing.h
7171
LIBHEADERS=src/jsoncharutils.h src/simdprune_tables.h $(LIBHEADERS_GENERIC) $(LIBHEADERS_ARM64) $(LIBHEADERS_HASWELL) $(LIBHEADERS_WESTMERE)
7272

73-
PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/parsedjson.h include/simdjson/parsedjsoniterator.h include/simdjson/portability.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h
73+
PUBHEADERS=include/simdjson/common_defs.h include/simdjson/isadetection.h include/simdjson/jsonformatutils.h include/simdjson/jsonioutil.h include/simdjson/jsonminifier.h include/simdjson/jsonparser.h include/simdjson/padded_string.h include/simdjson/document.h include/simdjson/document/iterator.h include/simdjson/document/parser.h include/simdjson/parsedjson.h include/simdjson/jsonstream.h include/simdjson/portability.h include/simdjson/simdjson.h include/simdjson/simdjson_version.h include/simdjson/stage1_find_marks.h include/simdjson/stage2_build_tape.h
7474
HEADERS=$(PUBHEADERS) $(LIBHEADERS)
7575

76-
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp
76+
LIBFILES=src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/document.cpp src/document/parser.cpp
7777
MINIFIERHEADERS=include/simdjson/jsonminifier.h
7878
MINIFIERLIBFILES=src/jsonminifier.cpp
7979

@@ -205,18 +205,18 @@ basictests:tests/basictests.cpp $(HEADERS) $(LIBFILES)
205205

206206

207207
numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
208-
$(CXX) $(CXXFLAGS) -o numberparsingcheck tests/numberparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
208+
$(CXX) $(CXXFLAGS) -o numberparsingcheck src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/document.cpp src/document/parser.cpp tests/numberparsingcheck.cpp -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
209209

210210
integer_tests:tests/integer_tests.cpp $(HEADERS) $(LIBFILES)
211-
$(CXX) $(CXXFLAGS) -o integer_tests tests/integer_tests.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/parsedjson.cpp -I. $(LIBFLAGS)
211+
$(CXX) $(CXXFLAGS) -o integer_tests $(LIBFILES) tests/integer_tests.cpp -I. $(LIBFLAGS)
212212

213213

214214

215215
stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
216-
$(CXX) $(CXXFLAGS) -o stringparsingcheck tests/stringparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
216+
$(CXX) $(CXXFLAGS) -o stringparsingcheck src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/document.cpp src/document/parser.cpp tests/stringparsingcheck.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
217217

218218
pointercheck:tests/pointercheck.cpp $(HEADERS) $(LIBFILES)
219-
$(CXX) $(CXXFLAGS) -o pointercheck tests/pointercheck.cpp src/stage2_build_tape.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp -I. $(LIBFLAGS)
219+
$(CXX) $(CXXFLAGS) -o pointercheck $(LIBFILES) tests/pointercheck.cpp -I. $(LIBFLAGS)
220220

221221
minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) submodules $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
222222
$(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)

amalgamation.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ jsonminifier.cpp
2222
jsonparser.cpp
2323
stage1_find_marks.cpp
2424
stage2_build_tape.cpp
25-
parsedjson.cpp
26-
parsedjsoniterator.cpp
25+
document.cpp
26+
document/parser.cpp
2727
"
2828

2929
# order matters
@@ -37,8 +37,10 @@ simdjson/common_defs.h
3737
simdjson/padded_string.h
3838
simdjson/jsonioutil.h
3939
simdjson/jsonminifier.h
40+
simdjson/document.h
41+
simdjson/document/iterator.h
42+
simdjson/document/parser.h
4043
simdjson/parsedjson.h
41-
simdjson/parsedjsoniterator.h
4244
simdjson/stage1_find_marks.h
4345
simdjson/stage2_build_tape.h
4446
simdjson/jsonparser.h
@@ -149,11 +151,11 @@ int main(int argc, char *argv[]) {
149151
}
150152
const char * filename = argv[1];
151153
simdjson::padded_string p = simdjson::get_corpus(filename);
152-
simdjson::ParsedJson pj = simdjson::build_parsed_json(p); // do the parsing
153-
if( ! pj.is_valid() ) {
154-
std::cout << "build_parsed_json not valid" << std::endl;
154+
simdjson::document doc;
155+
if (!simdjson::document::try_parse(p, doc)) { // do the parsing
156+
std::cout << "document::try_parse not valid" << std::endl;
155157
} else {
156-
std::cout << "build_parsed_json valid" << std::endl;
158+
std::cout << "document::try_parse valid" << std::endl;
157159
}
158160
if(argc == 2) {
159161
return EXIT_SUCCESS;
@@ -162,15 +164,15 @@ int main(int argc, char *argv[]) {
162164
//JsonStream
163165
const char * filename2 = argv[2];
164166
simdjson::padded_string p2 = simdjson::get_corpus(filename2);
165-
simdjson::ParsedJson pj2;
167+
simdjson::document::parser parser;
166168
simdjson::JsonStream js{p2};
167169
int parse_res = simdjson::SUCCESS_AND_HAS_MORE;
168170
169171
while (parse_res == simdjson::SUCCESS_AND_HAS_MORE) {
170-
parse_res = js.json_parse(pj2);
172+
parse_res = js.json_parse(parser);
171173
}
172174
173-
if( ! pj2.is_valid()) {
175+
if( ! parser.is_valid()) {
174176
std::cout << "JsonStream not valid" << std::endl;
175177
} else {
176178
std::cout << "JsonStream valid" << std::endl;

benchmark/benchmarker.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
#include "simdjson/isadetection.h"
3838
#include "simdjson/jsonioutil.h"
3939
#include "simdjson/jsonparser.h"
40-
#include "simdjson/parsedjson.h"
40+
#include "simdjson/document.h"
4141
#include "simdjson/stage1_find_marks.h"
4242
#include "simdjson/stage2_build_tape.h"
4343

@@ -85,11 +85,11 @@ struct json_stats {
8585
size_t blocks_with_16_structurals = 0;
8686
size_t blocks_with_16_structurals_flipped = 0;
8787

88-
json_stats(const padded_string& json, const ParsedJson& pj) {
88+
json_stats(const padded_string& json, const document::parser& parser) {
8989
bytes = json.size();
9090
blocks = bytes / BYTES_PER_BLOCK;
9191
if (bytes % BYTES_PER_BLOCK > 0) { blocks++; } // Account for remainder block
92-
structurals = pj.n_structural_indexes-1;
92+
structurals = parser.n_structural_indexes-1;
9393

9494
// Calculate stats on blocks that will trigger utf-8 if statements / mispredictions
9595
bool last_block_has_utf8 = false;
@@ -146,7 +146,7 @@ struct json_stats {
146146
for (size_t block=0; block<blocks; block++) {
147147
// Count structurals in the block
148148
int block_structurals=0;
149-
while (structural < pj.n_structural_indexes && pj.structural_indexes[structural] < (block+1)*BYTES_PER_BLOCK) {
149+
while (structural < parser.n_structural_indexes && parser.structural_indexes[structural] < (block+1)*BYTES_PER_BLOCK) {
150150
block_structurals++;
151151
structural++;
152152
}
@@ -305,9 +305,9 @@ struct benchmarker {
305305
}
306306

307307
really_inline void run_iteration(bool stage1_only, bool hotbuffers=false) {
308-
// Allocate ParsedJson
308+
// Allocate document::parser
309309
collector.start();
310-
ParsedJson pj;
310+
document::parser pj;
311311
bool allocok = pj.allocate_capacity(json.size());
312312
event_count allocate_count = collector.end();
313313
allocate_stage << allocate_count;

benchmark/distinctuseridcompetition.cpp

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#include "simdjson/jsonparser.h"
1+
#include "simdjson/jsonioutil.h"
2+
#include "simdjson/document.h"
23
#include <algorithm>
34
#include <unistd.h>
45
#include <vector>
@@ -30,8 +31,7 @@ void print_vec(const std::vector<int64_t> &v) {
3031
std::cout << std::endl;
3132
}
3233

33-
void simdjson_scan(std::vector<int64_t> &answer,
34-
simdjson::ParsedJson::Iterator &i) {
34+
void simdjson_scan(std::vector<int64_t> &answer, simdjson::document::iterator i) {
3535
while (i.move_forward()) {
3636
if (i.get_scope_type() == '{') {
3737
bool found_user = (i.get_string_length() == 4) &&
@@ -50,32 +50,26 @@ void simdjson_scan(std::vector<int64_t> &answer,
5050
}
5151

5252
__attribute__((noinline)) std::vector<int64_t>
53-
simdjson_just_dom(simdjson::ParsedJson &pj) {
53+
simdjson_just_dom(simdjson::document &doc) {
5454
std::vector<int64_t> answer;
55-
simdjson::ParsedJson::Iterator i(pj);
56-
simdjson_scan(answer, i);
55+
simdjson_scan(answer, doc);
5756
remove_duplicates(answer);
5857
return answer;
5958
}
6059

6160
__attribute__((noinline)) std::vector<int64_t>
6261
simdjson_compute_stats(const simdjson::padded_string &p) {
6362
std::vector<int64_t> answer;
64-
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
65-
if (!pj.is_valid()) {
66-
return answer;
67-
}
68-
simdjson::ParsedJson::Iterator i(pj);
69-
simdjson_scan(answer, i);
63+
simdjson::document doc = simdjson::document::parse(p);
64+
simdjson_scan(answer, doc);
7065
remove_duplicates(answer);
7166
return answer;
7267
}
7368

7469
__attribute__((noinline)) bool
7570
simdjson_just_parse(const simdjson::padded_string &p) {
76-
simdjson::ParsedJson pj = simdjson::build_parsed_json(p);
77-
bool answer = !pj.is_valid();
78-
return answer;
71+
simdjson::document doc;
72+
return simdjson::document::try_parse(p, doc) == simdjson::SUCCESS;
7973
}
8074

8175
void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
@@ -323,13 +317,13 @@ int main(int argc, char *argv[]) {
323317
!just_data);
324318
BEST_TIME("sasjon ", sasjon_compute_stats(p).size(), size, , repeat, volume,
325319
!just_data);
326-
BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), false, , repeat,
320+
BEST_TIME("simdjson (just parse) ", simdjson_just_parse(p), true, , repeat,
327321
volume, !just_data);
328322
BEST_TIME("rapid (just parse) ", rapid_just_parse(p), false, , repeat,
329323
volume, !just_data);
330324
BEST_TIME("sasjon (just parse) ", sasjon_just_parse(p), false, , repeat,
331325
volume, !just_data);
332-
simdjson::ParsedJson dsimdjson = simdjson::build_parsed_json(p);
326+
simdjson::document dsimdjson = simdjson::document::parse(p);
333327
BEST_TIME("simdjson (just dom) ", simdjson_just_dom(dsimdjson).size(), size,
334328
, repeat, volume, !just_data);
335329
char *buffer = (char *)malloc(p.size());

benchmark/parseandstatcompetition.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ simdjson_compute_stats(const simdjson::padded_string &p) {
5858
answer.true_count = 0;
5959
answer.false_count = 0;
6060
size_t tape_idx = 0;
61-
uint64_t tape_val = pj.tape[tape_idx++];
61+
uint64_t tape_val = pj.doc.tape[tape_idx++];
6262
uint8_t type = (tape_val >> 56);
6363
size_t how_many = 0;
6464
assert(type == 'r');
6565
how_many = tape_val & JSON_VALUE_MASK;
6666
for (; tape_idx < how_many; tape_idx++) {
67-
tape_val = pj.tape[tape_idx];
67+
tape_val = pj.doc.tape[tape_idx];
6868
// uint64_t payload = tape_val & JSON_VALUE_MASK;
6969
type = (tape_val >> 56);
7070
switch (type) {

benchmark/statisticalmodel.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,13 @@ stat_t simdjson_compute_stats(const simdjson::padded_string &p) {
6464
answer.string_count = 0;
6565
answer.structural_indexes_count = pj.n_structural_indexes;
6666
size_t tape_idx = 0;
67-
uint64_t tape_val = pj.tape[tape_idx++];
67+
uint64_t tape_val = pj.doc.tape[tape_idx++];
6868
uint8_t type = (tape_val >> 56);
6969
size_t how_many = 0;
7070
assert(type == 'r');
7171
how_many = tape_val & JSON_VALUE_MASK;
7272
for (; tape_idx < how_many; tape_idx++) {
73-
tape_val = pj.tape[tape_idx];
73+
tape_val = pj.doc.tape[tape_idx];
7474
// uint64_t payload = tape_val & JSON_VALUE_MASK;
7575
type = (tape_val >> 56);
7676
switch (type) {

fuzz/fuzz_dump_raw_tape.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
1111
try {
1212
auto pj = simdjson::build_parsed_json(Data, Size);
1313
NulOStream os;
14-
bool ignored=pj.dump_raw_tape(os);
14+
UNUSED bool ignored=pj.dump_raw_tape(os);
1515
} catch (...) {
1616
}
1717
return 0;

fuzz/ossfuzz.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# make sure to exit on problems
1010
set -e
1111
set -u
12+
set -x
1213

1314
for prog in zip cmake ninja; do
1415
if ! which $prog >/dev/null; then
@@ -21,7 +22,7 @@ done
2122
# build the corpus (all inputs are json, the same corpus can be used for everyone)
2223
fuzz/build_corpus.sh
2324

24-
mkdir build
25+
mkdir -p build
2526
cd build
2627

2728
cmake .. \

include/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
set(SIMDJSON_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
22
set(SIMDJSON_INCLUDE
33
${SIMDJSON_INCLUDE_DIR}/simdjson/common_defs.h
4+
${SIMDJSON_INCLUDE_DIR}/simdjson/document.h
5+
${SIMDJSON_INCLUDE_DIR}/simdjson/document/iterator.h
6+
${SIMDJSON_INCLUDE_DIR}/simdjson/document/parser.h
47
${SIMDJSON_INCLUDE_DIR}/simdjson/isadetection.h
58
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonformatutils.h
69
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonioutil.h
710
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonminifier.h
811
${SIMDJSON_INCLUDE_DIR}/simdjson/jsonparser.h
912
${SIMDJSON_INCLUDE_DIR}/simdjson/padded_string.h
1013
${SIMDJSON_INCLUDE_DIR}/simdjson/parsedjson.h
11-
${SIMDJSON_INCLUDE_DIR}/simdjson/parsedjsoniterator.h
1214
${SIMDJSON_INCLUDE_DIR}/simdjson/portability.h
1315
${SIMDJSON_INCLUDE_DIR}/simdjson/simdjson.h
1416
${SIMDJSON_INCLUDE_DIR}/simdjson/simdjson_version.h

0 commit comments

Comments
 (0)