Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ if (TARGET benchmark::benchmark)
link_libraries(benchmark::benchmark)
add_executable(bench_parse_call bench_parse_call.cpp)
add_executable(bench_dom_api bench_dom_api.cpp)
add_executable(bench_stream_formats bench_stream_formats.cpp)
if(SIMDJSON_EXCEPTIONS)
add_executable(bench_ondemand bench_ondemand.cpp)
if(TARGET yyjson)
Expand Down
216 changes: 216 additions & 0 deletions benchmark/bench_stream_formats.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#include <benchmark/benchmark.h>
#include <string>
#include "simdjson.h"

using namespace simdjson;

namespace {

// Identifies one benchmark dataset: a streaming format (NDJSON,
// RFC 7464 JSON text sequences, or comma-delimited documents) crossed
// with the per-document payload size (small or large).
enum class stream_case {
ndjson_small,
ndjson_large,
rfc7464_small,
rfc7464_large,
comma_delimited_small,
comma_delimited_large
};

constexpr size_t TARGET_BYTES = 128 * 1000 * 1000; // approximate total size of each generated input stream (~128 MB)
constexpr size_t SMALL_PAYLOAD = 16; // filler bytes per document in the "small" datasets
constexpr size_t LARGE_PAYLOAD = 4096; // filler bytes per document in the "large" datasets
constexpr size_t BATCH_SIZE = 1 << 20; // batch size passed to iterate_many/parse_many (1 MiB)

// A pre-built input buffer plus the number of JSON documents it contains.
struct stream_dataset {
padded_string json; // the fully serialized stream, padded for simdjson
size_t count{}; // number of documents encoded in `json`
};

// Serialize one synthetic JSON document of the form
// {"id":<id>,"name":"aaaaaaaa","payload":"xxx...x","flag":true}
// where the payload contains exactly `payload_size` 'x' characters.
std::string make_document(size_t id, size_t payload_size) {
  std::string doc = "{\"id\":";
  doc += std::to_string(id);
  doc += ",\"name\":\"aaaaaaaa\",\"payload\":\"";
  doc.append(payload_size, 'x');
  doc += "\",\"flag\":true}";
  return doc;
}

// Generate the full input stream for one benchmark case.
// NDJSON cases: one document per line. RFC 7464 cases: each document is
// preceded by RS (0x1E) and followed by LF. Comma-delimited cases:
// documents joined by ',' with no newlines.
stream_dataset build_dataset(stream_case which) {
  bool use_rs = false;     // prefix each document with RS (RFC 7464)
  bool use_comma = false;  // join documents with commas instead of newlines
  bool small_docs = false; // use the small payload size
  switch (which) {
  case stream_case::ndjson_small:
    small_docs = true;
    break;
  case stream_case::ndjson_large:
    break;
  case stream_case::rfc7464_small:
    use_rs = true;
    small_docs = true;
    break;
  case stream_case::rfc7464_large:
    use_rs = true;
    break;
  case stream_case::comma_delimited_small:
    use_comma = true;
    small_docs = true;
    break;
  case stream_case::comma_delimited_large:
    use_comma = true;
    break;
  }
  const size_t payload_size = small_docs ? SMALL_PAYLOAD : LARGE_PAYLOAD;
  // ~48 bytes of fixed JSON structure per document; aim for TARGET_BYTES total.
  const size_t doc_count = TARGET_BYTES / (payload_size + 48);
  std::string buffer;
  buffer.reserve(doc_count * (payload_size + 64));
  for (size_t i = 0; i < doc_count; i++) {
    if (use_rs) {
      buffer += char(0x1E);
    }
    if (use_comma && i > 0) {
      buffer += ',';
    }
    buffer += make_document(i, payload_size);
    if (!use_comma) {
      buffer += '\n';
    }
  }
  return {padded_string(buffer), doc_count};
}

// Return the cached dataset for `which`, building it on first use.
//
// NOTE: the previous version declared all six static datasets at the top of
// the function body, so the *first* call eagerly built every dataset
// (6 x ~TARGET_BYTES, roughly 770 MB of generation work and resident memory)
// even when a benchmark run only touched one of them. Placing each static
// inside its own switch case keeps initialization lazy: only the datasets
// actually requested are ever built.
const stream_dataset &get_dataset(stream_case which) {
  switch (which) {
  case stream_case::ndjson_small: {
    static const stream_dataset dataset =
        build_dataset(stream_case::ndjson_small);
    return dataset;
  }
  case stream_case::ndjson_large: {
    static const stream_dataset dataset =
        build_dataset(stream_case::ndjson_large);
    return dataset;
  }
  case stream_case::rfc7464_small: {
    static const stream_dataset dataset =
        build_dataset(stream_case::rfc7464_small);
    return dataset;
  }
  case stream_case::rfc7464_large: {
    static const stream_dataset dataset =
        build_dataset(stream_case::rfc7464_large);
    return dataset;
  }
  case stream_case::comma_delimited_small: {
    static const stream_dataset dataset =
        build_dataset(stream_case::comma_delimited_small);
    return dataset;
  }
  case stream_case::comma_delimited_large: {
    static const stream_dataset dataset =
        build_dataset(stream_case::comma_delimited_large);
    return dataset;
  }
  }
  // Unreachable for valid enum values; fall back to ndjson_small as before.
  return get_dataset(stream_case::ndjson_small);
}

// Report throughput counters: bytes of input and documents processed
// across all iterations of the benchmark.
void set_counters(benchmark::State &state, const stream_dataset &dataset) {
  const int64_t iters = int64_t(state.iterations());
  state.SetBytesProcessed(iters * int64_t(dataset.json.size()));
  state.SetItemsProcessed(iters * int64_t(dataset.count));
}

// Benchmark On-Demand iterate_many over the dataset selected by `which`,
// summing every document's "id" field so the parse cannot be optimized away.
// `threaded` toggles the parser's worker-thread stage for comparison runs.
template <stream_case which, bool threaded = true>
static void bench_ondemand(benchmark::State &state) {
  constexpr bool rfc_input = which == stream_case::rfc7464_small ||
                             which == stream_case::rfc7464_large;
  constexpr bool comma_input = which == stream_case::comma_delimited_small ||
                               which == stream_case::comma_delimited_large;
  constexpr stream_format format =
      rfc_input ? stream_format::json_sequence
                : (comma_input ? stream_format::comma_delimited
                               : stream_format::whitespace_delimited);
  const stream_dataset &dataset = get_dataset(which);
  ondemand::parser parser;
  parser.threaded = threaded;
  for (auto _ : state) {
    ondemand::document_stream docs;
    auto err = parser.iterate_many(dataset.json, BATCH_SIZE, format).get(docs);
    if (err) {
      state.SkipWithError(error_message(err));
      return;
    }
    uint64_t id_sum = 0;
    for (auto doc : docs) {
      ondemand::object obj;
      if ((err = doc.get_object().get(obj))) {
        state.SkipWithError(error_message(err));
        return;
      }
      uint64_t id;
      if ((err = obj["id"].get_uint64().get(id))) {
        state.SkipWithError(error_message(err));
        return;
      }
      id_sum += id;
    }
    benchmark::DoNotOptimize(id_sum);
  }
  set_counters(state, dataset);
}

// Benchmark DOM parse_many over the dataset selected by `which`, summing
// every document's "id" field so the parse cannot be optimized away.
template <stream_case which>
static void bench_dom(benchmark::State &state) {
  constexpr bool rfc_input = which == stream_case::rfc7464_small ||
                             which == stream_case::rfc7464_large;
  constexpr bool comma_input = which == stream_case::comma_delimited_small ||
                               which == stream_case::comma_delimited_large;
  constexpr stream_format format =
      rfc_input ? stream_format::json_sequence
                : (comma_input ? stream_format::comma_delimited
                               : stream_format::whitespace_delimited);
  const stream_dataset &dataset = get_dataset(which);
  dom::parser parser;
  parser.threaded = true;
  for (auto _ : state) {
    dom::document_stream docs;
    auto err = parser.parse_many(dataset.json, BATCH_SIZE, format).get(docs);
    if (err) {
      state.SkipWithError(error_message(err));
      return;
    }
    uint64_t id_sum = 0;
    for (auto doc : docs) {
      uint64_t id;
      if ((err = doc["id"].get(id))) {
        state.SkipWithError(error_message(err));
        return;
      }
      id_sum += id;
    }
    benchmark::DoNotOptimize(id_sum);
  }
  set_counters(state, dataset);
}

} // namespace

// On-Demand benchmarks, one per (format, payload size) pair.
// UseRealTime() is used because the threaded document_stream performs work on
// background threads; DisplayAggregatesOnly keeps repeated-run output terse.
BENCHMARK(bench_ondemand<stream_case::ndjson_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::ndjson_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::rfc7464_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::rfc7464_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::comma_delimited_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::comma_delimited_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
// Non-threaded comma_delimited for comparison
BENCHMARK(bench_ondemand<stream_case::comma_delimited_small, false>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::comma_delimited_large, false>)
->UseRealTime()
->DisplayAggregatesOnly(true);

// DOM benchmarks over the same datasets, for cross-API comparison.
BENCHMARK(bench_dom<stream_case::ndjson_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::ndjson_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::rfc7464_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::rfc7464_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::comma_delimited_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::comma_delimited_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);

BENCHMARK_MAIN();
110 changes: 84 additions & 26 deletions doc/iterate_many.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ E.g., `[1,2]{"32":1}` is recognized as two documents.
Some official formats **(non-exhaustive list)**:
- [Newline-Delimited JSON (NDJSON)](https://github.com/ndjson/ndjson-spec/)
- [JSON lines (JSONL)](http://jsonlines.org/)
- [Record separator-delimited JSON (RFC 7464)](https://tools.ietf.org/html/rfc7464) <- Not supported by simdjson!
- [Record separator-delimited JSON (RFC 7464)](https://tools.ietf.org/html/rfc7464)
- [More on Wikipedia...](https://en.wikipedia.org/wiki/JSON_streaming)

API
Expand Down Expand Up @@ -278,39 +278,97 @@ Importantly, you should only call `truncated_bytes()` after iterating through al
Comma-separated documents
-----------

We also support comma-separated documents, but with some performance limitations. The `iterate_many` function takes in an option to allow parsing of comma-separated documents (which defaults to false). In this mode, the entire buffer is processed in one batch. Therefore, the total size of the document should not exceed the maximal capacity of the parser (4 GB). This mode also effectively disallows multithreading. It is therefore mostly suitable for inputs that are not very large. In this mode, the batch_size parameter
is effectively ignored, as it is set to at least the document size.
To parse comma-separated documents like `{"a":1},{"b":2},{"c":3}`, use the `stream_format::comma_delimited` parameter:

Example:
```cpp
auto json = R"({"a":1},{"b":2},{"c":3})"_padded;
ondemand::parser parser;
ondemand::document_stream stream;
auto error = parser.iterate_many(json, ondemand::DEFAULT_BATCH_SIZE,
simdjson::stream_format::comma_delimited).get(stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : stream) {
std::cout << doc << std::endl;
}
// Prints: {"a":1}
// {"b":2}
// {"c":3}
```

Whitespace around the commas is allowed:
```cpp
auto json = R"({"a":1} , {"b":2} , {"c":3})"_padded; // Also works
ondemand::parser parser;
ondemand::document_stream doc_stream;
auto error = parser.iterate_many(json, ondemand::DEFAULT_BATCH_SIZE,
                                 simdjson::stream_format::comma_delimited).get(doc_stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : doc_stream) {
  std::cout << doc << std::endl;
}
```

Nested commas inside objects and arrays are preserved:
```cpp
auto json = R"({"arr":[1,2,3]},{"obj":{"x":1,"y":2}})"_padded;
// Correctly parses as 2 documents, not 6
```

Mixed document types are supported:
```cpp
auto json = R"(1, 2, 3, 4, "a", "b", "c", {"hello": "world"}, [1, 2, 3])"_padded;
ondemand::parser parser;
ondemand::document_stream doc_stream;
auto error = parser.iterate_many(json, ondemand::DEFAULT_BATCH_SIZE,
simdjson::stream_format::comma_delimited).get(doc_stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : doc_stream) {
std::cout << doc.type() << std::endl;
}
// Prints: number number number number string string string object array
```

Extra top-level separators are tolerated for compatibility with the legacy
`allow_comma_separated` behavior. For example, leading commas, trailing commas,
and repeated commas are ignored rather than being reported as empty documents.

### Legacy `allow_comma_separated` parameter (deprecated)

The `allow_comma_separated` boolean parameter is deprecated. When set to `true`, it now internally maps to `stream_format::comma_delimited`.

The old single-batch limitation no longer applies: comma-delimited parsing now supports multi-batch processing and threading for optimal performance on large files.

JSON Text Sequences (RFC 7464)
------------------------------

[RFC 7464](https://tools.ietf.org/html/rfc7464) defines a format for streaming JSON values using ASCII Record Separator (RS, 0x1E) as a delimiter. Each JSON text is preceded by RS and optionally followed by ASCII Line Feed (LF, 0x0A).

Example input:
```
<RS>{"name":"doc1"}<LF>
<RS>{"name":"doc2"}<LF>
<RS>{"name":"doc3"}<LF>
```

To parse JSON text sequences, use the `stream_format::json_sequence` parameter:

```cpp
// Build input with RS (0x1E) and LF (0x0A) delimiters
std::string input_str;
input_str += '\x1e'; input_str += "{\"a\":1}"; input_str += '\x0a';
input_str += '\x1e'; input_str += "{\"b\":2}"; input_str += '\x0a';
input_str += '\x1e'; input_str += "{\"c\":3}"; input_str += '\x0a';
simdjson::padded_string input(input_str);

ondemand::parser parser;
ondemand::document_stream stream;
auto error = parser.iterate_many(input, ondemand::DEFAULT_BATCH_SIZE,
simdjson::stream_format::json_sequence).get(stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : stream) {
std::cout << doc << std::endl;
}
```

The `stream_format` enum has the following values:
- `stream_format::whitespace_delimited` (default): Standard NDJSON/JSON Lines format
- `stream_format::json_sequence`: RFC 7464 format with RS delimiters
- `stream_format::comma_delimited`: Comma-separated JSON documents

The trailing LF after each JSON text is optional but recommended by the RFC for robustness.


C++20 features
--------------------
Expand Down
Loading
Loading