Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ if (TARGET benchmark::benchmark)
link_libraries(benchmark::benchmark)
add_executable(bench_parse_call bench_parse_call.cpp)
add_executable(bench_dom_api bench_dom_api.cpp)
add_executable(bench_stream_formats bench_stream_formats.cpp)
if(SIMDJSON_EXCEPTIONS)
add_executable(bench_ondemand bench_ondemand.cpp)
if(TARGET yyjson)
Expand Down
216 changes: 216 additions & 0 deletions benchmark/bench_stream_formats.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#include <benchmark/benchmark.h>
#include <string>
#include "simdjson.h"

using namespace simdjson;

namespace {

// Identifies one benchmark dataset: a streaming format (NDJSON,
// RFC 7464 JSON text sequences, or comma-delimited documents) crossed
// with the per-document payload size (small or large).
enum class stream_case {
ndjson_small,
ndjson_large,
rfc7464_small,
rfc7464_large,
comma_delimited_small,
comma_delimited_large
};

constexpr size_t TARGET_BYTES = 128 * 1000 * 1000; // approximate total size of each generated input stream (~128 MB)
constexpr size_t SMALL_PAYLOAD = 16; // filler bytes per document in the "small" datasets
constexpr size_t LARGE_PAYLOAD = 4096; // filler bytes per document in the "large" datasets
constexpr size_t BATCH_SIZE = 1 << 20; // batch size passed to iterate_many/parse_many (1 MiB)

// A pre-built input buffer plus the number of JSON documents it contains.
struct stream_dataset {
padded_string json; // the fully serialized stream, padded for simdjson
size_t count{}; // number of documents encoded in `json`
};

// Serialize one synthetic JSON document of the form
// {"id":<id>,"name":"aaaaaaaa","payload":"xxx...x","flag":true}
// where the payload contains exactly `payload_size` 'x' characters.
std::string make_document(size_t id, size_t payload_size) {
  std::string doc = "{\"id\":";
  doc += std::to_string(id);
  doc += ",\"name\":\"aaaaaaaa\",\"payload\":\"";
  doc.append(payload_size, 'x');
  doc += "\",\"flag\":true}";
  return doc;
}

// Generate the full input stream for one benchmark case.
// NDJSON cases: one document per line. RFC 7464 cases: each document is
// preceded by RS (0x1E) and followed by LF. Comma-delimited cases:
// documents joined by ',' with no newlines.
stream_dataset build_dataset(stream_case which) {
  bool use_rs = false;     // prefix each document with RS (RFC 7464)
  bool use_comma = false;  // join documents with commas instead of newlines
  bool small_docs = false; // use the small payload size
  switch (which) {
  case stream_case::ndjson_small:
    small_docs = true;
    break;
  case stream_case::ndjson_large:
    break;
  case stream_case::rfc7464_small:
    use_rs = true;
    small_docs = true;
    break;
  case stream_case::rfc7464_large:
    use_rs = true;
    break;
  case stream_case::comma_delimited_small:
    use_comma = true;
    small_docs = true;
    break;
  case stream_case::comma_delimited_large:
    use_comma = true;
    break;
  }
  const size_t payload_size = small_docs ? SMALL_PAYLOAD : LARGE_PAYLOAD;
  // ~48 bytes of fixed JSON structure per document; aim for TARGET_BYTES total.
  const size_t doc_count = TARGET_BYTES / (payload_size + 48);
  std::string buffer;
  buffer.reserve(doc_count * (payload_size + 64));
  for (size_t i = 0; i < doc_count; i++) {
    if (use_rs) {
      buffer += char(0x1E);
    }
    if (use_comma && i > 0) {
      buffer += ',';
    }
    buffer += make_document(i, payload_size);
    if (!use_comma) {
      buffer += '\n';
    }
  }
  return {padded_string(buffer), doc_count};
}

// Return the cached dataset for `which`, building it on first use.
//
// NOTE: the previous version declared all six static datasets at the top of
// the function body, so the *first* call eagerly built every dataset
// (6 x ~TARGET_BYTES, roughly 770 MB of generation work and resident memory)
// even when a benchmark run only touched one of them. Placing each static
// inside its own switch case keeps initialization lazy: only the datasets
// actually requested are ever built.
const stream_dataset &get_dataset(stream_case which) {
  switch (which) {
  case stream_case::ndjson_small: {
    static const stream_dataset dataset =
        build_dataset(stream_case::ndjson_small);
    return dataset;
  }
  case stream_case::ndjson_large: {
    static const stream_dataset dataset =
        build_dataset(stream_case::ndjson_large);
    return dataset;
  }
  case stream_case::rfc7464_small: {
    static const stream_dataset dataset =
        build_dataset(stream_case::rfc7464_small);
    return dataset;
  }
  case stream_case::rfc7464_large: {
    static const stream_dataset dataset =
        build_dataset(stream_case::rfc7464_large);
    return dataset;
  }
  case stream_case::comma_delimited_small: {
    static const stream_dataset dataset =
        build_dataset(stream_case::comma_delimited_small);
    return dataset;
  }
  case stream_case::comma_delimited_large: {
    static const stream_dataset dataset =
        build_dataset(stream_case::comma_delimited_large);
    return dataset;
  }
  }
  // Unreachable for valid enum values; fall back to ndjson_small as before.
  return get_dataset(stream_case::ndjson_small);
}

// Report throughput counters: bytes of input and documents processed
// across all iterations of the benchmark.
void set_counters(benchmark::State &state, const stream_dataset &dataset) {
  const int64_t iters = int64_t(state.iterations());
  state.SetBytesProcessed(iters * int64_t(dataset.json.size()));
  state.SetItemsProcessed(iters * int64_t(dataset.count));
}

// Benchmark On-Demand iterate_many over the dataset selected by `which`,
// summing every document's "id" field so the parse cannot be optimized away.
// `threaded` toggles the parser's worker-thread stage for comparison runs.
template <stream_case which, bool threaded = true>
static void bench_ondemand(benchmark::State &state) {
  constexpr bool rfc_input = which == stream_case::rfc7464_small ||
                             which == stream_case::rfc7464_large;
  constexpr bool comma_input = which == stream_case::comma_delimited_small ||
                               which == stream_case::comma_delimited_large;
  constexpr stream_format format =
      rfc_input ? stream_format::json_sequence
                : (comma_input ? stream_format::comma_delimited
                               : stream_format::whitespace_delimited);
  const stream_dataset &dataset = get_dataset(which);
  ondemand::parser parser;
  parser.threaded = threaded;
  for (auto _ : state) {
    ondemand::document_stream docs;
    auto err = parser.iterate_many(dataset.json, BATCH_SIZE, format).get(docs);
    if (err) {
      state.SkipWithError(error_message(err));
      return;
    }
    uint64_t id_sum = 0;
    for (auto doc : docs) {
      ondemand::object obj;
      if ((err = doc.get_object().get(obj))) {
        state.SkipWithError(error_message(err));
        return;
      }
      uint64_t id;
      if ((err = obj["id"].get_uint64().get(id))) {
        state.SkipWithError(error_message(err));
        return;
      }
      id_sum += id;
    }
    benchmark::DoNotOptimize(id_sum);
  }
  set_counters(state, dataset);
}

// Benchmark DOM parse_many over the dataset selected by `which`, summing
// every document's "id" field so the parse cannot be optimized away.
template <stream_case which>
static void bench_dom(benchmark::State &state) {
  constexpr bool rfc_input = which == stream_case::rfc7464_small ||
                             which == stream_case::rfc7464_large;
  constexpr bool comma_input = which == stream_case::comma_delimited_small ||
                               which == stream_case::comma_delimited_large;
  constexpr stream_format format =
      rfc_input ? stream_format::json_sequence
                : (comma_input ? stream_format::comma_delimited
                               : stream_format::whitespace_delimited);
  const stream_dataset &dataset = get_dataset(which);
  dom::parser parser;
  parser.threaded = true;
  for (auto _ : state) {
    dom::document_stream docs;
    auto err = parser.parse_many(dataset.json, BATCH_SIZE, format).get(docs);
    if (err) {
      state.SkipWithError(error_message(err));
      return;
    }
    uint64_t id_sum = 0;
    for (auto doc : docs) {
      uint64_t id;
      if ((err = doc["id"].get(id))) {
        state.SkipWithError(error_message(err));
        return;
      }
      id_sum += id;
    }
    benchmark::DoNotOptimize(id_sum);
  }
  set_counters(state, dataset);
}

} // namespace

// On-Demand benchmarks, one per (format, payload size) pair.
// UseRealTime() is used because the threaded document_stream performs work on
// background threads; DisplayAggregatesOnly keeps repeated-run output terse.
BENCHMARK(bench_ondemand<stream_case::ndjson_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::ndjson_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::rfc7464_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::rfc7464_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::comma_delimited_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::comma_delimited_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
// Non-threaded comma_delimited for comparison
BENCHMARK(bench_ondemand<stream_case::comma_delimited_small, false>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_ondemand<stream_case::comma_delimited_large, false>)
->UseRealTime()
->DisplayAggregatesOnly(true);

// DOM benchmarks over the same datasets, for cross-API comparison.
BENCHMARK(bench_dom<stream_case::ndjson_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::ndjson_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::rfc7464_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::rfc7464_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::comma_delimited_small>)
->UseRealTime()
->DisplayAggregatesOnly(true);
BENCHMARK(bench_dom<stream_case::comma_delimited_large>)
->UseRealTime()
->DisplayAggregatesOnly(true);

BENCHMARK_MAIN();
110 changes: 84 additions & 26 deletions doc/iterate_many.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ E.g., `[1,2]{"32":1}` is recognized as two documents.
Some official formats **(non-exhaustive list)**:
- [Newline-Delimited JSON (NDJSON)](https://github.com/ndjson/ndjson-spec/)
- [JSON lines (JSONL)](http://jsonlines.org/)
- [Record separator-delimited JSON (RFC 7464)](https://tools.ietf.org/html/rfc7464) <- Not supported by simdjson!
- [Record separator-delimited JSON (RFC 7464)](https://tools.ietf.org/html/rfc7464)
- [More on Wikipedia...](https://en.wikipedia.org/wiki/JSON_streaming)

API
Expand Down Expand Up @@ -278,39 +278,97 @@ Importantly, you should only call `truncated_bytes()` after iterating through al
Comma-separated documents
-----------

We also support comma-separated documents, but with some performance limitations. The `iterate_many` function takes in an option to allow parsing of comma-separated documents (which defaults to false). In this mode, the entire buffer is processed in one batch. Therefore, the total size of the document should not exceed the maximal capacity of the parser (4 GB). This mode also effectively disallows multithreading. It is therefore mostly suitable for inputs that are not very large. In this mode, the batch_size parameter
is effectively ignored, as it is set to at least the document size.
To parse comma-separated documents like `{"a":1},{"b":2},{"c":3}`, use the `stream_format::comma_delimited` parameter:

Example:
```cpp
auto json = R"({"a":1},{"b":2},{"c":3})"_padded;
ondemand::parser parser;
ondemand::document_stream stream;
auto error = parser.iterate_many(json, ondemand::DEFAULT_BATCH_SIZE,
simdjson::stream_format::comma_delimited).get(stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : stream) {
std::cout << doc << std::endl;
}
// Prints: {"a":1}
// {"b":2}
// {"c":3}
```

Whitespace around the commas is allowed:
```cpp
auto json = R"({"a":1} , {"b":2} , {"c":3})"_padded; // Also works
ondemand::parser parser;
ondemand::document_stream doc_stream;
auto error = parser.iterate_many(json, ondemand::DEFAULT_BATCH_SIZE,
                                 simdjson::stream_format::comma_delimited).get(doc_stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : doc_stream) {
  std::cout << doc << std::endl;
}
```

Nested commas inside objects and arrays are preserved:
```cpp
auto json = R"({"arr":[1,2,3]},{"obj":{"x":1,"y":2}})"_padded;
// Correctly parses as 2 documents, not 6
```

Mixed document types are supported:
```cpp
auto json = R"(1, 2, 3, 4, "a", "b", "c", {"hello": "world"}, [1, 2, 3])"_padded;
ondemand::parser parser;
ondemand::document_stream doc_stream;
auto error = parser.iterate_many(json, ondemand::DEFAULT_BATCH_SIZE,
simdjson::stream_format::comma_delimited).get(doc_stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : doc_stream) {
std::cout << doc.type() << std::endl;
}
// Prints: number number number number string string string object array
```

Extra top-level separators are tolerated for compatibility with the legacy
`allow_comma_separated` behavior. For example, leading commas, trailing commas,
and repeated commas are ignored rather than being reported as empty documents.

### Legacy `allow_comma_separated` parameter (deprecated)

The `allow_comma_separated` boolean parameter is deprecated. When set to `true`, it now internally maps to `stream_format::comma_delimited`.

The old single-batch limitation no longer applies: comma-delimited parsing now supports multi-batch processing and threading for optimal performance on large files.

JSON Text Sequences (RFC 7464)
------------------------------

[RFC 7464](https://tools.ietf.org/html/rfc7464) defines a format for streaming JSON values using ASCII Record Separator (RS, 0x1E) as a delimiter. Each JSON text is preceded by RS and optionally followed by ASCII Line Feed (LF, 0x0A).

Example input:
```
<RS>{"name":"doc1"}<LF>
<RS>{"name":"doc2"}<LF>
<RS>{"name":"doc3"}<LF>
```

To parse JSON text sequences, use the `stream_format::json_sequence` parameter:

```cpp
// Build input with RS (0x1E) and LF (0x0A) delimiters
std::string input_str;
input_str += '\x1e'; input_str += "{\"a\":1}"; input_str += '\x0a';
input_str += '\x1e'; input_str += "{\"b\":2}"; input_str += '\x0a';
input_str += '\x1e'; input_str += "{\"c\":3}"; input_str += '\x0a';
simdjson::padded_string input(input_str);

ondemand::parser parser;
ondemand::document_stream stream;
auto error = parser.iterate_many(input, ondemand::DEFAULT_BATCH_SIZE,
simdjson::stream_format::json_sequence).get(stream);
if (error) { std::cerr << error << std::endl; return; }
for (auto doc : stream) {
std::cout << doc << std::endl;
}
```

The `stream_format` enum has the following values:
- `stream_format::whitespace_delimited` (default): Standard NDJSON/JSON Lines format
- `stream_format::json_sequence`: RFC 7464 format with RS delimiters
- `stream_format::comma_delimited`: Comma-separated JSON documents

The trailing LF after each JSON text is optional but recommended by the RFC for robustness.


C++20 features
--------------------
Expand Down
Loading
Loading