Skip to content

Commit 7a69da1

Browse files
lemireDaniel Lemire
andauthored
Fixing issue 906 (simdjson#912)
* Fixing issue 906 * Safe patching. * Now with explanations. * Bumping up memory allocation. * Putting the patch back. * fallback fixes. Co-authored-by: Daniel Lemire <lemire@gmai.com>
1 parent 3517174 commit 7a69da1

File tree

9 files changed

+66
-5
lines changed

9 files changed

+66
-5
lines changed

include/simdjson/inline/document.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ inline error_code document::allocate(size_t capacity) noexcept {
3333
// worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"
3434
//where len + 1 tape elements are
3535
// generated, see issue https://github.com/lemire/simdjson/issues/345
36-
size_t tape_capacity = ROUNDUP_N(capacity + 2, 64);
36+
size_t tape_capacity = ROUNDUP_N(capacity + 3, 64);
3737
// a document with only zero-length strings... could have len/3 string
3838
// and we would need len/3 * 5 bytes on the string buffer
3939
size_t string_capacity = ROUNDUP_N(5 * capacity / 3 + 32, 64);

src/arm64/dom_parser_implementation.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ namespace arm64 {
112112
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
113113
error_code err = stage1(_buf, _len, false);
114114
if (err) { return err; }
115+
err = check_for_unclosed_array();
116+
if (err) { return err; }
115117
return stage2(_doc);
116118
}
117119

src/fallback/dom_parser_implementation.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ really_inline error_code scan() {
133133
}
134134
*next_structural_index = len;
135135
next_structural_index++;
136+
// We pad beyond.
137+
// https://github.com/simdjson/simdjson/issues/906
138+
next_structural_index[0] = len;
139+
next_structural_index[1] = 0;
136140
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
137141
return error;
138142
}
@@ -234,6 +238,8 @@ namespace fallback {
234238
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
235239
error_code err = stage1(_buf, _len, false);
236240
if (err) { return err; }
241+
err = check_for_unclosed_array();
242+
if (err) { return err; }
237243
return stage2(_doc);
238244
}
239245

src/generic/dom_parser_implementation.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class dom_parser_implementation final : public internal::dom_parser_implementati
3131

3232
WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
3333
WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept final;
34+
WARN_UNUSED error_code check_for_unclosed_array() noexcept;
3435
WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
3536
WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::document &doc, size_t &next_json) noexcept final;
3637
WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
@@ -56,3 +57,22 @@ WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth
5657
_max_depth = max_depth;
5758
return SUCCESS;
5859
}
60+
61+
62+
WARN_UNUSED error_code dom_parser_implementation::check_for_unclosed_array() noexcept {
63+
// Before we engage stage 2, we want to make sure there is no risk that we could end with [ and
64+
// loop back at the start with [. That is, we want to make sure that if the first character is [, then
65+
// the last one is ].
66+
// See https://github.com/simdjson/simdjson/issues/906 for details.
67+
if(n_structural_indexes < 2) {
68+
return UNEXPECTED_ERROR;
69+
}
70+
const size_t first_index = structural_indexes[0];
71+
const size_t last_index = structural_indexes[n_structural_indexes - 2];
72+
const char first_character = char(buf[first_index]);
73+
const char last_character = char(buf[last_index]);
74+
if((first_character == '[') and (last_character != ']')) {
75+
return TAPE_ERROR;
76+
}
77+
return SUCCESS;
78+
}

src/generic/stage1/json_structural_indexer.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,22 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
109109
parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
110110
}
111111
/* make it safe to dereference one beyond this array */
112-
parser.structural_indexes[parser.n_structural_indexes] = 0;
112+
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
113+
parser.structural_indexes[parser.n_structural_indexes + 1] = 0;
114+
/***
115+
* This is related to https://github.com/simdjson/simdjson/issues/906
116+
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
117+
* structural character, it quickly stops.
118+
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
119+
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
120+
* continues, then it must be [,] or }.
121+
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
122+
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
123+
* It can't be a comma, a colon or any simple value. So the only way we could continue is
124+
* if the repeated character is [. But if so, the document must start with [. But if the document
125+
* starts with [, it should end with ]. If we enforce that rule, then we would get
126+
* ][[ which is invalid.
127+
**/
113128
return checker.errors();
114129
}
115130

src/generic/stage2/logger.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ namespace logger {
77
static constexpr const int LOG_EVENT_LEN = 30;
88
static constexpr const int LOG_BUFFER_LEN = 20;
99
static constexpr const int LOG_DETAIL_LEN = 50;
10+
static constexpr const int LOG_INDEX_LEN = 10;
1011

1112
static int log_depth; // Not threadsafe. Log only.
1213

@@ -24,8 +25,8 @@ namespace logger {
2425
if (LOG_ENABLED) {
2526
log_depth = 0;
2627
printf("\n");
27-
printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", LOG_DETAIL_LEN, "Detail");
28-
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES);
28+
printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
29+
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
2930
}
3031
}
3132

@@ -57,6 +58,7 @@ namespace logger {
5758
printf("| %c ", printable_char(structurals.peek_char()));
5859
printf("| %5zd ", structurals.next_structural);
5960
printf("| %-*s ", LOG_DETAIL_LEN, detail);
61+
printf("| %*zu ", LOG_INDEX_LEN, structurals.idx);
6062
printf("|\n");
6163
}
6264
}

src/haswell/dom_parser_implementation.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ namespace haswell {
101101
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
102102
error_code err = stage1(_buf, _len, false);
103103
if (err) { return err; }
104+
err = check_for_unclosed_array();
105+
if (err) { return err; }
104106
return stage2(_doc);
105107
}
106108

src/westmere/dom_parser_implementation.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ namespace westmere {
102102
WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
103103
error_code err = stage1(_buf, _len, false);
104104
if (err) { return err; }
105+
err = check_for_unclosed_array();
106+
if (err) { return err; }
105107
return stage2(_doc);
106108
}
107109

tests/basictests.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,17 @@ namespace document_tests {
233233
}
234234
return true;
235235
}
236+
bool padded_with_open_bracket() {
237+
std::cout << __func__ << std::endl;
238+
simdjson::dom::parser parser;
239+
// This is an invalid document padded with open braces.
240+
auto error1 = parser.parse("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[", 2, false).error();
241+
if (!error1) { std::cerr << "We expected an error but got: " << error1 << std::endl; return false; }
242+
// This is a valid document padded with open braces.
243+
auto error2 = parser.parse("[][[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[", 2, false).error();
244+
if (error2) { std::cerr << "Error: " << error2 << std::endl; return false; }
245+
return true;
246+
}
236247
// returns true if successful
237248
bool stable_test() {
238249
std::cout << __func__ << std::endl;
@@ -340,7 +351,8 @@ namespace document_tests {
340351
return true;
341352
}
342353
bool run() {
343-
return bad_example() &&
354+
return padded_with_open_bracket() &&
355+
bad_example() &&
344356
count_array_example() &&
345357
count_object_example() &&
346358
stable_test() &&

0 commit comments

Comments
 (0)