Skip to content

Commit d7f7f1b

Browse files
authored
Fixing issue. (simdjson#193)
1 parent 8914b12 commit d7f7f1b

17 files changed

+144
-55
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ padded_string p = get_corpus(filename);
9292
ParsedJson pj = build_parsed_json(p); // do the parsing
9393
if( ! pj.isValid() ) {
9494
// something went wrong
95+
std::cout << pj.getErrorMsg() << std::endl;
9596
}
9697
```
9798

@@ -127,6 +128,7 @@ std::string mystring = ... //
127128
ParsedJson pj = build_parsed_json(mystring); // do the parsing
128129
if( ! pj.isValid() ) {
129130
// something went wrong
131+
std::cout << pj.getErrorMsg() << std::endl;
130132
}
131133
```
132134

@@ -148,6 +150,7 @@ int main(int argc, char *argv[]) {
148150
ParsedJson pj = build_parsed_json(p); // do the parsing
149151
if( ! pj.isValid() ) {
150152
std::cout << "not valid" << std::endl;
153+
std::cout << pj.getErrorMsg() << std::endl;
151154
} else {
152155
std::cout << "valid" << std::endl;
153156
}

benchmark/parse.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
144144
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
145145
}
146146
unified.start();
147-
isok = find_structural_bits(p.data(), p.size(), pj);
147+
isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
148148
unified.end(results);
149149
cy1 += results[0];
150150
cl1 += results[1];
@@ -185,18 +185,20 @@ int main(int argc, char *argv[]) {
185185
}
186186

187187
auto start = std::chrono::steady_clock::now();
188-
isok = find_structural_bits(p.data(), p.size(), pj);
188+
isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
189189
isok = isok && (simdjson::SUCCESS == unified_machine(p.data(), p.size(), pj));
190190
auto end = std::chrono::steady_clock::now();
191191
std::chrono::duration<double> secs = end - start;
192192
res[i] = secs.count();
193193
if(! isok) {
194+
std::cerr << pj.getErrorMsg() << std::endl;
194195
std::cerr << "Could not parse. " << std::endl;
195196
return EXIT_FAILURE;
196197
}
197198
}
198199
ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
199200
if (!pj.isValid()) {
201+
std::cerr << pj.getErrorMsg() << std::endl;
200202
std::cerr << "Could not parse. " << std::endl;
201203
return EXIT_FAILURE;
202204
}

benchmark/statisticalmodel.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ int main(int argc, char *argv[]) {
180180
results.resize(evts.size());
181181
for (uint32_t i = 0; i < iterations; i++) {
182182
unified.start();
183-
bool isok = find_structural_bits(p.data(), p.size(), pj);
183+
bool isok = (find_structural_bits(p.data(), p.size(), pj) == simdjson::SUCCESS);
184184
unified.end(results);
185185

186186
cy1 += results[0];

include/simdjson/jsoncharutils.h

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
// these are the chars that can follow a true/false/null or number atom
1313
// and nothing else
14-
const uint32_t structural_or_whitespace_negated[256] = {
14+
const uint32_t structural_or_whitespace_or_null_negated[256] = {
1515
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1616
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1717
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
@@ -28,13 +28,37 @@ const uint32_t structural_or_whitespace_negated[256] = {
2828
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2929
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
3030

31+
// return non-zero if not a structural, whitespace, or null char
32+
// zero otherwise
33+
really_inline uint32_t is_not_structural_or_whitespace_or_null(uint8_t c) {
34+
return structural_or_whitespace_or_null_negated[c];
35+
}
36+
37+
38+
const uint32_t structural_or_whitespace_negated[256] = {
39+
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
40+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41+
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
42+
43+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44+
1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
45+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
46+
47+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
48+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50+
51+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
54+
3155
// return non-zero if not a structural or whitespace char
3256
// zero otherwise
3357
really_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
3458
return structural_or_whitespace_negated[c];
3559
}
3660

37-
const uint32_t structural_or_whitespace[256] = {
61+
const uint32_t structural_or_whitespace_or_null[256] = {
3862
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3963
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
4064
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -47,6 +71,24 @@ const uint32_t structural_or_whitespace[256] = {
4771
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4872
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
4973

74+
really_inline uint32_t is_structural_or_whitespace_or_null(uint8_t c) {
75+
return structural_or_whitespace_or_null[c];
76+
}
77+
78+
79+
const uint32_t structural_or_whitespace[256] = {
80+
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
81+
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
82+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
83+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
84+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85+
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
86+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
90+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
91+
5092
really_inline uint32_t is_structural_or_whitespace(uint8_t c) {
5193
return structural_or_whitespace[c];
5294
}

include/simdjson/numberparsing.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static inline bool is_integer(char c) {
9090
// probably frequent and it is harder than it looks. We are building all of this
9191
// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
9292
const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
93-
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
93+
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
9494
1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
9595
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
9696
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
@@ -103,7 +103,7 @@ const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
103103
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
104104

105105
really_inline bool
106-
is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(unsigned char c) {
106+
is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
107107
return structural_or_whitespace_or_exponent_or_decimal_negated[c];
108108
}
109109

@@ -380,6 +380,12 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
380380

381381
// parse the number at buf + offset
382382
// define JSON_TEST_NUMBERS for unit testing
383+
//
384+
// It is assumed that the number is followed by a structural ({, }, [, ], :, or ,) character
385+
// or a white space character. If that is not the case (e.g., when the JSON document
386+
// is made of a single number), then it is necessary to copy the content and append
387+
// a space before calling this function.
388+
//
383389
static really_inline bool parse_number(const uint8_t *const buf,
384390
ParsedJson &pj,
385391
const uint32_t offset,
@@ -405,7 +411,7 @@ static really_inline bool parse_number(const uint8_t *const buf,
405411
uint64_t i; // an unsigned int avoids signed overflows (which are bad)
406412
if (*p == '0') { // 0 cannot be followed by an integer
407413
++p;
408-
if (is_not_structural_or_whitespace_or_exponent_or_decimal_or_null(*p)) {
414+
if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
409415
#ifdef JSON_TEST_NUMBERS // for unit testing
410416
foundInvalidNumber(buf + offset);
411417
#endif
@@ -430,7 +436,6 @@ static really_inline bool parse_number(const uint8_t *const buf,
430436
++p;
431437
}
432438
}
433-
434439
int64_t exponent = 0;
435440
bool is_float = false;
436441
if ('.' == *p) {

include/simdjson/parsedjson.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#include <cstring>
77
#include <iomanip>
88
#include <iostream>
9-
9+
#include "simdjson/simdjson.h"
1010
#include "simdjson/common_defs.h"
1111
#include "simdjson/jsonformatutils.h"
1212
#include "simdjson/portability.h"
@@ -34,8 +34,16 @@ struct ParsedJson {
3434
WARN_UNUSED
3535
bool allocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH);
3636

37+
// returns true if the document parsed was valid
3738
bool isValid() const;
3839

40+
// return an error code corresponding to the last parsing attempt, see simdjson.h
41+
// will return simdjson::UNITIALIZED if no parsing was attempted
42+
int getErrorCode() const;
43+
44+
// return the string equivalent of "getErrorCode"
45+
std::string getErrorMsg() const;
46+
3947
// deallocate memory and set capacity to zero, called automatically by the
4048
// destructor
4149
void deallocate();
@@ -297,6 +305,7 @@ struct ParsedJson {
297305
uint8_t *string_buf; // should be at least bytecapacity
298306
uint8_t *current_string_buf_loc;
299307
bool isvalid{false};
308+
int errorcode{simdjson::UNITIALIZED};
300309

301310
private :
302311

include/simdjson/simdjson.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@ struct simdjson {
1515
F_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'f'
1616
N_ATOM_ERROR, // Problem while parsing an atom starting with the letter 'n'
1717
NUMBER_ERROR, // Problem while parsing a number
18-
UTF8_ERROR // the input is not valid UTF-8
18+
UTF8_ERROR, // the input is not valid UTF-8
19+
UNITIALIZED, // unknown error, or uninitialized document
20+
EMPTY, // no structural document found
21+
UNESCAPED_CHARS, // found unescaped characters in a string.
22+
UNCLOSED_STRING, // missing quote at the end
23+
UNEXPECTED_ERROR // indicative of a bug in simdjson
1924
};
2025
static const std::string& errorMsg(const int);
2126
};

include/simdjson/stage1_find_marks.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
struct ParsedJson;
77

88
WARN_UNUSED
9-
bool find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj);
9+
int find_structural_bits(const uint8_t *buf, size_t len, ParsedJson &pj);
1010

1111
WARN_UNUSED
12-
bool find_structural_bits(const char *buf, size_t len, ParsedJson &pj);
12+
int find_structural_bits(const char *buf, size_t len, ParsedJson &pj);
1313

1414
#endif

jsonchecker/fail66.json

3 Bytes
Binary file not shown.

jsonchecker/fail67.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
44

0 commit comments

Comments
 (0)