Skip to content

Commit df8f792

Browse files
authored
Store the string lengths on the string tape (simdjson#101)
* Store string length in the string-tape item. * Files are now limited to 4GB. * Moving detection of unescaped chars to stage 1 to reduce the burden due to string parsing. Fixes simdjson#114 Fixes simdjson#87
1 parent 609e96b commit df8f792

File tree

12 files changed

+293
-162
lines changed

12 files changed

+293
-162
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,13 @@ jsonstats: tools/jsonstats.cpp $(HEADERS) $(LIBFILES)
138138
ujdecode.o: $(UJSON4C_INCLUDE)
139139
$(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c
140140

141-
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES)
141+
parseandstatcompetition: benchmark/parseandstatcompetition.cpp $(HEADERS) $(LIBFILES) $(LIBS)
142142
$(CXX) $(CXXFLAGS) -o parseandstatcompetition $(LIBFILES) benchmark/parseandstatcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
143143

144-
distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES)
144+
distinctuseridcompetition: benchmark/distinctuseridcompetition.cpp $(HEADERS) $(LIBFILES) $(LIBS)
145145
$(CXX) $(CXXFLAGS) -o distinctuseridcompetition $(LIBFILES) benchmark/distinctuseridcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
146146

147-
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES)
147+
parsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(LIBS)
148148
$(CXX) $(CXXFLAGS) -o parsingcompetition $(LIBFILES) benchmark/parsingcompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
149149

150150
allparsingcompetition: benchmark/parsingcompetition.cpp $(HEADERS) $(LIBFILES) $(EXTRAOBJECTS) $(LIBS)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ The parser builds a useful immutable (read-only) DOM (document-object model) whi
263263
To simplify the engineering, we make some assumptions.
264264
265265
- We support UTF-8 (and thus ASCII), nothing else (no Latin, no UTF-16). We do not believe that this is a genuine limitation in the sense that we do not think that there is any serious application that needs to process JSON data without an ASCII or UTF-8 encoding.
266-
- We store strings as NULL terminated C strings. Thus we implicitly assume that you do not include a NULL character within your string, which is allowed technically speaking if you escape it (\u0000).
266+
- All strings in the JSON document may have up to 4294967295 bytes in UTF-8 (4GB). To enforce this constraint, we refuse to parse a document that contains more than 4294967295 bytes (4GB). This should accommodate most JSON documents.
267267
- We assume AVX2 support which is available in all recent mainstream x86 processors produced by AMD and Intel. No support for non-x86 processors is included though it can be done. We plan to support ARM processors (help is invited).
268268
- In cases of failure, we just report a failure without any indication as to the nature of the problem. (This can be easily improved without affecting performance.)
269269
- As allowed by the specification, we allow repeated keys within an object (other parsers like sajson do the same).

benchmark/parsingcompetition.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ int main(int argc, char *argv[]) {
120120
if(!justdata) BEST_TIME("simdjson (dynamic mem) ", build_parsed_json(p).isValid(), true, ,
121121
repeat, volume, !justdata);
122122
// (static alloc)
123-
BEST_TIME("simdjson ", json_parse(p, pj), true, , repeat,
123+
BEST_TIME("simdjson ", json_parse(p, pj), simdjson::SUCCESS, , repeat,
124124
volume, !justdata);
125125

126126

include/simdjson/common_defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55

66
#include <cassert>
77

8+
// we support documents up to 4GB
9+
#define SIMDJSON_MAXSIZE_BYTES 0xFFFFFFFF
10+
811
// the input buf should be readable up to buf + SIMDJSON_PADDING
912
#define SIMDJSON_PADDING sizeof(__m256i)
1013

include/simdjson/jsonformatutils.h

Lines changed: 173 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -5,87 +5,183 @@
55
#include <iomanip>
66
#include <iostream>
77

8+
// ends with zero char
89
static inline void print_with_escapes(const unsigned char *src) {
9-
while (*src != 0u) {
10+
while (*src) {
1011
switch (*src) {
11-
case '\b':
12-
putchar('\\');
13-
putchar('b');
14-
break;
15-
case '\f':
16-
putchar('\\');
17-
putchar('f');
18-
break;
19-
case '\n':
20-
putchar('\\');
21-
putchar('n');
22-
break;
23-
case '\r':
24-
putchar('\\');
25-
putchar('r');
26-
break;
27-
case '\"':
28-
putchar('\\');
29-
putchar('"');
30-
break;
31-
case '\t':
32-
putchar('\\');
33-
putchar('t');
34-
break;
35-
case '\\':
36-
putchar('\\');
37-
putchar('\\');
38-
break;
39-
default:
40-
if (*src <= 0x1F) {
41-
printf("\\u%04x", *src);
42-
} else {
43-
putchar(*src);
12+
case '\b':
13+
putchar('\\');
14+
putchar('b');
15+
break;
16+
case '\f':
17+
putchar('\\');
18+
putchar('f');
19+
break;
20+
case '\n':
21+
putchar('\\');
22+
putchar('n');
23+
break;
24+
case '\r':
25+
putchar('\\');
26+
putchar('r');
27+
break;
28+
case '\"':
29+
putchar('\\');
30+
putchar('"');
31+
break;
32+
case '\t':
33+
putchar('\\');
34+
putchar('t');
35+
break;
36+
case '\\':
37+
putchar('\\');
38+
putchar('\\');
39+
break;
40+
default:
41+
if (*src <= 0x1F) {
42+
printf("\\u%04x", *src);
43+
} else {
44+
putchar(*src);
45+
}
46+
}
47+
src++;
48+
}
4449
}
50+
51+
// ends with zero char
52+
// Writes the zero-terminated byte string `src` to `os`, escaping it so that
// the output can be placed between double quotes in a JSON document.
//
//  - `src` must end with a zero byte; that byte marks the end of the input
//    and is not written, so this overload cannot print embedded NUL chars.
//  - '\b', '\f', '\n', '\r', '\t', '"' and '\\' are written as their
//    two-character backslash escapes.
//  - Any other byte <= 0x1F is written as \u00XX (four lowercase hex
//    digits), matching the printf("\\u%04x") based overloads.
//  - Every other byte is copied through unchanged.
//
// The stream's format flags and fill character are left as they were on
// entry.
static inline void print_with_escapes(const unsigned char *src,
                                      std::ostream &os) {
  while (*src) {
    switch (*src) {
    case '\b':
      os << '\\';
      os << 'b';
      break;
    case '\f':
      os << '\\';
      os << 'f';
      break;
    case '\n':
      os << '\\';
      os << 'n';
      break;
    case '\r':
      os << '\\';
      os << 'r';
      break;
    case '\"':
      os << '\\';
      os << '"';
      break;
    case '\t':
      os << '\\';
      os << 't';
      break;
    case '\\':
      os << '\\';
      os << '\\';
      break;
    default:
      if (*src <= 0x1F) {
        // flags() does not cover the fill character, so remember both and
        // put both back once the escape has been written.
        const std::ios::fmtflags saved_flags(os.flags());
        const char saved_fill = os.fill();
        // Force right-adjusted, prefix-free, lowercase hex so the caller's
        // stream settings cannot corrupt the four-digit field.
        os << '\\' << 'u' << std::right << std::noshowbase
           << std::nouppercase << std::hex << std::setw(4)
           << std::setfill('0') << static_cast<int>(*src);
        os.fill(saved_fill);
        os.flags(saved_flags);
      } else {
        os << *src;
      }
    }
    src++;
  }
}
4997

50-
static inline void print_with_escapes(const unsigned char *src, std::ostream &os) {
51-
while (*src != 0u) {
98+
// print len chars
99+
static inline void print_with_escapes(const unsigned char *src, size_t len) {
100+
const unsigned char *finalsrc = src + len;
101+
while (src < finalsrc) {
52102
switch (*src) {
53-
case '\b':
54-
os << '\\';
55-
os << 'b';
56-
break;
57-
case '\f':
58-
os << '\\';
59-
os << 'f';
60-
break;
61-
case '\n':
62-
os << '\\';
63-
os << 'n';
64-
break;
65-
case '\r':
66-
os << '\\';
67-
os << 'r';
68-
break;
69-
case '\"':
70-
os << '\\';
71-
os << '"';
72-
break;
73-
case '\t':
74-
os << '\\';
75-
os << 't';
76-
break;
77-
case '\\':
78-
os << '\\';
79-
os << '\\';
80-
break;
81-
default:
82-
if (*src <= 0x1F) {
83-
std::ios::fmtflags f(os.flags());
84-
os << std::hex << std::setw(4) << std::setfill('0') << static_cast<int>(*src);
85-
os.flags(f);
86-
} else {
87-
os << *src;
103+
case '\b':
104+
putchar('\\');
105+
putchar('b');
106+
break;
107+
case '\f':
108+
putchar('\\');
109+
putchar('f');
110+
break;
111+
case '\n':
112+
putchar('\\');
113+
putchar('n');
114+
break;
115+
case '\r':
116+
putchar('\\');
117+
putchar('r');
118+
break;
119+
case '\"':
120+
putchar('\\');
121+
putchar('"');
122+
break;
123+
case '\t':
124+
putchar('\\');
125+
putchar('t');
126+
break;
127+
case '\\':
128+
putchar('\\');
129+
putchar('\\');
130+
break;
131+
default:
132+
if (*src <= 0x1F) {
133+
printf("\\u%04x", *src);
134+
} else {
135+
putchar(*src);
136+
}
137+
}
138+
src++;
139+
}
88140
}
141+
142+
// print len chars
143+
static inline void print_with_escapes(const unsigned char *src,
144+
std::ostream &os, size_t len) {
145+
const unsigned char *finalsrc = src + len;
146+
while (src < finalsrc) {
147+
switch (*src) {
148+
case '\b':
149+
os << '\\';
150+
os << 'b';
151+
break;
152+
case '\f':
153+
os << '\\';
154+
os << 'f';
155+
break;
156+
case '\n':
157+
os << '\\';
158+
os << 'n';
159+
break;
160+
case '\r':
161+
os << '\\';
162+
os << 'r';
163+
break;
164+
case '\"':
165+
os << '\\';
166+
os << '"';
167+
break;
168+
case '\t':
169+
os << '\\';
170+
os << 't';
171+
break;
172+
case '\\':
173+
os << '\\';
174+
os << '\\';
175+
break;
176+
default:
177+
if (*src <= 0x1F) {
178+
std::ios::fmtflags f(os.flags());
179+
os << std::hex << std::setw(4) << std::setfill('0')
180+
<< static_cast<int>(*src);
181+
os.flags(f);
182+
} else {
183+
os << *src;
184+
}
89185
}
90186
src++;
91187
}
@@ -95,4 +191,10 @@ static inline void print_with_escapes(const char *src, std::ostream &os) {
95191
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os);
96192
}
97193

194+
static inline void print_with_escapes(const char *src, std::ostream &os,
195+
size_t len) {
196+
print_with_escapes(reinterpret_cast<const unsigned char *>(src), os, len);
197+
}
198+
199+
#
98200
#endif

include/simdjson/jsonparser.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ WARN_UNUSED
2020
int json_parse(const uint8_t *buf, size_t len, ParsedJson &pj, bool reallocifneeded = true);
2121

2222
// Parse a document found in buf, need to preallocate ParsedJson.
23-
// Return false in case of a failure. You can also check validity
23+
// Return SUCCESS (an integer = 1) in case of a success. You can also check validity
2424
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
2525
//
2626
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing
@@ -33,7 +33,7 @@ inline int json_parse(const char * buf, size_t len, ParsedJson &pj, bool realloc
3333
}
3434

3535
// Parse a document found in buf, need to preallocate ParsedJson.
36-
// Return false in case of a failure. You can also check validity
36+
// Return SUCCESS (an integer = 1) in case of a success. You can also check validity
3737
// by calling pj.isValid(). The same ParsedJson can be reused for other documents.
3838
//
3939
// If reallocifneeded is true (default) then a temporary buffer is created when needed during processing

include/simdjson/parsedjson.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,12 @@ struct ParsedJson {
125125
// get the string value at this node (NULL ended); valid only if we're at "
126126
// note that tabs, and line endings are escaped in the returned value (see print_with_escapes)
127127
// return value is valid UTF-8
128+
// It may contain NULL chars within the string: get_string_length determines the true
129+
// string length.
128130
const char * get_string() const;
129131

132+
uint32_t get_string_length() const;
133+
130134
// get the double value at this node; valid only if
131135
// we're at "d"
132136
double get_double() const;
@@ -149,6 +153,9 @@ struct ParsedJson {
149153
// if successful, we are left pointing at the value,
150154
// if not, we are still pointing at the object ({)
151155
// (in case of repeated keys, this only finds the first one)
156+
// We seek the key using C's strcmp so if your JSON strings contain
157+
// NULL chars, this would trigger a false positive: if you expect that
158+
// to be the case, take extra precautions.
152159
bool move_to_key(const char * key);
153160

154161
// throughout return true if we can do the navigation, false

0 commit comments

Comments
 (0)