Skip to content

Commit a8b9998

Browse files
committed
Intermediate step.
1 parent e570733 commit a8b9998

20 files changed

+284
-249
lines changed

README.md

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,13 @@ Goal: Speed up the parsing of JSON per se.
1212
/...
1313

1414
const char * filename = ... //
15-
pair<u8 *, size_t> p = get_corpus(filename);
16-
ParsedJson *pj_ptr = allocate_ParsedJson(p.second); // allocate memory for parsing up to p.second bytes
17-
bool is_ok = json_parse(p.first, p.second, *pj_ptr); // do the parsing, return false on error
15+
simdjsonstring p = get_corpus(filename);
16+
ParsedJson pj;
17+
size_t maxdepth = 1024; // support documents with a nesting "depth" of up to 1024
18+
pj.allocateCapacity(p.size(), maxdepth); // allocate memory for parsing up to p.size() bytes
19+
bool is_ok = json_parse(p, pj); // do the parsing, return false on error
1820
// parsing is done!
19-
20-
free(p.first); // free JSON bytes, can be done right after parsing
21-
22-
23-
deallocate_ParsedJson(pj_ptr); // once you are done with pj_ptr, free JSON document; hint: you can reuse pj_ptr
21+
// pj can be reused with other json_parse calls.
2422
```
2523

2624

benchmark/minifiercompetition.cpp

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
6161
exit(1);
6262
}
6363
const char * filename = argv[optind];
64-
pair<u8 *, size_t> p;
64+
simdjsonstring p;
6565
try {
6666
p = get_corpus(filename);
6767
} catch (const std::exception& e) { // caught by reference to base
@@ -70,79 +70,76 @@ int main(int argc, char *argv[]) {
7070
}
7171
if (verbose) {
7272
std::cout << "Input has ";
73-
if (p.second > 1024 * 1024)
74-
std::cout << p.second / (1024 * 1024) << " MB ";
75-
else if (p.second > 1024)
76-
std::cout << p.second / 1024 << " KB ";
73+
if (p.size() > 1024 * 1024)
74+
std::cout << p.size() / (1024 * 1024) << " MB ";
75+
else if (p.size() > 1024)
76+
std::cout << p.size() / 1024 << " KB ";
7777
else
78-
std::cout << p.second << " B ";
78+
std::cout << p.size() << " B ";
7979
std::cout << std::endl;
8080
}
81-
char *buffer = allocate_aligned_buffer(p.second + 1);
82-
memcpy(buffer, p.first, p.second);
83-
buffer[p.second] = '\0';
81+
char *buffer = allocate_aligned_buffer(p.size() + 1);
82+
memcpy(buffer, p.c_str(), p.size());
83+
buffer[p.size()] = '\0';
8484

8585
int repeat = 10;
86-
int volume = p.second;
86+
int volume = p.size();
8787

88-
size_t strlength = rapidstringme((char *)p.first).size();
88+
size_t strlength = rapidstringme((char *)p.c_str()).size();
8989
if (verbose)
90-
std::cout << "input length is " << p.second << " stringified length is "
90+
std::cout << "input length is " << p.size() << " stringified length is "
9191
<< strlength << std::endl;
92-
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.first), , repeat, volume, true);
92+
BEST_TIME_NOCHECK("despacing with RapidJSON", rapidstringme((char *)p.c_str()), , repeat, volume, true);
9393
BEST_TIME_NOCHECK("despacing with RapidJSON Insitu", rapidstringmeInsitu((char *)buffer),
94-
memcpy(buffer, p.first, p.second), repeat, volume, true);
95-
memcpy(buffer, p.first, p.second);
94+
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
95+
memcpy(buffer, p.c_str(), p.size());
9696

9797
size_t outlength =
98-
jsonminify((const uint8_t *)buffer, p.second, (uint8_t *)buffer);
98+
jsonminify((const uint8_t *)buffer, p.size(), (uint8_t *)buffer);
9999
if (verbose)
100100
std::cout << "jsonminify length is " << outlength << std::endl;
101101

102102
uint8_t *cbuffer = (uint8_t *)buffer;
103-
BEST_TIME("jsonminify", jsonminify(cbuffer, p.second, cbuffer), outlength,
104-
memcpy(buffer, p.first, p.second), repeat, volume, true);
105-
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.second, outlength * 100.0 / p.second);
103+
BEST_TIME("jsonminify", jsonminify(cbuffer, p.size(), cbuffer), outlength,
104+
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
105+
printf("minisize = %zu, original size = %zu (minified down to %.2f percent of original) \n", outlength, p.size(), outlength * 100.0 / p.size());
106106

107107
/***
108108
* Is it worth it to minify before parsing?
109109
***/
110110
rapidjson::Document d;
111111
BEST_TIME("RapidJSON Insitu orig", d.ParseInsitu(buffer).HasParseError(), false,
112-
memcpy(buffer, p.first, p.second), repeat, volume, true);
112+
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
113113

114-
char *minibuffer = allocate_aligned_buffer(p.second + 1);
115-
size_t minisize = jsonminify((const uint8_t *)p.first, p.second, (uint8_t*) minibuffer);
114+
char *minibuffer = allocate_aligned_buffer(p.size() + 1);
115+
size_t minisize = jsonminify((const uint8_t *)p.c_str(), p.size(), (uint8_t*) minibuffer);
116116
minibuffer[minisize] = '\0';
117117

118118
BEST_TIME("RapidJSON Insitu despaced", d.ParseInsitu(buffer).HasParseError(), false,
119-
memcpy(buffer, minibuffer, p.second),
119+
memcpy(buffer, minibuffer, p.size()),
120120
repeat, volume, true);
121121

122-
size_t astbuffersize = p.second * 2;
122+
size_t astbuffersize = p.size() * 2;
123123
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
124124

125-
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
125+
BEST_TIME("sajson orig", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
126126

127127

128-
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.second), repeat, volume, true);
128+
BEST_TIME("sajson despaced", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(minisize, buffer)).is_valid(), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true);
129129

130-
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
131-
ParsedJson &pj(*pj_ptr);
132-
BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.second, pj), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
130+
ParsedJson pj;
131+
pj.allocateCapacity(p.size(), 1024);
132+
BEST_TIME("json_parse orig", json_parse((const u8*)buffer, p.size(), pj), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
133133

134-
ParsedJson *pj_ptr2 = allocate_ParsedJson(p.second, 1024);
135-
ParsedJson &pj2(*pj_ptr2);
134+
ParsedJson pj2;
135+
pj2.allocateCapacity(p.size(), 1024);
136136

137137

138-
BEST_TIME("json_parse despaced", json_parse((const u8*)buffer, minisize, pj2), true, memcpy(buffer, minibuffer, p.second), repeat, volume, true);
138+
BEST_TIME("json_parse despaced", json_parse((const u8*)buffer, minisize, pj2), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, true);
139139

140140
free(buffer);
141-
free(p.first);
142141
free(ast_buffer);
143142
free(minibuffer);
144-
deallocate_ParsedJson(pj_ptr);
145-
deallocate_ParsedJson(pj_ptr2);
146143

147144

148145
}

benchmark/parse.cpp

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,27 @@ int main(int argc, char *argv[]) {
6565
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
6666
}
6767
if(verbose) cout << "[verbose] loading " << filename << endl;
68-
pair<u8 *, size_t> p;
68+
simdjsonstring p;
6969
try {
7070
p = get_corpus(filename);
7171
} catch (const std::exception& e) { // caught by reference to base
7272
std::cout << "Could not load the file " << filename << std::endl;
7373
return EXIT_FAILURE;
7474
}
75-
if(verbose) cout << "[verbose] loaded " << filename << " ("<< p.second << " bytes)" << endl;
76-
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
77-
ParsedJson &pj(*pj_ptr);
75+
if(verbose) cout << "[verbose] loaded " << filename << " ("<< p.size() << " bytes)" << endl;
76+
ParsedJson pj;
77+
bool allocok = pj.allocateCapacity(p.size(), 1024);
78+
if(!allocok) {
79+
std::cerr << "failed to allocate memory" << std::endl;
80+
return EXIT_FAILURE;
81+
}
82+
7883
if(verbose) cout << "[verbose] allocated memory for parsed JSON " << endl;
7984

8085
#if defined(DEBUG)
8186
const u32 iterations = 1;
8287
#else
83-
const u32 iterations = forceoneiteration ? 1 : ( p.second < 1 * 1000 * 1000? 1000 : 10);
88+
const u32 iterations = forceoneiteration ? 1 : ( p.size() < 1 * 1000 * 1000? 1000 : 10);
8489
#endif
8590
vector<double> res;
8691
res.resize(iterations);
@@ -113,7 +118,7 @@ int main(int argc, char *argv[]) {
113118
#ifndef SQUASH_COUNTERS
114119
unified.start();
115120
#endif
116-
isok = find_structural_bits(p.first, p.second, pj);
121+
isok = find_structural_bits(p.c_str(), p.size(), pj);
117122
#ifndef SQUASH_COUNTERS
118123
unified.end(results);
119124
cy1 += results[0];
@@ -127,7 +132,7 @@ int main(int argc, char *argv[]) {
127132
}
128133
unified.start();
129134
#endif
130-
isok = isok && flatten_indexes(p.second, pj);
135+
isok = isok && flatten_indexes(p.size(), pj);
131136
#ifndef SQUASH_COUNTERS
132137
unified.end(results);
133138
cy2 += results[0];
@@ -142,7 +147,7 @@ int main(int argc, char *argv[]) {
142147
unified.start();
143148
#endif
144149

145-
isok = isok && unified_machine(p.first, p.second, pj);
150+
isok = isok && unified_machine(p.c_str(), p.size(), pj);
146151
#ifndef SQUASH_COUNTERS
147152
unified.end(results);
148153
cy3 += results[0];
@@ -163,43 +168,40 @@ int main(int argc, char *argv[]) {
163168

164169
#ifndef SQUASH_COUNTERS
165170
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
166-
p.second, pj.n_structural_indexes,
167-
(double)pj.n_structural_indexes / p.second);
171+
p.size(), pj.n_structural_indexes,
172+
(double)pj.n_structural_indexes / p.size());
168173
unsigned long total = cy1 + cy2 + cy3;
169174

170175
printf(
171176
"stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: %.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: %10lu (failure %10lu)\n",
172177
cl1 / iterations, cy1 / iterations, 100. * cy1 / total, (double)cl1 / cy1, mis1/iterations, (double)cy1/mis1, cref1 / iterations, cmis1 / iterations);
173178
printf(" stage 1 runs at %.2f cycles per input byte.\n",
174-
(double)cy1 / (iterations * p.second));
179+
(double)cy1 / (iterations * p.size()));
175180

176181
printf(
177182
"stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: %.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: %10lu (failure %10lu)\n",
178183
cl2 / iterations, cy2 / iterations, 100. * cy2 / total, (double)cl2 / cy2, mis2/iterations, (double)cy2/mis2, cref2 /iterations, cmis2 / iterations);
179184
printf(" stage 2 runs at %.2f cycles per input byte and ",
180-
(double)cy2 / (iterations * p.second));
185+
(double)cy2 / (iterations * p.size()));
181186
printf("%.2f cycles per structural character.\n",
182187
(double)cy2 / (iterations * pj.n_structural_indexes));
183188

184189
printf(
185190
"stage 3 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: %.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: %10lu (failure %10lu)\n",
186191
cl3 / iterations, cy3 /iterations, 100. * cy3 / total, (double)cl3 / cy3, mis3/iterations, (double)cy3/mis3, cref3 / iterations, cmis3 / iterations);
187192
printf(" stage 3 runs at %.2f cycles per input byte and ",
188-
(double)cy3 / (iterations * p.second));
193+
(double)cy3 / (iterations * p.size()));
189194
printf("%.2f cycles per structural character.\n",
190195
(double)cy3 / (iterations * pj.n_structural_indexes));
191196

192197
printf(" all stages: %.2f cycles per input byte.\n",
193-
(double)total / (iterations * p.second));
198+
(double)total / (iterations * p.size()));
194199
#endif
195-
// colorfuldisplay(pj, p.first);
196200
double min_result = *min_element(res.begin(), res.end());
197-
cout << "Min: " << min_result << " bytes read: " << p.second
198-
<< " Gigabytes/second: " << (p.second) / (min_result * 1000000000.0)
201+
cout << "Min: " << min_result << " bytes read: " << p.size()
202+
<< " Gigabytes/second: " << (p.size()) / (min_result * 1000000000.0)
199203
<< "\n";
200-
if(dump) pj_ptr->printjson();
201-
free(p.first);
202-
deallocate_ParsedJson(pj_ptr);
204+
if(dump) pj.printjson();
203205
if (!isok) {
204206
printf(" Parsing failed. \n ");
205207
return EXIT_FAILURE;

benchmark/parsingcompetition.cpp

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
6161
if(optind + 1 < argc) {
6262
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
6363
}
64-
pair<u8 *, size_t> p;
64+
simdjsonstring p;
6565
try {
6666
p = get_corpus(filename);
6767
} catch (const std::exception& e) { // caught by reference to base
@@ -71,56 +71,54 @@ int main(int argc, char *argv[]) {
7171

7272
if (verbose) {
7373
std::cout << "Input has ";
74-
if (p.second > 1024 * 1024)
75-
std::cout << p.second / (1024 * 1024) << " MB ";
76-
else if (p.second > 1024)
77-
std::cout << p.second / 1024 << " KB ";
74+
if (p.size() > 1024 * 1024)
75+
std::cout << p.size() / (1024 * 1024) << " MB ";
76+
else if (p.size() > 1024)
77+
std::cout << p.size() / 1024 << " KB ";
7878
else
79-
std::cout << p.second << " B ";
79+
std::cout << p.size() << " B ";
8080
std::cout << std::endl;
8181
}
82-
ParsedJson *pj_ptr = allocate_ParsedJson(p.second, 1024);
83-
if (pj_ptr == NULL) {
82+
ParsedJson pj;
83+
bool allocok = pj.allocateCapacity(p.size(), 1024);
84+
85+
if (!allocok) {
8486
std::cerr << "can't allocate memory" << std::endl;
8587
return EXIT_FAILURE;
8688
}
87-
ParsedJson &pj(*pj_ptr);
88-
8989
int repeat = 10;
90-
int volume = p.second;
91-
BEST_TIME("simdjson", json_parse(p.first, p.second, pj), true, , repeat, volume, true);
90+
int volume = p.size();
91+
BEST_TIME("simdjson", json_parse(p, pj), true, , repeat, volume, true);
9292

9393
rapidjson::Document d;
9494

95-
char *buffer = (char *)malloc(p.second + 1);
96-
memcpy(buffer, p.first, p.second);
97-
buffer[p.second] = '\0';
95+
char *buffer = (char *)malloc(p.size() + 1);
96+
memcpy(buffer, p.c_str(), p.size());
97+
buffer[p.size()] = '\0';
9898

9999
BEST_TIME("RapidJSON",
100100
d.Parse<kParseValidateEncodingFlag>((const char *)buffer).HasParseError(),
101-
false, memcpy(buffer, p.first, p.second), repeat, volume, true);
101+
false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
102102
BEST_TIME("RapidJSON Insitu", d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(), false,
103-
memcpy(buffer, p.first, p.second), repeat, volume, true);
103+
memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
104104

105-
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
105+
BEST_TIME("sajson (dynamic mem)", sajson::parse(sajson::dynamic_allocation(), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
106106

107-
size_t astbuffersize = p.second;
107+
size_t astbuffersize = p.size();
108108
size_t * ast_buffer = (size_t *) malloc(astbuffersize * sizeof(size_t));
109109

110-
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.second, buffer)).is_valid(), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
110+
BEST_TIME("sajson (static alloc)", sajson::parse(sajson::bounded_allocation(ast_buffer, astbuffersize), sajson::mutable_string_view(p.size(), buffer)).is_valid(), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
111111
std::string json11err;
112-
if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.first, p.second), repeat, volume, true);
112+
if(all) BEST_TIME("dropbox (json11) ", (( json11::Json::parse(buffer,json11err).is_null() ) || ( ! json11err.empty() )), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
113113

114-
if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.first, p.second), repeat, volume, true);
114+
if(all) BEST_TIME("fastjson ", fastjson_parse(buffer), true, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
115115
JsonValue value;
116116
JsonAllocator allocator;
117117
char *endptr;
118-
if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.first, p.second), repeat, volume, true);
118+
if(all) BEST_TIME("gason ", jsonParse(buffer, &endptr, &value, allocator), JSON_OK, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
119119
void *state;
120-
if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.second, NULL, &state) == NULL), false, memcpy(buffer, p.first, p.second), repeat, volume, true);
121-
BEST_TIME("memcpy ", (memcpy(buffer, p.first, p.second) == buffer), true, , repeat, volume, true);
122-
free(p.first);
120+
if(all) BEST_TIME("ultrajson ", (UJDecode(buffer, p.size(), NULL, &state) == NULL), false, memcpy(buffer, p.c_str(), p.size()), repeat, volume, true);
121+
BEST_TIME("memcpy ", (memcpy(buffer, p.c_str(), p.size()) == buffer), true, , repeat, volume, true);
123122
free(ast_buffer);
124123
free(buffer);
125-
deallocate_ParsedJson(pj_ptr);
126124
}

include/simdjson/jsonioutil.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,17 @@
99

1010
#include "simdjson/common_defs.h"
1111

12+
13+
// low-level function
1214
// if you must provide a pointer to some data, create it with this function:
1315
// length is the max. size in bytes of the string
1416
// the caller is responsible for freeing the memory (free(...))
1517
char * allocate_aligned_buffer(size_t length);
1618

19+
20+
21+
22+
1723
// load a file in memory...
1824
// get a corpus; pad out to cache line so we can always use SIMD
1925
// throws exceptions in case of failure
@@ -27,6 +33,7 @@ char * allocate_aligned_buffer(size_t length);
2733
// } catch (const std::exception& e) {
2834
// std::cout << "Could not load the file " << filename << std::endl;
2935
// }
30-
std::pair<u8 *, size_t> get_corpus(std::string filename);
36+
std::string_view get_corpus(std::string filename);
37+
3138

3239
#endif

include/simdjson/jsonminifier.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,16 @@
44
#include <cstdint>
55

66
// Take input from buf and remove useless whitespace, write it to out; buf and
7-
// out can be the same pointer.
7+
// out can be the same pointer. The result is null-terminated;
8+
// returns the string length (excluding the null terminator).
89
size_t jsonminify(const uint8_t *buf, size_t len, uint8_t *out);
10+
11+
12+
static inline size_t jsonminify(const char *buf, size_t len, char *out) {
13+
return jsonminify((const uint8_t *)buf, len, (uint8_t *)out);
14+
}
15+
16+
17+
static inline size_t jsonminify(const simdjsonstring & p, char *out) {
18+
return jsonminify(p.c_str(), p.size(), out);
19+
}

0 commit comments

Comments
 (0)