Skip to content

Commit e370a65

Browse files
authored
Fix for issues 32, 50, 131, 137
* Improving portability. * Revisiting faulty logic regarding same-page overruns. * Disabling same-page overruns under VS. * Clarifying the documentation * Fix for issue 131 + being more explicit regarding memory realloc. * Fix for issue 137. * removing "using namespace std" throughout. Fix for 50 * Introducing typed malloc/free. * Introducing a custom class (padded_string) that solves several minor usability issues. * Updating amalgamation for testing.
1 parent c5a3f9c commit e370a65

31 files changed

Lines changed: 1106 additions & 363 deletions

README.md

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi
6666
const char * filename = ... //
6767

6868
// use whatever means you want to get a string (UTF-8) of your JSON document
69-
std::string_view p = get_corpus(filename); // you are responsible for freeing p.data()
69+
padded_string p = get_corpus(filename);
7070
ParsedJson pj;
7171
pj.allocateCapacity(p.size()); // allocate memory for parsing up to p.size() bytes
7272
const int res = json_parse(p, pj); // do the parsing, return 0 on success
@@ -75,8 +75,6 @@ if (res != 0) {
7575
// You can use the "simdjson/simdjson.h" header to access the error message
7676
std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
7777
}
78-
// You can safely delete the string content
79-
aligned_free((void*)p.data());
8078
// the ParsedJson document can be used here
8179
// pj can be reused with other json_parse calls.
8280
```
@@ -90,21 +88,49 @@ of memory allocation with each new JSON document:
9088
/...
9189

9290
const char * filename = ... //
93-
std::string_view p = get_corpus(filename);
91+
padded_string p = get_corpus(filename);
9492
ParsedJson pj = build_parsed_json(p); // do the parsing
95-
// you no longer need p at this point, can do aligned_free((void*)p.data())
9693
if( ! pj.isValid() ) {
9794
// something went wrong
9895
}
99-
aligned_free((void*)p.data());
10096
```
10197

102-
You can call `json_parse` and `build_parsed_json`, passing a standard `std::string` object.
98+
Though the `padded_string` class is recommended for best performance, you can call `json_parse` and `build_parsed_json`, passing a standard `std::string` object.
10399

104100

105-
## Memory overallocation `
101+
```C
102+
#include "simdjson/jsonparser.h"
103+
104+
// ...
105+
std::string mystring = ... //
106+
ParsedJson pj;
107+
pj.allocateCapacity(mystring.size()); // allocate memory for parsing up to mystring.size() bytes
108+
// std::string may not overallocate so a copy will be needed
109+
const int res = json_parse(mystring, pj); // do the parsing, return 0 on success
110+
// parsing is done!
111+
if (res != 0) {
112+
// You can use the "simdjson/simdjson.h" header to access the error message
113+
std::cout << "Error parsing:" << simdjson::errorMsg(res) << std::endl;
114+
}
115+
// pj can be reused with other json_parse calls.
116+
```
117+
118+
or
119+
120+
```C
121+
#include "simdjson/jsonparser.h"
122+
123+
// ...
124+
125+
std::string mystring = ... //
126+
// std::string may not overallocate so a copy will be needed
127+
ParsedJson pj = build_parsed_json(mystring); // do the parsing
128+
if( ! pj.isValid() ) {
129+
// something went wrong
130+
}
131+
```
106132

107-
As needed, the `json_parse` and `build_parsed_json` functions copy the input data to a temporary buffer readable up to SIMDJSON_PADDING bytes beyond the end of the data. To avoid this potentially expensive copy, overallocate your own input data and then call the `json_parse` and `build_parsed_json` functions with an extra parameter value set to `false` (e.g., `build_parsed_json(p,false)` and `parsed_json(p,pj,false)`). In such instance, no temporary copy is made. The `get_corpus` function does this automatically as well as the provide `char * allocate_padded_buffer(size_t length)` function to achieve the desired effect.
133+
As needed, the `json_parse` and `build_parsed_json` functions copy the input data to a temporary buffer readable up to SIMDJSON_PADDING bytes beyond the end of the data.
108134

109135
## Usage: easy single-header version
110136

@@ -118,14 +144,13 @@ copy the files in your project in your include path. You can then include them q
118144
#include "simdjson.cpp"
119145
int main(int argc, char *argv[]) {
120146
const char * filename = argv[1];
121-
std::string_view p = get_corpus(filename);
147+
padded_string p = get_corpus(filename);
122148
ParsedJson pj = build_parsed_json(p); // do the parsing
123149
if( ! pj.isValid() ) {
124150
std::cout << "not valid" << std::endl;
125151
} else {
126152
std::cout << "valid" << std::endl;
127153
}
128-
aligned_free((void*)p.data());
129154
return EXIT_SUCCESS;
130155
}
131156
```

amalgamation.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ $SCRIPTPATH/include/simdjson/simdjson_version.h
2828
$SCRIPTPATH/include/simdjson/simdjson.h
2929
$SCRIPTPATH/include/simdjson/portability.h
3030
$SCRIPTPATH/include/simdjson/common_defs.h
31+
$SCRIPTPATH/include/simdjson/padded_string.h
3132
$SCRIPTPATH/include/simdjson/jsoncharutils.h
3233
$SCRIPTPATH/include/simdjson/jsonformatutils.h
3334
$SCRIPTPATH/include/simdjson/jsonioutil.h
@@ -100,7 +101,7 @@ cat <<< '
100101
#include "simdjson.cpp"
101102
int main(int argc, char *argv[]) {
102103
const char * filename = argv[1];
103-
std::string_view p = get_corpus(filename);
104+
padded_string p = get_corpus(filename);
104105
ParsedJson pj = build_parsed_json(p); // do the parsing
105106
if( ! pj.isValid() ) {
106107
std::cout << "not valid" << std::endl;

benchmark/distinctuseridcompetition.cpp

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,16 @@
1414
#include "sajson.h"
1515

1616
using namespace rapidjson;
17-
using namespace std;
1817

1918
bool equals(const char *s1, const char *s2) { return strcmp(s1, s2) == 0; }
2019

21-
void remove_duplicates(vector<int64_t> &v) {
20+
void remove_duplicates(std::vector<int64_t> &v) {
2221
std::sort(v.begin(), v.end());
2322
auto last = std::unique(v.begin(), v.end());
2423
v.erase(last, v.end());
2524
}
2625

27-
void print_vec(vector<int64_t> &v) {
26+
void print_vec(const std::vector<int64_t> &v) {
2827
for (auto i : v) {
2928
std::cout << i << " ";
3029
}
@@ -73,7 +72,7 @@ void simdjson_traverse(std::vector<int64_t> &answer, ParsedJson::iterator &i) {
7372
}
7473
}
7574

76-
std::vector<int64_t> simdjson_computestats(const std::string_view &p) {
75+
std::vector<int64_t> simdjson_computestats(const padded_string &p) {
7776
std::vector<int64_t> answer;
7877
ParsedJson pj = build_parsed_json(p);
7978
if (!pj.isValid()) {
@@ -134,7 +133,7 @@ void sajson_traverse(std::vector<int64_t> &answer, const sajson::value &node) {
134133
}
135134
}
136135

137-
std::vector<int64_t> sasjon_computestats(const std::string_view &p) {
136+
std::vector<int64_t> sasjon_computestats(const padded_string &p) {
138137
std::vector<int64_t> answer;
139138
char *buffer = (char *)malloc(p.size());
140139
memcpy(buffer, p.data(), p.size());
@@ -187,7 +186,7 @@ void rapid_traverse(std::vector<int64_t> &answer, const rapidjson::Value &v) {
187186
}
188187
}
189188

190-
std::vector<int64_t> rapid_computestats(const std::string_view &p) {
189+
std::vector<int64_t> rapid_computestats(const padded_string &p) {
191190
std::vector<int64_t> answer;
192191
char *buffer = (char *)malloc(p.size() + 1);
193192
memcpy(buffer, p.data(), p.size());
@@ -220,19 +219,19 @@ int main(int argc, char *argv[]) {
220219
abort();
221220
}
222221
if (optind >= argc) {
223-
cerr << "Using different parsers, we compute the content statistics of "
224-
"JSON documents.\n";
225-
cerr << "Usage: " << argv[0] << " <jsonfile>\n";
226-
cerr << "Or " << argv[0] << " -v <jsonfile>\n";
222+
std::cerr << "Using different parsers, we compute the content statistics of "
223+
"JSON documents." << std::endl;
224+
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
225+
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
227226
exit(1);
228227
}
229228
const char *filename = argv[optind];
230229
if (optind + 1 < argc) {
231-
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
230+
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
232231
}
233-
std::string_view p;
232+
padded_string p;
234233
try {
235-
p = get_corpus(filename);
234+
get_corpus(filename).swap(p);
236235
} catch (const std::exception &e) { // caught by reference to base
237236
std::cout << "Could not load the file " << filename << std::endl;
238237
return EXIT_FAILURE;
@@ -279,5 +278,4 @@ int main(int argc, char *argv[]) {
279278
!justdata);
280279
BEST_TIME("sasjon ", sasjon_computestats(p).size(), size, , repeat, volume,
281280
!justdata);
282-
aligned_free((void*)p.data());
283281
}

benchmark/minifiercompetition.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717

1818
using namespace rapidjson;
19-
using namespace std;
2019

2120
std::string rapidstringmeInsitu(char *json) {
2221
Document d;
@@ -62,13 +61,13 @@ int main(int argc, char *argv[]) {
6261
abort ();
6362
}
6463
if (optind >= argc) {
65-
cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
64+
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
6665
exit(1);
6766
}
6867
const char * filename = argv[optind];
69-
std::string_view p;
68+
padded_string p;
7069
try {
71-
p = get_corpus(filename);
70+
get_corpus(filename).swap(p);
7271
} catch (const std::exception& e) { // caught by reference to base
7372
std::cout << "Could not load the file " << filename << std::endl;
7473
return EXIT_FAILURE;
@@ -140,17 +139,17 @@ int main(int argc, char *argv[]) {
140139
fprintf(stderr, "failed to allocate memory\n");
141140
return EXIT_FAILURE;
142141
}
143-
BEST_TIME("simdjson orig", json_parse((const uint8_t*)buffer, p.size(), pj), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
142+
bool automated_reallocation = false;
143+
BEST_TIME("simdjson orig", json_parse((const uint8_t*)buffer, p.size(), pj, automated_reallocation), true, memcpy(buffer, p.data(), p.size()), repeat, volume, !justdata);
144144

145145
ParsedJson pj2;
146146
bool isallocok2 = pj2.allocateCapacity(p.size(), 1024);
147147
if(!isallocok2) {
148148
fprintf(stderr, "failed to allocate memory\n");
149149
return EXIT_FAILURE;
150150
}
151-
152-
BEST_TIME("simdjson despaced", json_parse((const uint8_t*)buffer, minisize, pj2), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
153-
aligned_free((void*)p.data());
151+
automated_reallocation = false;
152+
BEST_TIME("simdjson despaced", json_parse((const uint8_t*)buffer, minisize, pj2, automated_reallocation), true, memcpy(buffer, minibuffer, p.size()), repeat, volume, !justdata);
154153
free(buffer);
155154
free(ast_buffer);
156155
free(minibuffer);

benchmark/parse.cpp

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "simdjson/parsedjson.h"
3434
#include "simdjson/stage1_find_marks.h"
3535
#include "simdjson/stage2_build_tape.h"
36-
using namespace std;
3736

3837
int main(int argc, char *argv[]) {
3938
bool verbose = false;
@@ -69,34 +68,34 @@ int main(int argc, char *argv[]) {
6968
int optind = 1;
7069
#endif
7170
if (optind >= argc) {
72-
cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
71+
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
7372
exit(1);
7473
}
7574
const char *filename = argv[optind];
7675
if (optind + 1 < argc) {
77-
cerr << "warning: ignoring everything after " << argv[optind + 1] << endl;
76+
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
7877
}
7978
if (verbose) {
80-
cout << "[verbose] loading " << filename << endl;
81-
}
82-
std::string_view p;
79+
std::cout << "[verbose] loading " << filename << std::endl;
80+
}
81+
padded_string p;
8382
try {
84-
p = get_corpus(filename);
83+
get_corpus(filename).swap(p);
8584
} catch (const std::exception &e) { // caught by reference to base
8685
std::cout << "Could not load the file " << filename << std::endl;
8786
return EXIT_FAILURE;
8887
}
8988
if (verbose) {
90-
cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
91-
<< endl;
89+
std::cout << "[verbose] loaded " << filename << " (" << p.size() << " bytes)"
90+
<< std::endl;
9291
}
9392
#if defined(DEBUG)
9493
const uint32_t iterations = 1;
9594
#else
9695
const uint32_t iterations =
9796
forceoneiteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
9897
#endif
99-
vector<double> res;
98+
std::vector<double> res;
10099
res.resize(iterations);
101100

102101
#if !defined(__linux__)
@@ -107,14 +106,14 @@ int main(int argc, char *argv[]) {
107106
#endif
108107

109108
#ifndef SQUASH_COUNTERS
110-
vector<int> evts;
109+
std::vector<int> evts;
111110
evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
112111
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
113112
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
114113
evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
115114
evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
116115
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
117-
vector<unsigned long long> results;
116+
std::vector<unsigned long long> results;
118117
results.resize(evts.size());
119118
unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
120119
unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
@@ -126,16 +125,15 @@ int main(int argc, char *argv[]) {
126125

127126
for (uint32_t i = 0; i < iterations; i++) {
128127
if (verbose) {
129-
cout << "[verbose] iteration # " << i << endl;
130-
}
128+
std::cout << "[verbose] iteration # " << i << std::endl;
129+
}
131130
#ifndef SQUASH_COUNTERS
132131
unified.start();
133132
#endif
134133
ParsedJson pj;
135134
bool allocok = pj.allocateCapacity(p.size());
136135
if (!allocok) {
137136
std::cerr << "failed to allocate memory" << std::endl;
138-
aligned_free((void *)p.data());
139137
return EXIT_FAILURE;
140138
}
141139
#ifndef SQUASH_COUNTERS
@@ -147,7 +145,7 @@ int main(int argc, char *argv[]) {
147145
cmis0 += results[4];
148146
#endif
149147
if (verbose) {
150-
cout << "[verbose] allocated memory for parsed JSON " << endl;
148+
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
151149
}
152150

153151
auto start = std::chrono::steady_clock::now();
@@ -163,7 +161,7 @@ int main(int argc, char *argv[]) {
163161
cref1 += results[3];
164162
cmis1 += results[4];
165163
if (!isok) {
166-
cout << "Failed during stage 1\n";
164+
std::cout << "Failed during stage 1" << std::endl;
167165
break;
168166
}
169167
unified.start();
@@ -178,7 +176,7 @@ int main(int argc, char *argv[]) {
178176
cref2 += results[3];
179177
cmis2 += results[4];
180178
if (!isok) {
181-
cout << "Failed during stage 2\n";
179+
std::cout << "Failed during stage 2" << std::endl;
182180
break;
183181
}
184182
#endif
@@ -190,7 +188,6 @@ int main(int argc, char *argv[]) {
190188
ParsedJson pj = build_parsed_json(p); // do the parsing again to get the stats
191189
if (!pj.isValid()) {
192190
std::cerr << "Could not parse. " << std::endl;
193-
aligned_free((void *)p.data());
194191
return EXIT_FAILURE;
195192
}
196193
#ifndef SQUASH_COUNTERS
@@ -202,7 +199,6 @@ int main(int argc, char *argv[]) {
202199
float cpbtotal = (double)total / (iterations * p.size());
203200
char *newfile = (char *)malloc(strlen(filename) + 1);
204201
if (newfile == NULL) {
205-
aligned_free((void *)p.data());
206202
return EXIT_FAILURE;
207203
}
208204
::strcpy(newfile, filename);
@@ -255,17 +251,16 @@ int main(int argc, char *argv[]) {
255251
#endif
256252
double min_result = *min_element(res.begin(), res.end());
257253
if (!justdata) {
258-
cout << "Min: " << min_result << " bytes read: " << p.size()
254+
std::cout << "Min: " << min_result << " bytes read: " << p.size()
259255
<< " Gigabytes/second: " << (p.size()) / (min_result * 1000000000.0)
260-
<< "\n";
256+
<< std::endl;
261257
}
262258
if (jsonoutput) {
263259
isok = isok && pj.printjson(std::cout);
264260
}
265261
if (dump) {
266262
isok = isok && pj.dump_raw_tape(std::cout);
267263
}
268-
aligned_free((void *)p.data());
269264
if (!isok) {
270265
fprintf(stderr, " Parsing failed. \n ");
271266
return EXIT_FAILURE;

0 commit comments

Comments
 (0)