Skip to content

Commit 42a8b40

Browse files
committed
Reamalgamate
1 parent b4b968f commit 42a8b40

File tree

3 files changed

+264
-58
lines changed

3 files changed

+264
-58
lines changed

singleheader/amalgamate_demo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on Sun Jun 21 11:49:12 PDT 2020. Do not edit! */
1+
/* auto-generated on Tue Jun 23 09:15:19 PDT 2020. Do not edit! */
22

33
#include <iostream>
44
#include "simdjson.h"

singleheader/simdjson.cpp

Lines changed: 176 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on Sun Jun 21 11:49:12 PDT 2020. Do not edit! */
1+
/* auto-generated on Tue Jun 23 09:15:19 PDT 2020. Do not edit! */
22
/* begin file src/simdjson.cpp */
33
#include "simdjson.h"
44

@@ -371,6 +371,7 @@ class implementation final : public simdjson::implementation {
371371
std::unique_ptr<internal::dom_parser_implementation>& dst
372372
) const noexcept final;
373373
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
374+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
374375
};
375376

376377
} // namespace haswell
@@ -402,6 +403,7 @@ class implementation final : public simdjson::implementation {
402403
std::unique_ptr<internal::dom_parser_implementation>& dst
403404
) const noexcept final;
404405
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
406+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
405407
};
406408

407409
} // namespace westmere
@@ -433,6 +435,7 @@ class implementation final : public simdjson::implementation {
433435
std::unique_ptr<internal::dom_parser_implementation>& dst
434436
) const noexcept final;
435437
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
438+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
436439
};
437440

438441
} // namespace arm64
@@ -468,6 +471,7 @@ class implementation final : public simdjson::implementation {
468471
std::unique_ptr<internal::dom_parser_implementation>& dst
469472
) const noexcept final;
470473
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
474+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
471475
};
472476

473477
} // namespace fallback
@@ -500,7 +504,9 @@ class detect_best_supported_implementation_on_first_use final : public implement
500504
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
501505
return set_best()->minify(buf, len, dst, dst_len);
502506
}
503-
507+
// Validates UTF-8 by delegating to the implementation returned by set_best().
// NOTE(review): set_best() is only declared in this view; going by the class
// description it presumably detects and installs the best supported implementation.
// @param buf bytes to validate
// @param len number of bytes in buf
// @return whatever the selected implementation's validate_utf8() returns
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
508+
return set_best()->validate_utf8(buf, len);
509+
}
504510
really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
505511
private:
506512
const implementation *set_best() const noexcept;
@@ -535,10 +541,19 @@ class unsupported_implementation final : public implementation {
535541
) const noexcept final {
536542
return UNSUPPORTED_ARCHITECTURE;
537543
}
538-
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
544+
// Minification stub for the unsupported-CPU implementation: every argument is
// ignored (the parameters are unnamed) and the call always reports
// UNSUPPORTED_ARCHITECTURE.
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
539545
return UNSUPPORTED_ARCHITECTURE;
540546
}
541-
547+
// UTF-8 validation stub for the unsupported-CPU implementation: both arguments
// are ignored (the parameters are unnamed) and the result is always false, so
// every input, including valid UTF-8, is reported as invalid.
WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
548+
return false; // Just refuse to validate. Given that we have a fallback implementation
549+
// it seems unlikely that unsupported_implementation will ever be used. If it is used,
550+
// then it will flag all strings as invalid. The alternative is to return an error_code
551+
// from which the user has to figure out whether the string is valid UTF-8... which seems
552+
// like a lot of work just to handle the very unlikely case that we have an unsupported
553+
// implementation. And, when it does happen (that we have an unsupported implementation),
554+
// what are the chances that the programmer has a fallback? Given that *we* provide the
555+
// fallback, it implies that the programmer would need a fallback for our fallback.
556+
}
542557
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
543558
};
544559

@@ -589,6 +604,9 @@ SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_imple
589604
WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
590605
return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len);
591606
}
607+
// Library-level entry point for UTF-8 validation: forwards the buffer to the
// validate_utf8() member of active_implementation and returns its result.
// @param buf bytes to validate
// @param len number of bytes in buf
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept {
608+
return active_implementation->validate_utf8(buf, len);
609+
}
592610

593611

594612
} // namespace simdjson
@@ -3757,7 +3775,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
37573775
this->len = _len;
37583776
return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
37593777
}
3778+
/* begin file src/generic/stage1/utf8_validator.h */
3779+
namespace stage1 {
3780+
/**
3781+
* Validates that the string is actual UTF-8.
*
* The input is read through a buf_block_reader<64>; each block is wrapped in a
* simd::simd8x64<uint8_t> and handed to checker::check_next_input(). The verdict
* is whatever checker::errors() reports once every block has been fed.
*
* @tparam checker validator type; it is value-initialized here and must provide
*         check_next_input(simd::simd8x64<uint8_t>) and errors().
* @param input  bytes to validate
* @param length number of bytes in input
* @return true when checker::errors() equals error_code::SUCCESS
3782+
*/
3783+
template<class checker>
3784+
bool generic_validate_utf8(const uint8_t * input, size_t length) {
3785+
checker c{};
3786+
buf_block_reader<64> reader(input, length);
3787+
// Feed every block the reader reports as full straight from the input buffer.
while (reader.has_full_block()) {
3788+
simd::simd8x64<uint8_t> in(reader.full_block());
3789+
c.check_next_input(in);
3790+
reader.advance();
3791+
}
3792+
// Scratch block for the tail; `{}` zero-initializes all 64 bytes, so any byte
// get_remainder() leaves untouched is 0x00.
// NOTE(review): presumably get_remainder() copies only the leftover input bytes - confirm.
uint8_t block[64]{};
3793+
reader.get_remainder(block);
3794+
simd::simd8x64<uint8_t> in(block);
3795+
// The tail block is always checked, even when the input length is a multiple of 64.
c.check_next_input(in);
3796+
reader.advance();
3797+
return c.errors() == error_code::SUCCESS;
3798+
}
3799+
3800+
bool generic_validate_utf8(const char * input, size_t length) {
3801+
return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length);
3802+
}
37603803

3804+
} // namespace stage1
3805+
/* end file src/generic/stage1/utf8_validator.h */
3806+
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
3807+
return simdjson::arm64::stage1::generic_validate_utf8(buf,len);
3808+
}
37613809
} // namespace arm64
37623810
} // namespace simdjson
37633811

@@ -5798,6 +5846,70 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
57985846
return SUCCESS;
57995847
}
58005848

5849+
// credit: based on code from Google Fuchsia (Apache Licensed)
5850+
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
5851+
const uint8_t *data = (const uint8_t *)buf;
5852+
uint64_t pos = 0;
5853+
uint64_t next_pos = 0;
5854+
uint32_t code_point = 0;
5855+
while (pos < len) {
5856+
// check of the next 8 bytes are ascii.
5857+
next_pos = pos + 16;
5858+
if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
5859+
uint64_t v1;
5860+
memcpy(&v1, data + pos, sizeof(uint64_t));
5861+
uint64_t v2;
5862+
memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
5863+
uint64_t v{v1 | v2};
5864+
if ((v & 0x8080808080808080) == 0) {
5865+
pos = next_pos;
5866+
continue;
5867+
}
5868+
}
5869+
unsigned char byte = data[pos];
5870+
if (byte < 0b10000000) {
5871+
pos++;
5872+
continue;
5873+
} else if ((byte & 0b11100000) == 0b11000000) {
5874+
next_pos = pos + 2;
5875+
if (next_pos > len) { return false; }
5876+
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5877+
// range check
5878+
code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
5879+
if (code_point < 0x80 || 0x7ff < code_point) { return false; }
5880+
} else if ((byte & 0b11110000) == 0b11100000) {
5881+
next_pos = pos + 3;
5882+
if (next_pos > len) { return false; }
5883+
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5884+
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
5885+
// range check
5886+
code_point = (byte & 0b00001111) << 12 |
5887+
(data[pos + 1] & 0b00111111) << 6 |
5888+
(data[pos + 2] & 0b00111111);
5889+
if (code_point < 0x800 || 0xffff < code_point ||
5890+
(0xd7ff < code_point && code_point < 0xe000)) {
5891+
return false;
5892+
}
5893+
} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
5894+
next_pos = pos + 4;
5895+
if (next_pos > len) { return false; }
5896+
if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
5897+
if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
5898+
if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
5899+
// range check
5900+
code_point =
5901+
(byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
5902+
(data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
5903+
if (code_point < 0xffff || 0x10ffff < code_point) { return false; }
5904+
} else {
5905+
// we may have a continuation
5906+
return false;
5907+
}
5908+
pos = next_pos;
5909+
}
5910+
return true;
5911+
}
5912+
58015913
} // namespace fallback
58025914
} // namespace simdjson
58035915

@@ -9121,7 +9233,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
91219233
this->len = _len;
91229234
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
91239235
}
9236+
/* begin file src/generic/stage1/utf8_validator.h */
9237+
namespace stage1 {
9238+
/**
9239+
* Validates that the string is actual UTF-8.
*
* The input is read through a buf_block_reader<64>; each block is wrapped in a
* simd::simd8x64<uint8_t> and handed to checker::check_next_input(). The verdict
* is whatever checker::errors() reports once every block has been fed.
*
* @tparam checker validator type; it is value-initialized here and must provide
*         check_next_input(simd::simd8x64<uint8_t>) and errors().
* @param input  bytes to validate
* @param length number of bytes in input
* @return true when checker::errors() equals error_code::SUCCESS
9240+
*/
9241+
template<class checker>
9242+
bool generic_validate_utf8(const uint8_t * input, size_t length) {
9243+
checker c{};
9244+
buf_block_reader<64> reader(input, length);
9245+
// Feed every block the reader reports as full straight from the input buffer.
while (reader.has_full_block()) {
9246+
simd::simd8x64<uint8_t> in(reader.full_block());
9247+
c.check_next_input(in);
9248+
reader.advance();
9249+
}
9250+
// Scratch block for the tail; `{}` zero-initializes all 64 bytes, so any byte
// get_remainder() leaves untouched is 0x00.
// NOTE(review): presumably get_remainder() copies only the leftover input bytes - confirm.
uint8_t block[64]{};
9251+
reader.get_remainder(block);
9252+
simd::simd8x64<uint8_t> in(block);
9253+
// The tail block is always checked, even when the input length is a multiple of 64.
c.check_next_input(in);
9254+
reader.advance();
9255+
return c.errors() == error_code::SUCCESS;
9256+
}
9257+
9258+
bool generic_validate_utf8(const char * input, size_t length) {
9259+
return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length);
9260+
}
91249261

9262+
} // namespace stage1
9263+
/* end file src/generic/stage1/utf8_validator.h */
9264+
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
9265+
return simdjson::haswell::stage1::generic_validate_utf8(buf,len);
9266+
}
91259267
} // namespace haswell
91269268
} // namespace simdjson
91279269
UNTARGET_REGION
@@ -12368,7 +12510,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
1236812510
this->len = _len;
1236912511
return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
1237012512
}
12513+
/* begin file src/generic/stage1/utf8_validator.h */
12514+
namespace stage1 {
12515+
/**
12516+
* Validates that the string is actual UTF-8.
*
* The input is read through a buf_block_reader<64>; each block is wrapped in a
* simd::simd8x64<uint8_t> and handed to checker::check_next_input(). The verdict
* is whatever checker::errors() reports once every block has been fed.
*
* @tparam checker validator type; it is value-initialized here and must provide
*         check_next_input(simd::simd8x64<uint8_t>) and errors().
* @param input  bytes to validate
* @param length number of bytes in input
* @return true when checker::errors() equals error_code::SUCCESS
12517+
*/
12518+
template<class checker>
12519+
bool generic_validate_utf8(const uint8_t * input, size_t length) {
12520+
checker c{};
12521+
buf_block_reader<64> reader(input, length);
12522+
// Feed every block the reader reports as full straight from the input buffer.
while (reader.has_full_block()) {
12523+
simd::simd8x64<uint8_t> in(reader.full_block());
12524+
c.check_next_input(in);
12525+
reader.advance();
12526+
}
12527+
// Scratch block for the tail; `{}` zero-initializes all 64 bytes, so any byte
// get_remainder() leaves untouched is 0x00.
// NOTE(review): presumably get_remainder() copies only the leftover input bytes - confirm.
uint8_t block[64]{};
12528+
reader.get_remainder(block);
12529+
simd::simd8x64<uint8_t> in(block);
12530+
// The tail block is always checked, even when the input length is a multiple of 64.
c.check_next_input(in);
12531+
reader.advance();
12532+
return c.errors() == error_code::SUCCESS;
12533+
}
12534+
12535+
bool generic_validate_utf8(const char * input, size_t length) {
12536+
return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length);
12537+
}
1237112538

12539+
} // namespace stage1
12540+
/* end file src/generic/stage1/utf8_validator.h */
12541+
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
12542+
return simdjson::westmere::stage1::generic_validate_utf8(buf,len);
12543+
}
1237212544
} // namespace westmere
1237312545
} // namespace simdjson
1237412546
UNTARGET_REGION

0 commit comments

Comments
 (0)