|
1 | | -/* auto-generated on Sun Jun 21 11:49:12 PDT 2020. Do not edit! */ |
| 1 | +/* auto-generated on Tue Jun 23 09:15:19 PDT 2020. Do not edit! */ |
2 | 2 | /* begin file src/simdjson.cpp */ |
3 | 3 | #include "simdjson.h" |
4 | 4 |
|
@@ -371,6 +371,7 @@ class implementation final : public simdjson::implementation { |
371 | 371 | std::unique_ptr<internal::dom_parser_implementation>& dst |
372 | 372 | ) const noexcept final; |
373 | 373 | WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; |
| 374 | + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; |
374 | 375 | }; |
375 | 376 |
|
376 | 377 | } // namespace haswell |
@@ -402,6 +403,7 @@ class implementation final : public simdjson::implementation { |
402 | 403 | std::unique_ptr<internal::dom_parser_implementation>& dst |
403 | 404 | ) const noexcept final; |
404 | 405 | WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; |
| 406 | + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; |
405 | 407 | }; |
406 | 408 |
|
407 | 409 | } // namespace westmere |
@@ -433,6 +435,7 @@ class implementation final : public simdjson::implementation { |
433 | 435 | std::unique_ptr<internal::dom_parser_implementation>& dst |
434 | 436 | ) const noexcept final; |
435 | 437 | WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; |
| 438 | + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; |
436 | 439 | }; |
437 | 440 |
|
438 | 441 | } // namespace arm64 |
@@ -468,6 +471,7 @@ class implementation final : public simdjson::implementation { |
468 | 471 | std::unique_ptr<internal::dom_parser_implementation>& dst |
469 | 472 | ) const noexcept final; |
470 | 473 | WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; |
| 474 | + WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final; |
471 | 475 | }; |
472 | 476 |
|
473 | 477 | } // namespace fallback |
@@ -500,7 +504,9 @@ class detect_best_supported_implementation_on_first_use final : public implement |
500 | 504 | WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { /* forwards the call, arguments unchanged, to the implementation returned by set_best() */ |
501 | 505 | return set_best()->minify(buf, len, dst, dst_len); |
502 | 506 | } |
503 | | - |
| 507 | + WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override { /* forwards the call, arguments unchanged, to the implementation returned by set_best() */ |
| 508 | + return set_best()->validate_utf8(buf, len); |
| 509 | + } |
504 | 510 | /* Constructor: hands a fixed name, a fixed description and a trailing 0 to the base implementation constructor (meaning of the 0 is not visible here - confirm against the base class). */ really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} |
505 | 511 | private: |
506 | 512 | const implementation *set_best() const noexcept; |
@@ -535,10 +541,19 @@ class unsupported_implementation final : public implementation { |
535 | 541 | ) const noexcept final { |
536 | 542 | return UNSUPPORTED_ARCHITECTURE; |
537 | 543 | } |
538 | | - WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final { |
| 544 | + WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override { /* ignores every argument and always reports UNSUPPORTED_ARCHITECTURE */ |
539 | 545 | return UNSUPPORTED_ARCHITECTURE; |
540 | 546 | } |
541 | | - |
| 547 | + WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override { |
| 548 | + return false; // Always false, whatever the input: we just refuse to validate. Given that we have a fallback implementation |
| 549 | + // it seems unlikely that unsupported_implementation will ever be used. If it is used, |
| 550 | + // then it will flag all strings as invalid. The alternative is to return an error_code |
| 551 | + // from which the user has to figure out whether the string is valid UTF-8... which seems |
| 552 | + // like a lot of work just to handle the very unlikely case that we have an unsupported |
| 553 | + // implementation. And, when it does happen (that we have an unsupported implementation), |
| 554 | + // what are the chances that the programmer has a fallback? Given that *we* provide the |
| 555 | + // fallback, it implies that the programmer would need a fallback for our fallback. |
| 556 | + } |
542 | 557 | /* Constructor: hands a fixed name, a fixed description and a trailing 0 to the base implementation constructor (meaning of the 0 is not visible here - confirm against the base class). */ unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} |
543 | 558 | }; |
544 | 559 |
|
@@ -589,6 +604,9 @@ SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_imple |
589 | 604 | WARN_UNUSED error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept { /* char-based entry point: casts both buffers to uint8_t pointers and dispatches through active_implementation */ |
590 | 605 | return active_implementation->minify((const uint8_t *)buf, len, (uint8_t *)dst, dst_len); |
591 | 606 | } |
| 607 | +WARN_UNUSED bool validate_utf8(const char *buf, size_t len) noexcept { /* free-function entry point: dispatches through active_implementation and returns its verdict unchanged */ |
| 608 | + return active_implementation->validate_utf8(buf, len); |
| 609 | +} |
592 | 610 |
|
593 | 611 |
|
594 | 612 | } // namespace simdjson |
@@ -3757,7 +3775,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si |
3757 | 3775 | this->len = _len; |
3758 | 3776 | return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); |
3759 | 3777 | } |
| 3778 | +/* begin file src/generic/stage1/utf8_validator.h */ |
| 3779 | +namespace stage1 { |
| 3780 | +/** |
| 3781 | + * Validates that the string is actual UTF-8: every full 64-byte block of the input is fed to the checker, then one more block obtained from reader.get_remainder(); the result is true when the checker reports error_code::SUCCESS. |
| 3782 | + */ |
| 3783 | +template<class checker> |
| 3784 | +bool generic_validate_utf8(const uint8_t * input, size_t length) { |
| 3785 | + checker c{}; |
| 3786 | + buf_block_reader<64> reader(input, length); |
| 3787 | + while (reader.has_full_block()) { /* consume every complete 64-byte block */ |
| 3788 | + simd::simd8x64<uint8_t> in(reader.full_block()); |
| 3789 | + c.check_next_input(in); |
| 3790 | + reader.advance(); |
| 3791 | + } |
| 3792 | + uint8_t block[64]{}; /* scratch block, zeroed here; get_remainder() then fills it (presumably with the trailing partial block - confirm in buf_block_reader) */ |
| 3793 | + reader.get_remainder(block); |
| 3794 | + simd::simd8x64<uint8_t> in(block); |
| 3795 | + c.check_next_input(in); |
| 3796 | + reader.advance(); |
| 3797 | + return c.errors() == error_code::SUCCESS; /* valid only if the checker recorded no error */ |
| 3798 | +} |
| 3799 | + |
| 3800 | +bool generic_validate_utf8(const char * input, size_t length) { /* char* convenience overload: runs the template overload with utf8_checker as the checker */ |
| 3801 | + return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length); |
| 3802 | +} |
3760 | 3803 |
|
| 3804 | +} // namespace stage1 |
| 3805 | +/* end file src/generic/stage1/utf8_validator.h */ |
| 3806 | +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { /* arm64 entry point: delegates to simdjson::arm64::stage1::generic_validate_utf8 */ |
| 3807 | + return simdjson::arm64::stage1::generic_validate_utf8(buf,len); |
| 3808 | +} |
3761 | 3809 | } // namespace arm64 |
3762 | 3810 | } // namespace simdjson |
3763 | 3811 |
|
@@ -5798,6 +5846,70 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui |
5798 | 5846 | return SUCCESS; |
5799 | 5847 | } |
5800 | 5848 |
|
| 5849 | +// Scalar UTF-8 validator: returns true only if buf[0..len) is well-formed UTF-8; rejects truncated sequences, stray continuation bytes, overlong encodings, surrogates (U+D800..U+DFFF) and code points above U+10FFFF. credit: based on code from Google Fuchsia (Apache Licensed) |
| 5850 | +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { |
| 5851 | + const uint8_t *data = (const uint8_t *)buf; |
| 5852 | + uint64_t pos = 0; |
| 5853 | + uint64_t next_pos = 0; |
| 5854 | + uint32_t code_point = 0; |
| 5855 | + while (pos < len) { |
| 5856 | + // fast path: check if the next 16 bytes are all ascii. |
| 5857 | + next_pos = pos + 16; |
| 5858 | + if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii |
| 5859 | + uint64_t v1; |
| 5860 | + memcpy(&v1, data + pos, sizeof(uint64_t)); |
| 5861 | + uint64_t v2; |
| 5862 | + memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); |
| 5863 | + uint64_t v{v1 | v2}; |
| 5864 | + if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set: all 16 are ascii |
| 5865 | + pos = next_pos; |
| 5866 | + continue; |
| 5867 | + } |
| 5868 | + } |
| 5869 | + unsigned char byte = data[pos]; |
| 5870 | + if (byte < 0b10000000) { // single ascii byte |
| 5871 | + pos++; |
| 5872 | + continue; |
| 5873 | + } else if ((byte & 0b11100000) == 0b11000000) { // two-byte sequence: 110xxxxx 10xxxxxx |
| 5874 | + next_pos = pos + 2; |
| 5875 | + if (next_pos > len) { return false; } |
| 5876 | + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } |
| 5877 | + // range check: must encode U+0080..U+07FF (smaller values are overlong) |
| 5878 | + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); |
| 5879 | + if (code_point < 0x80 || 0x7ff < code_point) { return false; } |
| 5880 | + } else if ((byte & 0b11110000) == 0b11100000) { // three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx |
| 5881 | + next_pos = pos + 3; |
| 5882 | + if (next_pos > len) { return false; } |
| 5883 | + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } |
| 5884 | + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } |
| 5885 | + // range check: must encode U+0800..U+FFFF and must not be a surrogate |
| 5886 | + code_point = (byte & 0b00001111) << 12 | |
| 5887 | + (data[pos + 1] & 0b00111111) << 6 | |
| 5888 | + (data[pos + 2] & 0b00111111); |
| 5889 | + if (code_point < 0x800 || 0xffff < code_point || |
| 5890 | + (0xd7ff < code_point && code_point < 0xe000)) { |
| 5891 | + return false; |
| 5892 | + } |
| 5893 | + } else if ((byte & 0b11111000) == 0b11110000) { // four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 5894 | + next_pos = pos + 4; |
| 5895 | + if (next_pos > len) { return false; } |
| 5896 | + if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; } |
| 5897 | + if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; } |
| 5898 | + if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; } |
| 5899 | + // range check: must encode U+10000..U+10FFFF; anything <= U+FFFF (including U+FFFF itself) is overlong |
| 5900 | + code_point = |
| 5901 | + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | |
| 5902 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); |
| 5903 | + if (code_point <= 0xffff || 0x10ffff < code_point) { return false; } |
| 5904 | + } else { |
| 5905 | + // stray continuation byte (10xxxxxx) or invalid lead byte (11111xxx) |
| 5906 | + return false; |
| 5907 | + } |
| 5908 | + pos = next_pos; |
| 5909 | + } |
| 5910 | + return true; |
| 5911 | +} |
| 5912 | + |
5801 | 5913 | } // namespace fallback |
5802 | 5914 | } // namespace simdjson |
5803 | 5915 |
|
@@ -9121,7 +9233,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si |
9121 | 9233 | this->len = _len; |
9122 | 9234 | return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); |
9123 | 9235 | } |
| 9236 | +/* begin file src/generic/stage1/utf8_validator.h */ |
| 9237 | +namespace stage1 { |
| 9238 | +/** |
| 9239 | + * Validates that the string is actual UTF-8: every full 64-byte block of the input is fed to the checker, then one more block obtained from reader.get_remainder(); the result is true when the checker reports error_code::SUCCESS. |
| 9240 | + */ |
| 9241 | +template<class checker> |
| 9242 | +bool generic_validate_utf8(const uint8_t * input, size_t length) { |
| 9243 | + checker c{}; |
| 9244 | + buf_block_reader<64> reader(input, length); |
| 9245 | + while (reader.has_full_block()) { /* consume every complete 64-byte block */ |
| 9246 | + simd::simd8x64<uint8_t> in(reader.full_block()); |
| 9247 | + c.check_next_input(in); |
| 9248 | + reader.advance(); |
| 9249 | + } |
| 9250 | + uint8_t block[64]{}; /* scratch block, zeroed here; get_remainder() then fills it (presumably with the trailing partial block - confirm in buf_block_reader) */ |
| 9251 | + reader.get_remainder(block); |
| 9252 | + simd::simd8x64<uint8_t> in(block); |
| 9253 | + c.check_next_input(in); |
| 9254 | + reader.advance(); |
| 9255 | + return c.errors() == error_code::SUCCESS; /* valid only if the checker recorded no error */ |
| 9256 | +} |
| 9257 | + |
| 9258 | +bool generic_validate_utf8(const char * input, size_t length) { /* char* convenience overload: runs the template overload with utf8_checker as the checker */ |
| 9259 | + return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length); |
| 9260 | +} |
9124 | 9261 |
|
| 9262 | +} // namespace stage1 |
| 9263 | +/* end file src/generic/stage1/utf8_validator.h */ |
| 9264 | +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { /* haswell entry point: delegates to simdjson::haswell::stage1::generic_validate_utf8 */ |
| 9265 | + return simdjson::haswell::stage1::generic_validate_utf8(buf,len); |
| 9266 | +} |
9125 | 9267 | } // namespace haswell |
9126 | 9268 | } // namespace simdjson |
9127 | 9269 | UNTARGET_REGION |
@@ -12368,7 +12510,37 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si |
12368 | 12510 | this->len = _len; |
12369 | 12511 | return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); |
12370 | 12512 | } |
| 12513 | +/* begin file src/generic/stage1/utf8_validator.h */ |
| 12514 | +namespace stage1 { |
| 12515 | +/** |
| 12516 | + * Validates that the string is actual UTF-8: every full 64-byte block of the input is fed to the checker, then one more block obtained from reader.get_remainder(); the result is true when the checker reports error_code::SUCCESS. |
| 12517 | + */ |
| 12518 | +template<class checker> |
| 12519 | +bool generic_validate_utf8(const uint8_t * input, size_t length) { |
| 12520 | + checker c{}; |
| 12521 | + buf_block_reader<64> reader(input, length); |
| 12522 | + while (reader.has_full_block()) { /* consume every complete 64-byte block */ |
| 12523 | + simd::simd8x64<uint8_t> in(reader.full_block()); |
| 12524 | + c.check_next_input(in); |
| 12525 | + reader.advance(); |
| 12526 | + } |
| 12527 | + uint8_t block[64]{}; /* scratch block, zeroed here; get_remainder() then fills it (presumably with the trailing partial block - confirm in buf_block_reader) */ |
| 12528 | + reader.get_remainder(block); |
| 12529 | + simd::simd8x64<uint8_t> in(block); |
| 12530 | + c.check_next_input(in); |
| 12531 | + reader.advance(); |
| 12532 | + return c.errors() == error_code::SUCCESS; /* valid only if the checker recorded no error */ |
| 12533 | +} |
| 12534 | + |
| 12535 | +bool generic_validate_utf8(const char * input, size_t length) { /* char* convenience overload: runs the template overload with utf8_checker as the checker */ |
| 12536 | + return generic_validate_utf8<utf8_checker>((const uint8_t *)input,length); |
| 12537 | +} |
12371 | 12538 |
|
| 12539 | +} // namespace stage1 |
| 12540 | +/* end file src/generic/stage1/utf8_validator.h */ |
| 12541 | +WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { /* westmere entry point: delegates to simdjson::westmere::stage1::generic_validate_utf8 */ |
| 12542 | + return simdjson::westmere::stage1::generic_validate_utf8(buf,len); |
| 12543 | +} |
12372 | 12544 | } // namespace westmere |
12373 | 12545 | } // namespace simdjson |
12374 | 12546 | UNTARGET_REGION |
|
0 commit comments