# Patch summary (commentary placed before the first `diff --git` header).
#
# Purpose: let the fatal-mode UTF-8 TextDecoder path build its result string
# without running StringBytes::Encode(..., UTF8) on input that was already
# validated, and extend the TextDecoder benchmark to feed real content.
#
# benchmark/util/text-decoder.js
#   - Adds a `content` benchmark parameter ('ascii', 'one-byte-string',
#     'two-byte-string') and moves `type` ahead of `len` in the config.
#   - Adds buildContent(content, len): picks a base character ('a', '\xff' or
#     'ğ'), repeats it max(1, floor(len / UTF-8 byte length of base)) times and
#     returns Buffer.from() of the repeated string.
#   - main() now sizes the SharedArrayBuffer / ArrayBuffer from the seed buffer
#     and copies the seed in via Uint8Array.set(); the 'Buffer' case uses the
#     seed directly. These replace `new SharedArrayBuffer(len)`,
#     `new ArrayBuffer(len)` and `Buffer.allocUnsafe(len)`.
#   - NOTE(review): '\xff' is encoded by Buffer.from() as UTF-8 bytes C3 BF.
#     Byte 0xC3 looks unassigned in the iso-8859-3 index, so the
#     iso-8859-3 + fatal=1 + 'one-byte-string' combination may measure the
#     error path rather than a successful decode -- confirm against the
#     Encoding Standard and the rest of main(), which is not visible here.
#
# src/encoding_binding.cc (BindingData::DecodeUTF8)
#   - Removes the TODO about saving the UTF-8 validity recheck.
#   - When has_fatal is set, calls StringBytes::EncodeValidUtf8(); otherwise it
#     still calls StringBytes::Encode(..., UTF8).
#
# src/string_bytes.cc (new StringBytes::EncodeValidUtf8)
#   - Returns the empty string when buflen is 0.
#   - If simdutf::validate_ascii_with_errors() reports no error, returns
#     ExternOneByteString::NewFromCopy() of the bytes.
#   - Otherwise, when buflen >= 32, computes the UTF-16 length with
#     simdutf::utf16_length_from_utf8(), throws ERR_STRING_TOO_LONG if it
#     exceeds v8::String::kMaxLength, and fills a two-byte string through
#     simdutf::convert_valid_utf8_to_utf16(), CHECK-ing the written count.
#   - Otherwise falls back to String::NewFromUtf8(), throwing
#     ERR_STRING_TOO_LONG when that yields no string.
#
# src/string_bytes.h
#   - Declares EncodeValidUtf8() with the contract "Input must be valid UTF-8".
#
# NOTE(review): this copy of the patch has its line breaks collapsed and the
# C++ template arguments appear to be missing (e.g. `static_cast(`,
# `reinterpret_cast(`, `MaybeLocal StringBytes::...`) -- presumably an
# extraction artifact; re-fetch the original patch before applying it.
diff --git a/benchmark/util/text-decoder.js b/benchmark/util/text-decoder.js index 1aa60f2dd0bcd6..ecfba045c52fab 100644 --- a/benchmark/util/text-decoder.js +++ b/benchmark/util/text-decoder.js @@ -6,26 +6,42 @@ const bench = common.createBenchmark(main, { encoding: ['utf-8', 'windows-1252', 'iso-8859-3'], ignoreBOM: [0, 1], fatal: [0, 1], + type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'], + content: ['ascii', 'one-byte-string', 'two-byte-string'], len: [256, 1024 * 16, 1024 * 128], n: [1e3], - type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'], }); -function main({ encoding, len, n, ignoreBOM, type, fatal }) { +function buildContent(content, len) { + let base; + switch (content) { + case 'ascii': base = 'a'; break; + case 'one-byte-string': base = '\xff'; break; + case 'two-byte-string': base = 'ğ'; break; + } + const unitBytes = Buffer.byteLength(base, 'utf8'); + const copies = Math.max(1, Math.floor(len / unitBytes)); + return Buffer.from(base.repeat(copies)); +} + +function main({ encoding, len, n, ignoreBOM, type, fatal, content }) { const decoder = new TextDecoder(encoding, { ignoreBOM, fatal }); + const seed = buildContent(content, len); let buf; switch (type) { case 'SharedArrayBuffer': { - buf = new SharedArrayBuffer(len); + buf = new SharedArrayBuffer(seed.length); + new Uint8Array(buf).set(seed); break; } case 'ArrayBuffer': { - buf = new ArrayBuffer(len); + buf = new ArrayBuffer(seed.length); + new Uint8Array(buf).set(seed); break; } case 'Buffer': { - buf = Buffer.allocUnsafe(len); + buf = seed; break; } } diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc index c569375383e8d9..9c84d24c84576d 100644 --- a/src/encoding_binding.cc +++ b/src/encoding_binding.cc @@ -459,14 +459,15 @@ void BindingData::DecodeUTF8(const FunctionCallbackInfo& args) { return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA( env->isolate(), "The encoded data was not valid for encoding utf-8"); } - - // TODO(chalker): save on utf8 validity recheck in 
StringBytes::Encode() } if (length == 0) return args.GetReturnValue().SetEmptyString(); Local ret; - if (StringBytes::Encode(env->isolate(), data, length, UTF8).ToLocal(&ret)) { + v8::MaybeLocal encoded = + has_fatal ? StringBytes::EncodeValidUtf8(env->isolate(), data, length) + : StringBytes::Encode(env->isolate(), data, length, UTF8); + if (encoded.ToLocal(&ret)) { args.GetReturnValue().Set(ret); } } diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 865302bfd1b4de..1d4ee3a81803b2 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -671,6 +671,40 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, } } +MaybeLocal StringBytes::EncodeValidUtf8(Isolate* isolate, + const char* buf, + size_t buflen) { + CHECK_BUFLEN_IN_RANGE(buflen); + if (!buflen) return String::Empty(isolate); + buflen = keep_buflen_in_range(buflen); + + // ASCII fast path + if (!simdutf::validate_ascii_with_errors(buf, buflen).error) { + return ExternOneByteString::NewFromCopy(isolate, buf, buflen); + } + + if (buflen >= 32) { + size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen); + if (u16size > static_cast(v8::String::kMaxLength)) { + isolate->ThrowException(ERR_STRING_TOO_LONG(isolate)); + return MaybeLocal(); + } + return EncodeTwoByteString( + isolate, u16size, [buf, buflen, u16size](uint16_t* dst) { + size_t written = simdutf::convert_valid_utf8_to_utf16( + buf, buflen, reinterpret_cast(dst)); + CHECK_EQ(written, u16size); + }); + } + + Local str; + if (!String::NewFromUtf8(isolate, buf, v8::NewStringType::kNormal, buflen) + .ToLocal(&str)) { + isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate)); + } + return str; +} + MaybeLocal StringBytes::Encode(Isolate* isolate, const uint16_t* buf, size_t buflen) { diff --git a/src/string_bytes.h b/src/string_bytes.h index 9949f508f83ffe..71aa9ff1f90a7c 100644 --- a/src/string_bytes.h +++ b/src/string_bytes.h @@ -83,6 +83,11 @@ class StringBytes { size_t buflen, enum encoding encoding); + // Like Encode(..., UTF8) 
but does not re-validate. Input must be valid UTF-8. + static v8::MaybeLocal EncodeValidUtf8(v8::Isolate* isolate, + const char* buf, + size_t buflen); + // Warning: This reverses endianness on BE platforms, even though the // signature using uint16_t implies that it should not. // However, the brokenness is already public API and can't therefore