diff --git a/benchmark/fs/readfile-utf8-fastpath.js b/benchmark/fs/readfile-utf8-fastpath.js
new file mode 100644
index 00000000000000..9bf00717c5f0b2
--- /dev/null
+++ b/benchmark/fs/readfile-utf8-fastpath.js
@@ -0,0 +1,74 @@
+'use strict';
+
+const common = require('../common.js');
+const fs = require('fs');
+const path = require('path');
+const tmpdir = require('../../test/common/tmpdir');
+
+const bench = common.createBenchmark(main, {
+  size: [64, 1024, 16384, 262144, 4194304],
+  content: ['ascii', 'latin1', 'utf8_mixed'],
+  source: ['path', 'fd'],
+  n: [3e3],
+});
+
+// Returns a `size`-byte Buffer for the requested content kind. When `size` is
+// not a multiple of the repeating unit, the tail is left zero-filled.
+function buildContent(kind, size) {
+  if (kind === 'ascii') {
+    return Buffer.alloc(size, 0x61); // 'a'
+  }
+  if (kind === 'latin1') {
+    // 'é' in UTF-8 is 0xC3 0xA9 (2 bytes per char)
+    const pair = Buffer.from([0xC3, 0xA9]);
+    const buf = Buffer.alloc(size);
+    for (let i = 0; i + 2 <= size; i += 2) pair.copy(buf, i);
+    return buf;
+  }
+  if (kind === 'utf8_mixed') {
+    // mixed ASCII + 3-byte CJK (U+4E2D 中 = E4 B8 AD)
+    const cjk = Buffer.from([0xE4, 0xB8, 0xAD]);
+    const buf = Buffer.alloc(size);
+    let i = 0;
+    while (i + 4 <= size) {
+      buf[i++] = 0x61;
+      cjk.copy(buf, i);
+      i += 3;
+    }
+    return buf;
+  }
+  throw new Error(`unknown content: ${kind}`);
+}
+
+// Writes the fixture file, then times `n` utf8 readFileSync() calls on it,
+// addressed either by path or by file descriptor.
+function main({ n, size, content, source }) {
+  tmpdir.refresh();
+  const file = path.join(tmpdir.path, `bench-${content}-${size}.bin`);
+  fs.writeFileSync(file, buildContent(content, size));
+
+  if (source === 'fd') {
+    // readFileSync(fd) starts at the descriptor's current position, so one
+    // descriptor shared by all iterations sits at EOF after the first read and
+    // every later iteration would only measure the empty-file path. Open a
+    // fresh descriptor per iteration so each read returns the whole file
+    // (the JS-side open/close cost is part of this variant's timing).
+    bench.start();
+    for (let i = 0; i < n; i++) {
+      const fd = fs.openSync(file, 'r');
+      try {
+        fs.readFileSync(fd, 'utf8');
+      } finally {
+        fs.closeSync(fd);
+      }
+    }
+    bench.end(n);
+    return;
+  }
+
+  bench.start();
+  for (let i = 0; i < n; i++) {
+    fs.readFileSync(file, 'utf8');
+  }
+  bench.end(n);
+}
diff --git a/src/node_file.cc b/src/node_file.cc
index d93f213202ec43..2974a2031fe1ed 100644
--- a/src/node_file.cc
+++ b/src/node_file.cc
@@ -38,6 +38,7 @@
 #include "tracing/trace_event.h"
 
 #include "req_wrap-inl.h"
+#include "simdutf.h"
 #include "stream_base-inl.h"
 #include "string_bytes.h"
 #include "uv.h"
@@ -2938,12 +2939,88 @@ static void ReadFileUtf8(const FunctionCallbackInfo<Value>& args) {
   }
   FS_SYNC_TRACE_END(read);
 
-  Local<Value> val;
-  if (!ToV8Value(env->context(), result, isolate).ToLocal(&val)) {
+  const char* data = result.data();
+  const size_t total = result.size();
+
+  if (total == 0) {
+    args.GetReturnValue().Set(String::Empty(isolate));
+    return;
+  }
+  if (total > static_cast<size_t>(v8::String::kMaxLength)) {
+    isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
+    return;
+  }
+
+  // ASCII: skip V8 UTF-8 validation.
+  if (!simdutf::validate_ascii_with_errors(data, total).error) [[likely]] {
+    Local<String> str;
+    if (!String::NewFromOneByte(isolate,
+                                reinterpret_cast<const uint8_t*>(data),
+                                v8::NewStringType::kNormal,
+                                static_cast<int>(total))
+             .ToLocal(&str)) {
+      isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
+      return;
+    }
+    args.GetReturnValue().Set(str);
+    return;
+  }
+
+  // Latin1-fits: one-byte V8 string, half the heap of UTF-16.
+  MaybeStackBuffer<char> latin1;
+  latin1.AllocateSufficientStorage(total);
+  simdutf::result l1 =
+      simdutf::convert_utf8_to_latin1_with_errors(data, total, latin1.out());
+  if (!l1.error) {
+    Local<String> str;
+    if (!String::NewFromOneByte(isolate,
+                                reinterpret_cast<const uint8_t*>(latin1.out()),
+                                v8::NewStringType::kNormal,
+                                static_cast<int>(l1.count))
+             .ToLocal(&str)) {
+      isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
+      return;
+    }
+    args.GetReturnValue().Set(str);
     return;
   }
 
-  args.GetReturnValue().Set(val);
+  // Multibyte UTF-8: simdutf to UTF-16 in one pass.
+  // TOO_LARGE from the Latin-1 attempt is simdutf's code for a code point
+  // that does not fit the target; any other error goes to the fallback below.
+  if (l1.error == simdutf::error_code::TOO_LARGE) {
+    MaybeStackBuffer<uint16_t> u16;
+    u16.AllocateSufficientStorage(total);
+    simdutf::result r = simdutf::convert_utf8_to_utf16_with_errors(
+        data, total, reinterpret_cast<char16_t*>(u16.out()));
+    if (!r.error) {
+      if (r.count > static_cast<size_t>(v8::String::kMaxLength)) {
+        isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
+        return;
+      }
+      Local<String> str;
+      if (!String::NewFromTwoByte(isolate,
+                                  u16.out(),
+                                  v8::NewStringType::kNormal,
+                                  static_cast<int>(r.count))
+               .ToLocal(&str)) {
+        isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
+        return;
+      }
+      args.GetReturnValue().Set(str);
+      return;
+    }
+  }
+
+  // Invalid UTF-8: fall back to V8 for replacement-char (U+FFFD) semantics.
+  {
+    Local<Value> val;
+    if (!ToV8Value(env->context(), result, isolate).ToLocal(&val))
+        [[unlikely]] {
+      return;
+    }
+    args.GetReturnValue().Set(val);
+  }
 }
 
 // Wrapper for readv(2).
diff --git a/test/parallel/test-fs-readfile-utf8-fast-path.js b/test/parallel/test-fs-readfile-utf8-fast-path.js
new file mode 100644
index 00000000000000..18d0d884dfa455
--- /dev/null
+++ b/test/parallel/test-fs-readfile-utf8-fast-path.js
@@ -0,0 +1,105 @@
+'use strict';
+
+require('../common');
+const fs = require('node:fs');
+const path = require('node:path');
+const assert = require('node:assert');
+const { describe, it } = require('node:test');
+const tmpdir = require('../common/tmpdir');
+
+tmpdir.refresh();
+
+function writeFile(name, buf) {
+  const p = path.join(tmpdir.path, name);
+  fs.writeFileSync(p, buf);
+  return p;
+}
+
+function expectMatches(filePath, rawBuf) {
+  assert.strictEqual(
+    fs.readFileSync(filePath, 'utf8'),
+    rawBuf.toString('utf8'),
+  );
+}
+
+describe('fs.readFileSync utf8 simdutf dispatch', () => {
+  it('empty file', () => {
+    const p = writeFile('empty.txt', Buffer.alloc(0));
+    assert.strictEqual(fs.readFileSync(p, 'utf8'), '');
+  });
+
+  it('ascii small', () => {
+    const buf = Buffer.from('hello');
+    expectMatches(writeFile('tiny-ascii.txt', buf), buf);
+  });
+
+  it('ascii 20KB', () => {
+    const buf = Buffer.alloc(20 * 1024, 0x41);
+    expectMatches(writeFile('medium-ascii.txt', buf), buf);
+  });
+
+  it('ascii 1MB', () => {
+    const buf = Buffer.alloc(1024 * 1024, 0x61);
+    expectMatches(writeFile('large-ascii.txt', buf), buf);
+  });
+
+  it('fd input', () => {
+    const buf = Buffer.alloc(50 * 1024, 0x62);
+    const p = writeFile('fd-ascii.txt', buf);
+    const fd = fs.openSync(p, 'r');
+    try {
+      assert.strictEqual(fs.readFileSync(fd, 'utf8'), buf.toString('utf8'));
+    } finally {
+      fs.closeSync(fd);
+    }
+  });
+
+  it('multibyte UTF-8', () => {
+    const buf = Buffer.from('中文测试 — café — 🚀'.repeat(500), 'utf8');
+    expectMatches(writeFile('multibyte.txt', buf), buf);
+  });
+
+  it('latin1-fits utf8', () => {
+    // Every code point must be <= U+00FF so the one-byte (Latin-1) branch is
+    // the one under test; e.g. an em dash (U+2014) would divert to UTF-16.
+    const buf = Buffer.from('naïve café résumé - niño Köln '.repeat(500), 'utf8');
+    expectMatches(writeFile('latin1-fits.txt', buf), buf);
+  });
+
+  it('invalid: lone continuation byte', () => {
+    const buf = Buffer.from([0x68, 0x69, 0x80, 0x21]);
+    expectMatches(writeFile('invalid-cont.txt', buf), buf);
+  });
+
+  it('invalid: overlong', () => {
+    const buf = Buffer.from([0x41, 0xC0, 0xAF, 0x42]);
+    expectMatches(writeFile('invalid-overlong.txt', buf), buf);
+  });
+
+  it('invalid: surrogate', () => {
+    const buf = Buffer.from([0x41, 0xED, 0xA0, 0x80, 0x42]);
+    expectMatches(writeFile('invalid-surrogate.txt', buf), buf);
+  });
+
+  it('latin1 boundary U+00FF', () => {
+    const buf = Buffer.from('ÿ'.repeat(2048), 'utf8');
+    expectMatches(writeFile('latin1-boundary.txt', buf), buf);
+  });
+
+  it('above latin1 U+0100', () => {
+    const buf = Buffer.from('ĀāĂ'.repeat(1024), 'utf8');
+    expectMatches(writeFile('above-latin1.txt', buf), buf);
+  });
+
+  it('single codepoint each UTF-8 length', () => {
+    for (const cp of [0x41, 0x00E9, 0x4E2D, 0x1F600]) {
+      const buf = Buffer.from(String.fromCodePoint(cp), 'utf8');
+      expectMatches(writeFile(`single-cp-${cp.toString(16)}.txt`, buf), buf);
+    }
+  });
+
+  it('truncated multibyte at EOF', () => {
+    const buf = Buffer.from([0x41, 0xE4, 0xB8]);
+    expectMatches(writeFile('truncated-multibyte.txt', buf), buf);
+  });
+});