fixup! buffer: add buffer.isUtf8 for utf8 validation

nodejs · nodejs-github-bot · Dec 25, 2022 · Dec 22, 2022 · Dec 23, 2022 · Dec 23, 2022
commit e940f594f7550aa1bf6f50916e84f427497704d5
diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js
@@ -11,18 +11,52 @@ assert.strictEqual(isUtf8(encoder.encode('hello')), true);
 assert.strictEqual(isUtf8(encoder.encode('ğ')), true);
 assert.strictEqual(isUtf8(Buffer.from([])), true);
 
-// Invalid UTF-8
-assert.strictEqual(isUtf8(Buffer.from([0xf8])), false);
+// Invalid UTF-8 sequences taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
+[
+  [0xFF], // 'invalid code'
+  [0xC0], // 'ends early'
+  [0xE0], // 'ends early 2'
+  [0xC0, 0x00], // 'invalid trail'
+  [0xC0, 0xC0], // 'invalid trail 2'
+  [0xE0, 0x00], // 'invalid trail 3'
+  [0xE0, 0xC0], // 'invalid trail 4'
+  [0xE0, 0x80, 0x00], // 'invalid trail 5'
+  [0xE0, 0x80, 0xC0], // 'invalid trail 6'
+  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
+  [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'
+
+  // Overlong encodings
+  [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
+  [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
+  [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'
+
+  [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
+  [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
+  [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'
 
-// CESU-8
-assert.strictEqual(isUtf8(encoder.encode('\u0045\u0205\u10400')), true);
-assert.strictEqual(isUtf8(encoder.encode('aé日')), true);
+  [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
+  [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'
 
-// Two byte overlong encoding
-assert.strictEqual(isUtf8(encoder.encode('\u0000')), true);
+  [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
+  [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'
 
-// WTF-8
-assert.strictEqual(isUtf8(encoder.encode('\uD800\uDFFF')), true);
+  [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
+  [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'
+
+  // UTF-16 surrogates encoded as code points in UTF-8
+  [0xED, 0xA0, 0x80], // 'lead surrogate'
+  [0xED, 0xB0, 0x80], // 'trail surrogate'
+  [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
+].forEach((input) => {
+  assert.strictEqual(isUtf8(Buffer.from(input)), false);
+});
 
 [
   null,