Skip to content

Commit 0da4c67

Browse files
felixge authored and tjfontaine committed
string_bytes: Guarantee valid utf-8 output
Previously, V8's WriteUtf8 function would produce invalid UTF-8 output when it encountered unmatched surrogate code units [1]. The new REPLACE_INVALID_UTF8 option fixes that by replacing invalid code points with the Unicode replacement character (U+FFFD).
[1]: JS strings are defined as arrays of 16-bit unsigned integers. There is no Unicode enforcement, so one can easily end up with invalid Unicode code unit sequences inside a string.
1 parent 881ac26 commit 0da4c67

4 files changed

Lines changed: 22 additions & 1 deletion

File tree

src/node.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ static uv_async_t dispatch_debug_messages_async;
176176
// Declared in node_internals.h
177177
Isolate* node_isolate = NULL;
178178

179+
int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED |
180+
v8::String::NO_NULL_TERMINATION;
179181

180182
static void Spin(uv_idle_t* handle, int status) {
181183
assert((uv_idle_t*) handle == &tick_spinner);
@@ -3042,6 +3044,11 @@ static char **copy_argv(int argc, char **argv) {
30423044
}
30433045

30443046
int Start(int argc, char *argv[]) {
3047+
const char* replaceInvalid = getenv("NODE_INVALID_UTF8");
3048+
3049+
if (replaceInvalid == NULL)
3050+
WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8;
3051+
30453052
// Hack aroung with the argv pointer. Used for process.title = "blah".
30463053
argv = uv_setup_args(argc, argv);
30473054

src/string_bytes.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ size_t StringBytes::Write(char* buf,
199199
break;
200200

201201
case UTF8:
202-
len = str->WriteUtf8(buf, buflen, chars_written, flags);
202+
len = str->WriteUtf8(buf, buflen, chars_written, WRITE_UTF8_FLAGS);
203203
break;
204204

205205
case UCS2:

src/string_bytes.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
namespace node {
3131

32+
extern int WRITE_UTF8_FLAGS;
33+
3234
using v8::Handle;
3335
using v8::Local;
3436
using v8::String;

test/simple/test-buffer.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,18 @@ assert.equal(buf[3], 0xFF);
791791
assert.equal(buf[3], 0xFF);
792792
});
793793

794+
// test unmatched surrogates not producing invalid utf8 output
795+
// ef bf bd = utf-8 representation of unicode replacement character
796+
// see https://codereview.chromium.org/121173009/
797+
buf = new Buffer('ab\ud800cd', 'utf8');
798+
assert.equal(buf[0], 0x61);
799+
assert.equal(buf[1], 0x62);
800+
assert.equal(buf[2], 0xef);
801+
assert.equal(buf[3], 0xbf);
802+
assert.equal(buf[4], 0xbd);
803+
assert.equal(buf[5], 0x63);
804+
assert.equal(buf[6], 0x64);
805+
794806
// test for buffer overrun
795807
buf = new Buffer([0, 0, 0, 0, 0]); // length: 5
796808
var sub = buf.slice(0, 4); // length: 4

0 commit comments

Comments
 (0)