-
-
Notifications
You must be signed in to change notification settings - Fork 35.4k
string_decoder: fix bad utf8 character handling #7310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
This commit fixes an issue when extra utf8 continuation bytes appear at the end of a chunk of data, causing miscalculations to be made when checking how many bytes are needed to decode a complete character. Fixes: #7308 PR-URL: #7310 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,8 +25,10 @@ function StringDecoder(encoding) { | |
| case 'utf16le': | ||
| this.text = utf16Text; | ||
| this.end = utf16End; | ||
| // fall through | ||
| nb = 4; | ||
| break; | ||
| case 'utf8': | ||
| this.fillLast = utf8FillLast; | ||
| nb = 4; | ||
| break; | ||
| case 'base64': | ||
|
|
@@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End; | |
| // Returns only complete characters in a Buffer | ||
| StringDecoder.prototype.text = utf8Text; | ||
|
|
||
| // Attempts to complete a partial character using bytes from a Buffer | ||
| // Attempts to complete a partial non-UTF-8 character using bytes from a Buffer | ||
| StringDecoder.prototype.fillLast = function(buf) { | ||
| if (this.lastNeed <= buf.length) { | ||
| buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); | ||
|
|
@@ -92,38 +94,83 @@ function utf8CheckByte(byte) { | |
| return -1; | ||
| } | ||
|
|
||
| // Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8 | ||
| // character, returning the total number of bytes needed to complete the partial | ||
| // character (if applicable). | ||
| // Checks at most 3 bytes at the end of a Buffer in order to detect an | ||
| // incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) | ||
| // needed to complete the UTF-8 character (if applicable) are returned. | ||
| function utf8CheckIncomplete(self, buf, i) { | ||
| var j = buf.length - 1; | ||
| if (j < i) | ||
| return 0; | ||
| var nb = utf8CheckByte(buf[j--]); | ||
| var nb = utf8CheckByte(buf[j]); | ||
| if (nb >= 0) { | ||
| if (nb > 0) | ||
| self.lastNeed = nb + 1 - (buf.length - j); | ||
| self.lastNeed = nb - 1; | ||
| return nb; | ||
| } | ||
| if (j < i) | ||
| if (--j < i) | ||
| return 0; | ||
| nb = utf8CheckByte(buf[j--]); | ||
| nb = utf8CheckByte(buf[j]); | ||
| if (nb >= 0) { | ||
| if (nb > 0) | ||
| self.lastNeed = nb + 1 - (buf.length - j); | ||
| self.lastNeed = nb - 2; | ||
| return nb; | ||
| } | ||
| if (j < i) | ||
| if (--j < i) | ||
| return 0; | ||
| nb = utf8CheckByte(buf[j--]); | ||
| nb = utf8CheckByte(buf[j]); | ||
| if (nb >= 0) { | ||
| if (nb > 0) | ||
| self.lastNeed = nb + 1 - (buf.length - j); | ||
| if (nb > 0) { | ||
| if (nb === 2) | ||
| nb = 0; | ||
| else | ||
| self.lastNeed = nb - 3; | ||
| } | ||
| return nb; | ||
| } | ||
| return 0; | ||
| } | ||
|
|
||
| // Validates as many continuation bytes for a multi-byte UTF-8 character as | ||
| // needed or are available. If we see a non-continuation byte where we expect | ||
| // one, we "replace" the validated continuation bytes we've seen so far with | ||
| // UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding | ||
| // behavior. The continuation byte check is included three times in the case | ||
| // where all of the continuation bytes for a character exist in the same buffer. | ||
| // It is also done this way as a slight performance increase instead of using a | ||
| // loop. | ||
| function utf8CheckExtraBytes(self, buf, p) { | ||
| if ((buf[0] & 0xC0) !== 0x80) { | ||
| self.lastNeed = 0; | ||
| return '\ufffd'.repeat(p); | ||
| } | ||
| if (self.lastNeed > 1 && buf.length > 1) { | ||
| if ((buf[1] & 0xC0) !== 0x80) { | ||
| self.lastNeed = 1; | ||
| return '\ufffd'.repeat(p + 1); | ||
| } | ||
| if (self.lastNeed > 2 && buf.length > 2) { | ||
| if ((buf[2] & 0xC0) !== 0x80) { | ||
| self.lastNeed = 2; | ||
| return '\ufffd'.repeat(p + 2); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the reason to repeat it two more times?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because of the two preceding (valid) continuation bytes.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, but it is still not obvious to me. |
||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. | ||
| function utf8FillLast(buf) { | ||
| const p = this.lastTotal - this.lastNeed; | ||
| var r = utf8CheckExtraBytes(this, buf, p); | ||
| if (r !== undefined) | ||
| return r; | ||
| if (this.lastNeed <= buf.length) { | ||
| buf.copy(this.lastChar, p, 0, this.lastNeed); | ||
| return this.lastChar.toString(this.encoding, 0, this.lastTotal); | ||
| } | ||
| buf.copy(this.lastChar, p, 0, buf.length); | ||
| this.lastNeed -= buf.length; | ||
| } | ||
|
|
||
| // Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a | ||
| // partial character, the character's bytes are buffered until the required | ||
| // number of bytes are available. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')), | |
| assert.strictEqual(decoder.end(), ''); | ||
|
|
||
| decoder = new StringDecoder('utf8'); | ||
| assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd'); | ||
| assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd'); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this change? And if needed, maybe leave the old test too?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's just making capitalization consistent with all of the other hex buffers.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd be inclined to leave it just to make sure that lower-case is tested. (I'm sure that under the hood, While I have an opinion, I don't feel strongly enough about this to protest or anything. If you do feel strongly, feel free to leave it as you have it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand. The hex string is being passed to the Buffer creation function (
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mscdex Argh! You're right, of course. Ignore me. |
||
| assert.strictEqual(decoder.end(), '\ufffd'); | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couldn't it still go negative here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, because if execution has reached here,
nbwould have to be a 2, 3, or 4-byte character indicator. Soself.lastNeedwould range from 0 to 2.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh right,
nbcan't be 1.