Skip to content

Commit 19f0867

Browse files
committed
[MERGE chakra-core#2396 @boingoing] Some string APIs operating on UTF-8 buffers do not property compute the length of the UTF-8 buffer in bytes
Merge pull request chakra-core#2396 from boingoing:UnifiedRegexDecodeUnicodeSequences Many of our string APIs which take LPCUTF8 buffers do not know how many bytes are in those buffers. Typically, these APIs need to walk over the characters in the decoded string so they take in parameters for the number of decoded characters. But the number of decoded characters is not necessarily the same as the number of bytes. This is the case when there are Unicode sequences encoded into the UTF-8 string. As a consequence of this missing info, APIs such as utf8::CharsAreEqual made the assumption that the UTF-8 buffer always contains at least 4 bytes - the maximum number of bytes in a Unicode sequence - without knowing if that was actually true or not. It turned out that this was not always true. This change computes the correct byte length for the APIs which previously made assumptions about the number of bytes in the buffer. Fixes: https://microsoft.visualstudio.com/OS/_workitems?id=10440534&_a=edit
2 parents 8161f77 + d2b1cc2 commit 19f0867

25 files changed

Lines changed: 258 additions & 140 deletions

bin/NativeTests/FileLoadHelpers.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ HRESULT FileLoadHelpers::LoadScriptFromFile(LPCSTR filename, LPCWSTR& contents,
99
{
1010
HRESULT hr = S_OK;
1111
LPCWSTR contentsRaw = nullptr;
12-
byte * pRawBytes = nullptr;
12+
LPCUTF8 pRawBytes = nullptr;
1313
UINT lengthBytes = 0;
1414
bool isUtf8 = false;
1515
contents = nullptr;
@@ -119,7 +119,7 @@ HRESULT FileLoadHelpers::LoadScriptFromFile(LPCSTR filename, LPCWSTR& contents,
119119
IfFailGo(E_OUTOFMEMORY);
120120
}
121121

122-
utf8::DecodeIntoAndNullTerminate((char16*) contents, pRawBytes, cUtf16Chars, decodeOptions);
122+
utf8::DecodeUnitsIntoAndNullTerminate((char16*)contents, pRawBytes, pRawBytes + lengthBytes, decodeOptions);
123123
}
124124

125125
Error:

lib/Common/Codex/Utf8Codex.cpp

Lines changed: 15 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -376,42 +376,9 @@ namespace utf8
376376
else
377377
return ptr;
378378
}
379-
380-
void DecodeInto(__out_ecount_full(cch) char16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options)
381-
{
382-
DecodeOptions localOptions = options;
383-
384-
if (!ShouldFastPath(ptr, buffer)) goto LSlowPath;
385-
386-
LFastPath:
387-
while (cch >= 4)
388-
{
389-
uint32 bytes = *(uint32 *)ptr;
390-
if ((bytes & 0x80808080) != 0) goto LSlowPath;
391-
((uint32 *)buffer)[0] = (bytes & 0x7F) | ((bytes << 8) & 0x7F0000);
392-
((uint32 *)buffer)[1] = ((bytes >> 16) & 0x7F) | ((bytes >> 8) & 0x7F0000);
393-
ptr += 4;
394-
buffer += 4;
395-
cch -= 4;
396-
}
397-
LSlowPath:
398-
while (cch-- > 0)
399-
{
400-
LPCUTF8 end = ptr + cch + 1; // WARNING: Assume cch correct, suppress end-of-buffer checking
401-
402-
*buffer++ = Decode(ptr, end, localOptions);
403-
if (ShouldFastPath(ptr, buffer)) goto LFastPath;
404-
}
405-
}
406-
407-
void DecodeIntoAndNullTerminate(__out_ecount(cch+1) __nullterminated char16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options)
408-
{
409-
DecodeInto(buffer, ptr, cch, options);
410-
buffer[cch] = 0;
411-
}
412-
413-
_Ret_range_(0, pbEnd - _Old_(pbUtf8))
414-
size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
379+
380+
_Use_decl_annotations_
381+
size_t DecodeUnitsInto(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
415382
{
416383
DecodeOptions localOptions = options;
417384

@@ -456,22 +423,29 @@ namespace utf8
456423
return dest - buffer;
457424
}
458425

459-
size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
426+
_Use_decl_annotations_
427+
size_t DecodeUnitsIntoAndNullTerminate(char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
460428
{
461429
size_t result = DecodeUnitsInto(buffer, pbUtf8, pbEnd, options);
462430
buffer[(int)result] = 0;
463431
return result;
464432
}
465433

466-
bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, size_t cch, DecodeOptions options)
434+
_Use_decl_annotations_
435+
size_t DecodeUnitsIntoAndNullTerminateNoAdvance(char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options)
436+
{
437+
return DecodeUnitsIntoAndNullTerminate(buffer, pbUtf8, pbEnd, options);
438+
}
439+
440+
bool CharsAreEqual(LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, DecodeOptions options)
467441
{
468442
DecodeOptions localOptions = options;
469-
while (cch-- > 0)
443+
while (bch < end)
470444
{
471-
LPCUTF8 end = bch + cch + 1; // WARNING: Assume cch correct, suppress end-of-buffer checking
472-
473445
if (*pch++ != utf8::Decode(bch, end, localOptions))
446+
{
474447
return false;
448+
}
475449
}
476450
return true;
477451
}

lib/Common/Codex/Utf8Codex.h

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ typedef char16_t char16;
2626

2727
typedef char16 wchar;
2828

29+
#ifndef Unused
30+
#define Unused(var) var
31+
#endif
2932

3033
#ifndef _WIN32
3134
// Templates are defined here in order to avoid a dependency on C++
@@ -270,27 +273,15 @@ namespace utf8
270273
return PrevCharFull(ptr, start);
271274
}
272275

273-
// Decode a UTF-8 sequence of cch UTF-16 characters into buffer. ptr could advance up to 3 times
274-
// longer than cch so DecodeInto should only be used when it is already known that
275-
// ptr refers to at least cch number of UTF-8 sequences.
276-
void DecodeInto(__out_ecount_full(cch) char16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options = doDefault);
277-
278-
// Provided for dual-mode templates
279-
inline void DecodeInto(__out_ecount_full(cch) char16 *buffer, const char16 *ptr, size_t cch, DecodeOptions /* options */ = doDefault)
280-
{
281-
memcpy_s(buffer, cch * sizeof(char16), ptr, cch * sizeof(char16));
282-
}
283-
284-
// Like DecodeInto but ensures buffer ends with a NULL at buffer[cch].
285-
void DecodeIntoAndNullTerminate(__out_ecount(cch+1) __nullterminated char16 *buffer, LPCUTF8 ptr, size_t cch, DecodeOptions options = doDefault);
286-
287276
// Decode cb bytes from ptr to into buffer returning the number of characters converted and written to buffer
288277
_Ret_range_(0, pbEnd - _Old_(pbUtf8))
289278
size_t DecodeUnitsInto(_Out_writes_(pbEnd - pbUtf8) char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
290279

291280
// Decode cb bytes from ptr to into buffer returning the number of characters converted and written to buffer (excluding the null terminator)
292281
size_t DecodeUnitsIntoAndNullTerminate(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8& pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
293282

283+
size_t DecodeUnitsIntoAndNullTerminateNoAdvance(__out_ecount(pbEnd - pbUtf8 + 1) __nullterminated char16 *buffer, LPCUTF8 pbUtf8, LPCUTF8 pbEnd, DecodeOptions options = doDefault);
284+
294285
// Encode a UTF-8 sequence into a UTF-8 sequence (which is just a memcpy). This is included for convenience in templates
295286
// when the character encoding is a template parameter.
296287
__range(cch, cch)
@@ -316,7 +307,7 @@ namespace utf8
316307
size_t EncodeTrueUtf8IntoAndNullTerminate(__out_ecount(cch * 3 + 1) utf8char_t *buffer, __in_ecount(cch) const char16 *source, charcount_t cch);
317308

318309
// Returns true if the pch refers to a UTF-16LE encoding of the given UTF-8 encoding bch.
319-
bool CharsAreEqual(__in_ecount(cch) LPCOLESTR pch, LPCUTF8 bch, size_t cch, DecodeOptions options = doDefault);
310+
bool CharsAreEqual(LPCOLESTR pch, LPCUTF8 bch, LPCUTF8 end, DecodeOptions options = doDefault);
320311

321312
// Convert the character index into a byte index.
322313
size_t CharacterIndexToByteIndex(__in_ecount(cbLength) LPCUTF8 pch, size_t cbLength, const charcount_t cchIndex, size_t cbStartIndex, charcount_t cchStartIndex, DecodeOptions options = doDefault);

lib/Common/Codex/Utf8Helper.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ namespace utf8
7373
// Some node tests depend on the utf8 decoder not swallowing invalid unicode characters
7474
// instead of replacing them with the "replacement" chracter. Pass a flag to our
7575
// decoder to require such behavior
76-
utf8::DecodeIntoAndNullTerminate(destString, (LPCUTF8) sourceString, cchDestString,
77-
DecodeOptions::doAllowInvalidWCHARs);
76+
utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(destString, (LPCUTF8) sourceString, (LPCUTF8) sourceString + cbSourceString, DecodeOptions::doAllowInvalidWCHARs);
7877
Assert(destString[cchDestString] == 0);
7978
static_assert(sizeof(utf8char_t) == sizeof(char), "Needs to be valid for cast");
8079
*destStringPtr = destString;

lib/Jsrt/JsrtDebugUtils.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,21 @@ void JsrtDebugUtils::AddSourceLengthAndTextToObject(Js::DynamicObject* object, J
5959
Assert(statementMap != nullptr);
6060

6161
LPCUTF8 source = functionBody->GetStartOfDocument(_u("Source for debugging"));
62-
size_t startByte = utf8::CharacterIndexToByteIndex(source, functionBody->GetUtf8SourceInfo()->GetCbLength(), (const charcount_t)statementMap->sourceSpan.begin);
62+
size_t cbLength = functionBody->GetUtf8SourceInfo()->GetCbLength();
63+
size_t startByte = utf8::CharacterIndexToByteIndex(source, cbLength, (const charcount_t)statementMap->sourceSpan.begin);
64+
size_t endByte = utf8::CharacterIndexToByteIndex(source, cbLength, (const charcount_t)statementMap->sourceSpan.end);
65+
int cch = statementMap->sourceSpan.end - statementMap->sourceSpan.begin;
6366

64-
int byteLength = statementMap->sourceSpan.end - statementMap->sourceSpan.begin;
67+
JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceLength, (double)cch, functionBody->GetScriptContext());
6568

66-
JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceLength, (double)byteLength, functionBody->GetScriptContext());
67-
68-
AutoArrayPtr<char16> sourceContent(HeapNewNoThrowArray(char16, byteLength + 1), byteLength + 1);
69+
AutoArrayPtr<char16> sourceContent(HeapNewNoThrowArray(char16, cch + 1), cch + 1);
6970
if (sourceContent != nullptr)
7071
{
72+
LPCUTF8 pbStart = source + startByte;
73+
LPCUTF8 pbEnd = pbStart + (endByte - startByte);
7174
utf8::DecodeOptions options = functionBody->GetUtf8SourceInfo()->IsCesu8() ? utf8::doAllowThreeByteSurrogates : utf8::doDefault;
72-
utf8::DecodeIntoAndNullTerminate(sourceContent, source + startByte, byteLength, options);
73-
JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceText, sourceContent, byteLength, functionBody->GetScriptContext());
75+
utf8::DecodeUnitsIntoAndNullTerminate(sourceContent, pbStart, pbEnd, options);
76+
JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::sourceText, sourceContent, cch, functionBody->GetScriptContext());
7477
}
7578
else
7679
{
@@ -92,8 +95,10 @@ void JsrtDebugUtils::AddSouceToObject(Js::DynamicObject * object, Js::Utf8Source
9295
AutoArrayPtr<char16> sourceContent(HeapNewNoThrowArray(char16, cchLength + 1), cchLength + 1);
9396
if (sourceContent != nullptr)
9497
{
98+
LPCUTF8 source = utf8SourceInfo->GetSource();
99+
size_t cbLength = utf8SourceInfo->GetCbLength();
95100
utf8::DecodeOptions options = utf8SourceInfo->IsCesu8() ? utf8::doAllowThreeByteSurrogates : utf8::doDefault;
96-
utf8::DecodeIntoAndNullTerminate(sourceContent, utf8SourceInfo->GetSource(), cchLength, options);
101+
utf8::DecodeUnitsIntoAndNullTerminate(sourceContent, source, source + cbLength, options);
97102
JsrtDebugUtils::AddPropertyToObject(object, JsrtDebugPropertyId::source, sourceContent, cchLength, utf8SourceInfo->GetScriptContext());
98103
}
99104
else

lib/Parser/Hash.cpp

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -209,31 +209,42 @@ IdentPtr HashTbl::PidFromTk(tokens token)
209209
{
210210
StaticSym const * sym = s_reservedWordInfo[token].sym;
211211
Assert(sym != nullptr);
212-
rpid = this->PidHashNameLenWithHash(sym->sz, sym->cch, sym->luHash);
212+
rpid = this->PidHashNameLenWithHash(sym->sz, sym->sz + sym->cch, sym->cch, sym->luHash);
213213
rpid->SetTk(token, s_reservedWordInfo[token].grfid);
214214
m_rpid[token] = rpid;
215215
}
216216
return rpid;
217217
}
218218

219219
template <typename CharType>
220-
IdentPtr HashTbl::PidHashNameLen(CharType const * prgch, uint32 cch)
220+
IdentPtr HashTbl::PidHashNameLen(CharType const * prgch, CharType const * end, uint32 cch)
221221
{
222222
// NOTE: We use case sensitive hash during compilation, but the runtime
223223
// uses case insensitive hashing so it can do case insensitive lookups.
224-
uint32 luHash = CaseSensitiveComputeHashCch(prgch, cch);
225-
return PidHashNameLenWithHash(prgch, cch, luHash);
224+
225+
uint32 luHash = CaseSensitiveComputeHash(prgch, end);
226+
return PidHashNameLenWithHash(prgch, end, cch, luHash);
227+
}
228+
template IdentPtr HashTbl::PidHashNameLen<utf8char_t>(utf8char_t const * prgch, utf8char_t const * end, uint32 cch);
229+
template IdentPtr HashTbl::PidHashNameLen<char>(char const * prgch, char const * end, uint32 cch);
230+
template IdentPtr HashTbl::PidHashNameLen<char16>(char16 const * prgch, char16 const * end, uint32 cch);
231+
232+
template <typename CharType>
233+
IdentPtr HashTbl::PidHashNameLen(CharType const * prgch, uint32 cch)
234+
{
235+
Assert(sizeof(CharType) == 2);
236+
return PidHashNameLen(prgch, prgch + cch, cch);
226237
};
227238
template IdentPtr HashTbl::PidHashNameLen<utf8char_t>(utf8char_t const * prgch, uint32 cch);
228239
template IdentPtr HashTbl::PidHashNameLen<char>(char const * prgch, uint32 cch);
229240
template IdentPtr HashTbl::PidHashNameLen<char16>(char16 const * prgch, uint32 cch);
230241

231242
template <typename CharType>
232-
IdentPtr HashTbl::PidHashNameLenWithHash(_In_reads_(cch) CharType const * prgch, int32 cch, uint32 luHash)
243+
IdentPtr HashTbl::PidHashNameLenWithHash(_In_reads_(cch) CharType const * prgch, CharType const * end, int32 cch, uint32 luHash)
233244
{
234245
Assert(cch >= 0);
235246
AssertArrMemR(prgch, cch);
236-
Assert(luHash == CaseSensitiveComputeHashCch(prgch, cch));
247+
Assert(luHash == CaseSensitiveComputeHash(prgch, end));
237248

238249
IdentPtr * ppid;
239250
IdentPtr pid;
@@ -245,7 +256,7 @@ IdentPtr HashTbl::PidHashNameLenWithHash(_In_reads_(cch) CharType const * prgch,
245256
int depth = 0;
246257
#endif
247258

248-
pid = this->FindExistingPid(prgch, cch, luHash, &ppid, &bucketCount
259+
pid = this->FindExistingPid(prgch, end, cch, luHash, &ppid, &bucketCount
249260
#if PROFILE_DICTIONARY
250261
, depth
251262
#endif
@@ -313,14 +324,15 @@ IdentPtr HashTbl::PidHashNameLenWithHash(_In_reads_(cch) CharType const * prgch,
313324
pid->m_propertyId = Js::Constants::NoProperty;
314325
pid->assignmentState = NotAssigned;
315326

316-
HashTbl::CopyString(pid->m_sz, prgch, cch);
327+
HashTbl::CopyString(pid->m_sz, prgch, end);
317328

318329
return pid;
319330
}
320331

321332
template <typename CharType>
322333
IdentPtr HashTbl::FindExistingPid(
323334
CharType const * prgch,
335+
CharType const * end,
324336
int32 cch,
325337
uint32 luHash,
326338
IdentPtr **pppInsert,
@@ -340,7 +352,7 @@ IdentPtr HashTbl::FindExistingPid(
340352
for (bucketCount = 0; nullptr != (pid = *ppid); ppid = &pid->m_pidNext, bucketCount++)
341353
{
342354
if (pid->m_luHash == luHash && (int)pid->m_cch == cch &&
343-
HashTbl::CharsAreEqual(pid->m_sz, prgch, cch))
355+
HashTbl::CharsAreEqual(pid->m_sz, prgch, end))
344356
{
345357
return pid;
346358
}
@@ -362,32 +374,32 @@ IdentPtr HashTbl::FindExistingPid(
362374
}
363375

364376
template IdentPtr HashTbl::FindExistingPid<utf8char_t>(
365-
utf8char_t const * prgch, int32 cch, uint32 luHash, IdentPtr **pppInsert, int32 *pBucketCount
377+
utf8char_t const * prgch, utf8char_t const * end, int32 cch, uint32 luHash, IdentPtr **pppInsert, int32 *pBucketCount
366378
#if PROFILE_DICTIONARY
367379
, int& depth
368380
#endif
369381
);
370382
template IdentPtr HashTbl::FindExistingPid<char>(
371-
char const * prgch, int32 cch, uint32 luHash, IdentPtr **pppInsert, int32 *pBucketCount
383+
char const * prgch, char const * end, int32 cch, uint32 luHash, IdentPtr **pppInsert, int32 *pBucketCount
372384
#if PROFILE_DICTIONARY
373385
, int& depth
374386
#endif
375387
);
376388
template IdentPtr HashTbl::FindExistingPid<char16>(
377-
char16 const * prgch, int32 cch, uint32 luHash, IdentPtr **pppInsert, int32 *pBucketCount
389+
char16 const * prgch, char16 const * end, int32 cch, uint32 luHash, IdentPtr **pppInsert, int32 *pBucketCount
378390
#if PROFILE_DICTIONARY
379391
, int& depth
380392
#endif
381393
);
382394

383395
bool HashTbl::Contains(_In_reads_(cch) LPCOLESTR prgch, int32 cch)
384396
{
385-
uint32 luHash = CaseSensitiveComputeHashCch(prgch, cch);
397+
uint32 luHash = CaseSensitiveComputeHash(prgch, prgch + cch);
386398

387399
for (auto pid = m_prgpidName[luHash & m_luMask]; pid; pid = pid->m_pidNext)
388400
{
389401
if (pid->m_luHash == luHash && (int)pid->m_cch == cch &&
390-
HashTbl::CharsAreEqual(pid->m_sz, prgch, cch))
402+
HashTbl::CharsAreEqual(pid->m_sz, prgch + cch, prgch))
391403
{
392404
return true;
393405
}
@@ -407,7 +419,7 @@ bool HashTbl::Contains(_In_reads_(cch) LPCOLESTR prgch, int32 cch)
407419
// This method is used during colorizing when scanner isn't interested in storing the actual id and does not care about conversion of escape sequences
408420
tokens HashTbl::TkFromNameLenColor(_In_reads_(cch) LPCOLESTR prgch, uint32 cch)
409421
{
410-
uint32 luHash = CaseSensitiveComputeHashCch(prgch, cch);
422+
uint32 luHash = CaseSensitiveComputeHash(prgch, prgch + cch);
411423

412424
// look for a keyword
413425
#include "kwds_sw.h"
@@ -434,7 +446,7 @@ tokens HashTbl::TkFromNameLenColor(_In_reads_(cch) LPCOLESTR prgch, uint32 cch)
434446
// This method is used during colorizing when scanner isn't interested in storing the actual id and does not care about conversion of escape sequences
435447
tokens HashTbl::TkFromNameLen(_In_reads_(cch) LPCOLESTR prgch, uint32 cch, bool isStrictMode)
436448
{
437-
uint32 luHash = CaseSensitiveComputeHashCch(prgch, cch);
449+
uint32 luHash = CaseSensitiveComputeHash(prgch, prgch + cch);
438450

439451
// look for a keyword
440452
#include "kwds_sw.h"

0 commit comments

Comments
 (0)