Skip to content

Commit c6ae5d6

Browse files
committed
HTML Comment Spec Compliance
Removed custom logic from HTML comment handling and brought up to date with B.1.3 HTML-like Comments. - Removed HTML comment flag and replaced with a flag for checking if we are parsing a module (the only place we don't allow HTML comments in the standard.) - Removed custom logic involving looking for EOF and } when parsing HTML end comment and jump to single line comment handling instead. - Removed some scanner lookahead functions that are no longer used. - Added negative parsing cases to modules syntax testing. The unit test added for this commit is a special file and must be treated like binary to preserve the mixed and non-standard line endings. I added a line to .gitattributes to ensure this. Fixes chakra-core#20.
1 parent db15dfe commit c6ae5d6

7 files changed

Lines changed: 119 additions & 61 deletions

File tree

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
*.baseline -crlf
22
*.cmd -crlf
33
test/*.js -crlf
4+
test/es6/HTMLComments.js binary diff=cpp

lib/Parser/Scan.cpp

Lines changed: 18 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ void Scanner<EncodingPolicy>::SetText(EncodedCharPtr pszSrc, size_t offset, size
202202
m_startLine = lineNumber;
203203
m_pchStartLine = m_currentCharacter;
204204
m_ptoken->tk = tkNone;
205-
m_fHtmlComments = (grfscr & fscrHtmlComments) != 0;
205+
m_fIsModuleCode = (grfscr & fscrIsModuleCode) != 0;
206206
m_fHadEol = FALSE;
207207
m_fSyntaxColor = (grfscr & fscrSyntaxColor) != 0;
208208
m_DeferredParseFlags = ScanFlagNone;
@@ -1614,6 +1614,8 @@ tokens Scanner<EncodingPolicy>::SkipComment(EncodedCharPtr *pp, /* out */ bool*
16141614
return tkNone;
16151615
}
16161616
break;
1617+
1618+
// ES 2015 11.3 Line Terminators
16171619
case kchLS: // 0x2028, classifies as new line
16181620
case kchPS: // 0x2029, classifies as new line
16191621
LEcmaLineBreak:
@@ -1756,6 +1758,7 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
17561758
m_fHadEol = FALSE;
17571759
CharTypes chType;
17581760
charcount_t commentStartLine;
1761+
bool seenDelimitedCommentEnd = false;
17591762

17601763
if (m_scanState && *p != 0)
17611764
{
@@ -1934,16 +1937,19 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
19341937
}
19351938
case '(': Assert(chType == _C_LPR); token = tkLParen; break;
19361939
case ')': Assert(chType == _C_RPR); token = tkRParen; break;
1937-
case ',': Assert(chType == _C_CMA); token = tkComma; break;
1940+
case ',': Assert(chType == _C_CMA); token = tkComma; break;
19381941
case ';': Assert(chType == _C_SMC); token = tkSColon; break;
19391942
case '[': Assert(chType == _C_LBR); token = tkLBrack; break;
19401943
case ']': Assert(chType == _C_RBR); token = tkRBrack; break;
1941-
case '~': Assert(chType == _C_TIL); token = tkTilde; break;
1942-
case '?': Assert(chType == _C_QUE); token = tkQMark; break;
1943-
case '{': Assert(chType == _C_LC); token = tkLCurly; break;
1944+
case '~': Assert(chType == _C_TIL); token = tkTilde; break;
1945+
case '?': Assert(chType == _C_QUE); token = tkQMark; break;
1946+
case '{': Assert(chType == _C_LC); token = tkLCurly; break;
19441947

1948+
// ES 2015 11.3 Line Terminators
19451949
case '\r':
19461950
case '\n':
1951+
// kchLS:
1952+
// kchPS:
19471953
LNewLine:
19481954
m_currentCharacter = p;
19491955
ScanNewLine(ch);
@@ -2087,36 +2093,11 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
20872093
case '-':
20882094
p++;
20892095
token = tkDec;
2090-
if (m_fHtmlComments)
2096+
if (!m_fIsModuleCode)
20912097
{
2092-
int i = 0;
2093-
while ('-' == PeekFirst(p + i, last)) //Have already seen --, skip any further - characters
2094-
i++;
2095-
if ('>' == PeekFirst(p + i++, last)) //This means we've got a --------------------------->.
2098+
if ('>' == PeekFirst(p, last) && (m_fHadEol || seenDelimitedCommentEnd)) // --> HTMLCloseComment
20962099
{
2097-
//If that precedes an EOF or }NWL (disregarding whitespace), then it is a comment.
2098-
OLECHAR nextChar;
2099-
nextChar = NextNonWhiteChar(&p[i], last);
2100-
if (nextChar == 0)
2101-
{
2102-
//Treat the -----------------------------> EOF as if it were EOF
2103-
token = tkEOF;
2104-
++p;
2105-
}
2106-
else if (nextChar == '}')
2107-
{
2108-
CharTypes nextNextCharType = this->charClassifier->GetCharType(NextNonWhiteCharPlusOne(&p[i], last));
2109-
if (nextNextCharType == _C_NWL
2110-
// Corner case: If we have reached the end of the source, either we are at the end of the file or the end of
2111-
// a deferred function. We treat this case as NWL.
2112-
// TODO(tcare): Update to ES6 spec. Tracked in Bug 1164686
2113-
|| (last == m_pchLast && nextNextCharType == _C_NUL))
2114-
{
2115-
//Treat the -----------------------------> }NWL as if it were }NWL
2116-
p += i;
2117-
continue;
2118-
}
2119-
}
2100+
goto LSkipLineComment;
21202101
}
21212102
}
21222103
break;
@@ -2155,7 +2136,7 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
21552136
case '/':
21562137
if (p >= last)
21572138
{
2158-
AssertMsg(m_fHtmlComments, "Do we have other line comment cases scanning pass last?");
2139+
AssertMsg(!m_fIsModuleCode, "Do we have other line comment cases scanning pass last?");
21592140

21602141
// Effective source length may have excluded HTMLCommentSuffix "//... -->". If we are scanning
21612142
// those, we have passed "last" already. Move back and return EOF.
@@ -2251,6 +2232,7 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
22512232
// of deciding whether to defer AST and byte code generation.
22522233
m_parser->ReduceDeferredScriptLength((ULONG)(pchT - m_pchMinTok));
22532234
p = pchT;
2235+
seenDelimitedCommentEnd = true;
22542236
goto LLoop;
22552237
}
22562238
p = pchT;
@@ -2286,7 +2268,8 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
22862268
}
22872269
break;
22882270
case '!':
2289-
if (m_fHtmlComments && PeekFirst(p + 1, last) == '-' && PeekFirst(p + 2, last) == '-')
2271+
// ES 2015 B.1.3 - HTML comments are only allowed when parsing non-module code.
2272+
if (!m_fIsModuleCode && PeekFirst(p + 1, last) == '-' && PeekFirst(p + 2, last) == '-')
22902273
{
22912274
// This is a "<!--" comment - treat as //
22922275
if (p >= last)

lib/Parser/Scan.h

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -663,7 +663,7 @@ class Scanner : public IScanner, public EncodingPolicy
663663
ErrHandler *m_perr; // error handler to use
664664
uint16 m_fStringTemplateDepth; // we should treat } as string template middle starting character (depth instead of flag)
665665
BOOL m_fHadEol;
666-
BOOL m_fHtmlComments : 1;
666+
BOOL m_fIsModuleCode : 1;
667667
BOOL m_doubleQuoteOnLastTkStrCon :1;
668668
bool m_OctOrLeadingZeroOnLastTKNumber :1;
669669
BOOL m_fSyntaxColor : 1; // whether we're just syntax coloring
@@ -762,26 +762,6 @@ class Scanner : public IScanner, public EncodingPolicy
762762
{
763763
return ReadFull<true>(m_currentCharacter, m_pchLast);
764764
}
765-
OLECHAR NextNonWhiteChar(EncodedCharPtr p, EncodedCharPtr last)
766-
{
767-
OLECHAR ch;
768-
do
769-
{
770-
ch = ReadFull<false>(p, last);
771-
}
772-
while (this->charClassifier->IsWhiteSpace(ch));
773-
return ch;
774-
}
775-
OLECHAR NextNonWhiteCharPlusOne(EncodedCharPtr p, EncodedCharPtr last)
776-
{
777-
OLECHAR ch;
778-
do
779-
{
780-
ch = ReadFull<false>(p, last);
781-
}
782-
while (this->charClassifier->IsWhiteSpace(ch));
783-
return ReadFull<false>(p, last);
784-
}
785765

786766
EncodedCharPtr AdjustedLast() const
787767
{

test/es6/HTMLComments.baseline

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Code before CRLF--> is reachable
2+
Code before CR--> is reachable
3+
Code before LF--> is reachable
4+
Code before LS--> is reachable
5+
Code before PS--> is reachable
6+
Code before CRLS--> is reachable
7+
Code before CRPS--> is reachable
8+
Code before <!-- is reachable
9+
Code before <!-- --> is reachable
10+
Code before <!-- LineTerminator --> is reachable
11+
Code before /* */ --> is reachable
12+
Code before /* */--> is reachable
13+
Code after post-decrement with a greater-than comparison (-->) is reachable

test/es6/HTMLComments.js

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//-------------------------------------------------------------------------------------------------------
2+
// Copyright (C) Microsoft. All rights reserved.
3+
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4+
//-------------------------------------------------------------------------------------------------------
5+
6+
/* NOTE: This file needs to be treated as binary. It contains mixed line endings, including non-standard
7+
* line endings. Most text editors will not handle the file correctly. If you need to edit this
8+
* file, make sure you do a binary compare to ensure the non-standard line endings have not been lost.
9+
*
10+
* 'LS' refers to Unicode Character 'LINE SEPARATOR' (U+2028)
11+
* 'PS' refers to Unicode Character 'PARAGRAPH SEPARATOR' (U+2029)
12+
*/
13+
14+
WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
15+
16+
/*
17+
* Line terminator sequences - standard (11.3 LineTerminator)
18+
*/
19+
20+
// CRLF
21+
WScript.Echo("Code before CRLF--> is reachable");
22+
--> WScript.Echo("Code after CRLF--> is unreachable");
23+
24+
// CR
25+
WScript.Echo("Code before CR--> is reachable");--> WScript.Echo("Code after CR--> is unreachable");
26+
27+
// LF
28+
WScript.Echo("Code before LF--> is reachable");
29+
--> WScript.Echo("Code after LF--> is unreachable");
30+
31+
// LS
32+
WScript.Echo("Code before LS--> is reachable");
--> WScript.Echo("Code after LS--> is unreachable");
33+
34+
// PS
35+
WScript.Echo("Code before PS--> is reachable");
--> WScript.Echo("Code after PS--> is unreachable");
36+
37+
/*
38+
* Line terminator sequences - non-standard (11.3 LineTerminatorSequence <CR>[lookahead != <LF>])
39+
*/
40+
41+
// CRLS
42+
WScript.Echo("Code before CRLS--> is reachable");
--> WScript.Echo("Code after CRLS--> is unreachable");
43+
44+
// CRPS
45+
WScript.Echo("Code before CRPS--> is reachable");
--> WScript.Echo("Code after CRPS--> is unreachable");
46+
47+
// HTML open comment comments out the rest of the line
48+
WScript.Echo("Code before <!-- is reachable"); <!-- WScript.Echo("Code after <!-- is unreachable");
49+
WScript.Echo("Code before <!-- --> is reachable"); <!-- --> WScript.Echo("Code after <!-- --> is unreachable");
50+
51+
// Split multiline HTML comment comments out both lines
52+
WScript.Echo("Code before <!-- LineTerminator --> is reachable"); <!-- WScript.Echo("Code after multiline <!-- is unreachable");
53+
--> WScript.Echo("Code after <!-- LineTerminator --> is unreachable");
54+
55+
// Delimited comments syntax
56+
/* Multi
57+
Line
58+
Comment */ --> WScript.Echo("Code after */ --> is unreachable");
59+
WScript.Echo("Code before /* */ --> is reachable"); /* Comment */ --> WScript.Echo("Code after /* */ --> is unreachable");
60+
WScript.Echo("Code before /* */--> is reachable"); /* Comment */--> WScript.Echo("Code after /* */--> is unreachable"); // No WhiteSpaceSequence
61+
62+
// Post-decrement with a greater-than comparison does not get interpreted as a comment
63+
var a = 1; a-->a; WScript.Echo("Code after post-decrement with a greater-than comparison (-->) is reachable");
64+
assert.areEqual(0, a, "Post decrement executes");
65+
66+
assert.throws(function () { eval('/* */ --->'); }, SyntaxError, "HTMLCloseComment causes syntax error with an extra -", "Syntax error");

test/es6/module-syntax.js

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,21 @@ WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
99

1010
function testModuleScript(source, message, shouldFail) {
1111
let testfunc = () => WScript.LoadModule(source, 'samethread');
12-
12+
1313
if (shouldFail) {
1414
let caught = false;
15-
15+
1616
// We can't use assert.throws here because the SyntaxError used to construct the thrown error
1717
// is from a different context so it won't be strictly equal to our SyntaxError.
1818
try {
1919
testfunc();
2020
} catch(e) {
2121
caught = true;
22-
22+
2323
// Compare toString output of SyntaxError and other context SyntaxError constructor.
2424
assert.areEqual(e.constructor.toString(), SyntaxError.toString(), message);
2525
}
26-
26+
2727
assert.isTrue(caught, `Expected error not thrown: ${message}`);
2828
} else {
2929
assert.doesNotThrow(testfunc, message);
@@ -124,6 +124,16 @@ var tests = [
124124
assert.doesNotThrow(function () { WScript.LoadModuleFile('.\\module\\ValidReExportStatements.js', 'samethread'); }, "Valid re-export statements");
125125
}
126126
},
127+
{
128+
name: "HTML comments do not parse in module code",
129+
body: function () {
130+
testModuleScript("<!--\n", "HTML open comment does not parse in module code", true);
131+
testModuleScript("\n-->", "HTML close comment does not parse in module code", true);
132+
testModuleScript("<!-- -->", "HTML comment does not parse in module code", true);
133+
testModuleScript("/* */ -->", "HTML comment after delimited comment does not parse in module code", true);
134+
testModuleScript("/* */\n-->", "HTML comment after delimited comment does not parse in module code", true);
135+
}
136+
}
127137
];
128138

129139
testRunner.runTests(tests, { verbose: WScript.Arguments[0] != "summary" });

test/es6/rlexe.xml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1141,7 +1141,12 @@
11411141
<compile-flags>-args summary -endargs</compile-flags>
11421142
</default>
11431143
</test>
1144-
1144+
<test>
1145+
<default>
1146+
<files>HTMLComments.js</files>
1147+
<baseline>HTMLComments.baseline</baseline>
1148+
</default>
1149+
</test>
11451150
<test>
11461151
<default>
11471152
<files>module-syntax.js</files>

0 commit comments

Comments
 (0)