HTML Comment Spec Compliance

tcare · tcare · commit c6ae5d6a8276 · 2016-03-01T12:09:14.000-08:00
Removed custom logic from HTML comment handling and brought it up to date with ES 2015 B.1.3 HTML-like Comments. - Removed the HTML comment flag and replaced it with a flag for checking whether we are parsing a module (the only place where the standard does not allow HTML comments). - Removed custom logic that looked for EOF and } when parsing an HTML end comment; the scanner now jumps to single-line comment handling instead. - Removed some scanner lookahead functions that are no longer used. - Added negative parsing cases to the module syntax tests. The unit test added for this commit is a special file and must be treated as binary to preserve the mixed and non-standard line endings. I added a line to .gitattributes to ensure this. Fixes chakra-core#20.
diff --git a/.gitattributes b/.gitattributes
@@ -1,3 +1,4 @@
 *.baseline -crlf
 *.cmd -crlf
 test/*.js -crlf
+test/es6/HTMLComments.js binary diff=cpp
diff --git a/lib/Parser/Scan.cpp b/lib/Parser/Scan.cpp
@@ -202,7 +202,7 @@ void Scanner<EncodingPolicy>::SetText(EncodedCharPtr pszSrc, size_t offset, size
     m_startLine = lineNumber;
     m_pchStartLine = m_currentCharacter;
     m_ptoken->tk = tkNone;
-    m_fHtmlComments = (grfscr & fscrHtmlComments) != 0;
+    m_fIsModuleCode = (grfscr & fscrIsModuleCode) != 0;
     m_fHadEol = FALSE;
     m_fSyntaxColor = (grfscr & fscrSyntaxColor) != 0;
     m_DeferredParseFlags = ScanFlagNone;
@@ -1614,6 +1614,8 @@ tokens Scanner<EncodingPolicy>::SkipComment(EncodedCharPtr *pp, /* out */ bool*
                 return tkNone;
             }
             break;
+
+        // ES 2015 11.3 Line Terminators
         case kchLS:         // 0x2028, classifies as new line
         case kchPS:         // 0x2029, classifies as new line
 LEcmaLineBreak:
@@ -1756,6 +1758,7 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
     m_fHadEol = FALSE;
     CharTypes chType;
     charcount_t commentStartLine;
+    bool seenDelimitedCommentEnd = false;
 
     if (m_scanState && *p != 0)
     {
@@ -1934,16 +1937,19 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
             }
         case '(': Assert(chType == _C_LPR); token = tkLParen; break;
         case ')': Assert(chType == _C_RPR); token = tkRParen; break;
-        case ',': Assert(chType == _C_CMA); token = tkComma; break;
+        case ',': Assert(chType == _C_CMA); token = tkComma;  break;
         case ';': Assert(chType == _C_SMC); token = tkSColon; break;
         case '[': Assert(chType == _C_LBR); token = tkLBrack; break;
         case ']': Assert(chType == _C_RBR); token = tkRBrack; break;
-        case '~': Assert(chType == _C_TIL); token = tkTilde; break;
-        case '?': Assert(chType == _C_QUE); token = tkQMark; break;
-        case '{': Assert(chType == _C_LC); token = tkLCurly; break;
+        case '~': Assert(chType == _C_TIL); token = tkTilde;  break;
+        case '?': Assert(chType == _C_QUE); token = tkQMark;  break;
+        case '{': Assert(chType == _C_LC);  token = tkLCurly; break;
 
+        // ES 2015 11.3 Line Terminators
         case '\r':
         case '\n':
+        // kchLS:
+        // kchPS:
 LNewLine:
             m_currentCharacter = p;
             ScanNewLine(ch);
@@ -2087,36 +2093,11 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
             case '-':
                 p++;
                 token = tkDec;
-                if (m_fHtmlComments)
+                if (!m_fIsModuleCode)
                 {
-                    int i = 0;
-                    while ('-' == PeekFirst(p + i, last)) //Have already seen --, skip any further - characters
-                        i++;
-                    if ('>' == PeekFirst(p + i++, last)) //This means we've got a --------------------------->.
+                    if ('>' == PeekFirst(p, last) && (m_fHadEol || seenDelimitedCommentEnd)) // --> HTMLCloseComment
                     {
-                        //If that precedes an EOF or }NWL (disregarding whitespace), then it is a comment.
-                        OLECHAR nextChar;
-                        nextChar = NextNonWhiteChar(&p[i], last);
-                        if (nextChar == 0)
-                        {
-                            //Treat the -----------------------------> EOF as if it were EOF
-                            token = tkEOF;
-                            ++p;
-                        }
-                        else if (nextChar == '}')
-                        {
-                            CharTypes nextNextCharType = this->charClassifier->GetCharType(NextNonWhiteCharPlusOne(&p[i], last));
-                            if (nextNextCharType == _C_NWL
-                                // Corner case: If we have reached the end of the source, either we are at the end of the file or the end of
-                                // a deferred function. We treat this case as NWL.
-                                // TODO(tcare): Update to ES6 spec. Tracked in Bug 1164686
-                                || (last == m_pchLast && nextNextCharType == _C_NUL))
-                            {
-                                //Treat the -----------------------------> }NWL as if it were }NWL
-                                p += i;
-                                continue;
-                            }
-                        }
+                        goto LSkipLineComment;
                     }
                 }
                 break;
@@ -2155,7 +2136,7 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
             case '/':
                 if (p >= last)
                 {
-                    AssertMsg(m_fHtmlComments, "Do we have other line comment cases scanning pass last?");
+                    AssertMsg(!m_fIsModuleCode, "Do we have other line comment cases scanning pass last?");
 
                     // Effective source length may have excluded HTMLCommentSuffix "//... -->". If we are scanning
                     // those, we have passed "last" already. Move back and return EOF.
@@ -2251,6 +2232,7 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
                     // of deciding whether to defer AST and byte code generation.
                     m_parser->ReduceDeferredScriptLength((ULONG)(pchT - m_pchMinTok));
                     p = pchT;
+                    seenDelimitedCommentEnd = true;
                     goto LLoop;
                 }
                 p = pchT;
@@ -2286,7 +2268,8 @@ tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
                 }
                 break;
             case '!':
-                if (m_fHtmlComments && PeekFirst(p + 1, last) == '-' && PeekFirst(p + 2, last) == '-')
+                // ES 2015 B.1.3 - HTML comments are only allowed when parsing non-module code.
+                if (!m_fIsModuleCode && PeekFirst(p + 1, last) == '-' && PeekFirst(p + 2, last) == '-')
                 {
                     // This is a "<!--" comment - treat as //
                     if (p >= last)
diff --git a/lib/Parser/Scan.h b/lib/Parser/Scan.h
@@ -663,7 +663,7 @@ class Scanner : public IScanner, public EncodingPolicy
     ErrHandler *m_perr;                // error handler to use
     uint16 m_fStringTemplateDepth;     // we should treat } as string template middle starting character (depth instead of flag)
     BOOL m_fHadEol;
-    BOOL m_fHtmlComments : 1;
+    BOOL m_fIsModuleCode : 1;
     BOOL m_doubleQuoteOnLastTkStrCon :1;
     bool m_OctOrLeadingZeroOnLastTKNumber :1;
     BOOL m_fSyntaxColor : 1;            // whether we're just syntax coloring
@@ -762,26 +762,6 @@ class Scanner : public IScanner, public EncodingPolicy
     {
         return ReadFull<true>(m_currentCharacter, m_pchLast);
     }
-    OLECHAR NextNonWhiteChar(EncodedCharPtr p, EncodedCharPtr last)
-    {
-        OLECHAR ch;
-        do
-        {
-            ch = ReadFull<false>(p, last);
-        }
-        while (this->charClassifier->IsWhiteSpace(ch));
-        return ch;
-    }
-    OLECHAR NextNonWhiteCharPlusOne(EncodedCharPtr p, EncodedCharPtr last)
-    {
-        OLECHAR ch;
-        do
-        {
-            ch = ReadFull<false>(p, last);
-        }
-        while (this->charClassifier->IsWhiteSpace(ch));
-        return ReadFull<false>(p, last);
-    }
 
     EncodedCharPtr AdjustedLast() const
     {
diff --git a/test/es6/HTMLComments.baseline b/test/es6/HTMLComments.baseline
@@ -0,0 +1,13 @@
+Code before CRLF--> is reachable
+Code before CR--> is reachable
+Code before LF--> is reachable
+Code before LS--> is reachable
+Code before PS--> is reachable
+Code before CRLS--> is reachable
+Code before CRPS--> is reachable
+Code before <!-- is reachable
+Code before <!-- --> is reachable
+Code before <!-- LineTerminator --> is reachable
+Code before /* */ --> is reachable
+Code before /* */--> is reachable
+Code after post-decrement with a greater-than comparison (-->) is reachable
diff --git a/test/es6/HTMLComments.js b/test/es6/HTMLComments.js
@@ -0,0 +1,66 @@
+﻿//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+/* NOTE: This file needs to be treated as binary. It contains mixed line endings, including non-standard
+ *       line endings. Most text editors will not handle the file correctly. If you need to edit this
+ *       file, make sure you do a binary compare to ensure the non-standard line endings have not been lost.
+ *
+ *       'LS' refers to Unicode Character 'LINE SEPARATOR' (U+2028)
+ *       'PS' refers to Unicode Character 'PARAGRAPH SEPARATOR' (U+2029)
+ */
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+/*
+ * Line terminator sequences - standard (11.3 LineTerminator)
+ */
+ 
+// CRLF
+WScript.Echo("Code before CRLF--> is reachable");
+--> WScript.Echo("Code after CRLF--> is unreachable");
+
+// CR
+WScript.Echo("Code before CR--> is reachable");--> WScript.Echo("Code after CR--> is unreachable");
+
+// LF
+WScript.Echo("Code before LF--> is reachable");
+--> WScript.Echo("Code after LF--> is unreachable");
+
+// LS
+WScript.Echo("Code before LS--> is reachable"); --> WScript.Echo("Code after LS--> is unreachable");
+
+// PS
+WScript.Echo("Code before PS--> is reachable"); --> WScript.Echo("Code after PS--> is unreachable");
+
+/*
+ * Line terminator sequences - non-standard (11.3 LineTerminatorSequence <CR>[lookahead != <LF>])
+ */
+
+// CRLS
+WScript.Echo("Code before CRLS--> is reachable"); --> WScript.Echo("Code after CRLS--> is unreachable");
+
+// CRPS
+WScript.Echo("Code before CRPS--> is reachable"); --> WScript.Echo("Code after CRPS--> is unreachable");
+
+// HTML open comment comments out the rest of the line
+WScript.Echo("Code before <!-- is reachable"); <!-- WScript.Echo("Code after <!-- is unreachable");
+WScript.Echo("Code before <!-- --> is reachable"); <!-- --> WScript.Echo("Code after <!-- --> is unreachable");
+
+// Split multiline HTML comment comments out both lines
+WScript.Echo("Code before <!-- LineTerminator --> is reachable"); <!-- WScript.Echo("Code after multiline <!-- is unreachable");
+--> WScript.Echo("Code after <!-- LineTerminator --> is unreachable");
+
+// Delimited comments syntax
+/* Multi
+   Line
+   Comment */ --> WScript.Echo("Code after */ --> is unreachable");
+WScript.Echo("Code before /* */ --> is reachable"); /* Comment */ --> WScript.Echo("Code after /* */ --> is unreachable");
+WScript.Echo("Code before /* */--> is reachable"); /* Comment */--> WScript.Echo("Code after /* */--> is unreachable"); // No WhiteSpaceSequence
+
+// Post-decrement with a greater-than comparison does not get interpreted as a comment
+var a = 1; a-->a; WScript.Echo("Code after post-decrement with a greater-than comparison (-->) is reachable");
+assert.areEqual(0, a, "Post decrement executes");
+
+assert.throws(function () { eval('/* */ --->'); }, SyntaxError, "HTMLCloseComment causes syntax error with an extra -", "Syntax error");
diff --git a/test/es6/module-syntax.js b/test/es6/module-syntax.js
@@ -9,21 +9,21 @@ WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
 
 function testModuleScript(source, message, shouldFail) {
     let testfunc = () => WScript.LoadModule(source, 'samethread');
-    
+
     if (shouldFail) {
         let caught = false;
-        
+
         // We can't use assert.throws here because the SyntaxError used to construct the thrown error
         // is from a different context so it won't be strictly equal to our SyntaxError.
         try {
             testfunc();
         } catch(e) {
             caught = true;
-            
+
             // Compare toString output of SyntaxError and other context SyntaxError constructor.
             assert.areEqual(e.constructor.toString(), SyntaxError.toString(), message);
         }
-        
+
         assert.isTrue(caught, `Expected error not thrown: ${message}`);
     } else {
         assert.doesNotThrow(testfunc, message);
@@ -124,6 +124,16 @@ var tests = [
             assert.doesNotThrow(function () { WScript.LoadModuleFile('.\\module\\ValidReExportStatements.js', 'samethread'); }, "Valid re-export statements");
         }
     },
+    {
+        name: "HTML comments do not parse in module code",
+        body: function () {
+            testModuleScript("<!--\n",     "HTML open comment does not parse in module code",  true);
+            testModuleScript("\n-->",      "HTML close comment does not parse in module code", true);
+            testModuleScript("<!-- -->",   "HTML comment does not parse in module code",       true);
+            testModuleScript("/* */ -->",  "HTML comment after delimited comment does not parse in module code", true);
+            testModuleScript("/* */\n-->", "HTML comment after delimited comment does not parse in module code", true);
+        }
+    }
 ];
 
 testRunner.runTests(tests, { verbose: WScript.Arguments[0] != "summary" });
diff --git a/test/es6/rlexe.xml b/test/es6/rlexe.xml
@@ -1141,7 +1141,12 @@
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
-
+<test>
+    <default>
+        <files>HTMLComments.js</files>
+        <baseline>HTMLComments.baseline</baseline>
+    </default>
+</test>
 <test>
     <default>
         <files>module-syntax.js</files>