Skip to content

Commit f981179

Browse files
committed
make comments inside CDATA and RCDATA work; someone please review the HTMLInputStream regression
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40760
1 parent 3dd403b commit f981179

2 files changed

Lines changed: 35 additions & 9 deletions

File tree

src/inputstream.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -202,11 +202,17 @@ def charsUntil(self, characters, opposite = False):
202202
# Put the character stopped on back to the front of the queue
203203
# from where it came.
204204
c = charStack.pop()
205-
if c != EOF and self.tell <= len(self.dataStream) and \
206-
self.dataStream[self.tell - 1] == c[0]:
207-
self.tell -= 1
208-
else:
209-
self.queue.insert(0, c)
205+
self.queue.insert(0, c)
206+
207+
# XXX the following is needed for correct line number reporting apparently
208+
# but it causes other tests to break with the tokenizer fixes. I have
209+
# no idea why...
210+
#
211+
#if c != EOF and self.tell <= len(self.dataStream) and \
212+
# self.dataStream[self.tell - 1] == c[0]:
213+
# self.tell -= 1
214+
#else:
215+
# self.queue.insert(0, c)
210216
return "".join(charStack)
211217

212218
class EncodingBytes(str):

src/tokenizer.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ def __init__(self, stream, encoding=None, parseMeta=True):
7070

7171
# Setup the initial tokenizer state
7272
self.contentModelFlag = contentModelFlags["PCDATA"]
73+
self.escapeFlag = False
74+
self.lastFourChars = []
7375
self.state = self.states["data"]
7476

7577
# The current token being created
@@ -273,12 +275,30 @@ def emitCurrentToken(self):
273275

274276
def dataState(self):
275277
data = self.stream.char()
276-
if data == u"&" and self.contentModelFlag in\
278+
if self.contentModelFlag in\
279+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
280+
if len(self.lastFourChars) == 4:
281+
self.lastFourChars.pop(0)
282+
self.lastFourChars.append(data)
283+
if data == "&" and self.contentModelFlag in\
277284
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
278285
self.state = self.states["entityData"]
279-
elif data == u"<" and self.contentModelFlag !=\
280-
contentModelFlags["PLAINTEXT"]:
286+
elif data == "-" and self.contentModelFlag in\
287+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
288+
self.escapeFlag == False and\
289+
"".join(self.lastFourChars) == "<!--":
290+
self.escapeFlag = True
291+
self.tokenQueue.append({"type": "Characters", "data":data})
292+
elif data == "<" and (self.contentModelFlag ==\
293+
contentModelFlags["PCDATA"] or (self.contentModelFlag in
294+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
295+
self.escapeFlag == False)):
281296
self.state = self.states["tagOpen"]
297+
elif data == ">" and self.contentModelFlag in\
298+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
299+
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
300+
self.escapeFlag = False
301+
self.tokenQueue.append({"type": "Characters", "data":data})
282302
elif data == EOF:
283303
# Tokenization ends.
284304
return False
@@ -292,7 +312,7 @@ def dataState(self):
292312
data + self.stream.charsUntil(spaceCharacters, True)})
293313
else:
294314
self.tokenQueue.append({"type": "Characters", "data":
295-
data + self.stream.charsUntil((u"&", u"<"))})
315+
data + self.stream.charsUntil(("&", "<", ">", "-"))})
296316
return True
297317

298318
def entityDataState(self):

0 commit comments

Comments
 (0)