Merge

James Graham · James Graham · commit 448271009f64 · 2010-08-09T13:45:31.000+02:00
diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -13,7 +13,7 @@
 E = {
     "null-character": 
        _(u"Null character in input stream, replaced with U+FFFD."),
-    "invalid-character": 
+    "invalid-codepoint": 
        _(u"Invalid codepoint in stream."),
     "incorrectly-placed-solidus":
        _(u"Solidus (/) incorrectly placed in tag."),
@@ -74,6 +74,10 @@
         _(u"Unexpected = in unquoted attribute"),
     'unexpected-character-in-unquoted-attribute-value':
         _(u"Unexpected character in unquoted attribute"),
+    "invalid-character-after-attribute-name":
+       _(u"Unexpected character after attribute name."),
+    "unexpected-character-after-attribute-value":
+       _(u"Unexpected character after attribute value."),
     "eof-in-attribute-value-double-quote":
        _(u"Unexpected end of file in attribute value (\")."),
     "eof-in-attribute-value-single-quote":
@@ -100,6 +104,10 @@
        _(u"Unexpected '-' after '--' found in comment."),
     "eof-in-comment-double-dash":
        _(u"Unexpected end of file in comment (--)."),
+    "eof-in-comment-end-space-state":
+       _(u"Unexpected end of file in comment."),
+    "eof-in-comment-end-bang-state":
+       _(u"Unexpected end of file in comment."),
     "unexpected-char-in-comment":
        _(u"Unexpected character in comment found."),
     "need-space-after-doctype":
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -134,8 +134,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         #Craziness
         if len(u"\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
+            self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
+            self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
 
         # List of where new lines occur
         self.newLines = [0]
@@ -159,6 +161,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         if (self.charEncoding[0] is None):
             self.charEncoding = self.detectEncoding(parseMeta, chardet)
 
+
         self.reset()
 
     def reset(self):
@@ -175,8 +178,8 @@ def reset(self):
         # number of columns in the last line of the previous chunk
         self.prevNumCols = 0
         
-        #Flag to indicate we may have a CR LF broken across a data chunk
-        self._lastChunkEndsWithCR = False
+        #Deal with CR LF and surrogates split over chunk boundaries
+        self._bufferedCharacter = None
 
     def openStream(self, source):
         """Produces a file object from source.
@@ -341,20 +344,27 @@ def readChunk(self, chunkSize=None):
         self.chunkOffset = 0
 
         data = self.dataStream.read(chunkSize)
-
-        if not data:
+        
+        #Deal with CR LF and surrogates broken across chunks
+        if self._bufferedCharacter:
+            data = self._bufferedCharacter + data
+            self._bufferedCharacter = None
+        elif not data:
+            # We have no more data, bye-bye stream
             return False
         
+        if len(data) > 1:
+            lastv = ord(data[-1])
+            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
+                self._bufferedCharacter = data[-1]
+                data = data[:-1]
+        
         self.reportCharacterErrors(data)
-
+        
+        # Replace invalid characters
         data = data.replace(u"\u0000", u"\ufffd")
-        #Check for CR LF broken across chunks
-        if (self._lastChunkEndsWithCR and data[0] == u"\n"):
-            data = data[1:]
-            # Stop if the chunk is now empty
-            if not data:
-                return False
-        self._lastChunkEndsWithCR = data[-1] == u"\r"
+        data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
+                    
         data = data.replace(u"\r\n", u"\n")
         data = data.replace(u"\r", u"\n")
 
@@ -394,8 +404,6 @@ def characterErrorsUCS2(self, data):
             else:
                 skip = False
                 self.errors.append("invalid-codepoint")
-        #This is still wrong if it is possible for a surrogate pair to break a
-        #chunk boundary
 
     def charsUntil(self, characters, opposite = False):
         """ Returns a string of characters from the stream up to but not
diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
@@ -27,7 +27,11 @@
     for k, v in entities.items():
         if v != "&" and encode_entity_map.get(v) != k.lower():
             # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
-            encode_entity_map[ord(v)] = k
+            if len(v) == 2:
+                v = utils.surrogatePairToCodepoint(v)
+            else:
+                v = ord(v)
+            encode_entity_map[v] = k
 
     def htmlentityreplace_errors(exc):
         if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
diff --git a/html5lib/tests/tokenizertotree.py b/html5lib/tests/tokenizertotree.py
@@ -0,0 +1,64 @@
+import sys
+import os
+import json
+import re
+
+import html5lib
+import support
+import test_parser
+import test_tokenizer
+
+p = html5lib.HTMLParser()  # single parser instance, reused by every make_test() call
+# Bound .sub of the pattern; make_test uses it to turn a line-leading "<html NAME>" into "<NAME>".
+unnamespaceExpected = re.compile(r"^(\s*)<html (\S+)>", re.M).sub
+
+def main(out_path):
+    """Run run_file on each tokenizer '*.test' file; exit with status 1 if out_path is missing."""
+    if not os.path.exists(out_path):
+        sys.stderr.write("Path %s does not exist\n"%out_path)  # newline-terminated message
+        sys.exit(1)
+    for filename in support.html5lib_test_files('tokenizer', '*.test'):
+        run_file(filename, out_path)
+
+def run_file(filename, out_path):
+    """Convert one tokenizer test file into out_path/tokenizer_<name>.dat.
+
+    Only tests whose initial state is "Data state" are converted.
+    A file that json cannot parse is reported on stderr and skipped.
+    """
+    try:
+        with open(filename) as test_file:  # with-block: the input handle is always closed
+            tests_data = json.load(test_file)
+    except ValueError:
+        sys.stderr.write("Failed to load %s\n"%filename)
+        return
+    name = os.path.splitext(os.path.split(filename)[1])[0]
+    # with-block: the output is closed even if make_test raises part-way through
+    with open(os.path.join(out_path, "tokenizer_%s.dat"%name), "w") as output_file:
+        tests = tests_data['tests'] if 'tests' in tests_data else []
+        for test_data in tests:
+            for initial_state in test_data.setdefault("initialStates", ["Data state"]):
+                if initial_state != "Data state":
+                    #don't support this yet
+                    continue
+                output_file.write(make_test(test_data))
+
+def make_test(test_data):
+    """Return one test record (#data / #errors / #document sections) as a string."""
+    if 'doubleEscaped' in test_data:
+        test_data = test_tokenizer.unescape_test(test_data)
+    markup = test_data["input"]
+    encoded_markup = markup.encode("utf8")
+
+    # Parse the input, then post-process the serialised tree dump.
+    document = p.parse(markup)
+    dump = test_parser.convertTreeDump(p.tree.testSerializer(document))
+    dump = test_parser.attrlist.sub(test_parser.sortattrs, dump)
+    dump = unnamespaceExpected(r"\1<\2>", dump)
+
+    # The trailing empty string makes the joined text end with a newline.
+    sections = ["#data", encoded_markup, "#errors", "#document", dump.encode("utf8"), ""]
+    return "\n".join(sections)
+
+if __name__ == "__main__":
+    main(sys.argv[1])  # out_path is the first command-line argument (IndexError if omitted)