Attempt at merging svgmathml branch to the default branch

jgraham · jgraham · commit f3c54fd53c56 · 2009-05-30T23:07:14.000+02:00
--HG--
branch : svgmathml
rename : python/parse.py => python3/parse.py
rename : python/src/html5lib/__init__.py => python3/src/html5lib/__init__.py
rename : python/src/html5lib/constants.py => python3/src/html5lib/constants.py
rename : python/src/html5lib/filters/optionaltags.py => python3/src/html5lib/filters/optionaltags.py
rename : python/src/html5lib/html5parser.py => python3/src/html5lib/html5parser.py
rename : python/src/html5lib/inputstream.py => python3/src/html5lib/inputstream.py
rename : python/src/html5lib/sanitizer.py => python3/src/html5lib/sanitizer.py
rename : python/src/html5lib/serializer/__init__.py => python3/src/html5lib/serializer/__init__.py
rename : python/src/html5lib/tokenizer.py => python3/src/html5lib/tokenizer.py
rename : python/src/html5lib/treebuilders/etree_lxml.py => python3/src/html5lib/treebuilders/etree_lxml.py
rename : python/src/html5lib/treebuilders/simpletree.py => python3/src/html5lib/treebuilders/simpletree.py
rename : python/tests/test_encoding.py => python3/tests/test_encoding.py
rename : python/tests/test_parser.py => python3/tests/test_parser.py
rename : python/tests/test_tokenizer.py => python3/tests/test_tokenizer.py
diff --git a/parse.py b/parse.py
@@ -52,7 +52,7 @@ def parse():
     else:
         tokenizer = HTMLTokenizer
 
-    if opts.xml:
+    if opts.liberalxml:
         p = liberalxmlparser.XHTMLParser(tree=treebuilder, tokenizer=tokenizer)
     else:
         p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)
@@ -196,6 +196,9 @@ def getOptParser():
     parser.add_option("", "--sanitize", action="store_true", default=False,
                       dest="sanitize", help="sanitize")
 
+    parser.add_option("", "--liberal-xml-parser", action="store_true", default=False,
+                      dest="liberalxml", help="parse with liberal xml parser")
+
     return parser
 
 if __name__ == "__main__":
diff --git a/src/html5lib/__init__.py b/src/html5lib/__init__.py
@@ -13,5 +13,6 @@
 """
 from html5parser import HTMLParser, parse
 from treebuilders import getTreeBuilder
+from serializer import serialize
 
 from liberalxmlparser import XMLParser, XHTMLParser
diff --git a/src/html5lib/constants.py b/src/html5lib/constants.py
@@ -1091,7 +1091,6 @@
     'utf16': 'utf-16',
     'utf16be': 'utf-16-be',
     'utf16le': 'utf-16-le',
-    'utf7': 'utf-7',
     'utf8': 'utf-8',
     'windows1250': 'cp1250',
     'windows1251': 'cp1251',
diff --git a/src/html5lib/filters/optionaltags.py b/src/html5lib/filters/optionaltags.py
@@ -31,7 +31,11 @@ def is_optional_start(self, tagname, previous, next):
         elif tagname == 'head':
             # A head element's start tag may be omitted if the first thing
             # inside the head element is an element.
-            return type == "StartTag"
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
         elif tagname == 'body':
             # A body element's start tag may be omitted if the first thing
             # inside the body element is not a space character or a comment,
@@ -52,7 +56,7 @@ def is_optional_start(self, tagname, previous, next):
             # inside the colgroup element is a col element, and if the element
             # is not immediately preceeded by another colgroup element whose
             # end tag has been omitted.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                 # XXX: we do not look at the preceding event, so instead we never
                 # omit the colgroup element's end tag when it is immediately
                 # followed by another colgroup element. See is_optional_end.
@@ -114,7 +118,7 @@ def is_optional_end(self, tagname, next):
             # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
             # nav, ol, p, pre, section, table, or ul, element, or if
             # there is no more content in the parent element.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                 return next["name"] in ('address', 'article', 'aside',     \
                     'blockquote', 'datagrid', 'dialog', 'dir', 'div',      \
                     'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 'h3',  \
diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py
@@ -18,10 +18,11 @@
 from constants import scopingElements, formattingElements, specialElements
 from constants import headingElements, tableInsertModeElements
 from constants import cdataElements, rcdataElements, voidElements
+from constants import tokenTypes, ReparseException
 from constants import tokenTypes, namespaces
 
-def parse(doc, treebuilderName="simpletree", encoding=None):
-    tb = treebuilders.getTreeBuilder(treebuilderName)
+def parse(doc, treebuilder="simpletree", encoding=None):
+    tb = treebuilders.getTreeBuilder(treebuilder)
     p = HTMLParser(tb)
     return p.parse(doc, encoding=encoding)
 
@@ -79,19 +80,30 @@ def __init__(self, tree = simpletree.TreeBuilder,
 
     def _parse(self, stream, innerHTML=False, container="div",
                encoding=None, parseMeta=True, useChardet=True, **kwargs):
-        
+
+        self.innerHTMLMode = innerHTML
+        self.container = container
+        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
+                                              parseMeta=parseMeta,
+                                              useChardet=useChardet, **kwargs)
+        self.reset()
+
+        while True:
+            try:
+                self.mainLoop()
+                break
+            except ReparseException, e:
+                self.reset()
+
+    def reset(self):
         self.tree.reset()
         self.firstStartTag = False
         self.errors = []
         # "quirks" / "limited quirks" / "no quirks"
         self.compatMode = "no quirks"
 
-        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
-                                              parseMeta=parseMeta,
-                                              useChardet=useChardet, **kwargs)
-
-        if innerHTML:
-            self.innerHTML = container.lower()
+        if self.innerHTMLMode:
+            self.innerHTML = self.container.lower()
 
             if self.innerHTML in cdataElements:
                 self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
@@ -114,6 +126,19 @@ def _parse(self, stream, innerHTML=False, container="div",
         self.secondaryPhase = None
 
         self.beforeRCDataPhase = None
+        
+    def mainLoop(self):
+        (CharactersToken, 
+         SpaceCharactersToken, 
+         StartTagToken,
+         EndTagToken, 
+         CommentToken,
+         DoctypeToken) = (tokenTypes["Characters"],
+                          tokenTypes["SpaceCharacters"],
+                          tokenTypes["StartTag"],
+                          tokenTypes["EndTag"],
+                          tokenTypes["Comment"],
+                          tokenTypes["Doctype"])
 
         CharactersToken = tokenTypes["Characters"]
         SpaceCharactersToken = tokenTypes["SpaceCharacters"]
@@ -124,6 +149,8 @@ def _parse(self, stream, innerHTML=False, container="div",
         
         
         for token in self.normalizedTokens():
+            #print self.phase.__class__.__name__
+            #print token
             type = token["type"]
             if type == CharactersToken:
                 self.phase.processCharacters(token)
@@ -378,18 +405,6 @@ def __init__(self, parser, tree):
 
     def processEOF(self):
         raise NotImplementedError
-        self.tree.generateImpliedEndTags()
-        if len(self.tree.openElements) > 2:
-            self.parser.parseError("expected-closing-tag-but-got-eof")
-        elif len(self.tree.openElements) == 2 and\
-          self.tree.openElements[1].name != "body":
-            # This happens for framesets or something?
-            self.parser.parseError("expected-closing-tag-but-got-eof")
-        elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
-            # XXX This is not what the specification says. Not sure what to do
-            # here.
-            self.parser.parseError("eof-in-innerhtml")
-        # Betting ends.
 
     def processComment(self, token):
         # For most phases the following is correct. Where it's not it will be
@@ -702,10 +717,10 @@ def startTagMeta(self, token):
         attributes = token["data"]
         if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
             if "charset" in attributes:
-                codec = inputstream.codecName(attributes["charset"])
-                self.parser.tokenizer.stream.changeEncoding(codec)
+                self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
             elif "content" in attributes:
-                data = inputstream.EncodingBytes(attributes["content"])
+                data = inputstream.EncodingBytes(
+                    attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
                 parser = inputstream.ContentAttrParser(data)
                 codec = parser.parse()
                 self.parser.tokenizer.stream.changeEncoding(codec)
diff --git a/src/html5lib/inputstream.py b/src/html5lib/inputstream.py
@@ -1,6 +1,7 @@
 import codecs
 import re
 import types
+import sys
 
 from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
 from constants import encodings, ReparseException
@@ -10,7 +11,7 @@
 asciiLettersBytes = [str(item) for item in asciiLetters]
 asciiUppercaseBytes = [str(item) for item in asciiUppercase]
 
-invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDDF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -196,7 +197,8 @@ def openStream(self, source):
             import cStringIO
             stream = cStringIO.StringIO(str(source))
 
-        if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
+        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
+            stream is sys.stdin):
             stream = BufferedStream(stream)
 
         return stream
@@ -494,8 +496,10 @@ class EncodingBytes(str):
     """String-like object with an assosiated position and various extra methods
     If the position is ever greater than the string length then an exception is
     raised"""
+    def __new__(self, value):
+        return str.__new__(self, value)
+
     def __init__(self, value):
-        str.__init__(self, value)
         self._position=-1
     
     def __iter__(self):
diff --git a/src/html5lib/sanitizer.py b/src/html5lib/sanitizer.py
@@ -152,7 +152,7 @@ def sanitize_token(self, token):
                             continue
                         val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                                unescape(attrs[attr])).lower()
-                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) or
+                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
                             (val_unescaped.split(':')[0] not in 
                              self.allowed_protocols)):
                             del attrs[attr]
diff --git a/src/html5lib/serializer/__init__.py b/src/html5lib/serializer/__init__.py
@@ -1,3 +1,17 @@
 
+from html5lib import treewalkers
+
 from htmlserializer import HTMLSerializer
 from xhtmlserializer import XHTMLSerializer
+
+def serialize(input, tree="simpletree", format="html", encoding=None,
+              **serializer_opts):
+    # XXX: Should we cache this?
+    walker = treewalkers.getTreeWalker(tree) 
+    if format == "html":
+        s = HTMLSerializer(**serializer_opts)
+    elif format == "xhtml":
+        s = XHTMLSerializer(**serializer_opts)
+    else:
+        raise ValueError, "type must be either html or xhtml"
+    return s.render(walker(input), encoding)
diff --git a/src/html5lib/tokenizer.py b/src/html5lib/tokenizer.py
@@ -151,7 +151,7 @@ def consumeNumberEntity(self, isHex):
         # Certain characters get replaced with U+FFFD
         if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
          or (0x007F <= charAsInt <= 0x009F)
-         or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDDF)
+         or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
          or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
          or (0x10FFFF < charAsInt)):
             char = u"\uFFFD"
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -136,7 +136,8 @@ def buildTestSuite():
                 def testFunc(self, innerHTML=innerHTML, input=input,
                     expected=expected, errors=errors, treeCls=treeCls): 
                     return self.runParserTest(innerHTML, input, expected, errors, treeCls)
-                setattr(TestCase, "test_%s_%d_%s" % (testName,index+1,treeName),
+                testFunc.__name__ = "test_%s_%d_%s" % (testName,index+1,treeName)
+                setattr(TestCase, testFunc.__name__,
                      testFunc)
 
     return unittest.TestLoader().loadTestsFromTestCase(TestCase)