liberal xml parsing

rubys · rubys · commit 77d539dde58d · 2007-01-09T15:19:50.000Z
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40441
diff --git a/src/constants.py b/src/constants.py
@@ -124,11 +124,13 @@
 ))
 
 asciiLowercase = frozenset(string.ascii_lowercase)
-asciiUppercase = frozenset(string.ascii_uppercase)
 asciiLetters = frozenset(string.ascii_letters)
 digits = frozenset(string.digits)
 hexDigits = frozenset(string.hexdigits)
 
+asciiUpper2Lower = dict(zip(map(ord, string.ascii_uppercase),
+                            map(ord, string.ascii_lowercase)))
+
 # Heading elements need to be ordered 
 headingElements = (
     "h1",
diff --git a/src/html5parser.py b/src/html5parser.py
@@ -27,7 +27,7 @@
 from treebuilders import simpletree
 
 import utils
-from constants import contentModelFlags, spaceCharacters
+from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
 from constants import scopingElements, formattingElements, specialElements
 from constants import headingElements, tableInsertModeElements
 
@@ -96,6 +96,7 @@ def parse(self, stream, innerHTML=False):
         # XXX This is temporary for the moment so there isn't any other
         # changes needed for the parser to work with the iterable tokenizer
         for token in self.tokenizer:
+            token = self.normalizeToken(token)
             type = token["type"]
             method = getattr(self.phase, "process%s" % type, None)
             if type in ("Characters", "SpaceCharacters", "Comment"):
@@ -124,6 +125,31 @@ def atheistParseError(self):
         """This error is not an error"""
         pass
 
+    def normalizeToken(self, token):
+        """ HTML5 specific normalizations to the token stream; returns token """
+
+        if token["type"] == "EmptyTag":
+            token["type"] = "StartTag"
+
+        if token["type"] == "StartTag":
+            token["name"] = token["name"].translate(asciiUpper2Lower)
+
+            # We need to remove the duplicate attributes and convert attributes
+            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
+
+            # AT When Python 2.4 is widespread we should use
+            # dict(reversed(token.data))
+            if token["data"]:
+                token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
+                    for attr,value in token["data"][::-1]])
+            else:
+                token["data"] = {}
+
+        elif token["type"] == "EndTag":
+            token["name"] = token["name"].translate(asciiUpper2Lower)
+
+        return token
+
     #XXX - almost everthing after this point should be moved into a
     #seperate treebuilder object
 
diff --git a/src/liberalxmlparser.py b/src/liberalxmlparser.py
@@ -0,0 +1,106 @@
+""" 
+Warning: this module is experimental and subject to change and even removal
+at any time. 
+
+For background/rationale, see:
+ * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
+ * http://tinyurl.com/ylfj8k (and follow-ups)
+
+References:
+ * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
+ * http://wiki.whatwg.org/wiki/HtmlVsXhtml
+
+@@TODO:
+ * Build a Treebuilder that produces Python DOM objects:
+     http://docs.python.org/lib/module-xml.dom.html
+ * Produce SAX events based on the produced DOM.  This is intended not to
+   support streaming, but rather to support application level compatibility. 
+ * Optional namespace support
+ * Special case the output of XHTML <script> elements so that the empty
+   element syntax is never used, even when the src attribute is provided.
+   Also investigate the use of <![CDATA[]]> to ensure dual HTML/XHTML
+   compatibility.
+ * Map illegal XML characters to U+FFFD, possibly with additional markup in
+   the case of XHTML
+ * Selectively lowercase only XHTML, but not foreign markup
+"""
+
+import html5parser
+import gettext
+_ = gettext.gettext
+
+class XHTMLParser(html5parser.HTMLParser):
+    """ liberal XHTML parser """
+
+    def __init__(self, *args, **kwargs):
+        html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
+
+    def normalizeToken(self, token):
+        """ Dedupe attributes into a dict; expand EmptyTags (no lowercasing) """
+        kind = token["type"]
+        if kind not in ("StartTag", "EmptyTag"):
+            return token
+        # Reversing first lets the earliest duplicate win, so that
+        # [["x", "y"], ["x", "z"]] becomes {"x": "y"}
+        token["data"] = dict(token["data"][::-1])
+
+        # For EmptyTags, process both a Start and an End tag: the start tag
+        # is dispatched to the current phase here, the token is returned as
+        # the matching EndTag
+        if kind == "EmptyTag":
+            self.phase.processStartTag(token["name"], token["data"])
+            token["data"], token["type"] = {}, "EndTag"
+        return token
+
+class XhmlRootPhase(html5parser.RootElementPhase):  # "rootElement" phase installed by XHTMLParser
+    def insertHtmlElement(self):  # the root <html> element is created with an xmlns attribute
+        element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
+        self.tree.openElements.append(element)
+        self.tree.document.appendChild(element)
+        self.parser.phase = self.parser.phases["beforeHead"]  # parsing continues in "beforeHead"
+
+class XMLParser(XHTMLParser):
+    """ liberal XML parser: XHTMLParser whose "initial" phase is XmlRootPhase """
+
+    def __init__(self, *args, **kwargs):
+        XHTMLParser.__init__(self, *args, **kwargs)
+        self.phases["initial"] = XmlRootPhase(self, self.tree)
+
+class XmlRootPhase(html5parser.Phase):
+    """ Prime the Xml parser: the first missing attribute switches phases """
+    def __getattr__(self, name):  # only invoked when normal attribute lookup fails
+        self.tree.openElements.append(self.tree.document)  # document is the first open node
+        self.parser.phase = XmlElementPhase(self.parser, self.tree)
+        return getattr(self.parser.phase, name)  # delegate the lookup to the new phase
+
+class XmlElementPhase(html5parser.Phase):
+    """ Generic handling for all XML elements """
+
+    def __init__(self, *args, **kwargs):
+        html5parser.Phase.__init__(self, *args, **kwargs)
+        self.startTagHandler = html5parser.utils.MethodDispatcher([])
+        self.endTagHandler = html5parser.utils.MethodDispatcher([])
+        self.startTagHandler.default = self.startTagOther
+        self.endTagHandler.default = self.endTagOther
+
+    def startTagOther(self, name, attributes):
+        child = self.tree.createElement(name, attributes)
+        self.tree.openElements[-1].appendChild(child)
+        self.tree.openElements.append(child)
+
+    def endTagOther(self, name):
+        for node in self.tree.openElements[::-1]:
+            if node.name != name:
+                # NOTE(review): raised once per open element skipped over
+                self.parser.parseError()
+                continue
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != name:
+                self.parser.parseError(_("Unexpected end tag " + name + "."))
+            while self.tree.openElements.pop() != node:
+                pass
+            break
+
+    def processCharacters(self, data):
+        self.tree.insertText(data)
diff --git a/src/tokenizer.py b/src/tokenizer.py
@@ -9,7 +9,7 @@
 
 from constants import contentModelFlags, spaceCharacters
 from constants import entitiesWindows1252, entities, voidElements
-from constants import asciiLowercase, asciiUppercase, asciiLetters
+from constants import asciiLowercase, asciiLetters
 from constants import digits, hexDigits, EOF
 
 from inputstream import HTMLInputStream
@@ -104,6 +104,10 @@ def processSolidusInTag(self):
             self.tokenQueue.append({"type": "ParseError", "data":
               _("Solidus (/) incorrectly placed in tag.")})
 
+        # XML/XHTML enablement hook
+        if self.currentToken["type"] == "StartTag" and data == u">":
+            self.currentToken["type"] = "EmptyTag"
+
         # The character we just consumed need to be put back on the stack so it
         # doesn't get lost...
         self.stream.queue.append(data)
@@ -259,17 +263,10 @@ def emitCurrentToken(self):
         # internal usage.
 
         token = self.currentToken
-        # For start tags convert attribute list into a distinct dictionary
-        if token["type"] == "StartTag":
-            # We need to remove the duplicate attributes and convert attributes
-            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
-
-            # AT When Python 2.4 is widespread we should use
-            # dict(reversed(token.data))
-            token["data"] = dict(token["data"][::-1])
+
         # If an end tag has attributes it's a parse error and they should
         # be removed
-        elif token["type"] == "EndTag" and token["data"]:
+        if token["type"] == "EndTag" and token["data"]:
             self.tokenQueue.append({"type": "ParseError", "data":
               _("End tag contains unexpected attributes.")})
             token["data"] = {}
@@ -349,7 +346,7 @@ def tagOpenState(self):
                 self.state = self.states["closeTagOpen"]
             elif data in asciiLetters:
                 self.currentToken =\
-                  {"type": "StartTag", "name": data.lower(), "data": []}
+                  {"type": "StartTag", "name": data, "data": []}
                 self.state = self.states["tagName"]
             elif data == u">":
                 # XXX In theory it could be something besides a tag name. But
@@ -405,7 +402,7 @@ def closeTagOpenState(self):
             # the stack.
             self.stream.queue.extend(charStack)
 
-            if self.currentToken["name"] == "".join(charStack[:-1]).lower() \
+            if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
               and charStack[-1] in (spaceCharacters |
               frozenset((u">", u"/", u"<", EOF))):
                 # Because the characters are correct we can safely switch to
@@ -426,7 +423,7 @@ def closeTagOpenState(self):
             data = self.stream.char()
             if data in asciiLetters:
                 self.currentToken =\
-                  {"type": "EndTag", "name": data.lower(), "data": []}
+                  {"type": "EndTag", "name": data, "data": []}
                 self.state = self.states["tagName"]
             elif data == u">":
                 self.tokenQueue.append({"type": "ParseError", "data":
@@ -449,12 +446,9 @@ def tagNameState(self):
         data = self.stream.char()
         if data in spaceCharacters:
             self.state = self.states["beforeAttributeName"]
-        elif data in asciiLowercase:
+        elif data in asciiLetters:
             self.currentToken["name"] += data +\
-              self.stream.charsUntil(asciiLowercase, True)
-        elif data in asciiUppercase:
-            self.currentToken["name"] += data.lower() +\
-              self.stream.charsUntil(asciiLetters, True).lower()
+              self.stream.charsUntil(asciiLetters, True)
         elif data == u">":
             self.emitCurrentToken()
         elif data == u"<" or data == EOF:
@@ -470,8 +464,8 @@ def beforeAttributeNameState(self):
         data = self.stream.char()
         if data in spaceCharacters:
             self.stream.charsUntil(spaceCharacters, True)
-        elif data in asciiUppercase:
-            self.currentToken["data"].append([data.lower(), ""])
+        elif data in asciiLetters:
+            self.currentToken["data"].append([data, ""])
             self.state = self.states["attributeName"]
         elif data == u">":
             self.emitCurrentToken()
@@ -489,13 +483,9 @@ def attributeNameState(self):
         leavingThisState = True
         if data == u"=":
             self.state = self.states["beforeAttributeValue"]
-        elif data in asciiLowercase:
+        elif data in asciiLetters:
             self.currentToken["data"][-1][0] += data +\
-              self.stream.charsUntil(asciiLowercase, True)
-            leavingThisState = False
-        elif data in asciiUppercase:
-            self.currentToken["data"][-1][0] += data.lower() +\
-              self.stream.charsUntil(asciiLetters, True).lower()
+              self.stream.charsUntil(asciiLetters, True)
             leavingThisState = False
         elif data == u">":
             # XXX If we emit here the attributes are converted to a dict
@@ -535,8 +525,8 @@ def afterAttributeNameState(self):
             self.state = self.states["beforeAttributeValue"]
         elif data == u">":
             self.emitCurrentToken()
-        elif data in asciiUppercase:
-            self.currentToken["data"].append([data.lower(), ""])
+        elif data in asciiLetters:
+            self.currentToken["data"].append([data, ""])
             self.state = self.states["attributeName"]
         elif data == u"/":
             self.processSolidusInTag()
diff --git a/src/treebuilders/simpletree.py b/src/treebuilders/simpletree.py
@@ -1,4 +1,5 @@
 import _base
+from xml.sax.saxutils import escape
 
 # Really crappy basic implementation of a DOM-core like thing
 class Node(_base.Node):
@@ -76,6 +77,12 @@ def printTree(self):
             tree += child.printTree(2)
         return tree
 
+    def toxml(self, encoding="utf-8"):
+        """ Serialize all child nodes as XML and encode the result """
+        # Collect the pieces first and join once instead of repeated +=
+        pieces = [child.toxml() for child in self.childNodes]
+        return ''.join(pieces).encode(encoding)
+
 class DocumentType(Node):
     def __init__(self, name):
         Node.__init__(self, name)
@@ -91,6 +98,9 @@ def __init__(self, value):
     def __unicode__(self):
         return "\"%s\"" % self.value
 
+    def toxml(self):
+        return escape(self.value)  # saxutils.escape replaces &, < and > with entities
+
 class Element(Node):
     def __init__(self, name):
         Node.__init__(self, name)
@@ -109,6 +119,20 @@ def printTree(self, indent):
             tree += child.printTree(indent)
         return tree
 
+    def toxml(self):
+        """ Serialize this element and its subtree as XML markup """
+        parts = ['<' + self.name]
+        if self.attributes:
+            for name,value in self.attributes.iteritems():
+                parts.append(' %s="%s"' % (name, escape(value,{'"':'&quot;'})))
+        if not self.childNodes:
+            # No children: use the empty-element syntax
+            return ''.join(parts) + '/>'
+        parts.append('>')
+        parts.extend([child.toxml() for child in self.childNodes])
+        parts.append('</%s>' % self.name)
+        return ''.join(parts)
+
 class CommentNode(Node):
     def __init__(self, data):
         Node.__init__(self, None)
@@ -117,6 +141,8 @@ def __init__(self, data):
     def __unicode__(self):
         return "<!-- %s -->" % self.data
 
+    toxml = __unicode__  # XML output reuses the "<!-- %s -->" rendering above
+
 class TreeBuilder(_base.TreeBuilder):
     documentClass = Document
     doctypeClass = DocumentType
diff --git a/tests/test_lxp.py b/tests/test_lxp.py
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py