Skip to content

Commit 77d539d

Browse files
committed
liberal xml parsing
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40441
1 parent 767916f commit 77d539d

7 files changed

Lines changed: 370 additions & 31 deletions

File tree

src/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,13 @@
124124
))
125125

126126
asciiLowercase = frozenset(string.ascii_lowercase)
127-
asciiUppercase = frozenset(string.ascii_uppercase)
128127
asciiLetters = frozenset(string.ascii_letters)
129128
digits = frozenset(string.digits)
130129
hexDigits = frozenset(string.hexdigits)
131130

131+
# Translation table mapping the code point of each ASCII uppercase letter to
# the code point of its lowercase counterpart; intended for use with the
# translate() method of unicode strings.
asciiUpper2Lower = dict(zip(map(ord, string.ascii_uppercase),
                            map(ord, string.ascii_lowercase)))
133+
132134
# Heading elements need to be ordered
133135
headingElements = (
134136
"h1",

src/html5parser.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from treebuilders import simpletree
2828

2929
import utils
30-
from constants import contentModelFlags, spaceCharacters
30+
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
3131
from constants import scopingElements, formattingElements, specialElements
3232
from constants import headingElements, tableInsertModeElements
3333

@@ -96,6 +96,7 @@ def parse(self, stream, innerHTML=False):
9696
# XXX This is temporary for the moment so there isn't any other
9797
# changes needed for the parser to work with the iterable tokenizer
9898
for token in self.tokenizer:
99+
token = self.normalizeToken(token)
99100
type = token["type"]
100101
method = getattr(self.phase, "process%s" % type, None)
101102
if type in ("Characters", "SpaceCharacters", "Comment"):
@@ -124,6 +125,31 @@ def atheistParseError(self):
124125
"""This error is not an error"""
125126
pass
126127

128+
def normalizeToken(self, token):
    """HTML5 specific normalizations to the token stream.

    The token dict is modified in place and also returned:

    * an "EmptyTag" token is turned into a plain "StartTag";
    * a "StartTag" gets its name and attribute names ASCII-lowercased and
      its attribute list converted to a dict;
    * an "EndTag" gets its name ASCII-lowercased.

    Tokens of any other type are returned untouched.
    """

    if token["type"] == "EmptyTag":
        token["type"] = "StartTag"

    if token["type"] == "StartTag":
        token["name"] = token["name"].translate(asciiUpper2Lower)

        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
        # (the list is reversed so that the first occurrence wins).

        # AT When Python 2.4 is widespread we should use
        # dict(reversed(token.data))
        if token["data"]:
            token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
                                  for attr, value in token["data"][::-1]])
        else:
            token["data"] = {}

    elif token["type"] == "EndTag":
        # Use the same ASCII-only folding as for start tags.  A plain
        # .lower() also folds non-ASCII letters, which would make the end
        # tag name differ from the name of the start tag it should close.
        token["name"] = token["name"].translate(asciiUpper2Lower)

    return token
152+
127153
#XXX - almost everything after this point should be moved into a
128154
#separate treebuilder object
129155

src/liberalxmlparser.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""
2+
Warning: this module is experimental and subject to change and even removal
3+
at any time.
4+
5+
For background/rationale, see:
6+
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
7+
* http://tinyurl.com/ylfj8k (and follow-ups)
8+
9+
References:
10+
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
11+
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
12+
13+
@@TODO:
14+
* Build a Treebuilder that produces Python DOM objects:
15+
http://docs.python.org/lib/module-xml.dom.html
16+
* Produce SAX events based on the produced DOM. This is intended not to
17+
support streaming, but rather to support application level compatibility.
18+
* Optional namespace support
19+
* Special case the output of XHTML <script> elements so that the empty
20+
element syntax is never used, even when the src attribute is provided.
21+
Also investigate the use of <![CDATA[]]> to ensure dual HTML/XHTML
22+
compatibility.
23+
* Map illegal XML characters to U+FFFD, possibly with additional markup in
24+
the case of XHTML
25+
* Selectively lowercase only XHTML, but not foreign markup
26+
"""
27+
28+
import html5parser
29+
import gettext
30+
_ = gettext.gettext
31+
32+
class XHTMLParser(html5parser.HTMLParser):
    """Liberal XHTML parser.

    An HTMLParser whose "rootElement" phase is replaced by XhmlRootPhase
    and whose token normalization leaves tag and attribute names as they
    were tokenized.
    """

    def __init__(self, *args, **kwargs):
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
        """Normalize a start or empty tag token in place and return it.

        The attribute list becomes a dict.  An "EmptyTag" is first handed to
        the current phase as a start tag and is then returned as an
        attribute-less "EndTag" so that the caller processes the matching
        end tag.  Other token types are returned unchanged.
        """
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # We need to remove the duplicate attributes and convert
            # attributes to a dict so that [["x", "y"], ["x", "z"]] becomes
            # {"x": "y"}

            # AT When Python 2.4 is widespread we should use
            # dict(reversed(token.data))
            token["data"] = dict(token["data"][::-1])

            # For EmptyTags, process both a Start and an End tag
            if tokenType == "EmptyTag":
                self.phase.processStartTag(token["name"], token["data"])
                token["data"] = {}
                token["type"] = "EndTag"

        return token
55+
56+
class XhmlRootPhase(html5parser.RootElementPhase):
    """Root element phase that creates the html element with an xmlns
    attribute set to the XHTML namespace URI."""

    def insertHtmlElement(self):
        """Create the html element, push it onto the open-elements stack,
        append it to the document and switch the parser to the
        "beforeHead" phase."""
        tree = self.tree
        htmlElement = tree.createElement(
            "html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
        tree.openElements.append(htmlElement)
        tree.document.appendChild(htmlElement)
        self.parser.phase = self.parser.phases["beforeHead"]
62+
63+
class XMLParser(XHTMLParser):
    """Liberal XML parser: an XHTMLParser whose "initial" phase is an
    XmlRootPhase."""

    def __init__(self, *args, **kwargs):
        XHTMLParser.__init__(self, *args, **kwargs)
        initialPhase = XmlRootPhase(self, self.tree)
        self.phases["initial"] = initialPhase
69+
70+
class XmlRootPhase(html5parser.Phase):
    """Prime the XML parser.

    Any attribute lookup that falls through to __getattr__ pushes the
    document onto the open-elements stack, installs a fresh XmlElementPhase
    as the parser's current phase and returns that phase's attribute of the
    same name.
    """

    def __getattr__(self, name):
        tree = self.tree
        tree.openElements.append(tree.document)
        self.parser.phase = XmlElementPhase(self.parser, tree)
        elementPhase = self.parser.phase
        return getattr(elementPhase, name)
76+
77+
class XmlElementPhase(html5parser.Phase):
78+
""" Generic handling for all XML elements """
79+
80+
def __init__(self, *args, **kwargs):
81+
html5parser.Phase.__init__(self, *args, **kwargs)
82+
self.startTagHandler = html5parser.utils.MethodDispatcher([])
83+
self.startTagHandler.default = self.startTagOther
84+
self.endTagHandler = html5parser.utils.MethodDispatcher([])
85+
self.endTagHandler.default = self.endTagOther
86+
87+
def startTagOther(self, name, attributes):
88+
element = self.tree.createElement(name, attributes)
89+
self.tree.openElements[-1].appendChild(element)
90+
self.tree.openElements.append(element)
91+
92+
def endTagOther(self, name):
93+
for node in self.tree.openElements[::-1]:
94+
if node.name == name:
95+
self.tree.generateImpliedEndTags()
96+
if self.tree.openElements[-1].name != name:
97+
self.parser.parseError(_("Unexpected end tag " + name +\
98+
"."))
99+
while self.tree.openElements.pop() != node:
100+
pass
101+
break
102+
else:
103+
self.parser.parseError()
104+
105+
def processCharacters(self, data):
106+
self.tree.insertText(data)

src/tokenizer.py

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from constants import contentModelFlags, spaceCharacters
1111
from constants import entitiesWindows1252, entities, voidElements
12-
from constants import asciiLowercase, asciiUppercase, asciiLetters
12+
from constants import asciiLowercase, asciiLetters
1313
from constants import digits, hexDigits, EOF
1414

1515
from inputstream import HTMLInputStream
@@ -104,6 +104,10 @@ def processSolidusInTag(self):
104104
self.tokenQueue.append({"type": "ParseError", "data":
105105
_("Solidus (/) incorrectly placed in tag.")})
106106

107+
# XML/XHTML enablement hook
108+
if self.currentToken["type"] == "StartTag" and data == u">":
109+
self.currentToken["type"] = "EmptyTag"
110+
107111
# The character we just consumed need to be put back on the stack so it
108112
# doesn't get lost...
109113
self.stream.queue.append(data)
@@ -259,17 +263,10 @@ def emitCurrentToken(self):
259263
# internal usage.
260264

261265
token = self.currentToken
262-
# For start tags convert attribute list into a distinct dictionary
263-
if token["type"] == "StartTag":
264-
# We need to remove the duplicate attributes and convert attributes
265-
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
266-
267-
# AT When Python 2.4 is widespread we should use
268-
# dict(reversed(token.data))
269-
token["data"] = dict(token["data"][::-1])
266+
270267
# If an end tag has attributes it's a parse error and they should
271268
# be removed
272-
elif token["type"] == "EndTag" and token["data"]:
269+
if token["type"] == "EndTag" and token["data"]:
273270
self.tokenQueue.append({"type": "ParseError", "data":
274271
_("End tag contains unexpected attributes.")})
275272
token["data"] = {}
@@ -349,7 +346,7 @@ def tagOpenState(self):
349346
self.state = self.states["closeTagOpen"]
350347
elif data in asciiLetters:
351348
self.currentToken =\
352-
{"type": "StartTag", "name": data.lower(), "data": []}
349+
{"type": "StartTag", "name": data, "data": []}
353350
self.state = self.states["tagName"]
354351
elif data == u">":
355352
# XXX In theory it could be something besides a tag name. But
@@ -405,7 +402,7 @@ def closeTagOpenState(self):
405402
# the stack.
406403
self.stream.queue.extend(charStack)
407404

408-
if self.currentToken["name"] == "".join(charStack[:-1]).lower() \
405+
if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
409406
and charStack[-1] in (spaceCharacters |
410407
frozenset((u">", u"/", u"<", EOF))):
411408
# Because the characters are correct we can safely switch to
@@ -426,7 +423,7 @@ def closeTagOpenState(self):
426423
data = self.stream.char()
427424
if data in asciiLetters:
428425
self.currentToken =\
429-
{"type": "EndTag", "name": data.lower(), "data": []}
426+
{"type": "EndTag", "name": data, "data": []}
430427
self.state = self.states["tagName"]
431428
elif data == u">":
432429
self.tokenQueue.append({"type": "ParseError", "data":
@@ -449,12 +446,9 @@ def tagNameState(self):
449446
data = self.stream.char()
450447
if data in spaceCharacters:
451448
self.state = self.states["beforeAttributeName"]
452-
elif data in asciiLowercase:
449+
elif data in asciiLetters:
453450
self.currentToken["name"] += data +\
454-
self.stream.charsUntil(asciiLowercase, True)
455-
elif data in asciiUppercase:
456-
self.currentToken["name"] += data.lower() +\
457-
self.stream.charsUntil(asciiLetters, True).lower()
451+
self.stream.charsUntil(asciiLetters, True)
458452
elif data == u">":
459453
self.emitCurrentToken()
460454
elif data == u"<" or data == EOF:
@@ -470,8 +464,8 @@ def beforeAttributeNameState(self):
470464
data = self.stream.char()
471465
if data in spaceCharacters:
472466
self.stream.charsUntil(spaceCharacters, True)
473-
elif data in asciiUppercase:
474-
self.currentToken["data"].append([data.lower(), ""])
467+
elif data in asciiLetters:
468+
self.currentToken["data"].append([data, ""])
475469
self.state = self.states["attributeName"]
476470
elif data == u">":
477471
self.emitCurrentToken()
@@ -489,13 +483,9 @@ def attributeNameState(self):
489483
leavingThisState = True
490484
if data == u"=":
491485
self.state = self.states["beforeAttributeValue"]
492-
elif data in asciiLowercase:
486+
elif data in asciiLetters:
493487
self.currentToken["data"][-1][0] += data +\
494-
self.stream.charsUntil(asciiLowercase, True)
495-
leavingThisState = False
496-
elif data in asciiUppercase:
497-
self.currentToken["data"][-1][0] += data.lower() +\
498-
self.stream.charsUntil(asciiLetters, True).lower()
488+
self.stream.charsUntil(asciiLetters, True)
499489
leavingThisState = False
500490
elif data == u">":
501491
# XXX If we emit here the attributes are converted to a dict
@@ -535,8 +525,8 @@ def afterAttributeNameState(self):
535525
self.state = self.states["beforeAttributeValue"]
536526
elif data == u">":
537527
self.emitCurrentToken()
538-
elif data in asciiUppercase:
539-
self.currentToken["data"].append([data.lower(), ""])
528+
elif data in asciiLetters:
529+
self.currentToken["data"].append([data, ""])
540530
self.state = self.states["attributeName"]
541531
elif data == u"/":
542532
self.processSolidusInTag()

src/treebuilders/simpletree.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import _base
2+
from xml.sax.saxutils import escape
23

34
# Really crappy basic implementation of a DOM-core like thing
45
class Node(_base.Node):
@@ -76,6 +77,12 @@ def printTree(self):
7677
tree += child.printTree(2)
7778
return tree
7879

80+
def toxml(self, encoding="utf-8"):
    """Serialize the document as XML.

    Concatenates the toxml() output of every child node, in order, and
    returns the result encoded with *encoding* (UTF-8 by default).
    """
    # Collect the pieces and join once instead of repeated concatenation.
    pieces = [child.toxml() for child in self.childNodes]
    return ''.join(pieces).encode(encoding)
85+
7986
class DocumentType(Node):
8087
def __init__(self, name):
8188
Node.__init__(self, name)
@@ -91,6 +98,9 @@ def __init__(self, value):
9198
def __unicode__(self):
9299
return "\"%s\"" % self.value
93100

101+
def toxml(self):
    """Return this node's text value escaped with xml.sax.saxutils.escape
    (which replaces &, < and >)."""
    escapedText = escape(self.value)
    return escapedText
103+
94104
class Element(Node):
95105
def __init__(self, name):
96106
Node.__init__(self, name)
@@ -109,6 +119,20 @@ def printTree(self, indent):
109119
tree += child.printTree(indent)
110120
return tree
111121

122+
def toxml(self):
    """Serialize this element and its children as an XML string.

    Attribute values are escaped (including double quotes) and written in
    double quotes.  An element with child nodes is written as a start tag,
    the toxml() output of each child and an end tag; an element without
    child nodes is written using the empty-element syntax (<name/>).
    """
    # Collect the pieces and join once instead of repeated concatenation.
    parts = ['<' + self.name]
    if self.attributes:
        # items() iterates the same pairs as iteritems() and is available
        # on every Python version.
        for name, value in self.attributes.items():
            parts.append(' %s="%s"' % (name, escape(value, {'"': '&quot;'})))
    if self.childNodes:
        parts.append('>')
        for child in self.childNodes:
            parts.append(child.toxml())
        parts.append('</%s>' % self.name)
    else:
        parts.append('/>')
    return ''.join(parts)
135+
112136
class CommentNode(Node):
113137
def __init__(self, data):
114138
Node.__init__(self, None)
@@ -117,6 +141,8 @@ def __init__(self, data):
117141
def __unicode__(self):
118142
return "<!-- %s -->" % self.data
119143

144+
toxml = __unicode__
145+
120146
class TreeBuilder(_base.TreeBuilder):
121147
documentClass = Document
122148
doctypeClass = DocumentType

0 commit comments

Comments
 (0)