1111 * http://wiki.whatwg.org/wiki/HtmlVsXhtml
1212
1313@@TODO:
14- * Build a Treebuilder that produces Python DOM objects:
15- http://docs.python.org/lib/module-xml.dom.html
1614 * Produce SAX events based on the produced DOM. This is intended not to
1715 support streaming, but rather to support application level compatibility.
1816 * Optional namespace support
19- * Special case the output of XHTML <script> elements so that the empty
20- element syntax is never used, even when the src attribute is provided.
21- Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
17+ * Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
2218 indicates CDATA processing to ensure dual HTML/XHTML compatibility.
23- * Map illegal XML characters to U+FFFD, possibly with additional markup in
24- the case of XHTML
2519 * Selectively lowercase only XHTML, but not foreign markup
2620"""
2721
2822import html5parser
23+ from constants import voidElements
2924import gettext
3025_ = gettext .gettext
3126
32- class XHTMLParser (html5parser .HTMLParser ):
33- """ liberal XMTHML parser """
27+ class XMLParser (html5parser .HTMLParser ):
28+ """ liberal XML parser """
3429
3530 def __init__ (self , * args , ** kwargs ):
3631 html5parser .HTMLParser .__init__ (self , * args , ** kwargs )
37- self .phases ["rootElement " ] = XhmlRootPhase (self , self .tree )
32+ self .phases ["initial " ] = XmlRootPhase (self , self .tree )
3833
3934 def normalizeToken (self , token ):
4035 if token ["type" ] == "StartTag" or token ["type" ] == "EmptyTag" :
@@ -57,20 +52,38 @@ def normalizeToken(self, token):
5752
5853 return token
5954
class XHTMLParser(XMLParser):
    """Liberal XHTML parser.

    Reuses XMLParser's token normalization and additionally makes sure that
    empty, non-void XHTML elements receive (empty) content, so that separate
    open and close tags are emitted for them.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base HTML parser and install the XHTML root phase.

        All positional and keyword arguments are forwarded unchanged to
        html5parser.HTMLParser.__init__.
        """
        # HTMLParser.__init__ is called directly instead of
        # XMLParser.__init__, so the "initial" phase that XMLParser installs
        # is not applied to this parser.
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
        """Normalize *token* and pad empty non-void XHTML elements.

        The token is first passed through XMLParser.normalizeToken.  When the
        result is an end tag that closes the current (innermost) open
        element, that element is not a void element and it has no content
        yet, an empty text node is inserted into the tree -- unless one of
        the open elements carries an 'xmlns' attribute naming a namespace
        other than XHTML.

        Returns the normalized token.
        """
        token = XMLParser.normalizeToken(self, token)

        if token["type"] != "EndTag":
            return token

        openElements = self.tree.openElements
        # A stray end tag can arrive while nothing is open; there is no
        # current element to pad in that case (indexing [-1] would raise).
        if not openElements:
            return token

        current = openElements[-1]
        if (token["name"] in voidElements or
                token["name"] != current.name or
                current.hasContent()):
            return token

        # ensure that non-void XHTML elements have content so that separate
        # open and close tags are emitted
        for e in openElements:
            # keys() is used deliberately: the attribute container is only
            # assumed to offer keys() and item access, not necessarily
            # dict-style "in" on keys.
            if 'xmlns' in e.attributes.keys():
                if e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml':
                    # foreign markup is open: leave the element untouched
                    break
        else:
            self.tree.insertText('')

        return token
79+
class XhmlRootPhase(html5parser.RootElementPhase):
    """Root element phase that creates the <html> element in the XHTML
    namespace."""

    def insertHtmlElement(self):
        """Create the namespaced root element and move on to "beforeHead".

        The new element is pushed onto the tree's open-element stack,
        appended to the document, and the parser is switched to its
        "beforeHead" phase.
        """
        tree = self.tree
        htmlAttributes = {'xmlns': 'http://www.w3.org/1999/xhtml'}
        root = tree.createElement("html", htmlAttributes)
        tree.openElements.append(root)
        tree.document.appendChild(root)
        parser = self.parser
        parser.phase = parser.phases["beforeHead"]
6686
67- class XMLParser (XHTMLParser ):
68- """ liberal XML parser """
69-
70- def __init__ (self , * args , ** kwargs ):
71- XHTMLParser .__init__ (self , * args , ** kwargs )
72- self .phases ["initial" ] = XmlRootPhase (self , self .tree )
73-
7487class XmlRootPhase (html5parser .Phase ):
7588 """ Prime the Xml parser """
7689 def __getattr__ (self , name ):
0 commit comments