@@ -20,6 +20,16 @@ def __init__(self, name, value):
2020 self .childNodes = []
2121 self .attributes = {}
2222 self ._flags = []
23+ def __str__ (self ):
24+ """Convert the current subtree to a pretty-printed representation"""
25+ rv = self ._printNode ()
26+ for node in self .childNodes :
27+ rv += " " + str (node )
28+ return rv
29+
30+ def _printNode (self ):
31+ raise NotImplementedError
32+
2333 def appendChild (self , node ):
2434 if (isinstance (node , TextNode ) and self .childNodes and
2535 isinstance (self .childNodes [- 1 ], TextNode )):
@@ -36,22 +46,37 @@ class Document(Node):
def __init__(self):
    """Create a document node; it is given neither a name nor a value."""
    Node.__init__(self, name=None, value=None)
3848
49+ def _printNode (self ):
50+ return "document\n "
51+
class DocumentType(Node):
    """Doctype node; the doctype's name is passed to Node as the node name."""

    def __init__(self, name):
        """Create a doctype node called *name*; it carries no value."""
        Node.__init__(self, name, None)

    def _printNode(self):
        """Return ``<!DOCTYPE``, the name and ``>`` joined by single spaces."""
        # Must read self.name: a bare ``name`` is not defined in this scope
        # and raised NameError whenever a doctype node was printed.
        return " ".join(["<!DOCTYPE", self.name, ">\n "])
58+
class TextNode(Node):
    """Text node; the character data is passed to Node as the node value."""

    def __init__(self, value):
        """Create a text node holding *value*; it has no name."""
        Node.__init__(self, None, value)

    def _printNode(self):
        """Return the text wrapped in single quotes."""
        return "'" + self.value + "'\n "
65+
class Element(Node):
    """Element node identified by its tag name."""

    def __init__(self, name):
        """Create an element called *name*; it carries no value."""
        Node.__init__(self, name, None)

    def _printNode(self):
        """Return ``<name>`` followed by the string form of the attributes."""
        tag = "<" + self.name + ">"
        return tag + " " + str(self.attributes) + " " + "\n "
72+
class CommentNode(Node):
    """Comment node; the comment text is kept in ``self.data``."""

    def __init__(self, data):
        """Create a comment node holding *data*; name and value are None."""
        # Node.__init__ accepts (name, value) only; the extra third argument
        # passed previously made constructing any comment raise TypeError.
        Node.__init__(self, None, None)
        self.data = data

    def _printNode(self):
        """Return the comment text wrapped in ``<!--`` and ``-->``."""
        # The text is stored in self.data; the value handed to Node is None,
        # so concatenating self.value here could not work.
        return "<!--" + self.data + "-->\n "
5580
5681class HTMLParser (object ):
5782 """Main parser class"""
@@ -114,15 +139,9 @@ def atheistParseError(self):
114139 """This error is not an error"""
115140 pass
116141
117- def switchInsertionMode (self , name ):
118- """Switch between different insertion modes in the main phase"""
119- # XXX AT Arguably this should be on the main phase object itself
120- self .phase .insertionMode = self .phase .insertionModes [name ](self )
121-
122142 def switchPhase (self , name ):
123143 """Switch between different phases of the parsing
124144 """
125- print name , self .phases ["trailingEnd" ]
126145 # Need to hang on to state between trailing end phase and main phase
127146 if (name == "trailingEnd" and
128147 isinstance (self .phase , self .phases ["main" ])):
@@ -134,6 +153,14 @@ def switchPhase(self, name):
134153 else :
135154 self .phase = self .phases [name ](self )
136155
#XXX - almost everything after this point should be moved into a
#separate treebuilder object
158+
def switchInsertionMode(self, name):
    """Install the insertion mode registered under *name* on the current phase.

    The mode class is looked up in the phase's ``insertionModes`` mapping
    and instantiated with this parser as its only argument.
    """
    # XXX AT Arguably this should be on the main phase object itself
    new_mode = self.phase.insertionModes[name](self)
    self.phase.insertionMode = new_mode
163+
137164 def elementInScope (self , target , tableVariant = False ):
138165 for node in self .openElements [::- 1 ]:
139166 if node == target :
@@ -148,6 +175,9 @@ def elementInScope(self, target, tableVariant=False):
148175
149176 def reconstructActiveFormattingElements (self ):
150177 afe = self .activeFormattingElements
178+ #If there are no active formatting elements exit early
179+ if not afe :
180+ return
151181 entry = afe [- 1 ]
152182 if entry == Marker or entry in self .openElements :
153183 return
@@ -192,7 +222,6 @@ def insertElement(self, name, attributes, parent=None):
192222 if self .openElements :
193223 self .openElements [- 1 ].appendChild (element )
194224 self .openElements .append (element )
195- print name , self .openElements
196225 else :
# XXX Haven't implemented this yet as spec is vaguely unclear
198227 raise NotImplementedError
@@ -358,7 +387,7 @@ def processStartTag(self, name, attributes):
358387 self .parser .parseError ()
359388 for attr , value in attributes .iteritems ():
360389 if attr not in self .parser .openElements [0 ].attributes :
361- selfparser .openElements [0 ].attributes [attr ] = value
390+ self . parser .openElements [0 ].attributes [attr ] = value
362391 else :
363392 self .insertionMode .processStartTag (name , attributes )
364393
@@ -596,7 +625,7 @@ def processStartTag(self, name, attributes):
596625 # XXX Should this handle unknown elements as well?
597626 handlers = utils .MethodDispatcher ([
598627 ("script" ,self .startTagScript ),
599- (("base" , "link" , "meta" , "style" , "title" ), startTagFromHead ),
628+ (("base" , "link" , "meta" , "style" , "title" ), self . startTagFromHead ),
600629 ("body" , self .startTagBody ),
601630 (("address" , "blockquote" , "center" , "dir" , "div" , "dl" ,
602631 "fieldset" , "listing" , "menu" , "ol" , "p" , "pre" , "ul" ),
@@ -608,7 +637,25 @@ def processStartTag(self, name, attributes):
608637 ("a" ,self .startTagA ),
609638 (("b" , "big" , "em" , "font" , "i" , "nobr" , "s" , "small" ,
610639 "strike" , "strong" , "tt" , "u" ),self .startTagFormatting ),
640+ ("button" , self .startTagButton ),
641+ (("marquee" , "object" ), self .startTagMarqueeObject ),
642+ ("xmp" , self .startTagXMP ),
643+ ("table" , self .startTagTable ),
644+ (("area" , "basefont" , "bgsound" , "br" , "embed" , "img" ,
645+ "param" , "spacer" , "wbr" ), self .startTagVoidFormatting ),
646+ ("hr" , self .startTagHR ),
647+ ("image" , self .startTagImage ),
648+ ("isindex" , self .startTagIsIndex ),
649+ ("textarea" , self .startTagTextarea ),
650+ (("iframe" , "noembed" , "noframes" , "noscript" ), self .startTagCDATA ),
651+ ("select" , self .startTagSelect ),
652+ (("caption" , "col" , "colgroup" , "frame" , "frameset" , "head" ,
653+ "option" , "optgroup" , "tbody" , "td" , "tfoot" , "th" , "thead" ,
654+ "tr" ), self .startTagMisplaced ),
655+ (("event-source" , "section" , "nav" , "article" , "aside" , "header" ,
656+ "footer" , "datagrid" , "command" ), self .startTagNew )
611657 ])
658+ handlers .setDefaultValue (self .startTagOther )
612659 handlers [name ](name , attributes )
613660
614661 def processEndTag (self , name ):
@@ -621,8 +668,20 @@ def processEndTag(self, name):
621668 "listing" , "menu" , "ol" , "pre" , "ul" ), self .endTagBlock ),
622669 ("form" , self .endTagForm ),
623670 (("dd" , "dt" , "li" ), self .endTagListItem ),
624- (headingElements , self .endTagHeading )
625- ])
671+ (headingElements , self .endTagHeading ),
672+ (("a" , "b" , "big" , "em" , "font" , "i" , "nobr" , "s" , "small" ,
673+ "strike" , "strong" , "tt" , "u" ), self .endTagFormatting ),
674+ (("marquee" , "object" , "button" ), self .endTagButtonMarqueeObject ),
675+ (("caption" , "col" , "colgroup" , "frame" , "frameset" , "head" ,
676+ "option" , "optgroup" , "tbody" , "td" , "tfoot" , "th" , "thead" ,
677+ "tr" , "area" , "basefont" , "bgsound" , "br" , "embed" , "hr" ,
678+ "iframe" , "image" , "img" , "input" , "isindex" , "noembed" ,
679+ "noframes" , "param" , "select" , "spacer" , "table" , "textarea" ,
680+ "wbr" , "noscript" ),self .endTagMisplacedNone ),
681+ (("event-source" , "section" , "nav" , "article" , "aside" , "header" ,
682+ "footer" , "datagrid" , "command" ), self .endTagNew )
683+ ])
684+ handlers .setDefaultValue (self .endTagOther )
626685 handlers [name ](name )
627686
628687 def endTagP (self , name ):
@@ -889,6 +948,70 @@ def startTagIsIndex(self, name, attributes):
889948 self .parser .processStartTag ("hr" )
890949 self .parser .processEndTag ("form" )
891950
def startTagTextarea(self, name, attributes):
    """Handle a "textarea" start tag; not implemented yet."""
    raise NotImplementedError()
953+
def startTagCDATA(self, name, attributes):
    """Handle "iframe", "noembed", "noframes" and "noscript" start tags
    (the latter if scripting is enabled); not implemented yet.
    """
    raise NotImplementedError()
957+
def startTagSelect(self, name, attributes):
    """Insert the element and switch the parser to the "inSelect" mode.

    Active formatting elements are reconstructed before the insertion.
    """
    parser = self.parser
    parser.reconstructActiveFormattingElements()
    parser.insertElement(name, attributes)
    parser.switchInsertionMode("inSelect")
962+
def startTagMisplaced(self, name, attributes):
    """Report a parse error for a start tag that does not belong here.

    Registered for elements that should be children of other elements
    handled by a different insertion mode: "caption", "col", "colgroup",
    "frame", "frameset", "head", "option", "optgroup", "tbody", "td",
    "tfoot", "th", "thead" and "tr".  Nothing is inserted; the tag is
    ignored.
    """
    parser = self.parser
    parser.parseError()
971+
def endTagMisplacedNone(self, name):
    """Report a parse error for an end tag that is ignored here.

    Registered for elements that should be children of other elements
    handled by a different insertion mode, and for elements that have no
    end tag: "caption", "col", "colgroup", "frame", "frameset", "head",
    "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr",
    "area", "basefont", "bgsound", "br", "embed", "hr", "iframe", "image",
    "img", "input", "isindex", "noembed", "noframes", "param", "select",
    "spacer", "table", "textarea", "wbr" and "noscript".
    """
    parser = self.parser
    parser.parseError()
983+
def startTagNew(self, name, other):
    """Handle start tags of the new HTML5 elements; not implemented yet.

    Covers "event-source", "section", "nav", "article", "aside", "header",
    "footer", "datagrid" and "command".
    """
    raise NotImplementedError()
989+
def endTagNew(self, name):
    """Handle end tags of the new HTML5 elements; not implemented yet.

    Covers "event-source", "section", "nav", "article", "aside", "header",
    "footer", "datagrid" and "command".
    """
    raise NotImplementedError()
995+
def startTagOther(self, name, attributes):
    """Default start-tag handler: reconstruct the active formatting
    elements, then insert an element for the tag.
    """
    parser = self.parser
    parser.reconstructActiveFormattingElements()
    parser.insertElement(name, attributes)
999+
def endTagOther(self, name):
    """Default end-tag handler for tags without a dedicated handler.

    Walks the parser's stack of open elements from the most recently
    opened element downwards.  When an element called *name* is reached,
    implied end tags are generated, a parse error is reported if the
    current node is then not called *name*, and the stack is popped up to
    and including the matched element.  If an element that is outside the
    formatting category but inside the special or scoping category is met
    first, a parse error is reported and the end tag is ignored.
    """
    #XXX This logic should be moved into the treebuilder
    for node in self.parser.openElements[::-1]:
        if node.name == name:
            self.parser.generateImpliedEndTags()
            if self.parser.openElements[-1].name != name:
                self.parser.parseError()
            # The stack lives on the parser; this insertion mode has no
            # openElements attribute of its own.
            while self.parser.openElements.pop() != node:
                pass
            break
        # NOTE(review): the category sets are assumed to hold tag names,
        # like headingElements in the dispatch tables -- confirm against
        # their definitions.  Hence the test is on node.name, not node.
        if (node.name not in formattingElements and
                node.name in specialElements | scopingElements):
            self.parser.parseError()
            # The end tag is ignored: stop walking so that no element
            # further down the stack gets closed by it.
            break
1014+
8921015class InTable (InsertionMode ):
8931016 # http://www.whatwg.org/specs/web-apps/current-work/#in-table
8941017
0 commit comments