Skip to content

Commit d58a829

Browse files
committed
Simple pretty printing of the parse tree (still some bugs) and a bit more fleshing out of the inBody state so we can parse the test1.html file
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40110
1 parent 19f3bb6 commit d58a829

2 files changed

Lines changed: 136 additions & 12 deletions

File tree

parser.py

Lines changed: 134 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,16 @@ def __init__(self, name, value):
2020
self.childNodes = []
2121
self.attributes = {}
2222
self._flags = []
23+
def __str__(self):
    """Render this node and its whole subtree as pretty-printed text.

    The node's own line comes from ``_printNode``; the string form of
    every child is then appended, each preceded by a single space.
    """
    own_line = self._printNode()
    return own_line + "".join(" " + str(child) for child in self.childNodes)
29+
30+
def _printNode(self):
    """Return the one-line text form of this node (without children).

    Node types that can be printed override this; the version here
    always raises NotImplementedError.
    """
    raise NotImplementedError
32+
2333
def appendChild(self, node):
2434
if (isinstance(node, TextNode) and self.childNodes and
2535
isinstance(self.childNodes[-1], TextNode)):
@@ -36,22 +46,37 @@ class Document(Node):
3646
def __init__(self):
    """Create the document node; it carries neither a name nor a value."""
    Node.__init__(self, name=None, value=None)
3848

49+
def _printNode(self):
    """Return the fixed line that stands for the document node."""
    line = "document\n"
    return line
51+
3952
class DocumentType(Node):
    """Tree node for a document type declaration."""

    def __init__(self, name):
        """Create a doctype node called *name*; it has no value."""
        Node.__init__(self, name, None)

    def _printNode(self):
        """Return the doctype as a single ``<!DOCTYPE name >`` line."""
        # The name must be read from the instance: a bare ``name`` is not
        # defined in this method's scope and raised NameError.
        return " ".join(["<!DOCTYPE", self.name, ">\n"])
58+
4359
class TextNode(Node):
    """Tree node holding a run of character data in its value."""

    def __init__(self, value):
        """Create a text node; it has no name, only *value*."""
        Node.__init__(self, None, value)

    def _printNode(self):
        """Return the text wrapped in single quotes, ending in a newline."""
        return "'" + self.value + "'\n"
65+
4766
class Element(Node):
    """Tree node for an element, identified by its tag name."""

    def __init__(self, name):
        """Create an element called *name*; it has no value."""
        Node.__init__(self, name, None)

    def _printNode(self):
        """Return ``<name>`` followed by the attribute dict, on one line."""
        tag = "<" + self.name + ">"
        attrs = str(self.attributes)
        return tag + " " + attrs + " \n"
72+
5173
class CommentNode(Node):
    """Tree node for a comment; the comment text is kept in ``data``."""

    def __init__(self, data):
        """Create a comment node whose text is *data*."""
        # Node.__init__ takes (name, value) only; passing a third
        # positional argument raised TypeError.
        Node.__init__(self, None, None)
        self.data = data

    def _printNode(self):
        """Return the comment as a single ``<!--text-->`` line."""
        # The text lives in ``data``; ``value`` is None for comments, so
        # concatenating it with a string would fail.
        return "<!--" + self.data + "-->\n"
5580

5681
class HTMLParser(object):
5782
"""Main parser class"""
@@ -114,15 +139,9 @@ def atheistParseError(self):
114139
"""This error is not an error"""
115140
pass
116141

117-
def switchInsertionMode(self, name):
    """Make *name* the active insertion mode of the current phase.

    The mode is looked up in the phase's ``insertionModes`` mapping and
    instantiated with this parser as its only argument.
    """
    # XXX AT Arguably this should be on the main phase object itself
    mode_class = self.phase.insertionModes[name]
    self.phase.insertionMode = mode_class(self)
121-
122142
def switchPhase(self, name):
123143
"""Switch between different phases of the parsing
124144
"""
125-
print name, self.phases["trailingEnd"]
126145
# Need to hang on to state between trailing end phase and main phase
127146
if (name == "trailingEnd" and
128147
isinstance(self.phase, self.phases["main"])):
@@ -134,6 +153,14 @@ def switchPhase(self, name):
134153
else:
135154
self.phase = self.phases[name](self)
136155

156+
#XXX - almost everything after this point should be moved into a
157+
#separate treebuilder object
158+
159+
def switchInsertionMode(self, name):
    """Make *name* the active insertion mode of the current phase.

    The mode is looked up in the phase's ``insertionModes`` mapping and
    instantiated with this parser as its only argument.
    """
    # XXX AT Arguably this should be on the main phase object itself
    mode_class = self.phase.insertionModes[name]
    self.phase.insertionMode = mode_class(self)
163+
137164
def elementInScope(self, target, tableVariant=False):
138165
for node in self.openElements[::-1]:
139166
if node == target:
@@ -148,6 +175,9 @@ def elementInScope(self, target, tableVariant=False):
148175

149176
def reconstructActiveFormattingElements(self):
150177
afe = self.activeFormattingElements
178+
#If there are no active formatting elements exit early
179+
if not afe:
180+
return
151181
entry = afe[-1]
152182
if entry == Marker or entry in self.openElements:
153183
return
@@ -192,7 +222,6 @@ def insertElement(self, name, attributes, parent=None):
192222
if self.openElements:
193223
self.openElements[-1].appendChild(element)
194224
self.openElements.append(element)
195-
print name, self.openElements
196225
else:
197226
# XXX Haven't implemented this yet as spec is vaguely unclear
198227
raise NotImplementedError
@@ -358,7 +387,7 @@ def processStartTag(self, name, attributes):
358387
self.parser.parseError()
359388
for attr, value in attributes.iteritems():
360389
if attr not in self.parser.openElements[0].attributes:
361-
selfparser.openElements[0].attributes[attr] = value
390+
self.parser.openElements[0].attributes[attr] = value
362391
else:
363392
self.insertionMode.processStartTag(name, attributes)
364393

@@ -596,7 +625,7 @@ def processStartTag(self, name, attributes):
596625
# XXX Should this handle unknown elements as well?
597626
handlers=utils.MethodDispatcher([
598627
("script",self.startTagScript),
599-
(("base", "link", "meta", "style", "title"), startTagFromHead),
628+
(("base", "link", "meta", "style", "title"), self.startTagFromHead),
600629
("body", self.startTagBody),
601630
(("address", "blockquote", "center", "dir", "div", "dl",
602631
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
@@ -608,7 +637,25 @@ def processStartTag(self, name, attributes):
608637
("a",self.startTagA),
609638
(("b", "big", "em", "font", "i", "nobr", "s", "small",
610639
"strike", "strong", "tt", "u"),self.startTagFormatting),
640+
("button", self.startTagButton),
641+
(("marquee", "object"), self.startTagMarqueeObject),
642+
("xmp", self.startTagXMP),
643+
("table", self.startTagTable),
644+
(("area", "basefont", "bgsound", "br", "embed", "img",
645+
"param", "spacer", "wbr"), self.startTagVoidFormatting),
646+
("hr", self.startTagHR),
647+
("image", self.startTagImage),
648+
("isindex", self.startTagIsIndex),
649+
("textarea", self.startTagTextarea),
650+
(("iframe", "noembed", "noframes", "noscript"), self.startTagCDATA),
651+
("select", self.startTagSelect),
652+
(("caption", "col", "colgroup", "frame", "frameset", "head",
653+
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
654+
"tr"), self.startTagMisplaced),
655+
(("event-source", "section", "nav", "article", "aside", "header",
656+
"footer", "datagrid", "command"), self.startTagNew)
611657
])
658+
handlers.setDefaultValue(self.startTagOther)
612659
handlers[name](name, attributes)
613660

614661
def processEndTag(self, name):
@@ -621,8 +668,20 @@ def processEndTag(self, name):
621668
"listing", "menu", "ol", "pre", "ul"), self.endTagBlock),
622669
("form", self.endTagForm),
623670
(("dd", "dt", "li"), self.endTagListItem),
624-
(headingElements, self.endTagHeading)
625-
])
671+
(headingElements, self.endTagHeading),
672+
(("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
673+
"strike", "strong", "tt", "u"), self.endTagFormatting),
674+
(("marquee", "object", "button"), self.endTagButtonMarqueeObject),
675+
(("caption", "col", "colgroup", "frame", "frameset", "head",
676+
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
677+
"tr", "area", "basefont", "bgsound", "br", "embed", "hr",
678+
"iframe", "image", "img", "input", "isindex", "noembed",
679+
"noframes", "param", "select", "spacer", "table", "textarea",
680+
"wbr", "noscript"),self.endTagMisplacedNone),
681+
(("event-source", "section", "nav", "article", "aside", "header",
682+
"footer", "datagrid", "command"), self.endTagNew)
683+
])
684+
handlers.setDefaultValue(self.endTagOther)
626685
handlers[name](name)
627686

628687
def endTagP(self, name):
@@ -889,6 +948,70 @@ def startTagIsIndex(self, name, attributes):
889948
self.parser.processStartTag("hr")
890949
self.parser.processEndTag("form")
891950

951+
def startTagTextarea(self, name, attributes):
    """Start-tag handler for "textarea"; not implemented yet.

    Always raises NotImplementedError.
    """
    raise NotImplementedError
953+
954+
def startTagCDATA(self, name, attributes):
    """Start-tag handler for iframe, noembed, noframes and noscript.

    (noscript only if scripting is enabled.) Not implemented yet:
    always raises NotImplementedError.
    """
    raise NotImplementedError
957+
958+
def startTagSelect(self, name, attributes):
    """Start-tag handler for "select".

    Reconstructs the active formatting elements, inserts the element
    and then switches the parser to the "inSelect" insertion mode.
    """
    parser = self.parser
    parser.reconstructActiveFormattingElements()
    parser.insertElement(name, attributes)
    parser.switchInsertionMode("inSelect")
962+
963+
def startTagMisplaced(self, name, attributes):
    """Start-tag handler for elements that belong to another insertion mode.

    Covers "caption", "col", "colgroup", "frame", "frameset", "head",
    "option", "optgroup", "tbody", "td", "tfoot", "th", "thead" and
    "tr". The tag is ignored; only a parse error is reported.
    """
    parser = self.parser
    parser.parseError()
971+
972+
def endTagMisplacedNone(self, name):
    """End-tag handler for misplaced elements and elements with no end tag.

    Covers elements that belong to another insertion mode ("caption",
    "col", "colgroup", "frame", "frameset", "head", "option",
    "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr") and
    "area", "basefont", "bgsound", "br", "embed", "hr", "iframe",
    "image", "img", "input", "isindex", "noembed", "noframes", "param",
    "select", "spacer", "table", "textarea", "wbr", "noscript".
    The tag is ignored; only a parse error is reported.
    """
    parser = self.parser
    parser.parseError()
983+
984+
def startTagNew(self, name, other):
    """Start-tag handler for the new HTML5 elements; not implemented yet.

    Covers "event-source", "section", "nav", "article", "aside",
    "header", "footer", "datagrid" and "command". Always raises
    NotImplementedError.
    """
    raise NotImplementedError
989+
990+
def endTagNew(self, name):
    """End-tag handler for the new HTML5 elements; not implemented yet.

    Covers "event-source", "section", "nav", "article", "aside",
    "header", "footer", "datagrid" and "command". Always raises
    NotImplementedError.
    """
    raise NotImplementedError
995+
996+
def startTagOther(self, name, attributes):
    """Fallback start-tag handler.

    Reconstructs the active formatting elements and then inserts an
    element for the tag.
    """
    parser = self.parser
    parser.reconstructActiveFormattingElements()
    parser.insertElement(name, attributes)
999+
1000+
def endTagOther(self, name):
    """Fallback end-tag handler.

    Walks the parser's stack of open elements from the most recently
    opened element downwards. At the first element called *name*,
    implied end tags are generated, a parse error is reported if the
    current node is then not called *name*, and the stack is popped up
    to and including that element. Elements passed over on the way may
    trigger a parse error of their own.
    """
    #XXX This logic should be moved into the treebuilder
    for node in self.parser.openElements[::-1]:
        if node.name == name:
            self.parser.generateImpliedEndTags()
            if self.parser.openElements[-1].name != name:
                self.parser.parseError()
            # The stack of open elements belongs to the parser, not to
            # this object; popping ``self.openElements`` raised
            # AttributeError.
            while self.parser.openElements.pop() != node:
                pass
            break
        else:
            # NOTE(review): this tests the node object itself for
            # membership; if these collections hold tag names the test
            # should probably use node.name -- confirm against their
            # definitions before changing it.
            if (node not in formattingElements and
                    node in specialElements | scopingElements):
                self.parser.parseError()
1014+
8921015
class InTable(InsertionMode):
8931016
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
8941017

tests/test_tokenizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ def test_tokenizer():
7878
for filename in glob.glob('tokenizer/*.test'):
7979
tests = simplejson.load(file(filename))
8080
for test in tests['tests']:
81-
yield runTokenizerTest, test['description'], test['input'], test['output']
81+
yield (runTokenizerTest, test['description'], test['input'],
82+
test['output'])
8283

8384
def runTokenizerTest(description, input, output):
8485
#XXX - move this out into the setup function

0 commit comments

Comments
 (0)