Skip to content

Commit 5df4d2d

Browse files
committed
Refactored treewalkers to use a non-recursive algorithm inspired from John Cowan's algorithm <http://lists.w3.org/Archives/Public/www-dom/1998OctDec/0218.html>
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40630
1 parent bdcc024 commit 5df4d2d

6 files changed

Lines changed: 250 additions & 90 deletions

File tree

src/treewalkers/_base.py

Lines changed: 89 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@ class TreeWalker(object):
88
def walk(self, node):
99
raise NotImplementedError
1010

11-
def walkChildren(self, node):
12-
raise NodeImplementedError
13-
1411
def error(self, msg):
1512
return {"type": "SerializeError", "data": msg}
1613

@@ -21,17 +18,6 @@ def normalizeAttrs(self, attrs):
2118
attrs = attrs.items()
2219
return attrs
2320

24-
def element(self, node, name, attrs, hasChildren):
25-
if name in voidElements:
26-
for token in self.emptyTag(name, attrs, hasChildren):
27-
yield token
28-
else:
29-
yield self.startTag(name, attrs)
30-
if hasChildren:
31-
for token in self.walkChildren(node):
32-
yield token
33-
yield self.endTag(name)
34-
3521
def emptyTag(self, name, attrs, hasChildren=False):
3622
yield {"type": "EmptyTag", "name": name, \
3723
"data": self.normalizeAttrs(attrs)}
@@ -66,3 +52,92 @@ def doctype(self, name):
6652

6753
def unknown(self, nodeType):
6854
return self.error(_("Unknown node type: ") + nodeType)
55+
56+
class RecursiveTreeWalker(TreeWalker):
57+
def walkChildren(self, node):
58+
raise NodeImplementedError
59+
60+
def element(self, node, name, attrs, hasChildren):
61+
if name in voidElements:
62+
for token in self.emptyTag(name, attrs, hasChildren):
63+
yield token
64+
else:
65+
yield self.startTag(name, attrs)
66+
if hasChildren:
67+
for token in self.walkChildren(node):
68+
yield token
69+
yield self.endTag(name)
70+
71+
from xml.dom import Node
72+
73+
DOCUMENT = Node.DOCUMENT_NODE
74+
DOCTYPE = Node.DOCUMENT_TYPE_NODE
75+
TEXT = Node.TEXT_NODE
76+
ELEMENT = Node.ELEMENT_NODE
77+
COMMENT = Node.COMMENT_NODE
78+
UNKNOWN = "<#UNKNOWN#>"
79+
80+
class NonRecursiveTreeWalker(TreeWalker):
81+
def getNodeDetails(self, node):
82+
raise NotImplementedError
83+
84+
def getFirstChild(self, node):
85+
raise NotImplementedError
86+
87+
def getNextSibling(self, node):
88+
raise NotImplementedError
89+
90+
def getParentNode(self, node):
91+
raise NotImplementedError
92+
93+
def walk(self, node):
94+
currentNode = node
95+
while currentNode is not None:
96+
details = self.getNodeDetails(currentNode)
97+
type, details = details[0], details[1:]
98+
hasChildren = False
99+
100+
if type == DOCTYPE:
101+
yield self.doctype(*details)
102+
103+
elif type == TEXT:
104+
for token in self.text(*details):
105+
yield token
106+
107+
elif type == ELEMENT:
108+
name, attributes, hasChildren = details
109+
if name in voidElements:
110+
for token in self.emptyTag(name, attributes, hasChildren):
111+
yield token
112+
hasChildren = False
113+
else:
114+
yield self.startTag(name, attributes)
115+
116+
elif type == COMMENT:
117+
yield self.comment(details[0])
118+
119+
elif type == DOCUMENT:
120+
hasChildren = True
121+
122+
else:
123+
yield self.unknown(details[0])
124+
125+
firstChild = hasChildren and self.getFirstChild(currentNode) or None
126+
if firstChild is not None:
127+
currentNode = firstChild
128+
else:
129+
while currentNode is not None:
130+
details = self.getNodeDetails(currentNode)
131+
type, details = details[0], details[1:]
132+
if type == ELEMENT:
133+
name, attributes, hasChildren = details
134+
if name not in voidElements:
135+
yield self.endTag(name)
136+
nextSibling = self.getNextSibling(currentNode)
137+
if nextSibling is not None:
138+
currentNode = nextSibling
139+
break
140+
if node is currentNode:
141+
currentNode = None
142+
else:
143+
currentNode = self.getParentNode(currentNode)

src/treewalkers/dom.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,33 @@
55

66
import _base
77

8-
class TreeWalker(_base.TreeWalker):
9-
def walk(self, node):
8+
from constants import voidElements
9+
10+
class TreeWalker(_base.NonRecursiveTreeWalker):
11+
def getNodeDetails(self, node):
1012
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
11-
yield self.doctype(node.nodeName)
13+
return _base.DOCTYPE, node.nodeName
1214

1315
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
14-
for token in self.text(node.nodeValue):
15-
yield token
16+
return _base.TEXT, node.nodeValue
1617

1718
elif node.nodeType == Node.ELEMENT_NODE:
18-
for token in self.element(node, node.nodeName, \
19-
node.attributes.items(), node.childNodes):
20-
yield token
19+
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
2120

2221
elif node.nodeType == Node.COMMENT_NODE:
23-
yield self.comment(node.nodeValue)
22+
return _base.COMMENT, node.nodeValue
2423

2524
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
26-
for token in self.walkChildren(node):
27-
yield token
25+
return (_base.DOCUMENT,)
2826

2927
else:
30-
yield self.unknown(node.nodeType)
28+
return _base.UNKNOWN, node.nodeType
29+
30+
def getFirstChild(self, node):
31+
return node.firstChild
32+
33+
def getNextSibling(self, node):
34+
return node.nextSibling
3135

32-
def walkChildren(self, node):
33-
for childNode in node.childNodes:
34-
for token in self.walk(childNode):
35-
yield token
36+
def getParentNode(self, node):
37+
return node.parentNode

src/treewalkers/etree.py

Lines changed: 72 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -23,43 +23,90 @@ def getETreeModule(ElementTreeImplementation):
2323
def getETreeBuilder(ElementTreeImplementation):
2424
ElementTree = ElementTreeImplementation
2525

26-
class TreeWalker(_base.TreeWalker):
27-
def walk(self, node):
26+
class TreeWalker(_base.NonRecursiveTreeWalker):
27+
"""Given the particular ElementTree representation, this implementation,
28+
to avoid using recursion, returns "nodes" as tuples with the following
29+
content:
30+
31+
1. An Element node serving as *context* (it cannot be called the parent
32+
node due to the particular ``tail`` text nodes.
33+
34+
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
35+
36+
3. A list used as a stack of all ancestor *context nodes*. It is a
37+
pair tuple whose first item is an Element and second item is a child
38+
index.
39+
"""
40+
41+
def getNodeDetails(self, node):
42+
if isinstance(node, tuple): # It might be the root Element
43+
elt, key, parents = node
44+
if key in ("text", "tail"):
45+
return _base.TEXT, getattr(elt, key)
46+
else:
47+
node = elt[int(key)]
48+
2849
if not(hasattr(node, "tag")):
2950
node = node.getroot()
3051

3152
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
32-
for token in self.walkChildren(node):
33-
yield token
53+
return (_base.DOCUMENT,)
3454

3555
elif node.tag == "<!DOCTYPE>":
36-
yield self.doctype(node.text)
56+
return _base.DOCTYPE, node.text
3757

3858
elif type(node.tag) == type(ElementTree.Comment):
39-
yield self.comment(node.text)
59+
return _base.COMMENT, node.text
4060

4161
else:
4262
#This is assumed to be an ordinary element
43-
if node.tag in voidElements:
44-
for token in self.emptyTag(node.tag, \
45-
node.attrib.items(), len(node) or node.text):
46-
yield token
47-
else:
48-
yield self.startTag(node.tag, node.attrib.items())
49-
for token in self.walkChildren(node):
50-
yield token
51-
yield self.endTag(node.tag)
63+
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
5264

53-
if node.tail:
54-
for token in self.text(node.tail):
55-
yield token
56-
57-
def walkChildren(self, node):
65+
def getFirstChild(self, node):
66+
if isinstance(node, tuple): # It might be the root Element
67+
elt, key, parents = node
68+
assert key not in ("text", "tail"), "Text nodes have no children"
69+
parents.append((elt, int(key)))
70+
node = elt[int(key)]
71+
else:
72+
parents = []
73+
74+
assert len(node) or node.text, "Node has no children"
5875
if node.text:
59-
for token in self.text(node.text):
60-
yield token
61-
for childNode in node.getchildren():
62-
for token in self.walk(childNode):
63-
yield token
76+
return (node, "text", parents)
77+
else:
78+
return (node, 0, parents)
79+
80+
def getNextSibling(self, node):
81+
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
82+
83+
elt, key, parents = node
84+
if key == "text":
85+
key = -1
86+
elif key == "tail":
87+
elt, key = parents.pop()
88+
else:
89+
# Look for "tail" of the "revisited" node
90+
child = elt[key]
91+
if child.tail:
92+
parents.append((elt, key))
93+
return (child, "tail", parents)
94+
95+
# case where key were "text" or "tail" or elt[key] had a tail
96+
key += 1
97+
if len(elt) > key:
98+
return (elt, key, parents)
99+
else:
100+
return None
101+
102+
def getParentNode(self, node):
103+
assert isinstance(node, tuple)
104+
elt, key, parents = node
105+
if parents:
106+
elt, key = parents.pop()
107+
return elt, key, parents
108+
else:
109+
# HACK: We could return ``elt`` but None will stop the algorithm the same way
110+
return None
64111

65112
return locals()

src/treewalkers/pulldom.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,3 @@ def tokens(self, event, next):
5050

5151
else:
5252
yield self.unknown(type)
53-
54-
def walkChildren(self, node):
55-
raise Exception(_("PullDOM tree walker's walkChildren should never be called"))

src/treewalkers/simpletree.py

Lines changed: 54 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,32 +3,70 @@
33

44
import _base
55

6-
class TreeWalker(_base.TreeWalker):
7-
def walk(self, node):
6+
class TreeWalker(_base.NonRecursiveTreeWalker):
7+
"""Given that simpletree has no performant way of getting a node's
8+
next sibling, this implementation returns "nodes" as tuples with the
9+
following content:
10+
11+
1. The parent Node (Element, Document or DocumentFragment)
12+
13+
2. The child index of the current node in its parent's children list
14+
15+
3. A list used as a stack of all ancestors. It is a pair tuple whose
16+
first item is a parent Node and second item is a child index.
17+
"""
18+
19+
def getNodeDetails(self, node):
20+
if isinstance(node, tuple): # It might be the root Node
21+
parent, idx, parents = node
22+
node = parent.childNodes[idx]
23+
824
# testing node.type allows us not to import treebuilders.simpletree
925
if node.type in (1, 2): # Document or DocumentFragment
10-
for token in self.walkChildren(node):
11-
yield token
26+
return (_base.DOCUMENT,)
1227

1328
elif node.type == 3: # DocumentType
14-
yield self.doctype(node.name)
29+
return _base.DOCTYPE, node.name
1530

1631
elif node.type == 4: # TextNode
17-
for token in self.text(node.value):
18-
yield token
32+
return _base.TEXT, node.value
1933

2034
elif node.type == 5: # Element
21-
for token in self.element(node, node.name, \
22-
node.attributes.items(), node.childNodes):
23-
yield token
35+
return _base.ELEMENT, node.name, \
36+
node.attributes.items(), node.hasContent()
2437

2538
elif node.type == 6: # CommentNode
26-
yield self.comment(node.data)
39+
return _base.COMMENT, node.data
2740

2841
else:
29-
yield self.unknown(node.type)
42+
return _node.UNKNOWN, node.type
3043

31-
def walkChildren(self, node):
32-
for childNode in node.childNodes:
33-
for token in self.walk(childNode):
34-
yield token
44+
def getFirstChild(self, node):
45+
if isinstance(node, tuple): # It might be the root Node
46+
parent, idx, parents = node
47+
parents.append((parent, idx))
48+
node = parent.childNodes[idx]
49+
else:
50+
parents = []
51+
52+
assert node.hasContent(), "Node has no children"
53+
return (node, 0, parents)
54+
55+
def getNextSibling(self, node):
56+
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
57+
parent, idx, parents = node
58+
idx += 1
59+
if len(parent.childNodes) > idx:
60+
return (parent, idx, parents)
61+
else:
62+
return None
63+
64+
def getParentNode(self, node):
65+
assert isinstance(node, tuple)
66+
parent, idx, parents = node
67+
if parents:
68+
parent, idx = parents.pop()
69+
return parent, idx, parents
70+
else:
71+
# HACK: We could return ``parent`` but None will stop the algorithm the same way
72+
return None

0 commit comments

Comments
 (0)