Skip to content

Commit cda0b54

Browse files
committed
Add an lxml-optimized treewalker and fix caching of etree modules (do not cache at the __init__ level, let the etree submodule do its own caching based on the implementation)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40843
1 parent 7b95e69 commit cda0b54

3 files changed

Lines changed: 78 additions & 4 deletions

File tree

src/html5lib/treewalkers/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,16 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
2020
more pythonic idioms.
2121
"dom" - The xml.dom.minidom DOM implementation
2222
"pulldom" - The xml.dom.pulldom event stream
23-
"etree" - A generic builder for tree implementations exposing an
23+
"etree" - A generic walker for tree implementations exposing an
2424
elementtree-like interface (known to work with
2525
ElementTree, cElementTree and lxml.etree).
26+
"lxml" - Optimized walker for lxml.etree
2627
"beautifulsoup" - Beautiful soup (if installed)
2728
"genshi" - a Genshi stream
2829
2930
implementation - (Currently applies to the "etree" tree type only). A module
3031
implementing the tree type e.g. xml.etree.ElementTree or
31-
lxml.etree."""
32+
cElementTree."""
3233

3334
treeType = treeType.lower()
3435
if treeType not in treeWalkerCache:
@@ -41,7 +42,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
4142
elif treeType == "beautifulsoup":
4243
import soup
4344
treeWalkerCache[treeType] = soup.TreeWalker
45+
elif treeType == "lxml":
46+
import lxmletree
47+
treeWalkerCache[treeType] = lxmletree.TreeWalker
4448
elif treeType == "etree":
4549
import etree
46-
treeWalkerCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeWalker
50+
# XXX: NEVER cache here, caching is done in the etree submodule
51+
return etree.getETreeModule(implementation, **kwargs).TreeWalker
4752
return treeWalkerCache.get(treeType)
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from lxml import etree
2+
3+
from gettext import gettext
4+
_ = gettext
5+
6+
import _base
7+
8+
from html5lib.constants import voidElements
9+
10+
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Tree walker working directly on lxml.etree nodes.

    lxml has no separate text node objects, so this walker represents a text
    node as an ``(element, key)`` tuple, where ``key`` names the attribute of
    ``element`` that holds the string: ``"text"`` or ``"tail"``.

    Child elements are detected with ``len(element)`` rather than by
    truth-testing the element: truth-testing elements is deprecated in lxml,
    and ``len()`` states the intended question ("are there child elements?")
    explicitly.
    """

    def getNodeDetails(self, node):
        """Return a tuple describing ``node``.

        The first item is one of the ``_base`` node type constants; the
        remaining items depend on the type:

        * text:     ``(_base.TEXT, string)``
        * document: ``(_base.DOCUMENT,)``
        * doctype:  ``(_base.DOCTYPE, node.text)``
        * comment:  ``(_base.COMMENT, node.text)``
        * element:  ``(_base.ELEMENT, tag, attribute items, hasChildren)``
          where ``hasChildren`` is always a real boolean.
        """
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(node, key)

        # Objects without a "tag" attribute (presumably an ElementTree
        # wrapper) are described through their root element.
        if not hasattr(node, "tag"):
            node = node.getroot()

        if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
            return (_base.DOCUMENT,)

        elif node.tag == "<!DOCTYPE>":
            return _base.DOCTYPE, node.text

        elif node.tag == etree.Comment:
            return _base.COMMENT, node.text

        else:
            # This is assumed to be an ordinary element.  It has children if
            # it contains child elements or leading text.
            hasChildren = len(node) > 0 or bool(node.text)
            return _base.ELEMENT, node.tag, node.attrib.items(), hasChildren

    def getFirstChild(self, node):
        """Return the first child of ``node``.

        The leading text, as ``(node, "text")``, comes before any child
        element.  ``node`` must not be a text node and must have at least one
        child (text or element).
        """
        assert not isinstance(node, tuple), _("Text nodes have no children")

        assert len(node) > 0 or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        return node[0]

    def getNextSibling(self, node):
        """Return the node following ``node`` in its parent, or None.

        * After an element's leading text comes its first child element.
        * After an element comes its tail text, if any, otherwise the next
          element.
        * After a tail text comes the element following the tail's owner.
        """
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # Explicit length check: an "x and node[0] or None" construct
                # would be wrong here, because a childless node[0] evaluates
                # to False.
                if len(node) > 0:
                    return node[0]
                return None
            # tail
            return node.getnext()

        if node.tail:
            return (node, "tail")
        return node.getnext()

    def getParentNode(self, node):
        """Return the parent of ``node``.

        The parent of a leading text node is the element holding it.  A tail
        text belongs to the parent of the element it is attached to, so it is
        resolved like the element itself.
        """
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                return node
            # else: fall through to "normal" processing

        return node.getparent()

tests/test_treewalkers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,12 @@ def PullDOMAdapter(node):
8080

8181
try:
8282
import lxml.etree as ElementTree
83-
treeTypes['lxml'] = \
83+
treeTypes['lxml_as_etree'] = \
8484
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
8585
"walker": treewalkers.getTreeWalker("etree", ElementTree)}
86+
treeTypes['lxml_native'] = \
87+
{"builder": treebuilders.getTreeBuilder("etree", ElementTree),
88+
"walker": treewalkers.getTreeWalker("lxml")}
8689
except ImportError:
8790
pass
8891

0 commit comments

Comments
 (0)