"""HTMLTag.py
HTMLTag defines a class of the same name that represents HTML content.
An additional HTMLReader class kicks off the process of reading an HTML
file into a set of tags:
from WebUtils.HTMLTag import HTMLReader
reader = HTMLReader()
tag = reader.readFileNamed('foo.html')
tag.pprint()
Tags have attributes and children, which makes them hierarchical.
See HTMLTag class docs for more info.
Note that you imported HTMLReader instead of HTMLTag.
You only need the latter if you plan on creating tags directly.
You can discard the reader immediately if you like:
tag = HTMLReader().readFileNamed('foo.html')
The point of reading HTML into tag objects is so that you have a concrete,
Pythonic data structure to work with. The original motivation for such a
beast was in building automated regression test suites that wanted granular,
structured access to the HTML output by the web application.
See the doc string for HTMLTag for examples of what you can do with tags.
CAVEATS
* HTMLReader needs special attention with regard to tags like <p> and <li>
which sometimes are closed (</p> </li>) and sometimes not.
See its doc string for full information.
* HTMLReader is picky about the correctness of the HTML you feed it.
Again see the class docs for full info.
TO DO
* See the TO DO sections for each class.
CREDITS
* I didn't grok how to write an SGMLParser subclass until I read the very
small example by Sean McGrath at http://www.digitome.com/html2pyx.py
(which I believe is broken for empty tags).
* Determined what HTML tags are empty by scanning O'Reilly's HTML Pocket
Reference.
"""
import sys
from sgmllib import SGMLParser
from MiscUtils import NoDefault, AbstractError
# Speed switch: if enabled, some key SGMLParser methods are overridden
# for more speed.
# Changing this has no effect once the module is imported
# (unless you reload() the module).
runFast = True
class HTMLTagError(Exception):
    """General HTML tag error.

    Serves as the base class of the more specific tag exceptions in this
    module. In addition to the message, any keyword arguments given to the
    constructor are stored in the ``values`` dictionary so that handlers
    can inspect the details of the failure.
    """

    def __init__(self, msg, **values):
        Exception.__init__(self, msg)
        # Keep our own dictionary of the extra details.
        self.values = dict(values)
class HTMLTagAttrLookupError(HTMLTagError, LookupError):
    """HTML tag attribute lookup error.

    Raised by HTMLTag.tagWithMatchingAttr() when no tag matches and no
    default value was given. Since it also derives from LookupError,
    handlers for that builtin exception will catch it as well.
    """
class HTMLTagUnbalancedError(HTMLTagError):
    """Unbalanced HTML tag error.

    NOTE(review): presumably raised by HTMLReader when opening and closing
    tags do not pair up; the raising code is not visible here -- confirm.
    """
class HTMLNotAllowedError(HTMLTagError):
    """HTML tag not allowed here error.

    NOTE(review): presumably raised by HTMLReader for a tag found in a
    place where it may not appear; the raising code is not visible
    here -- confirm.
    """
class HTMLTagProcessingInstructionError(HTMLTagError):
    """HTML tag processing instruction error.

    NOTE(review): presumably raised by HTMLReader when it meets a
    processing instruction; the raising code is not visible
    here -- confirm.
    """
class HTMLTagIncompleteError(HTMLTagError):
    """HTML tag incomplete error.

    NOTE(review): presumably raised by HTMLReader when the input ends
    before a tag is complete; the raising code is not visible
    here -- confirm.
    """
# Names of the tags that are treated as empty (never explicitly closed)
# by default. Kept as a plain list, one name per entry.
DefaultEmptyTags = [
    'area', 'basefont', 'base', 'bgsound',
    'br', 'col', 'colgroup', 'frame',
    'hr', 'img', 'input', 'isindex',
    'link', 'meta', 'spacer', 'wbr',
]
class HTMLTag(object):
    """Container class for representing HTML as tag objects.

    Tags essentially have 4 major attributes:
      * name
      * attributes
      * children
      * subtags

    Name is simple:
        print tag.name()

    Attributes are dictionary-like in nature:
        print tag.attr('color')  # throws an exception if no color
        print tag.attr('bgcolor', None)  # returns None if no bgcolor
        print tag.attrs()

    Children are all the leaf parts of a tag, consisting of other tags
    and strings of character data.
        print tag.numChildren()
        print tag.childAt(0)
        print tag.children()

    Subtags is a convenient list of only the tags in the children:
        print tag.numSubtags()
        print tag.subtagAt(0)
        print tag.subtags()

    You can search a tag and all the tags it contains for a tag with
    a particular attribute matching a particular value:
        print tag.tagWithMatchingAttr('width', '100%')

    An HTMLTagAttrLookupError is raised if no matching tag is found.
    You can avoid this by providing a default value:
        print tag.tagWithMatchingAttr('width', '100%', None)

    Looking for specific 'id' attributes is common in regression testing
    (it allows you to zero in on logical portions of a page),
    so a convenience method is provided:
        tag = htmlTag.tagWithId('accountTable')

    TO DO
      * A walker() method for traversing the tag tree.
      * Search for a subtag with a given name, recursive or not.
      * Attribute traversal with dotted notation?
      * Do we need to convert tag names and attribute names to lower case,
        or does SGMLParser already do that?
      * Should attribute values be strip()ed?
        Probably not. SGMLParser probably strips them already unless they
        really do have spaces as in "  quoted  ". But that's speculation.
    """

    ## Init and reading ##

    def __init__(self, name, lineNumber=None):
        """Create a tag with the given name and no attributes or children.

        The optional lineNumber is only used for display in __repr__.
        The name must not contain a newline.
        """
        assert '\n' not in name
        self._name = name
        self._attrs = {}
        self._children = []
        self._subtags = []
        self._lineNumber = lineNumber
        # Used by closedBy() and __repr__, helps with HTMLReader error messages:
        self._isClosed = False
        # Always defined, so the instance is consistent before closedBy():
        self._closedBy = None
        self._closedAt = None

    def readAttr(self, name, value):
        """Set an attribute of the tag with the given name and value.

        An assertion fails if an attribute is set twice.
        """
        assert name not in self._attrs, 'name = %r, attrs = %r' % (
            name, self._attrs)
        self._attrs[name] = value

    def addChild(self, child):
        """Add a child to the receiver.

        The child will be another tag or a string (CDATA).
        Tags are additionally recorded in the list of subtags.
        """
        # The 1-tuple keeps the message formatting safe for any child type.
        assert isinstance(child, (basestring, HTMLTag)), (
            'Invalid child: %r' % (child,))
        self._children.append(child)
        if isinstance(child, HTMLTag):
            self._subtags.append(child)

    ## Access ##

    def name(self):
        """Return the name of the tag."""
        return self._name

    def attr(self, name, default=NoDefault):
        """Return the value of the attribute with the given name.

        If the attribute does not exist, the default is returned when one
        was passed; otherwise the KeyError of the dictionary lookup is
        raised.
        """
        if default is NoDefault:
            return self._attrs[name]
        else:
            return self._attrs.get(name, default)

    def hasAttr(self, name):
        """Return whether the tag has an attribute with the given name."""
        return name in self._attrs

    def attrs(self):
        """Return the attribute dictionary itself (not a copy)."""
        return self._attrs

    def numAttrs(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def childAt(self, index):
        """Return the child (tag or string) at the given index."""
        return self._children[index]

    def numChildren(self):
        """Return the number of children (tags and strings)."""
        return len(self._children)

    def children(self):
        """Return the list of children itself (not a copy)."""
        return self._children

    def subtagAt(self, index):
        """Return the subtag at the given index."""
        return self._subtags[index]

    def numSubtags(self):
        """Return the number of subtags."""
        return len(self._subtags)

    def subtags(self):
        """Return the list of subtags itself (not a copy)."""
        return self._subtags

    ## Printing ##

    def pprint(self, out=None, indent=0):
        """Write an indented, multi-line dump of the tag tree to out.

        out is any object with a write() method and defaults to sys.stdout.
        indent is the nesting level of this tag; subtags are printed one
        level deeper. Attributes are listed sorted by name, as in __repr__.
        """
        if out is None:
            out = sys.stdout
        wr = out.write
        spacer = ' '*4*indent
        wr('%s<%s>\n' % (spacer, self._name))
        for key in sorted(self._attrs):
            wr('%s %s = %s\n' % (spacer, key.ljust(12), self._attrs[key]))
        indent += 1
        for child in self._children:
            if isinstance(child, HTMLTag):
                child.pprint(out, indent)
            else:
                wr('%s %s\n' % (spacer, child))
        wr('%s</%s>\n' % (spacer, self._name))
        # Note: Printing a closing tag for an empty tag (such as <br>)
        # doesn't make much sense, but then it's a good reminder that
        # such empty tags are closed immediately.

    def __repr__(self):
        """Return a one-line representation of the opening tag.

        Attributes are listed sorted by name. If known, the line number
        and the information recorded by closedBy() are appended in
        parentheses.
        """
        r = ['<', self._name]
        for key in sorted(self._attrs):
            # Formatting with %s also copes with non-string values.
            r.append(' %s="%s"' % (key, self._attrs[key]))
        r.append('>')
        notes = []
        if self._lineNumber:
            notes.append('%s' % self._lineNumber)
        if self._isClosed:
            notes.append('closed by %s at %s'
                % (self._closedBy, self._closedAt))
        if notes:
            r.append(' (%s)' % '; '.join(notes))
        return ''.join(r)

    ## Searching ##

    def tagWithMatchingAttr(self, name, value, default=NoDefault):
        """Search for tag with matching attributes.

        Performs a depth-first search for a tag with an attribute that
        matches the given value. If the tag cannot be found, an
        HTMLTagAttrLookupError (a LookupError) will be raised *unless*
        a default value was specified, which is then returned.

            tag = tag.tagWithMatchingAttr('bgcolor', '#FFFF', None)
        """
        tag = self._tagWithMatchingAttr(name, value)
        if tag is not None:
            return tag
        if default is NoDefault:
            raise HTMLTagAttrLookupError(
                'name = %r, value = %r' % (name, value),
                name=name, value=value)
        return default

    def tagWithId(self, id, default=NoDefault):
        """Search for tag with a given id.

        Finds and returns the tag with the given id. As in:
            <td id=foo> bar </td>
        This is just a cover for:
            tagWithMatchingAttr('id', id, default)
        But searching for id's is so popular (at least in regression testing
        web sites) that this convenience method is provided.
        Why is it so popular? Because by attaching ids to logical portions
        of your HTML, your regression test suite can quickly zero in on them
        for examination.
        """
        return self.tagWithMatchingAttr('id', id, default)

    ## Parsing (HTMLReader) ##

    def closedBy(self, name, lineNumber):
        """Record that the tag was closed by the named tag at lineNumber.

        The information is only used by __repr__.
        """
        self._isClosed = True
        self._closedBy = name
        self._closedAt = lineNumber

    ## Self utility ##

    def _tagWithMatchingAttr(self, name, value):
        """Search for tag with matching attributes.

        Performs a depth-first search for a tag with an attribute that
        matches the given value. Returns None if the tag cannot be found.
        The method tagWithMatchingAttr() (e.g., sans underscore) is more
        commonly used.

        The comparison is done with ==, and a missing attribute counts
        as None.
        """
        if self._attrs.get(name) == value:
            return self
        for tag in self._subtags:
            matchingTag = tag._tagWithMatchingAttr(name, value)
            if matchingTag is not None:
                return matchingTag
        return None
class HTMLReader(SGMLParser):
"""Reader class for representing HTML as tag objects.
NOTES
* Special attention is required regarding tags like <p> and <li> which
sometimes are closed and sometimes not. HTMLReader can deal with both
situations (closed and not) provided that:
* the file doesn't change conventions for a given tag
* the reader knows ahead of time what to expect
By default, HTMLReader assumes that <p> and <li> will be closed with </p>
and </li> as the official HTML spec, as well as upcoming XHTML, encourage
or require, respectively.
But if your files don't close certain tags that are supposed to be required,
you can do this:
HTMLReader(extraEmptyTags=['p', 'li'])
or:
reader.extendEmptyTags(['p', 'li'])
or just set them entirely:
HTMLReader(emptyTags=['br', 'hr', 'p'])
reader.setEmptyTags(['br', 'hr', 'p'])
Although there are quite a few. Consider the DefaultEmptyTags global
list (which is used to initialize the reader's tags) which contains
about 16 tag names.
If an HTML file doesn't conform to the reader's expectation, you will get
an exception (see more below for details).
If your HTML file doesn't contain root <html> ... </html> tags wrapping
everything, a fake root tag will be constructed for you, unless you pass
in fakeRootTagIfNeeded=False.
Besides fixing your reader manually, you could conceivably loop through
the permutations of the various empty tags to see if one of them resulted
in a correct read.
Or you could fix the HTML.
* The reader ignores extra preceding and trailing whitespace by stripping
it from strings. I suppose this is a little harsher than reducing spans
of preceding and trailing whitespace down to one space, which is what
really happens in an HTML browser.
* The reader will not read past the closing