Skip to content

Commit 52cda1a

Browse files
committed
More cleanup
--HG-- rename : README => python/README rename : parse.py => python/parse.py rename : setup_base.py => python/setup_base.py extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40715
1 parent 99a73b0 commit 52cda1a

File tree

3 files changed

+198
-0
lines changed

3 files changed

+198
-0
lines changed

README

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
html5lib is a pure-python library for parsing HTML. It is designed to
2+
conform to the Web Applications 1.0 specification, which has
3+
formalized the error handling algorithms of popular web browsers.
4+
5+
= Installation =
6+
7+
html5lib is packaged with distutils. To install it use:
8+
$ python setup.py install
9+
10+
= Tests =
11+
12+
You may wish to check that your installation has been a success by
13+
running the testsuite. All the tests can be run by invoking
14+
runtests.py in the tests/ directory
15+
16+
= Usage =
17+
18+
Simple usage follows this pattern:
19+
20+
import html5lib
21+
f = open("mydocument.html")
22+
parser = html5lib.HTMLParser()
23+
document = parser.parse(f)
24+
25+
26+
More documentation is available in the docstrings or from
27+
http://code.google.com/p/html5lib/wiki/UserDocumentation
28+
29+
= Bugs =
30+
31+
Please report any bugs on the issue tracker:
32+
http://code.google.com/p/html5lib/issues/list
33+
34+
= Get Involved =
35+
36+
Contributions to code or documentation are actively encouraged. Submit
37+
patches to the issue tracker or discuss changes on irc in the #whatwg
38+
channel on freenode.net

parse.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#!/usr/bin/env python
2+
"""usage: %prog [options] filename
3+
4+
Parse a document to a simpletree tree, with optional profiling
5+
"""
6+
#RELEASE move ./examples/
7+
8+
import sys
9+
import os
10+
from optparse import OptionParser
11+
12+
#RELEASE remove
13+
from src import html5parser, liberalxmlparser
14+
from src import treebuilders, serializer, treewalkers
15+
#END RELEASE
16+
#RELEASE add
17+
#from html5lib import html5parser, liberalxmlparser
18+
#from html5lib import treebuilders, serializer, treewalkers
19+
#END RELEASE
20+
21+
def parse():
    """Parse the document named on the command line and report the result.

    The last positional argument selects the input (see _openInput).  The
    tree builder is looked up from the --treebuilder option and handed to
    either the XHTML or the HTML parser depending on --xml.  The run is then
    profiled (--profile), timed (--time) or simply parsed and printed.

    Exits with status 1 when no positional argument was given.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)
    f = _openInput(args[-1])

    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.xml:
            p = liberalxmlparser.XHTMLParser(tree=treebuilder)
        else:
            p = html5parser.HTMLParser(tree=treebuilder)

        if opts.profile:
            # Profiling prints the profiler statistics only, not the tree.
            _profileParse(p, f)
        elif opts.time:
            import time
            t0 = time.time()
            document = p.parse(f)
            t1 = time.time()
            printOutput(p, document, opts)
            t2 = time.time()
            sys.stdout.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
        else:
            document = p.parse(f)
            printOutput(p, document, opts)
    finally:
        # Only the file-system case yields an object with close(); the
        # string fallbacks are left alone.
        close = getattr(f, "close", None)
        if close is not None:
            close()


def _openInput(source):
    """Return what should be handed to the parser for *source*.

    An 'http://' argument is fetched and its body returned; anything else is
    opened as a local file.  If either attempt fails the argument itself is
    returned unchanged, so the parser receives the raw string (best effort,
    as before).
    """
    if source.startswith('http://'):
        try:
            import urllib
            return urllib.urlopen(source).read()
        except Exception:
            # Still best effort, but no longer a bare except: Ctrl-C and
            # sys.exit() are allowed to propagate.
            return source
    try:
        return open(source)
    except IOError:
        return source


def _profileParse(p, f):
    """Run p.parse(f) under the hotshot profiler and print the statistics.

    The profiler log goes to a temporary file that is removed afterwards,
    instead of a fixed 'stats.prof' in the current directory.
    """
    import hotshot
    import hotshot.stats
    import tempfile

    fd, statsPath = tempfile.mkstemp(suffix='.prof')
    # hotshot opens the log by name, so the descriptor is not needed.
    os.close(fd)
    try:
        prof = hotshot.Profile(statsPath)
        prof.runcall(p.parse, f)
        prof.close()
        stats = hotshot.stats.load(statsPath)
        stats.strip_dirs()
        stats.sort_stats('time')
        stats.print_stats()
    finally:
        os.remove(statsPath)
71+
72+
def printOutput(parser, document, opts):
    """Write the requested views of a parsed document.

    parser   -- the parser that produced *document*; its
                tokenizer.stream.charEncoding, tree and errors are read.
    document -- the object returned by parser.parse().
    opts     -- option values; the encoding, no_tree, xml, html, hilite,
                error and treebuilder attributes are consulted.

    The encoding line and the tree go to stdout, the list of parse errors
    goes to stderr.
    """
    if opts.encoding:
        # Same text the print statement produced, but written the way the
        # rest of this function writes, and not tied to Python 2 syntax.
        sys.stdout.write("Encoding: %s\n" % (parser.tokenizer.stream.charEncoding,))
    if not opts.no_tree:
        if opts.xml:
            sys.stdout.write(document.toxml("utf-8"))
        elif opts.html:
            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            for text in serializer.HTMLSerializer().serialize(tokens, encoding='utf-8'):
                sys.stdout.write(text)
        elif opts.hilite:
            sys.stdout.write(document.hilite("utf-8"))
        else:
            sys.stdout.write(parser.tree.testSerializer(document).encode("utf-8"))
    if opts.error:
        # Each entry of parser.errors is unpacked as (pos, message), with
        # pos filling the two integer fields of the format string.
        errList = ["Line %i Col %i"%pos + " " + message
                   for pos, message in parser.errors]
        sys.stderr.write("\nParse errors:\n" + "\n".join(errList)+"\n")
91+
92+
def getOptParser():
    """Build the command-line option parser for this script.

    The module docstring is used as the usage text.  Every option is a
    boolean switch defaulting to False, except --treebuilder, which takes a
    string and defaults to "simpleTree".
    """
    switch = {"action": "store_true", "default": False}

    # (option strings, keyword settings), in the order they appear in --help.
    optionTable = [
        (("-p", "--profile"),
         dict(switch, dest="profile",
              help="Use the hotshot profiler to "
                   "produce a detailed log of the run")),
        (("-t", "--time"),
         dict(switch, dest="time",
              help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")),
        (("", "--no-tree"),
         dict(switch, dest="no_tree", help="Do not print output tree")),
        (("-b", "--treebuilder"),
         dict(action="store", type="string", dest="treebuilder",
              default="simpleTree")),
        (("-e", "--error"),
         dict(switch, dest="error", help="Print a list of parse errors")),
        (("-x", "--xml"),
         dict(switch, dest="xml", help="Output as xml")),
        (("", "--html"),
         dict(switch, dest="html", help="Output as html")),
        (("", "--hilite"),
         dict(switch, dest="hilite",
              help="Output as formatted highlighted code.")),
        (("-c", "--encoding"),
         dict(switch, dest="encoding",
              help="Print character encoding used")),
    ]

    cli = OptionParser(usage=__doc__)
    for optStrings, settings in optionTable:
        cli.add_option(*optStrings, **settings)
    return cli
124+
125+
if __name__ == "__main__":
126+
parse()

setup_base.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from distutils.core import setup
2+
3+
# Long description passed to setup() below.
long_description="""HTML parser designed to follow the WHATWG HTML5
specification. The parser is designed to handle all flavours of HTML and
parses invalid documents using well-defined error handling rules compatible
with the behaviour of major desktop web browsers.

Output is to a tree structure; the current release supports output to
a custom tree similar to DOM and to ElementTree.
"""

# NOTE(review): the status placeholder here and the version placeholder
# below look like template fields filled in before release -- confirm
# against the release tooling.
# No trailing comma after the closing bracket: with one, this name would be
# bound to a one-element tuple wrapping the list instead of the list itself.
classifiers=[
    'Development Status :: %(status)s',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: MIT License',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Topic :: Software Development :: Libraries :: Python Modules',
    'Topic :: Text Processing :: Markup :: HTML'
    ]

setup(name='html5lib',
      version='%(version)s',
      url='http://code.google.com/p/html5lib/',
      license="MIT License",
      # The two literals are concatenated, so the space between them has to
      # be spelled out.
      description='HTML parser based on the WHAT-WG Web Applications 1.0 '
                  '("HTML5") specification',
      long_description=long_description,
      classifiers=classifiers,
      maintainer='James Graham',
      maintainer_email='jg307@cam.ac.uk',
      packages=['html5lib', 'html5lib.treebuilders'],
      package_dir = {'html5lib': 'src'}
      )

0 commit comments

Comments
 (0)