forked from html5lib/html5lib-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse.py
More file actions
executable file
·126 lines (106 loc) · 4.3 KB
/
parse.py
File metadata and controls
executable file
·126 lines (106 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a simpletree tree, with optional profiling
"""
#RELEASE move ./examples/
import sys
import os
from optparse import OptionParser
#RELEASE remove
from src import html5parser, liberalxmlparser
from src import treebuilders, serializer, treewalkers
#END RELEASE
#RELEASE add
#from html5lib import html5parser, liberalxmlparser
#from html5lib import treebuilders, serializer, treewalkers
#END RELEASE
def _openInput(source):
    """Turn the command-line argument *source* into parser input.

    * An argument starting with ``http://`` is fetched with
      ``urllib.urlopen`` and the response body is returned as a string.
    * Any other argument is opened as a file on the local file system and
      the open file object is returned; the caller must close it.

    Both steps are best-effort: if the fetch or the open fails, *source*
    itself is returned unchanged and is handed to the parser as-is.
    """
    if source.startswith('http://'):
        try:
            import urllib
            return urllib.urlopen(source).read()
        except Exception:
            # Keep the best-effort fallback, but do not swallow
            # KeyboardInterrupt/SystemExit and do not hide the reason.
            sys.stderr.write("Could not fetch %s: %s\n"
                             % (source, sys.exc_info()[1]))
            return source
    try:
        return open(source)
    except IOError:
        # Not an openable file: pass the argument text through untouched.
        return source


def parse():
    """Command-line entry point: parse the last positional argument.

    Options come from ``getOptParser()``. The input is resolved by
    ``_openInput`` and parsed with ``liberalxmlparser.XHTMLParser`` when
    ``--xml`` is given, otherwise with ``html5parser.HTMLParser``, using the
    tree builder named by ``--treebuilder``.

    * ``--profile``: run the parse under hotshot and print the statistics
      sorted by time; the parse result itself is not printed.
    * ``--time``: parse, print the output, then report how long each took.
    * otherwise: parse and print the output.

    Exits with status 1 if no positional argument was supplied.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    f = _openInput(args[-1])
    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.xml:
            p = liberalxmlparser.XHTMLParser(tree=treebuilder)
        else:
            p = html5parser.HTMLParser(tree=treebuilder)

        if opts.profile:
            import hotshot
            import hotshot.stats
            prof = hotshot.Profile('stats.prof')
            prof.runcall(p.parse, f)
            prof.close()
            # XXX - We should use a temp file here
            stats = hotshot.stats.load('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = p.parse(f)
            t1 = time.time()
            printOutput(p, document, opts)
            t2 = time.time()
            sys.stdout.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
        else:
            document = p.parse(f)
            printOutput(p, document, opts)
    finally:
        # Only a successfully opened file has close(); the string fallbacks
        # returned by _openInput do not.
        if hasattr(f, "close"):
            f.close()
def printOutput(parser, document, opts):
    """Write the results of a parse to stdout/stderr as selected by *opts*.

    parser   -- the parser that produced *document*; its
                ``tokenizer.stream.charEncoding``, ``tree`` and ``errors``
                attributes are read here.
    document -- the value returned by ``parser.parse``.
    opts     -- parsed command-line options (see ``getOptParser``).

    * ``opts.encoding``: print the character encoding reported by the stream.
    * unless ``opts.no_tree``: print the document in exactly one form, chosen
      in this order: ``opts.xml``, ``opts.html``, ``opts.hilite``, and
      otherwise the tree builder's ``testSerializer`` output.
    * ``opts.error``: print one line per entry of ``parser.errors`` to stderr.
    """
    if opts.encoding:
        # Written with sys.stdout.write like every other output below; the
        # one-element tuple keeps a tuple-valued encoding from being
        # unpacked by the % operator.
        sys.stdout.write("Encoding: %s\n"
                         % (parser.tokenizer.stream.charEncoding,))

    if not opts.no_tree:
        if opts.xml:
            sys.stdout.write(document.toxml("utf-8"))
        elif opts.html:
            walker = treewalkers.getTreeWalker(opts.treebuilder)
            tokens = walker(document)
            for text in serializer.HTMLSerializer().serialize(tokens, encoding='utf-8'):
                sys.stdout.write(text)
        elif opts.hilite:
            sys.stdout.write(document.hilite("utf-8"))
        else:
            sys.stdout.write(parser.tree.testSerializer(document).encode("utf-8"))

    if opts.error:
        # Each entry unpacks as (pos, message); pos must be a 2-tuple to
        # satisfy the two %i placeholders.
        errList = ["Line %i Col %i" % pos + " " + message
                   for pos, message in parser.errors]
        sys.stderr.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def getOptParser():
    """Build and return the ``OptionParser`` for this script.

    The module docstring is used as the usage text. Every flag is a boolean
    that defaults to False, except ``-b/--treebuilder``, which takes a string
    and defaults to ``"simpleTree"``.
    """
    parser = OptionParser(usage=__doc__)

    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the hotshot profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    # Long-only options: no short form is registered for these.
    parser.add_option("--no-tree", action="store_true", default=False,
                      dest="no_tree", help="Do not print output tree")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="simpleTree",
                      help="Name of the tree builder to use "
                      "(default: %default)")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")

    parser.add_option("--html", action="store_true", default=False,
                      dest="html", help="Output as html")

    parser.add_option("--hilite", action="store_true", default=False,
                      dest="hilite", help="Output as formatted highlighted code.")

    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    return parser
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    parse()