Skip to content

Commit 8c8eb7e

Browse files
committed
Changes to parse.py to make the command-line interface more sane; run parse.py -h for usage. Also rolls in a fix to the import in treebuilders.base so that both the parser and the unit tests should work, plus a few miscellaneous changes there that should be in a different patch.
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40370
1 parent f34b500 commit 8c8eb7e

4 files changed

Lines changed: 64 additions & 46 deletions

File tree

parse.py

Lines changed: 48 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1+
"""usage: %prog [options] filename
2+
3+
Parse a document to a DOMlite tree, with optional profiling
14
"""
2-
Usage:
3-
python parse.py tests/sites/web-apps.htm > outputfile
4-
To parse the file web-apps.htm and get a tree.
5-
6-
python parse.py tests/sites/web-apps.htm x > outputfile
7-
To parse the file web-apps.htm and get a profile.
8-
"""
5+
96
import sys
107
import os
8+
from optparse import OptionParser
119

1210
from src import parser
1311

@@ -23,30 +21,48 @@ def convertTreeDump(treedump):
2321
rv.append(line)
2422
return "\n".join(rv)
2523

26-
if __name__ == "__main__":
24+
def parse():
25+
optParser = getOptParser()
26+
opts,args = optParser.parse_args()
27+
2728
p = parser.HTMLParser()
28-
if len(sys.argv) > 1:
29-
x = sys.argv[1]
30-
if len(sys.argv) > 2:
31-
import hotshot
32-
import hotshot.stats
33-
prof = hotshot.Profile('stats.prof')
34-
prof.runcall(p.parse, x, False)
35-
prof.close()
36-
stats = hotshot.stats.load('stats.prof')
37-
stats.strip_dirs()
38-
stats.sort_stats('time')
39-
stats.print_stats()
40-
else:
41-
from time import time
42-
t = time()
43-
document = p.parse(x)
44-
t = time() - t
45-
t2 = time()
46-
print convertTreeDump(document.printTree())
47-
t2 = time() - t2
48-
print "\n\nDuration:", t, "\nTree dump duration:", t2
29+
f = open(args[0])
30+
if opts.profile:
31+
import hotshot
32+
import hotshot.stats
33+
prof = hotshot.Profile('stats.prof')
34+
prof.runcall(p.parse, f, False)
35+
prof.close()
36+
#XXX - We should use a temp file here
37+
stats = hotshot.stats.load('stats.prof')
38+
stats.strip_dirs()
39+
stats.sort_stats('time')
40+
stats.print_stats()
41+
elif opts.time:
42+
import time
43+
t0 = time.time()
44+
document = p.parse(f)
45+
t1 = time.time()
46+
print convertTreeDump(document.printTree())
47+
t2 = time.time()
48+
print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
4949
else:
50-
print """Pass one argument to parse the document and two to get an
51-
indication on what's going on.
52-
"""
50+
document = p.parse(f)
51+
print convertTreeDump(document.printTree())
52+
53+
def getOptParser():
54+
parser = OptionParser(usage=__doc__)
55+
56+
parser.add_option("-p", "--profile", action="store_true", default=False,
57+
dest="profile", help="Use the hotdhot profiler to "
58+
"produce a detailed log of the run")
59+
60+
parser.add_option("-t", "--time",
61+
action="store_true", default=False, dest="time",
62+
help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
63+
64+
return parser
65+
66+
if __name__ == "__main__":
67+
print os.path.abspath(os.curdir)
68+
parse()

src/parser.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,6 @@ def parse(self, stream, innerHTML=False):
5959
# assertions
6060
self.innerHTML = innerHTML
6161

62-
# Flag indicating special insertion mode from elements misnested inside
63-
# a table
64-
self.insertFromTable = False
65-
6662
self.tokenizer = tokenizer.HTMLTokenizer(stream)
6763

6864
# XXX This is temporary for the moment so there isn't any other
@@ -84,7 +80,7 @@ def parse(self, stream, innerHTML=False):
8480
# When the loop finishes it's EOF
8581
self.phase.processEOF()
8682

87-
return self.tree.document
83+
return self.tree.getDocument()
8884

8985
def parseError(self):
9086
if self.strict:

src/treebuilders/base.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
import sys
22
import os
33

4-
# XXX someone please fix this up! And make sure it doesn't break Windows.
5-
from src.constants import contentModelFlags, spaceCharacters
6-
from src.constants import scopingElements, formattingElements, specialElements
7-
from src.constants import headingElements, tableInsertModeElements
4+
#Insert the parent directory of the current file into the path
5+
_curDir = os.path.abspath(os.curdir)
6+
os.chdir(os.path.dirname(__file__))
7+
sys.path.insert(0, os.path.abspath(os.pardir))
8+
os.chdir(_curDir)
9+
del _curDir
10+
11+
from constants import contentModelFlags, spaceCharacters
12+
from constants import scopingElements, formattingElements, specialElements
13+
from constants import headingElements, tableInsertModeElements
814

915
# The scope markers are inserted when entering buttons, object elements,
1016
# marquees, table cells, and table captions, and are used to prevent formatting
@@ -224,3 +230,7 @@ def generateImpliedEndTags(self, exclude=None):
224230
# we should keep it in.
225231
# self.processEndTag(name)
226232
self.generateImpliedEndTags(exclude)
233+
234+
def getDocument(self):
235+
"Return the final tree"
236+
return self.document

tests/test_tokenizer.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def processParseError(self, token):
5151

5252
def processAtheistParseError(self, token):
5353
"""This error is not an error"""
54-
self.outputTokens.append(u"AtheistParseError")
54+
pass
5555

5656
def concatenateCharacterTokens(tokens):
5757
outputTokens = []
@@ -118,8 +118,4 @@ def main():
118118
unittest.main()
119119

120120
if __name__ == "__main__":
121-
#Allow us to import the parent module
122-
os.chdir(os.path.split(os.path.abspath(__file__))[0])
123-
sys.path.insert(0, os.path.abspath(os.pardir))
124-
125121
main()

0 commit comments

Comments
 (0)