66
77import sys
88import os
9+ import traceback
910from optparse import OptionParser
1011
1112from html5lib import html5parser , sanitizer
@@ -48,10 +49,7 @@ def parse():
4849 else :
4950 tokenizer = HTMLTokenizer
5051
51- if opts .log :
52- html5parser .debug_log = True
53-
54- p = html5parser .HTMLParser (tree = treebuilder , tokenizer = tokenizer )
52+ p = html5parser .HTMLParser (tree = treebuilder , tokenizer = tokenizer , debug = opts .log )
5553
5654 if opts .fragment :
5755 parseMethod = p .parseFragment
@@ -73,46 +71,54 @@ def parse():
7371 elif opts .time :
7472 import time
7573 t0 = time .time ()
76- document = parseMethod ( f , encoding = encoding )
74+ document = run ( parseMethod , f , encoding )
7775 t1 = time .time ()
7876 printOutput (p , document , opts )
7977 t2 = time .time ()
8078 sys .stderr .write ("\n \n Run took: %fs (plus %fs to print the output)" % (t1 - t0 , t2 - t1 ))
8179 else :
82- document = parseMethod ( f , encoding = encoding )
80+ document = run ( parseMethod , f , encoding )
8381 printOutput (p , document , opts )
8482
def run(parseMethod, f, encoding):
    """Call ``parseMethod`` on ``f`` and report a failure instead of raising.

    ``parseMethod`` is invoked as ``parseMethod(f, encoding=encoding)``.

    Returns the value produced by ``parseMethod``, or ``None`` when it
    raised an ordinary exception; in that case the traceback is printed
    with ``traceback.print_exc()`` so the caller can still emit whatever
    output does not depend on the document.

    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught,
    so the user can still abort a long-running parse.
    """
    try:
        return parseMethod(f, encoding=encoding)
    except Exception:
        # A bare ``except:`` would also trap Ctrl-C and sys.exit(); only
        # genuine parse errors should be reduced to "no document".
        traceback.print_exc()
        return None
90+
8591def printOutput (parser , document , opts ):
8692 if opts .encoding :
8793 print "Encoding:" , parser .tokenizer .stream .charEncoding
8894
89- if opts .log :
90- for item in parser . log :
91- print item
92-
93- if opts .xml :
94- sys .stdout .write (document .toxml ("utf-8" ))
95- elif opts .tree :
96- if not hasattr (document ,'__getitem__' ):
97- document = [document ]
98- for fragment in document :
99- print parser .tree .testSerializer (fragment ).encode ("utf-8" )
100- elif opts .hilite :
101- sys .stdout .write (document .hilite ("utf-8" ))
102- elif opts .html :
103- kwargs = {}
104- for opt in serializer .HTMLSerializer .options :
105- try :
106- kwargs [opt ] = getattr (opts ,opt )
107- except :
108- pass
109- if not kwargs ['quote_char' ]:
110- del kwargs ['quote_char' ]
111-
112- tokens = treewalkers .getTreeWalker (opts .treebuilder )(document )
113- for text in serializer .HTMLSerializer (** kwargs ).serialize (tokens , encoding = 'utf-8' ):
114- sys .stdout .write (text )
115- if not text .endswith ('\n ' ): sys .stdout .write ('\n ' )
95+ for item in parser .log :
96+ print item
97+
98+ if document is not None :
99+ if opts .xml :
100+ sys .stdout .write (document .toxml ("utf-8" ))
101+ elif opts .tree :
102+ if not hasattr (document ,'__getitem__' ):
103+ document = [document ]
104+ for fragment in document :
105+ print parser .tree .testSerializer (fragment ).encode ("utf-8" )
106+ elif opts .hilite :
107+ sys .stdout .write (document .hilite ("utf-8" ))
108+ elif opts .html :
109+ kwargs = {}
110+ for opt in serializer .HTMLSerializer .options :
111+ try :
112+ kwargs [opt ] = getattr (opts ,opt )
113+ except :
114+ pass
115+ if not kwargs ['quote_char' ]:
116+ del kwargs ['quote_char' ]
117+
118+ tokens = treewalkers .getTreeWalker (opts .treebuilder )(document )
119+ for text in serializer .HTMLSerializer (** kwargs ).serialize (tokens , encoding = 'utf-8' ):
120+ sys .stdout .write (text )
121+ if not text .endswith ('\n ' ): sys .stdout .write ('\n ' )
116122 if opts .error :
117123 errList = []
118124 for pos , errorcode , datavars in parser .errors :
0 commit comments