#!/usr/bin/env python """usage: %prog [options] filename Parse a document to a tree, with optional profiling """ import sys import traceback from optparse import OptionParser from html5lib import html5parser from html5lib import treebuilders, serializer, treewalkers from html5lib import constants from html5lib import _utils def parse(): optParser = getOptParser() opts, args = optParser.parse_args() encoding = "utf8" try: f = args[-1] # Try opening from the internet if f.startswith('http://'): try: import urllib.request import urllib.parse import urllib.error import cgi f = urllib.request.urlopen(f) contentType = f.headers.get('content-type') if contentType: (mediaType, params) = cgi.parse_header(contentType) encoding = params.get('charset') except: pass elif f == '-': f = sys.stdin if sys.version_info[0] >= 3: encoding = None else: try: # Try opening from file system f = open(f, "rb") except IOError as e: sys.stderr.write("Unable to open file: %s\n" % e) sys.exit(1) except IndexError: sys.stderr.write("No filename provided. Use -h for help\n") sys.exit(1) treebuilder = treebuilders.getTreeBuilder(opts.treebuilder) p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log) if opts.fragment: parseMethod = p.parseFragment else: parseMethod = p.parse if opts.profile: import cProfile import pstats cProfile.runctx("run(parseMethod, f, encoding, scripting)", None, {"run": run, "parseMethod": parseMethod, "f": f, "encoding": encoding, "scripting": opts.scripting}, "stats.prof") # XXX - We should use a temp file here stats = pstats.Stats('stats.prof') stats.strip_dirs() stats.sort_stats('time') stats.print_stats() elif opts.time: import time t0 = time.time() document = run(parseMethod, f, encoding, opts.scripting) t1 = time.time() if document: printOutput(p, document, opts) t2 = time.time() sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1)) else: sys.stderr.write("\n\nRun took: %fs" % (t1 - t0)) else: document = run(parseMethod, f, encoding, opts.scripting) if document: printOutput(p, document, opts) def run(parseMethod, f, encoding, scripting): try: document = parseMethod(f, override_encoding=encoding, scripting=scripting) except: document = None traceback.print_exc() return document def printOutput(parser, document, opts): if opts.encoding: print("Encoding:", parser.tokenizer.stream.charEncoding) for item in parser.log: print(item) if document is not None: if opts.xml: tb = opts.treebuilder.lower() if tb == "dom": document.writexml(sys.stdout, encoding="utf-8") elif tb == "lxml": import lxml.etree sys.stdout.write(lxml.etree.tostring(document, encoding="unicode")) elif tb == "etree": sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode")) elif opts.tree: if not hasattr(document, '__getitem__'): document = [document] for fragment in document: print(parser.tree.testSerializer(fragment)) elif opts.html: kwargs = {} for opt in serializer.HTMLSerializer.options: try: kwargs[opt] = getattr(opts, opt) except: pass if not kwargs['quote_char']: del kwargs['quote_char'] if opts.sanitize: kwargs["sanitize"] = True tokens = treewalkers.getTreeWalker(opts.treebuilder)(document) if sys.version_info[0] >= 3: encoding = None else: encoding = "utf-8" for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding): sys.stdout.write(text) if not text.endswith('\n'): sys.stdout.write('\n') if opts.error: errList = [] for pos, errorcode, datavars in parser.errors: errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n") def getOptParser(): parser = OptionParser(usage=__doc__) parser.add_option("-p", "--profile", action="store_true", default=False, dest="profile", help="Use the hotshot profiler to " "produce a detailed log of the run") parser.add_option("-t", "--time", action="store_true", default=False, dest="time", help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)") parser.add_option("-b", "--treebuilder", action="store", type="string", dest="treebuilder", default="etree") parser.add_option("-e", "--error", action="store_true", default=False, dest="error", help="Print a list of parse errors") parser.add_option("-f", "--fragment", action="store_true", default=False, dest="fragment", help="Parse as a fragment") parser.add_option("-s", "--scripting", action="store_true", default=False, dest="scripting", help="Handle noscript tags as if scripting was enabled") parser.add_option("", "--tree", action="store_true", default=False, dest="tree", help="Output as debug tree") parser.add_option("-x", "--xml", action="store_true", default=False, dest="xml", help="Output as xml") parser.add_option("", "--no-html", action="store_false", default=True, dest="html", help="Don't output html") parser.add_option("-c", "--encoding", action="store_true", default=False, dest="encoding", help="Print character encoding used") parser.add_option("", "--inject-meta-charset", action="store_true", default=False, dest="inject_meta_charset", help="inject ") parser.add_option("", "--strip-whitespace", action="store_true", default=False, dest="strip_whitespace", help="strip whitespace") parser.add_option("", "--omit-optional-tags", action="store_true", default=False, dest="omit_optional_tags", help="omit optional tags") parser.add_option("", "--quote-attr-values", action="store_true", default=False, dest="quote_attr_values", help="quote attribute values") parser.add_option("", "--use-best-quote-char", action="store_true", default=False, dest="use_best_quote_char", help="use best quote character") parser.add_option("", "--quote-char", action="store", default=None, dest="quote_char", help="quote character") parser.add_option("", "--no-minimize-boolean-attributes", action="store_false", default=True, dest="minimize_boolean_attributes", help="minimize boolean attributes") parser.add_option("", "--use-trailing-solidus", action="store_true", default=False, dest="use_trailing_solidus", help="use trailing solidus") parser.add_option("", "--space-before-trailing-solidus", action="store_true", default=False, dest="space_before_trailing_solidus", help="add space before trailing solidus") parser.add_option("", "--escape-lt-in-attrs", action="store_true", default=False, dest="escape_lt_in_attrs", help="escape less than signs in attribute values") parser.add_option("", "--escape-rcdata", action="store_true", default=False, dest="escape_rcdata", help="escape rcdata element values") parser.add_option("", "--sanitize", action="store_true", default=False, dest="sanitize", help="sanitize") parser.add_option("-l", "--log", action="store_true", default=False, dest="log", help="log state transitions") return parser if __name__ == "__main__": parse()