More cleanup

rubys · rubys · commit 52cda1afe66a · 2007-06-12T12:53:42.000Z
--HG--
rename : README => python/README
rename : parse.py => python/parse.py
rename : setup_base.py => python/setup_base.py
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40715
diff --git a/README b/README
@@ -0,0 +1,38 @@
+html5lib is a pure-python library for parsing HTML. It is designed to
+conform to the Web Applications 1.0 specification, which has
+formalized the error handling algorithms of popular web browsers.
+
+ = Installation =
+
+html5lib is packaged with distutils. To install it use:
+ $ python setup.py install
+
+ = Tests =
+
+You may wish to check that your installation has been a success by
+running the test suite. All the tests can be run by invoking
+runtests.py in the tests/ directory.
+
+ = Usage =
+
+Simple usage follows this pattern:
+
+import html5lib
+f = open("mydocument.html")
+parser = html5lib.HTMLParser()
+document = parser.parse(f)
+
+
+More documentation is available in the docstrings or from
+http://code.google.com/p/html5lib/wiki/UserDocumentation
+
+ = Bugs =
+
+Please report any bugs on the issue tracker:
+http://code.google.com/p/html5lib/issues/list
+
+ = Get Involved =
+
+Contributions to code or documentation are actively encouraged. Submit
+patches to the issue tracker or discuss changes on IRC in the #whatwg
+channel on freenode.net.
diff --git a/parse.py b/parse.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+"""usage: %prog [options] filename
+
+Parse a document to a simpletree tree, with optional profiling
+"""
+#RELEASE move ./examples/
+
+import sys
+import os
+from optparse import OptionParser
+
+#RELEASE remove
+from src import html5parser, liberalxmlparser
+from src import treebuilders, serializer, treewalkers
+#END RELEASE
+#RELEASE add
+#from html5lib import html5parser, liberalxmlparser
+#from html5lib import treebuilders, serializer, treewalkers
+#END RELEASE
+
+def parse():
+    optParser = getOptParser()
+    opts,args = optParser.parse_args()
+
+    try:
+        f = args[-1]
+        # Try opening from the internet
+        if f.startswith('http://'):
+            try:
+                import urllib
+                f = urllib.urlopen(f).read()
+            except: pass
+        else:
+            try:
+                # Try opening from file system
+                f = open(f)
+            except IOError: pass
+    except IndexError:
+        sys.stderr.write("No filename provided. Use -h for help\n")
+        sys.exit(1)
+
+    treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
+
+    if opts.xml:
+        p = liberalxmlparser.XHTMLParser(tree=treebuilder)
+    else:
+        p = html5parser.HTMLParser(tree=treebuilder)
+
+    if opts.profile:
+        import hotshot
+        import hotshot.stats
+        prof = hotshot.Profile('stats.prof')
+        prof.runcall(p.parse, f)
+        prof.close()
+        # XXX - We should use a temp file here
+        stats = hotshot.stats.load('stats.prof')
+        stats.strip_dirs()
+        stats.sort_stats('time')
+        stats.print_stats()
+    elif opts.time:
+        import time
+        t0 = time.time()
+        document = p.parse(f)
+        t1 = time.time()
+        printOutput(p, document, opts)
+        t2 = time.time()
+        sys.stdout.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+    else:
+        document = p.parse(f)
+        printOutput(p, document, opts)
+
+def printOutput(parser, document, opts):
+    if opts.encoding:
+        print "Encoding:", parser.tokenizer.stream.charEncoding
+    if not opts.no_tree:
+        if opts.xml:
+            sys.stdout.write(document.toxml("utf-8"))
+        elif opts.html:
+            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
+            for text in serializer.HTMLSerializer().serialize(tokens, encoding='utf-8'):
+                sys.stdout.write(text)
+        elif opts.hilite:
+            sys.stdout.write(document.hilite("utf-8"))
+        else:
+            sys.stdout.write(parser.tree.testSerializer(document).encode("utf-8"))
+    if opts.error:
+        errList=[]
+        for pos, message in parser.errors:
+            errList.append("Line %i Col %i"%pos + " " + message)
+        sys.stderr.write("\nParse errors:\n" + "\n".join(errList)+"\n")
+
+def getOptParser():
+    parser = OptionParser(usage=__doc__)
+
+    parser.add_option("-p", "--profile", action="store_true", default=False,
+                      dest="profile", help="Use the hotshot profiler to "
+                      "produce a detailed log of the run")
+    
+    parser.add_option("-t", "--time",
+                      action="store_true", default=False, dest="time",
+                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
+    
+    parser.add_option("", "--no-tree", action="store_true", default=False,
+                      dest="no_tree", help="Do not print output tree")
+    
+    parser.add_option("-b", "--treebuilder", action="store", type="string",
+                      dest="treebuilder", default="simpleTree")
+
+    parser.add_option("-e", "--error", action="store_true", default=False,
+                      dest="error", help="Print a list of parse errors")
+
+    parser.add_option("-x", "--xml", action="store_true", default=False,
+                      dest="xml", help="Output as xml")
+    
+    parser.add_option("", "--html", action="store_true", default=False,
+                      dest="html", help="Output as html")
+    
+    parser.add_option("", "--hilite", action="store_true", default=False,
+                      dest="hilite", help="Output as formatted highlighted code.")
+    
+    parser.add_option("-c", "--encoding", action="store_true", default=False,
+                      dest="encoding", help="Print character encoding used")
+    return parser
+
+# Run the command-line entry point only when executed as a script, not
+# when the module is imported.
+if __name__ == "__main__":
+    parse()
diff --git a/setup_base.py b/setup_base.py
@@ -0,0 +1,34 @@
+from distutils.core import setup
+
+# Long-form package description, passed to setup() as long_description.
+long_description="""HTML parser designed to follow the WHATWG HTML5 
+specification. The parser is designed to handle all flavours of HTML and 
+parses invalid documents using well-defined error handling rules compatible
+with the behaviour of major desktop web browsers.
+
+Output is to a tree structure; the current release supports output to
+a custom tree similar to DOM and to ElementTree.
+"""
+
+classifiers=[
+    'Development Status :: %(status)s',
+    'Intended Audience :: Developers',
+    'License :: OSI Approved :: MIT License',
+    'Operating System :: OS Independent',
+    'Programming Language :: Python',
+    'Topic :: Software Development :: Libraries :: Python Modules',
+    'Topic :: Text Processing :: Markup :: HTML'
+    ],
+
+setup(name='html5lib',
+      version='%(version)s',
+      url='http://code.google.com/p/html5lib/',
+      license="MIT License",
+      description='HTML parser based on the WHAT-WG Web Applications 1.0' 
+                  '("HTML5") specifcation',
+      long_description=long_description,
+      classifiers=classifiers,
+      maintainer='James Graham',
+      maintainer_email='jg307@cam.ac.uk',
+      packages=['html5lib', 'html5lib.treebuilders'],
+      package_dir = {'html5lib': 'src'}
+      )