Speed up make-sqlite and trivial-validate.

jsha · jsha · commit 95db03b9ac93 · 2014-08-16T09:49:41.000-04:00
diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
@@ -2,24 +2,15 @@
 #
 # Builds an sqlite DB containing all the rulesets, indexed by target.
 
-import subprocess
+import glob
+import os
+import re
 import sqlite3
-import sys, re, os
+import subprocess
+import sys
 
 from lxml import etree
 
-def nomes_all(where=sys.argv[1:]):
-    """Returns generator to extract all files from a list of files/dirs"""
-    if not where: where=['.']
-    for i in where:
-        if os.path.isfile(i):
-            yield i
-        elif os.path.isdir(i):
-            for r, d, f in os.walk(i):
-                for fi in f:
-                    yield os.path.join(r, fi)
-
-
 conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
 c = conn.cursor()
 c.execute('''DROP TABLE IF EXISTS rulesets''')
@@ -39,28 +30,33 @@ def nomes_all(where=sys.argv[1:]):
 
 parser = etree.XMLParser(remove_blank_text=True)
 
+def nomes_all(where=sys.argv[1:]):
+    """Returns the list of paths matching src/chrome/content/rules/*.xml (`where` is unused)."""
+    return glob.glob('src/chrome/content/rules/*.xml')
+
+# Precompile xpath expressions that get run repeatedly.
+xpath_host = etree.XPath("/ruleset/target/@host")
+xpath_ruleset = etree.XPath("/ruleset")
+
 for fi in nomes_all():
     try:
         tree = etree.parse(fi, parser)
     except Exception as oops:
-        if fi[-4:] != ".xml":
-            continue
         print("%s failed XML validity: %s\n" % (fi, oops))
-    if not tree.xpath("/ruleset"):
-        continue
+        sys.exit(1)
 
     # Remove comments to save space.
     etree.strip_tags(tree,etree.Comment)
 
-    targets = tree.xpath("/ruleset/target/@host")
+    targets = xpath_host(tree)
     # Strip out the target tags. These aren't necessary in the DB because
     # targets are looked up in the target table, which has a foreign key
     # pointing into the ruleset table.
-    etree.strip_tags(tree,'target')
+    etree.strip_tags(tree, 'target')
 
     # Store the filename in the `f' attribute so "view source XML" for rules in
     # FF version can find it.
-    tree.xpath("/ruleset")[0].attrib["f"] = os.path.basename(fi).decode(encoding="UTF-8")
+    xpath_ruleset(tree)[0].attrib["f"] = os.path.basename(fi).decode(encoding="UTF-8")
 
     c.execute('''INSERT INTO rulesets (contents) VALUES(?)''', (etree.tostring(tree),));
     ruleset_id = c.lastrowid
diff --git a/utils/trivial-validate.py b/utils/trivial-validate.py
@@ -1,16 +1,12 @@
 #!/usr/bin/env python2.7
 
 import argparse
-import sys, re, os
+import os
+import re
 import sqlite3
+import sys
 
-try:
-    from lxml import etree
-except ImportError:
-    sys.stderr.write("** Could not import lxml!  Rule validation SKIPPED.\n")
-    sys.stderr.write("** Caution: A resulting build MAY CONTAIN INVALID RULES.\n")
-    sys.stderr.write("** Please install libxml2 and lxml to permit validation!\n")
-    sys.exit(0)
+from lxml import etree
 
 parser = argparse.ArgumentParser(
     formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -19,68 +15,69 @@
     default="",
     help="Ignore entries."
     )
-parser.add_argument('--dupdir', type=str, nargs="*",
-    default="",
-    help="Duplicate directory."
-    )
 parser.add_argument('--quiet', action="store_true",
     default=False, help="Suppress debug output."
     )
-parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
-    default="src/chrome/content/rules",
-    help='Directory of XML files to validate.')
+parser.add_argument('--db', type=str, nargs="*",
+    default=os.path.join(os.path.dirname(__file__),
+                         "../src/defaults/rulesets.sqlite"),
+    help='SQLite db with rules')
 
 args = parser.parse_args()
 
 ignoredups = [re.compile(val) for val in args.ignoredups]
-dupdir = [val for val in args.dupdir]
 quiet = args.quiet
 
 def warn(s):
-    if not quiet: sys.stdout.write("warning: %s\n" % s)
+    if not quiet:
+        sys.stdout.write("warning: %s\n" % s)
 
 def fail(s):
     sys.stdout.write("failure: %s\n" % s)
 
-def test_not_anchored(tree, fi):
+def test_not_anchored(tree, fi, host, fm, to):
     # Rules not anchored to the beginning of a line.
     """The 'from' rule is not anchored to beginning of line using the ^ symbol."""
-    for f in tree.xpath("/ruleset/rule/@from"):
+    for f in fm:
         if not f or f[0] != "^":
             return False
     return True
 
-def test_bad_regexp(tree, fi):
+# Precompile xpath expressions that get run repeatedly.
+xpath_exlusion_pattern = etree.XPath("/ruleset/exclusion/@pattern")
+xpath_cookie_pattern = etree.XPath("/ruleset/securecookie/@host")
+
+def test_bad_regexp(tree, fi, host, fm, to):
     # Rules with invalid regular expressions.
     """The 'from' rule contains an invalid extended regular expression."""
-    for f in tree.xpath("/ruleset/rule/@from") + \
-             tree.xpath("/ruleset/exclusion/@pattern") + \
-             tree.xpath("/ruleset/securecookie/@host"):
+    patterns = fm + xpath_exlusion_pattern(tree) + xpath_cookie_pattern(tree)
+    for pat in patterns:
         try:
-            re.compile(f)
+            re.compile(pat)
         except:
             return False
     return True
 
-def test_missing_to(tree, fi):
+xpath_rule = etree.XPath("/ruleset/rule")
+def test_missing_to(tree, fi, host, fm, to):
     # Rules that are terminated before setting 'to'.
     # These cases are probably either due to a misplaced
     # rule end or intended to be different elements.
     """Rule is missing a 'to' value."""
-    for rule in tree.xpath("/ruleset/rule"):
+    for rule in xpath_rule(tree):
         if not rule.get("to"):
             warn("'to' attribute missing in %s. " % fi)
             warn("Misplaced end or misnamed element?")
             return False
     return True
 
-def test_unescaped_dots(tree, fi):
+def test_unescaped_dots(tree, fi, host, fm, to):
     # Rules containing unescaped dots outside of brackets and before slash.
     # Note: this is meant to require example\.com instead of example.com,
     # but it also forbids things like .* which usually ought to be replaced
     # with something like ([^/:@\.]+)
     """The 'from' rule contains unescaped period in regular expression.  Try escaping it with a backslash."""
-    for f in tree.xpath("/ruleset/rule/@from"):
+    for f in fm:
         escaped = False
         bracketed = False
         s = re.sub("^\^https?://", "", f)
@@ -99,22 +96,22 @@ def test_unescaped_dots(tree, fi):
                escaped = False
     return True
 
-def test_space_in_to(tree, fi):
+def test_space_in_to(tree, fi, host, fm, to):
     # Rules where the to pattern contains a space.
     """The 'to' rule contains a space."""
-    for t in tree.xpath("/ruleset/rule/@to"):
+    for t in to:
         if ' ' in t:
             return False
     return True
 
-def test_unencrypted_to(tree, fi):
+def test_unencrypted_to(tree, fi, host, fm, to):
     # Rules that redirect to something other than https or http.
     # This used to test for http: but testing for lack of https: will
     # catch more kinds of mistakes.
     # Now warn if the rule author indicates they intended it, with the
     # downgrade attribute.  Error if this attribute is not present.
     """Rule redirects to something other than https."""
-    for rule in tree.xpath("/ruleset/rule"):
+    for rule in xpath_rule(tree):
         to, downgrade = rule.get("to"), rule.get("downgrade")
         if to[:6] != "https:" and to[:5] != "http:":
             return False
@@ -125,82 +122,58 @@ def test_unencrypted_to(tree, fi):
             return False
     return True
 
-def test_backslash_in_to(tree, fi):
+def test_backslash_in_to(tree, fi, host, fm, to):
     # Rules containing backslashes in to pattern.
     """The 'to' rule contains a backslash."""
-    for t in tree.xpath("/ruleset/rule/@to"):
+    for t in to:
         if '\\' in t:
             return False
     return True
 
-def test_no_trailing_slash(tree, fi):
+RE_TRAILING_SLASH = re.compile("//.*/")
+
+def test_no_trailing_slash(tree, fi, host, fm, to):
     # Rules not containing trailing slash in from or to pattern.
     """Rule omits forward slash after host name."""
-    for r in tree.xpath("/ruleset/rule"):
+    for r in xpath_rule(tree):
         f, t = r.get("from"), r.get("to")
-        if not re.search("//.*/", f):
+        if not RE_TRAILING_SLASH.search(f):
             return False
-        if not re.search("//.*/", t):
+        if not RE_TRAILING_SLASH.search(t):
             return False
     return True
 
-def test_lacks_target_host(tree, fi):
+def test_lacks_target_host(tree, fi, host, fm, to):
     # Rules that lack at least one target host (target tag with host attr).
     """Rule fails to specify at least one target host."""
-    return not not tree.xpath("/ruleset/target/@host")
+    return not not host
 
-def test_bad_target_host(tree, fi):
+def test_bad_target_host(tree, fi, host, fm, to):
     # Rules where a target host contains multiple wildcards or a slash.
     """The target host must be a hostname, not URL, and must use at most one wildcard."""
-    for target in tree.xpath("/ruleset/target/@host"):
+    for target in host:
         if "/" in target:
             return False
         if target.count("*") > 1:
             return False
     return True
 
-def test_duplicated_target_host(tree, fi):
+def test_duplicated_target_host(tree, fi, host, fm, to):
     # Rules where a single target host appears more than once.
     """Rule contains the same target host more than once."""
-    targets = tree.xpath("/ruleset/target/@host")
-    return len(set(targets)) == len(targets)
+    return len(set(host)) == len(host)
 
 printable_characters = set(map(chr, list(range(32, 127))))
 
-def test_non_ascii(tree, fi):
+def test_non_ascii(tree, fi, host, fm, to):
     # Rules containing non-printable characters.
     """Rule contains non-printable character in 'to' pattern."""
-    for t in tree.xpath("/ruleset/rule/@to"):
+    for t in to:
         for c in t:
             if c not in printable_characters:
                 return False
     return True
 
-def test_ruleset_name(tree):
-    """Rule has name"""
-    if tree.xpath("/ruleset/@name"):
-        return True
-    else:
-        return False
-
-def get_all_names_and_targets(ds):
-    """extract unique names and targets from a list of dirs of xml files"""
-    names = set()
-    targets = set()
-    for d in ds:
-        for fi in os.listdir(d):
-            fi = os.path.join(d, fi)
-            try:
-                tree = etree.parse(fi)
-                ruleset_name = tree.xpath("/ruleset/@name")[0]
-                target_names = tree.xpath("/ruleset/target/@host")
-            except Exception:
-                continue
-            names.add(ruleset_name)
-            for target in target_names:
-                targets.add(target)
-    return names, targets
-
 def nomes_all(where=sys.argv[1:]):
     """Returns generator to extract all files from a list of files/dirs"""
     if not where: where=['.']
@@ -212,34 +185,53 @@ def nomes_all(where=sys.argv[1:]):
                 for fi in f:
                     yield os.path.join(r, fi)
 
-tests = [test_not_anchored, test_bad_regexp, test_unescaped_dots, test_missing_to,
-         test_space_in_to, test_unencrypted_to, test_backslash_in_to,
-         test_no_trailing_slash,  test_bad_target_host,
-         test_duplicated_target_host, test_non_ascii]
+tests = [test_not_anchored,
+         test_bad_regexp,
+         test_unescaped_dots,
+         test_missing_to,
+         test_space_in_to,
+         test_unencrypted_to,
+         test_backslash_in_to,
+         test_no_trailing_slash,
+         test_bad_target_host,
+         test_duplicated_target_host,
+         test_non_ascii]
 
 failure = 0
 seen_file = False
 
-conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
+xpath_ruleset = etree.XPath("/ruleset")
+xpath_ruleset_name = etree.XPath("/ruleset/@name")
+xpath_ruleset_file = etree.XPath("/ruleset/@f")
+xpath_host = etree.XPath("/ruleset/target/@host")
+xpath_from = etree.XPath("/ruleset/rule/@from")
+xpath_to = etree.XPath("/ruleset/rule/@to")
+
+print args.db
+conn = sqlite3.connect(args.db)
 c = conn.cursor()
 for row in c.execute('''SELECT contents from rulesets'''):
     try:
         tree = etree.fromstring(row[0])
     except Exception as oops:
         failure = 1
         print("failed XML validity: %s\n" % (oops))
-    if failure or not tree.xpath("/ruleset"):
+    if failure or not xpath_ruleset(tree):
         continue
-    if not test_ruleset_name(tree):
+    rn = xpath_ruleset_name(tree)[0]
+    if not rn:
         failure = 1
         fail("unnamed ruleset")
         continue
-    ruleset_name = tree.xpath("/ruleset/@name")[0]
-    ruleset_file = tree.xpath("/ruleset/@f")[0]
+    rf = xpath_ruleset_name(tree)[0]
+    host = xpath_host(tree)
+    fm = xpath_from(tree)
+    to = xpath_to(tree)
     for test in tests:
-        if not test(tree, ruleset_file):
-            print("%s failed test: %s" % (ruleset_file, test.__doc__))
-    for target in tree.xpath("/ruleset/target/@host"):
+        if not test(tree, rf, host=host, fm=fm, to=to):
+            print("%s failed test: %s" % (rf, test.__doc__))
+    for target in xpath_host(tree):
+        print target
         if target in all_targets and not any(ign.search(target) for ign in ignoredups):
             # suppress warning about duplicate targets if an --ignoredups
             # pattern matches target