#!/usr/bin/env python2.7
"""Ruleset validation script.

Reads every ruleset stored in the SQLite database given by --db, runs a
list of sanity tests against each one, then checks the target hosts.
The process exits with status 1 if any check failed and 0 otherwise.
"""

import argparse
import os
import re
import sqlite3
import sys

from lxml import etree

parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Ruleset validation script.")
parser.add_argument('--quiet', action="store_true", default=False,
                    help="Suppress debug output.")
parser.add_argument('--db',
                    default=os.path.join(os.path.dirname(__file__),
                                         "../src/defaults/rulesets.sqlite"),
                    help='SQLite db with rules')

args = parser.parse_args()

quiet = args.quiet


def warn(s):
    """Write a warning line to stdout unless --quiet was given."""
    if not quiet:
        sys.stdout.write("warning: %s\n" % s)


def fail(s):
    """Write a failure line to stdout (never suppressed by --quiet)."""
    sys.stdout.write("failure: %s\n" % s)


# Precompile xpath expressions that get run repeatedly.
xpath_exclusion_pattern = etree.XPath("/ruleset/exclusion/@pattern")
xpath_cookie_host_pattern = etree.XPath("/ruleset/securecookie/@host")
xpath_cookie_name_pattern = etree.XPath("/ruleset/securecookie/@name")

# Matches the literal "^http://" or "^https://" prefix of a 'from' pattern.
# Raw string: "\^" is not a valid escape in an ordinary string literal.
_scheme_prefix_re = re.compile(r"^\^https?://")

# Load lists of ruleset names whitelisted for downgrade & duplicate rules.
# Each whitelist file is read as one entry per line.
thispath = os.path.dirname(os.path.realpath(__file__))
with open(thispath + '/downgrade-whitelist.txt') as downgrade_fh:
    downgrade_allowed_list = [x.rstrip('\n') for x in downgrade_fh]
with open(thispath + '/duplicate-whitelist.txt') as duplicate_fh:
    duplicate_allowed_list = [x.rstrip('\n') for x in duplicate_fh]


# NOTE: every test_* function below takes (tree, rulename, from_attrib, to)
# and returns True when the ruleset passes.  Its docstring is printed as the
# failure message by the main loop, so the docstrings are user-facing text
# and explanatory notes are kept in comments instead.

def test_bad_regexp(tree, rulename, from_attrib, to):
    # Rules with invalid regular expressions.
    """The rule contains an invalid extended regular expression."""
    # Every regex-valued attribute is checked: rule 'from' patterns,
    # exclusion patterns and both securecookie attributes.
    patterns = (from_attrib + xpath_exclusion_pattern(tree) +
                xpath_cookie_host_pattern(tree) +
                xpath_cookie_name_pattern(tree))
    for pat in patterns:
        try:
            re.compile(pat)
        except Exception:
            # Any compile failure marks the ruleset as bad, but unlike a
            # bare "except:" this lets KeyboardInterrupt/SystemExit through.
            return False
    return True


def unescaped_dot(s):
    """Return True if s has a '.' that is neither escaped nor bracketed.

    Only the part of s before the first '/' outside of [...] is scanned.
    A backslash toggles the escaped state; any ordinary character clears it.
    """
    escaped = False
    bracketed = False
    for c in s:
        if c == "\\":
            escaped = not escaped
        elif not escaped and c == "[":
            bracketed = True
        elif not escaped and c == "]":
            bracketed = False
        elif not escaped and not bracketed and c == ".":
            return True
        elif not bracketed and c == "/":
            # Stop at the first slash outside brackets.
            break
        else:
            escaped = False
    return False


def test_unescaped_dots(tree, rulename, from_attrib, to):
    # Rules containing unescaped dots outside of brackets and before slash.
    # Note: this is meant to require example\.com instead of example.com,
    # but it also forbids things like .* which usually ought to be replaced
    # with something like ([^/:@\.]+)
    """The 'from' rule contains unescaped period in regular expression. Try escaping it with a backslash."""
    for f in from_attrib:
        # Drop the scheme prefix so its "//" does not end the scan early.
        s = _scheme_prefix_re.sub("", f)
        if unescaped_dot(s):
            return False
    return True


def test_unescaped_dots_in_exclusion(tree, rulename, from_attrib, to):
    """The 'exclusion' tag contains unescaped period in regular expression. Try escaping it with a backslash."""
    # Reuse the precompiled expression rather than rebuilding it per call.
    for f in xpath_exclusion_pattern(tree):
        if unescaped_dot(f):
            return False
    return True


xpath_rule = etree.XPath("/ruleset/rule")


def test_unencrypted_to(tree, rulename, from_attrib, to):
    # Rules that redirect to something other than https or http.
    # This used to test for http: but testing for lack of https: will
    # catch more kinds of mistakes.
    # Now warn if the rule author indicates they intended it, with the
    # downgrade attribute. Error if this attribute is not present.
    """Rule redirects to something other than https."""
    for rule in xpath_rule(tree):
        # A rule without a 'to' attribute yields None; treat it as an
        # empty target so it fails the test instead of raising TypeError.
        target = rule.get("to") or ""
        downgrade = rule.get("downgrade")
        if target.startswith("https:"):
            continue
        if not target.startswith("http:"):
            return False
        # From here on the rule redirects to plain http.
        if not downgrade:
            fail("non-downgrade rule in %s redirects to http." % rulename)
            return False
        if rulename in downgrade_allowed_list:
            warn("whitelisted downgrade rule in %s redirects to http."
                 % rulename)
        else:
            fail("non-whitelisted downgrade rule in %s redirects to http."
                 % rulename)
            return False
    return True


# Characters with code points 32..126 inclusive.
printable_characters = set(map(chr, range(32, 127)))


def test_non_ascii(tree, rulename, from_attrib, to):
    # Rules containing non-printable characters.
    """Rule contains non-printable character in 'to' pattern."""
    return all(c in printable_characters for t in to for c in t)


def is_valid_target_host(host):
    # Rules where a target host contains multiple wildcards or a slash.
    # The docstring below is printed as the failure message by the host loop.
    """The target host must be a hostname, not URL, and must use at most one wildcard."""
    return "/" not in host and host.count("*") <= 1


def nomes_all(where=sys.argv[1:]):
    """Returns generator to extract all files from a list of files/dirs"""
    # The default is captured once, when the function is defined.
    # An empty list falls back to walking the current directory.
    if not where:
        where = ['.']
    for i in where:
        if os.path.isfile(i):
            yield i
        elif os.path.isdir(i):
            for r, d, f in os.walk(i):
                for filename in f:
                    yield os.path.join(r, filename)


tests = [
    test_bad_regexp,
    test_unescaped_dots,
    test_unescaped_dots_in_exclusion,
    test_unencrypted_to,
    test_non_ascii,
]

failure = 0
seen_file = False

xpath_ruleset = etree.XPath("/ruleset")
xpath_ruleset_name = etree.XPath("/ruleset/@name")
xpath_ruleset_file = etree.XPath("/ruleset/@f")
xpath_host = etree.XPath("/ruleset/target/@host")
xpath_from = etree.XPath("/ruleset/rule/@from")
xpath_to = etree.XPath("/ruleset/rule/@to")

conn = sqlite3.connect(args.db)
c = conn.cursor()

# Pass 1: run every test against each stored ruleset.
for row in c.execute('''SELECT contents from rulesets'''):
    try:
        tree = etree.fromstring(row[0])
    except Exception as oops:
        failure = 1
        print("failed XML validity: %s\n" % (oops))
        # There is no parsed tree for this row; going on would either raise
        # NameError or silently re-test the previous ruleset.
        continue
    if not xpath_ruleset(tree):
        continue
    # The xpath result is an empty list when the attribute is absent.
    names = xpath_ruleset_name(tree)
    rn = names[0] if names else None
    if not rn:
        failure = 1
        fail("unnamed ruleset")
        continue
    # Fall back to the ruleset name when the 'f' attribute is missing, so
    # failure messages can still identify the ruleset.
    files = xpath_ruleset_file(tree)
    rf = files[0] if files else rn
    from_attrib = xpath_from(tree)
    to = xpath_to(tree)
    for test in tests:
        if not test(tree, rn, from_attrib=from_attrib, to=to):
            failure = 1
            fail("%s failed test: %s" % (rf, test.__doc__))

# Pass 2: check each distinct target host and how many rows list it.
for (host, count) in c.execute('''
  select host, count(host) as c from targets group by host;'''):
    if count > 1:
        if host in duplicate_allowed_list:
            warn("Whitelisted hostname %s shows up in %d different rulesets."
                 % (host, count))
        else:
            failure = 1
            fail("Hostname %s shows up in %d different rulesets."
                 % (host, count))
    if not is_valid_target_host(host):
        failure = 1
        fail("%s failed: %s" % (host, is_valid_target_host.__doc__))

conn.close()
sys.exit(failure)