#!/usr/bin/python2.7

import argparse
import sys, re, os

try:
    from lxml import etree
except ImportError:
    sys.stderr.write("** Could not import lxml!  Rule validation SKIPPED.\n")
    sys.stderr.write("** Caution: A resulting build MAY CONTAIN INVALID RULES.\n")
    sys.stderr.write("** Please install libxml2 and lxml to permit validation!\n")
    sys.exit(0)

# Command-line interface.  Only --ignoredups, --dupdir and --quiet are read
# back from the parsed namespace below.
# NOTE(review): nothing visible in this file reads args.ruleset; the files
# that get checked come from nomes_all(), whose default is sys.argv[1:].
parser = argparse.ArgumentParser(
    description="Ruleset validation script.",
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
    '--ignoredups',
    type=str,
    nargs="*",
    default="",
    help="Ignore entries.",
)
parser.add_argument(
    '--dupdir',
    type=str,
    nargs="*",
    default="",
    help="Duplicate directory.",
)
parser.add_argument(
    '--quiet',
    action="store_true",
    default=False,
    help="Suppress debug output.",
)
parser.add_argument(
    'ruleset',
    metavar='XML directory',
    type=str,
    nargs="*",
    default="src/chrome/content/rules",
    help='Directory of XML files to validate.',
)

args = parser.parse_args()

# Compiled patterns; a duplicate-target warning is suppressed when any of
# them matches the target host.
ignoredups = list(map(re.compile, args.ignoredups))
# Directories whose ruleset names/targets seed the duplicate detection.
dupdir = list(args.dupdir)
# When true, warn() prints nothing.
quiet = args.quiet

def warn(s):
    """Write s to stdout as a "warning: ..." line unless `quiet` is set."""
    if quiet:
        return
    sys.stdout.write("warning: %s\n" % s)

def fail(s):
    """Write s to stdout as a "failure: ..." line.

    Unlike warn(), the message is printed regardless of `quiet`.  This only
    reports; it does not change the exit status by itself.
    """
    line = "failure: %s\n" % s
    sys.stdout.write(line)

def test_not_anchored(tree):
    # Passes only when every rule's 'from' pattern starts with "^"
    # (an empty pattern therefore fails).
    # The docstring is printed as the failure message by the driver loop.
    """The 'from' rule is not anchored to beginning of line using the ^ symbol."""
    from_patterns = tree.xpath("/ruleset/rule/@from")
    return all(pattern.startswith("^") for pattern in from_patterns)

def test_bad_regexp(tree):
    # Rules with invalid regular expressions.
    # Checked attributes: rule 'from', exclusion 'pattern' and securecookie
    # 'host' (the failure message below only mentions 'from').
    # The docstring is printed as the failure message by the driver loop.
    """The 'from' rule contains an invalid extended regular expression."""
    queries = (
        "/ruleset/rule/@from",
        "/ruleset/exclusion/@pattern",
        "/ruleset/securecookie/@host",
    )
    for query in queries:
        for pattern in tree.xpath(query):
            try:
                re.compile(pattern)
            except Exception:
                # Any compile error means the pattern is unusable.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate instead of being reported as a bad
                # regexp.
                return False
    return True

def test_missing_to(tree):
    # Rules that are terminated before setting 'to'.
    # These cases are probably either due to a misplaced
    # rule end or intended to be different elements.
    # Returns False at the first rule whose 'to' attribute is absent or
    # empty, after emitting two warnings.
    # The docstring is printed as the failure message by the driver loop.
    """Rule is missing a 'to' value."""
    for rule in tree.xpath("/ruleset/rule"):
        if not rule.get("to"):
            # 'fi' is the module-level loop variable naming the file that is
            # currently being validated.
            warn("'to' attribute missing in %s. " % fi)
            warn("Misplaced end or misnamed element?")
            return False
    return True

def test_unescaped_dots(tree):
    # Rules containing unescaped dots outside of brackets and before slash.
    # Note: this is meant to require example\.com instead of example.com,
    # but it also forbids things like .* which usually ought to be replaced
    # with something like ([^/:@\.]+)
    # The docstring is printed as the failure message by the driver loop.
    """The 'from' rule contains unescaped period in regular expression.  Try escaping it with a backslash."""
    for pattern in tree.xpath("/ruleset/rule/@from"):
        # Drop a leading "^http://" / "^https://" so scanning starts at the
        # host part of the pattern.
        remainder = re.sub(r"^\^https?://", "", pattern)
        after_backslash = False
        in_brackets = False
        for ch in remainder:
            if ch == "\\":
                # A second consecutive backslash cancels the first.
                after_backslash = not after_backslash
                continue
            if after_backslash:
                # An escaped character is literal, but a "/" outside
                # brackets still ends the scan of this pattern.
                if ch == "/" and not in_brackets:
                    break
                after_backslash = False
                continue
            if ch == "[":
                in_brackets = True
            elif ch == "]":
                in_brackets = False
            elif in_brackets:
                # Anything inside a character class is ignored.
                pass
            elif ch == ".":
                return False
            elif ch == "/":
                # Only the part before the first slash is checked.
                break
    return True

def test_space_in_to(tree):
    # Passes when no rule's 'to' value contains a space character.
    # The docstring is printed as the failure message by the driver loop.
    """The 'to' rule contains a space."""
    destinations = tree.xpath("/ruleset/rule/@to")
    return not any(' ' in destination for destination in destinations)

def test_unencrypted_to(tree):
    # Rules that redirect to something other than https or http.
    # This used to test for http: but testing for lack of https: will
    # catch more kinds of mistakes.
    # Now warn if the rule author indicates they intended it, with the
    # downgrade attribute.  Error if this attribute is not present.
    # The docstring is printed as the failure message by the driver loop.
    """Rule redirects to something other than https."""
    for rule in tree.xpath("/ruleset/rule"):
        # A rule without a 'to' attribute yields None; treat it as an empty
        # destination so it fails this check instead of raising TypeError.
        to = rule.get("to") or ""
        downgrade = rule.get("downgrade")
        if to.startswith("https:"):
            continue
        if not to.startswith("http:"):
            # Neither https nor http.
            return False
        # 'fi' is the module-level loop variable naming the file that is
        # currently being validated.
        if downgrade:
            warn("downgrade rule in %s redirects to http." % fi)
        else:
            fail("non-downgrade rule in %s redirects to http." % fi)
            return False
    return True

def test_backslash_in_to(tree):
    # Passes when no rule's 'to' value contains a backslash.
    # The docstring is printed as the failure message by the driver loop.
    """The 'to' rule contains a backslash."""
    destinations = tree.xpath("/ruleset/rule/@to")
    return all('\\' not in destination for destination in destinations)

def test_no_trailing_slash(tree):
    # Rules not containing trailing slash in from or to pattern.
    # Both attributes must contain "//" followed later by another "/".
    # The docstring is printed as the failure message by the driver loop.
    """Rule omits forward slash after host name."""
    has_slash_after_host = re.compile("//.*/").search
    for rule in tree.xpath("/ruleset/rule"):
        for attribute in ("from", "to"):
            # A missing attribute yields None; treat it as empty so the rule
            # fails this check instead of raising TypeError in the regex
            # search.
            value = rule.get(attribute) or ""
            if not has_slash_after_host(value):
                return False
    return True

def test_lacks_target_host(tree):
    # Passes when the ruleset has at least one <target> with a host
    # attribute.
    # The docstring is printed as the failure message by the driver loop.
    """Rule fails to specify at least one target host."""
    target_hosts = tree.xpath("/ruleset/target/@host")
    return bool(target_hosts)

def test_bad_target_host(tree):
    # Passes when every target host is free of "/" and contains at most
    # one "*".
    # The docstring is printed as the failure message by the driver loop.
    """The target host must be a hostname, not URL, and must use at most one wildcard."""
    return all(
        "/" not in host and host.count("*") <= 1
        for host in tree.xpath("/ruleset/target/@host")
    )

def test_duplicated_target_host(tree):
    # Passes when no target host value occurs twice within this ruleset.
    # The docstring is printed as the failure message by the driver loop.
    """Rule contains the same target host more than once."""
    seen_hosts = set()
    for host in tree.xpath("/ruleset/target/@host"):
        if host in seen_hosts:
            return False
        seen_hosts.add(host)
    return True

# Characters with code points 32 through 126 inclusive.
printable_characters = set(chr(code) for code in range(32, 127))

def test_non_ascii(tree):
    # Passes when every character of every rule's 'to' value is in
    # printable_characters.
    # The docstring is printed as the failure message by the driver loop.
    """Rule contains non-printable character in 'to' pattern."""
    return all(
        ch in printable_characters
        for destination in tree.xpath("/ruleset/rule/@to")
        for ch in destination
    )

def test_ruleset_name(tree):
    # True when the root <ruleset> element has a non-empty xpath result for
    # its name attribute, False otherwise.
    """Rule has name"""
    return bool(tree.xpath("/ruleset/@name"))

def get_all_names_and_targets(ds):
    """Collect ruleset names and target hosts from the directories in ds.

    Every entry returned by os.listdir() for each directory is parsed as XML.
    Entries that raise during parsing or lookup (including ones with no
    /ruleset/@name) are skipped without any message.

    Returns a (names, targets) tuple of sets.
    """
    names, targets = set(), set()
    for directory in ds:
        for entry in os.listdir(directory):
            path = os.path.join(directory, entry)
            try:
                parsed = etree.parse(path)
                name = parsed.xpath("/ruleset/@name")[0]
                hosts = parsed.xpath("/ruleset/target/@host")
            except Exception:
                # Best effort: anything that is not a named ruleset is
                # ignored.
                continue
            names.add(name)
            targets.update(hosts)
    return names, targets

def nomes_all(where=sys.argv[1:]):
    """Yield the path of every file named by, or found beneath, where.

    where is a list of paths.  A regular file is yielded as given; a
    directory is walked recursively and each file inside is yielded joined
    to its containing directory; anything else is skipped.  An empty where
    means the current directory.
    """
    for entry in (where or ['.']):
        if os.path.isfile(entry):
            yield entry
            continue
        if not os.path.isdir(entry):
            continue
        for root, _subdirs, filenames in os.walk(entry):
            for filename in filenames:
                yield os.path.join(root, filename)

# Checks applied to every ruleset, in this order.  Each takes the parsed
# tree and returns True on success; its docstring is printed as the failure
# message.
tests = [test_not_anchored, test_bad_regexp, test_unescaped_dots, test_missing_to,
         test_space_in_to, test_unencrypted_to, test_backslash_in_to,
         test_no_trailing_slash, test_lacks_target_host, test_bad_target_host,
         test_duplicated_target_host, test_non_ascii]

# Exit status: 0 = all good, 1 = at least one failure, 3 = no XML file seen.
failure = 0
seen_file = False
# Seed duplicate detection with the names/targets found in --dupdir.
all_names, all_targets = get_all_names_and_targets(dupdir)

# NOTE: 'fi' must stay a module-level name; test_missing_to and
# test_unencrypted_to read it to mention the current file in their messages.
for fi in nomes_all():
    is_xml = fi.endswith(".xml")
    try:
        tree = etree.parse(fi)
    except Exception as oops:
        # An unparsable file only counts as a failure when it claims to be
        # XML.  Skip just this file: the global 'failure' flag must not
        # decide whether later files are validated.
        if is_xml:
            failure = 1
            fail("%s failed XML validity: %s\n" % (fi, oops))
        continue
    has_ruleset = bool(tree.xpath("/ruleset"))
    if not is_xml:
        if not has_ruleset:
            continue
        warn("ruleset in file without .xml extension: %s" % fi)
    seen_file = True
    if not has_ruleset:
        continue
    if not test_ruleset_name(tree):
        failure = 1
        fail("unnamed ruleset: %s" % fi)
        continue
    ruleset_name = tree.xpath("/ruleset/@name")[0]
    if ruleset_name in all_names:
        failure = 1
        fail("duplicate ruleset name %s" % ruleset_name)
    all_names.add(ruleset_name)
    for test in tests:
        if not test(tree):
            failure = 1
            fail("%s failed test: %s" % (fi, test.__doc__))
    for target in tree.xpath("/ruleset/target/@host"):
        if target in all_targets and not any(ign.search(target) for ign in ignoredups):
            # suppress warning about duplicate targets if an --ignoredups
            # pattern matches target
            warn("%s has duplicate target: %s" % (fi, target))
        all_targets.add(target)

if not seen_file:
    # nomes_all() falls back to the current directory only when no
    # command-line arguments were given at all, so test exactly that (the
    # parsed 'args' namespace is always truthy).
    which = "specified" if sys.argv[1:] else "current"
    sys.stdout.write("There were no valid XML files in the %s " % which)
    sys.stdout.write("directory.\n")
    failure = 3

sys.exit(failure)