|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +from lxml import etree |
| 4 | +import regex |
| 5 | + |
 | 6 | +# XXX: this doesn't work for "from" patterns that use a non-capturing group, as in (?:www\.)? |
| 7 | +# (one of many examples in Zoosk.com.xml) |
| 8 | +# XXX: this doesn't figure out if a target host causes a particular rule |
| 9 | +# to be completely inapplicable (in which case it should probably be |
| 10 | +# ignored) for determining simplicity |
| 11 | +# XXX: this doesn't catch simple rules that use alternation with |
| 12 | +# backreferences, like from="^http://(foo|bar)\.example\.com/" |
| 13 | +# to="\1.example.com" |
| 14 | + |
def simple(f):
    """Report whether the ruleset parsed from ``f`` counts as simple.

    ``f`` is handed straight to ``etree.parse``. A ruleset is simple only
    when every one of the following holds:

    * it carries no ``default_off`` attribute;
    * it carries no ``match_rule`` attribute;
    * it has no ``securecookie`` children;
    * it has no ``exclusion`` children;
    * none of its target hosts contains a ``*`` wildcard;
    * none of its rules carries a ``downgrade`` attribute;
    * each of its rules is accepted by ``simple_rule``.

    XXX: maybe also check for platform="mixedcontent" here.
    """
    doc = etree.parse(f)
    hosts = [node.attrib["host"] for node in doc.xpath("/ruleset/target")]

    # Every criterion is computed up front (rather than returning early) so
    # that a malformed ruleset raises regardless of which criterion fails.
    root_attrs = doc.xpath("/ruleset")[0].attrib
    enabled_by_default = "default_off" not in root_attrs
    lacks_match_rule = "match_rule" not in root_attrs
    lacks_securecookie = not doc.xpath("/ruleset/securecookie")
    lacks_exclusion = not doc.xpath("/ruleset/exclusion")
    lacks_wildcards = all("*" not in host for host in hosts)

    rules = doc.xpath("/ruleset/rule")
    lacks_downgrade = all("downgrade" not in r.attrib for r in rules)
    rules_all_simple = all(simple_rule(r, hosts) for r in rules)

    return (
        enabled_by_default
        and lacks_match_rule
        and lacks_securecookie
        and lacks_exclusion
        and lacks_wildcards
        and lacks_downgrade
        and rules_all_simple
    )
| 35 | + |
def simple_rule(rule, targets):
    r"""Return True if ``rule`` is a simple rewrite rule.

    A simple rule rewrites a single hostname, perhaps with an optional
    leading ``www\.``, to itself or to itself plus ``www.``, at the top
    level with no other effects.

    Two shapes of the "from" attribute are recognised:

    * ``^http://HOST/$``             (no capture group)
    * ``^http://(www\.)?HOST/$``     (optional leading www)

    where HOST consists only of letters, digits, ``-``, ``.`` and
    backslashes. The "to" attribute must then start with either
    ``https://HOST/`` or ``https://www.HOST/``, HOST being taken with its
    ``\.`` escapes removed.

    Args:
        rule: element exposing ``attrib["from"]`` and ``attrib["to"]``.
        targets: target hosts of the enclosing ruleset. Accepted for
            interface compatibility; not consulted by the current checks.

    Returns:
        True if the rule matches one of the shapes above, False otherwise.

    Raises:
        KeyError: if the rule lacks a "from" or a "to" attribute.
    """
    rule_from = rule.attrib["from"]
    rule_to = rule.attrib["to"]

    # The quantifier must sit INSIDE the capture group: "(...)+" would keep
    # only the last repetition, i.e. the final character of the hostname.
    # The two shapes are mutually exclusive because "(" is not a HOST char.
    shape = (
        regex.match(r"^\^http://([-A-Za-z0-9.\\]+)/$", rule_from)
        or regex.match(r"^\^http://\(www\\\.\)\?([-A-Za-z0-9.\\]+)/$", rule_from)
    )
    if shape is None:
        return False

    host = unescape(shape.group(1))
    # Compare literally instead of interpolating the host into a new regex,
    # where an unescaped "." would match any character.
    return rule_to.startswith(("https://%s/" % host, "https://www.%s/" % host))
| 57 | + |
def unescape(s):
    r"""Return ``s`` with every regex-escaped dot (``\.``) turned into ``.``.

    No other escape sequence is touched.
    """
    escaped_dot = r"\."
    pieces = s.split(escaped_dot)
    return ".".join(pieces)
0 commit comments