Skip to content

Commit 9693ff4

Browse files
author
Seth Schoen
committed
first cut at ruleset simplicity tester
1 parent de5a7b8 commit 9693ff4

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

utils/simple.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python
2+
3+
from lxml import etree
4+
import regex
5+
6+
# XXX: this doesn't work for from patterns that use the ?: in (?:www\.)?
7+
# (one of many examples in Zoosk.com.xml)
8+
# XXX: this doesn't figure out if a target host causes a particular rule
9+
# to be completely inapplicable (in which case it should probably be
10+
# ignored) for determining simplicity
11+
# XXX: this doesn't catch simple rules that use alternation with
12+
# backreferences, like from="^http://(foo|bar)\.example\.com/"
13+
# to="\1.example.com"
14+
15+
def simple(f):
    """Decide whether the ruleset stored in *f* counts as "simple".

    *f* is handed straight to ``etree.parse``.  The result is True only
    when every criterion below holds for the parsed document; all of
    them are evaluated before the verdict is returned.

    Raises IndexError when the document has no ``/ruleset`` root and
    KeyError when a ``<target>`` has no ``host`` attribute (or a
    ``<rule>`` lacks the attributes ``simple_rule`` reads).
    """
    doc = etree.parse(f)
    hosts = [node.attrib["host"] for node in doc.xpath("/ruleset/target")]
    ruleset_attrs = doc.xpath("/ruleset")[0].attrib

    # ruleset must not be default_off
    on_by_default = "default_off" not in ruleset_attrs
    # ruleset must not contain a match_rule
    no_match_rule = "match_rule" not in ruleset_attrs
    # XXX: maybe also check for platform="mixedcontent" here
    # ruleset must not apply any securecookie patterns
    no_securecookie = not doc.xpath("/ruleset/securecookie")
    # ruleset must not contain any exclusions
    no_exclusion = not doc.xpath("/ruleset/exclusion")
    # targets must not contain any wildcards
    no_wildcard_target = all("*" not in host for host in hosts)

    rules = doc.xpath("/ruleset/rule")
    # ruleset must not contain any downgrade rules
    no_downgrade = all("downgrade" not in rule.attrib for rule in rules)
    # and every rule must itself be simple according to simple_rule()
    only_simple_rules = all(simple_rule(rule, hosts) for rule in rules)

    return (
        on_by_default
        and no_match_rule
        and no_securecookie
        and no_exclusion
        and no_wildcard_target
        and no_downgrade
        and only_simple_rules
    )
35+
36+
def simple_rule(rule, targets):
    r"""Return True when *rule* is a simple http -> https host rewrite.

    Two shapes of the ``from`` attribute are recognised:

    * ``^http://HOST/`` -- ``to`` must start with ``https://HOST/``.
    * ``^http://(www\.)?HOST/`` -- ``to`` must start with either
      ``https://www.HOST/`` or ``https://HOST/``.

    HOST may consist only of letters, digits, ``-``, ``.`` and
    backslashes.  It is unescaped and then compared against ``to`` as a
    literal string, so a ``.`` in the host never acts as a wildcard.

    rule    -- object exposing a mapping ``attrib`` with "from" and "to"
               keys (an lxml element in this file); KeyError is raised
               when either key is missing.
    targets -- accepted for interface compatibility; currently unused.
    """
    rule_from = rule.attrib["from"]
    rule_to = rule.attrib["to"]

    # Simple rule with no capture.  The quantifier sits inside the group
    # so the whole hostname is captured, not just its last character.
    plain = regex.match(r"^\^http://([-A-Za-z0-9.\\]+)/$", rule_from)
    if plain:
        host = unescape(plain.group(1))
        if "\\" in host:
            # A backslash that survives unescaping is some other regex
            # escape, so the host is not a plain literal hostname.
            return False
        return rule_to.startswith("https://%s/" % host)

    # Optional leading www.
    optional_www = regex.match(
        r"^\^http://\(www\\\.\)\?([-A-Za-z0-9.\\]+)/$", rule_from
    )
    if optional_www:
        host = unescape(optional_www.group(1))
        if "\\" in host:
            return False
        return rule_to.startswith(
            ("https://www.%s/" % host, "https://%s/" % host)
        )

    return False


def unescape(s):
    r"""Turn a regex-escaped hostname into its literal form.

    Removes the backslash from the escaped punctuation a hostname can
    contain (``\.`` and ``\-``); every other character is left as is.
    """
    return s.replace(r"\.", ".").replace(r"\-", "-")

0 commit comments

Comments
 (0)