Skip to content

Commit 95db03b

Browse files
committed
Speed up make-sqlite and trivial-validate.
1 parent a3268fb commit 95db03b

File tree

2 files changed

+93
-105
lines changed

2 files changed

+93
-105
lines changed

utils/make-sqlite.py

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,15 @@
22
#
33
# Builds an sqlite DB containing all the rulesets, indexed by target.
44

5-
import subprocess
5+
import glob
6+
import os
7+
import re
68
import sqlite3
7-
import sys, re, os
9+
import subprocess
10+
import sys
811

912
from lxml import etree
1013

11-
def nomes_all(where=sys.argv[1:]):
12-
"""Returns generator to extract all files from a list of files/dirs"""
13-
if not where: where=['.']
14-
for i in where:
15-
if os.path.isfile(i):
16-
yield i
17-
elif os.path.isdir(i):
18-
for r, d, f in os.walk(i):
19-
for fi in f:
20-
yield os.path.join(r, fi)
21-
22-
2314
conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
2415
c = conn.cursor()
2516
c.execute('''DROP TABLE IF EXISTS rulesets''')
@@ -39,28 +30,33 @@ def nomes_all(where=sys.argv[1:]):
3930

4031
parser = etree.XMLParser(remove_blank_text=True)
4132

33+
def nomes_all(where=sys.argv[1:]):
34+
"""Returns the list of ruleset XML files matching src/chrome/content/rules/*.xml (a path relative to the current working directory). The `where` argument is accepted but not used."""
35+
return glob.glob('src/chrome/content/rules/*.xml')
36+
37+
# Precompile xpath expressions that get run repeatedly.
38+
xpath_host = etree.XPath("/ruleset/target/@host")
39+
xpath_ruleset = etree.XPath("/ruleset")
40+
4241
for fi in nomes_all():
4342
try:
4443
tree = etree.parse(fi, parser)
4544
except Exception as oops:
46-
if fi[-4:] != ".xml":
47-
continue
4845
print("%s failed XML validity: %s\n" % (fi, oops))
49-
if not tree.xpath("/ruleset"):
50-
continue
46+
sys.exit(1)
5147

5248
# Remove comments to save space.
5349
etree.strip_tags(tree,etree.Comment)
5450

55-
targets = tree.xpath("/ruleset/target/@host")
51+
targets = xpath_host(tree)
5652
# Strip out the target tags. These aren't necessary in the DB because
5753
# targets are looked up in the target table, which has a foreign key
5854
# pointing into the ruleset table.
59-
etree.strip_tags(tree,'target')
55+
etree.strip_tags(tree, 'target')
6056

6157
# Store the filename in the `f' attribute so "view source XML" for rules in
6258
# FF version can find it.
63-
tree.xpath("/ruleset")[0].attrib["f"] = os.path.basename(fi).decode(encoding="UTF-8")
59+
xpath_ruleset(tree)[0].attrib["f"] = os.path.basename(fi).decode(encoding="UTF-8")
6460

6561
c.execute('''INSERT INTO rulesets (contents) VALUES(?)''', (etree.tostring(tree),));
6662
ruleset_id = c.lastrowid

utils/trivial-validate.py

Lines changed: 76 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
#!/usr/bin/env python2.7
22

33
import argparse
4-
import sys, re, os
4+
import os
5+
import re
56
import sqlite3
7+
import sys
68

7-
try:
8-
from lxml import etree
9-
except ImportError:
10-
sys.stderr.write("** Could not import lxml! Rule validation SKIPPED.\n")
11-
sys.stderr.write("** Caution: A resulting build MAY CONTAIN INVALID RULES.\n")
12-
sys.stderr.write("** Please install libxml2 and lxml to permit validation!\n")
13-
sys.exit(0)
9+
from lxml import etree
1410

1511
parser = argparse.ArgumentParser(
1612
formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -19,68 +15,69 @@
1915
default="",
2016
help="Ignore entries."
2117
)
22-
parser.add_argument('--dupdir', type=str, nargs="*",
23-
default="",
24-
help="Duplicate directory."
25-
)
2618
parser.add_argument('--quiet', action="store_true",
2719
default=False, help="Suppress debug output."
2820
)
29-
parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
30-
default="src/chrome/content/rules",
31-
help='Directory of XML files to validate.')
21+
parser.add_argument('--db', type=str, nargs="*",
22+
default=os.path.join(os.path.dirname(__file__),
23+
"../src/defaults/rulesets.sqlite"),
24+
help='SQLite db with rules')
3225

3326
args = parser.parse_args()
3427

3528
ignoredups = [re.compile(val) for val in args.ignoredups]
36-
dupdir = [val for val in args.dupdir]
3729
quiet = args.quiet
3830

3931
def warn(s):
40-
if not quiet: sys.stdout.write("warning: %s\n" % s)
32+
if not quiet:
33+
sys.stdout.write("warning: %s\n" % s)
4134

4235
def fail(s):
4336
sys.stdout.write("failure: %s\n" % s)
4437

45-
def test_not_anchored(tree, fi):
38+
def test_not_anchored(tree, fi, host, fm, to):
4639
# Rules not anchored to the beginning of a line.
4740
"""The 'from' rule is not anchored to beginning of line using the ^ symbol."""
48-
for f in tree.xpath("/ruleset/rule/@from"):
41+
for f in fm:
4942
if not f or f[0] != "^":
5043
return False
5144
return True
5245

53-
def test_bad_regexp(tree, fi):
46+
# Precompile xpath expressions that get run repeatedly.
47+
xpath_exlusion_pattern = etree.XPath("/ruleset/exclusion/@pattern")
48+
xpath_cookie_pattern = etree.XPath("/ruleset/securecookie/@host")
49+
50+
def test_bad_regexp(tree, fi, host, fm, to):
5451
# Rules with invalid regular expressions.
5552
"""The 'from' rule contains an invalid extended regular expression."""
56-
for f in tree.xpath("/ruleset/rule/@from") + \
57-
tree.xpath("/ruleset/exclusion/@pattern") + \
58-
tree.xpath("/ruleset/securecookie/@host"):
53+
patterns = fm + xpath_exlusion_pattern(tree) + xpath_cookie_pattern(tree)
54+
for pat in patterns:
5955
try:
60-
re.compile(f)
56+
re.compile(pat)
6157
except:
6258
return False
6359
return True
6460

65-
def test_missing_to(tree, fi):
61+
xpath_rule = etree.XPath("/ruleset/rule")
62+
def test_missing_to(tree, fi, host, fm, to):
6663
# Rules that are terminated before setting 'to'.
6764
# These cases are probably either due to a misplaced
6865
# rule end or intended to be different elements.
6966
"""Rule is missing a 'to' value."""
70-
for rule in tree.xpath("/ruleset/rule"):
67+
for rule in xpath_rule(tree):
7168
if not rule.get("to"):
7269
warn("'to' attribute missing in %s. " % fi)
7370
warn("Misplaced end or misnamed element?")
7471
return False
7572
return True
7673

77-
def test_unescaped_dots(tree, fi):
74+
def test_unescaped_dots(tree, fi, host, fm, to):
7875
# Rules containing unescaped dots outside of brackets and before slash.
7976
# Note: this is meant to require example\.com instead of example.com,
8077
# but it also forbids things like .* which usually ought to be replaced
8178
# with something like ([^/:@\.]+)
8279
"""The 'from' rule contains unescaped period in regular expression. Try escaping it with a backslash."""
83-
for f in tree.xpath("/ruleset/rule/@from"):
80+
for f in fm:
8481
escaped = False
8582
bracketed = False
8683
s = re.sub("^\^https?://", "", f)
@@ -99,22 +96,22 @@ def test_unescaped_dots(tree, fi):
9996
escaped = False
10097
return True
10198

102-
def test_space_in_to(tree, fi):
99+
def test_space_in_to(tree, fi, host, fm, to):
103100
# Rules where the to pattern contains a space.
104101
"""The 'to' rule contains a space."""
105-
for t in tree.xpath("/ruleset/rule/@to"):
102+
for t in to:
106103
if ' ' in t:
107104
return False
108105
return True
109106

110-
def test_unencrypted_to(tree, fi):
107+
def test_unencrypted_to(tree, fi, host, fm, to):
111108
# Rules that redirect to something other than https or http.
112109
# This used to test for http: but testing for lack of https: will
113110
# catch more kinds of mistakes.
114111
# Now warn if the rule author indicates they intended it, with the
115112
# downgrade attribute. Error if this attribute is not present.
116113
"""Rule redirects to something other than https."""
117-
for rule in tree.xpath("/ruleset/rule"):
114+
for rule in xpath_rule(tree):
118115
to, downgrade = rule.get("to"), rule.get("downgrade")
119116
if to[:6] != "https:" and to[:5] != "http:":
120117
return False
@@ -125,82 +122,58 @@ def test_unencrypted_to(tree, fi):
125122
return False
126123
return True
127124

128-
def test_backslash_in_to(tree, fi):
125+
def test_backslash_in_to(tree, fi, host, fm, to):
129126
# Rules containing backslashes in to pattern.
130127
"""The 'to' rule contains a backslash."""
131-
for t in tree.xpath("/ruleset/rule/@to"):
128+
for t in to:
132129
if '\\' in t:
133130
return False
134131
return True
135132

136-
def test_no_trailing_slash(tree, fi):
133+
RE_TRAILING_SLASH = re.compile("//.*/")
134+
135+
def test_no_trailing_slash(tree, fi, host, fm, to):
137136
# Rules not containing trailing slash in from or to pattern.
138137
"""Rule omits forward slash after host name."""
139-
for r in tree.xpath("/ruleset/rule"):
138+
for r in xpath_rule(tree):
140139
f, t = r.get("from"), r.get("to")
141-
if not re.search("//.*/", f):
140+
if not RE_TRAILING_SLASH.search(f):
142141
return False
143-
if not re.search("//.*/", t):
142+
if not RE_TRAILING_SLASH.search(t):
144143
return False
145144
return True
146145

147-
def test_lacks_target_host(tree, fi):
146+
def test_lacks_target_host(tree, fi, host, fm, to):
148147
# Rules that lack at least one target host (target tag with host attr).
149148
"""Rule fails to specify at least one target host."""
150-
return not not tree.xpath("/ruleset/target/@host")
149+
return not not host
151150

152-
def test_bad_target_host(tree, fi):
151+
def test_bad_target_host(tree, fi, host, fm, to):
153152
# Rules where a target host contains multiple wildcards or a slash.
154153
"""The target host must be a hostname, not URL, and must use at most one wildcard."""
155-
for target in tree.xpath("/ruleset/target/@host"):
154+
for target in host:
156155
if "/" in target:
157156
return False
158157
if target.count("*") > 1:
159158
return False
160159
return True
161160

162-
def test_duplicated_target_host(tree, fi):
161+
def test_duplicated_target_host(tree, fi, host, fm, to):
163162
# Rules where a single target host appears more than once.
164163
"""Rule contains the same target host more than once."""
165-
targets = tree.xpath("/ruleset/target/@host")
166-
return len(set(targets)) == len(targets)
164+
return len(set(host)) == len(host)
167165

168166
printable_characters = set(map(chr, list(range(32, 127))))
169167

170-
def test_non_ascii(tree, fi):
168+
def test_non_ascii(tree, fi, host, fm, to):
171169
# Rules containing non-printable characters.
172170
"""Rule contains non-printable character in 'to' pattern."""
173-
for t in tree.xpath("/ruleset/rule/@to"):
171+
for t in to:
174172
for c in t:
175173
if c not in printable_characters:
176174
return False
177175
return True
178176

179-
def test_ruleset_name(tree):
180-
"""Rule has name"""
181-
if tree.xpath("/ruleset/@name"):
182-
return True
183-
else:
184-
return False
185-
186-
def get_all_names_and_targets(ds):
187-
"""extract unique names and targets from a list of dirs of xml files"""
188-
names = set()
189-
targets = set()
190-
for d in ds:
191-
for fi in os.listdir(d):
192-
fi = os.path.join(d, fi)
193-
try:
194-
tree = etree.parse(fi)
195-
ruleset_name = tree.xpath("/ruleset/@name")[0]
196-
target_names = tree.xpath("/ruleset/target/@host")
197-
except Exception:
198-
continue
199-
names.add(ruleset_name)
200-
for target in target_names:
201-
targets.add(target)
202-
return names, targets
203-
204177
def nomes_all(where=sys.argv[1:]):
205178
"""Returns generator to extract all files from a list of files/dirs"""
206179
if not where: where=['.']
@@ -212,34 +185,53 @@ def nomes_all(where=sys.argv[1:]):
212185
for fi in f:
213186
yield os.path.join(r, fi)
214187

215-
tests = [test_not_anchored, test_bad_regexp, test_unescaped_dots, test_missing_to,
216-
test_space_in_to, test_unencrypted_to, test_backslash_in_to,
217-
test_no_trailing_slash, test_bad_target_host,
218-
test_duplicated_target_host, test_non_ascii]
188+
tests = [test_not_anchored,
189+
test_bad_regexp,
190+
test_unescaped_dots,
191+
test_missing_to,
192+
test_space_in_to,
193+
test_unencrypted_to,
194+
test_backslash_in_to,
195+
test_no_trailing_slash,
196+
test_bad_target_host,
197+
test_duplicated_target_host,
198+
test_non_ascii]
219199

220200
failure = 0
221201
seen_file = False
222202

223-
conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
203+
xpath_ruleset = etree.XPath("/ruleset")
204+
xpath_ruleset_name = etree.XPath("/ruleset/@name")
205+
xpath_ruleset_file = etree.XPath("/ruleset/@f")
206+
xpath_host = etree.XPath("/ruleset/target/@host")
207+
xpath_from = etree.XPath("/ruleset/rule/@from")
208+
xpath_to = etree.XPath("/ruleset/rule/@to")
209+
210+
print args.db
211+
conn = sqlite3.connect(args.db)
224212
c = conn.cursor()
225213
for row in c.execute('''SELECT contents from rulesets'''):
226214
try:
227215
tree = etree.fromstring(row[0])
228216
except Exception as oops:
229217
failure = 1
230218
print("failed XML validity: %s\n" % (oops))
231-
if failure or not tree.xpath("/ruleset"):
219+
if failure or not xpath_ruleset(tree):
232220
continue
233-
if not test_ruleset_name(tree):
221+
rn = (xpath_ruleset_name(tree) or [None])[0]  # None when the ruleset has no name attribute, so the "unnamed ruleset" check below can fire instead of an IndexError
222+
if not rn:
234223
failure = 1
235224
fail("unnamed ruleset")
236225
continue
237-
ruleset_name = tree.xpath("/ruleset/@name")[0]
238-
ruleset_file = tree.xpath("/ruleset/@f")[0]
226+
rf = xpath_ruleset_file(tree)[0]  # source filename read from the ruleset's `f` attribute (/ruleset/@f), not its name
227+
host = xpath_host(tree)
228+
fm = xpath_from(tree)
229+
to = xpath_to(tree)
239230
for test in tests:
240-
if not test(tree, ruleset_file):
241-
print("%s failed test: %s" % (ruleset_file, test.__doc__))
242-
for target in tree.xpath("/ruleset/target/@host"):
231+
if not test(tree, rf, host=host, fm=fm, to=to):
232+
print("%s failed test: %s" % (rf, test.__doc__))
233+
for target in xpath_host(tree):
234+
print target
243235
if target in all_targets and not any(ign.search(target) for ign in ignoredups):
244236
# suppress warning about duplicate targets if an --ignoredups
245237
# pattern matches target

0 commit comments

Comments
 (0)