Skip to content

Commit 8665c2b

Browse files
author
Pasu Chan
committed
Update utils/merge-rulesets.py
1 parent b5759b3 commit 8665c2b

File tree

1 file changed

+85
-94
lines changed

1 file changed

+85
-94
lines changed

utils/merge-rulesets.py

Lines changed: 85 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,96 @@
11
#!/usr/bin/env python2.7
22

33
# Merge all the .xml rulesets into a single "default.rulesets" file -- this
4-
# prevents inodes from wasting disk space, but more importantly, works around
5-
# the fact that zip does not perform well on a pile of small files.
4+
# prevents inodes from wasting disk space, but more importantly, this works
5+
# around the fact that zip does not perform well on a pile of small files.
66

7-
# currently a very literal translation of merge-rulesets.sh, but about five
8-
# times faster
9-
from __future__ import print_function
10-
pass
7+
# Currently, it merges the rulesets into a single JSON array for minimal overhead,
8+
# in both storage and parsing speed.
9+
10+
import argparse
11+
import glob
12+
import json
1113
import os
12-
from glob import glob
13-
from subprocess import call
14+
import subprocess
1415
import sys
15-
import traceback
16-
import re
1716
import unicodedata
18-
import argparse
17+
import xml.etree.ElementTree
18+
19+
def normalize(f):
20+
"""
21+
OSX and Linux filesystems encode composite characters differently in
22+
filenames. We should normalize to NFC: http://unicode.org/reports/tr15/
23+
"""
24+
f = unicodedata.normalize("NFC", unicode(f, "utf-8")).encode("utf-8")
25+
return f
26+
27+
# Command-line argument parsing (nobody uses it, though)
28+
parser = argparse.ArgumentParser(description="Merge rulesets")
29+
parser.add_argument("--source_dir", default="src/chrome/content/rules")
1930

20-
parser = argparse.ArgumentParser(description='Merge rulesets.')
21-
parser.add_argument('--source_dir', default='src/chrome/content/rules', help='source directory')
22-
parser.add_argument('--fast', help='fast merge', action='store_true')
2331
args = parser.parse_args()
2432

25-
def normalize(f):
26-
"""
27-
OSX and Linux filesystems encode composite characters differently in filenames.
28-
We should normalize to NFC: http://unicode.org/reports/tr15/.
29-
"""
30-
f = unicodedata.normalize('NFC', unicode(f, 'utf-8')).encode('utf-8')
31-
return f
32-
33-
rulesets_fn= args.source_dir + "/default.rulesets"
34-
xml_ruleset_files = map(normalize, glob(args.source_dir + "/*.xml"))
35-
36-
# cleanup after bugs :/
37-
misfile = rulesets_fn + "r"
38-
if os.path.exists(misfile):
39-
print("Cleaning up malformed rulesets file...")
40-
os.unlink(misfile)
41-
42-
if args.fast:
43-
library_compiled_time = os.path.getmtime(rulesets_fn)
44-
newest_xml = max([os.path.getmtime(f) for f in xml_ruleset_files])
45-
if library_compiled_time >= newest_xml:
46-
print("Library is newer that all rulesets, skipping rebuild...")
47-
sys.exit(0)
48-
49-
print("Creating ruleset library...")
50-
51-
# Under git bash, sed -i issues errors and sets the file "read only". Thanks.
52-
if os.path.isfile(rulesets_fn):
53-
os.system("chmod u+w " + rulesets_fn)
54-
55-
def rulesize():
56-
return len(open(rulesets_fn).read())
57-
58-
def clean_up(rulefile):
59-
"""Remove extra whitespace, comments and tests from a ruleset"""
60-
comment_and_newline_pattern = re.compile(r"<!--.*?-->|\n|\r", flags=re.DOTALL)
61-
rulefile = comment_and_newline_pattern.sub('', rulefile)
62-
to_and_from_pattern = re.compile(r'\s*(from=)')
63-
rulefile = to_and_from_pattern.sub(r' \1', rulefile)
64-
rulefile = re.sub(r'"\s*(to=)', r'" \1', rulefile)
65-
rulefile = re.sub(r">\s*<", r"><", rulefile)
66-
rulefile = re.sub(r"</ruleset>\s*", r"</ruleset>\n", rulefile)
67-
rulefile = re.sub(r"\s*(/>|<ruleset)", r"\1", rulefile)
68-
rulefile = re.sub(r"<test.+?/>", r"", rulefile)
69-
return rulefile
70-
71-
library = open(rulesets_fn,"w")
72-
73-
try:
74-
commit_id = os.environ["GIT_COMMIT_ID"]
75-
library.write('<rulesetlibrary gitcommitid="%s">' % commit_id)
76-
except:
77-
# Chromium
78-
library.write('<rulesetlibrary>')
79-
80-
# Include the filename.xml as the "f" attribute
81-
print("Removing whitespaces and comments...")
82-
83-
for rfile in sorted(xml_ruleset_files):
84-
ruleset = open(rfile).read()
85-
fn = os.path.basename(rfile)
86-
ruleset = ruleset.replace("<ruleset", '<ruleset f="%s"' % fn, 1)
87-
library.write(clean_up(ruleset))
88-
library.write("</rulesetlibrary>\n")
89-
library.close()
90-
91-
try:
92-
if 0 == call(["xmllint", "--noout", rulesets_fn]):
93-
print(rulesets_fn, "passed XML validity test.")
94-
else:
95-
print("ERROR:", rulesets_fn, "failed XML validity test!")
96-
sys.exit(1)
97-
except OSError as e:
98-
if "No such file or directory" not in traceback.format_exc():
99-
raise
100-
print("WARNING: xmllint not present; validation of", rulesets_fn, " skipped.")
101-
102-
# We make default.rulesets at build time, but it shouldn't have a variable
103-
# timestamp
104-
call(["touch", "-r", "src/install.rdf", rulesets_fn])
33+
# Output filename: path of the merged ruleset file
34+
ofn = os.path.join(args.source_dir, "default.rulesets")
35+
36+
# XML Ruleset Files
37+
files = map(normalize, glob.glob(os.path.join(args.source_dir, "*.xml")))
38+
39+
# Under git bash, sed -i issues errors and sets the file "read-only".
40+
if os.path.isfile(ofn):
41+
os.system("chmod u+w " + ofn)
42+
43+
# Library (a list of ruleset dicts, serialized as a JSON array)
44+
library = []
45+
46+
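# Parse each XML ruleset and build the JSON library. NOTE(review): `ruleset[child.tag] = []` below runs for every matching child, so each list keeps only its last element; `ruleset.setdefault(child.tag, [])` is likely what was intended.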
47+
print(" * Parsing XML ruleset and constructing JSON library...")
48+
for filename in sorted(files):
49+
tree = xml.etree.ElementTree.parse(filename)
50+
root = tree.getroot()
51+
52+
ruleset = {}
53+
54+
for attr in root.attrib:
55+
ruleset[attr] = root.attrib[attr]
56+
57+
for child in root:
58+
if child.tag in ["target", "rule", "securecookie", "exclusion"]:
59+
ruleset[child.tag] = []
60+
else:
61+
continue
62+
63+
if child.tag == "target":
64+
ruleset["target"].append(child.attrib["host"])
65+
66+
elif child.tag == "rule":
67+
ru = {}
68+
ru["from"] = child.attrib["from"]
69+
ru["to"] = child.attrib["to"]
70+
71+
ruleset["rule"].append(ru)
72+
73+
elif child.tag == "securecookie":
74+
sc = {}
75+
sc["host"] = child.attrib["host"]
76+
sc["name"] = child.attrib["name"]
77+
78+
ruleset["securecookie"].append(sc)
79+
80+
elif child.tag == "exclusion":
81+
ruleset["exclusion"].append(child.attrib["pattern"])
82+
83+
library.append(ruleset);
84+
85+
# Write to default.rulesets
86+
print(" * Writing JSON library to %s" % ofn)
87+
outfile = open(ofn, "w")
88+
outfile.write(json.dumps(library))
89+
outfile.close()
90+
91+
# We make default.rulesets at build time,
92+
# but it shouldn't have a variable timestamp
93+
subprocess.call(["touch", "-r", "src/install.rdf", ofn])
10594

95+
# Everything is okay.
96+
print(" * Everything is Okay.")

0 commit comments

Comments
 (0)