|
1 | 1 | #!/usr/bin/env python2.7 |
2 | 2 |
|
3 | 3 | # Merge all the .xml rulesets into a single "default.rulesets" file -- this |
4 | | -# prevents inodes from wasting disk space, but more importantly, works around |
5 | | -# the fact that zip does not perform well on a pile of small files. |
| 4 | +# prevents inodes from wasting disk space, but more importantly, this works |
| 5 | +# around the fact that zip does not perform well on a pile of small files. |
6 | 6 |
|
7 | | -# currently a very literal translation of merge-rulesets.sh, but about five |
8 | | -# times faster |
9 | | -from __future__ import print_function |
10 | | -pass |
| 7 | +# Currently, it merges rulesets into a JSON array of ruleset objects for |
| 8 | +# minimal overhead, in both storage and parsing speed. |
| 9 | + |
| 10 | +import argparse |
| 11 | +import glob |
| 12 | +import json |
11 | 13 | import os |
12 | | -from glob import glob |
13 | | -from subprocess import call |
| 14 | +import subprocess |
14 | 15 | import sys |
15 | | -import traceback |
16 | | -import re |
17 | 16 | import unicodedata |
18 | | -import argparse |
| 17 | +import xml.etree.ElementTree |
| 18 | + |
def normalize(f):
    """
    Return the filename `f` as UTF-8 encoded bytes in Unicode NFC form.

    OSX and Linux filesystems encode composite characters differently in
    filenames. We should normalize to NFC: http://unicode.org/reports/tr15/

    `f` may be a UTF-8 encoded byte string or an already-decoded text
    string. Byte strings are decoded first; text is normalized as-is.
    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    # Only byte strings need decoding; decoding text that is already
    # decoded is an error, so the type is checked first.
    if isinstance(f, bytes):
        f = f.decode("utf-8")
    return unicodedata.normalize("NFC", f).encode("utf-8")
| 26 | + |
| 27 | +# Command-line argument parsing (nobody uses it, though) |
| 28 | +parser = argparse.ArgumentParser(description="Merge rulesets") |
| 29 | +parser.add_argument("--source_dir", default="src/chrome/content/rules") |
19 | 30 |
|
20 | | -parser = argparse.ArgumentParser(description='Merge rulesets.') |
21 | | -parser.add_argument('--source_dir', default='src/chrome/content/rules', help='source directory') |
22 | | -parser.add_argument('--fast', help='fast merge', action='store_true') |
23 | 31 | args = parser.parse_args() |
24 | 32 |
|
25 | | -def normalize(f): |
26 | | - """ |
27 | | - OSX and Linux filesystems encode composite characters differently in filenames. |
28 | | - We should normalize to NFC: http://unicode.org/reports/tr15/. |
29 | | - """ |
30 | | - f = unicodedata.normalize('NFC', unicode(f, 'utf-8')).encode('utf-8') |
31 | | - return f |
32 | | - |
33 | | -rulesets_fn= args.source_dir + "/default.rulesets" |
34 | | -xml_ruleset_files = map(normalize, glob(args.source_dir + "/*.xml")) |
35 | | - |
36 | | -# cleanup after bugs :/ |
37 | | -misfile = rulesets_fn + "r" |
38 | | -if os.path.exists(misfile): |
39 | | - print("Cleaning up malformed rulesets file...") |
40 | | - os.unlink(misfile) |
41 | | - |
42 | | -if args.fast: |
43 | | - library_compiled_time = os.path.getmtime(rulesets_fn) |
44 | | - newest_xml = max([os.path.getmtime(f) for f in xml_ruleset_files]) |
45 | | - if library_compiled_time >= newest_xml: |
46 | | - print("Library is newer that all rulesets, skipping rebuild...") |
47 | | - sys.exit(0) |
48 | | - |
49 | | -print("Creating ruleset library...") |
50 | | - |
51 | | -# Under git bash, sed -i issues errors and sets the file "read only". Thanks. |
52 | | -if os.path.isfile(rulesets_fn): |
53 | | - os.system("chmod u+w " + rulesets_fn) |
54 | | - |
55 | | -def rulesize(): |
56 | | - return len(open(rulesets_fn).read()) |
57 | | - |
58 | | -def clean_up(rulefile): |
59 | | - """Remove extra whitespace, comments and tests from a ruleset""" |
60 | | - comment_and_newline_pattern = re.compile(r"<!--.*?-->|\n|\r", flags=re.DOTALL) |
61 | | - rulefile = comment_and_newline_pattern.sub('', rulefile) |
62 | | - to_and_from_pattern = re.compile(r'\s*(from=)') |
63 | | - rulefile = to_and_from_pattern.sub(r' \1', rulefile) |
64 | | - rulefile = re.sub(r'"\s*(to=)', r'" \1', rulefile) |
65 | | - rulefile = re.sub(r">\s*<", r"><", rulefile) |
66 | | - rulefile = re.sub(r"</ruleset>\s*", r"</ruleset>\n", rulefile) |
67 | | - rulefile = re.sub(r"\s*(/>|<ruleset)", r"\1", rulefile) |
68 | | - rulefile = re.sub(r"<test.+?/>", r"", rulefile) |
69 | | - return rulefile |
70 | | - |
71 | | -library = open(rulesets_fn,"w") |
72 | | - |
73 | | -try: |
74 | | - commit_id = os.environ["GIT_COMMIT_ID"] |
75 | | - library.write('<rulesetlibrary gitcommitid="%s">' % commit_id) |
76 | | -except: |
77 | | - # Chromium |
78 | | - library.write('<rulesetlibrary>') |
79 | | - |
80 | | -# Include the filename.xml as the "f" attribute |
81 | | -print("Removing whitespaces and comments...") |
82 | | - |
83 | | -for rfile in sorted(xml_ruleset_files): |
84 | | - ruleset = open(rfile).read() |
85 | | - fn = os.path.basename(rfile) |
86 | | - ruleset = ruleset.replace("<ruleset", '<ruleset f="%s"' % fn, 1) |
87 | | - library.write(clean_up(ruleset)) |
88 | | -library.write("</rulesetlibrary>\n") |
89 | | -library.close() |
90 | | - |
91 | | -try: |
92 | | - if 0 == call(["xmllint", "--noout", rulesets_fn]): |
93 | | - print(rulesets_fn, "passed XML validity test.") |
94 | | - else: |
95 | | - print("ERROR:", rulesets_fn, "failed XML validity test!") |
96 | | - sys.exit(1) |
97 | | -except OSError as e: |
98 | | - if "No such file or directory" not in traceback.format_exc(): |
99 | | - raise |
100 | | - print("WARNING: xmllint not present; validation of", rulesets_fn, " skipped.") |
101 | | - |
102 | | -# We make default.rulesets at build time, but it shouldn't have a variable |
103 | | -# timestamp |
104 | | -call(["touch", "-r", "src/install.rdf", rulesets_fn]) |
# Path of the merged ruleset library that this script writes.
ofn = os.path.join(args.source_dir, "default.rulesets")

# Every XML ruleset file in the source directory, names normalized to NFC.
files = [
    normalize(path)
    for path in glob.glob(os.path.join(args.source_dir, "*.xml"))
]

# Under git bash, sed -i issues errors and sets the file "read-only", so give
# the owner write permission on an existing output file before rewriting it.
# os.chmod is used instead of a shell command so that a source directory
# containing spaces or shell metacharacters cannot break or alter the call.
if os.path.isfile(ofn):
    try:
        # Keep the existing permission bits and add owner-write (u+w).
        os.chmod(ofn, (os.stat(ofn).st_mode & 0o7777) | 0o200)
    except OSError:
        # Best effort, like the shell call it replaces: if the file really
        # cannot be made writable, opening it for writing reports the error.
        pass
| 42 | + |
# Library: a list with one dict per ruleset file; it is serialized as JSON
# once every file has been parsed.
library = []

# Parse each XML ruleset and add it to the library.
print(" * Parsing XML ruleset and constructing JSON library...")
for filename in sorted(files):
    root = xml.etree.ElementTree.parse(filename).getroot()

    # Child entries grouped by tag. Each list must accumulate ALL children
    # with that tag; re-creating the list for every child would keep only
    # the last target/rule/securecookie/exclusion of the ruleset.
    children = {}

    for child in root:
        tag = child.tag

        if tag == "target":
            entry = child.attrib["host"]
        elif tag == "rule":
            entry = {
                "from": child.attrib["from"],
                "to": child.attrib["to"],
            }
        elif tag == "securecookie":
            entry = {
                "host": child.attrib["host"],
                "name": child.attrib["name"],
            }
        elif tag == "exclusion":
            entry = child.attrib["pattern"]
        else:
            # Any other element is not part of the library.
            continue

        children.setdefault(tag, []).append(entry)

    # Start from the attributes of the root element, then add the child
    # lists; a child list replaces a root attribute of the same name.
    ruleset = dict(root.attrib)
    ruleset.update(children)

    library.append(ruleset)
| 84 | + |
# Write the JSON library to default.rulesets. The with-block closes the file
# even if serialization or the write itself raises.
print(" * Writing JSON library to %s" % ofn)
with open(ofn, "w") as outfile:
    outfile.write(json.dumps(library))

# We make default.rulesets at build time,
# but it shouldn't have a variable timestamp,
# so copy the timestamp of src/install.rdf onto it.
subprocess.call(["touch", "-r", "src/install.rdf", ofn])

# Everything is okay.
print(" * Everything is Okay.")
0 commit comments