Skip to content

Commit 604d296

Browse files
committed
Actually, this is a much more readable merge-rulesets implementation
Though it is slower. Python is probably the long-term way forward
1 parent b31057b commit 604d296

File tree

3 files changed

+147
-38
lines changed

3 files changed

+147
-38
lines changed

makecrx.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ do_not_ship="*.py *.xml icon.jpg"
7878
rm -f $do_not_ship
7979
cd ../..
8080

81-
python ./utils/merge-rulesets.py
81+
sh ./utils/merge-rulesets.sh
8282

8383
cp src/$RULESETS pkg/crx/rules/default.rulesets
8484

makexpi.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,7 @@ if [ -e "$GIT_OBJECT_FILE" ]; then
108108
export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE")
109109
fi
110110

111-
112-
python ./utils/merge-rulesets.py
111+
sh ./utils/merge-rulesets.sh
113112
cd src
114113

115114
# Build the XPI!

utils/merge-rulesets.sh

Lines changed: 145 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,48 +3,158 @@
33
# Merge all the .xml rulesets into a single "default.rulesets" file -- this
44
# prevents inodes from wasting disk space, but more importantly, works around
55
# the fact that zip does not perform well on a pile of small files.
6-
cd src
6+
77
RULESETS=chrome/content/rules/default.rulesets
8-
echo "Creating ruleset library..."
98

10-
# Under git bash, sed -i issues errors and sets the file "read only". Thanks.
11-
[ -f "$RULESETS" ] && chmod u+w $RULESETS
9+
# Indentation unit: format_rulesets emits this string once per indent level
# (the second column of TAG_DEFINITIONS) in front of each tag it re-indents.
INDENT_CHAR=' '
10+
# Any whitespace that marks one level of indentation.
11+
12+
# Whitespace-separated table that format_rulesets reads one row at a time.
# Rows whose first field is '#', and rows without a fourth field, are skipped.
TAG_DEFINITIONS='
13+
# tag | level of indentation | + prepended linebreaks | + appended linebreaks
14+
rulesetlibrary 0 -1 0
15+
ruleset 0 1 0
16+
rule 1 0 0
17+
target 1 0 0
18+
exclusion 2 0 0
19+
securecookie 2 0 0'
20+
# Extra prepended linebreaks are added before the opening <tags>,
21+
# and appended after closing </tags> and <tags/>. It's not perfect but it works.
22+
# One linebreak is implicitly prepended; opt out by supplying -1
23+
# This does not work whatsoever with nested tags, mind.
24+
25+
# GNU sed (BRE) script applied to $RULESETS as the last formatting step.
# In order, it:
#   1. strips <!-- comments -->, pulling in following lines while a comment
#      is still open;
#   2. inserts a missing space before to= / from= / name= attributes;
#   3. removes the space in front of "/>";
#   4. collapses runs of two or more spaces that follow a non-space character
#      into ONE space (dropping them entirely would fuse neighbouring tokens);
#   5. strips trailing spaces;
#   6. repairs "http;//"-style protocol separators;
#   7. collapses repeated slashes that do not follow a colon.
# NOTE(review): step 2 also fires inside attribute values (e.g. "?from=" or
# "username="); confirm no rule pattern relies on such text.
SED_TRIM_CMD='
:a
s/<!--.*-->//g
/<!--/N
//ba

s/\([^ ]\)\(to\|from\|name\)=/\1 \2=/g
s: />:/>:g
s/\([^ ]\) \{2,\}/\1 /g
s/ \+$//g
s:\(http[s?]\{,2\}\)[^:]//:\1\://:g
s:\([^:]\)/\{2,\}:\1/:g'
37+
# sed command to scrub comments and fix various whitespace irregularities;
38+
# missing whitespace in between tag fields: <x y="z"trapped=":("/>,
39+
# random double and trailing whitespace, unwanted whitespace before '/>',
40+
# semicolons after protocols;// (rather than colons), and mid-URI double slashes
41+
42+
43+
# Functions
44+
45+
# repeat_char FORMAT COUNT
# Write FORMAT to stdout COUNT times. FORMAT is passed to printf as the
# format string itself, so backslash escapes in it are interpreted once
# (callers pass '\\n' to obtain the two characters backslash + n).
# Prints nothing and returns non-zero when COUNT is not greater than zero.
repeat_char() {
  [ "$2" -gt 0 ] || return
  local remaining
  remaining=$2
  while [ "$remaining" -gt 0 ]; do
    printf "$1"
    remaining=$((remaining - 1))
  done
}
50+
51+
format_rulesets() {
52+
local IFS tag idepth prebreaks postbreaks
53+
local _indent _pre _post _sed_pre _sed_post _sed_oneshot
54+
55+
# Print pretty banner, very hardcoded
56+
printf '\n'
57+
printf '%15s | %s | %s | %s\n' 'tag name' 'indent' 'prebreak' 'postbreak'
58+
printf '%15s-+-%s-+-%s-+-%s\n' \
59+
'-------------' '------' '--------' '----------'
60+
61+
# Iterate through tags and add appropriate indentation and linebreaks
62+
while read tag idepth prebreaks postbreaks; do
63+
( [ "$tag" = '#' ] || [ ! $postbreaks ] ) && continue # Invalid; skip
64+
unset _indent _pre _post _sed_pre _sed_post _sed_oneshot
65+
66+
printf "%15s | %6d | %8d | %9s\n" "$tag" $idepth $prebreaks $postbreaks
67+
68+
# Special characters (\n) need double escaping when saved to a variable
69+
# since they are dereferenced and break everytime they're passed around.
70+
# bash printf has a %q format character for this, but we're /bin/sh
1271

13-
echo "<rulesetlibrary gitcommitid=\"${GIT_COMMIT_ID}\">" > $RULESETS
14-
# Include the filename.xml as the "f" attribute
15-
for file in chrome/content/rules/*.xml; do
16-
xmlInsertString="<ruleset"
17-
fileName=$(basename "$file")
18-
fileContent=$(sed "s/${xmlInsertString}/${xmlInsertString} f=\"${fileName}\"/" "chrome/content/rules/${fileName}")
19-
echo "$fileContent" >> $RULESETS
20-
done
21-
echo "</rulesetlibrary>" >> $RULESETS
72+
# Should always be a prepended linebreak unless we opt out with -1
73+
_pre="$(repeat_char '\\n' $((prebreaks+1)))"
74+
_post="$(repeat_char '\\n' $postbreaks)"
75+
_indent="$(repeat_char "$INDENT_CHAR" $idepth)"
2276

23-
echo "Removing whitespaces and comments..."
77+
# breaks before opening <tags> and <tags/>
78+
_sed_pre="s:<${tag}[ />]:${_pre}${_indent}\0:g;"
79+
# breaks after closing </tags>
80+
_sed_post="s:</${tag}>:\n${_indent}\0${_post}:g;"
81+
# breaks after oneshot <tags/>
82+
_sed_oneshot="s:<${tag}\(/>\| [^>]\+/>\):\0${_post}:g;"
83+
84+
sed -ir "$_sed_pre $_sed_post $_sed_oneshot" $RULESETS
85+
done <<- EOF
86+
$TAG_DEFINITIONS
87+
EOF
88+
89+
echo #padding for some distance after the tag table
90+
}
2491

2592
# rulesize
# Print the size of the file named by $RULESETS in bytes, as a bare integer.
# Returns non-zero (printing nothing) if the file cannot be read.
rulesize() {
  local bytes
  bytes=$(wc -c < "$RULESETS") || return
  # Some wc implementations pad the count with leading blanks; %d drops them.
  printf '%d\n' "$bytes"
}
28-
CRUSH=`rulesize`
29-
sed -i -e :a -re 's/<!--.*?-->//g;/<!--/N;//ba' $RULESETS
30-
sed -i ':a;N;$!ba;s/\n//g;s/>[ ]*</></g;s/[ ]*to=/ to=/g;s/[ ]*from=/ from=/g;s/ \/>/\/>/g' $RULESETS
31-
echo "Crushed $CRUSH bytes of rulesets into `rulesize`"
32-
33-
if [ -x $(which xmllint) ]
34-
then
35-
if xmllint --noout $RULESETS
36-
then
37-
echo "$RULESETS passed XML validity test."
38-
else
39-
echo "ERROR: $RULESETS failed XML validity test."
40-
exit 2
41-
fi
42-
else
43-
echo "WARNING: xmllint not present; validation of $RULESETS skipped."
44-
fi
45-
46-
# We make default.rulesets at build time, but it shouldn't have a variable
47-
# timestamp
95+
96+
# populate_rulesets
# (Re)create the file named by $RULESETS: an opening <rulesetlibrary> tag
# carrying $GIT_COMMIT_ID (or "unset"), then every chrome/content/rules/*.xml
# file with an f="<filename>" attribute added to each "<ruleset", then the
# closing tag. Paths are relative to the current directory.
# NOTE(review): the filename is interpolated into a sed replacement as-is, so
# a name containing '&' or '\' would be misinterpreted; verify none exist.
populate_rulesets() {
  local xmlfile
  # Under git bash, sed -i issues errors and sets the file "read only"
  if [ -f "$RULESETS" ]; then
    chmod u+w "$RULESETS"
  fi

  printf '<rulesetlibrary gitcommitid="%s">' \
    "${GIT_COMMIT_ID:-unset}" > "$RULESETS"

  # Include the filename.xml as the "f" attribute
  for xmlfile in chrome/content/rules/*.xml; do
    # When nothing matches, the glob stays literal; skip it instead of
    # handing sed a nonexistent file.
    [ -e "$xmlfile" ] || continue
    # '&' is the portable spelling of "the matched text" (\0 is GNU-only).
    sed "s/<ruleset/& f=\"${xmlfile##*/}\"/g" "$xmlfile" >> "$RULESETS"
  done

  echo "</rulesetlibrary>" >> "$RULESETS"
}
111+
112+
# flatten_file
# Collapse the file named by $RULESETS onto a single line: every control
# character (newlines and tabs included) becomes a space and runs of
# whitespace are squeezed to one character. The tag definitions re-add
# line breaks afterwards.
# Control characters are turned into spaces rather than deleted so that
# tokens separated only by a newline or tab (e.g. attributes on continuation
# lines) do not get fused together.
# Returns non-zero without touching the file if it cannot be read.
flatten_file() {
  local flat
  [ -r "$RULESETS" ] || return 1
  # tr cannot edit in place, so the result is buffered in a variable.
  # Beware that this *assumes* the used shell accepts variable sizes of >2Mb.
  flat=$(tr '[:cntrl:]' '[ *]' < "$RULESETS" | tr -s '[:space:]') || return
  # printf, not echo: some sh implementations expand backslash escapes in
  # echo arguments, which would corrupt the regexes stored in the rules.
  # The single trailing space left by the final newline is dropped.
  printf '%s\n' "${flat% }" > "$RULESETS"
}
118+
119+
120+
# Execution start
121+
122+
# Build pipeline: populate -> flatten -> re-format per tag -> final trim.
# Everything below works on paths relative to src/, so bail out if we
# cannot get there.
cd src || exit 1

echo "Creating ruleset library..."
populate_rulesets

echo "Removing control characters, whitespace and comments..."
PRECRUSH=$(rulesize)
flatten_file

echo "Formatting..."
format_rulesets

echo "Final touches..."
# sed -i is not portable (GNU extension), but maybe we don't care.
# It must be plain -i: GNU sed reads "-ir" as "-i with backup suffix r" and
# leaves a "${RULESETS}r" copy next to the rulesets. SED_TRIM_CMD is written
# with basic regular expressions, so -r must not be enabled either.
sed -i "$SED_TRIM_CMD" "$RULESETS"
POSTCRUSH=$(rulesize)

# All done, print summary
printf "Crushed %d bytes of rulesets into %d (delta %d)\n" \
  "$PRECRUSH" "$POSTCRUSH" "$((POSTCRUSH-PRECRUSH))"

# Timestamp: give the generated file the mtime of the rules directory so it
# does not vary from build to build.
touch -r chrome/content/rules "$RULESETS"

# We need to keep $RULESETS for makecrx.sh but the rest is of no further use.
# This only has an effect when the script is sourced; when it is run as
# "sh ./utils/merge-rulesets.sh" nothing propagates to the caller anyway.
unset INDENT_CHAR TAG_DEFINITIONS SED_TRIM_CMD PRECRUSH POSTCRUSH
# Functions need -f: without it, POSIX unset addresses variables.
unset -f repeat_char format_rulesets rulesize populate_rulesets flatten_file

cd ..
151+
152+
153+
# grep tests to ensure the sed magic worked (should find no matches):
154+
#
155+
# non-indenting double whitespace: '[^ ] \{2,\}'
156+
# missing space after field: '="[^"]\+"[^ />]' # not perfect
157+
# trailing whitespace: ' $' # pipe to | cat -A -
158+
# malformed http(s) protocol text 'http[s?]\{,2\}[^:]//'
159+
# random double+ slashes: '[^:;]//'
160+
#

0 commit comments

Comments
 (0)