-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Expand file tree
/
Copy pathtest-deploy-docs-diff.sh
More file actions
executable file
·291 lines (252 loc) · 9.47 KB
/
test-deploy-docs-diff.sh
File metadata and controls
executable file
·291 lines (252 loc) · 9.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#!/usr/bin/env bash
# Regression tests for the NUL-delimited diff parser in deploy-docs.yaml.
# The workflow runs `git diff --name-status -z` into $DIFF_FILE and feeds
# the result through an awk script that emits <path>\t<status> lines.
# jq then slurps those lines into a JSON array. This script exercises
# the awk parser against synthetic NUL-delimited inputs so we can
# verify path escaping, rename handling, and unknown-status-code
# behavior without spinning up the full workflow.
#
# Keep `parse_diff` and `build_json_array` below in sync with
# deploy-docs.yaml. The workflow comment "Tested in
# test-deploy-docs-diff.sh" is the contract.
#
# Test inputs are passed to the parser as file paths (not via shell
# variables) because bash strips NUL bytes from command substitutions
# and parameter values. Each test writes its synthetic diff to a tmp
# file before invoking the parser, which is also how the workflow
# itself feeds the parser ($DIFF_FILE).
# Strict mode: abort on unhandled command failure, on expansion of an
# unset variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Private scratch directory for every fixture the tests create; the
# EXIT trap removes it on any exit path.
TMPDIR_SELF="$(mktemp -d)"
trap 'rm -rf "$TMPDIR_SELF"' EXIT
# parse_diff replicates the awk block in deploy-docs.yaml so we can
# exercise it without running the full workflow.
#
# Arguments: $1 - file holding NUL-delimited
#                 `git diff --name-status -z` output. awk runs with
#                 RS='\0', so every NUL-terminated field is one record.
# Outputs:   one <path>\t<status> line on stdout per recognized entry,
#            where status is added, modified, deleted or renamed.
#            One "::warning::Unknown git diff status ..." line on
#            stderr per unrecognized, non-empty status record.
#
# Dispatch looks at the FIRST character of the status record only:
#   A -> added      M -> modified      T -> modified (type change)
#   D -> deleted    R -> renamed (two path records follow, old then
#                        new; only the new path is emitted)
# Any other non-empty record is treated as an unknown status code: the
# next record is consumed as its path and nothing is written to
# stdout, so the record alignment stays correct for single-path
# entries. Empty records are skipped silently.
#
# NOTE(review): the getline return values are not checked. On a
# truncated input (a status record with no path record after it) the
# A/M/T/D branches would presumably emit the status token itself as
# the path - confirm whether that can happen in the workflow.
# NOTE(review): copy entries (C<score>) also carry two paths in git's
# output but would take the unknown branch here, which consumes only
# one record; presumably copy detection is never enabled for this
# diff - TODO confirm against deploy-docs.yaml.
parse_diff() {
awk -v RS='\0' '
function emit(path, status) {
printf "%s\t%s\n", path, status
}
{
code = substr($0, 1, 1)
if (code == "A") { getline; emit($0, "added"); next }
if (code == "M") { getline; emit($0, "modified"); next }
if (code == "T") { getline; emit($0, "modified"); next }
if (code == "D") { getline; emit($0, "deleted"); next }
if (code == "R") {
# R<similarity>\0<old>\0<new>\0
getline old_path
getline new_path
emit(new_path, "renamed")
next
}
if ($0 != "") {
unknown_code = $0
getline unknown_path
printf "::warning::Unknown git diff status %s for %s; skipping.\n", unknown_code, unknown_path > "/dev/stderr"
}
}
' "$1"
}
# build_json_array mirrors the jq slurp in deploy-docs.yaml.
# Arguments: $1 - file of <path>\t<status> lines (parse_diff output)
# Outputs:   a compact JSON array of {path, status} objects on stdout;
#            an empty file yields [].
build_json_array() {
  # --null-input plus `inputs` consumes every raw line exactly once;
  # each line is cut on tabs and its first two columns become the
  # object fields.
  jq --raw-input --compact-output --null-input '
    [ inputs
      | split("\t") as $cols
      | { path: $cols[0], status: $cols[1] }
    ]
  ' <"$1"
}
# write_nul_input writes a NUL-delimited diff fixture to a fresh tmp
# file under $TMPDIR_SELF and prints the file path on stdout.
# Globals:   TMPDIR_SELF (read)
# Arguments: any number of strings; each becomes one NUL-terminated
#            record, in order. No arguments yields an empty file.
write_nul_input() {
  local out_file
  out_file="$(mktemp -p "$TMPDIR_SELF")"
  # The records go straight into the file and only the file name is
  # returned on stdout: callers capture this function with $(...),
  # and bash cannot carry NUL bytes through a command substitution or
  # a variable. The argument-count guard matters because a bare
  # `printf '%s\0'` with no arguments would still emit one NUL.
  if (($# > 0)); then
    printf '%s\0' "$@"
  fi >"$out_file"
  printf '%s' "$out_file"
}
# Suite-wide state: `failures` counts failed assertions and decides
# the exit status at the bottom of the script; `section` holds the
# title of the section currently running.
failures=0
section=""
# start_section records the section title and prints a blank line
# followed by a "--- title ---" banner.
# Globals:   section (written)
# Arguments: $1 - section title
start_section() {
  section="$1"
  printf '\n--- %s ---\n' "$section"
}
# assert_parse runs parse_diff on a fixture file and compares its
# stdout with the expected text.
# Globals:   failures (incremented on mismatch)
# Arguments: $1 - test description
#            $2 - path to the NUL-delimited input file
#            $3 - expected parser stdout (trailing newlines are not
#                 compared; command substitution strips them)
# Outputs:   a PASS or FAIL report on stdout; parser stderr is
#            discarded.
assert_parse() {
  local description="$1" input_file="$2" expected="$3"
  local got
  got="$(parse_diff "$input_file" 2>/dev/null)"
  if [ "$got" != "$expected" ]; then
    # cat -A renders tabs as ^I and line ends as $, so whitespace
    # differences are visible in the report.
    printf 'FAIL: %s\n' "$description"
    printf ' expected: %s\n' "$(printf '%s' "$expected" | cat -A)"
    printf ' actual: %s\n' "$(printf '%s' "$got" | cat -A)"
    failures=$((failures + 1))
    return 0
  fi
  printf 'PASS: %s\n' "$description"
}
# assert_json runs the parser and then the JSON builder on a fixture
# and compares the resulting JSON text with the expected string.
# Globals:   TMPDIR_SELF (read), failures (incremented on mismatch)
# Arguments: $1 - test description
#            $2 - path to the NUL-delimited input file
#            $3 - expected JSON output, compared as an exact string
# Outputs:   a PASS or FAIL report on stdout; parser stderr is
#            discarded.
assert_json() {
  local description="$1" input_file="$2" expected="$3"
  local parsed_file json
  # build_json_array takes a file path, so the parser output is
  # staged in a scratch file first.
  parsed_file="$(mktemp -p "$TMPDIR_SELF")"
  parse_diff "$input_file" >"$parsed_file" 2>/dev/null
  json="$(build_json_array "$parsed_file")"
  if [ "$json" != "$expected" ]; then
    printf 'FAIL: %s\n' "$description"
    printf ' expected: %s\n' "$expected"
    printf ' actual: %s\n' "$json"
    failures=$((failures + 1))
    return 0
  fi
  printf 'PASS: %s\n' "$description"
}
# assert_warns checks that parse_diff writes a given text to stderr.
# Globals:   failures (incremented when the text is missing)
# Arguments: $1 - test description
#            $2 - path to the NUL-delimited input file
#            $3 - needle: literal text that must appear somewhere in
#                 the parser's stderr
# Outputs:   a PASS or FAIL report on stdout.
assert_warns() {
  local description="$1"
  local input_file="$2"
  local needle="$3"
  local stderr_out
  # Redirection order matters: 2>&1 points stderr at the capture
  # first, then >/dev/null drops stdout, so only stderr is captured.
  stderr_out="$(parse_diff "$input_file" 2>&1 >/dev/null)"
  # Literal substring match. The quoted needle inside the pattern is
  # not interpreted, so characters such as "." or "[" in a path cannot
  # act as wildcards and produce a false PASS. Matching in the shell
  # also avoids a `printf | grep -q` pipeline, which can fail under
  # pipefail when grep exits before printf has finished writing.
  if [[ "$stderr_out" == *"$needle"* ]]; then
    echo "PASS: $description"
  else
    echo "FAIL: $description"
    echo " needle: $needle"
    echo " stderr: $stderr_out"
    failures=$((failures + 1))
  fi
}
# assert_count_matches_emitter checks that the number of lines
# parse_diff emits equals the expected record count.
#
# This is the structural guarantee DEREM-21 calls out: counter and
# emitter must agree by construction. The count is derived from the
# emitter's own output with `wc -l`, so it always equals the number of
# <path>\t<status> lines emitted, even when the input contains
# unknown codes.
# Globals:   failures (incremented on mismatch)
# Arguments: $1 - test description
#            $2 - path to the NUL-delimited input file
#            $3 - expected number of emitted lines
# Outputs:   a PASS or FAIL report on stdout; parser stderr is
#            discarded.
assert_count_matches_emitter() {
  local description="$1" input_file="$2" expected_count="$3"
  local emitted
  # tr removes the space padding some wc implementations add.
  emitted="$(parse_diff "$input_file" 2>/dev/null | wc -l | tr -d ' ')"
  if [ "$emitted" != "$expected_count" ]; then
    printf 'FAIL: %s\n' "$description"
    printf ' expected count: %s\n' "$expected_count"
    printf ' actual count: %s\n' "$emitted"
    failures=$((failures + 1))
    return 0
  fi
  printf 'PASS: %s (count=%s)\n' "$description" "$emitted"
}
# ---------------------------------------------------------------
start_section "Status codes (covers DEREM-3 awk rewrite)"
# ---------------------------------------------------------------
# Each case feeds one or more status/path records through parse_diff
# and compares the exact <path>\t<status> lines it prints. Expected
# values use $'...' quoting so that \t and \n are real tab and newline
# bytes.
assert_parse "single added file" \
"$(write_nul_input 'A' 'docs/added.md')" \
$'docs/added.md\tadded'
assert_parse "single modified file" \
"$(write_nul_input 'M' 'docs/modified.md')" \
$'docs/modified.md\tmodified'
assert_parse "type-changed treated as modified" \
"$(write_nul_input 'T' 'docs/typechange.md')" \
$'docs/typechange.md\tmodified'
assert_parse "single deleted file" \
"$(write_nul_input 'D' 'docs/deleted.md')" \
$'docs/deleted.md\tdeleted'
# Rename records carry a similarity score after the R (R100, R85) and
# are followed by two path records, old then new. Only the new path
# should be emitted.
assert_parse "rename indexes the new path" \
"$(write_nul_input 'R100' 'docs/old.md' 'docs/new.md')" \
$'docs/new.md\trenamed'
assert_parse "multiple mixed records" \
"$(write_nul_input 'A' 'docs/a.md' 'M' 'docs/b.md' 'D' 'docs/c.md')" \
$'docs/a.md\tadded\ndocs/b.md\tmodified\ndocs/c.md\tdeleted'
# A three-record rename between two-record entries checks that the
# parser stays aligned on the record that follows the rename.
assert_parse "rename interleaved with simple records" \
"$(write_nul_input 'A' 'docs/a.md' 'R85' 'docs/old.md' 'docs/new.md' 'D' 'docs/c.md')" \
$'docs/a.md\tadded\ndocs/new.md\trenamed\ndocs/c.md\tdeleted'
# Zero-byte input. The explicit truncation makes the "no bytes at
# all" intent obvious. $empty_file is reused by the jq empty-array
# case in the path-escaping section.
empty_file="$(mktemp -p "$TMPDIR_SELF")"
: >"$empty_file"
assert_parse "empty input emits nothing" "$empty_file" ""
# ---------------------------------------------------------------
start_section "Path escaping (covers DEREM-2 path-injection rewrite)"
# ---------------------------------------------------------------
# The parser must pass path bytes through untouched; escaping is the
# job of jq in the JSON step (see the assert_json cases below).
assert_parse "path with spaces survives" \
"$(write_nul_input 'M' 'docs/file with space.md')" \
$'docs/file with space.md\tmodified'
assert_parse "path with double quote survives raw" \
"$(write_nul_input 'M' 'docs/quote".md')" \
$'docs/quote".md\tmodified'
# The input is single-quoted, so it contains one literal backslash;
# the expected value is $'...'-quoted, where \\ also denotes one
# backslash.
assert_parse "path with backslash survives raw" \
"$(write_nul_input 'M' 'docs/back\slash.md')" \
$'docs/back\\slash.md\tmodified'
# Tab inside a path: the awk parser splits records on NUL only, so a
# tab inside the path field passes through verbatim and the emitted
# line ends up containing two tabs. jq's split on tab then yields
# three elements, so .[1] (the "status") would be the tail of the path
# rather than the real status. We don't defend against this at the
# parser layer because real-world doc paths are not expected to
# contain tabs. NOTE(review): git's documentation says -z output is
# written verbatim, without the quoting applied in non -z mode, so git
# itself would not escape such a tab - confirm before relying on it.
# Capture the current parser behavior so a future change is visible.
assert_parse "tab in path preserved raw by parser" \
"$(write_nul_input 'M' $'docs/has\ttab.md')" \
$'docs/has\ttab.md\tmodified'
# Expected JSON is single-quoted, so \" and \\ below are the literal
# JSON escape sequences jq is expected to produce.
assert_json "jq escapes double quote in JSON output" \
"$(write_nul_input 'M' 'docs/quote".md')" \
'[{"path":"docs/quote\".md","status":"modified"}]'
assert_json "jq escapes backslash in JSON output" \
"$(write_nul_input 'M' 'docs/back\slash.md')" \
'[{"path":"docs/back\\slash.md","status":"modified"}]'
assert_json "jq emits empty array for empty input" "$empty_file" "[]"
# ---------------------------------------------------------------
start_section "Unknown status codes (DEREM-21 structural guarantee)"
# ---------------------------------------------------------------
# This is the exact case the reviewer reproduced. Old design diverged:
# counter awk said 2, emitter awk said 1. New design has a single awk
# whose output is the source of truth for both.
#
# X and Y are not status letters parse_diff recognizes, so they take
# its unknown-status branch. Each assertion builds its own copy of the
# fixture because write_nul_input creates a fresh file per call.
assert_parse "unknown code consumes its path, valid record after is preserved" \
"$(write_nul_input 'X' 'docs/a.md' 'M' 'docs/real.md')" \
$'docs/real.md\tmodified'
assert_warns "unknown code emits a workflow warning" \
"$(write_nul_input 'X' 'docs/a.md' 'M' 'docs/real.md')" \
'::warning::Unknown git diff status X for docs/a.md'
# Counts are the number of lines the parser emits, so skipped unknown
# records contribute nothing and a rename contributes exactly one.
assert_count_matches_emitter "count matches emitter when an unknown code is skipped" \
"$(write_nul_input 'X' 'docs/a.md' 'M' 'docs/real.md')" \
"1"
assert_count_matches_emitter "count matches emitter for a clean batch" \
"$(write_nul_input 'A' 'docs/a.md' 'M' 'docs/b.md' 'D' 'docs/c.md')" \
"3"
assert_count_matches_emitter "rename counts as one record, not two" \
"$(write_nul_input 'R100' 'docs/old.md' 'docs/new.md')" \
"1"
assert_count_matches_emitter "all unknown produces zero" \
"$(write_nul_input 'X' 'docs/a.md' 'Y' 'docs/b.md')" \
"0"
# ---------------------------------------------------------------
start_section "Sanity checks"
# ---------------------------------------------------------------
# 50-file boundary at the parser layer. The cap-at-50 decision lives
# above this parser in the workflow, but the parser must handle the
# boundary input correctly regardless.
big_input="$(mktemp -p "$TMPDIR_SELF")"
# Write 50 "M" records (status, NUL, zero-padded path, NUL) straight
# to the file. A Bash arithmetic loop avoids spawning the external,
# non-POSIX `seq` just to count over a fixed range.
for ((i = 1; i <= 50; i++)); do
  printf 'M\0docs/big-%02d.md\0' "$i"
done >"$big_input"
assert_count_matches_emitter "50 records parse to 50 lines" "$big_input" "50"
# Final verdict: any recorded failure makes the script exit with
# status 1; otherwise report success.
if [ "$failures" -gt 0 ]; then
  printf '\n%s test(s) failed.\n' "$failures"
  exit 1
fi
printf '\nAll tests passed.\n'