Skip to content

Commit 444d4e7

Browse files
fix[ci]: aws s3 download retry more (#6466)
Our CI can race; allow more download retries. See: https://github.com/vortex-data/vortex/actions/runs/21955482131/job/63419566060 --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent e0de842 commit 444d4e7

7 files changed

Lines changed: 172 additions & 33 deletions

File tree

.github/workflows/bench-pr.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,8 @@ jobs:
115115
| jq -r '.workflow_runs[].head_sha' \
116116
)
117117
118-
aws s3 cp s3://vortex-benchmark-results-database/data.json.gz - --no-sign-request \
119-
| gzip -d \
120-
| grep $base_commit_sha \
121-
> base.json
118+
python3 scripts/s3-download.py s3://vortex-benchmark-results-database/data.json.gz data.json.gz --no-sign-request
119+
gzip -d -c data.json.gz | grep $base_commit_sha > base.json
122120
123121
echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
124122
echo '' >> comment.md

.github/workflows/fuzz-coverage.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
CORPUS_KEY="${{ matrix.fuzz_target }}_corpus.tar.zst"
6565
CORPUS_DIR="fuzz/corpus/${{ matrix.fuzz_target }}"
6666
67-
if aws s3 cp "s3://vortex-fuzz-corpus/$CORPUS_KEY" . 2>/dev/null; then
67+
if python3 scripts/s3-download.py "s3://vortex-fuzz-corpus/$CORPUS_KEY" "$CORPUS_KEY"; then
6868
echo "Downloaded corpus successfully"
6969
tar -xf "$CORPUS_KEY"
7070
else

.github/workflows/minimize_fuzz_corpus_workflow.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ jobs:
7373
CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
7474
CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
7575
76-
if aws s3 cp "s3://vortex-fuzz-corpus/$CORPUS_KEY" . 2>/dev/null; then
76+
if python3 scripts/s3-download.py "s3://vortex-fuzz-corpus/$CORPUS_KEY" "$CORPUS_KEY"; then
7777
echo "Downloaded corpus successfully"
7878
tar -xf "$CORPUS_KEY"
7979
else
@@ -108,4 +108,4 @@ jobs:
108108
CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
109109
CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
110110
tar -acf "$CORPUS_KEY" "$CORPUS_DIR"
111-
aws s3api put-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32
111+
python3 scripts/s3-upload.py --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32

.github/workflows/run-fuzzer.yml

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -92,21 +92,8 @@ jobs:
9292
CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
9393
CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
9494
95-
# Try to get ETag for optimistic locking on upload
96-
if aws s3api head-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --query ETag --output text > current_etag 2>/dev/null; then
97-
echo "Found existing corpus at s3://vortex-fuzz-corpus/$CORPUS_KEY"
98-
else
99-
echo ""
100-
echo "=========================================="
101-
echo "WARNING: No existing corpus found for ${{ inputs.fuzz_target }}"
102-
echo "This is expected for new fuzzers. Starting with empty corpus."
103-
echo "=========================================="
104-
echo ""
105-
echo '""' > current_etag
106-
fi
107-
10895
# Try to download corpus
109-
if aws s3 cp "s3://vortex-fuzz-corpus/$CORPUS_KEY" . 2>/dev/null; then
96+
if python3 scripts/s3-download.py "s3://vortex-fuzz-corpus/$CORPUS_KEY" "$CORPUS_KEY"; then
11097
echo "Downloaded corpus successfully"
11198
tar -xf "$CORPUS_KEY"
11299
else
@@ -183,14 +170,7 @@ jobs:
183170
184171
tar -acf "$CORPUS_KEY" "$CORPUS_DIR"
185172
186-
ETAG=$(cat current_etag)
187-
if [ "$ETAG" = '""' ] || [ -z "$ETAG" ]; then
188-
# New corpus, no ETag check needed
189-
aws s3api put-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32
190-
else
191-
# Existing corpus, use optimistic locking
192-
aws s3api put-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32 --if-match "$ETAG"
193-
fi
173+
python3 scripts/s3-upload.py --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32 --optimistic-lock
194174
195175
- name: Fail job if fuzz run found a bug
196176
if: steps.check.outputs.crashes_found == 'true'

.github/workflows/sql-benchmarks.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,8 @@ jobs:
231231
| jq -r '.workflow_runs[].head_sha' \
232232
)
233233
234-
aws s3 cp s3://vortex-benchmark-results-database/data.json.gz - --no-sign-request \
235-
| gzip -d \
236-
| grep $base_commit_sha \
237-
> base.json
234+
python3 scripts/s3-download.py s3://vortex-benchmark-results-database/data.json.gz data.json.gz --no-sign-request
235+
gzip -d -c data.json.gz | grep $base_commit_sha > base.json
238236
239237
echo '# Benchmarks: ${{ matrix.name }}' > comment.md
240238
echo '' >> comment.md

scripts/s3-download.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: Apache-2.0
3+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
4+
5+
"""Download a file from S3 with exponential backoff retry."""
6+
7+
import argparse
8+
import subprocess
9+
import sys
10+
import time
11+
12+
13+
def main():
    """Entry point: download one S3 object to a local file.

    Shells out to the AWS CLI (``aws s3 cp``) and retries failed attempts
    with exponential backoff capped at 30 seconds. Exits with status 1
    once every attempt has failed.
    """
    parser = argparse.ArgumentParser(description="Download a file from S3 with retry")
    parser.add_argument("s3_url", help="S3 URL to download (e.g. s3://bucket/key)")
    parser.add_argument("output", help="Local output file path")
    parser.add_argument(
        "--no-sign-request",
        action="store_true",
        help="Do not sign the request (for public buckets)",
    )
    parser.add_argument("--max-retries", type=int, default=5, help="Maximum number of retries")
    args = parser.parse_args()

    # Base command; the CLI inherits our stdout/stderr so its own output is visible.
    cmd = ["aws", "s3", "cp", args.s3_url, args.output]
    if args.no_sign_request:
        cmd.append("--no-sign-request")

    attempt = 0
    while attempt < args.max_retries:
        attempt += 1
        if subprocess.run(cmd).returncode == 0:
            # Success: nothing more to do.
            return
        if attempt < args.max_retries:
            # Exponential backoff, capped so we never sleep more than 30s.
            delay = min(2**attempt, 30)
            print(
                f"S3 download failed (attempt {attempt}/{args.max_retries}), retrying in {delay}s...",
                file=sys.stderr,
            )
            time.sleep(delay)

    print(
        f"S3 download failed after {args.max_retries} attempts",
        file=sys.stderr,
    )
    sys.exit(1)
49+
50+
51+
# Run only when executed as a script, so the module can be imported side-effect free.
if __name__ == "__main__":
    main()

scripts/s3-upload.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: Apache-2.0
3+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
4+
5+
"""Upload a file to S3 with exponential backoff retry and optional optimistic locking."""
6+
7+
import argparse
8+
import subprocess
9+
import sys
10+
import time
11+
12+
13+
def head_etag(bucket: str, key: str) -> str | None:
    """Return the object's current ETag via ``aws s3api head-object``.

    Returns None when the CLI call fails for any reason (object missing
    or otherwise — stderr is captured and discarded) or when the queried
    field comes back empty / as the literal string "null".
    """
    cmd = [
        "aws",
        "s3api",
        "head-object",
        "--bucket",
        bucket,
        "--key",
        key,
        "--query",
        "ETag",
        "--output",
        "text",
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        # Any failure (including a missing object) is reported as "no ETag".
        return None
    etag = proc.stdout.strip()
    # `--query ... --output text` prints "null" when the field is absent.
    return etag if etag and etag != "null" else None
38+
39+
40+
def put_object(
    bucket: str,
    key: str,
    body: str,
    checksum_algorithm: str | None,
    if_match: str | None,
) -> bool:
    """Run ``aws s3api put-object``; return True iff the CLI exits 0.

    ``--checksum-algorithm`` and ``--if-match`` are appended only when the
    corresponding argument is truthy (non-None, non-empty).
    """
    cmd = ["aws", "s3api", "put-object", "--bucket", bucket, "--key", key, "--body", body]
    if checksum_algorithm:
        cmd += ["--checksum-algorithm", checksum_algorithm]
    if if_match:
        cmd += ["--if-match", if_match]
    return subprocess.run(cmd).returncode == 0
66+
67+
68+
def main():
    """Entry point: upload one file to S3, retrying with exponential backoff.

    With --optimistic-lock the object's current ETag is re-read before
    every attempt and passed as ``--if-match``, so a concurrent writer
    makes the put fail and triggers a retry instead of a silent overwrite.
    Exits with status 1 once every attempt has failed.
    """
    parser = argparse.ArgumentParser(description="Upload a file to S3 with retry and optional optimistic locking")
    parser.add_argument("--bucket", required=True, help="S3 bucket name")
    parser.add_argument("--key", required=True, help="S3 object key")
    parser.add_argument("--body", required=True, help="Local file to upload")
    parser.add_argument("--checksum-algorithm", help="Checksum algorithm (e.g. CRC32)")
    parser.add_argument(
        "--optimistic-lock",
        action="store_true",
        help="Use ETag-based optimistic locking (re-fetches ETag on each retry)",
    )
    parser.add_argument("--max-retries", type=int, default=5, help="Maximum number of retries")
    args = parser.parse_args()

    attempt = 0
    while attempt < args.max_retries:
        attempt += 1
        # Re-read the ETag on every attempt so an update made by a
        # concurrent writer between retries is still detected. A None
        # ETag (first-ever upload — or, NOTE(review): a transient head
        # failure, which silently drops the lock) uploads unguarded.
        if_match = head_etag(args.bucket, args.key) if args.optimistic_lock else None

        if put_object(args.bucket, args.key, args.body, args.checksum_algorithm, if_match):
            print("Upload successful.")
            return

        if attempt < args.max_retries:
            # Exponential backoff, capped so we never sleep more than 30s.
            delay = min(2**attempt, 30)
            print(
                f"S3 upload failed (attempt {attempt}/{args.max_retries}), retrying in {delay}s...",
                file=sys.stderr,
            )
            time.sleep(delay)

    print(
        f"S3 upload failed after {args.max_retries} attempts",
        file=sys.stderr,
    )
    sys.exit(1)
108+
109+
110+
# Run only when executed as a script, so the module can be imported side-effect free.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)