From b60f6f02e935b47a491f750324acaede07f4671e Mon Sep 17 00:00:00 2001 From: Scott Miller Date: Tue, 9 Jun 2026 20:18:30 +0000 Subject: [PATCH 1/3] test(testutil): retry terraform provider cache population on transient failures The provider cache population path in DownloadTFProviders shells out to terraform init and terraform providers mirror against the live registry on a cache miss. These intermittently fail with transient registry/GitHub 5xx errors, failing the whole test. Retry runCmd up to 3 times with exponential backoff on any non-zero exit. Re-running the commands in the same working directory is safe because both are idempotent. --- testutil/terraform_cache.go | 41 ++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/testutil/terraform_cache.go b/testutil/terraform_cache.go index 1d6f27ede2802..6679a9c902d46 100644 --- a/testutil/terraform_cache.go +++ b/testutil/terraform_cache.go @@ -4,6 +4,7 @@ package testutil import ( "bytes" + "context" "crypto/sha256" "encoding/hex" "fmt" @@ -13,8 +14,11 @@ import ( "slices" "strings" "testing" + "time" "github.com/stretchr/testify/require" + + "github.com/coder/retry" ) const ( @@ -85,17 +89,40 @@ func WriteTFCliConfig(t *testing.T, dir string) string { return cliConfigPath } +// runCmdMaxAttempts is the number of times runCmd attempts a command before +// failing the test. These commands populate the provider cache from the +// network, which intermittently returns transient registry/GitHub 5xx errors. +// Retrying any non-zero exit absorbs those flakes. Re-running the commands in +// the same working directory is safe because `terraform init` and `terraform +// providers mirror` are idempotent: each run reconciles the existing state. +const runCmdMaxAttempts = 3 + func runCmd(t *testing.T, dir string, args ...string) { t.Helper() - stdout, stderr := bytes.NewBuffer(nil), bytes.NewBuffer(nil) - cmd := exec.Command(args[0], args[1:]...) //#nosec - cmd.Dir = dir - cmd.Stdout = stdout - cmd.Stderr = stderr - if err := cmd.Run(); err != nil { - t.Fatalf("failed to run %s: %s\nstdout: %s\nstderr: %s", strings.Join(args, " "), err, stdout.String(), stderr.String()) + ctx := context.Background() + var ( + attempt int + lastErr error + lastStdout, lastStderr string + ) + for r := retry.New(time.Second, 15*time.Second); attempt < runCmdMaxAttempts && r.Wait(ctx); attempt++ { + stdout, stderr := bytes.NewBuffer(nil), bytes.NewBuffer(nil) + cmd := exec.Command(args[0], args[1:]...) //#nosec + cmd.Dir = dir + cmd.Stdout = stdout + cmd.Stderr = stderr + err := cmd.Run() + if err == nil { + return + } + lastErr = err + lastStdout, lastStderr = stdout.String(), stderr.String() + t.Logf("attempt %d/%d to run %s failed: %s\nstderr: %s", + attempt+1, runCmdMaxAttempts, strings.Join(args, " "), err, lastStderr) } + t.Fatalf("failed to run %s after %d attempts: %s\nstdout: %s\nstderr: %s", + strings.Join(args, " "), runCmdMaxAttempts, lastErr, lastStdout, lastStderr) } // GetTestTFCacheDir returns a unique cache directory path based on the test name and template files. From ff41a97f6277fd7d10989b21e37a46451381b740 Mon Sep 17 00:00:00 2001 From: Scott Miller Date: Tue, 9 Jun 2026 20:38:05 +0000 Subject: [PATCH 2/3] test(testutil): widen provider cache retry window Use 5 attempts with retry.New(5s, 30s) (~1 minute) instead of 3 attempts over ~4s. Registry/GitHub incidents typically last seconds to minutes, so a short window only survives an isolated blip. The wait is only incurred on a cache miss, so the cost is negligible against the per-package test timeout. --- testutil/terraform_cache.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/testutil/terraform_cache.go b/testutil/terraform_cache.go index 6679a9c902d46..dc6fe9c93efb8 100644 --- a/testutil/terraform_cache.go +++ b/testutil/terraform_cache.go @@ -95,7 +95,14 @@ func WriteTFCliConfig(t *testing.T, dir string) string { // Retrying any non-zero exit absorbs those flakes. Re-running the commands in // the same working directory is safe because `terraform init` and `terraform // providers mirror` are idempotent: each run reconciles the existing state. -const runCmdMaxAttempts = 3 +// +// The backoff window is deliberately wide (5 attempts over roughly a minute via +// retry.New(5s, 30s)). Registry/GitHub incidents typically last seconds to +// minutes rather than a single request, so a short window would only survive an +// isolated blip. The cost is acceptable because this network path runs only on +// a cache miss (see DownloadTFProviders), not on every test, so the wait is +// rarely incurred and is negligible against the per-package test timeout. +const runCmdMaxAttempts = 5 func runCmd(t *testing.T, dir string, args ...string) { t.Helper() @@ -106,7 +113,7 @@ func runCmd(t *testing.T, dir string, args ...string) { lastErr error lastStdout, lastStderr string ) - for r := retry.New(time.Second, 15*time.Second); attempt < runCmdMaxAttempts && r.Wait(ctx); attempt++ { + for r := retry.New(5*time.Second, 30*time.Second); attempt < runCmdMaxAttempts && r.Wait(ctx); attempt++ { stdout, stderr := bytes.NewBuffer(nil), bytes.NewBuffer(nil) cmd := exec.Command(args[0], args[1:]...) //#nosec cmd.Dir = dir From b5582fc7cf576313f005ff4702683f7fb2a5bd0a Mon Sep 17 00:00:00 2001 From: Scott Miller Date: Tue, 9 Jun 2026 20:38:44 +0000 Subject: [PATCH 3/3] test(testutil): tighten cache retry comment --- testutil/terraform_cache.go | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/testutil/terraform_cache.go b/testutil/terraform_cache.go index dc6fe9c93efb8..fada5d536b2ff 100644 --- a/testutil/terraform_cache.go +++ b/testutil/terraform_cache.go @@ -92,16 +92,12 @@ func WriteTFCliConfig(t *testing.T, dir string) string { // runCmdMaxAttempts is the number of times runCmd attempts a command before // failing the test. These commands populate the provider cache from the // network, which intermittently returns transient registry/GitHub 5xx errors. -// Retrying any non-zero exit absorbs those flakes. Re-running the commands in -// the same working directory is safe because `terraform init` and `terraform -// providers mirror` are idempotent: each run reconciles the existing state. +// Retrying any non-zero exit absorbs those flakes, and is safe because +// `terraform init` and `terraform providers mirror` are idempotent. // -// The backoff window is deliberately wide (5 attempts over roughly a minute via -// retry.New(5s, 30s)). Registry/GitHub incidents typically last seconds to -// minutes rather than a single request, so a short window would only survive an -// isolated blip. The cost is acceptable because this network path runs only on -// a cache miss (see DownloadTFProviders), not on every test, so the wait is -// rarely incurred and is negligible against the per-package test timeout. +// The window is wide (5 attempts over ~1 minute via retry.New(5s, 30s)) because +// registry/GitHub incidents last seconds to minutes, not a single request. It +// is cheap because this path runs only on a cache miss (see DownloadTFProviders). const runCmdMaxAttempts = 5 func runCmd(t *testing.T, dir string, args ...string) {