From 10f1021f70678bb45d7f69852fd51992cbe31364 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Wed, 15 Apr 2026 19:36:11 +0000 Subject: [PATCH 1/3] fix(scaletest/prebuilds): fix Runner.Cleanup() to delete workspaces before template The API rejects template deletion with a 400 while workspaces still exist. Cleanup() now lists all workspaces for the template, deletes each via workspacebuild.NewCleanupRunner with up to 3 retries, then deletes the template. If Run() fails with a 409 conflict (template already exists from a prior failed run), the existing template is looked up so Cleanup() can remove it rather than silently skipping it. --- scaletest/prebuilds/run.go | 103 ++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/scaletest/prebuilds/run.go b/scaletest/prebuilds/run.go index 0808f5ebb1929..49732ffdf10a9 100644 --- a/scaletest/prebuilds/run.go +++ b/scaletest/prebuilds/run.go @@ -6,6 +6,7 @@ import ( _ "embed" "html/template" "io" + "net/http" "time" "github.com/google/uuid" @@ -17,6 +18,7 @@ import ( "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/scaletest/harness" "github.com/coder/coder/v2/scaletest/loadtestutil" + "github.com/coder/coder/v2/scaletest/workspacebuild" ) type Runner struct { @@ -77,6 +79,22 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { } templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq) if err != nil { + // If the template already exists from a previous failed run, look it up so + // Cleanup() can delete it and the rerun doesn't leave orphaned resources. + var sdkErr *codersdk.Error + if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict { + existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{ + OrganizationID: r.cfg.OrganizationID, + ExactName: templateName, + }) + if listErr == nil && len(existing) > 0 { + r.template = existing[0] + logger.Warn(ctx, "template already exists from a previous run, will be cleaned up", + slog.F("template_name", r.template.Name), + slog.F("template_id", r.template.ID), + ) + } + } r.cfg.Metrics.AddError(templateName, "create_template") return xerrors.Errorf("create template: %w", err) } @@ -305,10 +323,69 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { logs = loadtestutil.NewSyncWriter(logs) logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug) - logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) + // If Run failed before the template was created, there is nothing to clean up. + if r.template.ID == uuid.Nil { + logger.Info(ctx, "template was never created, skipping cleanup") + return nil + } - err := r.client.DeleteTemplate(ctx, r.template.ID) + // Workspaces must be deleted before the template can be deleted. + workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name) if err != nil { + return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err) + } + + logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name)) + + // Retry failed workspace deletions up to maxDeletionAttempts times to + // handle transient errors (e.g. a delete build that fails due to a + // provisioner hiccup). + const maxDeletionAttempts = 3 + remaining := workspaces + for attempt := range maxDeletionAttempts { + if len(remaining) == 0 { + break + } + if attempt > 0 { + logger.Info(ctx, "retrying workspace deletions", + slog.F("attempt", attempt+1), + slog.F("remaining", len(remaining)), + slog.F("template_name", r.template.Name), + ) + } + var failed []codersdk.Workspace + for _, ws := range remaining { + cr := workspacebuild.NewCleanupRunner(r.client, ws.ID) + if err := cr.Run(ctx, ws.ID.String(), logs); err != nil { + logger.Warn(ctx, "failed to delete workspace", + slog.F("workspace_id", ws.ID), + slog.F("workspace_name", ws.Name), + slog.Error(err), + ) + failed = append(failed, ws) + } + } + remaining = failed + } + + if len(remaining) > 0 { + ids := make([]string, len(remaining)) + for i, ws := range remaining { + ids[i] = ws.ID.String() + } + logger.Error(ctx, "CLEANUP INCOMPLETE: could not delete all workspaces after retries; template deletion will likely fail", + slog.F("template_name", r.template.Name), + slog.F("remaining_count", len(remaining)), + slog.F("remaining_workspace_ids", ids), + ) + } + + // Always attempt template deletion even if some workspaces could not be + // removed. The delete call will fail with a 400 if workspaces remain, but + // the error — combined with the log above — makes the state clear to the + // operator. + logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) + if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil { return xerrors.Errorf("delete template: %w", err) } @@ -316,6 +393,28 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { return nil } +// allWorkspacesForTemplate returns all workspaces belonging to templateName, +// paginating through results until exhausted. +func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) { + const pageSize = 100 + var workspaces []codersdk.Workspace + for page := 0; ; page++ { + resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{ + Template: templateName, + Offset: page * pageSize, + Limit: pageSize, + }) + if err != nil { + return nil, xerrors.Errorf("list workspaces page %d: %w", page, err) + } + workspaces = append(workspaces, resp.Workspaces...) + if len(resp.Workspaces) < pageSize { + break + } + } + return workspaces, nil +} + //go:embed tf/main.tf.tpl var templateContent string From 3a556228c914b347e03b2162e85941287be474c3 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Wed, 15 Apr 2026 19:36:18 +0000 Subject: [PATCH 2/3] fix(scaletest/prebuilds): fail immediately if workspaces remain after deletion retries Log every deletion attempt (not just retries) and return an error if workspaces remain after all attempts, rather than logging an error and proceeding to template deletion which would fail anyway. Co-Authored-By: Claude Sonnet 4.6 --- scaletest/prebuilds/run.go | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/scaletest/prebuilds/run.go b/scaletest/prebuilds/run.go index 49732ffdf10a9..a9a8afd1f0b73 100644 --- a/scaletest/prebuilds/run.go +++ b/scaletest/prebuilds/run.go @@ -346,13 +346,11 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { if len(remaining) == 0 { break } - if attempt > 0 { - logger.Info(ctx, "retrying workspace deletions", - slog.F("attempt", attempt+1), - slog.F("remaining", len(remaining)), - slog.F("template_name", r.template.Name), - ) - } + logger.Info(ctx, "trying to delete workspaces", + slog.F("attempt", attempt+1), + slog.F("remaining", len(remaining)), + slog.F("template_name", r.template.Name), + ) var failed []codersdk.Workspace for _, ws := range remaining { cr := workspacebuild.NewCleanupRunner(r.client, ws.ID) @@ -373,17 +371,9 @@ func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { for i, ws := range remaining { ids[i] = ws.ID.String() } - logger.Error(ctx, "CLEANUP INCOMPLETE: could not delete all workspaces after retries; template deletion will likely fail", - slog.F("template_name", r.template.Name), - slog.F("remaining_count", len(remaining)), - slog.F("remaining_workspace_ids", ids), - ) + return xerrors.Errorf("could not delete all workspaces after %d attempts; remaining: %v", maxDeletionAttempts, ids) } - // Always attempt template deletion even if some workspaces could not be - // removed. The delete call will fail with a 400 if workspaces remain, but - // the error — combined with the log above — makes the state clear to the - // operator. logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil { return xerrors.Errorf("delete template: %w", err) From 75634262257d1c0788bcbad42d179de2924fbe85 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Tue, 21 Apr 2026 21:16:00 +0000 Subject: [PATCH 3/3] fix(scaletest/prebuilds): extract clearTemplatePrebuilds helper and call it on orphaned templates Extracts the empty-template-version logic into a reusable helper and calls it when an orphaned template is detected, preventing the reconciler from spawning new workspaces while Cleanup() drains them. Co-authored-by: Claude Sonnet 4.6 --- scaletest/prebuilds/run.go | 44 ++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/scaletest/prebuilds/run.go b/scaletest/prebuilds/run.go index a9a8afd1f0b73..1ffa25f016894 100644 --- a/scaletest/prebuilds/run.go +++ b/scaletest/prebuilds/run.go @@ -93,6 +93,15 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { slog.F("template_name", r.template.Name), slog.F("template_id", r.template.ID), ) + // Clear any prebuild config on the orphaned template so the + // reconciler doesn't keep spawning workspaces while Cleanup() + // is trying to delete them. + if clearErr := r.pushEmptyTemplateVersion(ctx); clearErr != nil { + logger.Warn(ctx, "failed to clear prebuilds config on orphaned template", + slog.F("template_id", r.template.ID), + slog.Error(clearErr), + ) + } } } r.cfg.Metrics.AddError(templateName, "create_template") @@ -123,21 +132,12 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { r.cfg.DeletionSetupBarrier.Wait() logger.Info(ctx, "prebuilds paused, preparing for deletion") - // Now prepare for deletion by creating an empty template version - // At this point, prebuilds should be paused by the caller + // Now prepare for deletion by creating an empty template version. + // At this point, prebuilds should be paused by the caller. logger.Info(ctx, "creating empty template version for deletion") - emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0) - if err != nil { - r.cfg.Metrics.AddError(r.template.Name, "create_empty_template_version") - return xerrors.Errorf("create empty template version for deletion: %w", err) - } - - err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{ - ID: emptyVersion.ID, - }) - if err != nil { - r.cfg.Metrics.AddError(r.template.Name, "update_active_template_version") - return xerrors.Errorf("update active template version to empty for deletion: %w", err) + if err = r.pushEmptyTemplateVersion(ctx); err != nil { + r.cfg.Metrics.AddError(r.template.Name, "clear_template_prebuilds") + return xerrors.Errorf("clear template prebuilds for deletion: %w", err) } logger.Info(ctx, "waiting for all runners to reach deletion barrier") @@ -319,6 +319,22 @@ func (r *Runner) createTemplateVersion(ctx context.Context, templateID uuid.UUID var errTickerDone = xerrors.New("done") +// pushEmptyTemplateVersion pushes a new empty template version (no presets, no +// prebuilds) and makes it active. This stops the reconciler from spawning new +// prebuild workspaces for the template. +func (r *Runner) pushEmptyTemplateVersion(ctx context.Context) error { + emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0) + if err != nil { + return xerrors.Errorf("create empty template version: %w", err) + } + if err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{ + ID: emptyVersion.ID, + }); err != nil { + return xerrors.Errorf("update active template version: %w", err) + } + return nil +} + func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { logs = loadtestutil.NewSyncWriter(logs) logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)