-
Notifications
You must be signed in to change notification settings - Fork 1.3k
fix(scaletest/prebuilds): fix Runner.Cleanup() to delete workspaces #23627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ import ( | |
| _ "embed" | ||
| "html/template" | ||
| "io" | ||
| "net/http" | ||
| "time" | ||
|
|
||
| "github.com/google/uuid" | ||
|
|
@@ -17,6 +18,7 @@ import ( | |
| "github.com/coder/coder/v2/codersdk" | ||
| "github.com/coder/coder/v2/scaletest/harness" | ||
| "github.com/coder/coder/v2/scaletest/loadtestutil" | ||
| "github.com/coder/coder/v2/scaletest/workspacebuild" | ||
| ) | ||
|
|
||
| type Runner struct { | ||
|
|
@@ -77,6 +79,31 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { | |
| } | ||
| templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq) | ||
| if err != nil { | ||
| // If the template already exists from a previous failed run, look it up so | ||
| // Cleanup() can delete it and the rerun doesn't leave orphaned resources. | ||
| var sdkErr *codersdk.Error | ||
| if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict { | ||
| existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{ | ||
| OrganizationID: r.cfg.OrganizationID, | ||
| ExactName: templateName, | ||
| }) | ||
| if listErr == nil && len(existing) > 0 { | ||
| r.template = existing[0] | ||
| logger.Warn(ctx, "template already exists from a previous run, will be cleaned up", | ||
| slog.F("template_name", r.template.Name), | ||
| slog.F("template_id", r.template.ID), | ||
| ) | ||
| // Clear any prebuild config on the orphaned template so the | ||
| // reconciler doesn't keep spawning workspaces while Cleanup() | ||
| // is trying to delete them. | ||
| if clearErr := r.pushEmptyTemplateVersion(ctx); clearErr != nil { | ||
| logger.Warn(ctx, "failed to clear prebuilds config on orphaned template", | ||
| slog.F("template_id", r.template.ID), | ||
| slog.Error(clearErr), | ||
| ) | ||
| } | ||
| } | ||
| } | ||
| r.cfg.Metrics.AddError(templateName, "create_template") | ||
| return xerrors.Errorf("create template: %w", err) | ||
| } | ||
|
|
@@ -105,21 +132,12 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { | |
| r.cfg.DeletionSetupBarrier.Wait() | ||
| logger.Info(ctx, "prebuilds paused, preparing for deletion") | ||
|
|
||
| // Now prepare for deletion by creating an empty template version | ||
| // At this point, prebuilds should be paused by the caller | ||
| // Now prepare for deletion by creating an empty template version. | ||
| // At this point, prebuilds should be paused by the caller. | ||
| logger.Info(ctx, "creating empty template version for deletion") | ||
| emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0) | ||
| if err != nil { | ||
| r.cfg.Metrics.AddError(r.template.Name, "create_empty_template_version") | ||
| return xerrors.Errorf("create empty template version for deletion: %w", err) | ||
| } | ||
|
|
||
| err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{ | ||
| ID: emptyVersion.ID, | ||
| }) | ||
| if err != nil { | ||
| r.cfg.Metrics.AddError(r.template.Name, "update_active_template_version") | ||
| return xerrors.Errorf("update active template version to empty for deletion: %w", err) | ||
| if err = r.pushEmptyTemplateVersion(ctx); err != nil { | ||
| r.cfg.Metrics.AddError(r.template.Name, "clear_template_prebuilds") | ||
| return xerrors.Errorf("clear template prebuilds for deletion: %w", err) | ||
| } | ||
|
|
||
| logger.Info(ctx, "waiting for all runners to reach deletion barrier") | ||
|
|
@@ -301,21 +319,108 @@ func (r *Runner) createTemplateVersion(ctx context.Context, templateID uuid.UUID | |
|
|
||
| var errTickerDone = xerrors.New("done") | ||
|
|
||
| // pushEmptyTemplateVersion pushes a new empty template version (no presets, no | ||
| // prebuilds) and makes it active. This stops the reconciler from spawning new | ||
| // prebuild workspaces for the template. | ||
| func (r *Runner) pushEmptyTemplateVersion(ctx context.Context) error { | ||
| emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0) | ||
| if err != nil { | ||
| return xerrors.Errorf("create empty template version: %w", err) | ||
| } | ||
| if err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{ | ||
| ID: emptyVersion.ID, | ||
| }); err != nil { | ||
| return xerrors.Errorf("update active template version: %w", err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error { | ||
| logs = loadtestutil.NewSyncWriter(logs) | ||
| logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug) | ||
|
|
||
| logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) | ||
| // If Run failed before the template was created, there is nothing to clean up. | ||
| if r.template.ID == uuid.Nil { | ||
| logger.Info(ctx, "template was never created, skipping cleanup") | ||
| return nil | ||
| } | ||
|
|
||
| err := r.client.DeleteTemplate(ctx, r.template.ID) | ||
| // Workspaces must be deleted before the template can be deleted. | ||
| workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name) | ||
| if err != nil { | ||
| return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err) | ||
| } | ||
|
|
||
| logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name)) | ||
|
|
||
| // Retry failed workspace deletions up to maxDeletionAttempts times to | ||
| // handle transient errors (e.g. a delete build that fails due to a | ||
| // provisioner hiccup). | ||
| const maxDeletionAttempts = 3 | ||
| remaining := workspaces | ||
| for attempt := range maxDeletionAttempts { | ||
| if len(remaining) == 0 { | ||
| break | ||
| } | ||
| logger.Info(ctx, "trying to delete workspaces", | ||
| slog.F("attempt", attempt+1), | ||
| slog.F("remaining", len(remaining)), | ||
| slog.F("template_name", r.template.Name), | ||
| ) | ||
| var failed []codersdk.Workspace | ||
| for _, ws := range remaining { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In your experience, are there a lot of these? Do we need to think about running in parallel?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Per runner there is at most 50 workspaces that would need to be deleted, because we create a runner per template and all of our prebuilds scenarios create 5 presets per template and 10 workspaces per preset. I don't think a max of 50 is too much to delete serially per runner. We're already running each runner in a separate go routine so we're already running If we switch up our scenario sizing or something else in the future that would result in potentially many more workspaces per runner that would need to be cleaned up we can make changes here at that point. |
||
| cr := workspacebuild.NewCleanupRunner(r.client, ws.ID) | ||
| if err := cr.Run(ctx, ws.ID.String(), logs); err != nil { | ||
| logger.Warn(ctx, "failed to delete workspace", | ||
| slog.F("workspace_id", ws.ID), | ||
| slog.F("workspace_name", ws.Name), | ||
| slog.Error(err), | ||
| ) | ||
| failed = append(failed, ws) | ||
| } | ||
| } | ||
| remaining = failed | ||
| } | ||
|
|
||
| if len(remaining) > 0 { | ||
| ids := make([]string, len(remaining)) | ||
| for i, ws := range remaining { | ||
| ids[i] = ws.ID.String() | ||
| } | ||
| return xerrors.Errorf("could not delete all workspaces after %d attempts; remaining: %v", maxDeletionAttempts, ids) | ||
| } | ||
|
|
||
| logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name)) | ||
| if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil { | ||
| return xerrors.Errorf("delete template: %w", err) | ||
| } | ||
|
|
||
| logger.Info(ctx, "template deleted successfully", slog.F("template_name", r.template.Name)) | ||
| return nil | ||
| } | ||
|
|
||
| // allWorkspacesForTemplate returns all workspaces belonging to templateName, | ||
| // paginating through results until exhausted. | ||
| func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) { | ||
| const pageSize = 100 | ||
| var workspaces []codersdk.Workspace | ||
| for page := 0; ; page++ { | ||
| resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{ | ||
| Template: templateName, | ||
| Offset: page * pageSize, | ||
| Limit: pageSize, | ||
| }) | ||
| if err != nil { | ||
| return nil, xerrors.Errorf("list workspaces page %d: %w", page, err) | ||
| } | ||
| workspaces = append(workspaces, resp.Workspaces...) | ||
| if len(resp.Workspaces) < pageSize { | ||
| break | ||
| } | ||
| } | ||
| return workspaces, nil | ||
| } | ||
|
|
||
| //go:embed tf/main.tf.tpl | ||
| var templateContent string | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we be worrying about the case where the template is left in a state with a positive number of prebuilds, and thus this routine and the reconciler get into a fight --- it creating more as these are deleted?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good question. I never ran into this case when running the prebuilds tests myself, but I think it is possible.
Previously we would only ever push a new template version with no presets in the
Runfunction after being able to successfully configure and run some prebuilds first. In that case we also had, via the CLI code, already set the global prebuilds reconciler flag to block new prebuilds anyways.In the edge case(s) this PR is trying to resolve, where old templates still exist during a subsequent run of the prebuilds load generator we also need to push a new version of the template in that error case. I've updated the relevant block in
Runin the latest commit.