Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 121 additions & 16 deletions scaletest/prebuilds/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
_ "embed"
"html/template"
"io"
"net/http"
"time"

"github.com/google/uuid"
Expand All @@ -17,6 +18,7 @@ import (
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/scaletest/harness"
"github.com/coder/coder/v2/scaletest/loadtestutil"
"github.com/coder/coder/v2/scaletest/workspacebuild"
)

type Runner struct {
Expand Down Expand Up @@ -77,6 +79,31 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
}
templ, err := r.client.CreateTemplate(ctx, r.cfg.OrganizationID, templateReq)
if err != nil {
// If the template already exists from a previous failed run, look it up so
// Cleanup() can delete it and the rerun doesn't leave orphaned resources.
var sdkErr *codersdk.Error
if xerrors.As(err, &sdkErr) && sdkErr.StatusCode() == http.StatusConflict {
existing, listErr := r.client.Templates(ctx, codersdk.TemplateFilter{
OrganizationID: r.cfg.OrganizationID,
ExactName: templateName,
})
if listErr == nil && len(existing) > 0 {
r.template = existing[0]
logger.Warn(ctx, "template already exists from a previous run, will be cleaned up",
slog.F("template_name", r.template.Name),
slog.F("template_id", r.template.ID),
)
// Clear any prebuild config on the orphaned template so the
// reconciler doesn't keep spawning workspaces while Cleanup()
// is trying to delete them.
if clearErr := r.pushEmptyTemplateVersion(ctx); clearErr != nil {
logger.Warn(ctx, "failed to clear prebuilds config on orphaned template",
slog.F("template_id", r.template.ID),
slog.Error(clearErr),
)
}
}
}
r.cfg.Metrics.AddError(templateName, "create_template")
return xerrors.Errorf("create template: %w", err)
}
Expand Down Expand Up @@ -105,21 +132,12 @@ func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error {
r.cfg.DeletionSetupBarrier.Wait()
logger.Info(ctx, "prebuilds paused, preparing for deletion")

// Now prepare for deletion by creating an empty template version
// At this point, prebuilds should be paused by the caller
// Now prepare for deletion by creating an empty template version.
// At this point, prebuilds should be paused by the caller.
logger.Info(ctx, "creating empty template version for deletion")
emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0)
if err != nil {
r.cfg.Metrics.AddError(r.template.Name, "create_empty_template_version")
return xerrors.Errorf("create empty template version for deletion: %w", err)
}

err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{
ID: emptyVersion.ID,
})
if err != nil {
r.cfg.Metrics.AddError(r.template.Name, "update_active_template_version")
return xerrors.Errorf("update active template version to empty for deletion: %w", err)
if err = r.pushEmptyTemplateVersion(ctx); err != nil {
r.cfg.Metrics.AddError(r.template.Name, "clear_template_prebuilds")
return xerrors.Errorf("clear template prebuilds for deletion: %w", err)
}

logger.Info(ctx, "waiting for all runners to reach deletion barrier")
Expand Down Expand Up @@ -301,21 +319,108 @@ func (r *Runner) createTemplateVersion(ctx context.Context, templateID uuid.UUID

var errTickerDone = xerrors.New("done")

// pushEmptyTemplateVersion pushes a new empty template version (no presets, no
// prebuilds) and makes it active. This stops the reconciler from spawning new
// prebuild workspaces for the template.
func (r *Runner) pushEmptyTemplateVersion(ctx context.Context) error {
emptyVersion, err := r.createTemplateVersion(ctx, r.template.ID, 0, 0)
if err != nil {
return xerrors.Errorf("create empty template version: %w", err)
}
if err = r.client.UpdateActiveTemplateVersion(ctx, r.template.ID, codersdk.UpdateActiveTemplateVersion{
ID: emptyVersion.ID,
}); err != nil {
return xerrors.Errorf("update active template version: %w", err)
}
return nil
}

func (r *Runner) Cleanup(ctx context.Context, _ string, logs io.Writer) error {
logs = loadtestutil.NewSyncWriter(logs)
logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug)

logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
// If Run failed before the template was created, there is nothing to clean up.
if r.template.ID == uuid.Nil {
logger.Info(ctx, "template was never created, skipping cleanup")
return nil
}

err := r.client.DeleteTemplate(ctx, r.template.ID)
// Workspaces must be deleted before the template can be deleted.
workspaces, err := allWorkspacesForTemplate(ctx, r.client, r.template.Name)
if err != nil {
return xerrors.Errorf("list workspaces for template %q: %w", r.template.Name, err)
}

logger.Info(ctx, "deleting workspaces for template", slog.F("count", len(workspaces)), slog.F("template_name", r.template.Name))

// Retry failed workspace deletions up to maxDeletionAttempts times to
// handle transient errors (e.g. a delete build that fails due to a
// provisioner hiccup).
const maxDeletionAttempts = 3
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be worrying about the case where the template is left in a state with a positive number of prebuilds, and thus this routine and the reconciler get into a fight --- it creating more as these are deleted?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I never ran into this case when running the prebuilds tests myself, but I think it is possible.

Previously we would only ever push a new template version with no presets in the Run function after being able to successfully configure and run some prebuilds first. In that case we also had, via the CLI code, already set the global prebuilds reconciler flag to block new prebuilds anyways.

In the edge case(s) this PR is trying to resolve, where old templates still exist during a subsequent run of the prebuilds load generator we also need to push a new version of the template in that error case. I've updated the relevant block in Run in the latest commit.

remaining := workspaces
for attempt := range maxDeletionAttempts {
if len(remaining) == 0 {
break
}
logger.Info(ctx, "trying to delete workspaces",
slog.F("attempt", attempt+1),
slog.F("remaining", len(remaining)),
slog.F("template_name", r.template.Name),
)
var failed []codersdk.Workspace
for _, ws := range remaining {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In your experience, are there a lot of these? Do we need to think about running in parallel?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per runner there is at most 50 workspaces that would need to be deleted, because we create a runner per template and all of our prebuilds scenarios create 5 presets per template and 10 workspaces per preset. I don't think a max of 50 is too much to delete serially per runner.

We're already running each runner in a separate go routine so we're already running num_templates (defined in each prebuild scenario) go routines.

If we switch up our scenario sizing or something else in the future that would result in potentially many more workspaces per runner that would need to be cleaned up we can make changes here at that point.

cr := workspacebuild.NewCleanupRunner(r.client, ws.ID)
if err := cr.Run(ctx, ws.ID.String(), logs); err != nil {
logger.Warn(ctx, "failed to delete workspace",
slog.F("workspace_id", ws.ID),
slog.F("workspace_name", ws.Name),
slog.Error(err),
)
failed = append(failed, ws)
}
}
remaining = failed
}

if len(remaining) > 0 {
ids := make([]string, len(remaining))
for i, ws := range remaining {
ids[i] = ws.ID.String()
}
return xerrors.Errorf("could not delete all workspaces after %d attempts; remaining: %v", maxDeletionAttempts, ids)
}

logger.Info(ctx, "deleting template", slog.F("template_name", r.template.Name))
if err := r.client.DeleteTemplate(ctx, r.template.ID); err != nil {
return xerrors.Errorf("delete template: %w", err)
}

logger.Info(ctx, "template deleted successfully", slog.F("template_name", r.template.Name))
return nil
}

// allWorkspacesForTemplate returns all workspaces belonging to templateName,
// paginating through results until exhausted.
func allWorkspacesForTemplate(ctx context.Context, client *codersdk.Client, templateName string) ([]codersdk.Workspace, error) {
const pageSize = 100
var workspaces []codersdk.Workspace
for page := 0; ; page++ {
resp, err := client.Workspaces(ctx, codersdk.WorkspaceFilter{
Template: templateName,
Offset: page * pageSize,
Limit: pageSize,
})
if err != nil {
return nil, xerrors.Errorf("list workspaces page %d: %w", page, err)
}
workspaces = append(workspaces, resp.Workspaces...)
if len(resp.Workspaces) < pageSize {
break
}
}
return workspaces, nil
}

//go:embed tf/main.tf.tpl
var templateContent string

Expand Down
Loading