Skip to content

Commit f009c17

Browse files
f0sseldeansheather
andauthored
fix(coderd): cut DB fan-out on agent instance-identity auth (backport #24973) (#24982)
Backport of #24973 to `release/2.33`. ## Summary Restores `v2.33.0-rc.2`-equivalent query cost for agent instance-identity auth, which currently saturates the pgx pool when multiple agents share an instance ID. Customer report against rc.3 traced 233x `Internal error fetching provisioner job resource` 500s during a 50-minute incident window to this path. ## Changes 1. **System fast-path on `authorizeProvisionerJob`** (`coderd/database/dbauthz/dbauthz.go`): Short-circuits the per-job RBAC fan-out through `GetWorkspaceBuildByJobID` -> `GetWorkspaceByID` for `AsSystemRestricted` callers. 2. **Drop survivor re-fetch in `handleAuthInstanceID`** (`coderd/workspaceresourceauth.go`): Captures the provisioner job alongside each candidate during the filter loop so the post-selection code reads it directly instead of re-querying. ## Conflict resolution One conflict in `coderd/database/dbauthz/dbauthz_test.go`: the `TestAsAutostart` test function (from an unrelated commit on `main`) was brought in as surrounding context during the cherry-pick. It was removed since it tests functionality (`ResourceUserSecret.Read` for the Autostart role) not present on the release branch. ## Tests - `TestAuthorizeProvisionerJob_SystemFastPath` (3 sub-tests): all pass - `TestPostWorkspaceAuthAWSInstanceIdentity/Ambiguous/*` (7 sub-tests): all pass > Generated by Coder Agents Co-authored-by: Dean Sheather <dean@deansheather.com>
1 parent 17635dd commit f009c17

3 files changed

Lines changed: 159 additions & 36 deletions

File tree

coderd/database/dbauthz/dbauthz.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,6 +1502,28 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole,
15021502
}
15031503

15041504
func (q *querier) authorizeProvisionerJob(ctx context.Context, job database.ProvisionerJob) error {
1505+
// System-restricted callers (e.g. instance-identity agent auth via
1506+
// AsSystemRestricted) have already passed an outer authz check before
1507+
// reaching the provisioner job. Skip the per-job RBAC fan-out through
1508+
// GetWorkspaceBuildByJobID -> GetWorkspaceByID, which serializes 2
1509+
// extra DB queries + 1 RBAC eval per call. Under saturated pgx pools
1510+
// this cascade can block agent auth past the HTTP write timeout (see
1511+
// incident report against v2.33.0-rc.3 with multi-agent
1512+
// instance-identity templates).
1513+
//
1514+
// We check the subject type directly rather than calling
1515+
// authorizeContext(ResourceSystem) so we do not record a site-scoped
1516+
// authz call on every provisioner-job lookup; tests like
1517+
// TestCreateUserWorkspace/AuthzStory assert that workspace creation
1518+
// only emits org-scoped authz calls. The same actor.Type check is
1519+
// already used elsewhere in this file (see GetChatDiffStatusesByChatIDs).
1520+
//
1521+
// If a future system actor needs the same fast-path, add its
1522+
// SubjectType here explicitly rather than broadening to a permission
1523+
// check.
1524+
if actor, ok := ActorFromContext(ctx); ok && actor.Type == rbac.SubjectTypeSystemRestricted {
1525+
return nil
1526+
}
15051527
switch job.Type {
15061528
case database.ProvisionerJobTypeWorkspaceBuild:
15071529
// Authorized call to get workspace build. If we can read the build, we can

coderd/database/dbauthz/dbauthz_test.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6198,6 +6198,114 @@ func TestGetWorkspaceAgentByID_FastPath(t *testing.T) {
61986198
})
61996199
}
62006200

6201+
// TestAuthorizeProvisionerJob_SystemFastPath verifies that
6202+
// authorizeProvisionerJob short-circuits for system-restricted callers
6203+
// instead of fanning out into GetWorkspaceBuildByJobID -> GetWorkspaceByID.
6204+
// That cascade adds 2 SQL queries + 1 RBAC eval per provisioner-job lookup
6205+
// and saturates the pgx pool when called repeatedly from agent
6206+
// instance-identity auth (see incident report against v2.33.0-rc.3).
6207+
func TestAuthorizeProvisionerJob_SystemFastPath(t *testing.T) {
6208+
t.Parallel()
6209+
6210+
jobID := uuid.New()
6211+
job := database.ProvisionerJob{
6212+
ID: jobID,
6213+
Type: database.ProvisionerJobTypeWorkspaceBuild,
6214+
}
6215+
6216+
authorizer := rbac.NewAuthorizer(prometheus.NewRegistry())
6217+
6218+
t.Run("AsSystemRestricted/SkipsCascade", func(t *testing.T) {
6219+
t.Parallel()
6220+
6221+
ctrl := gomock.NewController(t)
6222+
mockDB := dbmock.NewMockStore(ctrl)
6223+
6224+
mockDB.EXPECT().Wrappers().Return([]string{})
6225+
// The fast-path must short-circuit before GetWorkspaceBuildByJobID
6226+
// or GetWorkspaceByID can be called. The strict mock will fail
6227+
// the test if either is invoked.
6228+
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
6229+
6230+
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
6231+
ctx := dbauthz.AsSystemRestricted(context.Background())
6232+
6233+
got, err := q.GetProvisionerJobByID(ctx, jobID)
6234+
require.NoError(t, err)
6235+
require.Equal(t, job, got)
6236+
})
6237+
6238+
t.Run("AsSystemRestricted/TemplateVersion/SkipsCascade", func(t *testing.T) {
6239+
t.Parallel()
6240+
6241+
// The fast-path is type-agnostic: it must short-circuit the
6242+
// template-version cascade as well, so neither
6243+
// GetTemplateVersionByJobID nor GetTemplateByID is invoked.
6244+
tvJobID := uuid.New()
6245+
tvJob := database.ProvisionerJob{
6246+
ID: tvJobID,
6247+
Type: database.ProvisionerJobTypeTemplateVersionImport,
6248+
}
6249+
6250+
ctrl := gomock.NewController(t)
6251+
mockDB := dbmock.NewMockStore(ctrl)
6252+
6253+
mockDB.EXPECT().Wrappers().Return([]string{})
6254+
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), tvJobID).Return(tvJob, nil)
6255+
6256+
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
6257+
ctx := dbauthz.AsSystemRestricted(context.Background())
6258+
6259+
got, err := q.GetProvisionerJobByID(ctx, tvJobID)
6260+
require.NoError(t, err)
6261+
require.Equal(t, tvJob, got)
6262+
})
6263+
6264+
t.Run("NonSystemActor/StillCascades", func(t *testing.T) {
6265+
t.Parallel()
6266+
6267+
// An auditor has no ResourceSystem permission, so the fast-path
6268+
// must fall through to the workspace-build cascade. That cascade
6269+
// then fails authz on the workspace because auditors cannot read
6270+
// arbitrary workspaces. The error type is what we assert: it
6271+
// proves the cascade ran rather than the fast-path short-circuiting.
6272+
orgID := uuid.New()
6273+
wsID := uuid.New()
6274+
workspace := database.Workspace{
6275+
ID: wsID,
6276+
OwnerID: uuid.New(),
6277+
OrganizationID: orgID,
6278+
}
6279+
build := database.WorkspaceBuild{
6280+
ID: uuid.New(),
6281+
WorkspaceID: wsID,
6282+
JobID: jobID,
6283+
}
6284+
auditor := rbac.Subject{
6285+
ID: uuid.NewString(),
6286+
Roles: rbac.RoleIdentifiers{rbac.RoleAuditor()},
6287+
Groups: []string{orgID.String()},
6288+
Scope: rbac.ScopeAll,
6289+
}
6290+
6291+
ctrl := gomock.NewController(t)
6292+
mockDB := dbmock.NewMockStore(ctrl)
6293+
6294+
mockDB.EXPECT().Wrappers().Return([]string{})
6295+
mockDB.EXPECT().GetProvisionerJobByID(gomock.Any(), jobID).Return(job, nil)
6296+
mockDB.EXPECT().GetWorkspaceBuildByJobID(gomock.Any(), jobID).Return(build, nil)
6297+
mockDB.EXPECT().GetWorkspaceByID(gomock.Any(), wsID).Return(workspace, nil)
6298+
6299+
q := dbauthz.New(mockDB, authorizer, slogtest.Make(t, nil), coderdtest.AccessControlStorePointer())
6300+
ctx := dbauthz.As(context.Background(), auditor)
6301+
6302+
_, err := q.GetProvisionerJobByID(ctx, jobID)
6303+
require.Error(t, err)
6304+
require.True(t, dbauthz.IsNotAuthorizedError(err),
6305+
"cascade must run and produce a NotAuthorized error for auditor: got %v", err)
6306+
})
6307+
}
6308+
62016309
func TestAsChatd(t *testing.T) {
62026310
t.Parallel()
62036311

coderd/workspaceresourceauth.go

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,18 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
148148
// Template version agents can share an instance ID with workspace build
149149
// agents. Keep only workspace build agents before resolving ambiguity so
150150
// template version agents do not force CODER_AGENT_NAME.
151-
buildAgents := agents[:0]
151+
//
152+
// We attach the provisioner job to each candidate during the filter
153+
// loop so the post-selection code below can read it directly from the
154+
// chosen candidate instead of re-querying. The previous code re-fetched
155+
// the resource and job for the surviving agent, firing the
156+
// resource->job->build->workspace dbauthz cascade twice and saturating
157+
// the pgx pool under load.
158+
type instanceCandidate struct {
159+
agent database.WorkspaceAgent
160+
job database.ProvisionerJob
161+
}
162+
buildCandidates := make([]instanceCandidate, 0, len(agents))
152163
for _, candidate := range agents {
153164
resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, candidate.ResourceID)
154165
if err != nil {
@@ -167,40 +178,42 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
167178
return
168179
}
169180
if job.Type == database.ProvisionerJobTypeWorkspaceBuild {
170-
buildAgents = append(buildAgents, candidate)
181+
buildCandidates = append(buildCandidates, instanceCandidate{
182+
agent: candidate,
183+
job: job,
184+
})
171185
}
172186
}
173-
agents = buildAgents
174-
if len(agents) == 0 {
187+
if len(buildCandidates) == 0 {
175188
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
176189
Message: fmt.Sprintf("Instance with id %q not found.", instanceID),
177190
})
178191
return
179192
}
180193

181-
var agent database.WorkspaceAgent
194+
var selected instanceCandidate
182195
if agentName != "" {
183-
for _, candidate := range agents {
184-
if candidate.Name == agentName {
185-
agent = candidate
196+
for _, candidate := range buildCandidates {
197+
if candidate.agent.Name == agentName {
198+
selected = candidate
186199
break
187200
}
188201
}
189-
if agent.ID == uuid.Nil {
202+
if selected.agent.ID == uuid.Nil {
190203
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
191204
Message: fmt.Sprintf("No agent found with instance ID %q and name %q.", instanceID, agentName),
192205
})
193206
return
194207
}
195208
} else {
196-
if len(agents) != 1 {
209+
if len(buildCandidates) != 1 {
197210
// Include agent names in the error message to help operators
198211
// configure CODER_AGENT_NAME. The caller has already proven
199212
// cloud instance identity, so agent names are not sensitive
200213
// here.
201-
names := make([]string, len(agents))
202-
for i, candidate := range agents {
203-
names[i] = candidate.Name
214+
names := make([]string, len(buildCandidates))
215+
for i, candidate := range buildCandidates {
216+
names[i] = candidate.agent.Name
204217
}
205218
sort.Strings(names)
206219
httpapi.Write(ctx, rw, http.StatusConflict, codersdk.Response{
@@ -212,30 +225,10 @@ func (api *API) handleAuthInstanceID(rw http.ResponseWriter, r *http.Request, in
212225
})
213226
return
214227
}
215-
agent = agents[0]
216-
}
217-
resource, err := api.Database.GetWorkspaceResourceByID(systemCtx, agent.ResourceID)
218-
if err != nil {
219-
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
220-
Message: "Internal error fetching provisioner job resource.",
221-
Detail: err.Error(),
222-
})
223-
return
224-
}
225-
job, err := api.Database.GetProvisionerJobByID(systemCtx, resource.JobID)
226-
if err != nil {
227-
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
228-
Message: "Internal error fetching provisioner job.",
229-
Detail: err.Error(),
230-
})
231-
return
232-
}
233-
if job.Type != database.ProvisionerJobTypeWorkspaceBuild {
234-
httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
235-
Message: fmt.Sprintf("%q jobs cannot be authenticated.", job.Type),
236-
})
237-
return
228+
selected = buildCandidates[0]
238229
}
230+
agent := selected.agent
231+
job := selected.job
239232
var jobData provisionerdserver.WorkspaceProvisionJob
240233
err = json.Unmarshal(job.Input, &jobData)
241234
if err != nil {

0 commit comments

Comments
 (0)