coder
diff --git a/coderd/x/chatd/chatd.go b/coderd/x/chatd/chatd.go
Lines changed: 151 additions & 8 deletions
@@ -72,7 +72,22 @@ const (
 	// cold-start agent's first MCP reload can settle before
 	// chatd gives up.
 	workspaceMCPDiscoveryTimeout = 35 * time.Second
-	turnStatusLabelWriteTimeout  = 5 * time.Second
+	// workspaceMCPPrimeMaxWait bounds the deadline used by the
+	// create_workspace / start_workspace post-ready cache primer
+	// loop. The primer checks the deadline only after each
+	// discoverWorkspaceMCPTools call returns, so total wall-clock
+	// time can exceed this by one such call (dialTimeout +
+	// workspaceMCPDiscoveryTimeout in the worst case). The constant
+	// caps when new retries can start, not when an in-flight call
+	// must finish. Empty results usually mean the agent's MCP
+	// Connect is still racing with agent startup. The agent-side
+	// budget is agent/x/agentmcp.connectTimeout (30s).
+	workspaceMCPPrimeMaxWait = 30 * time.Second
+	// workspaceMCPPrimeRetryInterval is the short backoff between
+	// re-attempts inside the primer when ListMCPTools returns an
+	// empty list without error.
+	workspaceMCPPrimeRetryInterval = 2 * time.Second
+	turnStatusLabelWriteTimeout    = 5 * time.Second
 	// defaultDialTimeout matches the timeout used by ~8 other
 	// server-side AgentConn callers.
 	defaultDialTimeout = 30 * time.Second
@@ -568,6 +583,61 @@ func (p *Server) discoverWorkspaceMCPTools(
 	return tools
 }
 
+// primeWorkspaceMCPCache warms workspaceMCPToolsCache after the
+// create_workspace or start_workspace tool finishes waiting for the
+// workspace agent. It calls discoverWorkspaceMCPTools until tools come
+// back, ctx is canceled, or workspaceMCPPrimeMaxWait elapses. An empty
+// result usually means the agent's MCP server is still starting, so
+// attempts are spaced workspaceMCPPrimeRetryInterval apart, clamped so
+// that none starts after the deadline.
+//
+// Returns silently on every failure mode. If no tools appear within
+// the budget the chat continues without them and the next user turn
+// re-runs top-of-turn discovery from scratch.
+func (p *Server) primeWorkspaceMCPCache(
+	ctx context.Context,
+	logger slog.Logger,
+	chatID uuid.UUID,
+	workspaceCtx *turnWorkspaceContext,
+) {
+	deadline := p.clock.Now().Add(workspaceMCPPrimeMaxWait)
+	for attempt := 1; ; attempt++ {
+		tools := p.discoverWorkspaceMCPTools(ctx, logger, chatID, workspaceCtx)
+		if len(tools) > 0 {
+			logger.Debug(ctx, "primed workspace MCP cache",
+				slog.F("chat_id", chatID),
+				slog.F("tool_count", len(tools)),
+				slog.F("attempts", attempt),
+			)
+			return
+		}
+		if ctx.Err() != nil {
+			return
+		}
+		// Clamp the backoff to the remaining budget so that no
+		// attempt starts after the deadline.
+		backoff := deadline.Sub(p.clock.Now())
+		if backoff <= 0 {
+			logger.Debug(ctx,
+				"workspace MCP cache primer gave up waiting for tools",
+				slog.F("chat_id", chatID),
+				slog.F("attempts", attempt),
+			)
+			return
+		}
+		if backoff > workspaceMCPPrimeRetryInterval {
+			backoff = workspaceMCPPrimeRetryInterval
+		}
+		timer := p.clock.NewTimer(backoff, "chatd", "workspace-mcp-prime")
+		select {
+		case <-timer.C:
+		case <-ctx.Done():
+			timer.Stop()
+			return
+		}
+	}
+}
+
 type turnWorkspaceContext struct {
 	server           *Server
 	chatStateMu      *sync.Mutex
@@ -6457,6 +6527,11 @@ type rootChatToolsOptions struct {
 	resolvePlanPath func(context.Context) (string, string, error)
 	storeFile       chattool.StoreFileFunc
 	isPlanModeTurn  bool
+	// primerCtx scopes the workspace MCP cache primer goroutines
+	// that onChatUpdated launches. runChat cancels it before
+	// workspaceCtx.close() so an in-flight primer cannot dial a
+	// fresh conn after the cached one was released.
+	primerCtx context.Context
 }
 
 func (p *Server) loadPlanModeInstructions(
@@ -6520,6 +6595,50 @@ func (p *Server) appendRootChatTools(
 				}
 			}
 		}
+
+		// Prime the workspace MCP tools cache while the create_workspace
+		// or start_workspace tool is still running. The AgentID guard
+		// below restricts the primer to the post-ready callback, when
+		// the agent is reachable. ListMCPTools may still return an
+		// empty list on the first try when the agent's MCP Connect is
+		// racing with agent startup; primeWorkspaceMCPCache retries
+		// with a short backoff up to workspaceMCPPrimeMaxWait. Priming
+		// here lets the next LLM step's PrepareTools hit the cache
+		// instead of dialing again on a separate timeout budget.
+		//
+		// Run asynchronously: the tool itself must not block on the
+		// primer because the agent may not advertise any MCP tools at
+		// all (e.g. minimal templates), in which case the primer waits
+		// the full budget before giving up. PrepareTools on the next
+		// step covers the cache miss path; the primer is purely an
+		// optimization that warms the cache while the LLM is thinking.
+		// inflight tracking ensures server shutdown still waits for any
+		// in-progress primer.
+		//
+		// Guard on both WorkspaceID and AgentID being valid:
+		// create_workspace and start_workspace each fire onChatUpdated
+		// twice for a new build (binding before waitForAgentReady;
+		// post-ready after it), and stop_workspace fires it with a nil
+		// agent. Only the post-ready callback has a live AgentID, so
+		// the pre-build and stop-side firings would otherwise spawn a
+		// primer goroutine that dials a missing or dying agent and
+		// burns the full budget for nothing.
+		//
+		// Read the snapshot from workspaceCtx rather than the
+		// updatedChat parameter: persistInstructionFiles above runs
+		// ensureWorkspaceAgent which calls persistBuildAgentBinding and
+		// setCurrentChat, so by the time we get here the in-memory
+		// snapshot has the freshly bound AgentID even when the
+		// updatedChat parameter (read from the DB before the binding
+		// was persisted) does not.
+		snapshot := opts.workspaceCtx.currentChatSnapshot()
+		if snapshot.WorkspaceID.Valid && snapshot.AgentID.Valid {
+			p.inflight.Add(1)
+			go func() {
+				defer p.inflight.Done()
+				p.primeWorkspaceMCPCache(opts.primerCtx, p.logger, snapshot.ID, opts.workspaceCtx)
+			}()
+		}
 	}
 
 	tools = append(tools,
@@ -6852,7 +6971,16 @@ func (p *Server) runChat(
 		currentChat:      &currentChat,
 		loadChatSnapshot: loadChatSnapshot,
 	}
-	defer workspaceCtx.close()
+	// primerCtx scopes the workspace MCP cache primer goroutines that
+	// onChatUpdated launches. We cancel it before workspaceCtx.close()
+	// so an in-flight primer cannot wake from its retry backoff,
+	// observe a cleared cached conn, dial a fresh one, and leak it
+	// when no subsequent close() runs.
+	primerCtx, primerCancel := context.WithCancel(ctx)
+	defer func() {
+		primerCancel()
+		workspaceCtx.close()
+	}()
 
 	planPathFn := func(ctx context.Context) (string, string, error) {
 		conn, err := workspaceCtx.getWorkspaceConn(ctx)
@@ -7435,6 +7563,7 @@ func (p *Server) runChat(
 			resolvePlanPath: resolvePlanPathForTools,
 			storeFile:       storeChatAttachment,
 			isPlanModeTurn:  isPlanModeTurn,
+			primerCtx:       primerCtx,
 		})
 	}
 
@@ -7766,25 +7895,39 @@ func (p *Server) runChat(
 		},
 		PrepareTools: func(currentTools []fantasy.AgentTool) []fantasy.AgentTool {
 			// Mid-turn workspace MCP discovery for chats that bind a
-			// workspace via create_workspace or start_workspace
-			// after the turn has already started. The top-of-turn
-			// discovery path is gated on chat.WorkspaceID.Valid; this
-			// callback bridges the gap so the LLM sees workspace MCP
-			// tools on the very next step instead of the turn after.
+			// workspace via create_workspace or start_workspace after the
+			// turn has already started. The top-of-turn discovery path is
+			// gated on chat.WorkspaceID.Valid; this callback bridges the
+			// gap so the LLM sees workspace MCP tools on the very next
+			// step instead of the turn after.
+			//
+			// create_workspace and start_workspace prime
+			// workspaceMCPToolsCache asynchronously via onChatUpdated
+			// after waitForAgentReady returns, so the call below is
+			// usually a cache hit. The dial fallback here runs when the
+			// primer is still in flight or gave up without finding tools.
 			if workspaceMCPDiscovered || isExploreSubagent {
 				return nil
 			}
 			snapshot := workspaceCtx.currentChatSnapshot()
 			if !snapshot.WorkspaceID.Valid {
 				return nil
 			}
-			workspaceMCPDiscovered = true
 			discovered := p.discoverWorkspaceMCPTools(
 				ctx, loopLogger, chat.ID, &workspaceCtx,
 			)
 			if len(discovered) == 0 {
+				// Leave workspaceMCPDiscovered false so a subsequent
+				// step retries discovery. PrepareTools fires once per
+				// LLM step, so retries are unbounded for the rest of
+				// the turn. Per-step cost is one
+				// GetWorkspaceAgentsInLatestBuildByWorkspaceID query
+				// plus one ListMCPTools RPC, both fast against a live
+				// conn. The primer's 30s budget applies to its own
+				// loop only.
 				return nil
 			}
+			workspaceMCPDiscovered = true
 			return append(slices.Clone(currentTools), discovered...)
 		},
 		PrepareMessages: func(msgs []fantasy.Message) []fantasy.Message {