coderight-dev
diff --git a/coderd/x/chatd/chaterror/classify.go b/coderd/x/chatd/chaterror/classify.go
Lines changed: 52 additions & 0 deletions
diff --git a/coderd/x/chatd/chaterror/classify_test.go b/coderd/x/chatd/chaterror/classify_test.go
Lines changed: 117 additions & 0 deletions
diff --git a/coderd/x/chatd/chatloop/chatloop.go b/coderd/x/chatd/chatloop/chatloop.go
Lines changed: 73 additions & 27 deletions
@@ -22,6 +22,15 @@ type ClassifiedError struct {
 	// RetryAfter is a normalized minimum retry delay derived from
 	// provider response metadata when available.
 	RetryAfter time.Duration
+
+	// ChainBroken is true when the provider reported that the
+	// previous_response_id (or analogous chain anchor) is no longer
+	// retrievable. The chatloop retry path uses this signal to exit
+	// chain mode and replay full history before the next attempt.
+	// This is an internal signal; it is not surfaced as a separate
+	// codersdk.ChatErrorKind so the user-visible kind set stays
+	// stable.
+	ChainBroken bool
 }
 
 const responsesAPIDiagnosticMessage = "The chat continuation failed due to an " +
@@ -165,6 +174,20 @@ func Classify(err error) ClassifiedError {
 		return classified
 	}
 
+	// Chain-broken detection runs before the generic rule table so a
+	// 404 carrying a chain anchor failure is not classified as a
+	// generic non-retryable error. The chatloop retry callback uses
+	// the ChainBroken flag to exit chain mode and replay full
+	// history.
+	if classified, ok := chainBrokenClassification(
+		lower,
+		provider,
+		statusCode,
+		structured,
+	); ok {
+		return classified
+	}
+
 	deadline := errors.Is(err, context.DeadlineExceeded) || strings.Contains(lower, "context deadline exceeded")
 	overloadedMatch := statusCode == 529 || containsAny(lower, overloadedPatterns...)
 	authStrong := statusCode == 401 || containsAny(lower, authStrongPatterns...)
@@ -276,6 +299,35 @@ func streamIncompleteMessage(provider string) string {
 	return providerSubject(provider) + " stream closed unexpectedly before the response completed."
 }
 
+// chainBrokenClassification reports (classification, true) when the
+// lowercased message contains both "previous response with id" and
+// "not found" (OpenAI's stale previous_response_id error); otherwise
+// it returns a zero ClassifiedError and false.
+func chainBrokenClassification(
+	lowerMessage string,
+	provider string,
+	statusCode int,
+	structured providerErrorDetails,
+) (ClassifiedError, bool) {
+	if !(strings.Contains(lowerMessage, "previous response with id") &&
+		strings.Contains(lowerMessage, "not found")) {
+		return ClassifiedError{}, false // message is not the stale-anchor error
+	}
+	// Only OpenAI has been seen emitting this error, so default to it.
+	if provider == "" {
+		provider = "openai"
+	}
+	return normalizeClassification(ClassifiedError{
+		Detail:      structured.detail,
+		Kind:        codersdk.ChatErrorKindGeneric,
+		Provider:    provider,
+		Retryable:   true, // lets the retry path run and replay full history
+		StatusCode:  statusCode,
+		RetryAfter:  structured.retryAfter,
+		ChainBroken: true,
+	}), true
+}
+
 func responsesAPIDiagnostic(lowerMessage, detail string) (string, bool) {
 	lowerDetail := strings.ToLower(detail)
 	for _, match := range responsesAPIDiagnosticMatches {
 
@@ -746,6 +746,123 @@ func TestClassify_TruncatesProviderDetail(t *testing.T) {
 	require.True(t, strings.HasSuffix(classified.Detail, "…"))
 }
 
+func TestClassify_ChainBroken(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct { // positive and negative inputs for chain-broken detection
+		name            string
+		err             error
+		wantChainBroken bool   // expected ClassifiedError.ChainBroken
+		wantRetryable   bool   // compared only when wantChainBroken is true
+		wantProvider    string // compared only when wantChainBroken is true
+		wantStatusCode  int    // compared only when wantChainBroken is true
+	}{
+		{
+			name: "OpenAIPreviousResponseNotFoundBareString",
+			err: xerrors.New(
+				"Previous response with id 'resp_abc' not found.",
+			),
+			wantChainBroken: true,
+			wantRetryable:   true,
+			wantProvider:    "openai",
+			wantStatusCode:  0,
+		},
+		{
+			name: "OpenAIPreviousResponseNotFoundProviderError",
+			err: testProviderError(
+				"Previous response with id 'resp_096c70c5bb8d52bc0069fa11e0630c81a3ba210cddfa75bae9' not found.",
+				404,
+				nil,
+			),
+			wantChainBroken: true,
+			wantRetryable:   true,
+			wantProvider:    "openai",
+			wantStatusCode:  404,
+		},
+		{
+			name: "OpenAIPreviousResponseCaseInsensitive",
+			err: testProviderError(
+				"PREVIOUS RESPONSE WITH ID 'resp_abc' NOT FOUND.",
+				404,
+				nil,
+			),
+			wantChainBroken: true,
+			wantRetryable:   true,
+			wantProvider:    "openai",
+			wantStatusCode:  404,
+		},
+		{
+			name: "PreviousResponseWithoutNotFoundIsNotChainBroken",
+			err: testProviderError(
+				"Previous response with id 'resp_abc' is invalid.",
+				400,
+				nil,
+			),
+			wantChainBroken: false,
+		},
+		{
+			name: "UnrelatedNotFoundIsNotChainBroken",
+			err: testProviderError(
+				"resource not found",
+				404,
+				nil,
+			),
+			wantChainBroken: false,
+		},
+		{
+			name: "UnrelatedInvalidRequestIsNotChainBroken",
+			err: testProviderError(
+				"",
+				400,
+				nil,
+				testProviderResponseDump(`{"error":{"type":"invalid_request_error","message":"Image exceeds 5 MB maximum."}}`),
+			),
+			wantChainBroken: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			classified := chaterror.Classify(tt.err)
+			require.Equal(t, tt.wantChainBroken, classified.ChainBroken,
+				"chain broken flag mismatch")
+			if !tt.wantChainBroken {
+				return // negative cases assert the flag only
+			}
+			require.Equal(t, tt.wantRetryable, classified.Retryable,
+				"chain-broken errors must be retryable so the loop"+
+					" can self-heal")
+			require.Equal(t, tt.wantProvider, classified.Provider)
+			require.Equal(t, tt.wantStatusCode, classified.StatusCode)
+			require.Equal(t, codersdk.ChatErrorKindGeneric, classified.Kind,
+				"chain-broken keeps the user-visible kind unchanged"+
+					" so we don't add a new codersdk surface")
+		})
+	}
+}
+
+func TestClassify_ChainBrokenSurvivesWithClassification(t *testing.T) {
+	t.Parallel()
+
+	original := chaterror.Classify(testProviderError(
+		"Previous response with id 'resp_abc' not found.",
+		404,
+		nil,
+	))
+	require.True(t, original.ChainBroken) // precondition: direct classification sets the flag
+
+	wrapped := chaterror.WithClassification(
+		xerrors.New("transport blew up"),
+		original,
+	)
+	round := chaterror.Classify(wrapped) // classify again through the wrapper
+	require.True(t, round.ChainBroken,
+		"WithClassification round-trips ChainBroken so the retry path"+
+			" can detect it after re-classification")
+}
+
 func testProviderError(
 	message string,
 	statusCode int,
 
@@ -161,11 +161,14 @@ type RunOptions struct {
 	Compaction       *CompactionOptions
 	ReloadMessages   func(context.Context) ([]fantasy.Message, error)
 	DisableChainMode func()
-	// PrepareMessages is called before each LLM step with the
-	// current message history. If it returns non-nil, the returned
-	// slice replaces messages for this and all subsequent steps.
+	// PrepareMessages is called at least once before each LLM step
+	// with the current message history. If it returns non-nil, the
+	// returned slice replaces messages for this and all subsequent
+	// steps.
 	// Used to inject system context that becomes available mid-loop
 	// (e.g. AGENTS.md after create_workspace).
+	// NOTE: It may be called more than once per step in case of a
+	// retry, so callbacks should avoid duplicating messages.
 	PrepareMessages func([]fantasy.Message) []fantasy.Message
 
 	// OnRetry is called before each retry attempt when the LLM
@@ -349,7 +352,6 @@ func Run(ctx context.Context, opts RunOptions) error {
 	}
 
 	tools := buildToolDefinitions(opts.Tools, opts.ActiveTools, opts.ProviderTools)
-	applyAnthropicCaching := shouldApplyAnthropicPromptCaching(opts.Model)
 
 	messages := opts.Messages
 	var lastUsage fantasy.Usage
@@ -390,30 +392,10 @@ func Run(ctx context.Context, opts RunOptions) error {
 			modelName := opts.Model.Model()
 			opts.Metrics.StepsTotal.WithLabelValues(provider, modelName).Inc()
 			stepStart := time.Now()
-			// Copy messages so that provider-specific caching
-			// mutations don't leak back to the caller's slice.
-			// copy copies Message structs by value, so field
-			// reassignments in addAnthropicPromptCaching only
-			// affect the prepared slice.
-			if opts.PrepareMessages != nil {
-				if updated := opts.PrepareMessages(messages); updated != nil {
-					messages = updated
-				}
-			}
-			prepared := make([]fantasy.Message, len(messages))
-			copy(prepared, messages)
-			prepared, sanitizeStats := chatsanitize.SanitizeAnthropicProviderToolHistory(provider, prepared)
-			chatsanitize.LogAnthropicProviderToolSanitization(
-				ctx, opts.Logger, "pre_request", provider, modelName, sanitizeStats,
-				slog.F("step_index", step),
-				slog.F("total_steps", totalSteps),
-			)
-			prepared = chatsanitize.ApplyAnthropicProviderToolGuard(
-				ctx, opts.Logger, provider, modelName, prepared,
+			var prepared []fantasy.Message
+			messages, prepared = prepareMessagesForRequest(
+				ctx, opts, messages, provider, modelName, step, totalSteps,
 			)
-			if applyAnthropicCaching {
-				addAnthropicPromptCaching(prepared)
-			}
 			opts.Metrics.MessageCount.WithLabelValues(provider, modelName).Observe(float64(len(prepared)))
 			opts.Metrics.PromptSizeBytes.WithLabelValues(provider, modelName).Observe(float64(EstimatePromptSize(prepared)))
 
@@ -469,6 +451,33 @@ func Run(ctx context.Context, opts RunOptions) error {
 				// classified payload handed to OnRetry.
 				classified = classified.WithProvider(provider)
 				opts.Metrics.RecordStreamRetry(provider, modelName, classified)
+				if classified.ChainBroken {
+					if chatopenai.HasPreviousResponseID(opts.ProviderOptions) {
+						opts.ProviderOptions = chatopenai.ClearPreviousResponseID(opts.ProviderOptions)
+					}
+					if chatopenai.HasPreviousResponseID(call.ProviderOptions) {
+						call.ProviderOptions = chatopenai.ClearPreviousResponseID(call.ProviderOptions)
+					}
+					if opts.DisableChainMode != nil {
+						opts.DisableChainMode()
+					}
+					if opts.ReloadMessages != nil {
+						reloaded, err := opts.ReloadMessages(ctx)
+						if err != nil {
+							opts.Logger.Warn(ctx,
+								"chain-broken recovery: reload messages failed",
+								slog.Error(err),
+							)
+						} else {
+							// Reloaded history replaces the prompt prepared before
+							// the failed attempt, so run the same preparation
+							// pipeline used by normal provider requests.
+							messages, call.Prompt = prepareMessagesForRequest(
+								ctx, opts, reloaded, provider, modelName, step, totalSteps,
+							)
+						}
+					}
+				}
 				if opts.OnRetry != nil {
 					opts.OnRetry(attempt, retryErr, classified, delay)
 				}
@@ -656,6 +665,43 @@ func Run(ctx context.Context, opts RunOptions) error {
 	return nil
 }
 
+// prepareMessagesForRequest runs opts.PrepareMessages (when set), then
+// clones the result and passes the clone through the Anthropic tool
+// sanitizer, tool guard, and (when applicable) prompt caching. It
+// returns the canonical messages and the provider-ready prompt clone.
+func prepareMessagesForRequest(
+	ctx context.Context,
+	opts RunOptions,
+	messages []fantasy.Message,
+	provider string,
+	modelName string,
+	step int,
+	totalSteps int,
+) (canonical []fantasy.Message, prompt []fantasy.Message) {
+	canonical = messages
+	if opts.PrepareMessages != nil {
+		if updated := opts.PrepareMessages(canonical); updated != nil {
+			canonical = updated // a nil result keeps the current history
+		}
+	}
+	// Shallow-clone so later slice-element reassignments on prompt do
+	// not show up in the canonical message slice.
+	prompt = slices.Clone(canonical)
+	prompt, sanitizeStats := chatsanitize.SanitizeAnthropicProviderToolHistory(provider, prompt)
+	chatsanitize.LogAnthropicProviderToolSanitization(
+		ctx, opts.Logger, "pre_request", provider, modelName, sanitizeStats,
+		slog.F("step_index", step),
+		slog.F("total_steps", totalSteps),
+	)
+	prompt = chatsanitize.ApplyAnthropicProviderToolGuard(
+		ctx, opts.Logger, provider, modelName, prompt,
+	)
+	if shouldApplyAnthropicPromptCaching(opts.Model) {
+		addAnthropicPromptCaching(prompt)
+	}
+	return canonical, prompt
+}
+
 // guardedAttempt owns an attempt-scoped context and startup guard
 // around a provider stream. release is idempotent and frees the
 // attempt-scoped timer/context. finish canonicalizes startup timeout