diff --git a/.server-changes/resume-retry-transient-db.md b/.server-changes/resume-retry-transient-db.md new file mode 100644 index 0000000000..bdd7043291 --- /dev/null +++ b/.server-changes/resume-retry-transient-db.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Runs resuming after a wait no longer fail with TASK_EXECUTION_ABORTED when the database is briefly unreachable; the resume endpoint returns a retryable response for transient infrastructure errors instead of a permanent one. diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts index 85b3233989..3c264adcd2 100644 --- a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts +++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts @@ -4,7 +4,7 @@ import type { WorkerApiContinueRunExecutionRequestBody } from "@trigger.dev/core import { z } from "zod"; import { logger } from "~/services/logger.server"; import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server"; -import { clientSafeErrorMessage } from "~/utils/prismaErrors"; +import { clientSafeErrorMessage, isInfrastructureError } from "~/utils/prismaErrors"; export const loader = createLoaderWorkerApiRoute( { @@ -31,7 +31,21 @@ export const loader = createLoaderWorkerApiRoute( return json(continuationResult); } catch (error) { - logger.warn("Failed to suspend run", { runFriendlyId, snapshotFriendlyId, error }); + logger.warn("Failed to continue run execution", { + runFriendlyId, + snapshotFriendlyId, + error, + }); + + // A Prisma infrastructure error (e.g. P1001 "Can't reach database + // server") means the DB was transiently unreachable while resuming. 
A 422 + // is non-retryable, so the worker would permanently abort a run over a + // blip. Let it propagate to the generic 500 handler, which scrubs the + // message and returns a 5xx that the worker's HTTP client retries. + if (isInfrastructureError(error)) { + throw error; + } + if (error instanceof Error) { throw json({ error: clientSafeErrorMessage(error) }, { status: 422 }); } diff --git a/packages/core/src/v3/runEngineWorker/supervisor/http.ts b/packages/core/src/v3/runEngineWorker/supervisor/http.ts index 7b605373d0..bbb1fe227d 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/http.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/http.ts @@ -245,6 +245,21 @@ export class SupervisorHttpClient { ...this.defaultHeaders, ...this.runnerIdHeader(runnerId), }, + }, + { + // This is the hop that reaches the engine, so it's where a transient + // database outage during resume surfaces (as a retryable 5xx). Resuming + // is idempotent server-side (guarded by the snapshot id), so retry + // generously to ride out the outage rather than aborting the run. + // `randomize` jitters the delay so a fleet of runs resuming at once + // doesn't stampede the DB the moment it recovers. + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 10_000, + maxAttempts: 8, + factor: 2, + randomize: true, + }, } ); } diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts index 8f4703a9e6..43328b7244 100644 --- a/packages/core/src/v3/runEngineWorker/workload/http.ts +++ b/packages/core/src/v3/runEngineWorker/workload/http.ts @@ -132,6 +132,20 @@ export class WorkloadHttpClient { headers: { ...this.defaultHeaders(), }, + }, + { + // This hop only reaches the supervisor's workload server, so retry + // generously with jittered backoff to ride out a transient blip + // talking to the supervisor (e.g. a restart) rather than aborting the + // run. 
Database outages surface one hop further in, on the + // supervisor-to-engine call, which carries its own retry for them. + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 10_000, + maxAttempts: 8, + factor: 2, + randomize: true, + }, } ) );