From 79da4f8fcb064c20fa77c66fb3a1466ec10d7ac2 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Sun, 5 Jul 2026 15:26:23 +0100 Subject: [PATCH 1/5] fix(webapp,core): retry run resume through transient database outages Resuming a run after a wait calls the engine's continue endpoint. When the database was briefly unreachable, that route caught the Prisma infrastructure error and returned a non-retryable 422, so the worker aborted the run with TASK_EXECUTION_ABORTED over a transient blip. The continue route now lets infrastructure errors propagate to the generic 500 handler (scrubbed and retryable), matching how the trigger path already treats them. The worker's continue call also retries with a longer, jittered backoff so it can ride out an outage lasting tens of seconds without stampeding the database on recovery. Genuine validation errors still return 422. --- .changeset/resume-retry-transient-db.md | 5 +++++ ...d.snapshots.$snapshotFriendlyId.continue.ts | 18 ++++++++++++++++-- .../src/v3/runEngineWorker/workload/http.ts | 14 ++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 .changeset/resume-retry-transient-db.md diff --git a/.changeset/resume-retry-transient-db.md b/.changeset/resume-retry-transient-db.md new file mode 100644 index 00000000000..048ccb0fc2b --- /dev/null +++ b/.changeset/resume-retry-transient-db.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Runs resuming after a wait now survive a transient platform database outage instead of failing with `TASK_EXECUTION_ABORTED`. The worker retries the resume call generously with jittered backoff, so a brief blip while the run is being continued no longer aborts it. 
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts index 85b3233989d..3c264adcd23 100644 --- a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts +++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts @@ -4,7 +4,7 @@ import type { WorkerApiContinueRunExecutionRequestBody } from "@trigger.dev/core import { z } from "zod"; import { logger } from "~/services/logger.server"; import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server"; -import { clientSafeErrorMessage } from "~/utils/prismaErrors"; +import { clientSafeErrorMessage, isInfrastructureError } from "~/utils/prismaErrors"; export const loader = createLoaderWorkerApiRoute( { @@ -31,7 +31,21 @@ export const loader = createLoaderWorkerApiRoute( return json(continuationResult); } catch (error) { - logger.warn("Failed to suspend run", { runFriendlyId, snapshotFriendlyId, error }); + logger.warn("Failed to continue run execution", { + runFriendlyId, + snapshotFriendlyId, + error, + }); + + // A Prisma infrastructure error (e.g. P1001 "Can't reach database + // server") means the DB was transiently unreachable while resuming. A 422 + // is non-retryable, so the worker would permanently abort a run over a + // blip. Let it propagate to the generic 500 handler, which scrubs the + // message and is retried by the worker's HTTP client. 
+ if (isInfrastructureError(error)) { + throw error; + } + if (error instanceof Error) { throw json({ error: clientSafeErrorMessage(error) }, { status: 422 }); } diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts index 8f4703a9e62..064b39a401d 100644 --- a/packages/core/src/v3/runEngineWorker/workload/http.ts +++ b/packages/core/src/v3/runEngineWorker/workload/http.ts @@ -132,6 +132,20 @@ export class WorkloadHttpClient { headers: { ...this.defaultHeaders(), }, + }, + { + // Resuming after a wait is idempotent (guarded server-side by the + // snapshot id), so retry generously to ride out a transient database + // outage rather than aborting the run. `randomize` jitters the delay + // so a fleet of runs resuming at once doesn't stampede the DB the + // moment it recovers. + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 10_000, + maxAttempts: 8, + factor: 2, + randomize: true, + }, } ) ); From 83348201733e9117bdd4f4fee53129cc4ed2f545 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Sun, 5 Jul 2026 15:33:39 +0100 Subject: [PATCH 2/5] fix(core): extend resume retry to the supervisor engine hop The supervisor-to-engine hop is the one that reaches the continue endpoint, so it is where a transient database outage surfaces as a retryable 5xx. Give its continueRunExecution the same longer, jittered retry budget as the workload client so it can ride out the outage. 
--- .../src/v3/runEngineWorker/supervisor/http.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/packages/core/src/v3/runEngineWorker/supervisor/http.ts b/packages/core/src/v3/runEngineWorker/supervisor/http.ts index 7b605373d07..bbb1fe227de 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/http.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/http.ts @@ -245,6 +245,21 @@ export class SupervisorHttpClient { ...this.defaultHeaders, ...this.runnerIdHeader(runnerId), }, + }, + { + // This is the hop that reaches the engine, so it's where a transient + // database outage during resume surfaces (as a retryable 5xx). Resuming + // is idempotent server-side (guarded by the snapshot id), so retry + // generously to ride out the outage rather than aborting the run. + // `randomize` jitters the delay so a fleet of runs resuming at once + // doesn't stampede the DB the moment it recovers. + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 10_000, + maxAttempts: 8, + factor: 2, + randomize: true, + }, } ); } From 81ffb3fd8763bd35c57f0c6af8b450814a06fae0 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Sun, 5 Jul 2026 15:48:57 +0100 Subject: [PATCH 3/5] docs(core): clarify what the workload resume retry covers The database-outage retry lives on the supervisor-to-engine hop; the workload client only reaches the supervisor's workload server, so its retry rides out supervisor blips (e.g. a restart), not DB outages. Fix the comment to say so. 
--- packages/core/src/v3/runEngineWorker/workload/http.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts index 064b39a401d..43328b7244c 100644 --- a/packages/core/src/v3/runEngineWorker/workload/http.ts +++ b/packages/core/src/v3/runEngineWorker/workload/http.ts @@ -134,11 +134,11 @@ export class WorkloadHttpClient { }, }, { - // Resuming after a wait is idempotent (guarded server-side by the - // snapshot id), so retry generously to ride out a transient database - // outage rather than aborting the run. `randomize` jitters the delay - // so a fleet of runs resuming at once doesn't stampede the DB the - // moment it recovers. + // This hop only reaches the supervisor's workload server, so retry + // generously with jittered backoff to ride out a transient blip + // talking to the supervisor (e.g. a restart) rather than aborting the + // run. Database outages surface one hop further in, on the + // supervisor-to-engine call, which carries its own retry for them. retry: { minTimeoutInMs: 500, maxTimeoutInMs: 10_000, From 6c615be681c58905aab71cb05376fb14966b04ec Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Sun, 5 Jul 2026 18:03:05 +0100 Subject: [PATCH 4/5] refactor(webapp,core): scope resume-retry fix to the server route Drop the worker HTTP-client retry tuning and keep only the continue route change, so the fix is server-only. Swap the package changeset for a .server-changes entry. 
--- .changeset/resume-retry-transient-db.md | 5 ----- .server-changes/resume-retry-transient-db.md | 6 ++++++ .../src/v3/runEngineWorker/supervisor/http.ts | 15 --------------- .../core/src/v3/runEngineWorker/workload/http.ts | 14 -------------- 4 files changed, 6 insertions(+), 34 deletions(-) delete mode 100644 .changeset/resume-retry-transient-db.md create mode 100644 .server-changes/resume-retry-transient-db.md diff --git a/.changeset/resume-retry-transient-db.md b/.changeset/resume-retry-transient-db.md deleted file mode 100644 index 048ccb0fc2b..00000000000 --- a/.changeset/resume-retry-transient-db.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/core": patch ---- - -Runs resuming after a wait now survive a transient platform database outage instead of failing with `TASK_EXECUTION_ABORTED`. The worker retries the resume call generously with jittered backoff, so a brief blip while the run is being continued no longer aborts it. diff --git a/.server-changes/resume-retry-transient-db.md b/.server-changes/resume-retry-transient-db.md new file mode 100644 index 00000000000..bdd70432917 --- /dev/null +++ b/.server-changes/resume-retry-transient-db.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Runs resuming after a wait no longer fail with TASK_EXECUTION_ABORTED when the database is briefly unreachable; the resume endpoint returns a retryable response for transient infrastructure errors instead of a permanent one. 
diff --git a/packages/core/src/v3/runEngineWorker/supervisor/http.ts b/packages/core/src/v3/runEngineWorker/supervisor/http.ts index bbb1fe227de..7b605373d07 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/http.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/http.ts @@ -245,21 +245,6 @@ export class SupervisorHttpClient { ...this.defaultHeaders, ...this.runnerIdHeader(runnerId), }, - }, - { - // This is the hop that reaches the engine, so it's where a transient - // database outage during resume surfaces (as a retryable 5xx). Resuming - // is idempotent server-side (guarded by the snapshot id), so retry - // generously to ride out the outage rather than aborting the run. - // `randomize` jitters the delay so a fleet of runs resuming at once - // doesn't stampede the DB the moment it recovers. - retry: { - minTimeoutInMs: 500, - maxTimeoutInMs: 10_000, - maxAttempts: 8, - factor: 2, - randomize: true, - }, } ); } diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts index 43328b7244c..8f4703a9e62 100644 --- a/packages/core/src/v3/runEngineWorker/workload/http.ts +++ b/packages/core/src/v3/runEngineWorker/workload/http.ts @@ -132,20 +132,6 @@ export class WorkloadHttpClient { headers: { ...this.defaultHeaders(), }, - }, - { - // This hop only reaches the supervisor's workload server, so retry - // generously with jittered backoff to ride out a transient blip - // talking to the supervisor (e.g. a restart) rather than aborting the - // run. Database outages surface one hop further in, on the - // supervisor-to-engine call, which carries its own retry for them. 
- retry: { - minTimeoutInMs: 500, - maxTimeoutInMs: 10_000, - maxAttempts: 8, - factor: 2, - randomize: true, - }, } ) ); From 988f53d69f8af93a29d110db4eb2ebbb0b76bcb7 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Sun, 5 Jul 2026 19:45:45 +0100 Subject: [PATCH 5/5] fix(core): jittered retry on the resume hops for transient DB outages Restore the extended, jittered retry on the workload and supervisor continueRunExecution calls. The supervisor-to-engine hop is the one that reaches the continue endpoint, so its retry is what rides out a transient database outage; the workload-to-supervisor hop only reaches the supervisor's workload server, so its retry rides out supervisor blips (e.g. a restart). Covered by the existing .server-changes entry added earlier in this series; no package changeset. --- .../src/v3/runEngineWorker/supervisor/http.ts | 15 +++++++++++++++ .../core/src/v3/runEngineWorker/workload/http.ts | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/packages/core/src/v3/runEngineWorker/supervisor/http.ts b/packages/core/src/v3/runEngineWorker/supervisor/http.ts index 7b605373d07..bbb1fe227de 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/http.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/http.ts @@ -245,6 +245,21 @@ export class SupervisorHttpClient { ...this.defaultHeaders, ...this.runnerIdHeader(runnerId), }, + }, + { + // This is the hop that reaches the engine, so it's where a transient + // database outage during resume surfaces (as a retryable 5xx). Resuming + // is idempotent server-side (guarded by the snapshot id), so retry + // generously to ride out the outage rather than aborting the run. + // `randomize` jitters the delay so a fleet of runs resuming at once + // doesn't stampede the DB the moment it recovers. 
+ retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 10_000, + maxAttempts: 8, + factor: 2, + randomize: true, + }, } ); } diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts index 8f4703a9e62..43328b7244c 100644 --- a/packages/core/src/v3/runEngineWorker/workload/http.ts +++ b/packages/core/src/v3/runEngineWorker/workload/http.ts @@ -132,6 +132,20 @@ export class WorkloadHttpClient { headers: { ...this.defaultHeaders(), }, + }, + { + // This hop only reaches the supervisor's workload server, so retry + // generously with jittered backoff to ride out a transient blip + // talking to the supervisor (e.g. a restart) rather than aborting the + // run. Database outages surface one hop further in, on the + // supervisor-to-engine call, which carries its own retry for them. + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 10_000, + maxAttempts: 8, + factor: 2, + randomize: true, + }, } ) );