triggerdotdev · matt-aitken · Jul 5, 2026 · Jul 5, 2026 · Jul 5, 2026 · Jul 5, 2026
diff --git a/.server-changes/resume-retry-transient-db.md b/.server-changes/resume-retry-transient-db.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: fix
+---
+
+Runs resuming after a wait no longer fail with TASK_EXECUTION_ABORTED when the database is briefly unreachable; the resume endpoint now returns a retryable response for transient infrastructure errors instead of a non-retryable one.
diff --git a/...es/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts b/...es/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts
@@ -4,7 +4,7 @@ import type { WorkerApiContinueRunExecutionRequestBody } from "@trigger.dev/core
 import { z } from "zod";
 import { logger } from "~/services/logger.server";
 import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
-import { clientSafeErrorMessage } from "~/utils/prismaErrors";
+import { clientSafeErrorMessage, isInfrastructureError } from "~/utils/prismaErrors";
 
 export const loader = createLoaderWorkerApiRoute(
   {
@@ -31,7 +31,21 @@ export const loader = createLoaderWorkerApiRoute(
 
       return json(continuationResult);
     } catch (error) {
-      logger.warn("Failed to suspend run", { runFriendlyId, snapshotFriendlyId, error });
+      logger.warn("Failed to continue run execution", {
+        runFriendlyId,
+        snapshotFriendlyId,
+        error,
+      });
+
+      // A Prisma infrastructure error (e.g. P1001 "Can't reach database
+      // server") means the DB was transiently unreachable while resuming. A 422
+      // is non-retryable, so the worker would permanently abort a run over a
+      // blip. Let it propagate to the generic 500 handler, which scrubs the
+      // message; the worker's HTTP client retries the resulting 5xx.
+      if (isInfrastructureError(error)) {
+        throw error;
+      }
+
       if (error instanceof Error) {
         throw json({ error: clientSafeErrorMessage(error) }, { status: 422 });
       }

diff --git a/packages/core/src/v3/runEngineWorker/supervisor/http.ts b/packages/core/src/v3/runEngineWorker/supervisor/http.ts
@@ -245,6 +245,21 @@ export class SupervisorHttpClient {
           ...this.defaultHeaders,
           ...this.runnerIdHeader(runnerId),
         },
+      },
+      {
+        // This is the hop that reaches the engine, so it's where a transient
+        // database outage during resume surfaces (as a retryable 5xx). Resuming
+        // is idempotent server-side (guarded by the snapshot id), so retry
+        // generously to ride out the outage rather than aborting the run.
+        // `randomize` jitters the delay so a fleet of runs resuming at once
+        // doesn't stampede the DB the moment it recovers.
+        retry: {
+          minTimeoutInMs: 500,
+          maxTimeoutInMs: 10_000,
+          maxAttempts: 8,
+          factor: 2,
+          randomize: true,
+        },
       }
     );
   }

diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts
@@ -132,6 +132,20 @@ export class WorkloadHttpClient {
           headers: {
             ...this.defaultHeaders(),
           },
+        },
+        {
+          // This hop only reaches the supervisor's workload server, so retry
+          // generously with jittered backoff to ride out a transient blip
+          // talking to the supervisor (e.g. a restart) rather than aborting the
+          // run. Database outages surface one hop further in, on the
+          // supervisor-to-engine call, which carries its own retry for them.
+          retry: {
+            minTimeoutInMs: 500,
+            maxTimeoutInMs: 10_000,
+            maxAttempts: 8,
+            factor: 2,
+            randomize: true,
+          },
         }
       )
     );