diff --git a/.changeset/tricky-keys-attack.md b/.changeset/tricky-keys-attack.md
new file mode 100644
index 00000000000..271096497d7
--- /dev/null
+++ b/.changeset/tricky-keys-attack.md
@@ -0,0 +1,14 @@
+---
+"trigger.dev": patch
+"@trigger.dev/core": patch
+---
+
+- Clear paused states before retry
+- Detect and handle unrecoverable worker errors
+- Remove checkpoints after successful push
+- Permanently switch to DigitalOcean-hosted busybox image
+- Fix IPC timeout issue, or at least handle it more gracefully
+- Handle checkpoint failures
+- Basic chaos monkey for checkpoint testing
+- Stack traces are back in the dashboard
+- Display final errors on root span
diff --git a/.changeset/warm-olives-provide.md b/.changeset/warm-olives-provide.md
new file mode 100644
index 00000000000..b57d242a52b
--- /dev/null
+++ b/.changeset/warm-olives-provide.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/core": patch
+---
+
+Improve handling of IPC timeouts and fix checkpoint cancellation after failures
diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts
index e49aa04ebd0..1bdbc4bc445 100644
--- a/apps/coordinator/src/index.ts
+++ b/apps/coordinator/src/index.ts
@@ -1,5 +1,6 @@
 import { createServer } from "node:http";
-import { $ } from "execa";
+import fs from "node:fs/promises";
+import { $, type ExecaChildProcess } from "execa";
 import { nanoid } from "nanoid";
 import { Server } from "socket.io";
 import {
@@ -19,6 +20,14 @@ collectDefaultMetrics();
 const HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || 8020);
 const NODE_NAME = process.env.NODE_NAME || "coordinator";
 const DEFAULT_RETRY_DELAY_THRESHOLD_IN_MS = 30_000;
+// Opt-in fault injection for checkpoint testing. Parsed like the other boolean env flags,
+// so values such as "false" or "0" leave it disabled instead of being treated as truthy.
+const CHAOS_MONKEY_ENABLED = ["1", "true"].includes(process.env.CHAOS_MONKEY_ENABLED ?? "false");
+
+// Passed to the Checkpointer as `forceSimulate`; defaults to true when the variable is unset.
+const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes(
+  process.env.FORCE_CHECKPOINT_SIMULATION ?? "true"
+);
 
 const REGISTRY_HOST = process.env.REGISTRY_HOST || "localhost:5000";
 const CHECKPOINT_PATH = process.env.CHECKPOINT_PATH || "/checkpoints";
@@ -32,6 +38,10 @@ const SECURE_CONNECTION = ["1", "true"].includes(process.env.SECURE_CONNECTION ?
 
 const logger = new SimpleLogger(`[${NODE_NAME}]`);
 
+if (CHAOS_MONKEY_ENABLED) {
+  logger.log("🍌 Chaos monkey enabled");
+}
+
 type CheckpointerInitializeReturn = {
   canCheckpoint: boolean;
   willSimulate: boolean;
@@ -49,6 +59,57 @@ type CheckpointData = {
   docker: boolean;
 };
 
+/**
+ * Narrows an unknown value (e.g. a caught error) to an awaited execa result.
+ * Only the presence of the `escapedCommand` property is checked.
+ */
+function isExecaChildProcess(maybeExeca: unknown): maybeExeca is Awaited<ExecaChildProcess> {
+  return typeof maybeExeca === "object" && maybeExeca !== null && "escapedCommand" in maybeExeca;
+}
+
+/**
+ * Returns the size of `filePath` in bytes, or -1 when the file cannot be stat'ed.
+ * Failures are logged and swallowed rather than thrown.
+ */
+async function getFileSize(filePath: string): Promise<number> {
+  try {
+    const stats = await fs.stat(filePath);
+    return stats.size;
+  } catch (error) {
+    console.error("Error getting file size:", error);
+    return -1;
+  }
+}
+
+/**
+ * Describes the size of `filePath` for log output.
+ *
+ * @returns the path, the raw byte count (-1 if unknown) and a human-readable message that
+ * uses MB above 1024 * 1024 bytes, KB above 1024 bytes and plain bytes otherwise.
+ */
+async function getParsedFileSize(filePath: string) {
+  const sizeInBytes = await getFileSize(filePath);
+
+  let message = `Size in bytes: ${sizeInBytes}`;
+
+  if (sizeInBytes < 0) {
+    // getFileSize signals failure with -1; don't present the sentinel as a real size
+    message = "Size unknown";
+  } else if (sizeInBytes > 1024 * 1024) {
+    const sizeInMB = (sizeInBytes / 1024 / 1024).toFixed(2);
+    message = `Size in MB (rounded): ${sizeInMB}`;
+  } else if (sizeInBytes > 1024) {
+    const sizeInKB = (sizeInBytes / 1024).toFixed(2);
+    message = `Size in KB (rounded): ${sizeInKB}`;
+  }
+
+  return {
+    path: filePath,
+    sizeInBytes,
+    message,
+  };
+}
+
 class Checkpointer {
   #initialized = false;
   #canCheckpoint = false;
@@ -56,6 +100,7 @@ class Checkpointer {
 
   #logger = new SimpleLogger("[checkptr]");
   #abortControllers = new Map<string, AbortController>();
+  #failedCheckpoints = new Map<string, unknown>();
 
   constructor(private opts = { forceSimulate: false }) {}
 
@@ -150,7 +195,11 @@ class Checkpointer {
       success: !!result,
     });
 
-    return result;
+    if (!result.success) {
+      return;
+    }
+
+    return result.checkpoint;
   }
 
   isCheckpointing(runId: string) {
@@ -158,6 +207,13 @@ class Checkpointer {
   }
 
   cancelCheckpoint(runId: string): boolean {
+    // If the last checkpoint failed, pretend we canceled it
+    // This ensures tasks don't wait for external resume messages to continue
+    if (this.#hasFailedCheckpoint(runId)) {
+      this.#clearFailedCheckpoint(runId);
+      return true;
+    }
+
     const controller = this.#abortControllers.get(runId);
 
     if (!controller) {
@@ -176,44 +232,58 @@ class Checkpointer {
     leaveRunning = true, // This mirrors kubernetes behaviour more accurately
     projectRef,
     deploymentVersion,
-  }: CheckpointAndPushOptions): Promise<CheckpointData | undefined> {
+  }: CheckpointAndPushOptions): Promise<
+    { success: true; checkpoint: CheckpointData } | { success: false; reason?: "CANCELED" }
+  > {
     await this.initialize();
 
+    const options = {
+      runId,
+      leaveRunning,
+      projectRef,
+      deploymentVersion,
+    };
+
     if (!this.#dockerMode && !this.#canCheckpoint) {
       this.#logger.error("No checkpoint support. Simulation requires docker.");
-      return;
+      return { success: false };
     }
 
     if (this.#abortControllers.has(runId)) {
-      logger.error("Checkpoint procedure already in progress", {
-        options: {
-          runId,
-          leaveRunning,
-          projectRef,
-          deploymentVersion,
-        },
-      });
-      return;
+      logger.error("Checkpoint procedure already in progress", { options });
+      return { success: false };
     }
 
+    // This is a new checkpoint, clear any last failure for this run
+    this.#clearFailedCheckpoint(runId);
+
     const controller = new AbortController();
     this.#abortControllers.set(runId, controller);
 
     const $$ = $({ signal: controller.signal });
 
     try {
+      if (CHAOS_MONKEY_ENABLED) {
+        console.log("🍌 Chaos monkey wreaking havoc");
+
+        const random = Math.random();
+
+        if (random < 0.33) {
+          // Fake long checkpoint duration
+          await $$`sleep 300`;
+        } else if (random < 0.66) {
+          // Fake checkpoint error
+          await $$`false`;
+        } else {
+          // no-op
+        }
+      }
+
       const shortCode = nanoid(8);
       const imageRef = this.#getImageRef(projectRef, deploymentVersion, shortCode);
       const exportLocation = this.#getExportLocation(projectRef, deploymentVersion, shortCode);
 
-      this.#logger.log("Checkpointing:", {
-        options: {
-          runId,
-          leaveRunning,
-          projectRef,
-          deploymentVersion,
-        },
-      });
+      this.#logger.log("Checkpointing:", { options });
 
       const containterName = this.#getRunContainerName(runId);
 
@@ -234,9 +304,9 @@ class Checkpointer {
               );
             }
           }
-        } catch (error: any) {
-          this.#logger.error(error.stderr);
-          return;
+        } catch (error) {
+          this.#logger.error("Failed while creating docker checkpoint", { exportLocation });
+          throw error;
         }
 
         this.#logger.log("checkpoint created:", {
@@ -245,8 +315,11 @@ class Checkpointer {
         });
 
         return {
-          location: exportLocation,
-          docker: true,
+          success: true,
+          checkpoint: {
+            location: exportLocation,
+            docker: true,
+          },
         };
       }
 
@@ -266,54 +339,104 @@ class Checkpointer {
         throw new Error("could not find container id");
       }
 
+      const start = performance.now();
+
+      // Create checkpoint
       this.#logger.debug(await $$`crictl checkpoint --export=${exportLocation} ${containerId}`);
+      const postCheckpoint = performance.now();
+
+      // Print checkpoint size
+      const size = await getParsedFileSize(exportLocation);
+      this.#logger.log("checkpoint archive created", { size, options });
 
       // Create image from checkpoint
       const container = this.#logger.debug(await $$`buildah from scratch`);
+      const postFrom = performance.now();
+
       this.#logger.debug(await $$`buildah add ${container} ${exportLocation} /`);
+      const postAdd = performance.now();
+
       this.#logger.debug(
         await $$`buildah config --annotation=io.kubernetes.cri-o.annotations.checkpoint.name=counter ${container}`
       );
+      const postConfig = performance.now();
+
       this.#logger.debug(await $$`buildah commit ${container} ${imageRef}`);
+      const postCommit = performance.now();
+
       this.#logger.debug(await $$`buildah rm ${container}`);
+      const postRm = performance.now();
 
       // Push checkpoint image
       this.#logger.debug(await $$`buildah push --tls-verify=${REGISTRY_TLS_VERIFY} ${imageRef}`);
+      const postPush = performance.now();
+
+      const perf = {
+        "crictl checkpoint": postCheckpoint - start,
+        "buildah from": postFrom - postCheckpoint,
+        "buildah add": postAdd - postFrom,
+        "buildah config": postConfig - postAdd,
+        "buildah commit": postCommit - postConfig,
+        "buildah rm": postRm - postCommit,
+        "buildah push": postPush - postRm,
+      };
 
-      this.#logger.log("Checkpointed and pushed image to:", { location: imageRef });
+      this.#logger.log("Checkpointed and pushed image to:", { location: imageRef, perf });
 
       try {
         await $$`rm ${exportLocation}`;
         this.#logger.log("Deleted checkpoint archive", { exportLocation });
 
-        // Disabled for now as this will increase restore time by having to pull the image again
-        // await $`buildah rmi ${imageRef}`;
-        // this.#logger.log("Deleted checkpoint image", { imageRef });
+        await $`buildah rmi ${imageRef}`;
+        this.#logger.log("Deleted checkpoint image", { imageRef });
       } catch (error) {
         this.#logger.error("Failed during checkpoint cleanup", { exportLocation });
-        this.#logger.debug(error);
+        throw error;
       }
 
       return {
-        location: imageRef,
-        docker: false,
+        success: true,
+        checkpoint: {
+          location: imageRef,
+          docker: false,
+        },
       };
     } catch (error) {
-      this.#logger.error("checkpoint failed", {
-        options: {
-          runId,
-          leaveRunning,
-          projectRef,
-          deploymentVersion,
-        },
-        error,
-      });
-      return;
+      if (isExecaChildProcess(error)) {
+        if (error.isCanceled) {
+          this.#logger.error("Checkpoint canceled", { options, error });
+
+          return { success: false, reason: "CANCELED" };
+        }
+
+        // Everything that's not a cancellation is a failure
+        this.#failCheckpoint(runId, error);
+        this.#logger.error("Checkpoint command error", { options, error });
+
+        return { success: false };
+      }
+
+      this.#failCheckpoint(runId, error);
+      this.#logger.error("Unhandled checkpoint error", { options, error });
+
+      return { success: false };
     } finally {
       this.#abortControllers.delete(runId);
     }
   }
 
+  #failCheckpoint(runId: string, error: unknown) {
+    this.#failedCheckpoints.set(runId, error);
+  }
+
+  #clearFailedCheckpoint(runId: string) {
+    this.#failedCheckpoints.delete(runId);
+  }
+
+  #hasFailedCheckpoint(runId: string) {
+    return this.#failedCheckpoints.has(runId);
+  }
+
   #getRunContainerName(suffix: string) {
     return `task-run-${suffix}`;
   }
@@ -321,7 +444,7 @@ class Checkpointer {
 
 class TaskCoordinator {
   #httpServer: ReturnType<typeof createServer>;
-  #checkpointer = new Checkpointer({ forceSimulate: true });
+  #checkpointer = new Checkpointer({ forceSimulate: FORCE_CHECKPOINT_SIMULATION });
 
   #prodWorkerNamespace: ZodNamespace<
     typeof ProdWorkerToCoordinatorMessages,
@@ -442,6 +565,28 @@ class TaskCoordinator {
 
           taskSocket.emit("REQUEST_ATTEMPT_CANCELLATION", message);
         },
+        REQUEST_RUN_CANCELLATION: async (message) => {
+          const taskSocket = await this.#getRunSocket(message.runId);
+
+          if (!taskSocket) {
+            logger.log("Socket for run not found", {
+              runId: message.runId,
+            });
+            return;
+          }
+
+          if (message.delayInMs) {
+            taskSocket.emit("REQUEST_EXIT", {
+              version: "v2",
+              delayInMs: message.delayInMs,
+            });
+          } else {
+            // If there's no delay, assume the worker doesn't support non-v1 messages
+            taskSocket.emit("REQUEST_EXIT", {
+              version: "v1",
+            });
+          }
+        },
         READY_FOR_RETRY: async (message) => {
           const taskSocket = await this.#getRunSocket(message.runId);
 
@@ -528,6 +673,20 @@ class TaskCoordinator {
       onConnection: async (socket, handler, sender) => {
         const logger = new SimpleLogger(`[prod-worker][${socket.id}]`);
 
+        const crashRun = async (error: { name: string; message: string; stack?: string }) => {
+          try {
+            this.#platformSocket?.send("RUN_CRASHED", {
+              version: "v1",
+              runId: socket.data.runId,
+              error,
+            });
+          } finally {
+            socket.emit("REQUEST_EXIT", {
+              version: "v1",
+            });
+          }
+        };
+
         const checkpointInProgress = () => {
           return this.#checkpointableTasks.has(socket.data.runId);
         };
@@ -596,8 +755,9 @@ class TaskCoordinator {
             if (!executionAck) {
               logger.error("no execution ack", { runId: socket.data.runId });
 
-              socket.emit("REQUEST_EXIT", {
-                version: "v1",
+              await crashRun({
+                name: "ReadyForExecutionError",
+                message: "No execution ack",
               });
 
               return;
@@ -606,8 +766,9 @@ class TaskCoordinator {
             if (!executionAck.success) {
               logger.error("failed to get execution payload", { runId: socket.data.runId });
 
-              socket.emit("REQUEST_EXIT", {
-                version: "v1",
+              await crashRun({
+                name: "ReadyForExecutionError",
+                message: "Failed to get execution payload",
               });
 
               return;
@@ -624,6 +785,46 @@ class TaskCoordinator {
           }
         });
 
+        socket.on("READY_FOR_LAZY_ATTEMPT", async (message) => {
+          logger.log("[READY_FOR_LAZY_ATTEMPT]", message);
+
+          try {
+            const lazyAttempt = await this.#platformSocket?.sendWithAck("READY_FOR_LAZY_ATTEMPT", {
+              ...message,
+              envId: socket.data.envId,
+            });
+
+            if (!lazyAttempt) {
+              logger.error("no lazy attempt ack", { runId: socket.data.runId });
+
+              await crashRun({
+                name: "ReadyForLazyAttemptError",
+                message: "No lazy attempt ack",
+              });
+
+              return;
+            }
+
+            if (!lazyAttempt.success) {
+              logger.error("failed to get lazy attempt payload", { runId: socket.data.runId });
+
+              await crashRun({
+                name: "ReadyForLazyAttemptError",
+                message: "Failed to get lazy attempt payload",
+              });
+
+              return;
+            }
+
+            socket.emit("EXECUTE_TASK_RUN_LAZY_ATTEMPT", {
+              version: "v1",
+              lazyPayload: lazyAttempt.lazyPayload,
+            });
+          } catch (error) {
+            logger.error("Error", { error });
+          }
+        });
+
         socket.on("READY_FOR_RESUME", async (message) => {
           logger.log("[READY_FOR_RESUME]", message);
 
@@ -714,6 +915,19 @@ class TaskCoordinator {
           }
         });
 
+        socket.on("TASK_RUN_FAILED_TO_RUN", async ({ completion }) => {
+          logger.log("completed task", { completionId: completion.id });
+
+          this.#platformSocket?.send("TASK_RUN_FAILED_TO_RUN", {
+            version: "v1",
+            completion,
+          });
+
+          socket.emit("REQUEST_EXIT", {
+            version: "v1",
+          });
+        });
+
         socket.on("READY_FOR_CHECKPOINT", async (message) => {
           logger.log("[READY_FOR_CHECKPOINT]", message);
 
@@ -890,7 +1104,7 @@ class TaskCoordinator {
           logger.log("[INDEX_TASKS]", message);
 
           const workerAck = await this.#platformSocket?.sendWithAck("CREATE_WORKER", {
-            version: "v1",
+            version: "v2",
             projectRef: socket.data.projectRef,
             envId: socket.data.envId,
             deploymentId: message.deploymentId,
@@ -899,6 +1113,7 @@ class TaskCoordinator {
               packageVersion: message.packageVersion,
               tasks: message.tasks,
             },
+            supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts,
           });
 
           if (!workerAck) {
@@ -917,6 +1132,34 @@ class TaskCoordinator {
             error: message.error,
           });
         });
+
+        socket.on("CREATE_TASK_RUN_ATTEMPT", async (message, callback) => {
+          logger.log("[CREATE_TASK_RUN_ATTEMPT]", message);
+
+          const createAttempt = await this.#platformSocket?.sendWithAck("CREATE_TASK_RUN_ATTEMPT", {
+            runId: message.runId,
+            envId: socket.data.envId,
+          });
+
+          if (!createAttempt?.success) {
+            logger.debug("no ack while creating attempt", message);
+            callback({ success: false });
+            return;
+          }
+
+          socket.data.attemptFriendlyId = createAttempt.executionPayload.execution.attempt.id;
+
+          callback({
+            success: true,
+            executionPayload: createAttempt.executionPayload,
+          });
+        });
+
+        socket.on("UNRECOVERABLE_ERROR", async (message) => {
+          logger.log("[UNRECOVERABLE_ERROR]", message);
+
+          await crashRun(message.error);
+        });
       },
       onDisconnect: async (socket, handler, sender, logger) => {
         this.#platformSocket?.send("LOG", {
@@ -928,13 +1171,16 @@ class TaskCoordinator {
         TASK_HEARTBEAT: async (message) => {
           this.#platformSocket?.send("TASK_HEARTBEAT", message);
         },
+        TASK_RUN_HEARTBEAT: async (message) => {
+          this.#platformSocket?.send("TASK_RUN_HEARTBEAT", message);
+        },
       },
     });
 
     return provider;
   }
 
-  #cancelCheckpoint(runId: string) {
+  #cancelCheckpoint(runId: string): boolean {
     const checkpointWait = this.#checkpointableTasks.get(runId);
 
     if (checkpointWait) {
diff --git a/apps/docker-provider/.env.example b/apps/docker-provider/.env.example
index 2d24f79c8ca..75c54083d1a 100644
--- a/apps/docker-provider/.env.example
+++ b/apps/docker-provider/.env.example
@@ -4,6 +4,8 @@ PLATFORM_WS_PORT=3030
 PLATFORM_SECRET=provider-secret
 SECURE_CONNECTION=false
 
+OTEL_EXPORTER_OTLP_ENDPOINT=http://0.0.0.0:3030/otel
+
 # Use this if you are on macOS
 # COORDINATOR_HOST="host.docker.internal"
 # OTEL_EXPORTER_OTLP_ENDPOINT="http://host.docker.internal:4318"
\ No newline at end of file
diff --git a/apps/docker-provider/src/index.ts b/apps/docker-provider/src/index.ts
index f1f945853c4..a5e588956c8 100644
--- a/apps/docker-provider/src/index.ts
+++ b/apps/docker-provider/src/index.ts
@@ -13,9 +13,14 @@ import { PostStartCauses, PreStopCauses } from "@trigger.dev/core/v3";
 const MACHINE_NAME = process.env.MACHINE_NAME || "local";
 const COORDINATOR_PORT = process.env.COORDINATOR_PORT || 8020;
 const COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1";
+
 const OTEL_EXPORTER_OTLP_ENDPOINT =
   process.env.OTEL_EXPORTER_OTLP_ENDPOINT || "http://0.0.0.0:4318";
 
+const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes(
+  process.env.FORCE_CHECKPOINT_SIMULATION ?? "true"
+);
+
 const logger = new SimpleLogger(`[${MACHINE_NAME}]`);
 
 type InitializeReturn = {
@@ -278,7 +283,7 @@ class DockerTaskOperations implements TaskOperations {
 }
 
 const provider = new ProviderShell({
-  tasks: new DockerTaskOperations({ forceSimulate: true }),
+  tasks: new DockerTaskOperations({ forceSimulate: FORCE_CHECKPOINT_SIMULATION }),
   type: "docker",
 });
 
diff --git a/apps/kubernetes-provider/src/index.ts b/apps/kubernetes-provider/src/index.ts
index 8981c85d928..53870d36d5b 100644
--- a/apps/kubernetes-provider/src/index.ts
+++ b/apps/kubernetes-provider/src/index.ts
@@ -212,7 +212,7 @@ class KubernetesTaskOperations implements TaskOperations {
             },
             {
               name: "populate-taskinfo",
-              image: "docker.io/library/busybox",
+              image: "registry.digitalocean.com/trigger/busybox",
               imagePullPolicy: "IfNotPresent",
               command: ["/bin/sh", "-c"],
               args: ["printenv COORDINATOR_HOST | tee /etc/taskinfo/coordinator-host"],
diff --git a/apps/webapp/app/routes/admin.api.v1.marqs.ts b/apps/webapp/app/routes/admin.api.v1.marqs.ts
new file mode 100644
index 00000000000..14a9fd409e8
--- /dev/null
+++ b/apps/webapp/app/routes/admin.api.v1.marqs.ts
@@ -0,0 +1,43 @@
+import { LoaderFunctionArgs, json } from "@remix-run/server-runtime";
+import { prisma } from "~/db.server";
+import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server";
+import { marqs } from "~/v3/marqs/index.server";
+
+/**
+ * Admin-only endpoint that returns the shared queue details from `marqs` as JSON.
+ *
+ * Responds with 401 when the personal access token is missing/invalid or its user cannot be
+ * found, 403 when the user is not an admin, and 503 when `marqs` is not available.
+ */
+export async function loader({ request, params }: LoaderFunctionArgs) {
+  // Authenticate the request using a personal access token
+  const authenticationResult = await authenticateApiRequestWithPersonalAccessToken(request);
+
+  if (!authenticationResult) {
+    return json({ error: "Invalid or Missing API key" }, { status: 401 });
+  }
+
+  const user = await prisma.user.findUnique({
+    where: {
+      id: authenticationResult.userId,
+    },
+  });
+
+  if (!user) {
+    return json({ error: "Invalid or Missing API key" }, { status: 401 });
+  }
+
+  if (!user.admin) {
+    return json({ error: "You must be an admin to perform this action" }, { status: 403 });
+  }
+
+  // `marqs` may be undefined; without this guard the response would be `json(undefined)`,
+  // i.e. an empty body labelled as JSON
+  if (!marqs) {
+    return json({ error: "marqs is not available" }, { status: 503 });
+  }
+
+  const details = await marqs.getSharedQueueDetails();
+
+  return json(details);
+}
diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts
new file mode 100644
index 00000000000..9c2845f6a52
--- /dev/null
+++ b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts
@@ -0,0 +1,49 @@
+import type { ActionFunctionArgs } from "@remix-run/server-runtime";
+import { json } from "@remix-run/server-runtime";
+import { z } from "zod";
+import { authenticateApiRequest } from "~/services/apiAuth.server";
+import { ServiceValidationError } from "~/v3/services/baseService.server";
+import { CreateTaskRunAttemptService } from "~/v3/services/createTaskRunAttempt.server";
+
+// Route params: `runParam` carries the run's friendly ID
+const ParamsSchema = z.object({
+  runParam: z.string(),
+});
+
+/**
+ * Calls CreateTaskRunAttemptService for the run identified by the `runParam` route
+ * parameter and responds with the returned execution.
+ *
+ * Error responses: 401 when authentication fails, 400 when the route params are invalid,
+ * the validation error's own status (422 if it has none) for a ServiceValidationError,
+ * and 500 for anything else.
+ */
+export async function action({ request, params }: ActionFunctionArgs) {
+  const auth = await authenticateApiRequest(request);
+
+  if (!auth) {
+    return json({ error: "Invalid or Missing API Key" }, { status: 401 });
+  }
+
+  const paramsResult = ParamsSchema.safeParse(params);
+
+  if (!paramsResult.success) {
+    return json({ error: "Invalid or missing run ID" }, { status: 400 });
+  }
+
+  const attemptService = new CreateTaskRunAttemptService();
+
+  try {
+    const result = await attemptService.call(paramsResult.data.runParam, auth.environment);
+
+    return json(result.execution, { status: 200 });
+  } catch (error) {
+    if (!(error instanceof ServiceValidationError)) {
+      const message = error instanceof Error ? error.message : "Internal Server Error";
+
+      return json({ error: message }, { status: 500 });
+    }
+
+    return json({ error: error.message }, { status: error.status ?? 422 });
+  }
+}
diff --git a/apps/webapp/app/services/worker.server.ts b/apps/webapp/app/services/worker.server.ts
index d592482f0d3..0b03fc27d39 100644
--- a/apps/webapp/app/services/worker.server.ts
+++ b/apps/webapp/app/services/worker.server.ts
@@ -44,6 +44,8 @@ import { GraphileMigrationHelperService } from "./db/graphileMigrationHelper.ser
 import { PerformBulkActionService } from "~/v3/services/bulk/performBulkAction.server";
 import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server";
 import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server";
+import { RequeueTaskRunService } from "~/v3/requeueTaskRun.server";
+import { RetryAttemptService } from "~/v3/services/retryAttempt.server";
 
 const workerCatalog = {
   indexEndpoint: z.object({
@@ -158,6 +160,12 @@ const workerCatalog = {
   "v3.performBulkActionItem": z.object({
     bulkActionItemId: z.string(),
   }),
+  "v3.requeueTaskRun": z.object({
+    runId: z.string(),
+  }),
+  "v3.retryAttempt": z.object({
+    runId: z.string(),
+  }),
 };
 
 const executionWorkerCatalog = {
@@ -600,6 +608,24 @@ function getWorkerQueue() {
           await service.performBulkActionItem(payload.bulkActionItemId);
         },
       },
+      "v3.requeueTaskRun": {
+        priority: 0,
+        maxAttempts: 3,
+        handler: async (payload, job) => {
+          const service = new RequeueTaskRunService();
+
+          await service.call(payload.runId);
+        },
+      },
+      "v3.retryAttempt": {
+        priority: 0,
+        maxAttempts: 3,
+        handler: async (payload, job) => {
+          const service = new RetryAttemptService();
+
+          return await service.call(payload.runId);
+        },
+      },
     },
   });
 }
diff --git a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts
index 209954f34e6..79ce1716127 100644
--- a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts
+++ b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts
@@ -54,7 +54,10 @@ export class AuthenticatedSocketConnection {
       schema: clientWebsocketMessages,
       messages: {
         READY_FOR_TASKS: async (payload) => {
-          await this._consumer.registerBackgroundWorker(payload.backgroundWorkerId);
+          await this._consumer.registerBackgroundWorker(
+            payload.backgroundWorkerId,
+            payload.inProgressRuns ?? []
+          );
         },
         BACKGROUND_WORKER_DEPRECATED: async (payload) => {
           await this._consumer.deprecateBackgroundWorker(payload.backgroundWorkerId);
@@ -69,10 +72,22 @@ export class AuthenticatedSocketConnection {
               );
               break;
             }
+            case "TASK_RUN_FAILED_TO_RUN": {
+              await this._consumer.taskRunFailed(
+                payload.backgroundWorkerId,
+                payload.data.completion
+              );
+
+              break;
+            }
             case "TASK_HEARTBEAT": {
               await this._consumer.taskHeartbeat(payload.backgroundWorkerId, payload.data.id);
               break;
             }
+            case "TASK_RUN_HEARTBEAT": {
+              await this._consumer.taskRunHeartbeat(payload.backgroundWorkerId, payload.data.id);
+              break;
+            }
           }
         },
       },
diff --git a/apps/webapp/app/v3/eventRepository.server.ts b/apps/webapp/app/v3/eventRepository.server.ts
index 324975e51b9..4a25d2dbd78 100644
--- a/apps/webapp/app/v3/eventRepository.server.ts
+++ b/apps/webapp/app/v3/eventRepository.server.ts
@@ -10,6 +10,7 @@ import {
   SpanEvents,
   SpanMessagingEvent,
   TaskEventStyle,
+  TaskRunError,
   correctErrorStackTrace,
   createPacketAttributesAsJson,
   flattenAttributes,
@@ -117,6 +118,7 @@ export type QueriedEvent = Prisma.TaskEventGetPayload<{
     isCancelled: true;
     level: true;
     events: true;
+    environmentType: true;
   };
 }>;
 
@@ -156,6 +158,7 @@ export type SpanSummary = {
     isPartial: boolean;
     isCancelled: boolean;
     level: NonNullable<CreatableEvent["level"]>;
+    environmentType: CreatableEventEnvironmentType;
   };
 };
 
@@ -165,6 +168,7 @@ export type UpdateEventOptions = {
   attributes: TraceAttributes;
   endTime?: Date;
   immediate?: boolean;
+  events?: SpanEvents;
 };
 
 export class EventRepository {
@@ -239,7 +243,7 @@ export class EventRepository {
       isCancelled: false,
       status: options?.attributes.isError ? "ERROR" : "OK",
       links: event.links ?? [],
-      events: event.events ?? [],
+      events: event.events ?? (options?.events as any) ?? [],
       duration: calculateDurationFromStart(event.startTime, options?.endTime),
       properties: event.properties as Attributes,
       metadata: event.metadata as Attributes,
@@ -386,6 +390,7 @@ export class EventRepository {
         isCancelled: true,
         level: true,
         events: true,
+        environmentType: true,
       },
       where: {
         traceId,
@@ -421,6 +426,7 @@ export class EventRepository {
           startTime: getDateFromNanoseconds(event.startTime),
           level: event.level,
           events: event.events,
+          environmentType: event.environmentType,
         },
       };
     });
@@ -505,7 +511,11 @@ export class EventRepository {
       });
     }
 
-    const events = transformEvents(span.data.events, fullEvent.metadata as Attributes);
+    const events = transformEvents(
+      span.data.events,
+      fullEvent.metadata as Attributes,
+      traceSummary?.rootSpan.data.environmentType === "DEVELOPMENT"
+    );
 
     return {
       ...fullEvent,
@@ -877,6 +887,33 @@ export function stripAttributePrefix(attributes: Attributes, prefix: string) {
   return result;
 }
 
+/**
+ * Maps a TaskRunError onto ExceptionEventProperties.
+ *
+ * - BUILT_IN_ERROR keeps its name, message and stack trace.
+ * - CUSTOM_ERROR and STRING_ERROR become a generic "Error" carrying the raw value.
+ * - INTERNAL_ERROR becomes "Internal error"; code and message are joined with ": "
+ *   and falsy parts are dropped.
+ */
+export function createExceptionPropertiesFromError(error: TaskRunError): ExceptionEventProperties {
+  switch (error.type) {
+    case "BUILT_IN_ERROR": {
+      const { name, message, stackTrace } = error;
+
+      return { type: name, message, stacktrace: stackTrace };
+    }
+    case "CUSTOM_ERROR":
+    case "STRING_ERROR": {
+      return { type: "Error", message: error.raw };
+    }
+    case "INTERNAL_ERROR": {
+      const parts = [error.code, error.message].filter(Boolean);
+
+      return { type: "Internal error", message: parts.join(": ") };
+    }
+  }
+}
+
 /**
  * Filters out partial events from a batch of creatable events, excluding those that have a corresponding full event.
  * @param batch - The batch of creatable events to filter.
@@ -1097,16 +1137,16 @@ function removePrivateProperties(
   return result;
 }
 
-function transformEvents(events: SpanEvents, properties: Attributes): SpanEvents {
-  return (events ?? []).map((event) => transformEvent(event, properties));
+function transformEvents(events: SpanEvents, properties: Attributes, isDev: boolean): SpanEvents {
+  return (events ?? []).map((event) => transformEvent(event, properties, isDev));
 }
 
-function transformEvent(event: SpanEvent, properties: Attributes): SpanEvent {
+function transformEvent(event: SpanEvent, properties: Attributes, isDev: boolean): SpanEvent {
   if (isExceptionSpanEvent(event)) {
     return {
       ...event,
       properties: {
-        exception: transformException(event.properties.exception, properties),
+        exception: transformException(event.properties.exception, properties, isDev),
       },
     };
   }
@@ -1116,11 +1156,12 @@ function transformEvent(event: SpanEvent, properties: Attributes): SpanEvent {
 
 function transformException(
   exception: ExceptionEventProperties,
-  properties: Attributes
+  properties: Attributes,
+  isDev: boolean
 ): ExceptionEventProperties {
   const projectDirAttributeValue = properties[SemanticInternalAttributes.PROJECT_DIR];
 
-  if (typeof projectDirAttributeValue !== "string") {
+  if (projectDirAttributeValue !== undefined && typeof projectDirAttributeValue !== "string") {
     return exception;
   }
 
@@ -1129,6 +1170,7 @@ function transformException(
     stacktrace: exception.stacktrace
       ? correctErrorStackTrace(exception.stacktrace, projectDirAttributeValue, {
           removeFirstLine: true,
+          isDev,
         })
       : undefined,
   };
diff --git a/apps/webapp/app/v3/failedTaskRun.server.ts b/apps/webapp/app/v3/failedTaskRun.server.ts
new file mode 100644
index 00000000000..79594e73cba
--- /dev/null
+++ b/apps/webapp/app/v3/failedTaskRun.server.ts
@@ -0,0 +1,66 @@
+import { TaskRunFailedExecutionResult } from "@trigger.dev/core/v3";
+import { logger } from "~/services/logger.server";
+import { marqs } from "~/v3/marqs/index.server";
+
+import { TaskRunStatus } from "@trigger.dev/database";
+import { createExceptionPropertiesFromError, eventRepository } from "./eventRepository.server";
+import { BaseService } from "./services/baseService.server";
+
+const FAILABLE_TASK_RUN_STATUSES: TaskRunStatus[] = ["EXECUTING", "PENDING", "WAITING_FOR_DEPLOY"];
+
+export class FailedTaskRunService extends BaseService {
+  public async call(runFriendlyId: string, completion: TaskRunFailedExecutionResult) {
+    const taskRun = await this._prisma.taskRun.findUnique({
+      where: { friendlyId: runFriendlyId },
+    });
+
+    if (!taskRun) {
+      logger.error("[FailedTaskRunService] Task run not found", {
+        runFriendlyId,
+        completion,
+      });
+
+      return;
+    }
+
+    if (!FAILABLE_TASK_RUN_STATUSES.includes(taskRun.status)) {
+      logger.error("[FailedTaskRunService] Task run is not in a failable state", {
+        taskRun,
+        completion,
+      });
+
+      return;
+    }
+
+    // No more retries, we need to fail the task run
+    logger.debug("[FailedTaskRunService] Failing task run", { taskRun, completion });
+
+    await marqs?.acknowledgeMessage(taskRun.id);
+
+    // Now we need to "complete" the task run event/span
+    await eventRepository.completeEvent(taskRun.spanId, {
+      endTime: new Date(),
+      attributes: {
+        isError: true,
+      },
+      events: [
+        {
+          name: "exception",
+          time: new Date(),
+          properties: {
+            exception: createExceptionPropertiesFromError(completion.error),
+          },
+        },
+      ],
+    });
+
+    await this._prisma.taskRun.update({
+      where: {
+        id: taskRun.id,
+      },
+      data: {
+        status: "SYSTEM_FAILURE",
+      },
+    });
+  }
+}
diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts
index 6c9bb340207..ca731f202d9 100644
--- a/apps/webapp/app/v3/handleSocketIo.server.ts
+++ b/apps/webapp/app/v3/handleSocketIo.server.ts
@@ -22,6 +22,7 @@ import { DeploymentIndexFailed } from "./services/deploymentIndexFailed.server";
 import { Redis } from "ioredis";
 import { createAdapter } from "@socket.io/redis-adapter";
 import { CrashTaskRunService } from "./services/crashTaskRun.server";
+import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server";
 
 export const socketIo = singleton("socketIo", initalizeIoServer);
 
@@ -91,6 +92,23 @@ function createCoordinatorNamespace(io: Server) {
           return { success: true, payload };
         }
       },
+      READY_FOR_LAZY_ATTEMPT: async (message) => {
+        try {
+          const payload = await sharedQueueTasks.getLazyAttemptPayload(
+            message.envId,
+            message.runId
+          );
+
+          if (!payload) {
+            logger.error("Failed to retrieve lazy attempt payload", message);
+            return { success: false, reason: "Failed to retrieve payload" };
+          }
+
+          return { success: true, lazyPayload: payload };
+        } catch (error) {
+          return { success: false };
+        }
+      },
       READY_FOR_RESUME: async (message) => {
         const resumeAttempt = new ResumeAttemptService();
         await resumeAttempt.call(message);
@@ -103,9 +121,15 @@ function createCoordinatorNamespace(io: Server) {
           checkpoint: message.checkpoint,
         });
       },
+      TASK_RUN_FAILED_TO_RUN: async (message) => {
+        await sharedQueueTasks.taskRunFailed(message.completion);
+      },
       TASK_HEARTBEAT: async (message) => {
         await sharedQueueTasks.taskHeartbeat(message.attemptFriendlyId);
       },
+      TASK_RUN_HEARTBEAT: async (message) => {
+        await sharedQueueTasks.taskRunHeartbeat(message.runId);
+      },
       CHECKPOINT_CREATED: async (message) => {
         const createCheckpoint = new CreateCheckpointService();
         await createCheckpoint.call(message);
@@ -123,6 +147,7 @@ function createCoordinatorNamespace(io: Server) {
           const worker = await service.call(message.projectRef, environment, message.deploymentId, {
             localOnly: false,
             metadata: message.metadata,
+            supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts,
           });
 
           return { success: !!worker };
@@ -131,13 +156,52 @@ function createCoordinatorNamespace(io: Server) {
           return { success: false };
         }
       },
+      CREATE_TASK_RUN_ATTEMPT: async (message) => {
+        try {
+          const environment = await findEnvironmentById(message.envId);
+
+          if (!environment) {
+            logger.error("Environment not found", { id: message.envId });
+            return { success: false, reason: "Environment not found" };
+          }
+
+          const service = new CreateTaskRunAttemptService();
+          const { attempt } = await service.call(message.runId, environment, false);
+
+          const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt(attempt.id, true);
+
+          if (!payload) {
+            logger.error("Failed to retrieve payload after attempt creation", {
+              id: message.envId,
+            });
+            return { success: false, reason: "Failed to retrieve payload" };
+          }
+
+          return { success: true, executionPayload: payload };
+        } catch (error) {
+          logger.error("Error while creating attempt", { error });
+          return { success: false };
+        }
+      },
       INDEXING_FAILED: async (message) => {
         try {
           const service = new DeploymentIndexFailed();
 
           await service.call(message.deploymentId, message.error);
         } catch (e) {
-          logger.error("Error while indexing", { error: e });
+          logger.error("Error while processing index failure", { error: e });
+        }
+      },
+      RUN_CRASHED: async (message) => {
+        try {
+          const service = new CrashTaskRunService();
+
+          await service.call(message.runId, {
+            reason: `${message.error.name}: ${message.error.message}`,
+            logs: message.error.stack,
+          });
+        } catch (e) {
+          logger.error("Error while processing run failure", { error: e });
         }
       },
     },
diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts
index d26759ec2a2..fc1a5ed833a 100644
--- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts
+++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts
@@ -1,8 +1,10 @@
 import { Context, ROOT_CONTEXT, Span, SpanKind, context, trace } from "@opentelemetry/api";
 import {
   TaskRunExecution,
+  TaskRunExecutionLazyAttemptPayload,
   TaskRunExecutionPayload,
   TaskRunExecutionResult,
+  TaskRunFailedExecutionResult,
   serverWebsocketMessages,
 } from "@trigger.dev/core/v3";
 import { ZodMessageSender } from "@trigger.dev/core/v3/zodMessageHandler";
@@ -14,16 +16,16 @@ import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { logger } from "~/services/logger.server";
 import { marqs, sanitizeQueueName } from "~/v3/marqs/index.server";
 import { EnvironmentVariablesRepository } from "../environmentVariables/environmentVariablesRepository.server";
-import { generateFriendlyId } from "../friendlyIdentifiers";
-import { CancelAttemptService } from "../services/cancelAttempt.server";
 import { CancelTaskRunService } from "../services/cancelTaskRun.server";
 import { CompleteAttemptService } from "../services/completeAttempt.server";
+import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server";
 import {
   SEMINTATTRS_FORCE_RECORDING,
   attributesFromAuthenticatedEnv,
   tracer,
 } from "../tracer.server";
 import { DevSubscriber, devPubSub } from "./devPubSub.server";
+import { FailedTaskRunService } from "../failedTaskRun.server";
 
 const MessageBody = z.discriminatedUnion("type", [
   z.object({
@@ -54,7 +56,6 @@ export class DevQueueConsumer {
   private _taskSuccesses: number = 0;
   private _currentSpan: Span | undefined;
   private _endSpanInNextIteration = false;
-  private _inProgressAttempts: Map<string, string> = new Map(); // Keys are task attempt friendly IDs, values are TaskRun ids/queue message ids
   private _inProgressRuns: Map<string, string> = new Map(); // Keys are task run friendly IDs, values are TaskRun internal ids/queue message ids
 
   constructor(
@@ -78,7 +79,7 @@ export class DevQueueConsumer {
     this._backgroundWorkers.delete(id);
   }
 
-  public async registerBackgroundWorker(id: string) {
+  public async registerBackgroundWorker(id: string, inProgressRuns: string[] = []) {
     const backgroundWorker = await prisma.backgroundWorker.findUnique({
       where: { friendlyId: id, runtimeEnvironmentId: this.env.id },
       include: {
@@ -96,7 +97,10 @@ export class DevQueueConsumer {
 
     this._backgroundWorkers.set(backgroundWorker.id, backgroundWorker);
 
-    logger.debug("Registered background worker", { backgroundWorker: backgroundWorker.id });
+    logger.debug("Registered background worker", {
+      backgroundWorker: backgroundWorker.id,
+      inProgressRuns,
+    });
 
     const subscriber = await devPubSub.subscribe(`backgroundWorker:${backgroundWorker.id}:*`);
 
@@ -113,6 +117,10 @@ export class DevQueueConsumer {
 
     this._backgroundWorkerSubscriber.set(backgroundWorker.id, subscriber);
 
+    for (const runId of inProgressRuns) {
+      this._inProgressRuns.set(runId, runId);
+    }
+
     // Start reading from the queue if we haven't already
     await this.#enable();
   }
@@ -122,15 +130,16 @@ export class DevQueueConsumer {
     completion: TaskRunExecutionResult,
     execution: TaskRunExecution
   ) {
-    this._inProgressAttempts.delete(execution.attempt.id);
-
     if (completion.ok) {
       this._taskSuccesses++;
     } else {
       this._taskFailures++;
     }
 
-    logger.debug("Task run completed", { taskRunCompletion: completion, execution });
+    logger.debug("[DevQueueConsumer] taskAttemptCompleted()", {
+      taskRunCompletion: completion,
+      execution,
+    });
 
     const service = new CompleteAttemptService();
     const result = await service.call({ completion, execution, env: this.env });
@@ -140,7 +149,24 @@ export class DevQueueConsumer {
     }
   }
 
+  public async taskRunFailed(workerId: string, completion: TaskRunFailedExecutionResult) {
+    this._taskFailures++;
+
+    logger.debug("[DevQueueConsumer] taskRunFailed()", { completion });
+
+    this._inProgressRuns.delete(completion.id);
+
+    const service = new FailedTaskRunService();
+
+    await service.call(completion.id, completion);
+  }
+
+  /**
+   * @deprecated Use `taskRunHeartbeat` instead
+   */
   public async taskHeartbeat(workerId: string, id: string, seconds: number = 60) {
+    logger.debug("[DevQueueConsumer] taskHeartbeat()", { id, seconds });
+
     const taskRunAttempt = await prisma.taskRunAttempt.findUnique({
       where: { friendlyId: id },
     });
@@ -152,6 +178,12 @@ export class DevQueueConsumer {
     await marqs?.heartbeatMessage(taskRunAttempt.taskRunId, seconds);
   }
 
+  public async taskRunHeartbeat(workerId: string, id: string, seconds: number = 60) {
+    logger.debug("[DevQueueConsumer] taskRunHeartbeat()", { id, seconds });
+
+    await marqs?.heartbeatMessage(id, seconds);
+  }
+
   public async stop(reason: string = "CLI disconnected") {
     if (!this._enabled) {
       return;
@@ -184,66 +216,23 @@ export class DevQueueConsumer {
   }
 
   async #cancelInProgressRunsAndAttempts(reason: string) {
-    const cancelAttemptService = new CancelAttemptService();
     const cancelTaskRunService = new CancelTaskRunService();
 
     const cancelledAt = new Date();
 
-    const inProgressAttempts = new Map(this._inProgressAttempts);
     const inProgressRuns = new Map(this._inProgressRuns);
 
-    this._inProgressAttempts.clear();
     this._inProgressRuns.clear();
 
-    const inProgressRunsWithNoInProgressAttempts: string[] = [];
-    const inProgressAttemptRunIds = new Set(inProgressAttempts.values());
-
-    for (const [runId, messageId] of inProgressRuns) {
-      if (!inProgressAttemptRunIds.has(messageId)) {
-        inProgressRunsWithNoInProgressAttempts.push(messageId);
-      }
-    }
-
     logger.debug("Cancelling in progress runs and attempts", {
-      attempts: Array.from(inProgressAttempts.keys()),
       runs: Array.from(inProgressRuns.keys()),
     });
 
-    for (const [attemptId, messageId] of inProgressAttempts) {
-      await this.#cancelInProgressAttempt(
-        attemptId,
-        messageId,
-        cancelAttemptService,
-        cancelledAt,
-        reason
-      );
-    }
-
-    for (const runId of inProgressRunsWithNoInProgressAttempts) {
+    for (const [_, runId] of inProgressRuns) {
       await this.#cancelInProgressRun(runId, cancelTaskRunService, cancelledAt, reason);
     }
   }
 
-  async #cancelInProgressAttempt(
-    attemptId: string,
-    messageId: string,
-    cancelAttemptService: CancelAttemptService,
-    cancelledAt: Date,
-    reason: string
-  ) {
-    logger.debug("Cancelling in progress attempt", { attemptId, messageId });
-
-    try {
-      await cancelAttemptService.call(attemptId, messageId, cancelledAt, reason, this.env);
-    } catch (e) {
-      logger.error("Failed to cancel in progress attempt", {
-        attemptId,
-        messageId,
-        error: e,
-      });
-    }
-  }
-
   async #cancelInProgressRun(
     runId: string,
     service: CancelTaskRunService,
@@ -252,16 +241,20 @@ export class DevQueueConsumer {
   ) {
     logger.debug("Cancelling in progress run", { runId });
 
-    const taskRun = await prisma.taskRun.findUnique({
-      where: { id: runId },
-    });
+    const taskRun = runId.startsWith("run_")
+      ? await prisma.taskRun.findUnique({
+          where: { friendlyId: runId },
+        })
+      : await prisma.taskRun.findUnique({
+          where: { id: runId },
+        });
 
     if (!taskRun) {
       return;
     }
 
     try {
-      await service.call(taskRun, { reason, cancelAttempts: false, cancelledAt });
+      await service.call(taskRun, { reason, cancelAttempts: true, cancelledAt });
     } catch (e) {
       logger.error("Failed to cancel in progress run", {
         runId,
@@ -474,141 +467,131 @@ export class DevQueueConsumer {
     }
 
     if (!this._enabled) {
+      logger.debug("Dev queue consumer is disabled", { env: this.env, queueMessage: message });
+
       await marqs?.nackMessage(message.messageId);
       return;
     }
 
-    const taskRunAttempt = await prisma.taskRunAttempt.create({
-      data: {
-        number: lockedTaskRun.attempts[0] ? lockedTaskRun.attempts[0].number + 1 : 1,
-        friendlyId: generateFriendlyId("attempt"),
-        taskRunId: lockedTaskRun.id,
-        startedAt: new Date(),
-        backgroundWorkerId: backgroundTask.workerId,
-        backgroundWorkerTaskId: backgroundTask.id,
-        status: "EXECUTING" as const,
-        queueId: queue.id,
-        runtimeEnvironmentId: this.env.id,
-      },
-    });
-
-    const execution: TaskRunExecution = {
-      task: {
-        id: backgroundTask.slug,
-        filePath: backgroundTask.filePath,
-        exportName: backgroundTask.exportName,
-      },
-      attempt: {
-        id: taskRunAttempt.friendlyId,
-        number: taskRunAttempt.number,
-        startedAt: taskRunAttempt.startedAt ?? taskRunAttempt.createdAt,
-        backgroundWorkerId: backgroundWorker.id,
-        backgroundWorkerTaskId: backgroundTask.id,
-        status: "EXECUTING" as const,
-      },
-      run: {
-        id: lockedTaskRun.friendlyId,
-        payload: lockedTaskRun.payload,
-        payloadType: lockedTaskRun.payloadType,
-        context: lockedTaskRun.context,
-        createdAt: lockedTaskRun.createdAt,
-        tags: lockedTaskRun.tags.map((tag) => tag.name),
-        isTest: lockedTaskRun.isTest,
-        idempotencyKey: lockedTaskRun.idempotencyKey ?? undefined,
-      },
-      queue: {
-        id: queue.friendlyId,
-        name: queue.name,
-      },
-      environment: {
-        id: this.env.id,
-        slug: this.env.slug,
-        type: this.env.type,
-      },
-      organization: {
-        id: this.env.organization.id,
-        slug: this.env.organization.slug,
-        name: this.env.organization.title,
-      },
-      project: {
-        id: this.env.project.id,
-        ref: this.env.project.externalRef,
-        slug: this.env.project.slug,
-        name: this.env.project.name,
-      },
-      batch:
-        lockedTaskRun.batchItems[0] && lockedTaskRun.batchItems[0].batchTaskRun
-          ? { id: lockedTaskRun.batchItems[0].batchTaskRun.friendlyId }
-          : undefined,
-    };
-
     const environmentRepository = new EnvironmentVariablesRepository();
     const variables = await environmentRepository.getEnvironmentVariables(
       this.env.project.id,
       this.env.id
     );
 
-    const payload: TaskRunExecutionPayload = {
-      execution,
-      traceContext: lockedTaskRun.traceContext as Record<string, unknown>,
-      environment: variables.reduce((acc: Record<string, string>, curr) => {
-        acc[curr.key] = curr.value;
-        return acc;
-      }, {}),
-    };
+    if (backgroundWorker.supportsLazyAttempts) {
+      const payload: TaskRunExecutionLazyAttemptPayload = {
+        traceContext: lockedTaskRun.traceContext as Record<string, unknown>,
+        environment: variables.reduce((acc: Record<string, string>, curr) => {
+          acc[curr.key] = curr.value;
+          return acc;
+        }, {}),
+        runId: lockedTaskRun.friendlyId,
+        messageId: lockedTaskRun.id,
+        isTest: lockedTaskRun.isTest,
+      };
 
-    try {
-      // TODO: send trace context down to the CLI
-      await this._sender.send("BACKGROUND_WORKER_MESSAGE", {
-        backgroundWorkerId: backgroundWorker.friendlyId,
-        data: {
-          type: "EXECUTE_RUNS",
-          payloads: [payload],
-        },
-      });
+      try {
+        await this._sender.send("BACKGROUND_WORKER_MESSAGE", {
+          backgroundWorkerId: backgroundWorker.friendlyId,
+          data: {
+            type: "EXECUTE_RUN_LAZY_ATTEMPT",
+            payload,
+          },
+        });
 
-      logger.debug("Saving the in progress attempt", {
-        taskRunAttempt: taskRunAttempt.id,
-        messageId: message.messageId,
-      });
+        logger.debug("Executing the run", {
+          messageId: message.messageId,
+        });
 
-      this._inProgressAttempts.set(taskRunAttempt.friendlyId, message.messageId);
-      this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId);
-    } catch (e) {
-      if (e instanceof Error) {
-        this._currentSpan?.recordException(e);
-      } else {
-        this._currentSpan?.recordException(new Error(String(e)));
+        this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId);
+      } catch (e) {
+        if (e instanceof Error) {
+          this._currentSpan?.recordException(e);
+        } else {
+          this._currentSpan?.recordException(new Error(String(e)));
+        }
+
+        this._endSpanInNextIteration = true;
+
+        // We now need to unlock the task run (no attempt exists yet in the lazy attempt path)
+        await prisma.$transaction([
+          prisma.taskRun.update({
+            where: {
+              id: lockedTaskRun.id,
+            },
+            data: {
+              lockedAt: null,
+              lockedById: null,
+              status: "PENDING",
+            },
+          }),
+        ]);
+
+        this._inProgressRuns.delete(lockedTaskRun.friendlyId);
+
+        // Finally we need to nack the message so it can be retried
+        await marqs?.nackMessage(message.messageId);
+      } finally {
+        setTimeout(() => this.#doWork(), 100);
       }
-
-      this._endSpanInNextIteration = true;
-
-      // We now need to unlock the task run and delete the task run attempt
-      await prisma.$transaction([
-        prisma.taskRun.update({
-          where: {
-            id: lockedTaskRun.id,
-          },
+    } else {
+      const service = new CreateTaskRunAttemptService();
+      const { execution } = await service.call(lockedTaskRun.friendlyId, this.env);
+
+      const payload: TaskRunExecutionPayload = {
+        traceContext: lockedTaskRun.traceContext as Record<string, unknown>,
+        environment: variables.reduce((acc: Record<string, string>, curr) => {
+          acc[curr.key] = curr.value;
+          return acc;
+        }, {}),
+        execution,
+      };
+
+      try {
+        await this._sender.send("BACKGROUND_WORKER_MESSAGE", {
+          backgroundWorkerId: backgroundWorker.friendlyId,
           data: {
-            lockedAt: null,
-            lockedById: null,
-            status: "PENDING",
+            type: "EXECUTE_RUNS",
+            payloads: [payload],
           },
-        }),
-        prisma.taskRunAttempt.delete({
-          where: {
-            id: taskRunAttempt.id,
-          },
-        }),
-      ]);
+        });
 
-      this._inProgressAttempts.delete(taskRunAttempt.friendlyId);
-      this._inProgressRuns.delete(lockedTaskRun.friendlyId);
+        logger.debug("Executing the run", {
+          messageId: message.messageId,
+        });
 
-      // Finally we need to nack the message so it can be retried
-      await marqs?.nackMessage(message.messageId);
-    } finally {
-      setTimeout(() => this.#doWork(), 100);
+        this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId);
+      } catch (e) {
+        if (e instanceof Error) {
+          this._currentSpan?.recordException(e);
+        } else {
+          this._currentSpan?.recordException(new Error(String(e)));
+        }
+
+        this._endSpanInNextIteration = true;
+
+        // We now need to unlock the task run (note: the attempt is not deleted here)
+        await prisma.$transaction([
+          prisma.taskRun.update({
+            where: {
+              id: lockedTaskRun.id,
+            },
+            data: {
+              lockedAt: null,
+              lockedById: null,
+              status: "PENDING",
+            },
+          }),
+        ]);
+
+        this._inProgressRuns.delete(lockedTaskRun.friendlyId);
+
+        // Finally we need to nack the message so it can be retried
+        await marqs?.nackMessage(message.messageId);
+      } finally {
+        setTimeout(() => this.#doWork(), 100);
+      }
     }
   }
 
diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts
index b20c0d99e9d..0816bab3b01 100644
--- a/apps/webapp/app/v3/marqs/index.server.ts
+++ b/apps/webapp/app/v3/marqs/index.server.ts
@@ -21,6 +21,7 @@ import {
   QueueCapacities,
   QueueRange,
 } from "./types";
+import { RequeueTaskRunService } from "../requeueTaskRun.server";
 
 const tracer = trace.getTracer("marqs");
 
@@ -259,6 +260,11 @@ export class MarQS {
           });
         }
 
+        await RequeueTaskRunService.enqueue(
+          messageData.messageId,
+          new Date(Date.now() + this.visibilityTimeoutInMs)
+        );
+
         return message;
       },
       {
@@ -272,6 +278,35 @@ export class MarQS {
     );
   }
 
+  public async getSharedQueueDetails() {
+    const parentQueue = constants.SHARED_QUEUE;
+
+    const { range, selectionId } = await this.queuePriorityStrategy.nextCandidateSelection(
+      parentQueue
+    );
+    const queues = await this.#getChildQueuesWithScores(parentQueue, range);
+
+    const queuesWithScores = await this.#calculateQueueScores(queues, (queue) =>
+      this.#calculateMessageQueueCapacities(queue)
+    );
+
+    // We need to priority shuffle here to ensure all workers aren't just working on the highest priority queue
+    const choice = this.queuePriorityStrategy.chooseQueue(
+      queuesWithScores,
+      parentQueue,
+      selectionId
+    );
+
+    return {
+      selectionId,
+      queues,
+      queuesWithScores,
+      nextRange: range,
+      queueCount: queues.length,
+      queueChoice: choice,
+    };
+  }
+
   /**
    * Dequeue a message from the shared queue (this should be used in production environments)
    */
@@ -350,6 +385,8 @@ export class MarQS {
           [SemanticAttributes.PARENT_QUEUE]: message.parentQueue,
         });
 
+        await RequeueTaskRunService.dequeue(messageId);
+
         await this.#callAcknowledgeMessage({
           parentQueue: message.parentQueue,
           messageKey: this.keys.messageKey(messageId),
@@ -415,6 +452,8 @@ export class MarQS {
           return;
         }
 
+        await RequeueTaskRunService.dequeue(messageId);
+
         await this.#callAcknowledgeMessage({
           parentQueue: oldMessage.parentQueue,
           messageKey: this.keys.messageKey(messageId),
@@ -481,6 +520,8 @@ export class MarQS {
           [SemanticAttributes.PARENT_QUEUE]: message.parentQueue,
         });
 
+        await RequeueTaskRunService.dequeue(messageId);
+
         await this.#callNackMessage({
           messageKey: this.keys.messageKey(messageId),
           messageQueue: message.queue,
@@ -506,16 +547,19 @@ export class MarQS {
 
   // This should increment by the number of seconds, but with a max value of Date.now() + visibilityTimeoutInMs
   public async heartbeatMessage(messageId: string, seconds: number = 30) {
+    // We are still calling this for backwards compatibility, but we should be using the v3.requeueTaskRun job
     await this.#callHeartbeatMessage({
       visibilityQueue: constants.MESSAGE_VISIBILITY_TIMEOUT_QUEUE,
       messageId,
       milliseconds: seconds * 1000,
       maxVisibilityTimeout: Date.now() + this.visibilityTimeoutInMs,
     });
+
+    await RequeueTaskRunService.enqueue(messageId, new Date(Date.now() + seconds * 1000));
   }
 
   get visibilityTimeoutInMs() {
-    return this.options.visibilityTimeoutInMs ?? 300000;
+    return this.options.visibilityTimeoutInMs ?? 300000; // 5 minutes
   }
 
   async readMessage(messageId: string) {
@@ -873,7 +917,6 @@ export class MarQS {
     const result = await this.redis.dequeueMessage(
       messageQueue,
       parentQueue,
-      visibilityQueue,
       concurrencyLimitKey,
       envConcurrencyLimitKey,
       orgConcurrencyLimitKey,
@@ -881,7 +924,6 @@ export class MarQS {
       envCurrentConcurrencyKey,
       orgCurrentConcurrencyKey,
       messageQueue,
-      String(this.options.visibilityTimeoutInMs ?? 300000), // 5 minutes
       String(Date.now()),
       String(this.options.defaultEnvConcurrency),
       String(this.options.defaultOrgConcurrency)
@@ -1007,6 +1049,9 @@ export class MarQS {
     );
   }
 
+  /**
+   * @deprecated This is being replaced by the v3.requeueTaskRun graphile worker job
+   */
   #callHeartbeatMessage({
     visibilityQueue,
     messageId,
@@ -1145,25 +1190,23 @@ end
     });
 
     this.redis.defineCommand("dequeueMessage", {
-      numberOfKeys: 9,
+      numberOfKeys: 8,
       lua: `
--- Keys: childQueue, parentQueue, visibilityQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, currentConcurrencyKey, envCurrentConcurrencyKey, orgCurrentConcurrencyKey
+-- Keys: childQueue, parentQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, currentConcurrencyKey, envCurrentConcurrencyKey, orgCurrentConcurrencyKey
 local childQueue = KEYS[1]
 local parentQueue = KEYS[2]
-local visibilityQueue = KEYS[3]
-local concurrencyLimitKey = KEYS[4]
-local envConcurrencyLimitKey = KEYS[5]
-local orgConcurrencyLimitKey = KEYS[6]
-local currentConcurrencyKey = KEYS[7]
-local envCurrentConcurrencyKey = KEYS[8]
-local orgCurrentConcurrencyKey = KEYS[9]
-
--- Args: childQueueName, visibilityQueue, currentTime, defaultEnvConcurrencyLimit, defaultOrgConcurrencyLimit
+local concurrencyLimitKey = KEYS[3]
+local envConcurrencyLimitKey = KEYS[4]
+local orgConcurrencyLimitKey = KEYS[5]
+local currentConcurrencyKey = KEYS[6]
+local envCurrentConcurrencyKey = KEYS[7]
+local orgCurrentConcurrencyKey = KEYS[8]
+
+-- Args: childQueueName, currentTime, defaultEnvConcurrencyLimit, defaultOrgConcurrencyLimit
 local childQueueName = ARGV[1]
-local visibilityTimeout = tonumber(ARGV[2])
-local currentTime = tonumber(ARGV[3])
-local defaultEnvConcurrencyLimit = ARGV[4]
-local defaultOrgConcurrencyLimit = ARGV[5]
+local currentTime = tonumber(ARGV[2])
+local defaultEnvConcurrencyLimit = ARGV[3]
+local defaultOrgConcurrencyLimit = ARGV[4]
 
 -- Check current org concurrency against the limit
 local orgCurrentConcurrency = tonumber(redis.call('SCARD', orgCurrentConcurrencyKey) or '0')
@@ -1199,11 +1242,9 @@ end
 
 local messageId = messages[1]
 local messageScore = tonumber(messages[2])
-local timeoutScore = currentTime + visibilityTimeout
 
 -- Move message to timeout queue and update concurrency
 redis.call('ZREM', childQueue, messageId)
-redis.call('ZADD', visibilityQueue, timeoutScore, messageId)
 redis.call('SADD', currentConcurrencyKey, messageId)
 redis.call('SADD', envCurrentConcurrencyKey, messageId)
 redis.call('SADD', orgCurrentConcurrencyKey, messageId)
@@ -1269,7 +1310,7 @@ else
     redis.call('ZADD', parentQueue, earliestMessage[2], messageQueueName)
 end
 
--- Remove the message from the timeout queue
+-- Remove the message from the timeout queue (deprecated, will eventually remove this)
 redis.call('ZREM', visibilityQueue, messageId)
 
 -- Update the concurrency keys
@@ -1297,20 +1338,18 @@ local messageId = ARGV[2]
 local currentTime = tonumber(ARGV[3])
 local messageScore = tonumber(ARGV[4])
 
--- Check to see if the message is still in the visibilityQueue
-local messageVisibility = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0
-
-if messageVisibility == 0 then
-    return
-end
-
 -- Update the concurrency keys
 redis.call('SREM', concurrencyKey, messageId)
 redis.call('SREM', envConcurrencyKey, messageId)
 redis.call('SREM', orgConcurrencyKey, messageId)
 
--- Remove the message from the timeout queue
-redis.call('ZREM', visibilityQueue, messageId)
+-- Check to see if the message is still in the visibilityQueue
+local messageVisibility = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0
+
+if messageVisibility > 0 then
+-- Remove the message from the timeout queue (deprecated, will eventually remove this)
+    redis.call('ZREM', visibilityQueue, messageId)
+end
 
 -- Enqueue the message into the queue
 redis.call('ZADD', childQueueKey, messageScore, messageId)
@@ -1337,12 +1376,16 @@ local milliseconds = tonumber(ARGV[2])
 local maxVisibilityTimeout = tonumber(ARGV[3])
 
 -- Get the current visibility timeout
-local currentVisibilityTimeout = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0
+local zscoreResult = redis.call('ZSCORE', visibilityQueue, messageId)
 
-if currentVisibilityTimeout == 0 then
+-- If there's no currentVisibilityTimeout, return and do not execute ZADD
+if zscoreResult == false then
     return
 end
 
+local currentVisibilityTimeout = tonumber(zscoreResult)
+
+
 -- Calculate the new visibility timeout
 local newVisibilityTimeout = math.min(currentVisibilityTimeout + milliseconds * 1000, maxVisibilityTimeout)
 
@@ -1445,7 +1488,6 @@ declare module "ioredis" {
     dequeueMessage(
       childQueue: string,
       parentQueue: string,
-      visibilityQueue: string,
       concurrencyLimitKey: string,
       envConcurrencyLimitKey: string,
       orgConcurrencyLimitKey: string,
@@ -1453,7 +1495,6 @@ declare module "ioredis" {
       envCurrentConcurrencyKey: string,
       orgCurrentConcurrencyKey: string,
       childQueueName: string,
-      visibilityTimeout: string,
       currentTime: string,
       defaultEnvConcurrencyLimit: string,
       defaultOrgConcurrencyLimit: string,
diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts
index b71b1176d0f..0f8f5f6c050 100644
--- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts
+++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts
@@ -5,6 +5,7 @@ import {
   ProdTaskRunExecutionPayload,
   TaskRunError,
   TaskRunExecution,
+  TaskRunExecutionLazyAttemptPayload,
   TaskRunExecutionResult,
   TaskRunFailedExecutionResult,
   TaskRunSuccessfulExecutionResult,
@@ -33,6 +34,9 @@ import {
 import { RestoreCheckpointService } from "../services/restoreCheckpoint.server";
 import { SEMINTATTRS_FORCE_RECORDING, tracer } from "../tracer.server";
 import { CrashTaskRunService } from "../services/crashTaskRun.server";
+import { FailedTaskRunService } from "../failedTaskRun.server";
+import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server";
+import { findEnvironmentById } from "~/models/runtimeEnvironment.server";
 
 const WithTraceContext = z.object({
   traceparent: z.string().optional(),
@@ -260,6 +264,14 @@ export class SharedQueueConsumer {
           where: {
             id: message.messageId,
           },
+          include: {
+            lockedToVersion: {
+              include: {
+                deployment: true,
+                tasks: true,
+              },
+            },
+          },
         });
 
         if (!existingTaskRun) {
@@ -291,7 +303,7 @@ export class SharedQueueConsumer {
           (!retryingFromCheckpoint &&
             !EXECUTABLE_RUN_STATUSES.withoutCheckpoint.includes(existingTaskRun.status))
         ) {
-          logger.debug("Task run has invalid status for execution", {
+          logger.error("Task run has invalid status for execution", {
             queueMessage: message.data,
             messageId: message.messageId,
             taskRun: existingTaskRun.id,
@@ -299,6 +311,12 @@ export class SharedQueueConsumer {
             retryingFromCheckpoint,
           });
 
+          const service = new CrashTaskRunService();
+          await service.call(existingTaskRun.id, {
+            crashAttempts: true,
+            reason: `Invalid run status for execution: ${existingTaskRun.status}`,
+          });
+
           await this.#ackAndDoMoreWork(message.messageId);
           return;
         }
@@ -398,6 +416,7 @@ export class SharedQueueConsumer {
                 createdAt: "desc",
               },
             },
+            lockedBy: true,
           },
         });
 
@@ -443,39 +462,12 @@ export class SharedQueueConsumer {
           return;
         }
 
-        const taskRunAttempt = await prisma.taskRunAttempt.create({
-          data: {
-            number: lockedTaskRun.attempts[0] ? lockedTaskRun.attempts[0].number + 1 : 1,
-            friendlyId: generateFriendlyId("attempt"),
-            taskRunId: lockedTaskRun.id,
-            startedAt: new Date(),
-            backgroundWorkerId: backgroundTask.workerId,
-            backgroundWorkerTaskId: backgroundTask.id,
-            status: "PENDING" as const,
-            queueId: queue.id,
-            runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId,
-          },
-          include: {
-            backgroundWorkerTask: true,
-          },
-        });
-
-        const isRetry = taskRunAttempt.number > 1;
+        const nextAttemptNumber = lockedTaskRun.attempts[0]
+          ? lockedTaskRun.attempts[0].number + 1
+          : 1;
 
-        const { machineConfig } = taskRunAttempt.backgroundWorkerTask;
-        const machine = Machine.safeParse(machineConfig ?? {});
-
-        if (!machine.success) {
-          logger.error("Failed to parse machine config", {
-            queueMessage: message.data,
-            messageId: message.messageId,
-            attemptId: taskRunAttempt.id,
-            machineConfig,
-          });
+        const isRetry = nextAttemptNumber > 1;
 
-          await this.#ackAndDoMoreWork(message.messageId);
-          return;
-        }
         try {
           if (messageBody.data.checkpointEventId) {
             const restoreService = new RestoreCheckpointService();
@@ -494,12 +486,35 @@ export class SharedQueueConsumer {
               await this.#ackAndDoMoreWork(message.messageId);
               return;
             }
-          } else if (isRetry) {
+
+            break;
+          }
+
+          if (!deployment.worker.supportsLazyAttempts) {
+            const service = new CreateTaskRunAttemptService();
+            await service.call(lockedTaskRun.friendlyId, undefined, false);
+          }
+
+          if (isRetry) {
             socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", {
               version: "v1",
-              runId: taskRunAttempt.taskRunId,
+              runId: lockedTaskRun.id,
             });
           } else {
+            const machineConfig = lockedTaskRun.lockedBy?.machineConfig;
+            const machine = Machine.safeParse(machineConfig ?? {});
+
+            if (!machine.success) {
+              logger.error("Failed to parse machine config", {
+                queueMessage: message.data,
+                messageId: message.messageId,
+                machineConfig,
+              });
+
+              await this.#ackAndDoMoreWork(message.messageId);
+              return;
+            }
+
             await this._sender.send("BACKGROUND_WORKER_MESSAGE", {
               backgroundWorkerId: deployment.worker.friendlyId,
               data: {
@@ -508,12 +523,12 @@ export class SharedQueueConsumer {
                 version: deployment.version,
                 machine: machine.data,
                 // identifiers
-                id: taskRunAttempt.id,
+                id: "placeholder", // TODO: Remove this completely in a future release
                 envId: lockedTaskRun.runtimeEnvironment.id,
                 envType: lockedTaskRun.runtimeEnvironment.type,
                 orgId: lockedTaskRun.runtimeEnvironment.organizationId,
                 projectId: lockedTaskRun.runtimeEnvironment.projectId,
-                runId: taskRunAttempt.taskRunId,
+                runId: lockedTaskRun.id,
               },
             });
           }
@@ -535,11 +550,7 @@ export class SharedQueueConsumer {
               data: {
                 lockedAt: null,
                 lockedById: null,
-              },
-            }),
-            prisma.taskRunAttempt.delete({
-              where: {
-                id: taskRunAttempt.id,
+                status: lockedTaskRun.status,
               },
             }),
           ]);
@@ -1096,7 +1107,50 @@ class SharedQueueTasks {
     return this.getExecutionPayloadFromAttempt(latestAttempt.id, setToExecuting, isRetrying);
   }
 
+  // Builds the lazy attempt payload (TaskRunExecutionLazyAttemptPayload) for the given run.
+  // Resolves to undefined, after logging an error, when the environment or run is not found.
+  async getLazyAttemptPayload(
+    envId: string,
+    runId: string
+  ): Promise<TaskRunExecutionLazyAttemptPayload | undefined> {
+    const environment = await findEnvironmentById(envId);
+    if (!environment) {
+      logger.error("Environment not found", { id: envId });
+      return;
+    }
+
+    // The lookup is constrained to the resolved environment as well as the run id.
+    const run = await prisma.taskRun.findUnique({
+      where: { id: runId, runtimeEnvironmentId: environment.id },
+    });
+    if (!run) {
+      logger.error("Run not found", { id: runId, envId });
+      return;
+    }
+
+    const environmentRepository = new EnvironmentVariablesRepository();
+    const resolvedVariables = await environmentRepository.getEnvironmentVariables(
+      environment.projectId,
+      environment.id
+    );
+
+    const workerEnvironment: Record<string, string> = {};
+    for (const variable of resolvedVariables) {
+      workerEnvironment[variable.key] = variable.value;
+    }
+
+    return {
+      traceContext: run.traceContext as Record<string, unknown>,
+      environment: workerEnvironment,
+      runId: run.friendlyId,
+      messageId: run.id,
+      isTest: run.isTest,
+    } satisfies TaskRunExecutionLazyAttemptPayload;
+  }
+
   async taskHeartbeat(attemptFriendlyId: string, seconds: number = 60) {
+    logger.debug("[SharedQueueConsumer] taskHeartbeat()", { id: attemptFriendlyId, seconds });
+
     const taskRunAttempt = await prisma.taskRunAttempt.findUnique({
       where: { friendlyId: attemptFriendlyId },
     });
@@ -1107,6 +1161,20 @@ class SharedQueueTasks {
 
     await marqs?.heartbeatMessage(taskRunAttempt.taskRunId, seconds);
   }
+
+  async taskRunHeartbeat(runId: string, seconds: number = 60) {
+    logger.debug("[SharedQueueConsumer] taskRunHeartbeat()", { runId, seconds });
+    // Forwards the heartbeat to marqs keyed by the run id; nothing happens when marqs is undefined
+    await marqs?.heartbeatMessage(runId, seconds);
+  }
+
+  public async taskRunFailed(completion: TaskRunFailedExecutionResult) {
+    logger.debug("[SharedQueueConsumer] taskRunFailed()", { completion });
+    // Handling of the failed completion is delegated to FailedTaskRunService
+    const service = new FailedTaskRunService();
+    // completion.id is passed as the first argument, followed by the full completion
+    await service.call(completion.id, completion);
+  }
 }
 
 export const sharedQueueTasks = singleton("sharedQueueTasks", () => new SharedQueueTasks());
diff --git a/apps/webapp/app/v3/requeueTaskRun.server.ts b/apps/webapp/app/v3/requeueTaskRun.server.ts
new file mode 100644
index 00000000000..e2b904998fd
--- /dev/null
+++ b/apps/webapp/app/v3/requeueTaskRun.server.ts
@@ -0,0 +1,108 @@
+import { logger } from "~/services/logger.server";
+import { marqs } from "~/v3/marqs/index.server";
+
+import assertNever from "assert-never";
+import { FailedTaskRunService } from "./failedTaskRun.server";
+import { BaseService } from "./services/baseService.server";
+import { PrismaClientOrTransaction } from "~/db.server";
+import { workerQueue } from "~/services/worker.server";
+
+/**
+ * Looks up a task run and, depending on its status, nacks or acknowledges its marqs message
+ * or fails the run with a TASK_RUN_HEARTBEAT_TIMEOUT error.
+ */
+export class RequeueTaskRunService extends BaseService {
+  /**
+   * @param runId Internal id of the task run. A run that cannot be found is logged and ignored.
+   */
+  public async call(runId: string) {
+    const taskRun = await this._prisma.taskRun.findUnique({
+      where: { id: runId },
+    });
+
+    if (!taskRun) {
+      logger.error("[RequeueTaskRunService] Task run not found", {
+        runId,
+      });
+
+      return;
+    }
+
+    switch (taskRun.status) {
+      case "PENDING": {
+        logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun });
+
+        await marqs?.nackMessage(taskRun.id);
+
+        break;
+      }
+      case "EXECUTING":
+      case "RETRYING_AFTER_FAILURE": {
+        logger.debug("[RequeueTaskRunService] Failing task run", { taskRun });
+
+        // The run was in progress, so it is failed with a heartbeat timeout error
+        const service = new FailedTaskRunService();
+
+        await service.call(taskRun.friendlyId, {
+          ok: false,
+          id: taskRun.friendlyId,
+          retry: undefined,
+          error: {
+            type: "INTERNAL_ERROR",
+            code: "TASK_RUN_HEARTBEAT_TIMEOUT",
+            message: "Did not receive a heartbeat from the worker in time",
+          },
+        });
+
+        break;
+      }
+      case "WAITING_FOR_DEPLOY": {
+        logger.debug("[RequeueTaskRunService] Removing task run from queue", { taskRun });
+
+        await marqs?.acknowledgeMessage(taskRun.id);
+
+        break;
+      }
+      case "WAITING_TO_RESUME":
+      case "PAUSED": {
+        logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun });
+
+        await marqs?.nackMessage(taskRun.id);
+
+        break;
+      }
+      case "SYSTEM_FAILURE":
+      case "INTERRUPTED":
+      case "CRASHED":
+      case "COMPLETED_WITH_ERRORS":
+      case "COMPLETED_SUCCESSFULLY":
+      case "CANCELED": {
+        logger.debug("[RequeueTaskRunService] Task run is completed", { taskRun });
+
+        await marqs?.acknowledgeMessage(taskRun.id);
+
+        break;
+      }
+      default: {
+        assertNever(taskRun.status);
+      }
+    }
+  }
+
+  /**
+   * Enqueues a "v3.requeueTaskRun" worker job for the run, keyed by `requeueTaskRun:<runId>`.
+   * `runAt` and `tx` are optional and are passed through to the worker queue as job options.
+   */
+  public static async enqueue(runId: string, runAt?: Date, tx?: PrismaClientOrTransaction) {
+    return await workerQueue.enqueue(
+      "v3.requeueTaskRun",
+      { runId },
+      { runAt, jobKey: `requeueTaskRun:${runId}`, tx }
+    );
+  }
+
+  /** Dequeues the worker job keyed by `requeueTaskRun:<runId>`, passing `tx` through. */
+  public static async dequeue(runId: string, tx?: PrismaClientOrTransaction) {
+    return await workerQueue.dequeue(`requeueTaskRun:${runId}`, { tx });
+  }
+}
diff --git a/apps/webapp/app/v3/services/baseService.server.ts b/apps/webapp/app/v3/services/baseService.server.ts
index 6892118c649..e6b9d0252cb 100644
--- a/apps/webapp/app/v3/services/baseService.server.ts
+++ b/apps/webapp/app/v3/services/baseService.server.ts
@@ -34,7 +34,7 @@ export abstract class BaseService {
 }
 
 export class ServiceValidationError extends Error {
-  constructor(message: string) {
+  constructor(message: string, public status?: number) {
     super(message);
     this.name = "ServiceValidationError";
   }
diff --git a/apps/webapp/app/v3/services/cancelTaskRun.server.ts b/apps/webapp/app/v3/services/cancelTaskRun.server.ts
index 72179a0dce7..a4822d854aa 100644
--- a/apps/webapp/app/v3/services/cancelTaskRun.server.ts
+++ b/apps/webapp/app/v3/services/cancelTaskRun.server.ts
@@ -24,9 +24,15 @@ const CANCELLABLE_ATTEMPT_STATUSES: Array<TaskRunAttemptStatus> = [
   "PENDING",
 ];
 
-type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{
+type ExtendedTaskRun = Prisma.TaskRunGetPayload<{
   include: {
     runtimeEnvironment: true;
+    lockedToVersion: true;
+  };
+}>;
+
+type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{
+  include: {
     backgroundWorker: true;
   };
 }>;
@@ -71,11 +77,10 @@ export class CancelTaskRunService extends BaseService {
           },
           include: {
             backgroundWorker: true,
-            runtimeEnvironment: true,
           },
         },
-        dependency: true,
         runtimeEnvironment: true,
+        lockedToVersion: true,
       },
     });
 
@@ -96,6 +101,7 @@ export class CancelTaskRunService extends BaseService {
     // Cancel any in progress attempts
     if (opts.cancelAttempts) {
       await this.#cancelPotentiallyRunningAttempts(cancelledTaskRun, cancelledTaskRun.attempts);
+      await this.#cancelRemainingRunWorkers(cancelledTaskRun);
     }
 
     return {
@@ -103,9 +109,12 @@ export class CancelTaskRunService extends BaseService {
     };
   }
 
-  async #cancelPotentiallyRunningAttempts(run: TaskRun, attempts: ExtendedTaskRunAttempt[]) {
+  async #cancelPotentiallyRunningAttempts(
+    run: ExtendedTaskRun,
+    attempts: ExtendedTaskRunAttempt[]
+  ) {
     for (const attempt of attempts) {
-      if (attempt.runtimeEnvironment.type === "DEVELOPMENT") {
+      if (run.runtimeEnvironment.type === "DEVELOPMENT") {
         // Signal the task run attempt to stop
         await devPubSub.publish(
           `backgroundWorker:${attempt.backgroundWorkerId}:${attempt.id}`,
@@ -158,4 +167,19 @@ export class CancelTaskRunService extends BaseService {
       }
     }
   }
+
+  async #cancelRemainingRunWorkers(run: ExtendedTaskRun) {
+    if (run.runtimeEnvironment.type === "DEVELOPMENT") {
+      // Nothing to do: the coordinator broadcast below is skipped for development runs
+      return;
+    }
+
+    // Broadcast cancel message to all coordinators
+    socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
+      version: "v1",
+      runId: run.id,
+      // Give the attempts some time to exit gracefully. If the run supports lazy attempts, it also supports exit delays.
+      delayInMs: run.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined,
+    });
+  }
 }
diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts
index d18e3f16b7e..93d491581e5 100644
--- a/apps/webapp/app/v3/services/completeAttempt.server.ts
+++ b/apps/webapp/app/v3/services/completeAttempt.server.ts
@@ -11,7 +11,7 @@ import { PrismaClientOrTransaction } from "~/db.server";
 import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { logger } from "~/services/logger.server";
 import { safeJsonParse } from "~/utils/json";
-import { eventRepository } from "../eventRepository.server";
+import { createExceptionPropertiesFromError, eventRepository } from "../eventRepository.server";
 import { marqs } from "~/v3/marqs/index.server";
 import { BaseService } from "./baseService.server";
 import { CancelAttemptService } from "./cancelAttempt.server";
@@ -20,6 +20,7 @@ import { MAX_TASK_RUN_ATTEMPTS } from "~/consts";
 import { CreateCheckpointService } from "./createCheckpoint.server";
 import { TaskRun } from "@trigger.dev/database";
 import { PerformTaskAttemptAlertsService } from "./alerts/performTaskAttemptAlerts.server";
+import { RetryAttemptService } from "./retryAttempt.server";
 
 type FoundAttempt = Awaited<ReturnType<typeof findAttempt>>;
 
@@ -57,6 +58,8 @@ export class CompleteAttemptService extends BaseService {
         },
       });
 
+      // No attempt, so there's no message to ACK
+
       return "COMPLETED";
     }
 
@@ -143,6 +146,8 @@ export class CompleteAttemptService extends BaseService {
         env
       );
 
+      // The cancel service handles ACK
+
       return "COMPLETED";
     }
 
@@ -173,7 +178,7 @@ export class CompleteAttemptService extends BaseService {
           properties: {
             retryAt: retryAt.toISOString(),
           },
-          runId: taskRunAttempt.taskRunId,
+          runId: taskRunAttempt.taskRun.friendlyId,
           style: {
             icon: "schedule-attempt",
           },
@@ -185,7 +190,10 @@ export class CompleteAttemptService extends BaseService {
         endTime: retryAt,
       });
 
-      logger.debug("Retrying", { taskRun: taskRunAttempt.taskRun.friendlyId });
+      logger.debug("Retrying", {
+        taskRun: taskRunAttempt.taskRun.friendlyId,
+        retry: completion.retry,
+      });
 
       await this._prisma.taskRun.update({
         where: {
@@ -203,7 +211,12 @@ export class CompleteAttemptService extends BaseService {
       }
 
       if (!checkpoint) {
-        await this.#enqueueRetry(taskRunAttempt.taskRun, completion.retry.timestamp);
+        await this.#retryAttempt(
+          taskRunAttempt.taskRun,
+          completion.retry.timestamp,
+          undefined,
+          taskRunAttempt.backgroundWorker.supportsLazyAttempts
+        );
         return "RETRIED";
       }
 
@@ -231,10 +244,12 @@ export class CompleteAttemptService extends BaseService {
           },
         });
 
+        await marqs?.acknowledgeMessage(taskRunAttempt.taskRunId);
+
         return "COMPLETED";
       }
 
-      await this.#enqueueRetry(
+      await this.#retryAttempt(
         taskRunAttempt.taskRun,
         completion.retry.timestamp,
         checkpointCreateResult.event.id
@@ -253,6 +268,15 @@ export class CompleteAttemptService extends BaseService {
         attributes: {
           isError: true,
         },
+        events: [
+          {
+            name: "exception",
+            time: new Date(),
+            properties: {
+              exception: createExceptionPropertiesFromError(completion.error),
+            },
+          },
+        ],
       });
 
       if (
@@ -310,17 +334,28 @@ export class CompleteAttemptService extends BaseService {
     }
   }
 
-  async #enqueueRetry(run: TaskRun, retryTimestamp: number, checkpointEventId?: string) {
-    // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt
-    return await marqs?.replaceMessage(
-      run.id,
-      {
-        type: "EXECUTE",
-        taskIdentifier: run.taskIdentifier,
-        checkpointEventId: checkpointEventId,
-      },
-      retryTimestamp
-    );
+  /**
+   * Schedules the retry of a run. With a checkpoint, or without lazy attempt support, the run's
+   * message is replaced by an EXECUTE message; otherwise a RetryAttemptService job is enqueued.
+   */
+  async #retryAttempt(
+    run: TaskRun,
+    retryTimestamp: number,
+    checkpointEventId?: string,
+    supportsLazyAttempts?: boolean
+  ) {
+    if (checkpointEventId || !supportsLazyAttempts) {
+      // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt
+      return await marqs?.replaceMessage(
+        run.id,
+        { type: "EXECUTE", taskIdentifier: run.taskIdentifier, checkpointEventId },
+        retryTimestamp
+      );
+    }
+
+    // There's no checkpoint, so the worker is still running and waiting for a retry message.
+    // Await the enqueue so a failure rejects this call instead of becoming an unhandled rejection.
+    await RetryAttemptService.enqueue(run.id, this._prisma, new Date(retryTimestamp));
   }
 
   #generateMetadataAttributesForNextAttempt(execution: TaskRunExecution) {
@@ -353,6 +388,7 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId:
     include: {
       taskRun: true,
       backgroundWorkerTask: true,
+      backgroundWorker: true,
     },
   });
 }
diff --git a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts
index 2deb2c5373c..9045075e131 100644
--- a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts
+++ b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts
@@ -63,6 +63,7 @@ export class CreateBackgroundWorkerService extends BaseService {
           contentHash: body.metadata.contentHash,
           cliVersion: body.metadata.cliPackageVersion,
           sdkVersion: body.metadata.packageVersion,
+          supportsLazyAttempts: body.supportsLazyAttempts,
         },
       });
 
diff --git a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts
index 920fbeb4c0e..9b9c1afc984 100644
--- a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts
+++ b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts
@@ -45,6 +45,7 @@ export class CreateDeployedBackgroundWorkerService extends BaseService {
           contentHash: body.metadata.contentHash,
           cliVersion: body.metadata.cliPackageVersion,
           sdkVersion: body.metadata.packageVersion,
+          supportsLazyAttempts: body.supportsLazyAttempts,
         },
       });
 
diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts
new file mode 100644
index 00000000000..91dc5ccad39
--- /dev/null
+++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts
@@ -0,0 +1,234 @@
+import { TaskRunExecution } from "@trigger.dev/core/v3";
+import { $transaction, PrismaClientOrTransaction, prisma } from "~/db.server";
+import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
+import { generateFriendlyId } from "../friendlyIdentifiers";
+import { BaseService, ServiceValidationError } from "./baseService.server";
+import { TaskRun, TaskRunAttempt } from "@trigger.dev/database";
+
+/**
+ * Creates the next attempt for a locked task run and returns it together with the run and the
+ * execution payload built for that attempt.
+ */
+export class CreateTaskRunAttemptService extends BaseService {
+  /**
+   * @param runId Internal id or friendly id ("run_" prefix) of the task run.
+   * @param env Environment the run must belong to; resolved from the run when omitted.
+   * @param setToExecuting When true the attempt is created as EXECUTING and the run is set to
+   *   EXECUTING in the same transaction; when false the attempt is created as PENDING.
+   * @throws ServiceValidationError if the environment, run or queue cannot be found, the run is
+   *   cancelled or not locked, or the attempt could not be created.
+   */
+  public async call(
+    runId: string,
+    env?: AuthenticatedEnvironment,
+    setToExecuting = true
+  ): Promise<{
+    execution: TaskRunExecution;
+    run: TaskRun;
+    attempt: TaskRunAttempt;
+  }> {
+    const environment = env ?? (await getAuthenticatedEnvironmentFromRun(runId, this._prisma));
+
+    if (!environment) {
+      throw new ServiceValidationError("Environment not found", 404);
+    }
+
+    const isFriendlyId = runId.startsWith("run_");
+
+    return await this.traceWithEnv("call()", environment, async (span) => {
+      if (isFriendlyId) {
+        span.setAttribute("taskRunFriendlyId", runId);
+      } else {
+        span.setAttribute("taskRunId", runId);
+      }
+
+      const taskRun = await this._prisma.taskRun.findUnique({
+        where: {
+          id: !isFriendlyId ? runId : undefined,
+          friendlyId: isFriendlyId ? runId : undefined,
+          runtimeEnvironmentId: environment.id,
+        },
+        include: {
+          tags: true,
+          attempts: {
+            take: 1,
+            orderBy: {
+              number: "desc",
+            },
+          },
+          lockedBy: {
+            include: {
+              worker: true,
+            },
+          },
+          batchItems: {
+            include: {
+              batchTaskRun: true,
+            },
+          },
+        },
+      });
+
+      logger.debug("Creating a task run attempt", { taskRun });
+
+      if (!taskRun) {
+        throw new ServiceValidationError("Task run not found", 404);
+      }
+
+      span.setAttribute("taskRunId", taskRun.id);
+      span.setAttribute("taskRunFriendlyId", taskRun.friendlyId);
+
+      if (taskRun.status === "CANCELED") {
+        throw new ServiceValidationError("Task run is cancelled", 400);
+      }
+
+      // Keep the narrowed task in a local so the transaction callback needs no `!` assertions
+      const lockedBy = taskRun.lockedBy;
+
+      if (!lockedBy) {
+        throw new ServiceValidationError("Task run is not locked", 400);
+      }
+
+      const queue = await this._prisma.taskQueue.findUnique({
+        where: {
+          runtimeEnvironmentId_name: {
+            runtimeEnvironmentId: environment.id,
+            name: taskRun.queue,
+          },
+        },
+      });
+
+      if (!queue) {
+        throw new ServiceValidationError("Queue not found", 404);
+      }
+
+      // Only the latest attempt is loaded (ordered by number desc), so this is the next number
+      const nextAttemptNumber = taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1;
+
+      const taskRunAttempt = await $transaction(this._prisma, async (tx) => {
+        const taskRunAttempt = await tx.taskRunAttempt.create({
+          data: {
+            number: nextAttemptNumber,
+            friendlyId: generateFriendlyId("attempt"),
+            taskRunId: taskRun.id,
+            startedAt: new Date(),
+            backgroundWorkerId: lockedBy.worker.id,
+            backgroundWorkerTaskId: lockedBy.id,
+            status: setToExecuting ? "EXECUTING" : "PENDING",
+            queueId: queue.id,
+            runtimeEnvironmentId: environment.id,
+          },
+          include: {
+            backgroundWorker: true,
+            backgroundWorkerTask: true,
+          },
+        });
+
+        if (setToExecuting) {
+          await tx.taskRun.update({
+            where: {
+              id: taskRun.id,
+            },
+            data: {
+              status: "EXECUTING",
+            },
+          });
+        }
+
+        return taskRunAttempt;
+      });
+
+      if (!taskRunAttempt) {
+        logger.error("Failed to create task run attempt", { runId: taskRun.id, nextAttemptNumber });
+        throw new ServiceValidationError("Failed to create task run attempt", 500);
+      }
+
+      const execution: TaskRunExecution = {
+        task: {
+          id: lockedBy.slug,
+          filePath: lockedBy.filePath,
+          exportName: lockedBy.exportName,
+        },
+        attempt: {
+          id: taskRunAttempt.friendlyId,
+          number: taskRunAttempt.number,
+          startedAt: taskRunAttempt.startedAt ?? taskRunAttempt.createdAt,
+          backgroundWorkerId: lockedBy.worker.id,
+          backgroundWorkerTaskId: lockedBy.id,
+          status: "EXECUTING" as const,
+        },
+        run: {
+          id: taskRun.friendlyId,
+          payload: taskRun.payload,
+          payloadType: taskRun.payloadType,
+          context: taskRun.context,
+          createdAt: taskRun.createdAt,
+          tags: taskRun.tags.map((tag) => tag.name),
+          isTest: taskRun.isTest,
+          idempotencyKey: taskRun.idempotencyKey ?? undefined,
+        },
+        queue: {
+          id: queue.friendlyId,
+          name: queue.name,
+        },
+        environment: {
+          id: environment.id,
+          slug: environment.slug,
+          type: environment.type,
+        },
+        organization: {
+          id: environment.organization.id,
+          slug: environment.organization.slug,
+          name: environment.organization.title,
+        },
+        project: {
+          id: environment.project.id,
+          ref: environment.project.externalRef,
+          slug: environment.project.slug,
+          name: environment.project.name,
+        },
+        batch:
+          taskRun.batchItems[0] && taskRun.batchItems[0].batchTaskRun
+            ? { id: taskRun.batchItems[0].batchTaskRun.friendlyId }
+            : undefined,
+      };
+
+      return {
+        execution,
+        run: taskRun,
+        attempt: taskRunAttempt,
+      };
+    });
+  }
+}
+
+/**
+ * Loads the runtime environment (including organization and project) of a task run.
+ * `friendlyId` may be a friendly id ("run_" prefix) or an internal id, matching what
+ * CreateTaskRunAttemptService.call accepts. Resolves to undefined when the run is not found.
+ */
+async function getAuthenticatedEnvironmentFromRun(
+  friendlyId: string,
+  prismaClient?: PrismaClientOrTransaction
+) {
+  const isFriendlyId = friendlyId.startsWith("run_");
+
+  const taskRun = await (prismaClient ?? prisma).taskRun.findUnique({
+    where: isFriendlyId ? { friendlyId } : { id: friendlyId },
+    include: {
+      runtimeEnvironment: {
+        include: {
+          organization: true,
+          project: true,
+        },
+      },
+    },
+  });
+
+  if (!taskRun) {
+    return;
+  }
+
+  return taskRun.runtimeEnvironment;
+}
diff --git a/apps/webapp/app/v3/services/retryAttempt.server.ts b/apps/webapp/app/v3/services/retryAttempt.server.ts
new file mode 100644
index 00000000000..86844b53496
--- /dev/null
+++ b/apps/webapp/app/v3/services/retryAttempt.server.ts
@@ -0,0 +1,37 @@
+import { BaseService } from "./baseService.server";
+import { logger } from "~/services/logger.server";
+import { socketIo } from "../handleSocketIo.server";
+import { PrismaClientOrTransaction } from "~/db.server";
+import { workerQueue } from "~/services/worker.server";
+
+/**
+ * Notifies the coordinators that a run is ready to be retried.
+ */
+export class RetryAttemptService extends BaseService {
+  /**
+   * Emits READY_FOR_RETRY for the run on the coordinator namespace.
+   * If the run cannot be found, an error is logged and nothing is emitted.
+   */
+  public async call(runId: string) {
+    const existingRun = await this._prisma.taskRun.findFirst({ where: { id: runId } });
+
+    if (existingRun) {
+      socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { version: "v1", runId });
+      return;
+    }
+
+    logger.error("Task run not found", { runId });
+  }
+
+  /**
+   * Enqueues a "v3.retryAttempt" worker job for the run, keyed by `retryAttempt:<runId>`.
+   * `tx` and the optional `runAt` are handed to the worker queue as job options.
+   */
+  static async enqueue(runId: string, tx: PrismaClientOrTransaction, runAt?: Date) {
+    return await workerQueue.enqueue(
+      "v3.retryAttempt",
+      { runId },
+      { tx, runAt, jobKey: `retryAttempt:${runId}` }
+    );
+  }
+}
diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts
index d935724b69b..1d6a59443ac 100644
--- a/apps/webapp/app/v3/services/triggerTask.server.ts
+++ b/apps/webapp/app/v3/services/triggerTask.server.ts
@@ -4,13 +4,12 @@ import {
   TriggerTaskRequestBody,
   packetRequiresOffloading,
 } from "@trigger.dev/core/v3";
-import { nanoid } from "nanoid";
 import { createHash } from "node:crypto";
 import { $transaction } from "~/db.server";
 import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { eventRepository } from "../eventRepository.server";
 import { generateFriendlyId } from "../friendlyIdentifiers";
-import { marqs } from "~/v3/marqs/index.server";
+import { marqs, sanitizeQueueName } from "~/v3/marqs/index.server";
 import { uploadToObjectStore } from "../r2.server";
 import { BaseService } from "./baseService.server";
 
@@ -112,7 +111,7 @@ export class TriggerTaskService extends BaseService {
               select: { lastNumber: true },
             });
 
-            const queueName = body.options?.queue?.name ?? `task/${taskId}`;
+            const queueName = sanitizeQueueName(body.options?.queue?.name ?? `task/${taskId}`);
 
             event.setAttribute("queueName", queueName);
             span.setAttribute("queueName", queueName);
diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts
index 39f693b918f..6b6fbcacbd1 100644
--- a/packages/cli-v3/src/apiClient.ts
+++ b/packages/cli-v3/src/apiClient.ts
@@ -16,7 +16,10 @@ import {
   GetProjectResponseBody,
   ImportEnvironmentVariablesRequestBody,
   EnvironmentVariableResponseBody,
+  TaskRunExecution,
+  APIError,
 } from "@trigger.dev/core/v3";
+import { zodfetch } from "@trigger.dev/core/v3/zodfetch";
 
 export class CliApiClient {
   private readonly apiURL: string;
@@ -29,7 +32,7 @@ export class CliApiClient {
   }
 
   async createAuthorizationCode() {
-    return zodfetch(
+    return wrapZodFetch(
       CreateAuthorizationCodeResponseSchema,
       `${this.apiURL}/api/v1/authorization-code`,
       {
@@ -39,7 +42,7 @@ export class CliApiClient {
   }
 
   async getPersonalAccessToken(authorizationCode: string) {
-    return zodfetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, {
+    return wrapZodFetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, {
       method: "POST",
       body: JSON.stringify({
         authorizationCode,
@@ -52,7 +55,7 @@ export class CliApiClient {
       throw new Error("whoAmI: No access token");
     }
 
-    return zodfetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, {
+    return wrapZodFetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, {
       headers: {
         Authorization: `Bearer ${this.accessToken}`,
         "Content-Type": "application/json",
@@ -65,7 +68,7 @@ export class CliApiClient {
       throw new Error("getProject: No access token");
     }
 
-    return zodfetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, {
+    return wrapZodFetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, {
       headers: {
         Authorization: `Bearer ${this.accessToken}`,
         "Content-Type": "application/json",
@@ -78,7 +81,7 @@ export class CliApiClient {
       throw new Error("getProjects: No access token");
     }
 
-    return zodfetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, {
+    return wrapZodFetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, {
       headers: {
         Authorization: `Bearer ${this.accessToken}`,
         "Content-Type": "application/json",
@@ -91,7 +94,7 @@ export class CliApiClient {
       throw new Error("createBackgroundWorker: No access token");
     }
 
-    return zodfetch(
+    return wrapZodFetch(
       CreateBackgroundWorkerResponse,
       `${this.apiURL}/api/v1/projects/${projectRef}/background-workers`,
       {
@@ -105,6 +108,20 @@ export class CliApiClient {
     );
   }
 
+  async createTaskRunAttempt(runFriendlyId: string) {
+    if (!this.accessToken) {
+      throw new Error("createTaskRunAttempt: No access token");
+    }
+    // Body-less POST to the run's attempts endpoint; the response is parsed with TaskRunExecution.
+    return wrapZodFetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${this.accessToken}`,
+        "Content-Type": "application/json",
+      },
+    });
+  }
+
   async getProjectEnv({
     projectRef,
     env,
@@ -116,12 +133,16 @@ export class CliApiClient {
       throw new Error("getProjectDevEnv: No access token");
     }
 
-    return zodfetch(GetProjectEnvResponse, `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, {
-      headers: {
-        Authorization: `Bearer ${this.accessToken}`,
-        "Content-Type": "application/json",
-      },
-    });
+    return wrapZodFetch(
+      GetProjectEnvResponse,
+      `${this.apiURL}/api/v1/projects/${projectRef}/${env}`,
+      {
+        headers: {
+          Authorization: `Bearer ${this.accessToken}`,
+          "Content-Type": "application/json",
+        },
+      }
+    );
   }
 
   async getEnvironmentVariables(projectRef: string) {
@@ -129,7 +150,7 @@ export class CliApiClient {
       throw new Error("getEnvironmentVariables: No access token");
     }
 
-    return zodfetch(
+    return wrapZodFetch(
       GetEnvironmentVariablesResponseBody,
       `${this.apiURL}/api/v1/projects/${projectRef}/envvars`,
       {
@@ -150,7 +171,7 @@ export class CliApiClient {
       throw new Error("importEnvVars: No access token");
     }
 
-    return zodfetch(
+    return wrapZodFetch(
       EnvironmentVariableResponseBody,
       `${this.apiURL}/api/v1/projects/${projectRef}/envvars/${slug}/import`,
       {
@@ -169,7 +190,7 @@ export class CliApiClient {
       throw new Error("initializeDeployment: No access token");
     }
 
-    return zodfetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, {
+    return wrapZodFetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, {
       method: "POST",
       headers: {
         Authorization: `Bearer ${this.accessToken}`,
@@ -184,7 +205,7 @@ export class CliApiClient {
       throw new Error("startDeploymentIndexing: No access token");
     }
 
-    return zodfetch(
+    return wrapZodFetch(
       StartDeploymentIndexingResponseBody,
       `${this.apiURL}/api/v1/deployments/${deploymentId}/start-indexing`,
       {
@@ -203,7 +224,7 @@ export class CliApiClient {
       throw new Error("getDeployment: No access token");
     }
 
-    return zodfetch(
+    return wrapZodFetch(
       GetDeploymentResponseBody,
       `${this.apiURL}/api/v1/deployments/${deploymentId}`,
       {
@@ -223,56 +244,42 @@ type ApiResult<TSuccessResult> =
       error: string;
     };
 
-async function zodfetch<TResponseBody extends any>(
-  schema: z.Schema<TResponseBody>,
+async function wrapZodFetch<T extends z.ZodTypeAny>(
+  schema: T,
   url: string,
   requestInit?: RequestInit
-): Promise<ApiResult<TResponseBody>> {
+): Promise<ApiResult<z.infer<T>>> {
   try {
-    const response = await fetch(url, requestInit);
+    const response = await zodfetch(schema, url, requestInit, {
+      retry: {
+        minTimeoutInMs: 500,
+        maxTimeoutInMs: 5000,
+        maxAttempts: 3,
+        factor: 2,
+        randomize: false,
+      },
+    });
 
-    if ((!requestInit || requestInit.method === "GET") && response.status === 404) {
+    return {
+      success: true,
+      data: response,
+    };
+  } catch (error) {
+    if (error instanceof APIError) {
       return {
         success: false,
-        error: `404: ${response.statusText}`,
+        error: error.message,
       };
-    }
-
-    if (response.status >= 400 && response.status < 500) {
-      const body = await response.json();
-      if (!body.error) {
-        return { success: false, error: "Something went wrong" };
-      }
-
-      return { success: false, error: body.error };
-    }
-
-    if (response.status !== 200) {
+    } else if (error instanceof Error) {
       return {
         success: false,
-        error: `Failed to fetch ${url}, got status code ${response.status}`,
+        error: error.message,
       };
-    }
-
-    const jsonBody = await response.json();
-    const parsedResult = schema.safeParse(jsonBody);
-
-    if (parsedResult.success) {
-      return { success: true, data: parsedResult.data };
-    }
-
-    if ("error" in jsonBody) {
+    } else {
       return {
         success: false,
-        error: typeof jsonBody.error === "string" ? jsonBody.error : JSON.stringify(jsonBody.error),
+        error: String(error),
       };
     }
-
-    return { success: false, error: parsedResult.error.message };
-  } catch (error) {
-    return {
-      success: false,
-      error: error instanceof Error ? error.message : JSON.stringify(error),
-    };
   }
 }
diff --git a/packages/cli-v3/src/commands/deploy.ts b/packages/cli-v3/src/commands/deploy.ts
index b9c8861057d..37a9157d404 100644
--- a/packages/cli-v3/src/commands/deploy.ts
+++ b/packages/cli-v3/src/commands/deploy.ts
@@ -199,7 +199,9 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) {
         `Failed to connect to ${authorization.auth?.apiUrl}. Are you sure it's the correct URL?`
       );
     } else {
-      throw new Error("You must login first. Use `trigger.dev login` to login.");
+      throw new Error(
+        `You must login first. Use the \`login\` CLI command.\n\n${authorization.error}`
+      );
     }
   }
 
diff --git a/packages/cli-v3/src/commands/dev.tsx b/packages/cli-v3/src/commands/dev.tsx
index 353f9907cd5..a692b3bd1d9 100644
--- a/packages/cli-v3/src/commands/dev.tsx
+++ b/packages/cli-v3/src/commands/dev.tsx
@@ -112,7 +112,11 @@ export async function devCommand(dir: string, options: DevCommandOptions) {
         )} Connecting to the server failed. Please check your internet connection or contact eric@trigger.dev for help.`
       );
     } else {
-      logger.log(`${chalkError("X Error:")} You must login first. Use the \`login\` CLI command.`);
+      logger.log(
+        `${chalkError("X Error:")} You must login first. Use the \`login\` CLI command.\n\n${
+          authorization.error
+        }`
+      );
     }
     process.exitCode = 1;
     return;
@@ -285,6 +289,7 @@ function useDev({
     websocket.addEventListener("close", (event) => {});
     websocket.addEventListener("error", (event) => {});
 
+    // This is the deprecated task heartbeat that uses the friendly attempt ID
     backgroundWorkerCoordinator.onWorkerTaskHeartbeat.attach(
       async ({ worker, backgroundWorkerId, id }) => {
         await sender.send("BACKGROUND_WORKER_MESSAGE", {
@@ -297,6 +302,19 @@ function useDev({
       }
     );
 
+    // "Task Run Heartbeat" id is the actual run ID that corresponds to the MarQS message ID
+    backgroundWorkerCoordinator.onWorkerTaskRunHeartbeat.attach(
+      async ({ worker, backgroundWorkerId, id }) => {
+        await sender.send("BACKGROUND_WORKER_MESSAGE", {
+          backgroundWorkerId,
+          data: {
+            type: "TASK_RUN_HEARTBEAT",
+            id,
+          },
+        });
+      }
+    );
+
     backgroundWorkerCoordinator.onTaskCompleted.attach(
       async ({ backgroundWorkerId, completion, execution }) => {
         await sender.send("BACKGROUND_WORKER_MESSAGE", {
@@ -310,6 +328,18 @@ function useDev({
       }
     );
 
+    backgroundWorkerCoordinator.onTaskFailedToRun.attach(
+      async ({ backgroundWorkerId, completion }) => {
+        await sender.send("BACKGROUND_WORKER_MESSAGE", {
+          backgroundWorkerId,
+          data: {
+            type: "TASK_RUN_FAILED_TO_RUN",
+            completion,
+          },
+        });
+      }
+    );
+
     backgroundWorkerCoordinator.onWorkerRegistered.attach(async ({ id, worker, record }) => {
       await sender.send("READY_FOR_TASKS", {
         backgroundWorkerId: id,
@@ -334,6 +364,7 @@ function useDev({
             for (const worker of backgroundWorkerCoordinator.currentWorkers) {
               await sender.send("READY_FOR_TASKS", {
                 backgroundWorkerId: worker.id,
+                inProgressRuns: worker.worker.inProgressRuns,
               });
             }
           },
@@ -505,21 +536,25 @@ function useDev({
 
                 const processEnv = await gatherProcessEnv();
 
-                const backgroundWorker = new BackgroundWorker(fullPath, {
-                  projectConfig: config,
-                  dependencies,
-                  env: {
-                    ...processEnv,
-                    TRIGGER_API_URL: apiUrl,
-                    TRIGGER_SECRET_KEY: apiKey,
-                    ...(environmentVariablesResponse.success
-                      ? environmentVariablesResponse.data.variables
-                      : {}),
+                const backgroundWorker = new BackgroundWorker(
+                  fullPath,
+                  {
+                    projectConfig: config,
+                    dependencies,
+                    env: {
+                      ...processEnv,
+                      TRIGGER_API_URL: apiUrl,
+                      TRIGGER_SECRET_KEY: apiKey,
+                      ...(environmentVariablesResponse.success
+                        ? environmentVariablesResponse.data.variables
+                        : {}),
+                    },
+                    debuggerOn,
+                    debugOtel,
+                    resolveEnvVariables: createResolveEnvironmentVariablesFunction(configModule),
                   },
-                  debuggerOn,
-                  debugOtel,
-                  resolveEnvVariables: createResolveEnvironmentVariablesFunction(configModule),
-                });
+                  environmentClient
+                );
 
                 try {
                   await backgroundWorker.initialize();
@@ -576,6 +611,7 @@ function useDev({
                       tasks: taskResources,
                       contentHash: contentHash,
                     },
+                    supportsLazyAttempts: true,
                   };
 
                   const backgroundWorkerRecord = await environmentClient.createBackgroundWorker(
@@ -827,18 +863,9 @@ function createDuplicateTaskIdOutputErrorMessage(
 
 async function gatherProcessEnv() {
   const env = {
+    ...process.env,
     NODE_ENV: process.env.NODE_ENV ?? "development",
-    PATH: process.env.PATH,
-    USER: process.env.USER,
-    SHELL: process.env.SHELL,
-    NVM_INC: process.env.NVM_INC,
-    NVM_DIR: process.env.NVM_DIR,
-    NVM_BIN: process.env.NVM_BIN,
-    LANG: process.env.LANG,
-    TERM: process.env.TERM,
     NODE_PATH: await amendNodePathWithPnpmNodeModules(process.env.NODE_PATH),
-    HOME: process.env.HOME,
-    BUN_INSTALL: process.env.BUN_INSTALL,
   };
 
   // Filter out undefined values
diff --git a/packages/cli-v3/src/commands/whoami.ts b/packages/cli-v3/src/commands/whoami.ts
index c46445d6462..73c740c94b9 100644
--- a/packages/cli-v3/src/commands/whoami.ts
+++ b/packages/cli-v3/src/commands/whoami.ts
@@ -78,7 +78,7 @@ export async function whoAmI(
             options?.profile ?? "default"
           }\` to login.`
         );
-        outro("Whoami failed");
+        outro(`Whoami failed: ${authentication.error}`);
       }
     }
 
diff --git a/packages/cli-v3/src/workers/common/errors.ts b/packages/cli-v3/src/workers/common/errors.ts
index 4017d3cc65a..053ab8d19ba 100644
--- a/packages/cli-v3/src/workers/common/errors.ts
+++ b/packages/cli-v3/src/workers/common/errors.ts
@@ -21,3 +21,43 @@ export class TaskMetadataParseError extends Error {
     this.name = "TaskMetadataParseError";
   }
 }
+
+export class UnexpectedExitError extends Error {
+  constructor(public code: number) {
+    super(`Unexpected exit with code ${code}`);
+
+    this.name = "UnexpectedExitError";
+  }
+}
+
+export class CleanupProcessError extends Error {
+  constructor() {
+    super("Cancelled");
+
+    this.name = "CleanupProcessError";
+  }
+}
+
+export class CancelledProcessError extends Error {
+  constructor() {
+    super("Cancelled");
+
+    this.name = "CancelledProcessError";
+  }
+}
+
+export class SigKillTimeoutProcessError extends Error {
+  constructor() {
+    super("Process kill timeout");
+
+    this.name = "SigKillTimeoutProcessError";
+  }
+}
+
+export class GracefulExitTimeoutError extends Error {
+  constructor() {
+    super("Graceful exit timeout");
+
+    this.name = "GracefulExitTimeoutError";
+  }
+}
diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts
index 5172b1f3558..c912aaa4691 100644
--- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts
+++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts
@@ -9,8 +9,10 @@ import {
   TaskRunError,
   TaskRunErrorCodes,
   TaskRunExecution,
+  TaskRunExecutionLazyAttemptPayload,
   TaskRunExecutionPayload,
   TaskRunExecutionResult,
+  TaskRunFailedExecutionResult,
   childToWorkerMessages,
   correctErrorStackTrace,
   formatDurationMilliseconds,
@@ -36,8 +38,15 @@ import {
 import { safeDeleteFileSync } from "../../utilities/fileSystem.js";
 import { installPackages } from "../../utilities/installPackages.js";
 import { logger } from "../../utilities/logger.js";
-import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors.js";
-import { env } from "node:process";
+import {
+  CancelledProcessError,
+  CleanupProcessError,
+  SigKillTimeoutProcessError,
+  TaskMetadataParseError,
+  UncaughtExceptionError,
+  UnexpectedExitError,
+} from "../common/errors.js";
+import { CliApiClient } from "../../apiClient.js";
 
 export type CurrentWorkers = BackgroundWorkerCoordinator["currentWorkers"];
 export class BackgroundWorkerCoordinator {
@@ -47,37 +56,52 @@ export class BackgroundWorkerCoordinator {
     worker: BackgroundWorker;
     execution: TaskRunExecution;
   }> = new Evt();
+  public onTaskFailedToRun: Evt<{
+    backgroundWorkerId: string;
+    worker: BackgroundWorker;
+    completion: TaskRunFailedExecutionResult;
+  }> = new Evt();
   public onWorkerRegistered: Evt<{
     worker: BackgroundWorker;
     id: string;
     record: CreateBackgroundWorkerResponse;
   }> = new Evt();
+
+  /**
+   * @deprecated use onWorkerTaskRunHeartbeat instead
+   */
   public onWorkerTaskHeartbeat: Evt<{
     id: string;
     backgroundWorkerId: string;
     worker: BackgroundWorker;
   }> = new Evt();
+  public onWorkerTaskRunHeartbeat: Evt<{
+    id: string;
+    backgroundWorkerId: string;
+    worker: BackgroundWorker;
+  }> = new Evt();
   public onWorkerDeprecated: Evt<{ worker: BackgroundWorker; id: string }> = new Evt();
   private _backgroundWorkers: Map<string, BackgroundWorker> = new Map();
   private _records: Map<string, CreateBackgroundWorkerResponse> = new Map();
   private _deprecatedWorkers: Set<string> = new Set();
 
   constructor(private baseURL: string) {
-    this.onTaskCompleted.attach(async ({ completion, execution }) => {
+    this.onTaskCompleted.attach(async ({ completion }) => {
       if (!completion.ok && typeof completion.retry !== "undefined") {
         return;
       }
 
-      await this.#notifyWorkersOfTaskCompletion(completion, execution);
+      await this.#notifyWorkersOfTaskCompletion(completion);
+    });
+
+    this.onTaskFailedToRun.attach(async ({ completion }) => {
+      await this.#notifyWorkersOfTaskCompletion(completion);
     });
   }
 
-  async #notifyWorkersOfTaskCompletion(
-    completion: TaskRunExecutionResult,
-    execution: TaskRunExecution
-  ) {
+  async #notifyWorkersOfTaskCompletion(completion: TaskRunExecutionResult) {
     for (const worker of this._backgroundWorkers.values()) {
-      await worker.taskRunCompletedNotification(completion, execution);
+      await worker.taskRunCompletedNotification(completion);
     }
   }
 
@@ -107,6 +131,10 @@ export class BackgroundWorkerCoordinator {
     worker.onTaskHeartbeat.attach((id) => {
       this.onWorkerTaskHeartbeat.post({ id, backgroundWorkerId: record.id, worker });
     });
+
+    worker.onTaskRunHeartbeat.attach((id) => {
+      this.onWorkerTaskRunHeartbeat.post({ id, backgroundWorkerId: record.id, worker });
+    });
   }
 
   close() {
@@ -136,11 +164,15 @@ export class BackgroundWorkerCoordinator {
         }
 
         await worker.cancelRun(message.taskRunId);
+        break;
+      }
+      case "EXECUTE_RUN_LAZY_ATTEMPT": {
+        await this.#executeTaskRunLazyAttempt(id, message.payload);
       }
     }
   }
 
-  async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) {
+  async #executeTaskRunLazyAttempt(id: string, payload: TaskRunExecutionLazyAttemptPayload) {
     const worker = this._backgroundWorkers.get(id);
 
     if (!worker) {
@@ -155,106 +187,68 @@ export class BackgroundWorkerCoordinator {
       return;
     }
 
-    const { execution } = payload;
-
-    // ○ Mar 27 09:17:25.653 -> View logs | 20240326.20 | create-avatar | run_slufhjdfiv8ejnrkw9dsj.1
-
-    const logsUrl = `${this.baseURL}/runs/${execution.run.id}`;
-
-    const pipe = chalkGrey("|");
-    const bullet = chalkGrey("○");
-    const link = chalkLink(terminalLink("View logs", logsUrl));
-    let timestampPrefix = chalkGrey(prettyPrintDate(payload.execution.attempt.startedAt));
-    const workerPrefix = chalkWorker(record.version);
-    const taskPrefix = chalkTask(execution.task.id);
-    const runId = chalkRun(`${execution.run.id}.${execution.attempt.number}`);
-
-    logger.log(
-      `${bullet} ${timestampPrefix} ${chalkGrey(
-        "->"
-      )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId}`
-    );
-
-    const now = performance.now();
-
-    const completion = await worker.executeTaskRun(payload);
-
-    const elapsed = performance.now() - now;
-
-    const retryingText = chalkGrey(
-      !completion.ok && completion.skippedRetrying
-        ? " (retrying skipped)"
-        : !completion.ok && completion.retry !== undefined
-        ? ` (retrying in ${completion.retry.delay}ms)`
-        : ""
-    );
-
-    const resultText = !completion.ok
-      ? completion.error.type === "INTERNAL_ERROR" &&
-        (completion.error.code === TaskRunErrorCodes.TASK_EXECUTION_ABORTED ||
-          completion.error.code === TaskRunErrorCodes.TASK_RUN_CANCELLED)
-        ? chalkWarning("Cancelled")
-        : `${chalkError("Error")}${retryingText}`
-      : chalkSuccess("Success");
-
-    const errorText = !completion.ok
-      ? this.#formatErrorLog(completion.error)
-      : "retry" in completion
-      ? `retry in ${completion.retry}ms`
-      : "";
-
-    const elapsedText = chalkGrey(`(${formatDurationMilliseconds(elapsed, { style: "short" })})`);
-
-    timestampPrefix = chalkGrey(prettyPrintDate());
-
-    logger.log(
-      `${bullet} ${timestampPrefix} ${chalkGrey(
-        "->"
-      )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId} ${pipe} ${resultText} ${elapsedText}${errorText}`
-    );
-
-    this.onTaskCompleted.post({ completion, execution, worker, backgroundWorkerId: id });
-  }
+    try {
+      const { completion, execution } = await worker.executeTaskRunLazyAttempt(
+        payload,
+        this.baseURL
+      );
 
-  #formatErrorLog(error: TaskRunError) {
-    switch (error.type) {
-      case "INTERNAL_ERROR": {
-        return "";
-      }
-      case "STRING_ERROR": {
-        return `\n\n${chalkError("X Error:")} ${error.raw}\n`;
-      }
-      case "CUSTOM_ERROR": {
-        return `\n\n${chalkError("X Error:")} ${error.raw}\n`;
-      }
-      case "BUILT_IN_ERROR": {
-        return `\n\n${error.stackTrace.replace(/^Error: /, chalkError("X Error: "))}\n`;
-      }
+      this.onTaskCompleted.post({
+        completion,
+        execution,
+        worker,
+        backgroundWorkerId: id,
+      });
+    } catch (error) {
+      this.onTaskFailedToRun.post({
+        backgroundWorkerId: id,
+        worker,
+        completion: {
+          ok: false,
+          id: payload.runId,
+          retry: undefined,
+          error:
+            error instanceof Error
+              ? {
+                  type: "BUILT_IN_ERROR",
+                  name: error.name,
+                  message: error.message,
+                  stackTrace: error.stack ?? "",
+                }
+              : {
+                  type: "BUILT_IN_ERROR",
+                  name: "UnknownError",
+                  message: String(error),
+                  stackTrace: "",
+                },
+        },
+      });
     }
   }
-}
 
-class UnexpectedExitError extends Error {
-  constructor(public code: number) {
-    super(`Unexpected exit with code ${code}`);
+  async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) {
+    const worker = this._backgroundWorkers.get(id);
 
-    this.name = "UnexpectedExitError";
-  }
-}
+    if (!worker) {
+      logger.error(`Could not find worker ${id}`);
+      return;
+    }
 
-class CleanupProcessError extends Error {
-  constructor() {
-    super("Cancelled");
+    const record = this._records.get(id);
 
-    this.name = "CleanupProcessError";
-  }
-}
+    if (!record) {
+      logger.error(`Could not find worker record ${id}`);
+      return;
+    }
 
-class CancelledProcessError extends Error {
-  constructor() {
-    super("Cancelled");
+    const completion = await worker.executeTaskRun(payload, this.baseURL);
 
-    this.name = "CancelledProcessError";
+    this.onTaskCompleted.post({
+      completion,
+      execution: payload.execution,
+      worker,
+      backgroundWorkerId: id,
+    });
   }
 }
 
@@ -276,13 +270,18 @@ export class BackgroundWorker {
     schema: childToWorkerMessages,
   });
 
+  /**
+   * @deprecated use onTaskRunHeartbeat instead
+   */
   public onTaskHeartbeat: Evt<string> = new Evt();
+  public onTaskRunHeartbeat: Evt<string> = new Evt();
   private _onClose: Evt<void> = new Evt();
 
   public tasks: Array<TaskMetadataWithFilePath> = [];
   public metadata: BackgroundWorkerProperties | undefined;
 
   _taskRunProcesses: Map<string, TaskRunProcess> = new Map();
+  private _taskRunProcessesBeingKilled: Set<number> = new Set();
 
   private _closed: boolean = false;
 
@@ -290,7 +289,8 @@ export class BackgroundWorker {
 
   constructor(
     public path: string,
-    public params: BackgroundWorkerParams
+    public params: BackgroundWorkerParams,
+    private apiClient: CliApiClient
   ) {}
 
   close() {
@@ -301,6 +301,7 @@ export class BackgroundWorker {
     this._closed = true;
 
     this.onTaskHeartbeat.detach();
+    this.onTaskRunHeartbeat.detach();
 
     // We need to close all the task run processes
     for (const taskRunProcess of this._taskRunProcesses.values()) {
@@ -314,6 +315,10 @@ export class BackgroundWorker {
     safeDeleteFileSync(`${this.path}.map`);
   }
 
+  get inProgressRuns(): Array<string> {
+    return Array.from(this._taskRunProcesses.keys());
+  }
+
   async initialize() {
     if (this._initialized) {
       throw new Error("Worker already initialized");
@@ -408,46 +413,142 @@ export class BackgroundWorker {
 
   // We need to notify all the task run processes that a task run has completed,
   // in case they are waiting for it through triggerAndWait
-  async taskRunCompletedNotification(
-    completion: TaskRunExecutionResult,
-    execution: TaskRunExecution
-  ) {
+  async taskRunCompletedNotification(completion: TaskRunExecutionResult) {
     for (const taskRunProcess of this._taskRunProcesses.values()) {
-      taskRunProcess.taskRunCompletedNotification(completion, execution);
+      taskRunProcess.taskRunCompletedNotification(completion);
     }
   }
 
-  async #initializeTaskRunProcess(payload: TaskRunExecutionPayload): Promise<TaskRunProcess> {
+  #prefixedMessage(payload: TaskRunExecutionPayload, message: string = "") {
+    return `[${payload.execution.run.id}.${payload.execution.attempt.number}] ${message}`;
+  }
+
+  async #getFreshTaskRunProcess(
+    payload: TaskRunExecutionPayload,
+    messageId?: string
+  ): Promise<TaskRunProcess> {
+    logger.debug(this.#prefixedMessage(payload, "getFreshTaskRunProcess()"));
+
     if (!this.metadata) {
       throw new Error("Worker not registered");
     }
 
-    if (!this._taskRunProcesses.has(payload.execution.run.id)) {
-      const taskRunProcess = new TaskRunProcess(
-        payload.execution,
-        this.path,
-        {
-          ...this._fullEnv,
-          ...(payload.environment ?? {}),
-        },
-        this.metadata,
-        this.params
-      );
+    this._closed = false;
+
+    logger.debug(this.#prefixedMessage(payload, "killing current task run process before attempt"));
+
+    await this.#killCurrentTaskRunProcessBeforeAttempt(payload.execution.run.id);
 
-      taskRunProcess.onExit.attach(() => {
+    logger.debug(this.#prefixedMessage(payload, "creating new task run process"));
+
+    const taskRunProcess = new TaskRunProcess(
+      payload.execution.run.id,
+      payload.execution.run.isTest,
+      this.path,
+      {
+        ...this._fullEnv,
+        ...(payload.environment ?? {}),
+        ...this.#readEnvVars(),
+      },
+      this.metadata,
+      this.params,
+      messageId
+    );
+
+    taskRunProcess.onExit.attach(({ pid }) => {
+      logger.debug(this.#prefixedMessage(payload, "onExit()"), { pid });
+
+      const taskRunProcess = this._taskRunProcesses.get(payload.execution.run.id);
+
+      // Only delete the task run process if the pid matches
+      if (taskRunProcess?.pid === pid) {
         this._taskRunProcesses.delete(payload.execution.run.id);
-      });
+      }
 
-      taskRunProcess.onTaskHeartbeat.attach((id) => {
-        this.onTaskHeartbeat.post(id);
-      });
+      if (pid) {
+        this._taskRunProcessesBeingKilled.delete(pid);
+      }
+    });
+
+    taskRunProcess.onIsBeingKilled.attach((pid) => {
+      if (pid) {
+        this._taskRunProcessesBeingKilled.add(pid);
+      }
+    });
+
+    taskRunProcess.onTaskHeartbeat.attach((id) => {
+      this.onTaskHeartbeat.post(id);
+    });
 
-      await taskRunProcess.initialize();
+    taskRunProcess.onTaskRunHeartbeat.attach((id) => {
+      this.onTaskRunHeartbeat.post(id);
+    });
+
+    await taskRunProcess.initialize();
+
+    this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess);
+
+    return taskRunProcess;
+  }
 
-      this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess);
+  async #killCurrentTaskRunProcessBeforeAttempt(runId: string) {
+    const taskRunProcess = this._taskRunProcesses.get(runId);
+
+    if (!taskRunProcess) {
+      logger.debug(`[${runId}] no current task process to kill`);
+      return;
     }
 
-    return this._taskRunProcesses.get(payload.execution.run.id) as TaskRunProcess;
+    logger.debug(`[${runId}] killing current task process`, {
+      pid: taskRunProcess.pid,
+    });
+
+    if (taskRunProcess.isBeingKilled) {
+      if (this._taskRunProcessesBeingKilled.size > 1) {
+        await this.#tryGracefulExit(taskRunProcess);
+      } else {
+        // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel
+      }
+    } else {
+      // It's not being killed, so kill it
+      if (this._taskRunProcessesBeingKilled.size > 0) {
+        await this.#tryGracefulExit(taskRunProcess);
+      } else {
+        // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around.
+        taskRunProcess.kill("SIGTERM", 5_000).catch(() => {});
+      }
+    }
+  }
+
+  async #tryGracefulExit(
+    taskRunProcess: TaskRunProcess,
+    kill = false,
+    initialSignal: number | NodeJS.Signals = "SIGTERM"
+  ) {
+    try {
+      const initialExit = taskRunProcess.onExit.waitFor(5_000);
+
+      if (kill) {
+        taskRunProcess.kill(initialSignal);
+      }
+
+      await initialExit;
+    } catch (error) {
+      logger.error("TaskRunProcess graceful kill timeout exceeded", error);
+      // Await the SIGKILL escalation so its failure rejects this call, not an unhandled promise.
+      await this.#tryForcefulExit(taskRunProcess);
+    }
+  }
+
+  async #tryForcefulExit(taskRunProcess: TaskRunProcess) {
+    try {
+      const forcedKill = taskRunProcess.onExit.waitFor(5_000);
+      taskRunProcess.kill("SIGKILL");
+      await forcedKill;
+    } catch (error) {
+      logger.error("TaskRunProcess forced kill timeout exceeded", error);
+      throw new SigKillTimeoutProcessError();
+    }
   }
 
   async cancelRun(taskRunId: string) {
@@ -460,14 +561,113 @@ export class BackgroundWorker {
     await taskRunProcess.cancel();
   }
 
+  async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload, baseURL: string) {
+    // The attempt does not exist yet: create it through the API first
+    const created = await this.apiClient.createTaskRunAttempt(payload.runId);
+
+    if (!created.success) {
+      throw new Error(`Failed to create task run attempt: ${created.error}`);
+    }
+
+    const { traceContext, environment, messageId } = payload;
+    const execution = created.data;
+    const attemptPayload = { execution, traceContext, environment };
+
+    // Run the freshly created attempt like any other execution
+    const completion = await this.executeTaskRun(attemptPayload, baseURL, messageId);
+
+    return { execution, completion };
+  }
+
   // We need to fork the process before we can execute any tasks
-  async executeTaskRun(payload: TaskRunExecutionPayload): Promise<TaskRunExecutionResult> {
+  async executeTaskRun(
+    payload: TaskRunExecutionPayload,
+    baseURL: string,
+    messageId?: string
+  ): Promise<TaskRunExecutionResult> {
+    if (this._closed) {
+      throw new Error("Worker is closed");
+    }
+
+    if (!this.metadata) {
+      throw new Error("Worker not registered");
+    }
+
+    const { execution } = payload;
+    // Example of the start line logged below: ○ Mar 27 09:17:25.653 -> View logs | 20240326.20 | create-avatar | run_slufhjdfiv8ejnrkw9dsj.1
+
+    const logsUrl = `${baseURL}/runs/${execution.run.id}`;
+
+    const pipe = chalkGrey("|");
+    const bullet = chalkGrey("○");
+    const link = chalkLink(terminalLink("View logs", logsUrl));
+    let timestampPrefix = chalkGrey(prettyPrintDate(payload.execution.attempt.startedAt));
+    const workerPrefix = chalkWorker(this.metadata.version);
+    const taskPrefix = chalkTask(execution.task.id);
+    const runId = chalkRun(`${execution.run.id}.${execution.attempt.number}`);
+
+    logger.log(
+      `${bullet} ${timestampPrefix} ${chalkGrey(
+        "->"
+      )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId}`
+    );
+    // Time the execution so the completion line can report how long it took
+    const now = performance.now();
+
+    const completion = await this.#doExecuteTaskRun(payload, messageId);
+
+    const elapsed = performance.now() - now;
+    // Retry hint, only shown next to "Error" in the result text below
+    const retryingText = chalkGrey(
+      !completion.ok && completion.skippedRetrying
+        ? " (retrying skipped)"
+        : !completion.ok && completion.retry !== undefined
+        ? ` (retrying in ${completion.retry.delay}ms)`
+        : ""
+    );
+    // Aborted and cancelled runs are reported as "Cancelled" rather than as errors
+    const resultText = !completion.ok
+      ? completion.error.type === "INTERNAL_ERROR" &&
+        (completion.error.code === TaskRunErrorCodes.TASK_EXECUTION_ABORTED ||
+          completion.error.code === TaskRunErrorCodes.TASK_RUN_CANCELLED)
+        ? chalkWarning("Cancelled")
+        : `${chalkError("Error")}${retryingText}`
+      : chalkSuccess("Success");
+    // NOTE(review): the "retry" branch is only reached for ok completions and interpolates completion.retry as-is — confirm it can ever be hit
+    const errorText = !completion.ok
+      ? formatErrorLog(completion.error)
+      : "retry" in completion
+      ? `retry in ${completion.retry}ms`
+      : "";
+
+    const elapsedText = chalkGrey(`(${formatDurationMilliseconds(elapsed, { style: "short" })})`);
+    // Re-stamp the prefix for the completion line (no date is passed this time)
+    timestampPrefix = chalkGrey(prettyPrintDate());
+
+    logger.log(
+      `${bullet} ${timestampPrefix} ${chalkGrey(
+        "->"
+      )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId} ${pipe} ${resultText} ${elapsedText}${errorText}`
+    );
+
+    return completion;
+  }
+
+  async #doExecuteTaskRun(
+    payload: TaskRunExecutionPayload,
+    messageId?: string
+  ): Promise<TaskRunExecutionResult> {
     try {
-      const taskRunProcess = await this.#initializeTaskRunProcess(payload);
+      const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId);
+
+      logger.debug(this.#prefixedMessage(payload, "executing task run"), {
+        pid: taskRunProcess.pid,
+      });
+
       const result = await taskRunProcess.executeTaskRun(payload);
 
-      // Kill the worker if the task was successful or if it's not going to be retried);
-      await taskRunProcess.cleanup(result.ok || result.retry === undefined);
+      // Always kill the worker
+      await taskRunProcess.cleanup(true);
 
       if (result.ok) {
         return result;
@@ -568,6 +768,7 @@ class TaskRunProcess {
   });
   private _sender: ZodMessageSender<typeof workerToChildMessages>;
   private _child: ChildProcess | undefined;
+  private _childPid?: number;
   private _attemptPromises: Map<
     string,
     { resolver: (value: TaskRunExecutionResult) => void; rejecter: (err?: any) => void }
@@ -576,15 +777,23 @@ class TaskRunProcess {
   private _currentExecution: TaskRunExecution | undefined;
   private _isBeingKilled: boolean = false;
   private _isBeingCancelled: boolean = false;
+  /**
+   * @deprecated use onTaskRunHeartbeat instead
+   */
   public onTaskHeartbeat: Evt<string> = new Evt();
-  public onExit: Evt<number> = new Evt();
+  public onTaskRunHeartbeat: Evt<string> = new Evt();
+  public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null; pid?: number }> =
+    new Evt();
+  public onIsBeingKilled: Evt<number | undefined> = new Evt();
 
   constructor(
-    private execution: TaskRunExecution,
+    private runId: string,
+    private isTest: boolean,
     private path: string,
     private env: NodeJS.ProcessEnv,
     private metadata: BackgroundWorkerProperties,
-    private worker: BackgroundWorkerParams
+    private worker: BackgroundWorkerParams,
+    private messageId?: string
   ) {
     this._sender = new ZodMessageSender({
       schema: workerToChildMessages,
@@ -604,7 +813,7 @@ class TaskRunProcess {
 
   async initialize() {
     const fullEnv = {
-      ...(this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}),
+      ...(this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}),
       ...this.env,
       OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({
         [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir,
@@ -615,7 +824,7 @@ class TaskRunProcess {
 
     const cwd = dirname(this.path);
 
-    logger.debug(`[${this.execution.run.id}] initializing task run process`, {
+    logger.debug(`[${this.runId}] initializing task run process`, {
       env: fullEnv,
       path: this.path,
       cwd,
@@ -629,6 +838,7 @@ class TaskRunProcess {
         ? ["--inspect-brk", "--trace-uncaught", "--no-warnings=ExperimentalWarning"]
         : ["--trace-uncaught", "--no-warnings=ExperimentalWarning"],
     });
+    this._childPid = this._child?.pid;
 
     this._child.on("message", this.#handleMessage.bind(this));
     this._child.on("exit", this.#handleExit.bind(this));
@@ -641,19 +851,28 @@ class TaskRunProcess {
       return;
     }
 
-    logger.debug(`[${this.execution.run.id}] cleaning up task run process`, { kill });
+    if (kill) {
+      this._isBeingKilled = true;
+      this.onIsBeingKilled.post(this._child?.pid);
+    }
+
+    logger.debug(`[${this.runId}] cleaning up task run process`, { kill, pid: this.pid });
 
     await this._sender.send("CLEANUP", {
       flush: true,
       kill,
     });
 
-    this._isBeingKilled = kill;
+    // FIXME: Something broke READY_TO_DISPOSE. We never receive it, so we always have to kill the process after the timeout below.
+
+    if (!kill) {
+      return;
+    }
 
     // Set a timeout to kill the child process if it hasn't been killed within 5 seconds
     setTimeout(() => {
       if (this._child && !this._child.killed) {
-        logger.debug(`[${this.execution.run.id}] killing task run process after timeout`);
+        logger.debug(`[${this.runId}] killing task run process after timeout`, { pid: this.pid });
 
         this._child.kill();
       }
@@ -691,24 +910,23 @@ class TaskRunProcess {
     return result;
   }
 
-  taskRunCompletedNotification(completion: TaskRunExecutionResult, execution: TaskRunExecution) {
+  taskRunCompletedNotification(completion: TaskRunExecutionResult) {
     if (!completion.ok && typeof completion.retry !== "undefined") {
       return;
     }
 
-    if (execution.run.id === this.execution.run.id) {
+    if (completion.id === this.runId) {
       // We don't need to notify the task run process if it's the same as the one we're running
       return;
     }
 
-    logger.debug(`[${this.execution.run.id}] task run completed notification`, {
+    logger.debug(`[${this.runId}] task run completed notification`, {
       completion,
-      execution,
     });
 
     this._sender.send("TASK_RUN_COMPLETED_NOTIFICATION", {
+      version: "v2",
       completion,
-      execution,
     });
   }
 
@@ -740,14 +958,18 @@ class TaskRunProcess {
         break;
       }
       case "READY_TO_DISPOSE": {
-        logger.debug(`[${this.execution.run.id}] task run process is ready to dispose`);
+        logger.debug(`[${this.runId}] task run process is ready to dispose`);
 
         this.#kill();
 
         break;
       }
       case "TASK_HEARTBEAT": {
-        this.onTaskHeartbeat.post(message.payload.id);
+        if (this.messageId) {
+          this.onTaskRunHeartbeat.post(this.messageId);
+        } else {
+          this.onTaskHeartbeat.post(message.payload.id);
+        }
 
         break;
       }
@@ -757,8 +979,8 @@ class TaskRunProcess {
     }
   }
 
-  async #handleExit(code: number) {
-    logger.debug(`[${this.execution.run.id}] task run process exiting`, { code });
+  async #handleExit(code: number | null, signal: NodeJS.Signals | null) {
+    logger.debug(`[${this.runId}] handle task run process exit`, { code, signal, pid: this.pid });
 
     // Go through all the attempts currently pending and reject them
     for (const [id, status] of this._attemptStatuses.entries()) {
@@ -778,12 +1000,12 @@ class TaskRunProcess {
         } else if (this._isBeingKilled) {
           rejecter(new CleanupProcessError());
         } else {
-          rejecter(new UnexpectedExitError(code));
+          rejecter(new UnexpectedExitError(code ?? -1));
         }
       }
     }
 
-    this.onExit.post(code);
+    this.onExit.post({ code, signal, pid: this.pid });
   }
 
   #handleLog(data: Buffer) {
@@ -823,10 +1045,54 @@ class TaskRunProcess {
   }
 
   #kill() {
-    if (this._child && !this._child.killed) {
-      logger.debug(`[${this.execution.run.id}] killing task run process`);
+    logger.debug(`[${this.runId}] #kill()`, { pid: this.pid });
 
+    if (this._child && !this._child.killed) {
       this._child?.kill();
     }
   }
+
+  async kill(signal?: number | NodeJS.Signals, timeoutInMs?: number) {
+    logger.debug(`[${this.runId}] killing task run process`, {
+      signal,
+      timeoutInMs,
+      pid: this.pid,
+    });
+    // Flag first: the exit handler rejects pending attempts with CleanupProcessError when this is set
+    this._isBeingKilled = true;
+    // Start waiting for the exit event before the signal is sent
+    const killTimeout = this.onExit.waitFor(timeoutInMs);
+
+    this.onIsBeingKilled.post(this._child?.pid);
+    this._child?.kill(signal);
+    // Without a timeout this returns as soon as the signal has been sent
+    if (timeoutInMs) {
+      await killTimeout;
+    }
+  }
+
+  get isBeingKilled() {
+    return this._isBeingKilled || this._child?.killed;
+  }
+
+  get pid() {
+    return this._childPid;
+  }
+}
+
+/** Builds the text appended to a failed run's log line; INTERNAL_ERRORs add nothing. */
+function formatErrorLog(error: TaskRunError) {
+  switch (error.type) {
+    case "INTERNAL_ERROR": {
+      return "";
+    }
+    // String and custom errors are both rendered from their raw value
+    case "STRING_ERROR":
+    case "CUSTOM_ERROR": {
+      return `\n\n${chalkError("X Error:")} ${error.raw}\n`;
+    }
+    case "BUILT_IN_ERROR": {
+      return `\n\n${error.stackTrace.replace(/^Error: /, chalkError("X Error: "))}\n`;
+    }
+  }
 }
diff --git a/packages/cli-v3/src/workers/dev/worker-facade.ts b/packages/cli-v3/src/workers/dev/worker-facade.ts
index efc65c93ddb..703ea93a12e 100644
--- a/packages/cli-v3/src/workers/dev/worker-facade.ts
+++ b/packages/cli-v3/src/workers/dev/worker-facade.ts
@@ -182,8 +182,17 @@ const handler = new ZodMessageHandler({
         _isRunning = false;
       }
     },
-    TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion, execution }) => {
-      devRuntimeManager.resumeTask(completion, execution);
+    TASK_RUN_COMPLETED_NOTIFICATION: async (payload) => {
+      switch (payload.version) {
+        case "v1": {
+          devRuntimeManager.resumeTask(payload.completion, payload.execution.run.id);
+          break;
+        }
+        case "v2": {
+          devRuntimeManager.resumeTask(payload.completion, payload.completion.id);
+          break;
+        }
+      }
     },
     CLEANUP: async ({ flush, kill }) => {
       if (kill) {
@@ -215,7 +224,7 @@ sender.send("TASKS_READY", { tasks: TASK_METADATA }).catch((err) => {
 
 process.title = "trigger-dev-worker";
 
-async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 5) {
+async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 30) {
   async function _doHeartbeat() {
     while (true) {
       if (_isRunning && _execution) {
diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts
index 8f2bbe897a1..2b0c523e7b4 100644
--- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts
+++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts
@@ -11,6 +11,7 @@ import {
   TaskRunBuiltInError,
   TaskRunErrorCodes,
   TaskRunExecution,
+  TaskRunExecutionLazyAttemptPayload,
   TaskRunExecutionPayload,
   TaskRunExecutionResult,
   WaitReason,
@@ -20,31 +21,15 @@ import { ZodIpcConnection } from "@trigger.dev/core/v3/zodIpc";
 import type { InferSocketMessageSchema } from "@trigger.dev/core/v3/zodSocket";
 import { Evt } from "evt";
 import { ChildProcess, fork } from "node:child_process";
-import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors";
-
-class UnexpectedExitError extends Error {
-  constructor(public code: number) {
-    super(`Unexpected exit with code ${code}`);
-
-    this.name = "UnexpectedExitError";
-  }
-}
-
-class CleanupProcessError extends Error {
-  constructor() {
-    super("Cancelled");
-
-    this.name = "CleanupProcessError";
-  }
-}
-
-class CancelledProcessError extends Error {
-  constructor() {
-    super("Cancelled");
-
-    this.name = "CancelledProcessError";
-  }
-}
+import {
+  CancelledProcessError,
+  CleanupProcessError,
+  GracefulExitTimeoutError,
+  SigKillTimeoutProcessError,
+  TaskMetadataParseError,
+  UncaughtExceptionError,
+  UnexpectedExitError,
+} from "../common/errors";
 
 type BackgroundWorkerParams = {
   env: Record<string, string>;
@@ -56,7 +41,11 @@ type BackgroundWorkerParams = {
 export class ProdBackgroundWorker {
   private _initialized: boolean = false;
 
+  /**
+   * @deprecated use onTaskRunHeartbeat instead
+   */
   public onTaskHeartbeat: Evt<string> = new Evt();
+  public onTaskRunHeartbeat: Evt<string> = new Evt();
 
   public onWaitForBatch: Evt<
     InferSocketMessageSchema<typeof ProdChildToWorkerMessages, "WAIT_FOR_BATCH">
@@ -74,11 +63,24 @@ export class ProdBackgroundWorker {
   public onReadyForCheckpoint = Evt.create<{ version?: "v1" }>();
   public onCancelCheckpoint = Evt.create<{ version?: "v1" | "v2"; reason?: WaitReason }>();
 
+  public onCreateTaskRunAttempt = Evt.create<{ version?: "v1"; runId: string }>();
+  public attemptCreatedNotification = Evt.create<
+    | {
+        success: false;
+        reason?: string;
+      }
+    | {
+        success: true;
+        execution: ProdTaskRunExecution;
+      }
+  >();
+
   private _onClose: Evt<void> = new Evt();
 
   public tasks: Array<TaskMetadataWithFilePath> = [];
 
   _taskRunProcess: TaskRunProcess | undefined;
+  private _taskRunProcessesBeingKilled: Map<number, TaskRunProcess> = new Map();
 
   private _closed: boolean = false;
 
@@ -87,7 +89,9 @@ export class ProdBackgroundWorker {
     private params: BackgroundWorkerParams
   ) {}
 
-  async close() {
+  async close(gracefulExitTimeoutElapsed = false) {
+    console.log("Closing worker", { gracefulExitTimeoutElapsed, closed: this._closed });
+
     if (this._closed) {
       return;
     }
@@ -95,9 +99,35 @@ export class ProdBackgroundWorker {
     this._closed = true;
 
     this.onTaskHeartbeat.detach();
+    this.onTaskRunHeartbeat.detach();
 
     // We need to close the task run process
-    await this._taskRunProcess?.cleanup(true);
+    await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed);
+  }
+
+  async #killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") {
+    console.log("Killing task run process", { flush, initialSignal, closed: this._closed });
+    // Nothing to do once the worker is closed or when no process is tracked
+    if (this._closed || !this._taskRunProcess) {
+      return;
+    }
+
+    if (flush) {
+      await this.flushTelemetry();
+    }
+    // Capture the reference now; the exit handler may clear this._taskRunProcess later
+    const currentTaskRunProcess = this._taskRunProcess;
+
+    // Try graceful exit but don't wait. We limit the amount of processes during creation instead.
+    this.#tryGracefulExit(currentTaskRunProcess, true, initialSignal).catch((error) => {
+      console.error("Error while trying graceful exit", error);
+    });
+    // NOTE: logged right after the exit attempt is started; it is not awaited, so the process may still be running
+    console.log("Killed task run process, setting closed to true", {
+      closed: this._closed,
+      pid: currentTaskRunProcess.pid,
+    });
+    this._closed = true;
   }
 
   async flushTelemetry() {
@@ -193,83 +223,191 @@ export class ProdBackgroundWorker {
 
   // We need to notify all the task run processes that a task run has completed,
   // in case they are waiting for it through triggerAndWait
-  async taskRunCompletedNotification(
-    completion: TaskRunExecutionResult,
-    execution: TaskRunExecution
-  ) {
-    this._taskRunProcess?.taskRunCompletedNotification(completion, execution);
+  async taskRunCompletedNotification(completion: TaskRunExecutionResult) {
+    this._taskRunProcess?.taskRunCompletedNotification(completion);
   }
 
   async waitCompletedNotification() {
     this._taskRunProcess?.waitCompletedNotification();
   }
 
-  async #initializeTaskRunProcess(payload: ProdTaskRunExecutionPayload): Promise<TaskRunProcess> {
+  async #getFreshTaskRunProcess(
+    payload: ProdTaskRunExecutionPayload,
+    messageId?: string
+  ): Promise<TaskRunProcess> {
     const metadata = this.getMetadata(
       payload.execution.worker.id,
       payload.execution.worker.version
     );
 
-    if (!this._taskRunProcess) {
-      const taskRunProcess = new TaskRunProcess(
-        payload.execution,
-        this.path,
-        {
-          ...this.params.env,
-          ...(payload.environment ?? {}),
-        },
-        metadata,
-        this.params
-      );
+    console.log("Getting fresh task run process, setting closed to false", {
+      closed: this._closed,
+    });
+    this._closed = false;
+
+    await this.#killCurrentTaskRunProcessBeforeAttempt();
+
+    const taskRunProcess = new TaskRunProcess(
+      payload.execution.run.id,
+      payload.execution.run.isTest,
+      this.path,
+      {
+        ...this.params.env,
+        ...(payload.environment ?? {}),
+      },
+      metadata,
+      this.params,
+      messageId
+    );
 
-      taskRunProcess.onExit.attach(() => {
+    taskRunProcess.onExit.attach(({ pid }) => {
+      console.log("Task run process exited", { pid });
+
+      // Only delete the task run process if the pid matches
+      if (this._taskRunProcess?.pid === pid) {
         this._taskRunProcess = undefined;
-      });
+      }
 
-      taskRunProcess.onTaskHeartbeat.attach((id) => {
-        this.onTaskHeartbeat.post(id);
-      });
+      if (pid) {
+        this._taskRunProcessesBeingKilled.delete(pid);
+      }
+    });
 
-      taskRunProcess.onWaitForBatch.attach((message) => {
-        this.onWaitForBatch.post(message);
-      });
+    taskRunProcess.onIsBeingKilled.attach((taskRunProcess) => {
+      if (taskRunProcess?.pid) {
+        this._taskRunProcessesBeingKilled.set(taskRunProcess.pid, taskRunProcess);
+      }
+    });
 
-      taskRunProcess.onWaitForDuration.attach((message) => {
-        this.onWaitForDuration.post(message);
-      });
+    taskRunProcess.onTaskHeartbeat.attach((id) => {
+      this.onTaskHeartbeat.post(id);
+    });
 
-      taskRunProcess.onWaitForTask.attach((message) => {
-        this.onWaitForTask.post(message);
-      });
+    taskRunProcess.onTaskRunHeartbeat.attach((id) => {
+      this.onTaskRunHeartbeat.post(id);
+    });
 
-      taskRunProcess.onReadyForCheckpoint.attach((message) => {
-        this.onReadyForCheckpoint.post(message);
-      });
+    taskRunProcess.onWaitForBatch.attach((message) => {
+      this.onWaitForBatch.post(message);
+    });
 
-      taskRunProcess.onCancelCheckpoint.attach((message) => {
-        this.onCancelCheckpoint.post(message);
-      });
+    taskRunProcess.onWaitForDuration.attach((message) => {
+      this.onWaitForDuration.post(message);
+    });
 
-      // Notify down the chain
-      this.preCheckpointNotification.attach((message) => {
-        taskRunProcess.preCheckpointNotification.post(message);
-      });
-      this.checkpointCanceledNotification.attach((message) => {
-        taskRunProcess.checkpointCanceledNotification.post(message);
-      });
+    taskRunProcess.onWaitForTask.attach((message) => {
+      this.onWaitForTask.post(message);
+    });
 
-      await taskRunProcess.initialize();
+    taskRunProcess.onReadyForCheckpoint.attach((message) => {
+      this.onReadyForCheckpoint.post(message);
+    });
 
-      this._taskRunProcess = taskRunProcess;
-    }
+    taskRunProcess.onCancelCheckpoint.attach((message) => {
+      this.onCancelCheckpoint.post(message);
+    });
+
+    // Notify down the chain
+    this.preCheckpointNotification.attach((message) => {
+      taskRunProcess.preCheckpointNotification.post(message);
+    });
+    this.checkpointCanceledNotification.attach((message) => {
+      taskRunProcess.checkpointCanceledNotification.post(message);
+    });
+
+    await taskRunProcess.initialize();
+
+    this._taskRunProcess = taskRunProcess;
 
     return this._taskRunProcess;
   }
 
-  // We need to fork the process before we can execute any tasks
-  async executeTaskRun(payload: ProdTaskRunExecutionPayload): Promise<TaskRunExecutionResult> {
+  async forceKillOldTaskRunProcesses() {
+    for (const taskRunProcess of this._taskRunProcessesBeingKilled.values()) {
+      try {
+        await taskRunProcess.kill("SIGKILL"); // no timeout passed, so kill() does not wait for the exit
+      } catch (error) {
+        console.error("Error while force killing old task run processes", error);
+      }
+    }
+  }
+
+  async #killCurrentTaskRunProcessBeforeAttempt() {
+    console.log("killCurrentTaskRunProcessBeforeAttempt()", {
+      hasTaskRunProcess: !!this._taskRunProcess,
+    });
+    // Makes room for a fresh process: nothing to do when none is tracked
+    if (!this._taskRunProcess) {
+      return;
+    }
+
+    const currentTaskRunProcess = this._taskRunProcess;
+
+    console.log("Killing current task run process", {
+      isBeingKilled: currentTaskRunProcess?.isBeingKilled,
+      totalBeingKilled: this._taskRunProcessesBeingKilled.size,
+    });
+    // Only block on the old process when other processes are already being killed
+    if (currentTaskRunProcess.isBeingKilled) {
+      if (this._taskRunProcessesBeingKilled.size > 1) {
+        await this.#tryGracefulExit(currentTaskRunProcess);
+      } else {
+        // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel
+      }
+    } else {
+      // It's not being killed yet, so it has to be signalled here (kill = true), not just waited on
+      if (this._taskRunProcessesBeingKilled.size > 0) {
+        await this.#tryGracefulExit(currentTaskRunProcess, true);
+      } else {
+        // None are being killed yet, so signal without waiting. The 5s only bounds kill()'s wait for the exit event; a timeout is swallowed below.
+        currentTaskRunProcess.kill("SIGTERM", 5_000).catch(() => {});
+      }
+    }
+  }
+
+  async #tryGracefulExit(
+    taskRunProcess: TaskRunProcess,
+    kill = false,
+    initialSignal: number | NodeJS.Signals = "SIGTERM"
+  ) {
+    try {
+      const initialExit = taskRunProcess.onExit.waitFor(5_000);
+      // The exit waiter is registered first so an exit right after the signal is not missed
+      if (kill) {
+        taskRunProcess.kill(initialSignal);
+      }
+      // Give the process 5s to exit on its own (or after the signal above)
+      await initialExit;
+    } catch (error) {
+      console.error("TaskRunProcess graceful kill timeout exceeded", error);
+      // Await the SIGKILL fallback so its SigKillTimeoutProcessError reaches the caller instead of becoming an unhandled rejection
+      await this.#tryForcefulExit(taskRunProcess);
+    }
+  }
+
+  async #tryForcefulExit(taskRunProcess: TaskRunProcess) {
+    try {
+      const forcedKill = taskRunProcess.onExit.waitFor(5_000); // registered before the signal is sent
+      taskRunProcess.kill("SIGKILL");
+      await forcedKill;
+    } catch (error) {
+      console.error("TaskRunProcess forced kill timeout exceeded", error);
+      throw new SigKillTimeoutProcessError(); // no exit event observed while waiting up to 5s after SIGKILL
+    }
+  }
+
+  // We need to fork the process before we can execute any tasks, use a fresh process for each execution
+  async executeTaskRun(
+    payload: ProdTaskRunExecutionPayload,
+    messageId?: string
+  ): Promise<TaskRunExecutionResult> {
     try {
-      const taskRunProcess = await this.#initializeTaskRunProcess(payload);
+      const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId);
+
+      console.log("executing task run", {
+        attempt: payload.execution.attempt.id,
+        taskRunPid: taskRunProcess.pid,
+      });
 
       const result = await taskRunProcess.executeTaskRun(payload);
 
@@ -326,6 +464,31 @@ export class ProdBackgroundWorker {
         };
       }
 
+      if (e instanceof SigKillTimeoutProcessError) {
+        return {
+          id: payload.execution.attempt.id,
+          ok: false,
+          retry: undefined,
+          error: {
+            type: "INTERNAL_ERROR",
+            code: TaskRunErrorCodes.TASK_PROCESS_SIGKILL_TIMEOUT,
+          },
+        };
+      }
+
+      if (e instanceof GracefulExitTimeoutError) {
+        return {
+          id: payload.execution.attempt.id,
+          ok: false,
+          retry: undefined,
+          error: {
+            type: "INTERNAL_ERROR",
+            code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT,
+            message: "Worker process killed while attempt in progress.",
+          },
+        };
+      }
+
       return {
         id: payload.execution.attempt.id,
         ok: false,
@@ -335,11 +498,52 @@ export class ProdBackgroundWorker {
           code: TaskRunErrorCodes.TASK_EXECUTION_FAILED,
         },
       };
+    } finally {
+      await this.#killTaskRunProcess();
     }
   }
 
   async cancelAttempt(attemptId: string) {
-    await this._taskRunProcess?.cancel();
+    if (!this._taskRunProcess) {
+      console.error("No task run process to cancel attempt", { attemptId });
+      return;
+    }
+
+    await this._taskRunProcess.cancel();
+  }
+
+  async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload) {
+    // Ask the coordinator to create an attempt for this run
+    this.onCreateTaskRunAttempt.post({ runId: payload.runId });
+    // NOTE(review): the waiter below is registered after the request is posted — confirm the reply can never arrive synchronously
+    let execution: ProdTaskRunExecution;
+
+    try {
+      // ..and wait up to 30s for the response
+      const attemptCreated = await this.attemptCreatedNotification.waitFor(30_000);
+      // A response can still report failure, optionally with a reason
+      if (!attemptCreated.success) {
+        throw new Error(
+          `Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}`
+        );
+      }
+
+      execution = attemptCreated.execution;
+    } catch (error) {
+      console.error("Error while creating attempt", error);
+      throw new Error(`Failed to create task run attempt: ${error}`);
+    }
+    // Execute the newly created attempt, forwarding the message id
+    const completion = await this.executeTaskRun(
+      {
+        execution,
+        traceContext: payload.traceContext,
+        environment: payload.environment,
+      },
+      payload.messageId
+    );
+
+    return { execution, completion };
   }
 
   async #correctError(
@@ -359,6 +563,7 @@ class TaskRunProcess {
     typeof ProdWorkerToChildMessages
   >;
   private _child?: ChildProcess;
+  private _childPid?: number;
 
   private _attemptPromises: Map<
     string,
@@ -368,9 +573,16 @@ class TaskRunProcess {
   private _currentExecution: TaskRunExecution | undefined;
   private _isBeingKilled: boolean = false;
   private _isBeingCancelled: boolean = false;
+  private _gracefulExitTimeoutElapsed: boolean = false;
 
+  /**
+   * @deprecated use onTaskRunHeartbeat instead
+   */
   public onTaskHeartbeat: Evt<string> = new Evt();
-  public onExit: Evt<number> = new Evt();
+  public onTaskRunHeartbeat: Evt<string> = new Evt();
+  public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null; pid?: number }> =
+    new Evt();
+  public onIsBeingKilled: Evt<TaskRunProcess> = new Evt();
 
   public onWaitForBatch: Evt<
     InferSocketMessageSchema<typeof ProdChildToWorkerMessages, "WAIT_FOR_BATCH">
@@ -389,18 +601,20 @@ class TaskRunProcess {
   public onCancelCheckpoint = Evt.create<{ version?: "v1" | "v2"; reason?: WaitReason }>();
 
   constructor(
-    private execution: ProdTaskRunExecution,
+    private runId: string,
+    private isTest: boolean,
     private path: string,
     private env: NodeJS.ProcessEnv,
     private metadata: BackgroundWorkerProperties,
-    private worker: BackgroundWorkerParams
+    private worker: BackgroundWorkerParams,
+    private messageId?: string
   ) {}
 
   async initialize() {
     this._child = fork(this.path, {
       stdio: [/*stdin*/ "ignore", /*stdout*/ "pipe", /*stderr*/ "pipe", "ipc"],
       env: {
-        ...(this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}),
+        ...(this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}),
         ...this.env,
         OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({
           [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir,
@@ -408,6 +622,7 @@ class TaskRunProcess {
         ...(this.worker.debugOtel ? { OTEL_LOG_LEVEL: "debug" } : {}),
       },
     });
+    this._childPid = this._child?.pid;
 
     this._ipc = new ZodIpcConnection({
       listenSchema: ProdChildToWorkerMessages,
@@ -439,7 +654,11 @@ class TaskRunProcess {
           process.exit(0);
         },
         TASK_HEARTBEAT: async (message) => {
-          this.onTaskHeartbeat.post(message.id);
+          if (this.messageId) {
+            this.onTaskRunHeartbeat.post(this.messageId);
+          } else {
+            this.onTaskHeartbeat.post(message.id);
+          }
         },
         TASKS_READY: async (message) => {},
         WAIT_FOR_TASK: async (message) => {
@@ -513,17 +732,38 @@ class TaskRunProcess {
     await this.cleanup(true);
   }
 
-  async cleanup(kill: boolean = false) {
+  async cleanup(kill = false, gracefulExitTimeoutElapsed = false) {
+    console.log("cleanup()", { kill, gracefulExitTimeoutElapsed });
+
     if (kill && this._isBeingKilled) {
       return;
     }
 
-    this._isBeingKilled = kill;
+    if (kill) {
+      this._isBeingKilled = true;
+      this.onIsBeingKilled.post(this);
+    }
+
+    const killChildProcess = gracefulExitTimeoutElapsed && !!this._currentExecution;
+
+    // Kill parent unless graceful exit timeout has elapsed and we're in the middle of an execution
+    const killParentProcess = kill && !killChildProcess;
+
+    console.log("Cleaning up task run process", {
+      killChildProcess,
+      killParentProcess,
+    });
 
     await this._ipc?.sendWithAck("CLEANUP", {
       flush: true,
-      kill,
+      kill: killParentProcess,
     });
+
+    if (killChildProcess) {
+      this._gracefulExitTimeoutElapsed = true;
+      // Kill the child process
+      await this.kill("SIGKILL");
+    }
   }
 
   async executeTaskRun(payload: TaskRunExecutionPayload): Promise<TaskRunExecutionResult> {
@@ -559,15 +799,15 @@ class TaskRunProcess {
     return result;
   }
 
-  taskRunCompletedNotification(completion: TaskRunExecutionResult, execution: TaskRunExecution) {
+  taskRunCompletedNotification(completion: TaskRunExecutionResult) {
     if (!completion.ok && typeof completion.retry !== "undefined") {
       return;
     }
 
     if (this._child?.connected && !this._isBeingKilled && !this._child.killed) {
       this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", {
+        version: "v2",
         completion,
-        execution,
       });
     }
   }
@@ -578,10 +818,14 @@ class TaskRunProcess {
     }
   }
 
-  async #handleExit(code: number) {
+  async #handleExit(code: number | null, signal: NodeJS.Signals | null) {
+    console.log("handling child exit", { code, signal });
+
     // Go through all the attempts currently pending and reject them
     for (const [id, status] of this._attemptStatuses.entries()) {
       if (status === "PENDING") {
+        console.log("found pending attempt", { id });
+
         this._attemptStatuses.set(id, "REJECTED");
 
         const attemptPromise = this._attemptPromises.get(id);
@@ -594,15 +838,18 @@ class TaskRunProcess {
 
         if (this._isBeingCancelled) {
           rejecter(new CancelledProcessError());
+        } else if (this._gracefulExitTimeoutElapsed) {
+          // Order matters, this has to be before the _isBeingKilled check: kill() sets that flag as well
+          rejecter(new GracefulExitTimeoutError());
         } else if (this._isBeingKilled) {
           rejecter(new CleanupProcessError());
         } else {
-          rejecter(new UnexpectedExitError(code));
+          rejecter(new UnexpectedExitError(code ?? -1));
         }
       }
     }
 
-    this.onExit.post(code);
+    this.onExit.post({ code, signal, pid: this.pid });
   }
 
   #handleLog(data: Buffer) {
@@ -635,9 +882,24 @@ class TaskRunProcess {
     );
   }
 
-  #kill() {
-    if (this._child && !this._child.killed) {
-      this._child?.kill();
+  async kill(signal?: number | NodeJS.Signals, timeoutInMs?: number) {
+    this._isBeingKilled = true;
+
+    const killTimeout = this.onExit.waitFor(timeoutInMs);
+
+    this.onIsBeingKilled.post(this);
+    this._child?.kill(signal);
+
+    if (timeoutInMs) {
+      await killTimeout;
     }
   }
+
+  get isBeingKilled() {
+    return this._isBeingKilled || this._child?.killed;
+  }
+
+  get pid() {
+    return this._childPid;
+  }
 }
diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts
index e8b35d61e45..d59132e83d5 100644
--- a/packages/cli-v3/src/workers/prod/entry-point.ts
+++ b/packages/cli-v3/src/workers/prod/entry-point.ts
@@ -5,6 +5,7 @@ import {
   PreStopCauses,
   ProdWorkerToCoordinatorMessages,
   TaskResource,
+  TaskRunFailedExecutionResult,
   WaitReason,
 } from "@trigger.dev/core/v3";
 import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";
@@ -60,8 +61,89 @@ class ProdWorker {
     process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM"));
 
     this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST);
+    this.#backgroundWorker = this.#createBackgroundWorker();
 
-    this.#backgroundWorker = new ProdBackgroundWorker("worker.js", {
+    this.#httpPort = port;
+    this.#httpServer = this.#createHttpServer();
+  }
+
+  async #handleSignal(signal: NodeJS.Signals) {
+    logger.log("Received signal", { signal });
+
+    if (signal === "SIGTERM") {
+      let gracefulExitTimeoutElapsed = false;
+
+      if (this.executing) {
+        const terminationGracePeriodSeconds = 60 * 60;
+
+        logger.log("Waiting for attempt to complete before exiting", {
+          terminationGracePeriodSeconds,
+        });
+
+        // Wait for termination grace period minus 5s to give cleanup a chance to complete
+        await setTimeout(terminationGracePeriodSeconds * 1000 - 5000);
+        gracefulExitTimeoutElapsed = true;
+
+        logger.log("Termination timeout reached, exiting gracefully.");
+      } else {
+        logger.log("Not executing, exiting immediately.");
+      }
+
+      await this.#exitGracefully(gracefulExitTimeoutElapsed);
+      return;
+    }
+
+    logger.log("Unhandled signal", { signal });
+  }
+
+  async #exitGracefully(gracefulExitTimeoutElapsed = false) {
+    await this.#backgroundWorker.close(gracefulExitTimeoutElapsed);
+
+    if (!gracefulExitTimeoutElapsed) {
+      // TODO: Maybe add a sensible timeout instead of a conditional to avoid zombies
+      process.exit(0);
+    }
+  }
+
+  async #reconnect(isPostStart = false, reconnectImmediately = false) {
+    if (isPostStart) {
+      this.waitForPostStart = false;
+    }
+
+    this.#coordinatorSocket.close();
+
+    if (!reconnectImmediately) {
+      await setTimeout(1000);
+    }
+
+    let coordinatorHost = COORDINATOR_HOST;
+
+    try {
+      if (this.runningInKubernetes) {
+        coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
+          "\n",
+          ""
+        );
+
+        logger.log("reconnecting", {
+          coordinatorHost: {
+            fromEnv: COORDINATOR_HOST,
+            fromVolume: coordinatorHost,
+            current: this.#coordinatorSocket.socket.io.opts.hostname,
+          },
+        });
+      }
+    } catch (error) {
+      logger.error("taskinfo read error during reconnect", {
+        error: error instanceof Error ? error.message : error,
+      });
+    } finally {
+      this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
+    }
+  }
+
+  #createBackgroundWorker() {
+    const backgroundWorker = new ProdBackgroundWorker("worker.js", {
       projectConfig: __PROJECT_CONFIG__,
       env: {
         ...gatherProcessEnv(),
@@ -73,19 +155,24 @@ class ProdWorker {
       contentHash: this.contentHash,
     });
 
-    this.#backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
+    backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => {
       // TODO: Switch to .send() once coordinator uses zod handler for all messages
       this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId });
     });
 
-    this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
-      // Flush before checkpointing so we don't flush the same spans again after restore
-      await this.#backgroundWorker.flushTelemetry();
+    backgroundWorker.onTaskRunHeartbeat.attach((runId) => {
+      this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId });
+    });
+
+    // Currently, this is only used for duration waits
+    backgroundWorker.onReadyForCheckpoint.attach(async (message) => {
+      await this.#prepareForCheckpoint();
+
       this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
     });
 
     // Currently, this is only used for duration waits. Might need adjusting for other use cases.
-    this.#backgroundWorker.onCancelCheckpoint.attach(async (message) => {
+    backgroundWorker.onCancelCheckpoint.attach(async (message) => {
       logger.log("onCancelCheckpoint", { message });
 
       const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck(
@@ -96,6 +183,8 @@ class ProdWorker {
         }
       );
 
+      logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled });
+
       if (checkpointCanceled) {
         if (message.reason === "WAIT_FOR_DURATION") {
           // Worker will resume immediately
@@ -105,12 +194,52 @@ class ProdWorker {
         }
       }
 
-      this.#backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+      backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled });
+    });
+
+    backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => {
+      logger.log("onCreateTaskRunAttempt()", { message });
+
+      const createAttempt = await this.#coordinatorSocket.socket.emitWithAck(
+        "CREATE_TASK_RUN_ATTEMPT",
+        {
+          version: "v1",
+          runId: message.runId,
+        }
+      );
+
+      if (!createAttempt.success) {
+        backgroundWorker.attemptCreatedNotification.post({
+          success: false,
+          reason: createAttempt.reason,
+        });
+        return;
+      }
+
+      backgroundWorker.attemptCreatedNotification.post({
+        success: true,
+        execution: createAttempt.executionPayload.execution,
+      });
+    });
+
+    backgroundWorker.attemptCreatedNotification.attach((message) => {
+      if (!message.success) {
+        return;
+      }
+
+      // Workers with lazy attempt support set their friendly ID here
+      this.attemptFriendlyId = message.execution.attempt.id;
     });
 
-    this.#backgroundWorker.onWaitForDuration.attach(async (message) => {
+    backgroundWorker.onWaitForDuration.attach(async (message) => {
       if (!this.attemptFriendlyId) {
         logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+
+        this.#emitUnrecoverableError(
+          "NoAttemptId",
+          "Attempt ID not set before waiting for duration"
+        );
+
         return;
       }
 
@@ -125,9 +254,12 @@ class ProdWorker {
       this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore);
     });
 
-    this.#backgroundWorker.onWaitForTask.attach(async (message) => {
+    backgroundWorker.onWaitForTask.attach(async (message) => {
       if (!this.attemptFriendlyId) {
         logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+
+        this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for task");
+
         return;
       }
 
@@ -142,9 +274,12 @@ class ProdWorker {
       this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore);
     });
 
-    this.#backgroundWorker.onWaitForBatch.attach(async (message) => {
+    backgroundWorker.onWaitForBatch.attach(async (message) => {
       if (!this.attemptFriendlyId) {
         logger.error("Failed to send wait message, attempt friendly ID not set", { message });
+
+        this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch");
+
         return;
       }
 
@@ -159,73 +294,7 @@ class ProdWorker {
       this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore);
     });
 
-    this.#httpPort = port;
-    this.#httpServer = this.#createHttpServer();
-  }
-
-  async #handleSignal(signal: NodeJS.Signals) {
-    logger.log("Received signal", { signal });
-
-    if (signal === "SIGTERM") {
-      if (this.executing) {
-        const terminationGracePeriodSeconds = 60 * 60;
-
-        logger.log("Waiting for attempt to complete before exiting", {
-          terminationGracePeriodSeconds,
-        });
-
-        // Wait for termination grace period minus 5s to give cleanup a chance to complete
-        await setTimeout(terminationGracePeriodSeconds * 1000 - 5000);
-
-        logger.log("Termination timeout reached, exiting gracefully.");
-      } else {
-        logger.log("Not executing, exiting immediately.");
-      }
-
-      await this.#exitGracefully();
-    }
-
-    logger.log("Unhandled signal", { signal });
-  }
-
-  async #exitGracefully() {
-    await this.#backgroundWorker.close();
-    process.exit(0);
-  }
-
-  async #reconnect(isPostStart = false, reconnectImmediately = false) {
-    if (isPostStart) {
-      this.waitForPostStart = false;
-    }
-
-    this.#coordinatorSocket.close();
-
-    if (!reconnectImmediately) {
-      await setTimeout(1000);
-    }
-
-    let coordinatorHost = COORDINATOR_HOST;
-
-    try {
-      if (this.runningInKubernetes) {
-        coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace(
-          "\n",
-          ""
-        );
-
-        logger.log("reconnecting", {
-          coordinatorHost: {
-            fromEnv: COORDINATOR_HOST,
-            fromVolume: coordinatorHost,
-            current: this.#coordinatorSocket.socket.io.opts.hostname,
-          },
-        });
-      }
-    } catch (error) {
-      logger.error("taskinfo read error during reconnect", { error });
-    } finally {
-      this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost);
-    }
+    return backgroundWorker;
   }
 
   async #prepareForWait(reason: WaitReason, willCheckpointAndRestore: boolean) {
@@ -239,9 +308,8 @@ class ProdWorker {
       this.waitForPostStart = true;
 
       if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") {
-        // Flush before checkpointing so we don't flush the same spans again after restore
         // Duration waits do this via the "ready for checkpoint" event instead
-        await this.#backgroundWorker.flushTelemetry();
+        await this.#prepareForCheckpoint();
       }
     }
   }
@@ -256,18 +324,36 @@ class ProdWorker {
       }
 
       await this.#exitGracefully();
+      return;
     }
 
+    // Clear state for next execution
+    this.paused = false;
+    this.waitForPostStart = false;
     this.executing = false;
     this.attemptFriendlyId = undefined;
 
     if (willCheckpointAndRestore) {
       this.waitForPostStart = true;
+
+      // We already flush after completion, so we don't need to do it here
+      this.#prepareForCheckpoint(false);
+
       this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" });
       return;
     }
   }
 
+  async #prepareForCheckpoint(flush = true) {
+    if (flush) {
+      // Flush before checkpointing so we don't flush the same spans again after restore
+      await this.#backgroundWorker.flushTelemetry();
+    }
+
+    // Kill the previous worker process to prevent large checkpoints
+    await this.#backgroundWorker.forceKillOldTaskRunProcesses();
+  }
+
   #resumeAfterDuration() {
     this.paused = false;
     this.nextResumeAfter = undefined;
@@ -303,11 +389,8 @@ class ProdWorker {
       extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId;
     }
 
-    logger.log("connecting to coordinator", {
-      host,
-      port: COORDINATOR_PORT,
-      extraHeaders,
-    });
+    logger.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`);
+    logger.debug(`connecting with extra headers`, { extraHeaders });
 
     const coordinatorConnection = new ZodSocketConnection({
       namespace: "prod-worker",
@@ -317,28 +400,14 @@ class ProdWorker {
       serverMessages: CoordinatorToProdWorkerMessages,
       extraHeaders,
       handlers: {
-        RESUME_AFTER_DEPENDENCY: async (message) => {
+        RESUME_AFTER_DEPENDENCY: async ({ completions }) => {
           if (!this.paused) {
-            logger.error("worker not paused", {
-              completions: message.completions,
-              executions: message.executions,
-            });
-            return;
-          }
-
-          if (message.completions.length !== message.executions.length) {
-            logger.error("did not receive the same number of completions and executions", {
-              completions: message.completions,
-              executions: message.executions,
-            });
+            logger.error("Failed to resume after dependency: Worker not paused");
             return;
           }
 
-          if (message.completions.length === 0 || message.executions.length === 0) {
-            logger.error("no completions or executions", {
-              completions: message.completions,
-              executions: message.executions,
-            });
+          if (completions.length === 0) {
+            logger.error("Failed to resume after dependency: No completions");
             return;
           }
 
@@ -346,17 +415,19 @@ class ProdWorker {
             this.nextResumeAfter !== "WAIT_FOR_TASK" &&
             this.nextResumeAfter !== "WAIT_FOR_BATCH"
           ) {
-            logger.error("not waiting to resume after dependency", {
+            logger.error("Failed to resume after dependency: Invalid next resume", {
               nextResumeAfter: this.nextResumeAfter,
             });
             return;
           }
 
-          if (this.nextResumeAfter === "WAIT_FOR_TASK" && message.completions.length > 1) {
-            logger.error("waiting for single task but got multiple completions", {
-              completions: message.completions,
-              executions: message.executions,
-            });
+          if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) {
+            logger.error(
+              "Failed to resume after dependency: Waiting for single task but got multiple completions",
+              {
+                completions: completions,
+              }
+            );
             return;
           }
 
@@ -364,13 +435,12 @@ class ProdWorker {
           this.nextResumeAfter = undefined;
           this.waitForPostStart = false;
 
-          for (let i = 0; i < message.completions.length; i++) {
-            const completion = message.completions[i];
-            const execution = message.executions[i];
+          for (let i = 0; i < completions.length; i++) {
+            const completion = completions[i];
 
-            if (!completion || !execution) continue;
+            if (!completion) continue;
 
-            this.#backgroundWorker.taskRunCompletedNotification(completion, execution);
+            this.#backgroundWorker.taskRunCompletedNotification(completion);
           }
         },
         RESUME_AFTER_DURATION: async (message) => {
@@ -420,14 +490,75 @@ class ProdWorker {
 
           this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
         },
+        EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => {
+          if (this.executing) {
+            logger.error("dropping execute request, already executing");
+            return;
+          }
+
+          this.executing = true;
+
+          try {
+            const { completion, execution } =
+              await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload);
+
+            logger.log("completed", completion);
+
+            this.completed.add(execution.attempt.id);
+
+            const { willCheckpointAndRestore, shouldExit } =
+              await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", {
+                version: "v1",
+                execution,
+                completion,
+              });
+
+            logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit });
+
+            this.#prepareForRetry(willCheckpointAndRestore, shouldExit);
+          } catch (error) {
+            const completion: TaskRunFailedExecutionResult = {
+              ok: false,
+              id: message.lazyPayload.runId,
+              retry: undefined,
+              error:
+                error instanceof Error
+                  ? {
+                      type: "BUILT_IN_ERROR",
+                      name: error.name,
+                      message: error.message,
+                      stackTrace: error.stack ?? "",
+                    }
+                  : {
+                      type: "BUILT_IN_ERROR",
+                      name: "UnknownError",
+                      message: String(error),
+                      stackTrace: "",
+                    },
+            };
+
+            this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", {
+              version: "v1",
+              completion,
+            });
+          }
+        },
         REQUEST_ATTEMPT_CANCELLATION: async (message) => {
           if (!this.executing) {
+            logger.log("dropping cancel request, not executing", { status: this.#status });
             return;
           }
 
+          logger.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status });
+
           await this.#backgroundWorker.cancelAttempt(message.attemptId);
         },
-        REQUEST_EXIT: async () => {
+        REQUEST_EXIT: async (message) => {
+          if (message.version === "v2" && message.delayInMs) {
+            logger.log("exit requested with delay", { delayInMs: message.delayInMs });
+            await setTimeout(message.delayInMs);
+          }
+
           this.#coordinatorSocket.close();
           process.exit(0);
         },
@@ -436,7 +567,7 @@ class ProdWorker {
             return;
           }
 
-          this.#coordinatorSocket.socket.emit("READY_FOR_EXECUTION", {
+          this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", {
             version: "v1",
             runId: this.runId,
             totalCompletions: this.completed.size,
@@ -444,6 +575,8 @@ class ProdWorker {
         },
       },
       onConnection: async (socket, handler, sender, logger) => {
+        logger.log("connected to coordinator", { status: this.#status });
+
         if (this.waitForPostStart) {
           logger.log("skip connection handler, waiting for post start hook");
           return;
@@ -451,11 +584,24 @@ class ProdWorker {
 
         if (this.paused) {
           if (!this.nextResumeAfter) {
+            logger.error("Missing next resume reason", { status: this.#status });
+
+            this.#emitUnrecoverableError(
+              "NoNextResume",
+              "Next resume reason not set while resuming from paused state"
+            );
+
             return;
           }
 
           if (!this.attemptFriendlyId) {
-            logger.error("Missing friendly ID");
+            logger.error("Missing friendly ID", { status: this.#status });
+
+            this.#emitUnrecoverableError(
+              "NoAttemptId",
+              "Attempt ID not set while resuming from paused state"
+            );
+
             return;
           }
 
@@ -473,9 +619,10 @@ class ProdWorker {
             const taskResources = await this.#initializeWorker();
 
             const { success } = await socket.emitWithAck("INDEX_TASKS", {
-              version: "v1",
+              version: "v2",
               deploymentId: this.deploymentId,
               ...taskResources,
+              supportsLazyAttempts: true,
             });
 
             if (success) {
@@ -563,7 +710,7 @@ class ProdWorker {
           return;
         }
 
-        socket.emit("READY_FOR_EXECUTION", {
+        socket.emit("READY_FOR_LAZY_ATTEMPT", {
           version: "v1",
           runId: this.runId,
           totalCompletions: this.completed.size,
@@ -601,12 +748,7 @@ class ProdWorker {
           }
 
           case "/status": {
-            return reply.json({
-              executing: this.executing,
-              paused: this.paused,
-              completed: this.completed.size,
-              nextResumeAfter: this.nextResumeAfter,
-            });
+            return reply.json(this.#status);
           }
 
           case "/connect": {
@@ -768,6 +910,27 @@ class ProdWorker {
     return data?.variables ?? {};
   }
 
+  get #status() {
+    return {
+      executing: this.executing,
+      paused: this.paused,
+      completed: this.completed.size,
+      nextResumeAfter: this.nextResumeAfter,
+      waitForPostStart: this.waitForPostStart,
+      attemptFriendlyId: this.attemptFriendlyId,
+    };
+  }
+
+  #emitUnrecoverableError(name: string, message: string) {
+    this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", {
+      version: "v1",
+      error: {
+        name,
+        message,
+      },
+    });
+  }
+
   start() {
     this.#httpServer.listen(this.#httpPort, this.host);
   }
diff --git a/packages/cli-v3/src/workers/prod/worker-facade.ts b/packages/cli-v3/src/workers/prod/worker-facade.ts
index 77271b80162..516f7035331 100644
--- a/packages/cli-v3/src/workers/prod/worker-facade.ts
+++ b/packages/cli-v3/src/workers/prod/worker-facade.ts
@@ -170,8 +170,8 @@ const zodIpc = new ZodIpcConnection({
         _isRunning = false;
       }
     },
-    TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion, execution }) => {
-      prodRuntimeManager.resumeTask(completion, execution);
+    TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion }) => {
+      prodRuntimeManager.resumeTask(completion);
     },
     WAIT_COMPLETED_NOTIFICATION: async () => {
       prodRuntimeManager.resumeAfterDuration();
@@ -179,23 +179,6 @@ const zodIpc = new ZodIpcConnection({
     CLEANUP: async ({ flush, kill }, sender) => {
       if (kill) {
         await tracingSDK.flush();
-
-        if (_execution) {
-          // Fail currently executing attempt
-          await sender.send("TASK_RUN_COMPLETED", {
-            execution: _execution,
-            result: {
-              ok: false,
-              id: _execution.run.id,
-              error: {
-                type: "INTERNAL_ERROR",
-                code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT,
-                message: "Worker process killed while attempt in progress.",
-              },
-            },
-          });
-        }
-
         // Now we need to exit the process
         await sender.send("READY_TO_DISPOSE", undefined);
       } else {
@@ -228,7 +211,7 @@ zodIpc.send("TASKS_READY", { tasks: TASK_METADATA }).catch((err) => {
 
 process.title = "trigger-prod-worker";
 
-async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 5) {
+async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 20) {
   async function _doHeartbeat() {
     while (true) {
       if (_isRunning && _execution) {
diff --git a/packages/core-apps/src/provider.ts b/packages/core-apps/src/provider.ts
index cb1b0c4d574..159c81dbc37 100644
--- a/packages/core-apps/src/provider.ts
+++ b/packages/core-apps/src/provider.ts
@@ -46,7 +46,6 @@ export interface TaskOperationsCreateOptions {
   orgId: string;
   projectId: string;
   runId: string;
-  attemptId: string;
 }
 
 export interface TaskOperationsRestoreOptions {
@@ -129,7 +128,6 @@ export class ProviderShell implements Provider {
                 orgId: message.data.orgId,
                 projectId: message.data.projectId,
                 runId: message.data.runId,
-                attemptId: message.data.id,
               });
             } catch (error) {
               logger.error("create failed", error);
diff --git a/packages/core/package.json b/packages/core/package.json
index 0b49d753f60..222191a7a2e 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -37,6 +37,14 @@
       "require": "./dist/v3/otel/index.js",
       "types": "./dist/v3/otel/index.d.ts"
     },
+    "./v3/zodfetch": {
+      "import": {
+        "types": "./dist/v3/zodfetch.d.mts",
+        "default": "./dist/v3/zodfetch.mjs"
+      },
+      "require": "./dist/v3/zodfetch.js",
+      "types": "./dist/v3/zodfetch.d.ts"
+    },
     "./v3/zodMessageHandler": {
       "import": {
         "types": "./dist/v3/zodMessageHandler.d.mts",
diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts
index 61ee0575b20..bc12e75d782 100644
--- a/packages/core/src/v3/errors.ts
+++ b/packages/core/src/v3/errors.ts
@@ -84,13 +84,13 @@ export function createJsonErrorObject(error: TaskRunError) {
 export function correctErrorStackTrace(
   stackTrace: string,
   projectDir?: string,
-  options?: { removeFirstLine?: boolean }
+  options?: { removeFirstLine?: boolean; isDev?: boolean }
 ) {
   const [errorLine, ...traceLines] = stackTrace.split("\n");
 
   return [
     options?.removeFirstLine ? undefined : errorLine,
-    ...traceLines.map((line) => correctStackTraceLine(line, projectDir)),
+    ...traceLines.map((line) => correctStackTraceLine(line, projectDir, options?.isDev)),
   ]
     .filter(Boolean)
     .join("\n");
@@ -102,17 +102,21 @@ const LINES_TO_IGNORE = [
   /TaskExecutor/,
   /EXECUTE_TASK_RUN/,
   /@trigger.dev\/core/,
+  /packages\/core\/src\/v3/,
   /safeJsonProcess/,
   /__entryPoint.ts/,
+  /ZodIpc/,
+  /startActiveSpan/,
+  /processTicksAndRejections/,
 ];
 
-function correctStackTraceLine(line: string, projectDir?: string) {
+function correctStackTraceLine(line: string, projectDir?: string, isDev?: boolean) {
   if (LINES_TO_IGNORE.some((regex) => regex.test(line))) {
     return;
   }
 
   // Check to see if the path is inside the project directory
-  if (projectDir && !line.includes(projectDir)) {
+  if (isDev && projectDir && !line.includes(projectDir)) {
     return;
   }
 
diff --git a/packages/core/src/v3/runtime/devRuntimeManager.ts b/packages/core/src/v3/runtime/devRuntimeManager.ts
index 7df2c9335cf..6209fdcb203 100644
--- a/packages/core/src/v3/runtime/devRuntimeManager.ts
+++ b/packages/core/src/v3/runtime/devRuntimeManager.ts
@@ -80,18 +80,18 @@ export class DevRuntimeManager implements RuntimeManager {
     };
   }
 
-  resumeTask(completion: TaskRunExecutionResult, execution: TaskRunExecution): void {
-    const wait = this._taskWaits.get(execution.run.id);
+  resumeTask(completion: TaskRunExecutionResult, runId: string): void {
+    const wait = this._taskWaits.get(runId);
 
     if (!wait) {
       // We need to store the completion in case the task is awaited later
-      this._pendingCompletionNotifications.set(execution.run.id, completion);
+      this._pendingCompletionNotifications.set(runId, completion);
 
       return;
     }
 
     wait.resolve(completion);
 
-    this._taskWaits.delete(execution.run.id);
+    this._taskWaits.delete(runId);
   }
 }
diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts
index 622a44ed7db..19dc0fd8833 100644
--- a/packages/core/src/v3/runtime/prodRuntimeManager.ts
+++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts
@@ -55,10 +55,14 @@ export class ProdRuntimeManager implements RuntimeManager {
       this._waitForDuration = { resolve, reject };
     });
 
-    const { willCheckpointAndRestore } = await this.ipc.sendWithAck("WAIT_FOR_DURATION", {
-      ms,
-      now,
-    });
+    const { willCheckpointAndRestore } = await this.ipc.sendWithAck(
+      "WAIT_FOR_DURATION",
+      {
+        ms,
+        now,
+      },
+      31_000
+    );
 
     if (!willCheckpointAndRestore) {
       await internalTimeout;
@@ -74,18 +78,24 @@ export class ProdRuntimeManager implements RuntimeManager {
     // Resets the clock to the current time
     clock.reset();
 
-    // The coordinator should cancel any in-progress checkpoints
-    const { checkpointCanceled, version } = await this.ipc.sendWithAck(
-      "CANCEL_CHECKPOINT",
-      {
-        version: "v2",
-        reason: "WAIT_FOR_DURATION",
-      },
-      31_000
-    );
-
-    if (checkpointCanceled) {
-      // There won't be a checkpoint or external resume and we've already completed our internal timeout
+    try {
+      // The coordinator should cancel any in-progress checkpoints
+      const { checkpointCanceled, version } = await this.ipc.sendWithAck(
+        "CANCEL_CHECKPOINT",
+        {
+          version: "v2",
+          reason: "WAIT_FOR_DURATION",
+        },
+        31_000
+      );
+
+      if (checkpointCanceled) {
+        // There won't be a checkpoint or external resume and we've already completed our internal timeout
+        return;
+      }
+    } catch (error) {
+      // If the cancellation fails or times out, we will proceed as if the checkpoint was canceled
+      logger.debug("Checkpoint cancellation timed out", { error });
       return;
     }
 
@@ -98,19 +108,9 @@ export class ProdRuntimeManager implements RuntimeManager {
       return;
     }
 
-    process.stdout.write("pre");
-    process.stdout.write(JSON.stringify(clock.preciseNow()));
-
-    console.log("pre", clock.preciseNow());
-
     // Resets the clock to the current time
     clock.reset();
 
-    console.log("post", clock.preciseNow());
-
-    process.stdout.write("post");
-    process.stdout.write(JSON.stringify(clock.preciseNow()));
-
     this._waitForDuration.resolve("external");
     this._waitForDuration = undefined;
   }
@@ -167,8 +167,8 @@ export class ProdRuntimeManager implements RuntimeManager {
     };
   }
 
-  resumeTask(completion: TaskRunExecutionResult, execution: TaskRunExecution): void {
-    const wait = this._taskWaits.get(execution.run.id);
+  resumeTask(completion: TaskRunExecutionResult): void {
+    const wait = this._taskWaits.get(completion.id);
 
     if (!wait) {
       return;
@@ -176,7 +176,7 @@ export class ProdRuntimeManager implements RuntimeManager {
 
     wait.resolve(completion);
 
-    this._taskWaits.delete(execution.run.id);
+    this._taskWaits.delete(completion.id);
   }
 
   private get waitThresholdInMs(): number {
diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts
index 23a32737c09..0a14bb2f7e7 100644
--- a/packages/core/src/v3/schemas/api.ts
+++ b/packages/core/src/v3/schemas/api.ts
@@ -41,6 +41,7 @@ export type GetProjectEnvResponse = z.infer<typeof GetProjectEnvResponse>;
 export const CreateBackgroundWorkerRequestBody = z.object({
   localOnly: z.boolean(),
   metadata: BackgroundWorkerMetadata,
+  supportsLazyAttempts: z.boolean().optional(), // optional, so request bodies that omit the flag still validate
 });
 
 export type CreateBackgroundWorkerRequestBody = z.infer<typeof CreateBackgroundWorkerRequestBody>;
diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts
index 58ae633c737..2362748d705 100644
--- a/packages/core/src/v3/schemas/common.ts
+++ b/packages/core/src/v3/schemas/common.ts
@@ -31,6 +31,9 @@ export const TaskRunErrorCodes = {
   TASK_EXECUTION_FAILED: "TASK_EXECUTION_FAILED",
   TASK_EXECUTION_ABORTED: "TASK_EXECUTION_ABORTED",
   TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE",
+  TASK_PROCESS_SIGKILL_TIMEOUT: "TASK_PROCESS_SIGKILL_TIMEOUT",
+  // Mirrors the "TASK_RUN_HEARTBEAT_TIMEOUT" code accepted by the TaskRunInternalError schema.
+  TASK_RUN_HEARTBEAT_TIMEOUT: "TASK_RUN_HEARTBEAT_TIMEOUT",
   TASK_RUN_CANCELLED: "TASK_RUN_CANCELLED",
   TASK_OUTPUT_ERROR: "TASK_OUTPUT_ERROR",
   HANDLE_ERROR_ERROR: "HANDLE_ERROR_ERROR",
@@ -47,10 +50,12 @@ export const TaskRunInternalError = z.object({
     "TASK_EXECUTION_FAILED",
     "TASK_EXECUTION_ABORTED",
     "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE",
+    "TASK_PROCESS_SIGKILL_TIMEOUT",
     "TASK_RUN_CANCELLED",
     "TASK_OUTPUT_ERROR",
     "HANDLE_ERROR_ERROR",
     "GRACEFUL_EXIT_TIMEOUT",
+    "TASK_RUN_HEARTBEAT_TIMEOUT",
   ]),
   message: z.string().optional(),
 });
diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts
index 32c66c8082e..10c20912a10 100644
--- a/packages/core/src/v3/schemas/messages.ts
+++ b/packages/core/src/v3/schemas/messages.ts
@@ -1,11 +1,13 @@
 import { z } from "zod";
-import { TaskRunExecution, TaskRunExecutionResult } from "./common";
+import { TaskRunExecution, TaskRunExecutionResult, TaskRunFailedExecutionResult } from "./common";
+
 import {
   EnvironmentType,
   Machine,
   ProdTaskRunExecution,
   ProdTaskRunExecutionPayload,
   TaskMetadataWithFilePath,
+  TaskRunExecutionLazyAttemptPayload,
   TaskRunExecutionPayload,
   WaitReason,
 } from "./schemas";
@@ -27,13 +29,17 @@ export const BackgroundWorkerServerMessages = z.discriminatedUnion("type", [
     version: z.string(),
     machine: Machine,
     // identifiers
-    id: z.string(), // attempt
+    id: z.string().optional(), // TODO: Remove this completely in a future release
     envId: z.string(),
     envType: EnvironmentType,
     orgId: z.string(),
     projectId: z.string(),
     runId: z.string(),
   }),
+  z.object({
+    type: z.literal("EXECUTE_RUN_LAZY_ATTEMPT"), // discriminator value for this union member
+    payload: TaskRunExecutionLazyAttemptPayload, // validated by the schema imported from ./schemas
+  }),
 ]);
 
 export type BackgroundWorkerServerMessages = z.infer<typeof BackgroundWorkerServerMessages>;
@@ -57,11 +63,21 @@ export const BackgroundWorkerClientMessages = z.discriminatedUnion("type", [
     completion: TaskRunExecutionResult,
     execution: TaskRunExecution,
   }),
+  z.object({
+    version: z.literal("v1").default("v1"),
+    type: z.literal("TASK_RUN_FAILED_TO_RUN"), // carries a failed completion only; no execution object is included
+    completion: TaskRunFailedExecutionResult,
+  }),
   z.object({
     version: z.literal("v1").default("v1"),
     type: z.literal("TASK_HEARTBEAT"),
     id: z.string(),
   }),
+  z.object({
+    version: z.literal("v1").default("v1"),
+    type: z.literal("TASK_RUN_HEARTBEAT"), // same field shape as the TASK_HEARTBEAT member
+    id: z.string(), // NOTE(review): presumably the run id rather than an attempt id - confirm against the sender
+  }),
 ]);
 
 export type BackgroundWorkerClientMessages = z.infer<typeof BackgroundWorkerClientMessages>;
@@ -78,6 +94,7 @@ export const clientWebsocketMessages = {
   READY_FOR_TASKS: z.object({
     version: z.literal("v1").default("v1"),
     backgroundWorkerId: z.string(),
+    inProgressRuns: z.string().array().optional(), // optional list of strings; NOTE(review): presumably ids of runs still executing - confirm
   }),
   BACKGROUND_WORKER_DEPRECATED: z.object({
     version: z.literal("v1").default("v1"),
@@ -97,11 +114,17 @@ export const workerToChildMessages = {
     traceContext: z.record(z.unknown()),
     metadata: BackgroundWorkerProperties,
   }),
-  TASK_RUN_COMPLETED_NOTIFICATION: z.object({
-    version: z.literal("v1").default("v1"),
-    completion: TaskRunExecutionResult,
-    execution: TaskRunExecution,
-  }),
+  TASK_RUN_COMPLETED_NOTIFICATION: z.discriminatedUnion("version", [ // the version field selects the branch; neither branch declares a default for it
+    z.object({
+      version: z.literal("v1"), // v1 keeps both completion and execution
+      completion: TaskRunExecutionResult,
+      execution: TaskRunExecution,
+    }),
+    z.object({
+      version: z.literal("v2"), // v2 carries the completion only
+      completion: TaskRunExecutionResult,
+    }),
+  ]),
   CLEANUP: z.object({
     version: z.literal("v1").default("v1"),
     flush: z.boolean().default(false),
@@ -142,6 +165,10 @@ export const childToWorkerMessages = {
     version: z.literal("v1").default("v1"),
     id: z.string(),
   }),
+  TASK_RUN_HEARTBEAT: z.object({ // version defaults to "v1" when omitted
+    version: z.literal("v1").default("v1"),
+    id: z.string(), // NOTE(review): presumably the run id - confirm against the sender
+  }),
   READY_TO_DISPOSE: z.undefined(),
   WAIT_FOR_DURATION: z.object({
     version: z.literal("v1").default("v1"),
@@ -182,6 +209,12 @@ export const ProdChildToWorkerMessages = {
       id: z.string(),
     }),
   },
+  TASK_RUN_HEARTBEAT: { // message schema only; no callback schema is declared
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      id: z.string(), // NOTE(review): presumably the run id - confirm against the sender
+    }),
+  },
   READY_TO_DISPOSE: {
     message: z.undefined(),
   },
@@ -247,11 +280,17 @@ export const ProdWorkerToChildMessages = {
     }),
   },
   TASK_RUN_COMPLETED_NOTIFICATION: {
-    message: z.object({
-      version: z.literal("v1").default("v1"),
-      completion: TaskRunExecutionResult,
-      execution: TaskRunExecution,
-    }),
+    message: z.discriminatedUnion("version", [ // the version field selects the branch; neither branch declares a default for it
+      z.object({
+        version: z.literal("v1"), // v1 keeps both completion and execution
+        completion: TaskRunExecutionResult,
+        execution: TaskRunExecution,
+      }),
+      z.object({
+        version: z.literal("v2"), // v2 carries the completion only
+        completion: TaskRunExecutionResult,
+      }),
+    ]),
   },
   CLEANUP: {
     message: z.object({
@@ -379,6 +418,18 @@ export const PlatformToProviderMessages = {
   },
 };
 
+const CreateWorkerMessage = z.object({ // module-private base object; declares no version field of its own
+  projectRef: z.string(),
+  envId: z.string(),
+  deploymentId: z.string(),
+  metadata: z.object({
+    cliPackageVersion: z.string().optional(), // the only optional metadata field
+    contentHash: z.string(),
+    packageVersion: z.string(),
+    tasks: TaskResource.array(),
+  }),
+});
+
 export const CoordinatorToPlatformMessages = {
   LOG: {
     message: z.object({
@@ -388,24 +439,38 @@ export const CoordinatorToPlatformMessages = {
     }),
   },
   CREATE_WORKER: {
+    message: z.discriminatedUnion("version", [ // both branches extend CreateWorkerMessage and differ only in the fields added below
+      CreateWorkerMessage.extend({
+        version: z.literal("v1"),
+      }),
+      CreateWorkerMessage.extend({
+        version: z.literal("v2"),
+        supportsLazyAttempts: z.boolean(), // required in v2 (not optional)
+      }),
+    ]),
+    callback: z.discriminatedUnion("success", [ // the acknowledgement carries only the success flag in either branch
+      z.object({
+        success: z.literal(false),
+      }),
+      z.object({
+        success: z.literal(true),
+      }),
+    ]),
+  },
+  CREATE_TASK_RUN_ATTEMPT: { // request identified by runId and envId; the callback is discriminated on success
     message: z.object({
       version: z.literal("v1").default("v1"),
-      projectRef: z.string(),
+      runId: z.string(),
       envId: z.string(),
-      deploymentId: z.string(),
-      metadata: z.object({
-        cliPackageVersion: z.string().optional(),
-        contentHash: z.string(),
-        packageVersion: z.string(),
-        tasks: TaskResource.array(),
-      }),
     }),
     callback: z.discriminatedUnion("success", [
       z.object({
         success: z.literal(false),
+        reason: z.string().optional(), // optional free-form string on the failure branch
       }),
       z.object({
         success: z.literal(true),
+        executionPayload: ProdTaskRunExecutionPayload, // required on the success branch
       }),
     ]),
   },
@@ -425,6 +490,24 @@ export const CoordinatorToPlatformMessages = {
       }),
     ]),
   },
+  READY_FOR_LAZY_ATTEMPT: { // request/response pair: the callback is discriminated on success
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+      envId: z.string(),
+      totalCompletions: z.number(),
+    }),
+    callback: z.discriminatedUnion("success", [
+      z.object({
+        success: z.literal(false),
+        reason: z.string().optional(), // optional free-form string on the failure branch
+      }),
+      z.object({
+        success: z.literal(true),
+        lazyPayload: TaskRunExecutionLazyAttemptPayload, // required on the success branch
+      }),
+    ]),
+  },
   READY_FOR_RESUME: {
     message: z.object({
       version: z.literal("v1").default("v1"),
@@ -445,12 +528,24 @@ export const CoordinatorToPlatformMessages = {
         .optional(),
     }),
   },
+  TASK_RUN_FAILED_TO_RUN: { // message schema only; no callback schema is declared
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      completion: TaskRunFailedExecutionResult, // only the failed-result schema is accepted here
+    }),
+  },
   TASK_HEARTBEAT: {
     message: z.object({
       version: z.literal("v1").default("v1"),
       attemptFriendlyId: z.string(),
     }),
   },
+  TASK_RUN_HEARTBEAT: { // keyed by runId, whereas TASK_HEARTBEAT is keyed by attemptFriendlyId
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+    }),
+  },
   CHECKPOINT_CREATED: {
     message: z.object({
       version: z.literal("v1").default("v1"),
@@ -490,6 +585,17 @@ export const CoordinatorToPlatformMessages = {
       }),
     }),
   },
+  RUN_CRASHED: { // message schema only; no callback schema is declared
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+      error: z.object({ // plain-object error: name and message are required
+        name: z.string(),
+        message: z.string(),
+        stack: z.string().optional(), // stack may be omitted
+      }),
+    }),
+  },
 };
 
 export const PlatformToCoordinatorMessages = {
@@ -517,6 +623,13 @@ export const PlatformToCoordinatorMessages = {
       attemptFriendlyId: z.string(),
     }),
   },
+  REQUEST_RUN_CANCELLATION: { // message schema only; no callback schema is declared
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+      delayInMs: z.number().optional(), // optional number; NOTE(review): presumably a delay before the cancellation takes effect - confirm in the coordinator
+    }),
+  },
   READY_FOR_RETRY: {
     message: z.object({
       version: z.literal("v1").default("v1"),
@@ -563,6 +676,13 @@ export const SharedQueueToClientMessages = {
   },
 };
 
+const IndexTasksMessage = z.object({ // module-private base object for the INDEX_TASKS message variants
+  version: z.literal("v1"), // both INDEX_TASKS variants re-declare version via extend(), which replaces this entry
+  deploymentId: z.string(),
+  tasks: TaskResource.array(),
+  packageVersion: z.string(),
+});
+
 export const ProdWorkerToCoordinatorMessages = {
   LOG: {
     message: z.object({
@@ -572,12 +692,15 @@ export const ProdWorkerToCoordinatorMessages = {
     callback: z.void(),
   },
   INDEX_TASKS: {
-    message: z.object({
-      version: z.literal("v1").default("v1"),
-      deploymentId: z.string(),
-      tasks: TaskResource.array(),
-      packageVersion: z.string(),
-    }),
+    message: z.discriminatedUnion("version", [ // version no longer has a default, so it must be present for a branch to be selected
+      IndexTasksMessage.extend({
+        version: z.literal("v1"),
+      }),
+      IndexTasksMessage.extend({
+        version: z.literal("v2"),
+        supportsLazyAttempts: z.boolean(), // required in v2 (not optional)
+      }),
+    ]),
     callback: z.discriminatedUnion("success", [
       z.object({
         success: z.literal(false),
@@ -594,6 +717,13 @@ export const ProdWorkerToCoordinatorMessages = {
       totalCompletions: z.number(),
     }),
   },
+  READY_FOR_LAZY_ATTEMPT: { // no envId and no callback schema, unlike the coordinator-to-platform message of the same name
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+      totalCompletions: z.number(),
+    }),
+  },
   READY_FOR_RESUME: {
     message: z.object({
       version: z.literal("v1").default("v1"),
@@ -630,6 +760,12 @@ export const ProdWorkerToCoordinatorMessages = {
       attemptFriendlyId: z.string(),
     }),
   },
+  TASK_RUN_HEARTBEAT: { // keyed by runId, whereas TASK_HEARTBEAT is keyed by attemptFriendlyId
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+    }),
+  },
   TASK_RUN_COMPLETED: {
     message: z.object({
       version: z.literal("v1").default("v1"),
@@ -641,6 +777,12 @@ export const ProdWorkerToCoordinatorMessages = {
       shouldExit: z.boolean(),
     }),
   },
+  TASK_RUN_FAILED_TO_RUN: { // message schema only; no callback schema is declared
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      completion: TaskRunFailedExecutionResult, // only the failed-result schema is accepted here
+    }),
+  },
   WAIT_FOR_DURATION: {
     message: z.object({
       version: z.literal("v1").default("v1"),
@@ -686,8 +828,35 @@ export const ProdWorkerToCoordinatorMessages = {
       }),
     }),
   },
+  CREATE_TASK_RUN_ATTEMPT: { // request carries runId only; the callback is discriminated on success
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      runId: z.string(),
+    }),
+    callback: z.discriminatedUnion("success", [
+      z.object({
+        success: z.literal(false),
+        reason: z.string().optional(), // optional free-form string on the failure branch
+      }),
+      z.object({
+        success: z.literal(true),
+        executionPayload: ProdTaskRunExecutionPayload, // required on the success branch
+      }),
+    ]),
+  },
+  UNRECOVERABLE_ERROR: { // message schema only; no callback schema is declared
+    message: z.object({
+      version: z.literal("v1").default("v1"),
+      error: z.object({ // plain-object error: name and message are required
+        name: z.string(),
+        message: z.string(),
+        stack: z.string().optional(), // stack may be omitted
+      }),
+    }),
+  },
 };
 
+// TODO: The coordinator can only safely use v1 worker messages, higher versions will need a new flag, e.g. SUPPORTS_VERSIONED_MESSAGES
 export const CoordinatorToProdWorkerMessages = {
   RESUME_AFTER_DEPENDENCY: {
     message: z.object({
@@ -709,17 +878,29 @@ export const CoordinatorToProdWorkerMessages = {
       executionPayload: ProdTaskRunExecutionPayload,
     }),
   },
-  REQUEST_ATTEMPT_CANCELLATION: {
+  EXECUTE_TASK_RUN_LAZY_ATTEMPT: { // new entry inserted ahead of REQUEST_ATTEMPT_CANCELLATION, whose schema is unchanged
     message: z.object({
       version: z.literal("v1").default("v1"),
-      attemptId: z.string(),
+      lazyPayload: TaskRunExecutionLazyAttemptPayload,
     }),
   },
-  REQUEST_EXIT: {
+  REQUEST_ATTEMPT_CANCELLATION: {
     message: z.object({
       version: z.literal("v1").default("v1"),
+      attemptId: z.string(),
     }),
   },
+  REQUEST_EXIT: {
+    message: z.discriminatedUnion("version", [ // version no longer has a default, so it must be present for a branch to be selected
+      z.object({
+        version: z.literal("v1"),
+      }),
+      z.object({
+        version: z.literal("v2"),
+        delayInMs: z.number().optional(), // optional number; NOTE(review): presumably a delay before the worker exits - confirm in the worker
+      }),
+    ]),
+  },
   READY_FOR_RETRY: {
     message: z.object({
       version: z.literal("v1").default("v1"),
diff --git a/packages/core/src/v3/schemas/schemas.ts b/packages/core/src/v3/schemas/schemas.ts
index af72652d7ae..2d73b74dd50 100644
--- a/packages/core/src/v3/schemas/schemas.ts
+++ b/packages/core/src/v3/schemas/schemas.ts
@@ -224,3 +224,13 @@ export type ResolvedConfig = RequireKeys<
 export const WaitReason = z.enum(["WAIT_FOR_DURATION", "WAIT_FOR_TASK", "WAIT_FOR_BATCH"]);
 
 export type WaitReason = z.infer<typeof WaitReason>;
+
+export const TaskRunExecutionLazyAttemptPayload = z.object({ // zod schema; the inferred type below is exported under the same name
+  runId: z.string(),
+  messageId: z.string(),
+  isTest: z.boolean(),
+  traceContext: z.record(z.unknown()), // string-keyed record with unvalidated values
+  environment: z.record(z.string()).optional(), // optional string-to-string record
+});
+
+export type TaskRunExecutionLazyAttemptPayload = z.infer<typeof TaskRunExecutionLazyAttemptPayload>;
diff --git a/packages/core/src/v3/zodSocket.ts b/packages/core/src/v3/zodSocket.ts
index 1e2ae1e9e5d..964318586a4 100644
--- a/packages/core/src/v3/zodSocket.ts
+++ b/packages/core/src/v3/zodSocket.ts
@@ -1,8 +1,9 @@
 import type { Socket } from "socket.io-client";
 import { io } from "socket.io-client";
-import { z } from "zod";
+import { ZodError, z } from "zod";
 import { EventEmitterLike, ZodMessageValueSchema } from "./zodMessageHandler";
 import { LogLevel, SimpleStructuredLogger, StructuredLogger } from "./utils/structuredLogger";
+import { fromZodError } from "zod-validation-error";
 
 export interface ZodSocketMessageCatalogSchema {
   [key: string]:
@@ -81,7 +82,7 @@ export type MessagesFromSocketCatalog<TMessageCatalog extends ZodSocketMessageCa
 }[keyof TMessageCatalog];
 
 const messageSchema = z.object({
-  version: z.literal("v1").default("v1"),
+  version: z.string(), // any string is accepted and there is no default; NOTE(review): envelopes without a version now fail to parse (previously defaulted to "v1")
   type: z.string(),
   payload: z.unknown(),
 });
@@ -127,10 +128,22 @@ export class ZodSocketMessageHandler<TRPCCatalog extends ZodSocketMessageCatalog
       throw new Error(`Unknown message type: ${parsedMessage.data.type}`);
     }
 
-    const parsedPayload = schema.safeParse(parsedMessage.data.payload);
+    const messageWithVersion = { // envelope version merged into the payload, presumably so version-discriminated schemas can select a branch
+      version: parsedMessage.data.version, // NOTE(review): listed before the spread, so a version key inside the payload overrides the envelope version
+      ...(typeof parsedMessage.data.payload === "object" ? parsedMessage.data.payload : {}), // non-object payloads contribute no keys
+    };
+
+    const parsedPayload = schema.safeParse(messageWithVersion);
 
     if (!parsedPayload.success) {
-      throw new Error(`Failed to parse message payload: ${JSON.stringify(parsedPayload.error)}`);
+      console.error("Failed to parse message payload", { // logs the raw message and the merged payload before throwing
+        message,
+        payload: messageWithVersion,
+      });
+
+      throw parsedPayload.error instanceof ZodError // ZodErrors are converted with fromZodError (zod-validation-error); anything else is rethrown as-is
+        ? fromZodError(parsedPayload.error)
+        : parsedPayload.error;
     }
 
     return {
@@ -166,7 +179,15 @@ export class ZodSocketMessageHandler<TRPCCatalog extends ZodSocketMessageCatalog
             ack = await this.handleMessage({ type: eventName, version, payload });
           }
         } catch (error) {
-          log.error("Error while handling message", { error });
+          log.error("Error while handling message", {
+            error:
+              error instanceof Error // Error instances are reduced to message and stack; other thrown values are logged unchanged
+                ? {
+                    message: error.message,
+                    stack: error.stack,
+                  }
+                : error,
+          });
           return;
         }
 
diff --git a/packages/core/tsup.config.ts b/packages/core/tsup.config.ts
index f920f0982c1..b245c1803f3 100644
--- a/packages/core/tsup.config.ts
+++ b/packages/core/tsup.config.ts
@@ -15,6 +15,7 @@ export default defineConfig({
     "./src/v3/dev/index.ts",
     "./src/v3/prod/index.ts",
     "./src/v3/workers/index.ts",
+    "./src/v3/zodfetch.ts",
   ],
   external: ["node:stream"],
 });
diff --git a/packages/database/prisma/migrations/20240430101936_add_lazy_attempt_support_flag_to_workers/migration.sql b/packages/database/prisma/migrations/20240430101936_add_lazy_attempt_support_flag_to_workers/migration.sql
new file mode 100644
index 00000000000..28b3b39095a
--- /dev/null
+++ b/packages/database/prisma/migrations/20240430101936_add_lazy_attempt_support_flag_to_workers/migration.sql
@@ -0,0 +1,2 @@
+-- AlterTable: adds the NOT NULL boolean column "supportsLazyAttempts" to "BackgroundWorker", defaulting to false
+ALTER TABLE "BackgroundWorker" ADD COLUMN     "supportsLazyAttempts" BOOLEAN NOT NULL DEFAULT false;
diff --git a/packages/database/prisma/schema.prisma b/packages/database/prisma/schema.prisma
index b0c87beef05..2052f5b82c2 100644
--- a/packages/database/prisma/schema.prisma
+++ b/packages/database/prisma/schema.prisma
@@ -1563,6 +1563,8 @@ model BackgroundWorker {
 
   deployment WorkerDeployment?
 
+  supportsLazyAttempts Boolean @default(false) // non-nullable boolean; defaults to false
+
   @@unique([projectId, runtimeEnvironmentId, version])
 }
 
diff --git a/references/v3-catalog/src/trigger/lazyAttempts.ts b/references/v3-catalog/src/trigger/lazyAttempts.ts
new file mode 100644
index 00000000000..c84207b601b
--- /dev/null
+++ b/references/v3-catalog/src/trigger/lazyAttempts.ts
@@ -0,0 +1,394 @@
+import { logger, task, wait } from "@trigger.dev/sdk/v3";
+
+export const lazyImmediate = task({ // no waits: logs twice, then either throws "Forced error" or returns a message with the payload
+  id: "lazy-immediate",
+  run: async (payload: { forceError?: boolean }) => {
+    logger.info("Log something", { payload });
+    logger.info("Log something else", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyWait = task({ // one wait.for between two log lines; the other tasks in this file trigger it as their child
+  id: "lazy-wait",
+  run: async (payload: { forceError?: boolean; delayInSeconds?: number }) => {
+    logger.info("Log something", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 }); // waits 1 second when delayInSeconds is null or undefined
+
+    logger.info("Log something else", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazySingleDependency = task({ // a single lazyWait.triggerAndWait call
+  id: "lazy-single-dependency",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const result = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError, // forwarded to the child as its forceError flag
+    });
+    logger.info("Single result", { result });
+
+    logger.info("Log something else", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyBatchDependency = task({ // a single lazyWait.batchTriggerAndWait call with two identical items
+  id: "lazy-batch-dependency",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const results = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results", { results });
+
+    logger.info("Log something else", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyConsecutiveWaits = task({ // two wait.for calls in a row, with a log line between them
+  id: "lazy-consecutive-waits",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean; // not read by this task: it triggers no child runs
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 });
+
+    logger.info("Log something else", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyConsecutiveDependencies = task({ // two lazyWait.triggerAndWait calls in a row
+  id: "lazy-consecutive-dependencies",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const result = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError,
+    });
+    logger.info("Single result #1", { result });
+
+    logger.info("Log something else", { payload });
+
+    const result2 = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError,
+    });
+    logger.info("Single result #2", { result2 });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyConsecutiveBatchDependencies = task({ // two lazyWait.batchTriggerAndWait calls in a row, two items each
+  id: "lazy-consecutive-batch-dependencies",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const results = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results #1", { results });
+
+    logger.info("Log something else", { payload });
+
+    const results2 = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results #2", { results2 });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyWaitThenSingleDependency = task({ // wait.for first, then one lazyWait.triggerAndWait
+  id: "lazy-wait-then-single-dependency",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 });
+
+    logger.info("Log something else", { payload });
+
+    const result = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError,
+    });
+    logger.info("Single result", { result });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyWaitThenBatchDependency = task({ // wait.for first, then one two-item lazyWait.batchTriggerAndWait
+  id: "lazy-wait-then-batch-dependency",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 });
+
+    logger.info("Log something else", { payload });
+
+    const results = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results", { results });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazySingleDependencyThenWait = task({ // one lazyWait.triggerAndWait first, then wait.for
+  id: "lazy-single-dependency-then-wait",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const result = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError,
+    });
+    logger.info("Single result", { result });
+
+    logger.info("Log something else", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazySingleDependencyThenBatch = task({ // one lazyWait.triggerAndWait first, then a two-item batchTriggerAndWait
+  id: "lazy-single-dependency-then-batch",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const result = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError,
+    });
+    logger.info("Single result", { result });
+
+    logger.info("Log something else", { payload });
+
+    const results = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results", { results });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyBatchDependencyThenWait = task({ // a two-item lazyWait.batchTriggerAndWait first, then wait.for
+  id: "lazy-batch-dependency-then-wait",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const results = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results", { results });
+
+    logger.info("Log something else", { payload });
+
+    await wait.for({ seconds: payload.delayInSeconds ?? 1 });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
+
+export const lazyBatchDependencyThenSingle = task({ // a two-item lazyWait.batchTriggerAndWait first, then one triggerAndWait
+  id: "lazy-batch-dependency-then-single",
+  run: async (payload: {
+    forceError?: boolean;
+    forceChildError?: boolean;
+    delayInSeconds?: number;
+  }) => {
+    logger.info("Log something", { payload });
+
+    const results = await lazyWait.batchTriggerAndWait([
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+      { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } },
+    ]);
+    logger.info("Batch results", { results });
+
+    logger.info("Log something else", { payload });
+
+    const result = await lazyWait.triggerAndWait({
+      delayInSeconds: payload.delayInSeconds,
+      forceError: payload.forceChildError,
+    });
+    logger.info("Single result", { result });
+
+    logger.info("Log something else again", { payload });
+
+    if (payload.forceError) {
+      throw new Error("Forced error");
+    }
+
+    return {
+      message: "This is a message",
+      payload,
+    };
+  },
+});
diff --git a/references/v3-catalog/src/trigger/longRunning.ts b/references/v3-catalog/src/trigger/longRunning.ts
index 09462a1f192..e8119dfcdc4 100644
--- a/references/v3-catalog/src/trigger/longRunning.ts
+++ b/references/v3-catalog/src/trigger/longRunning.ts
@@ -3,7 +3,7 @@ import { logger, task } from "@trigger.dev/sdk/v3";
 export const longRunning = task({
   id: "long-running",
   run: async (payload: { message: string }) => {
-    logger.info("Long running payloadd", { payload });
+    logger.info("Long running payloadddd", { payload });
 
     // Wait for 3 minutes
     await new Promise((resolve) => setTimeout(resolve, 3 * 60 * 1000));
@@ -19,7 +19,22 @@ export const longRunningParent = task({
   run: async (payload: { message: string }) => {
     logger.info("Long running parent", { payload });
 
-    await longRunning.triggerAndWait({ message: "child" });
+    const result = await longRunning.triggerAndWait({ message: "child" });
+
+    return {
+      finished: new Date().toISOString(),
+      result,
+    };
+  },
+});
+
+export const longRunningWithDotInName = task({
+  id: "long.running.with.dot",
+  run: async (payload: { message: string }) => {
+    logger.info("Long running payloadd", { payload });
+
+    // Wait for 3 minutes
+    await new Promise((resolve) => setTimeout(resolve, 3 * 60 * 1000));
 
     return {
       finished: new Date().toISOString(),
diff --git a/references/v3-catalog/trigger.config.ts b/references/v3-catalog/trigger.config.ts
index 649a349a66f..c1f504f2eb9 100644
--- a/references/v3-catalog/trigger.config.ts
+++ b/references/v3-catalog/trigger.config.ts
@@ -37,7 +37,7 @@ export const config: TriggerConfig = {
   retries: {
     enabledInDev: true,
     default: {
-      maxAttempts: 3,
+      maxAttempts: 4,
       minTimeoutInMs: 1000,
       maxTimeoutInMs: 10000,
       factor: 2,