diff --git a/.changeset/tricky-keys-attack.md b/.changeset/tricky-keys-attack.md new file mode 100644 index 00000000000..271096497d7 --- /dev/null +++ b/.changeset/tricky-keys-attack.md @@ -0,0 +1,14 @@ +--- +"trigger.dev": patch +"@trigger.dev/core": patch +--- + +- Clear paused states before retry +- Detect and handle unrecoverable worker errors +- Remove checkpoints after successful push +- Permanently switch to DO hosted busybox image +- Fix IPC timeout issue, or at least handle it more gracefully +- Handle checkpoint failures +- Basic chaos monkey for checkpoint testing +- Stack traces are back in the dashboard +- Display final errors on root span diff --git a/.changeset/warm-olives-provide.md b/.changeset/warm-olives-provide.md new file mode 100644 index 00000000000..b57d242a52b --- /dev/null +++ b/.changeset/warm-olives-provide.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Improve handling of IPC timeouts and fix checkpoint cancellation after failures diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index e49aa04ebd0..1bdbc4bc445 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -1,5 +1,6 @@ import { createServer } from "node:http"; -import { $ } from "execa"; +import fs from "node:fs/promises"; +import { $, type ExecaChildProcess } from "execa"; import { nanoid } from "nanoid"; import { Server } from "socket.io"; import { @@ -19,6 +20,11 @@ collectDefaultMetrics(); const HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || 8020); const NODE_NAME = process.env.NODE_NAME || "coordinator"; const DEFAULT_RETRY_DELAY_THRESHOLD_IN_MS = 30_000; +const CHAOS_MONKEY_ENABLED = !!process.env.CHAOS_MONKEY_ENABLED; + +const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes( + process.env.FORCE_CHECKPOINT_SIMULATION ?? 
"true" +); const REGISTRY_HOST = process.env.REGISTRY_HOST || "localhost:5000"; const CHECKPOINT_PATH = process.env.CHECKPOINT_PATH || "/checkpoints"; @@ -32,6 +38,10 @@ const SECURE_CONNECTION = ["1", "true"].includes(process.env.SECURE_CONNECTION ? const logger = new SimpleLogger(`[${NODE_NAME}]`); +if (CHAOS_MONKEY_ENABLED) { + logger.log("🍌 Chaos monkey enabled"); +} + type CheckpointerInitializeReturn = { canCheckpoint: boolean; willSimulate: boolean; @@ -49,6 +59,40 @@ type CheckpointData = { docker: boolean; }; +function isExecaChildProcess(maybeExeca: unknown): maybeExeca is Awaited { + return typeof maybeExeca === "object" && maybeExeca !== null && "escapedCommand" in maybeExeca; +} + +async function getFileSize(filePath: string): Promise { + try { + const stats = await fs.stat(filePath); + return stats.size; + } catch (error) { + console.error("Error getting file size:", error); + return -1; + } +} + +async function getParsedFileSize(filePath: string) { + const sizeInBytes = await getFileSize(filePath); + + let message = `Size in bytes: ${sizeInBytes}`; + + if (sizeInBytes > 1024 * 1024) { + const sizeInMB = (sizeInBytes / 1024 / 1024).toFixed(2); + message = `Size in MB (rounded): ${sizeInMB}`; + } else if (sizeInBytes > 1024) { + const sizeInKB = (sizeInBytes / 1024).toFixed(2); + message = `Size in KB (rounded): ${sizeInKB}`; + } + + return { + path: filePath, + sizeInBytes, + message, + }; +} + class Checkpointer { #initialized = false; #canCheckpoint = false; @@ -56,6 +100,7 @@ class Checkpointer { #logger = new SimpleLogger("[checkptr]"); #abortControllers = new Map(); + #failedCheckpoints = new Map(); constructor(private opts = { forceSimulate: false }) {} @@ -150,7 +195,11 @@ class Checkpointer { success: !!result, }); - return result; + if (!result.success) { + return; + } + + return result.checkpoint; } isCheckpointing(runId: string) { @@ -158,6 +207,13 @@ class Checkpointer { } cancelCheckpoint(runId: string): boolean { + // If the last 
checkpoint failed, pretend we canceled it + // This ensures tasks don't wait for external resume messages to continue + if (this.#hasFailedCheckpoint(runId)) { + this.#clearFailedCheckpoint(runId); + return true; + } + const controller = this.#abortControllers.get(runId); if (!controller) { @@ -176,44 +232,58 @@ class Checkpointer { leaveRunning = true, // This mirrors kubernetes behaviour more accurately projectRef, deploymentVersion, - }: CheckpointAndPushOptions): Promise { + }: CheckpointAndPushOptions): Promise< + { success: true; checkpoint: CheckpointData } | { success: false; reason?: "CANCELED" } + > { await this.initialize(); + const options = { + runId, + leaveRunning, + projectRef, + deploymentVersion, + }; + if (!this.#dockerMode && !this.#canCheckpoint) { this.#logger.error("No checkpoint support. Simulation requires docker."); - return; + return { success: false }; } if (this.#abortControllers.has(runId)) { - logger.error("Checkpoint procedure already in progress", { - options: { - runId, - leaveRunning, - projectRef, - deploymentVersion, - }, - }); - return; + logger.error("Checkpoint procedure already in progress", { options }); + return { success: false }; } + // This is a new checkpoint, clear any last failure for this run + this.#clearFailedCheckpoint(runId); + const controller = new AbortController(); this.#abortControllers.set(runId, controller); const $$ = $({ signal: controller.signal }); try { + if (CHAOS_MONKEY_ENABLED) { + console.log("🍌 Chaos monkey wreaking havoc"); + + const random = Math.random(); + + if (random < 0.33) { + // Fake long checkpoint duration + await $$`sleep 300`; + } else if (random < 0.66) { + // Fake checkpoint error + await $$`false`; + } else { + // no-op + } + } + const shortCode = nanoid(8); const imageRef = this.#getImageRef(projectRef, deploymentVersion, shortCode); const exportLocation = this.#getExportLocation(projectRef, deploymentVersion, shortCode); - this.#logger.log("Checkpointing:", { - options: { - 
runId, - leaveRunning, - projectRef, - deploymentVersion, - }, - }); + this.#logger.log("Checkpointing:", { options }); const containterName = this.#getRunContainerName(runId); @@ -234,9 +304,9 @@ class Checkpointer { ); } } - } catch (error: any) { - this.#logger.error(error.stderr); - return; + } catch (error) { + this.#logger.error("Failed while creating docker checkpoint", { exportLocation }); + throw error; } this.#logger.log("checkpoint created:", { @@ -245,8 +315,11 @@ class Checkpointer { }); return { - location: exportLocation, - docker: true, + success: true, + checkpoint: { + location: exportLocation, + docker: true, + }, }; } @@ -266,54 +339,104 @@ class Checkpointer { throw new Error("could not find container id"); } + const start = performance.now(); + + // Create checkpoint this.#logger.debug(await $$`crictl checkpoint --export=${exportLocation} ${containerId}`); + const postCheckpoint = performance.now(); + + // Print checkpoint size + const size = await getParsedFileSize(exportLocation); + this.#logger.log("checkpoint archive created", { size, options }); // Create image from checkpoint const container = this.#logger.debug(await $$`buildah from scratch`); + const postFrom = performance.now(); + this.#logger.debug(await $$`buildah add ${container} ${exportLocation} /`); + const postAdd = performance.now(); + this.#logger.debug( await $$`buildah config --annotation=io.kubernetes.cri-o.annotations.checkpoint.name=counter ${container}` ); + const postConfig = performance.now(); + this.#logger.debug(await $$`buildah commit ${container} ${imageRef}`); + const postCommit = performance.now(); + this.#logger.debug(await $$`buildah rm ${container}`); + const postRm = performance.now(); // Push checkpoint image this.#logger.debug(await $$`buildah push --tls-verify=${REGISTRY_TLS_VERIFY} ${imageRef}`); + const postPush = performance.now(); + + const perf = { + "crictl checkpoint": postCheckpoint - start, + "buildah from": postFrom - postCheckpoint, + "buildah 
add": postAdd - postFrom, + "buildah config": postConfig - postAdd, + "buildah commit": postCommit - postConfig, + "buildah rm": postRm - postCommit, + "buildah push": postPush - postRm, + }; - this.#logger.log("Checkpointed and pushed image to:", { location: imageRef }); + this.#logger.log("Checkpointed and pushed image to:", { location: imageRef, perf }); try { await $$`rm ${exportLocation}`; this.#logger.log("Deleted checkpoint archive", { exportLocation }); - // Disabled for now as this will increase restore time by having to pull the image again - // await $`buildah rmi ${imageRef}`; - // this.#logger.log("Deleted checkpoint image", { imageRef }); + await $`buildah rmi ${imageRef}`; + this.#logger.log("Deleted checkpoint image", { imageRef }); } catch (error) { this.#logger.error("Failed during checkpoint cleanup", { exportLocation }); - this.#logger.debug(error); + throw error; } return { - location: imageRef, - docker: false, + success: true, + checkpoint: { + location: imageRef, + docker: false, + }, }; } catch (error) { - this.#logger.error("checkpoint failed", { - options: { - runId, - leaveRunning, - projectRef, - deploymentVersion, - }, - error, - }); - return; + if (isExecaChildProcess(error)) { + if (error.isCanceled) { + this.#logger.error("Checkpoint canceled", { options, error }); + + return { success: false, reason: "CANCELED" }; + } + + // Everything that's not a cancellation is a failure + this.#failCheckpoint(runId, error); + this.#logger.error("Checkpoint command error", { options, error }); + + return { success: false }; + } + + this.#failCheckpoint(runId, error); + this.#logger.error("Unhandled checkpoint error", { options, error }); + + return { success: false }; } finally { this.#abortControllers.delete(runId); } } + #failCheckpoint(runId: string, error: unknown) { + this.#failedCheckpoints.set(runId, error); + } + + #clearFailedCheckpoint(runId: string) { + this.#failedCheckpoints.delete(runId); + } + + #hasFailedCheckpoint(runId: string) 
{ + return this.#failedCheckpoints.has(runId); + } + #getRunContainerName(suffix: string) { return `task-run-${suffix}`; } @@ -321,7 +444,7 @@ class Checkpointer { class TaskCoordinator { #httpServer: ReturnType; - #checkpointer = new Checkpointer({ forceSimulate: true }); + #checkpointer = new Checkpointer({ forceSimulate: FORCE_CHECKPOINT_SIMULATION }); #prodWorkerNamespace: ZodNamespace< typeof ProdWorkerToCoordinatorMessages, @@ -442,6 +565,28 @@ class TaskCoordinator { taskSocket.emit("REQUEST_ATTEMPT_CANCELLATION", message); }, + REQUEST_RUN_CANCELLATION: async (message) => { + const taskSocket = await this.#getRunSocket(message.runId); + + if (!taskSocket) { + logger.log("Socket for run not found", { + runId: message.runId, + }); + return; + } + + if (message.delayInMs) { + taskSocket.emit("REQUEST_EXIT", { + version: "v2", + delayInMs: message.delayInMs, + }); + } else { + // If there's no delay, assume the worker doesn't support non-v1 messages + taskSocket.emit("REQUEST_EXIT", { + version: "v1", + }); + } + }, READY_FOR_RETRY: async (message) => { const taskSocket = await this.#getRunSocket(message.runId); @@ -528,6 +673,20 @@ class TaskCoordinator { onConnection: async (socket, handler, sender) => { const logger = new SimpleLogger(`[prod-worker][${socket.id}]`); + const crashRun = async (error: { name: string; message: string; stack?: string }) => { + try { + this.#platformSocket?.send("RUN_CRASHED", { + version: "v1", + runId: socket.data.runId, + error, + }); + } finally { + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + } + }; + const checkpointInProgress = () => { return this.#checkpointableTasks.has(socket.data.runId); }; @@ -596,8 +755,9 @@ class TaskCoordinator { if (!executionAck) { logger.error("no execution ack", { runId: socket.data.runId }); - socket.emit("REQUEST_EXIT", { - version: "v1", + await crashRun({ + name: "ReadyForExecutionError", + message: "No execution ack", }); return; @@ -606,8 +766,9 @@ class TaskCoordinator { if 
(!executionAck.success) { logger.error("failed to get execution payload", { runId: socket.data.runId }); - socket.emit("REQUEST_EXIT", { - version: "v1", + await crashRun({ + name: "ReadyForExecutionError", + message: "Failed to get execution payload", }); return; @@ -624,6 +785,46 @@ class TaskCoordinator { } }); + socket.on("READY_FOR_LAZY_ATTEMPT", async (message) => { + logger.log("[READY_FOR_LAZY_ATTEMPT]", message); + + try { + const lazyAttempt = await this.#platformSocket?.sendWithAck("READY_FOR_LAZY_ATTEMPT", { + ...message, + envId: socket.data.envId, + }); + + if (!lazyAttempt) { + logger.error("no lazy attempt ack", { runId: socket.data.runId }); + + await crashRun({ + name: "ReadyForLazyAttemptError", + message: "No lazy attempt ack", + }); + + return; + } + + if (!lazyAttempt.success) { + logger.error("failed to get lazy attempt payload", { runId: socket.data.runId }); + + await crashRun({ + name: "ReadyForLazyAttemptError", + message: "Failed to get lazy attempt payload", + }); + + return; + } + + socket.emit("EXECUTE_TASK_RUN_LAZY_ATTEMPT", { + version: "v1", + lazyPayload: lazyAttempt.lazyPayload, + }); + } catch (error) { + logger.error("Error", { error }); + } + }); + socket.on("READY_FOR_RESUME", async (message) => { logger.log("[READY_FOR_RESUME]", message); @@ -714,6 +915,19 @@ class TaskCoordinator { } }); + socket.on("TASK_RUN_FAILED_TO_RUN", async ({ completion }) => { + logger.log("completed task", { completionId: completion.id }); + + this.#platformSocket?.send("TASK_RUN_FAILED_TO_RUN", { + version: "v1", + completion, + }); + + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + }); + socket.on("READY_FOR_CHECKPOINT", async (message) => { logger.log("[READY_FOR_CHECKPOINT]", message); @@ -890,7 +1104,7 @@ class TaskCoordinator { logger.log("[INDEX_TASKS]", message); const workerAck = await this.#platformSocket?.sendWithAck("CREATE_WORKER", { - version: "v1", + version: "v2", projectRef: socket.data.projectRef, envId: 
socket.data.envId, deploymentId: message.deploymentId, @@ -899,6 +1113,7 @@ class TaskCoordinator { packageVersion: message.packageVersion, tasks: message.tasks, }, + supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts, }); if (!workerAck) { @@ -917,6 +1132,34 @@ class TaskCoordinator { error: message.error, }); }); + + socket.on("CREATE_TASK_RUN_ATTEMPT", async (message, callback) => { + logger.log("[CREATE_TASK_RUN_ATTEMPT]", message); + + const createAttempt = await this.#platformSocket?.sendWithAck("CREATE_TASK_RUN_ATTEMPT", { + runId: message.runId, + envId: socket.data.envId, + }); + + if (!createAttempt?.success) { + logger.debug("no ack while creating attempt", message); + callback({ success: false }); + return; + } + + socket.data.attemptFriendlyId = createAttempt.executionPayload.execution.attempt.id; + + callback({ + success: true, + executionPayload: createAttempt.executionPayload, + }); + }); + + socket.on("UNRECOVERABLE_ERROR", async (message) => { + logger.log("[UNRECOVERABLE_ERROR]", message); + + await crashRun(message.error); + }); }, onDisconnect: async (socket, handler, sender, logger) => { this.#platformSocket?.send("LOG", { @@ -928,13 +1171,16 @@ class TaskCoordinator { TASK_HEARTBEAT: async (message) => { this.#platformSocket?.send("TASK_HEARTBEAT", message); }, + TASK_RUN_HEARTBEAT: async (message) => { + this.#platformSocket?.send("TASK_RUN_HEARTBEAT", message); + }, }, }); return provider; } - #cancelCheckpoint(runId: string) { + #cancelCheckpoint(runId: string): boolean { const checkpointWait = this.#checkpointableTasks.get(runId); if (checkpointWait) { diff --git a/apps/docker-provider/.env.example b/apps/docker-provider/.env.example index 2d24f79c8ca..75c54083d1a 100644 --- a/apps/docker-provider/.env.example +++ b/apps/docker-provider/.env.example @@ -4,6 +4,8 @@ PLATFORM_WS_PORT=3030 PLATFORM_SECRET=provider-secret SECURE_CONNECTION=false +OTEL_EXPORTER_OTLP_ENDPOINT=http://0.0.0.0:3030/otel + # Use this if 
you are on macOS # COORDINATOR_HOST="host.docker.internal" # OTEL_EXPORTER_OTLP_ENDPOINT="http://host.docker.internal:4318" \ No newline at end of file diff --git a/apps/docker-provider/src/index.ts b/apps/docker-provider/src/index.ts index f1f945853c4..a5e588956c8 100644 --- a/apps/docker-provider/src/index.ts +++ b/apps/docker-provider/src/index.ts @@ -13,9 +13,14 @@ import { PostStartCauses, PreStopCauses } from "@trigger.dev/core/v3"; const MACHINE_NAME = process.env.MACHINE_NAME || "local"; const COORDINATOR_PORT = process.env.COORDINATOR_PORT || 8020; const COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1"; + const OTEL_EXPORTER_OTLP_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || "http://0.0.0.0:4318"; +const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes( + process.env.FORCE_CHECKPOINT_SIMULATION ?? "true" +); + const logger = new SimpleLogger(`[${MACHINE_NAME}]`); type InitializeReturn = { @@ -278,7 +283,7 @@ class DockerTaskOperations implements TaskOperations { } const provider = new ProviderShell({ - tasks: new DockerTaskOperations({ forceSimulate: true }), + tasks: new DockerTaskOperations({ forceSimulate: FORCE_CHECKPOINT_SIMULATION }), type: "docker", }); diff --git a/apps/kubernetes-provider/src/index.ts b/apps/kubernetes-provider/src/index.ts index 8981c85d928..53870d36d5b 100644 --- a/apps/kubernetes-provider/src/index.ts +++ b/apps/kubernetes-provider/src/index.ts @@ -212,7 +212,7 @@ class KubernetesTaskOperations implements TaskOperations { }, { name: "populate-taskinfo", - image: "docker.io/library/busybox", + image: "registry.digitalocean.com/trigger/busybox", imagePullPolicy: "IfNotPresent", command: ["/bin/sh", "-c"], args: ["printenv COORDINATOR_HOST | tee /etc/taskinfo/coordinator-host"], diff --git a/apps/webapp/app/routes/admin.api.v1.marqs.ts b/apps/webapp/app/routes/admin.api.v1.marqs.ts new file mode 100644 index 00000000000..14a9fd409e8 --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.marqs.ts @@ -0,0 
+1,31 @@ +import { LoaderFunctionArgs, json } from "@remix-run/server-runtime"; +import { prisma } from "~/db.server"; +import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server"; +import { marqs } from "~/v3/marqs/index.server"; + +export async function loader({ request, params }: LoaderFunctionArgs) { + // Next authenticate the request + const authenticationResult = await authenticateApiRequestWithPersonalAccessToken(request); + + if (!authenticationResult) { + return json({ error: "Invalid or Missing API key" }, { status: 401 }); + } + + const user = await prisma.user.findUnique({ + where: { + id: authenticationResult.userId, + }, + }); + + if (!user) { + return json({ error: "Invalid or Missing API key" }, { status: 401 }); + } + + if (!user.admin) { + return json({ error: "You must be an admin to perform this action" }, { status: 403 }); + } + + const details = await marqs?.getSharedQueueDetails(); + + return json(details); +} diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts new file mode 100644 index 00000000000..9c2845f6a52 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts @@ -0,0 +1,45 @@ +import type { ActionFunctionArgs } from "@remix-run/server-runtime"; +import { json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; +import { CreateTaskRunAttemptService } from "~/v3/services/createTaskRunAttempt.server"; + +const ParamsSchema = z.object({ + /* This is the run friendly ID */ + runParam: z.string(), +}); + +export async function action({ request, params }: ActionFunctionArgs) { + // Authenticate the request + const authenticationResult = await authenticateApiRequest(request); + + if (!authenticationResult) { + return json({ error: "Invalid or Missing 
API Key" }, { status: 401 }); + } + + const parsed = ParamsSchema.safeParse(params); + + if (!parsed.success) { + return json({ error: "Invalid or missing run ID" }, { status: 400 }); + } + + const { runParam } = parsed.data; + + const service = new CreateTaskRunAttemptService(); + + try { + const { execution } = await service.call(runParam, authenticationResult.environment); + + return json(execution, { status: 200 }); + } catch (error) { + if (error instanceof ServiceValidationError) { + return json({ error: error.message }, { status: error.status ?? 422 }); + } + + return json( + { error: error instanceof Error ? error.message : "Internal Server Error" }, + { status: 500 } + ); + } +} diff --git a/apps/webapp/app/services/worker.server.ts b/apps/webapp/app/services/worker.server.ts index d592482f0d3..0b03fc27d39 100644 --- a/apps/webapp/app/services/worker.server.ts +++ b/apps/webapp/app/services/worker.server.ts @@ -44,6 +44,8 @@ import { GraphileMigrationHelperService } from "./db/graphileMigrationHelper.ser import { PerformBulkActionService } from "~/v3/services/bulk/performBulkAction.server"; import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server"; import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server"; +import { RequeueTaskRunService } from "~/v3/requeueTaskRun.server"; +import { RetryAttemptService } from "~/v3/services/retryAttempt.server"; const workerCatalog = { indexEndpoint: z.object({ @@ -158,6 +160,12 @@ const workerCatalog = { "v3.performBulkActionItem": z.object({ bulkActionItemId: z.string(), }), + "v3.requeueTaskRun": z.object({ + runId: z.string(), + }), + "v3.retryAttempt": z.object({ + runId: z.string(), + }), }; const executionWorkerCatalog = { @@ -600,6 +608,24 @@ function getWorkerQueue() { await service.performBulkActionItem(payload.bulkActionItemId); }, }, + "v3.requeueTaskRun": { + priority: 0, + maxAttempts: 3, + handler: async (payload, job) => { + const service = new RequeueTaskRunService(); + + 
await service.call(payload.runId); + }, + }, + "v3.retryAttempt": { + priority: 0, + maxAttempts: 3, + handler: async (payload, job) => { + const service = new RetryAttemptService(); + + return await service.call(payload.runId); + }, + }, }, }); } diff --git a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts index 209954f34e6..79ce1716127 100644 --- a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts +++ b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts @@ -54,7 +54,10 @@ export class AuthenticatedSocketConnection { schema: clientWebsocketMessages, messages: { READY_FOR_TASKS: async (payload) => { - await this._consumer.registerBackgroundWorker(payload.backgroundWorkerId); + await this._consumer.registerBackgroundWorker( + payload.backgroundWorkerId, + payload.inProgressRuns ?? [] + ); }, BACKGROUND_WORKER_DEPRECATED: async (payload) => { await this._consumer.deprecateBackgroundWorker(payload.backgroundWorkerId); @@ -69,10 +72,22 @@ export class AuthenticatedSocketConnection { ); break; } + case "TASK_RUN_FAILED_TO_RUN": { + await this._consumer.taskRunFailed( + payload.backgroundWorkerId, + payload.data.completion + ); + + break; + } case "TASK_HEARTBEAT": { await this._consumer.taskHeartbeat(payload.backgroundWorkerId, payload.data.id); break; } + case "TASK_RUN_HEARTBEAT": { + await this._consumer.taskRunHeartbeat(payload.backgroundWorkerId, payload.data.id); + break; + } } }, }, diff --git a/apps/webapp/app/v3/eventRepository.server.ts b/apps/webapp/app/v3/eventRepository.server.ts index 324975e51b9..4a25d2dbd78 100644 --- a/apps/webapp/app/v3/eventRepository.server.ts +++ b/apps/webapp/app/v3/eventRepository.server.ts @@ -10,6 +10,7 @@ import { SpanEvents, SpanMessagingEvent, TaskEventStyle, + TaskRunError, correctErrorStackTrace, createPacketAttributesAsJson, flattenAttributes, @@ -117,6 +118,7 @@ export type QueriedEvent = Prisma.TaskEventGetPayload<{ isCancelled: 
true; level: true; events: true; + environmentType: true; }; }>; @@ -156,6 +158,7 @@ export type SpanSummary = { isPartial: boolean; isCancelled: boolean; level: NonNullable; + environmentType: CreatableEventEnvironmentType; }; }; @@ -165,6 +168,7 @@ export type UpdateEventOptions = { attributes: TraceAttributes; endTime?: Date; immediate?: boolean; + events?: SpanEvents; }; export class EventRepository { @@ -239,7 +243,7 @@ export class EventRepository { isCancelled: false, status: options?.attributes.isError ? "ERROR" : "OK", links: event.links ?? [], - events: event.events ?? [], + events: event.events ?? (options?.events as any) ?? [], duration: calculateDurationFromStart(event.startTime, options?.endTime), properties: event.properties as Attributes, metadata: event.metadata as Attributes, @@ -386,6 +390,7 @@ export class EventRepository { isCancelled: true, level: true, events: true, + environmentType: true, }, where: { traceId, @@ -421,6 +426,7 @@ export class EventRepository { startTime: getDateFromNanoseconds(event.startTime), level: event.level, events: event.events, + environmentType: event.environmentType, }, }; }); @@ -505,7 +511,11 @@ export class EventRepository { }); } - const events = transformEvents(span.data.events, fullEvent.metadata as Attributes); + const events = transformEvents( + span.data.events, + fullEvent.metadata as Attributes, + traceSummary?.rootSpan.data.environmentType === "DEVELOPMENT" + ); return { ...fullEvent, @@ -877,6 +887,36 @@ export function stripAttributePrefix(attributes: Attributes, prefix: string) { return result; } +export function createExceptionPropertiesFromError(error: TaskRunError): ExceptionEventProperties { + switch (error.type) { + case "BUILT_IN_ERROR": { + return { + type: error.name, + message: error.message, + stacktrace: error.stackTrace, + }; + } + case "CUSTOM_ERROR": { + return { + type: "Error", + message: error.raw, + }; + } + case "INTERNAL_ERROR": { + return { + type: "Internal error", + message: 
[error.code, error.message].filter(Boolean).join(": "), + }; + } + case "STRING_ERROR": { + return { + type: "Error", + message: error.raw, + }; + } + } +} + /** * Filters out partial events from a batch of creatable events, excluding those that have a corresponding full event. * @param batch - The batch of creatable events to filter. @@ -1097,16 +1137,16 @@ function removePrivateProperties( return result; } -function transformEvents(events: SpanEvents, properties: Attributes): SpanEvents { - return (events ?? []).map((event) => transformEvent(event, properties)); +function transformEvents(events: SpanEvents, properties: Attributes, isDev: boolean): SpanEvents { + return (events ?? []).map((event) => transformEvent(event, properties, isDev)); } -function transformEvent(event: SpanEvent, properties: Attributes): SpanEvent { +function transformEvent(event: SpanEvent, properties: Attributes, isDev: boolean): SpanEvent { if (isExceptionSpanEvent(event)) { return { ...event, properties: { - exception: transformException(event.properties.exception, properties), + exception: transformException(event.properties.exception, properties, isDev), }, }; } @@ -1116,11 +1156,12 @@ function transformEvent(event: SpanEvent, properties: Attributes): SpanEvent { function transformException( exception: ExceptionEventProperties, - properties: Attributes + properties: Attributes, + isDev: boolean ): ExceptionEventProperties { const projectDirAttributeValue = properties[SemanticInternalAttributes.PROJECT_DIR]; - if (typeof projectDirAttributeValue !== "string") { + if (projectDirAttributeValue !== undefined && typeof projectDirAttributeValue !== "string") { return exception; } @@ -1129,6 +1170,7 @@ function transformException( stacktrace: exception.stacktrace ? 
correctErrorStackTrace(exception.stacktrace, projectDirAttributeValue, { removeFirstLine: true, + isDev, }) : undefined, }; diff --git a/apps/webapp/app/v3/failedTaskRun.server.ts b/apps/webapp/app/v3/failedTaskRun.server.ts new file mode 100644 index 00000000000..79594e73cba --- /dev/null +++ b/apps/webapp/app/v3/failedTaskRun.server.ts @@ -0,0 +1,66 @@ +import { TaskRunFailedExecutionResult } from "@trigger.dev/core/v3"; +import { logger } from "~/services/logger.server"; +import { marqs } from "~/v3/marqs/index.server"; + +import { TaskRunStatus } from "@trigger.dev/database"; +import { createExceptionPropertiesFromError, eventRepository } from "./eventRepository.server"; +import { BaseService } from "./services/baseService.server"; + +const FAILABLE_TASK_RUN_STATUSES: TaskRunStatus[] = ["EXECUTING", "PENDING", "WAITING_FOR_DEPLOY"]; + +export class FailedTaskRunService extends BaseService { + public async call(runFriendlyId: string, completion: TaskRunFailedExecutionResult) { + const taskRun = await this._prisma.taskRun.findUnique({ + where: { friendlyId: runFriendlyId }, + }); + + if (!taskRun) { + logger.error("[FailedTaskRunService] Task run not found", { + runFriendlyId, + completion, + }); + + return; + } + + if (!FAILABLE_TASK_RUN_STATUSES.includes(taskRun.status)) { + logger.error("[FailedTaskRunService] Task run is not in a failable state", { + taskRun, + completion, + }); + + return; + } + + // No more retries, we need to fail the task run + logger.debug("[FailedTaskRunService] Failing task run", { taskRun, completion }); + + await marqs?.acknowledgeMessage(taskRun.id); + + // Now we need to "complete" the task run event/span + await eventRepository.completeEvent(taskRun.spanId, { + endTime: new Date(), + attributes: { + isError: true, + }, + events: [ + { + name: "exception", + time: new Date(), + properties: { + exception: createExceptionPropertiesFromError(completion.error), + }, + }, + ], + }); + + await this._prisma.taskRun.update({ + where: { + 
id: taskRun.id, + }, + data: { + status: "SYSTEM_FAILURE", + }, + }); + } +} diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts index 6c9bb340207..ca731f202d9 100644 --- a/apps/webapp/app/v3/handleSocketIo.server.ts +++ b/apps/webapp/app/v3/handleSocketIo.server.ts @@ -22,6 +22,7 @@ import { DeploymentIndexFailed } from "./services/deploymentIndexFailed.server"; import { Redis } from "ioredis"; import { createAdapter } from "@socket.io/redis-adapter"; import { CrashTaskRunService } from "./services/crashTaskRun.server"; +import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server"; export const socketIo = singleton("socketIo", initalizeIoServer); @@ -91,6 +92,23 @@ function createCoordinatorNamespace(io: Server) { return { success: true, payload }; } }, + READY_FOR_LAZY_ATTEMPT: async (message) => { + try { + const payload = await sharedQueueTasks.getLazyAttemptPayload( + message.envId, + message.runId + ); + + if (!payload) { + logger.error("Failed to retrieve lazy attempt payload", message); + return { success: false, reason: "Failed to retrieve payload" }; + } + + return { success: true, lazyPayload: payload }; + } catch (error) { + return { success: false }; + } + }, READY_FOR_RESUME: async (message) => { const resumeAttempt = new ResumeAttemptService(); await resumeAttempt.call(message); @@ -103,9 +121,15 @@ function createCoordinatorNamespace(io: Server) { checkpoint: message.checkpoint, }); }, + TASK_RUN_FAILED_TO_RUN: async (message) => { + await sharedQueueTasks.taskRunFailed(message.completion); + }, TASK_HEARTBEAT: async (message) => { await sharedQueueTasks.taskHeartbeat(message.attemptFriendlyId); }, + TASK_RUN_HEARTBEAT: async (message) => { + await sharedQueueTasks.taskRunHeartbeat(message.runId); + }, CHECKPOINT_CREATED: async (message) => { const createCheckpoint = new CreateCheckpointService(); await createCheckpoint.call(message); @@ -123,6 +147,7 @@ function 
createCoordinatorNamespace(io: Server) { const worker = await service.call(message.projectRef, environment, message.deploymentId, { localOnly: false, metadata: message.metadata, + supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts, }); return { success: !!worker }; @@ -131,13 +156,52 @@ function createCoordinatorNamespace(io: Server) { return { success: false }; } }, + CREATE_TASK_RUN_ATTEMPT: async (message) => { + try { + const environment = await findEnvironmentById(message.envId); + + if (!environment) { + logger.error("Environment not found", { id: message.envId }); + return { success: false, reason: "Environment not found" }; + } + + const service = new CreateTaskRunAttemptService(); + const { attempt } = await service.call(message.runId, environment, false); + + const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt(attempt.id, true); + + if (!payload) { + logger.error("Failed to retrieve payload after attempt creation", { + id: message.envId, + }); + return { success: false, reason: "Failed to retrieve payload" }; + } + + return { success: true, executionPayload: payload }; + } catch (error) { + logger.error("Error while creating attempt", { error }); + return { success: false }; + } + }, INDEXING_FAILED: async (message) => { try { const service = new DeploymentIndexFailed(); await service.call(message.deploymentId, message.error); } catch (e) { - logger.error("Error while indexing", { error: e }); + logger.error("Error while processing index failure", { error: e }); + } + }, + RUN_CRASHED: async (message) => { + try { + const service = new CrashTaskRunService(); + + await service.call(message.runId, { + reason: `${message.error.name}: ${message.error.message}`, + logs: message.error.stack, + }); + } catch (e) { + logger.error("Error while processing run failure", { error: e }); } }, }, diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts index 
d26759ec2a2..fc1a5ed833a 100644 --- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts @@ -1,8 +1,10 @@ import { Context, ROOT_CONTEXT, Span, SpanKind, context, trace } from "@opentelemetry/api"; import { TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, + TaskRunFailedExecutionResult, serverWebsocketMessages, } from "@trigger.dev/core/v3"; import { ZodMessageSender } from "@trigger.dev/core/v3/zodMessageHandler"; @@ -14,16 +16,16 @@ import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { marqs, sanitizeQueueName } from "~/v3/marqs/index.server"; import { EnvironmentVariablesRepository } from "../environmentVariables/environmentVariablesRepository.server"; -import { generateFriendlyId } from "../friendlyIdentifiers"; -import { CancelAttemptService } from "../services/cancelAttempt.server"; import { CancelTaskRunService } from "../services/cancelTaskRun.server"; import { CompleteAttemptService } from "../services/completeAttempt.server"; +import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server"; import { SEMINTATTRS_FORCE_RECORDING, attributesFromAuthenticatedEnv, tracer, } from "../tracer.server"; import { DevSubscriber, devPubSub } from "./devPubSub.server"; +import { FailedTaskRunService } from "../failedTaskRun.server"; const MessageBody = z.discriminatedUnion("type", [ z.object({ @@ -54,7 +56,6 @@ export class DevQueueConsumer { private _taskSuccesses: number = 0; private _currentSpan: Span | undefined; private _endSpanInNextIteration = false; - private _inProgressAttempts: Map = new Map(); // Keys are task attempt friendly IDs, values are TaskRun ids/queue message ids private _inProgressRuns: Map = new Map(); // Keys are task run friendly IDs, values are TaskRun internal ids/queue message ids constructor( @@ -78,7 +79,7 @@ export class 
DevQueueConsumer { this._backgroundWorkers.delete(id); } - public async registerBackgroundWorker(id: string) { + public async registerBackgroundWorker(id: string, inProgressRuns: string[] = []) { const backgroundWorker = await prisma.backgroundWorker.findUnique({ where: { friendlyId: id, runtimeEnvironmentId: this.env.id }, include: { @@ -96,7 +97,10 @@ export class DevQueueConsumer { this._backgroundWorkers.set(backgroundWorker.id, backgroundWorker); - logger.debug("Registered background worker", { backgroundWorker: backgroundWorker.id }); + logger.debug("Registered background worker", { + backgroundWorker: backgroundWorker.id, + inProgressRuns, + }); const subscriber = await devPubSub.subscribe(`backgroundWorker:${backgroundWorker.id}:*`); @@ -113,6 +117,10 @@ export class DevQueueConsumer { this._backgroundWorkerSubscriber.set(backgroundWorker.id, subscriber); + for (const runId of inProgressRuns) { + this._inProgressRuns.set(runId, runId); + } + // Start reading from the queue if we haven't already await this.#enable(); } @@ -122,15 +130,16 @@ export class DevQueueConsumer { completion: TaskRunExecutionResult, execution: TaskRunExecution ) { - this._inProgressAttempts.delete(execution.attempt.id); - if (completion.ok) { this._taskSuccesses++; } else { this._taskFailures++; } - logger.debug("Task run completed", { taskRunCompletion: completion, execution }); + logger.debug("[DevQueueConsumer] taskAttemptCompleted()", { + taskRunCompletion: completion, + execution, + }); const service = new CompleteAttemptService(); const result = await service.call({ completion, execution, env: this.env }); @@ -140,7 +149,24 @@ export class DevQueueConsumer { } } + public async taskRunFailed(workerId: string, completion: TaskRunFailedExecutionResult) { + this._taskFailures++; + + logger.debug("[DevQueueConsumer] taskRunFailed()", { completion }); + + this._inProgressRuns.delete(completion.id); + + const service = new FailedTaskRunService(); + + await service.call(completion.id, 
completion); + } + + /** + * @deprecated Use `taskRunHeartbeat` instead + */ public async taskHeartbeat(workerId: string, id: string, seconds: number = 60) { + logger.debug("[DevQueueConsumer] taskHeartbeat()", { id, seconds }); + const taskRunAttempt = await prisma.taskRunAttempt.findUnique({ where: { friendlyId: id }, }); @@ -152,6 +178,12 @@ export class DevQueueConsumer { await marqs?.heartbeatMessage(taskRunAttempt.taskRunId, seconds); } + public async taskRunHeartbeat(workerId: string, id: string, seconds: number = 60) { + logger.debug("[DevQueueConsumer] taskRunHeartbeat()", { id, seconds }); + + await marqs?.heartbeatMessage(id, seconds); + } + public async stop(reason: string = "CLI disconnected") { if (!this._enabled) { return; @@ -184,66 +216,23 @@ export class DevQueueConsumer { } async #cancelInProgressRunsAndAttempts(reason: string) { - const cancelAttemptService = new CancelAttemptService(); const cancelTaskRunService = new CancelTaskRunService(); const cancelledAt = new Date(); - const inProgressAttempts = new Map(this._inProgressAttempts); const inProgressRuns = new Map(this._inProgressRuns); - this._inProgressAttempts.clear(); this._inProgressRuns.clear(); - const inProgressRunsWithNoInProgressAttempts: string[] = []; - const inProgressAttemptRunIds = new Set(inProgressAttempts.values()); - - for (const [runId, messageId] of inProgressRuns) { - if (!inProgressAttemptRunIds.has(messageId)) { - inProgressRunsWithNoInProgressAttempts.push(messageId); - } - } - logger.debug("Cancelling in progress runs and attempts", { - attempts: Array.from(inProgressAttempts.keys()), runs: Array.from(inProgressRuns.keys()), }); - for (const [attemptId, messageId] of inProgressAttempts) { - await this.#cancelInProgressAttempt( - attemptId, - messageId, - cancelAttemptService, - cancelledAt, - reason - ); - } - - for (const runId of inProgressRunsWithNoInProgressAttempts) { + for (const [_, runId] of inProgressRuns) { await this.#cancelInProgressRun(runId, 
cancelTaskRunService, cancelledAt, reason); } } - async #cancelInProgressAttempt( - attemptId: string, - messageId: string, - cancelAttemptService: CancelAttemptService, - cancelledAt: Date, - reason: string - ) { - logger.debug("Cancelling in progress attempt", { attemptId, messageId }); - - try { - await cancelAttemptService.call(attemptId, messageId, cancelledAt, reason, this.env); - } catch (e) { - logger.error("Failed to cancel in progress attempt", { - attemptId, - messageId, - error: e, - }); - } - } - async #cancelInProgressRun( runId: string, service: CancelTaskRunService, @@ -252,16 +241,20 @@ export class DevQueueConsumer { ) { logger.debug("Cancelling in progress run", { runId }); - const taskRun = await prisma.taskRun.findUnique({ - where: { id: runId }, - }); + const taskRun = runId.startsWith("run_") + ? await prisma.taskRun.findUnique({ + where: { friendlyId: runId }, + }) + : await prisma.taskRun.findUnique({ + where: { id: runId }, + }); if (!taskRun) { return; } try { - await service.call(taskRun, { reason, cancelAttempts: false, cancelledAt }); + await service.call(taskRun, { reason, cancelAttempts: true, cancelledAt }); } catch (e) { logger.error("Failed to cancel in progress run", { runId, @@ -474,141 +467,131 @@ export class DevQueueConsumer { } if (!this._enabled) { + logger.debug("Dev queue consumer is disabled", { env: this.env, queueMessage: message }); + await marqs?.nackMessage(message.messageId); return; } - const taskRunAttempt = await prisma.taskRunAttempt.create({ - data: { - number: lockedTaskRun.attempts[0] ? 
lockedTaskRun.attempts[0].number + 1 : 1, - friendlyId: generateFriendlyId("attempt"), - taskRunId: lockedTaskRun.id, - startedAt: new Date(), - backgroundWorkerId: backgroundTask.workerId, - backgroundWorkerTaskId: backgroundTask.id, - status: "EXECUTING" as const, - queueId: queue.id, - runtimeEnvironmentId: this.env.id, - }, - }); - - const execution: TaskRunExecution = { - task: { - id: backgroundTask.slug, - filePath: backgroundTask.filePath, - exportName: backgroundTask.exportName, - }, - attempt: { - id: taskRunAttempt.friendlyId, - number: taskRunAttempt.number, - startedAt: taskRunAttempt.startedAt ?? taskRunAttempt.createdAt, - backgroundWorkerId: backgroundWorker.id, - backgroundWorkerTaskId: backgroundTask.id, - status: "EXECUTING" as const, - }, - run: { - id: lockedTaskRun.friendlyId, - payload: lockedTaskRun.payload, - payloadType: lockedTaskRun.payloadType, - context: lockedTaskRun.context, - createdAt: lockedTaskRun.createdAt, - tags: lockedTaskRun.tags.map((tag) => tag.name), - isTest: lockedTaskRun.isTest, - idempotencyKey: lockedTaskRun.idempotencyKey ?? undefined, - }, - queue: { - id: queue.friendlyId, - name: queue.name, - }, - environment: { - id: this.env.id, - slug: this.env.slug, - type: this.env.type, - }, - organization: { - id: this.env.organization.id, - slug: this.env.organization.slug, - name: this.env.organization.title, - }, - project: { - id: this.env.project.id, - ref: this.env.project.externalRef, - slug: this.env.project.slug, - name: this.env.project.name, - }, - batch: - lockedTaskRun.batchItems[0] && lockedTaskRun.batchItems[0].batchTaskRun - ? 
{ id: lockedTaskRun.batchItems[0].batchTaskRun.friendlyId } - : undefined, - }; - const environmentRepository = new EnvironmentVariablesRepository(); const variables = await environmentRepository.getEnvironmentVariables( this.env.project.id, this.env.id ); - const payload: TaskRunExecutionPayload = { - execution, - traceContext: lockedTaskRun.traceContext as Record, - environment: variables.reduce((acc: Record, curr) => { - acc[curr.key] = curr.value; - return acc; - }, {}), - }; + if (backgroundWorker.supportsLazyAttempts) { + const payload: TaskRunExecutionLazyAttemptPayload = { + traceContext: lockedTaskRun.traceContext as Record, + environment: variables.reduce((acc: Record, curr) => { + acc[curr.key] = curr.value; + return acc; + }, {}), + runId: lockedTaskRun.friendlyId, + messageId: lockedTaskRun.id, + isTest: lockedTaskRun.isTest, + }; - try { - // TODO: send trace context down to the CLI - await this._sender.send("BACKGROUND_WORKER_MESSAGE", { - backgroundWorkerId: backgroundWorker.friendlyId, - data: { - type: "EXECUTE_RUNS", - payloads: [payload], - }, - }); + try { + await this._sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId: backgroundWorker.friendlyId, + data: { + type: "EXECUTE_RUN_LAZY_ATTEMPT", + payload, + }, + }); - logger.debug("Saving the in progress attempt", { - taskRunAttempt: taskRunAttempt.id, - messageId: message.messageId, - }); + logger.debug("Executing the run", { + messageId: message.messageId, + }); - this._inProgressAttempts.set(taskRunAttempt.friendlyId, message.messageId); - this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId); - } catch (e) { - if (e instanceof Error) { - this._currentSpan?.recordException(e); - } else { - this._currentSpan?.recordException(new Error(String(e))); + this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId); + } catch (e) { + if (e instanceof Error) { + this._currentSpan?.recordException(e); + } else { + this._currentSpan?.recordException(new 
Error(String(e))); + } + + this._endSpanInNextIteration = true; + + // We now need to unlock the task run and delete the task run attempt + await prisma.$transaction([ + prisma.taskRun.update({ + where: { + id: lockedTaskRun.id, + }, + data: { + lockedAt: null, + lockedById: null, + status: "PENDING", + }, + }), + ]); + + this._inProgressRuns.delete(lockedTaskRun.friendlyId); + + // Finally we need to nack the message so it can be retried + await marqs?.nackMessage(message.messageId); + } finally { + setTimeout(() => this.#doWork(), 100); } - - this._endSpanInNextIteration = true; - - // We now need to unlock the task run and delete the task run attempt - await prisma.$transaction([ - prisma.taskRun.update({ - where: { - id: lockedTaskRun.id, - }, + } else { + const service = new CreateTaskRunAttemptService(); + const { execution } = await service.call(lockedTaskRun.friendlyId, this.env); + + const payload: TaskRunExecutionPayload = { + traceContext: lockedTaskRun.traceContext as Record, + environment: variables.reduce((acc: Record, curr) => { + acc[curr.key] = curr.value; + return acc; + }, {}), + execution, + }; + + try { + await this._sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId: backgroundWorker.friendlyId, data: { - lockedAt: null, - lockedById: null, - status: "PENDING", + type: "EXECUTE_RUNS", + payloads: [payload], }, - }), - prisma.taskRunAttempt.delete({ - where: { - id: taskRunAttempt.id, - }, - }), - ]); + }); - this._inProgressAttempts.delete(taskRunAttempt.friendlyId); - this._inProgressRuns.delete(lockedTaskRun.friendlyId); + logger.debug("Executing the run", { + messageId: message.messageId, + }); - // Finally we need to nack the message so it can be retried - await marqs?.nackMessage(message.messageId); - } finally { - setTimeout(() => this.#doWork(), 100); + this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId); + } catch (e) { + if (e instanceof Error) { + this._currentSpan?.recordException(e); + } else { + 
this._currentSpan?.recordException(new Error(String(e))); + } + + this._endSpanInNextIteration = true; + + // We now need to unlock the task run and delete the task run attempt + await prisma.$transaction([ + prisma.taskRun.update({ + where: { + id: lockedTaskRun.id, + }, + data: { + lockedAt: null, + lockedById: null, + status: "PENDING", + }, + }), + ]); + + this._inProgressRuns.delete(lockedTaskRun.friendlyId); + + // Finally we need to nack the message so it can be retried + await marqs?.nackMessage(message.messageId); + } finally { + setTimeout(() => this.#doWork(), 100); + } } } diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts index b20c0d99e9d..0816bab3b01 100644 --- a/apps/webapp/app/v3/marqs/index.server.ts +++ b/apps/webapp/app/v3/marqs/index.server.ts @@ -21,6 +21,7 @@ import { QueueCapacities, QueueRange, } from "./types"; +import { RequeueTaskRunService } from "../requeueTaskRun.server"; const tracer = trace.getTracer("marqs"); @@ -259,6 +260,11 @@ export class MarQS { }); } + await RequeueTaskRunService.enqueue( + messageData.messageId, + new Date(Date.now() + this.visibilityTimeoutInMs) + ); + return message; }, { @@ -272,6 +278,35 @@ export class MarQS { ); } + public async getSharedQueueDetails() { + const parentQueue = constants.SHARED_QUEUE; + + const { range, selectionId } = await this.queuePriorityStrategy.nextCandidateSelection( + parentQueue + ); + const queues = await this.#getChildQueuesWithScores(parentQueue, range); + + const queuesWithScores = await this.#calculateQueueScores(queues, (queue) => + this.#calculateMessageQueueCapacities(queue) + ); + + // We need to priority shuffle here to ensure all workers aren't just working on the highest priority queue + const choice = this.queuePriorityStrategy.chooseQueue( + queuesWithScores, + parentQueue, + selectionId + ); + + return { + selectionId, + queues, + queuesWithScores, + nextRange: range, + queueCount: queues.length, + queueChoice: 
choice, + }; + } + /** * Dequeue a message from the shared queue (this should be used in production environments) */ @@ -350,6 +385,8 @@ export class MarQS { [SemanticAttributes.PARENT_QUEUE]: message.parentQueue, }); + await RequeueTaskRunService.dequeue(messageId); + await this.#callAcknowledgeMessage({ parentQueue: message.parentQueue, messageKey: this.keys.messageKey(messageId), @@ -415,6 +452,8 @@ export class MarQS { return; } + await RequeueTaskRunService.dequeue(messageId); + await this.#callAcknowledgeMessage({ parentQueue: oldMessage.parentQueue, messageKey: this.keys.messageKey(messageId), @@ -481,6 +520,8 @@ export class MarQS { [SemanticAttributes.PARENT_QUEUE]: message.parentQueue, }); + await RequeueTaskRunService.dequeue(messageId); + await this.#callNackMessage({ messageKey: this.keys.messageKey(messageId), messageQueue: message.queue, @@ -506,16 +547,19 @@ export class MarQS { // This should increment by the number of seconds, but with a max value of Date.now() + visibilityTimeoutInMs public async heartbeatMessage(messageId: string, seconds: number = 30) { + // We are still calling this for backwards compatibility, but we should be using the v3.requeueTaskRun job await this.#callHeartbeatMessage({ visibilityQueue: constants.MESSAGE_VISIBILITY_TIMEOUT_QUEUE, messageId, milliseconds: seconds * 1000, maxVisibilityTimeout: Date.now() + this.visibilityTimeoutInMs, }); + + await RequeueTaskRunService.enqueue(messageId, new Date(Date.now() + seconds * 1000)); } get visibilityTimeoutInMs() { - return this.options.visibilityTimeoutInMs ?? 300000; + return this.options.visibilityTimeoutInMs ?? 
300000; // 5 minutes } async readMessage(messageId: string) { @@ -873,7 +917,6 @@ export class MarQS { const result = await this.redis.dequeueMessage( messageQueue, parentQueue, - visibilityQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, @@ -881,7 +924,6 @@ export class MarQS { envCurrentConcurrencyKey, orgCurrentConcurrencyKey, messageQueue, - String(this.options.visibilityTimeoutInMs ?? 300000), // 5 minutes String(Date.now()), String(this.options.defaultEnvConcurrency), String(this.options.defaultOrgConcurrency) @@ -1007,6 +1049,9 @@ export class MarQS { ); } + /** + * @deprecated This is being replaced by the v3.requeueTaskRun graphile worker job + */ #callHeartbeatMessage({ visibilityQueue, messageId, @@ -1145,25 +1190,23 @@ end }); this.redis.defineCommand("dequeueMessage", { - numberOfKeys: 9, + numberOfKeys: 8, lua: ` --- Keys: childQueue, parentQueue, visibilityQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, currentConcurrencyKey, envCurrentConcurrencyKey, orgCurrentConcurrencyKey +-- Keys: childQueue, parentQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, currentConcurrencyKey, envCurrentConcurrencyKey, orgCurrentConcurrencyKey local childQueue = KEYS[1] local parentQueue = KEYS[2] -local visibilityQueue = KEYS[3] -local concurrencyLimitKey = KEYS[4] -local envConcurrencyLimitKey = KEYS[5] -local orgConcurrencyLimitKey = KEYS[6] -local currentConcurrencyKey = KEYS[7] -local envCurrentConcurrencyKey = KEYS[8] -local orgCurrentConcurrencyKey = KEYS[9] - --- Args: childQueueName, visibilityQueue, currentTime, defaultEnvConcurrencyLimit, defaultOrgConcurrencyLimit +local concurrencyLimitKey = KEYS[3] +local envConcurrencyLimitKey = KEYS[4] +local orgConcurrencyLimitKey = KEYS[5] +local currentConcurrencyKey = KEYS[6] +local envCurrentConcurrencyKey = KEYS[7] +local orgCurrentConcurrencyKey = KEYS[8] + +-- Args: childQueueName, currentTime, defaultEnvConcurrencyLimit, 
defaultOrgConcurrencyLimit local childQueueName = ARGV[1] -local visibilityTimeout = tonumber(ARGV[2]) -local currentTime = tonumber(ARGV[3]) -local defaultEnvConcurrencyLimit = ARGV[4] -local defaultOrgConcurrencyLimit = ARGV[5] +local currentTime = tonumber(ARGV[2]) +local defaultEnvConcurrencyLimit = ARGV[3] +local defaultOrgConcurrencyLimit = ARGV[4] -- Check current org concurrency against the limit local orgCurrentConcurrency = tonumber(redis.call('SCARD', orgCurrentConcurrencyKey) or '0') @@ -1199,11 +1242,9 @@ end local messageId = messages[1] local messageScore = tonumber(messages[2]) -local timeoutScore = currentTime + visibilityTimeout -- Move message to timeout queue and update concurrency redis.call('ZREM', childQueue, messageId) -redis.call('ZADD', visibilityQueue, timeoutScore, messageId) redis.call('SADD', currentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('SADD', orgCurrentConcurrencyKey, messageId) @@ -1269,7 +1310,7 @@ else redis.call('ZADD', parentQueue, earliestMessage[2], messageQueueName) end --- Remove the message from the timeout queue +-- Remove the message from the timeout queue (deprecated, will eventually remove this) redis.call('ZREM', visibilityQueue, messageId) -- Update the concurrency keys @@ -1297,20 +1338,18 @@ local messageId = ARGV[2] local currentTime = tonumber(ARGV[3]) local messageScore = tonumber(ARGV[4]) --- Check to see if the message is still in the visibilityQueue -local messageVisibility = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0 - -if messageVisibility == 0 then - return -end - -- Update the concurrency keys redis.call('SREM', concurrencyKey, messageId) redis.call('SREM', envConcurrencyKey, messageId) redis.call('SREM', orgConcurrencyKey, messageId) --- Remove the message from the timeout queue -redis.call('ZREM', visibilityQueue, messageId) +-- Check to see if the message is still in the visibilityQueue +local messageVisibility = 
tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0 + +if messageVisibility > 0 then +-- Remove the message from the timeout queue (deprecated, will eventually remove this) + redis.call('ZREM', visibilityQueue, messageId) +end -- Enqueue the message into the queue redis.call('ZADD', childQueueKey, messageScore, messageId) @@ -1337,12 +1376,16 @@ local milliseconds = tonumber(ARGV[2]) local maxVisibilityTimeout = tonumber(ARGV[3]) -- Get the current visibility timeout -local currentVisibilityTimeout = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0 +local zscoreResult = redis.call('ZSCORE', visibilityQueue, messageId) -if currentVisibilityTimeout == 0 then +-- If there's no currentVisibilityTimeout, return and do not execute ZADD +if zscoreResult == false then return end +local currentVisibilityTimeout = tonumber(zscoreResult) + + -- Calculate the new visibility timeout local newVisibilityTimeout = math.min(currentVisibilityTimeout + milliseconds * 1000, maxVisibilityTimeout) @@ -1445,7 +1488,6 @@ declare module "ioredis" { dequeueMessage( childQueue: string, parentQueue: string, - visibilityQueue: string, concurrencyLimitKey: string, envConcurrencyLimitKey: string, orgConcurrencyLimitKey: string, @@ -1453,7 +1495,6 @@ declare module "ioredis" { envCurrentConcurrencyKey: string, orgCurrentConcurrencyKey: string, childQueueName: string, - visibilityTimeout: string, currentTime: string, defaultEnvConcurrencyLimit: string, defaultOrgConcurrencyLimit: string, diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts index b71b1176d0f..0f8f5f6c050 100644 --- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts @@ -5,6 +5,7 @@ import { ProdTaskRunExecutionPayload, TaskRunError, TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionResult, TaskRunFailedExecutionResult, 
TaskRunSuccessfulExecutionResult, @@ -33,6 +34,9 @@ import { import { RestoreCheckpointService } from "../services/restoreCheckpoint.server"; import { SEMINTATTRS_FORCE_RECORDING, tracer } from "../tracer.server"; import { CrashTaskRunService } from "../services/crashTaskRun.server"; +import { FailedTaskRunService } from "../failedTaskRun.server"; +import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server"; +import { findEnvironmentById } from "~/models/runtimeEnvironment.server"; const WithTraceContext = z.object({ traceparent: z.string().optional(), @@ -260,6 +264,14 @@ export class SharedQueueConsumer { where: { id: message.messageId, }, + include: { + lockedToVersion: { + include: { + deployment: true, + tasks: true, + }, + }, + }, }); if (!existingTaskRun) { @@ -291,7 +303,7 @@ export class SharedQueueConsumer { (!retryingFromCheckpoint && !EXECUTABLE_RUN_STATUSES.withoutCheckpoint.includes(existingTaskRun.status)) ) { - logger.debug("Task run has invalid status for execution", { + logger.error("Task run has invalid status for execution", { queueMessage: message.data, messageId: message.messageId, taskRun: existingTaskRun.id, @@ -299,6 +311,12 @@ export class SharedQueueConsumer { retryingFromCheckpoint, }); + const service = new CrashTaskRunService(); + await service.call(existingTaskRun.id, { + crashAttempts: true, + reason: `Invalid run status for execution: ${existingTaskRun.status}`, + }); + await this.#ackAndDoMoreWork(message.messageId); return; } @@ -398,6 +416,7 @@ export class SharedQueueConsumer { createdAt: "desc", }, }, + lockedBy: true, }, }); @@ -443,39 +462,12 @@ export class SharedQueueConsumer { return; } - const taskRunAttempt = await prisma.taskRunAttempt.create({ - data: { - number: lockedTaskRun.attempts[0] ? 
lockedTaskRun.attempts[0].number + 1 : 1, - friendlyId: generateFriendlyId("attempt"), - taskRunId: lockedTaskRun.id, - startedAt: new Date(), - backgroundWorkerId: backgroundTask.workerId, - backgroundWorkerTaskId: backgroundTask.id, - status: "PENDING" as const, - queueId: queue.id, - runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId, - }, - include: { - backgroundWorkerTask: true, - }, - }); - - const isRetry = taskRunAttempt.number > 1; + const nextAttemptNumber = lockedTaskRun.attempts[0] + ? lockedTaskRun.attempts[0].number + 1 + : 1; - const { machineConfig } = taskRunAttempt.backgroundWorkerTask; - const machine = Machine.safeParse(machineConfig ?? {}); - - if (!machine.success) { - logger.error("Failed to parse machine config", { - queueMessage: message.data, - messageId: message.messageId, - attemptId: taskRunAttempt.id, - machineConfig, - }); + const isRetry = nextAttemptNumber > 1; - await this.#ackAndDoMoreWork(message.messageId); - return; - } try { if (messageBody.data.checkpointEventId) { const restoreService = new RestoreCheckpointService(); @@ -494,12 +486,35 @@ export class SharedQueueConsumer { await this.#ackAndDoMoreWork(message.messageId); return; } - } else if (isRetry) { + + break; + } + + if (!deployment.worker.supportsLazyAttempts) { + const service = new CreateTaskRunAttemptService(); + await service.call(lockedTaskRun.friendlyId, undefined, false); + } + + if (isRetry) { socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { version: "v1", - runId: taskRunAttempt.taskRunId, + runId: lockedTaskRun.id, }); } else { + const machineConfig = lockedTaskRun.lockedBy?.machineConfig; + const machine = Machine.safeParse(machineConfig ?? 
{}); + + if (!machine.success) { + logger.error("Failed to parse machine config", { + queueMessage: message.data, + messageId: message.messageId, + machineConfig, + }); + + await this.#ackAndDoMoreWork(message.messageId); + return; + } + await this._sender.send("BACKGROUND_WORKER_MESSAGE", { backgroundWorkerId: deployment.worker.friendlyId, data: { @@ -508,12 +523,12 @@ export class SharedQueueConsumer { version: deployment.version, machine: machine.data, // identifiers - id: taskRunAttempt.id, + id: "placeholder", // TODO: Remove this completely in a future release envId: lockedTaskRun.runtimeEnvironment.id, envType: lockedTaskRun.runtimeEnvironment.type, orgId: lockedTaskRun.runtimeEnvironment.organizationId, projectId: lockedTaskRun.runtimeEnvironment.projectId, - runId: taskRunAttempt.taskRunId, + runId: lockedTaskRun.id, }, }); } @@ -535,11 +550,7 @@ export class SharedQueueConsumer { data: { lockedAt: null, lockedById: null, - }, - }), - prisma.taskRunAttempt.delete({ - where: { - id: taskRunAttempt.id, + status: lockedTaskRun.status, }, }), ]); @@ -1096,7 +1107,50 @@ class SharedQueueTasks { return this.getExecutionPayloadFromAttempt(latestAttempt.id, setToExecuting, isRetrying); } + async getLazyAttemptPayload( + envId: string, + runId: string + ): Promise { + const environment = await findEnvironmentById(envId); + + if (!environment) { + logger.error("Environment not found", { id: envId }); + return; + } + + const run = await prisma.taskRun.findUnique({ + where: { + id: runId, + runtimeEnvironmentId: environment.id, + }, + }); + + if (!run) { + logger.error("Run not found", { id: runId, envId }); + return; + } + + const environmentRepository = new EnvironmentVariablesRepository(); + const variables = await environmentRepository.getEnvironmentVariables( + environment.projectId, + environment.id + ); + + return { + traceContext: run.traceContext as Record, + environment: variables.reduce((acc: Record, curr) => { + acc[curr.key] = curr.value; + return acc; + 
}, {}), + runId: run.friendlyId, + messageId: run.id, + isTest: run.isTest, + } satisfies TaskRunExecutionLazyAttemptPayload; + } + async taskHeartbeat(attemptFriendlyId: string, seconds: number = 60) { + logger.debug("[SharedQueueConsumer] taskHeartbeat()", { id: attemptFriendlyId, seconds }); + const taskRunAttempt = await prisma.taskRunAttempt.findUnique({ where: { friendlyId: attemptFriendlyId }, }); @@ -1107,6 +1161,20 @@ class SharedQueueTasks { await marqs?.heartbeatMessage(taskRunAttempt.taskRunId, seconds); } + + async taskRunHeartbeat(runId: string, seconds: number = 60) { + logger.debug("[SharedQueueConsumer] taskRunHeartbeat()", { runId, seconds }); + + await marqs?.heartbeatMessage(runId, seconds); + } + + public async taskRunFailed(completion: TaskRunFailedExecutionResult) { + logger.debug("[SharedQueueConsumer] taskRunFailed()", { completion }); + + const service = new FailedTaskRunService(); + + await service.call(completion.id, completion); + } } export const sharedQueueTasks = singleton("sharedQueueTasks", () => new SharedQueueTasks()); diff --git a/apps/webapp/app/v3/requeueTaskRun.server.ts b/apps/webapp/app/v3/requeueTaskRun.server.ts new file mode 100644 index 00000000000..e2b904998fd --- /dev/null +++ b/apps/webapp/app/v3/requeueTaskRun.server.ts @@ -0,0 +1,95 @@ +import { logger } from "~/services/logger.server"; +import { marqs } from "~/v3/marqs/index.server"; + +import assertNever from "assert-never"; +import { FailedTaskRunService } from "./failedTaskRun.server"; +import { BaseService } from "./services/baseService.server"; +import { PrismaClientOrTransaction } from "~/db.server"; +import { workerQueue } from "~/services/worker.server"; + +export class RequeueTaskRunService extends BaseService { + public async call(runId: string) { + const taskRun = await this._prisma.taskRun.findUnique({ + where: { id: runId }, + }); + + if (!taskRun) { + logger.error("[RequeueTaskRunService] Task run not found", { + runId, + }); + + return; + } + + 
switch (taskRun.status) { + case "PENDING": { + logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun }); + + await marqs?.nackMessage(taskRun.id); + + break; + } + case "EXECUTING": + case "RETRYING_AFTER_FAILURE": { + logger.debug("[RequeueTaskRunService] Failing task run", { taskRun }); + + const service = new FailedTaskRunService(); + + await service.call(taskRun.friendlyId, { + ok: false, + id: taskRun.friendlyId, + retry: undefined, + error: { + type: "INTERNAL_ERROR", + code: "TASK_RUN_HEARTBEAT_TIMEOUT", + message: "Did not receive a heartbeat from the worker in time", + }, + }); + + break; + } + case "WAITING_FOR_DEPLOY": { + logger.debug("[RequeueTaskRunService] Removing task run from queue", { taskRun }); + + await marqs?.acknowledgeMessage(taskRun.id); + + break; + } + case "WAITING_TO_RESUME": + case "PAUSED": { + logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun }); + + await marqs?.nackMessage(taskRun.id); + + break; + } + case "SYSTEM_FAILURE": + case "INTERRUPTED": + case "CRASHED": + case "COMPLETED_WITH_ERRORS": + case "COMPLETED_SUCCESSFULLY": + case "CANCELED": { + logger.debug("[RequeueTaskRunService] Task run is completed", { taskRun }); + + await marqs?.acknowledgeMessage(taskRun.id); + + break; + } + default: { + assertNever(taskRun.status); + } + } + } + + public static async enqueue(runId: string, runAt?: Date, tx?: PrismaClientOrTransaction) { + return await workerQueue.enqueue( + "v3.requeueTaskRun", + { runId }, + { runAt, jobKey: `requeueTaskRun:${runId}` } + ); + } + + public static async dequeue(runId: string, tx?: PrismaClientOrTransaction) { + return await workerQueue.dequeue(`requeueTaskRun:${runId}`, { tx }); + } +} diff --git a/apps/webapp/app/v3/services/baseService.server.ts b/apps/webapp/app/v3/services/baseService.server.ts index 6892118c649..e6b9d0252cb 100644 --- a/apps/webapp/app/v3/services/baseService.server.ts +++ b/apps/webapp/app/v3/services/baseService.server.ts @@ -34,7 +34,7 @@ 
export abstract class BaseService { } export class ServiceValidationError extends Error { - constructor(message: string) { + constructor(message: string, public status?: number) { super(message); this.name = "ServiceValidationError"; } diff --git a/apps/webapp/app/v3/services/cancelTaskRun.server.ts b/apps/webapp/app/v3/services/cancelTaskRun.server.ts index 72179a0dce7..a4822d854aa 100644 --- a/apps/webapp/app/v3/services/cancelTaskRun.server.ts +++ b/apps/webapp/app/v3/services/cancelTaskRun.server.ts @@ -24,9 +24,15 @@ const CANCELLABLE_ATTEMPT_STATUSES: Array = [ "PENDING", ]; -type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{ +type ExtendedTaskRun = Prisma.TaskRunGetPayload<{ include: { runtimeEnvironment: true; + lockedToVersion: true; + }; +}>; + +type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{ + include: { backgroundWorker: true; }; }>; @@ -71,11 +77,10 @@ export class CancelTaskRunService extends BaseService { }, include: { backgroundWorker: true, - runtimeEnvironment: true, }, }, - dependency: true, runtimeEnvironment: true, + lockedToVersion: true, }, }); @@ -96,6 +101,7 @@ export class CancelTaskRunService extends BaseService { // Cancel any in progress attempts if (opts.cancelAttempts) { await this.#cancelPotentiallyRunningAttempts(cancelledTaskRun, cancelledTaskRun.attempts); + await this.#cancelRemainingRunWorkers(cancelledTaskRun); } return { @@ -103,9 +109,12 @@ export class CancelTaskRunService extends BaseService { }; } - async #cancelPotentiallyRunningAttempts(run: TaskRun, attempts: ExtendedTaskRunAttempt[]) { + async #cancelPotentiallyRunningAttempts( + run: ExtendedTaskRun, + attempts: ExtendedTaskRunAttempt[] + ) { for (const attempt of attempts) { - if (attempt.runtimeEnvironment.type === "DEVELOPMENT") { + if (run.runtimeEnvironment.type === "DEVELOPMENT") { // Signal the task run attempt to stop await devPubSub.publish( `backgroundWorker:${attempt.backgroundWorkerId}:${attempt.id}`, @@ -158,4 +167,19 @@ 
export class CancelTaskRunService extends BaseService { } } } + + /** Asks all coordinators to cancel the run by emitting REQUEST_RUN_CANCELLATION on the coordinator namespace. Does nothing when the run's environment type is DEVELOPMENT. A 5 second exit delay is requested when the run's locked version supports lazy attempts; otherwise no delay is sent. */ async #cancelRemainingRunWorkers(run: ExtendedTaskRun) { + if (run.runtimeEnvironment.type === "DEVELOPMENT") { + // Nothing to do + return; + } + + // Broadcast cancel message to all coordinators + socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", { + version: "v1", + runId: run.id, + // Give the attempts some time to exit gracefully. If the run supports lazy attempts, it also supports exit delays. + delayInMs: run.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined, + }); + } } diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index d18e3f16b7e..93d491581e5 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -11,7 +11,7 @@ import { PrismaClientOrTransaction } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { safeJsonParse } from "~/utils/json"; -import { eventRepository } from "../eventRepository.server"; +import { createExceptionPropertiesFromError, eventRepository } from "../eventRepository.server"; import { marqs } from "~/v3/marqs/index.server"; import { BaseService } from "./baseService.server"; import { CancelAttemptService } from "./cancelAttempt.server"; @@ -20,6 +20,7 @@ import { MAX_TASK_RUN_ATTEMPTS } from "~/consts"; import { CreateCheckpointService } from "./createCheckpoint.server"; import { TaskRun } from "@trigger.dev/database"; import { PerformTaskAttemptAlertsService } from "./alerts/performTaskAttemptAlerts.server"; +import { RetryAttemptService } from "./retryAttempt.server"; type FoundAttempt = Awaited>; @@ -57,6 +58,8 @@ export class CompleteAttemptService extends BaseService { }, }); + // No attempt, so there's no message to ACK + return "COMPLETED"; } @@ -143,6 +146,8 @@ export class CompleteAttemptService 
extends BaseService { env ); + // The cancel service handles ACK + return "COMPLETED"; } @@ -173,7 +178,7 @@ export class CompleteAttemptService extends BaseService { properties: { retryAt: retryAt.toISOString(), }, - runId: taskRunAttempt.taskRunId, + runId: taskRunAttempt.taskRun.friendlyId, style: { icon: "schedule-attempt", }, @@ -185,7 +190,10 @@ export class CompleteAttemptService extends BaseService { endTime: retryAt, }); - logger.debug("Retrying", { taskRun: taskRunAttempt.taskRun.friendlyId }); + logger.debug("Retrying", { + taskRun: taskRunAttempt.taskRun.friendlyId, + retry: completion.retry, + }); await this._prisma.taskRun.update({ where: { @@ -203,7 +211,12 @@ export class CompleteAttemptService extends BaseService { } if (!checkpoint) { - await this.#enqueueRetry(taskRunAttempt.taskRun, completion.retry.timestamp); + await this.#retryAttempt( + taskRunAttempt.taskRun, + completion.retry.timestamp, + undefined, + taskRunAttempt.backgroundWorker.supportsLazyAttempts + ); return "RETRIED"; } @@ -231,10 +244,12 @@ export class CompleteAttemptService extends BaseService { }, }); + await marqs?.acknowledgeMessage(taskRunAttempt.taskRunId); + return "COMPLETED"; } - await this.#enqueueRetry( + await this.#retryAttempt( taskRunAttempt.taskRun, completion.retry.timestamp, checkpointCreateResult.event.id @@ -253,6 +268,15 @@ export class CompleteAttemptService extends BaseService { attributes: { isError: true, }, + events: [ + { + name: "exception", + time: new Date(), + properties: { + exception: createExceptionPropertiesFromError(completion.error), + }, + }, + ], }); if ( @@ -310,17 +334,28 @@ export class CompleteAttemptService extends BaseService { } } - async #enqueueRetry(run: TaskRun, retryTimestamp: number, checkpointEventId?: string) { - // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt - return await marqs?.replaceMessage( - run.id, - { - type: "EXECUTE", - taskIdentifier: run.taskIdentifier, - checkpointEventId: 
checkpointEventId, - }, - retryTimestamp - ); + /** Schedules the next attempt of `run` at `retryTimestamp` (epoch milliseconds). With a checkpoint event, or when lazy attempts are not supported, the queued message is replaced by an EXECUTE message; otherwise a retry job is enqueued through RetryAttemptService for that time. */ async #retryAttempt( + run: TaskRun, + retryTimestamp: number, + checkpointEventId?: string, + supportsLazyAttempts?: boolean + ) { + if (checkpointEventId || !supportsLazyAttempts) { + // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt + return await marqs?.replaceMessage( + run.id, + { + type: "EXECUTE", + taskIdentifier: run.taskIdentifier, + checkpointEventId: checkpointEventId, + }, + retryTimestamp + ); + } else { + // There's no checkpoint so the worker is still running and waiting for a retry message + // It supports lazy attempts so we can bypass the queue and send the message directly to the worker + await RetryAttemptService.enqueue(run.id, this._prisma, new Date(retryTimestamp)); /* awaited so a failed enqueue rejects this call instead of becoming an unhandled rejection */ + } } #generateMetadataAttributesForNextAttempt(execution: TaskRunExecution) { @@ -353,6 +388,7 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId: include: { taskRun: true, backgroundWorkerTask: true, + backgroundWorker: true, }, }); } diff --git a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts index 2deb2c5373c..9045075e131 100644 --- a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts +++ b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts @@ -63,6 +63,7 @@ export class CreateBackgroundWorkerService extends BaseService { contentHash: body.metadata.contentHash, cliVersion: body.metadata.cliPackageVersion, sdkVersion: body.metadata.packageVersion, + supportsLazyAttempts: body.supportsLazyAttempts, }, }); diff --git a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts index 920fbeb4c0e..9b9c1afc984 100644 --- a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts +++ b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts @@ -45,6 +45,7 @@ export class 
CreateDeployedBackgroundWorkerService extends BaseService { contentHash: body.metadata.contentHash, cliVersion: body.metadata.cliPackageVersion, sdkVersion: body.metadata.packageVersion, + supportsLazyAttempts: body.supportsLazyAttempts, }, }); diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts new file mode 100644 index 00000000000..91dc5ccad39 --- /dev/null +++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts @@ -0,0 +1,213 @@ +import { TaskRunExecution } from "@trigger.dev/core/v3"; +import { $transaction, PrismaClientOrTransaction, prisma } from "~/db.server"; +import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { logger } from "~/services/logger.server"; +import { generateFriendlyId } from "../friendlyIdentifiers"; +import { BaseService, ServiceValidationError } from "./baseService.server"; +import { TaskRun, TaskRunAttempt } from "@trigger.dev/database"; + +export class CreateTaskRunAttemptService extends BaseService { + public async call( + runId: string, + env?: AuthenticatedEnvironment, + setToExecuting = true + ): Promise<{ + execution: TaskRunExecution; + run: TaskRun; + attempt: TaskRunAttempt; + }> { + const environment = env ?? (await getAuthenticatedEnvironmentFromRun(runId, this._prisma)); + + if (!environment) { + throw new ServiceValidationError("Environment not found", 404); + } + + const isFriendlyId = runId.startsWith("run_"); + + return await this.traceWithEnv("call()", environment, async (span) => { + if (isFriendlyId) { + span.setAttribute("taskRunFriendlyId", runId); + } else { + span.setAttribute("taskRunId", runId); + } + + const taskRun = await this._prisma.taskRun.findUnique({ + where: { + id: !isFriendlyId ? runId : undefined, + friendlyId: isFriendlyId ? 
runId : undefined, + runtimeEnvironmentId: environment.id, + }, + include: { + tags: true, + attempts: { + take: 1, + orderBy: { + number: "desc", + }, + }, + lockedBy: { + include: { + worker: true, + }, + }, + batchItems: { + include: { + batchTaskRun: true, + }, + }, + }, + }); + + logger.debug("Creating a task run attempt", { taskRun }); + + if (!taskRun) { + throw new ServiceValidationError("Task run not found", 404); + } + + span.setAttribute("taskRunId", taskRun.id); + span.setAttribute("taskRunFriendlyId", taskRun.friendlyId); + + if (taskRun.status === "CANCELED") { + throw new ServiceValidationError("Task run is cancelled", 400); + } + + if (!taskRun.lockedBy) { + throw new ServiceValidationError("Task run is not locked", 400); + } + + const queue = await this._prisma.taskQueue.findUnique({ + where: { + runtimeEnvironmentId_name: { + runtimeEnvironmentId: environment.id, + name: taskRun.queue, + }, + }, + }); + + if (!queue) { + throw new ServiceValidationError("Queue not found", 404); + } + + const nextAttemptNumber = taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1; + + const taskRunAttempt = await $transaction(this._prisma, async (tx) => { + const taskRunAttempt = await tx.taskRunAttempt.create({ + data: { + number: nextAttemptNumber, + friendlyId: generateFriendlyId("attempt"), + taskRunId: taskRun.id, + startedAt: new Date(), + backgroundWorkerId: taskRun.lockedBy!.worker.id, + backgroundWorkerTaskId: taskRun.lockedBy!.id, + status: setToExecuting ? 
"EXECUTING" : "PENDING", + queueId: queue.id, + runtimeEnvironmentId: environment.id, + }, + include: { + backgroundWorker: true, + backgroundWorkerTask: true, + }, + }); + + if (setToExecuting) { + await tx.taskRun.update({ + where: { + id: taskRun.id, + }, + data: { + status: "EXECUTING", + }, + }); + } + + return taskRunAttempt; + }); + + if (!taskRunAttempt) { + logger.error("Failed to create task run attempt", { runId: taskRun.id, nextAttemptNumber }); + throw new ServiceValidationError("Failed to create task run attempt", 500); + } + + const execution: TaskRunExecution = { + task: { + id: taskRun.lockedBy.slug, + filePath: taskRun.lockedBy.filePath, + exportName: taskRun.lockedBy.exportName, + }, + attempt: { + id: taskRunAttempt.friendlyId, + number: taskRunAttempt.number, + startedAt: taskRunAttempt.startedAt ?? taskRunAttempt.createdAt, + backgroundWorkerId: taskRun.lockedBy.worker.id, + backgroundWorkerTaskId: taskRun.lockedBy.id, + status: "EXECUTING" as const, + }, + run: { + id: taskRun.friendlyId, + payload: taskRun.payload, + payloadType: taskRun.payloadType, + context: taskRun.context, + createdAt: taskRun.createdAt, + tags: taskRun.tags.map((tag) => tag.name), + isTest: taskRun.isTest, + idempotencyKey: taskRun.idempotencyKey ?? undefined, + }, + queue: { + id: queue.friendlyId, + name: queue.name, + }, + environment: { + id: environment.id, + slug: environment.slug, + type: environment.type, + }, + organization: { + id: environment.organization.id, + slug: environment.organization.slug, + name: environment.organization.title, + }, + project: { + id: environment.project.id, + ref: environment.project.externalRef, + slug: environment.project.slug, + name: environment.project.name, + }, + batch: + taskRun.batchItems[0] && taskRun.batchItems[0].batchTaskRun + ? 
{ id: taskRun.batchItems[0].batchTaskRun.friendlyId } + : undefined, + }; + + return { + execution, + run: taskRun, + attempt: taskRunAttempt, + }; + }); + } +} + +/** Loads the runtime environment (including its organization and project) of a run. The identifier may be a friendly ID (prefixed "run_") or an internal run ID, matching the check CreateTaskRunAttemptService.call() applies to the same value. Returns undefined when no run matches. */ async function getAuthenticatedEnvironmentFromRun( + friendlyId: string, + prismaClient?: PrismaClientOrTransaction +) { + const taskRun = await (prismaClient ?? prisma).taskRun.findUnique({ + where: friendlyId.startsWith("run_") + ? { friendlyId } + : { id: friendlyId }, + include: { + runtimeEnvironment: { + include: { + organization: true, + project: true, + }, + }, + }, + }); + + if (!taskRun) { + return; + } + + return taskRun.runtimeEnvironment; +} diff --git a/apps/webapp/app/v3/services/retryAttempt.server.ts b/apps/webapp/app/v3/services/retryAttempt.server.ts new file mode 100644 index 00000000000..86844b53496 --- /dev/null +++ b/apps/webapp/app/v3/services/retryAttempt.server.ts @@ -0,0 +1,39 @@ +import { BaseService } from "./baseService.server"; +import { logger } from "~/services/logger.server"; +import { socketIo } from "../handleSocketIo.server"; +import { PrismaClientOrTransaction } from "~/db.server"; +import { workerQueue } from "~/services/worker.server"; + +export class RetryAttemptService extends BaseService { + public async call(runId: string) { + const taskRun = await this._prisma.taskRun.findFirst({ + where: { + id: runId, + }, + }); + + if (!taskRun) { + logger.error("Task run not found", { runId }); + return; + } + + socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { + version: "v1", + runId, + }); + } + + static async enqueue(runId: string, tx: PrismaClientOrTransaction, runAt?: Date) { + return await workerQueue.enqueue( + "v3.retryAttempt", + { + runId, + }, + { + tx, + runAt, + jobKey: `retryAttempt:${runId}`, + } + ); + } +} diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts index d935724b69b..1d6a59443ac 100644 --- a/apps/webapp/app/v3/services/triggerTask.server.ts +++ b/apps/webapp/app/v3/services/triggerTask.server.ts @@ -4,13 +4,12 @@ import { 
TriggerTaskRequestBody, packetRequiresOffloading, } from "@trigger.dev/core/v3"; -import { nanoid } from "nanoid"; import { createHash } from "node:crypto"; import { $transaction } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { eventRepository } from "../eventRepository.server"; import { generateFriendlyId } from "../friendlyIdentifiers"; -import { marqs } from "~/v3/marqs/index.server"; +import { marqs, sanitizeQueueName } from "~/v3/marqs/index.server"; import { uploadToObjectStore } from "../r2.server"; import { BaseService } from "./baseService.server"; @@ -112,7 +111,7 @@ export class TriggerTaskService extends BaseService { select: { lastNumber: true }, }); - const queueName = body.options?.queue?.name ?? `task/${taskId}`; + const queueName = sanitizeQueueName(body.options?.queue?.name ?? `task/${taskId}`); event.setAttribute("queueName", queueName); span.setAttribute("queueName", queueName); diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts index 39f693b918f..6b6fbcacbd1 100644 --- a/packages/cli-v3/src/apiClient.ts +++ b/packages/cli-v3/src/apiClient.ts @@ -16,7 +16,10 @@ import { GetProjectResponseBody, ImportEnvironmentVariablesRequestBody, EnvironmentVariableResponseBody, + TaskRunExecution, + APIError, } from "@trigger.dev/core/v3"; +import { zodfetch } from "@trigger.dev/core/v3/zodfetch"; export class CliApiClient { private readonly apiURL: string; @@ -29,7 +32,7 @@ export class CliApiClient { } async createAuthorizationCode() { - return zodfetch( + return wrapZodFetch( CreateAuthorizationCodeResponseSchema, `${this.apiURL}/api/v1/authorization-code`, { @@ -39,7 +42,7 @@ export class CliApiClient { } async getPersonalAccessToken(authorizationCode: string) { - return zodfetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { + return wrapZodFetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { method: "POST", body: 
JSON.stringify({ authorizationCode, @@ -52,7 +55,7 @@ export class CliApiClient { throw new Error("whoAmI: No access token"); } - return zodfetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { + return wrapZodFetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -65,7 +68,7 @@ export class CliApiClient { throw new Error("getProject: No access token"); } - return zodfetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { + return wrapZodFetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -78,7 +81,7 @@ export class CliApiClient { throw new Error("getProjects: No access token"); } - return zodfetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { + return wrapZodFetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -91,7 +94,7 @@ export class CliApiClient { throw new Error("createBackgroundWorker: No access token"); } - return zodfetch( + return wrapZodFetch( CreateBackgroundWorkerResponse, `${this.apiURL}/api/v1/projects/${projectRef}/background-workers`, { @@ -105,6 +108,20 @@ export class CliApiClient { ); } + /** Sends a POST to `/api/v1/runs/{runFriendlyId}/attempts` and parses the response with the TaskRunExecution schema. Throws when no access token is set; request failures are returned in the wrapZodFetch result. */ async createTaskRunAttempt(runFriendlyId: string) { + if (!this.accessToken) { + throw new Error("createTaskRunAttempt: No access token"); + } + + return wrapZodFetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { + method: "POST", + headers: { + Authorization: `Bearer ${this.accessToken}`, + "Content-Type": "application/json", + }, + }); + } + async getProjectEnv({ projectRef, env, @@ -116,12 +133,16 @@ export class CliApiClient { throw new Error("getProjectDevEnv: No access token"); } - return zodfetch(GetProjectEnvResponse, 
`${this.apiURL}/api/v1/projects/${projectRef}/${env}`, { - headers: { - Authorization: `Bearer ${this.accessToken}`, - "Content-Type": "application/json", - }, - }); + return wrapZodFetch( + GetProjectEnvResponse, + `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, + { + headers: { + Authorization: `Bearer ${this.accessToken}`, + "Content-Type": "application/json", + }, + } + ); } async getEnvironmentVariables(projectRef: string) { @@ -129,7 +150,7 @@ export class CliApiClient { throw new Error("getEnvironmentVariables: No access token"); } - return zodfetch( + return wrapZodFetch( GetEnvironmentVariablesResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}/envvars`, { @@ -150,7 +171,7 @@ export class CliApiClient { throw new Error("importEnvVars: No access token"); } - return zodfetch( + return wrapZodFetch( EnvironmentVariableResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}/envvars/${slug}/import`, { @@ -169,7 +190,7 @@ export class CliApiClient { throw new Error("initializeDeployment: No access token"); } - return zodfetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { + return wrapZodFetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -184,7 +205,7 @@ export class CliApiClient { throw new Error("startDeploymentIndexing: No access token"); } - return zodfetch( + return wrapZodFetch( StartDeploymentIndexingResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}/start-indexing`, { @@ -203,7 +224,7 @@ export class CliApiClient { throw new Error("getDeployment: No access token"); } - return zodfetch( + return wrapZodFetch( GetDeploymentResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}`, { @@ -223,56 +244,42 @@ type ApiResult = error: string; }; -async function zodfetch( - schema: z.Schema, +async function wrapZodFetch( + schema: T, url: string, requestInit?: RequestInit -): Promise> { +): 
Promise>> { try { - const response = await fetch(url, requestInit); + const response = await zodfetch(schema, url, requestInit, { + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 5000, + maxAttempts: 3, + factor: 2, + randomize: false, + }, + }); - if ((!requestInit || requestInit.method === "GET") && response.status === 404) { + return { + success: true, + data: response, + }; + } catch (error) { + if (error instanceof APIError) { return { success: false, - error: `404: ${response.statusText}`, + error: error.message, }; - } - - if (response.status >= 400 && response.status < 500) { - const body = await response.json(); - if (!body.error) { - return { success: false, error: "Something went wrong" }; - } - - return { success: false, error: body.error }; - } - - if (response.status !== 200) { + } else if (error instanceof Error) { return { success: false, - error: `Failed to fetch ${url}, got status code ${response.status}`, + error: error.message, }; - } - - const jsonBody = await response.json(); - const parsedResult = schema.safeParse(jsonBody); - - if (parsedResult.success) { - return { success: true, data: parsedResult.data }; - } - - if ("error" in jsonBody) { + } else { return { success: false, - error: typeof jsonBody.error === "string" ? jsonBody.error : JSON.stringify(jsonBody.error), + error: String(error), }; } - - return { success: false, error: parsedResult.error.message }; - } catch (error) { - return { - success: false, - error: error instanceof Error ? error.message : JSON.stringify(error), - }; } } diff --git a/packages/cli-v3/src/commands/deploy.ts b/packages/cli-v3/src/commands/deploy.ts index b9c8861057d..37a9157d404 100644 --- a/packages/cli-v3/src/commands/deploy.ts +++ b/packages/cli-v3/src/commands/deploy.ts @@ -199,7 +199,9 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) { `Failed to connect to ${authorization.auth?.apiUrl}. 
Are you sure it's the correct URL?` ); } else { - throw new Error("You must login first. Use `trigger.dev login` to login."); + throw new Error( + `You must login first. Use the \`login\` CLI command.\n\n${authorization.error}` + ); } } diff --git a/packages/cli-v3/src/commands/dev.tsx b/packages/cli-v3/src/commands/dev.tsx index 353f9907cd5..a692b3bd1d9 100644 --- a/packages/cli-v3/src/commands/dev.tsx +++ b/packages/cli-v3/src/commands/dev.tsx @@ -112,7 +112,11 @@ export async function devCommand(dir: string, options: DevCommandOptions) { )} Connecting to the server failed. Please check your internet connection or contact eric@trigger.dev for help.` ); } else { - logger.log(`${chalkError("X Error:")} You must login first. Use the \`login\` CLI command.`); + logger.log( + `${chalkError("X Error:")} You must login first. Use the \`login\` CLI command.\n\n${ + authorization.error + }` + ); } process.exitCode = 1; return; @@ -285,6 +289,7 @@ function useDev({ websocket.addEventListener("close", (event) => {}); websocket.addEventListener("error", (event) => {}); + // This is the deprecated task heart beat that uses the friendly attempt ID backgroundWorkerCoordinator.onWorkerTaskHeartbeat.attach( async ({ worker, backgroundWorkerId, id }) => { await sender.send("BACKGROUND_WORKER_MESSAGE", { @@ -297,6 +302,19 @@ function useDev({ } ); + // "Task Run Heartbeat" id is the actual run ID that corresponds to the MarQS message ID + backgroundWorkerCoordinator.onWorkerTaskRunHeartbeat.attach( + async ({ worker, backgroundWorkerId, id }) => { + await sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId, + data: { + type: "TASK_RUN_HEARTBEAT", + id, + }, + }); + } + ); + backgroundWorkerCoordinator.onTaskCompleted.attach( async ({ backgroundWorkerId, completion, execution }) => { await sender.send("BACKGROUND_WORKER_MESSAGE", { @@ -310,6 +328,18 @@ function useDev({ } ); + backgroundWorkerCoordinator.onTaskFailedToRun.attach( + async ({ backgroundWorkerId, completion 
}) => { + await sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId, + data: { + type: "TASK_RUN_FAILED_TO_RUN", + completion, + }, + }); + } + ); + backgroundWorkerCoordinator.onWorkerRegistered.attach(async ({ id, worker, record }) => { await sender.send("READY_FOR_TASKS", { backgroundWorkerId: id, @@ -334,6 +364,7 @@ function useDev({ for (const worker of backgroundWorkerCoordinator.currentWorkers) { await sender.send("READY_FOR_TASKS", { backgroundWorkerId: worker.id, + inProgressRuns: worker.worker.inProgressRuns, }); } }, @@ -505,21 +536,25 @@ function useDev({ const processEnv = await gatherProcessEnv(); - const backgroundWorker = new BackgroundWorker(fullPath, { - projectConfig: config, - dependencies, - env: { - ...processEnv, - TRIGGER_API_URL: apiUrl, - TRIGGER_SECRET_KEY: apiKey, - ...(environmentVariablesResponse.success - ? environmentVariablesResponse.data.variables - : {}), + const backgroundWorker = new BackgroundWorker( + fullPath, + { + projectConfig: config, + dependencies, + env: { + ...processEnv, + TRIGGER_API_URL: apiUrl, + TRIGGER_SECRET_KEY: apiKey, + ...(environmentVariablesResponse.success + ? environmentVariablesResponse.data.variables + : {}), + }, + debuggerOn, + debugOtel, + resolveEnvVariables: createResolveEnvironmentVariablesFunction(configModule), }, - debuggerOn, - debugOtel, - resolveEnvVariables: createResolveEnvironmentVariablesFunction(configModule), - }); + environmentClient + ); try { await backgroundWorker.initialize(); @@ -576,6 +611,7 @@ function useDev({ tasks: taskResources, contentHash: contentHash, }, + supportsLazyAttempts: true, }; const backgroundWorkerRecord = await environmentClient.createBackgroundWorker( @@ -827,18 +863,9 @@ function createDuplicateTaskIdOutputErrorMessage( async function gatherProcessEnv() { const env = { + ...process.env, NODE_ENV: process.env.NODE_ENV ?? 
"development", - PATH: process.env.PATH, - USER: process.env.USER, - SHELL: process.env.SHELL, - NVM_INC: process.env.NVM_INC, - NVM_DIR: process.env.NVM_DIR, - NVM_BIN: process.env.NVM_BIN, - LANG: process.env.LANG, - TERM: process.env.TERM, NODE_PATH: await amendNodePathWithPnpmNodeModules(process.env.NODE_PATH), - HOME: process.env.HOME, - BUN_INSTALL: process.env.BUN_INSTALL, }; // Filter out undefined values diff --git a/packages/cli-v3/src/commands/whoami.ts b/packages/cli-v3/src/commands/whoami.ts index c46445d6462..73c740c94b9 100644 --- a/packages/cli-v3/src/commands/whoami.ts +++ b/packages/cli-v3/src/commands/whoami.ts @@ -78,7 +78,7 @@ export async function whoAmI( options?.profile ?? "default" }\` to login.` ); - outro("Whoami failed"); + outro(`Whoami failed: ${authentication.error}`); } } diff --git a/packages/cli-v3/src/workers/common/errors.ts b/packages/cli-v3/src/workers/common/errors.ts index 4017d3cc65a..053ab8d19ba 100644 --- a/packages/cli-v3/src/workers/common/errors.ts +++ b/packages/cli-v3/src/workers/common/errors.ts @@ -21,3 +21,43 @@ export class TaskMetadataParseError extends Error { this.name = "TaskMetadataParseError"; } } + +/** Error for an unexpected exit; exposes the exit `code` as a public property and includes it in the message. */ export class UnexpectedExitError extends Error { + constructor(public code: number) { + super(`Unexpected exit with code ${code}`); + + this.name = "UnexpectedExitError"; + } +} + +/** Error named "CleanupProcessError". NOTE(review): its message is "Cancelled", identical to CancelledProcessError - confirm this is intended. */ export class CleanupProcessError extends Error { + constructor() { + super("Cancelled"); + + this.name = "CleanupProcessError"; + } +} + +/** Error named "CancelledProcessError" with the fixed message "Cancelled". */ export class CancelledProcessError extends Error { + constructor() { + super("Cancelled"); + + this.name = "CancelledProcessError"; + } +} + +/** Error named "SigKillTimeoutProcessError" with the fixed message "Process kill timeout". */ export class SigKillTimeoutProcessError extends Error { + constructor() { + super("Process kill timeout"); + + this.name = "SigKillTimeoutProcessError"; + } +} + +/** Error named "GracefulExitTimeoutError" with the fixed message "Graceful exit timeout". */ export class GracefulExitTimeoutError extends Error { + constructor() { + super("Graceful exit timeout"); + + this.name = "GracefulExitTimeoutError"; + } +} diff --git 
a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index 5172b1f3558..c912aaa4691 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -9,8 +9,10 @@ import { TaskRunError, TaskRunErrorCodes, TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, + TaskRunFailedExecutionResult, childToWorkerMessages, correctErrorStackTrace, formatDurationMilliseconds, @@ -36,8 +38,15 @@ import { import { safeDeleteFileSync } from "../../utilities/fileSystem.js"; import { installPackages } from "../../utilities/installPackages.js"; import { logger } from "../../utilities/logger.js"; -import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors.js"; -import { env } from "node:process"; +import { + CancelledProcessError, + CleanupProcessError, + SigKillTimeoutProcessError, + TaskMetadataParseError, + UncaughtExceptionError, + UnexpectedExitError, +} from "../common/errors.js"; +import { CliApiClient } from "../../apiClient.js"; export type CurrentWorkers = BackgroundWorkerCoordinator["currentWorkers"]; export class BackgroundWorkerCoordinator { @@ -47,37 +56,52 @@ export class BackgroundWorkerCoordinator { worker: BackgroundWorker; execution: TaskRunExecution; }> = new Evt(); + public onTaskFailedToRun: Evt<{ + backgroundWorkerId: string; + worker: BackgroundWorker; + completion: TaskRunFailedExecutionResult; + }> = new Evt(); public onWorkerRegistered: Evt<{ worker: BackgroundWorker; id: string; record: CreateBackgroundWorkerResponse; }> = new Evt(); + + /** + * @deprecated use onWorkerTaskRunHeartbeat instead + */ public onWorkerTaskHeartbeat: Evt<{ id: string; backgroundWorkerId: string; worker: BackgroundWorker; }> = new Evt(); + public onWorkerTaskRunHeartbeat: Evt<{ + id: string; + backgroundWorkerId: string; + worker: BackgroundWorker; + }> = new Evt(); public onWorkerDeprecated: 
Evt<{ worker: BackgroundWorker; id: string }> = new Evt(); private _backgroundWorkers: Map = new Map(); private _records: Map = new Map(); private _deprecatedWorkers: Set = new Set(); constructor(private baseURL: string) { - this.onTaskCompleted.attach(async ({ completion, execution }) => { + this.onTaskCompleted.attach(async ({ completion }) => { if (!completion.ok && typeof completion.retry !== "undefined") { return; } - await this.#notifyWorkersOfTaskCompletion(completion, execution); + await this.#notifyWorkersOfTaskCompletion(completion); + }); + + this.onTaskFailedToRun.attach(async ({ completion }) => { + await this.#notifyWorkersOfTaskCompletion(completion); }); } - async #notifyWorkersOfTaskCompletion( - completion: TaskRunExecutionResult, - execution: TaskRunExecution - ) { + async #notifyWorkersOfTaskCompletion(completion: TaskRunExecutionResult) { for (const worker of this._backgroundWorkers.values()) { - await worker.taskRunCompletedNotification(completion, execution); + await worker.taskRunCompletedNotification(completion); } } @@ -107,6 +131,10 @@ export class BackgroundWorkerCoordinator { worker.onTaskHeartbeat.attach((id) => { this.onWorkerTaskHeartbeat.post({ id, backgroundWorkerId: record.id, worker }); }); + + worker.onTaskRunHeartbeat.attach((id) => { + this.onWorkerTaskRunHeartbeat.post({ id, backgroundWorkerId: record.id, worker }); + }); } close() { @@ -136,11 +164,15 @@ export class BackgroundWorkerCoordinator { } await worker.cancelRun(message.taskRunId); + break; + } + case "EXECUTE_RUN_LAZY_ATTEMPT": { + await this.#executeTaskRunLazyAttempt(id, message.payload); } } } - async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) { + async #executeTaskRunLazyAttempt(id: string, payload: TaskRunExecutionLazyAttemptPayload) { const worker = this._backgroundWorkers.get(id); if (!worker) { @@ -155,106 +187,68 @@ export class BackgroundWorkerCoordinator { return; } - const { execution } = payload; - - // ○ Mar 27 09:17:25.653 -> View 
logs | 20240326.20 | create-avatar | run_slufhjdfiv8ejnrkw9dsj.1 - - const logsUrl = `${this.baseURL}/runs/${execution.run.id}`; - - const pipe = chalkGrey("|"); - const bullet = chalkGrey("○"); - const link = chalkLink(terminalLink("View logs", logsUrl)); - let timestampPrefix = chalkGrey(prettyPrintDate(payload.execution.attempt.startedAt)); - const workerPrefix = chalkWorker(record.version); - const taskPrefix = chalkTask(execution.task.id); - const runId = chalkRun(`${execution.run.id}.${execution.attempt.number}`); - - logger.log( - `${bullet} ${timestampPrefix} ${chalkGrey( - "->" - )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId}` - ); - - const now = performance.now(); - - const completion = await worker.executeTaskRun(payload); - - const elapsed = performance.now() - now; - - const retryingText = chalkGrey( - !completion.ok && completion.skippedRetrying - ? " (retrying skipped)" - : !completion.ok && completion.retry !== undefined - ? ` (retrying in ${completion.retry.delay}ms)` - : "" - ); - - const resultText = !completion.ok - ? completion.error.type === "INTERNAL_ERROR" && - (completion.error.code === TaskRunErrorCodes.TASK_EXECUTION_ABORTED || - completion.error.code === TaskRunErrorCodes.TASK_RUN_CANCELLED) - ? chalkWarning("Cancelled") - : `${chalkError("Error")}${retryingText}` - : chalkSuccess("Success"); - - const errorText = !completion.ok - ? this.#formatErrorLog(completion.error) - : "retry" in completion - ? 
`retry in ${completion.retry}ms` - : ""; - - const elapsedText = chalkGrey(`(${formatDurationMilliseconds(elapsed, { style: "short" })})`); - - timestampPrefix = chalkGrey(prettyPrintDate()); - - logger.log( - `${bullet} ${timestampPrefix} ${chalkGrey( - "->" - )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId} ${pipe} ${resultText} ${elapsedText}${errorText}` - ); - - this.onTaskCompleted.post({ completion, execution, worker, backgroundWorkerId: id }); - } + try { + const { completion, execution } = await worker.executeTaskRunLazyAttempt( + payload, + this.baseURL + ); - #formatErrorLog(error: TaskRunError) { - switch (error.type) { - case "INTERNAL_ERROR": { - return ""; - } - case "STRING_ERROR": { - return `\n\n${chalkError("X Error:")} ${error.raw}\n`; - } - case "CUSTOM_ERROR": { - return `\n\n${chalkError("X Error:")} ${error.raw}\n`; - } - case "BUILT_IN_ERROR": { - return `\n\n${error.stackTrace.replace(/^Error: /, chalkError("X Error: "))}\n`; - } + this.onTaskCompleted.post({ + completion, + execution, + worker, + backgroundWorkerId: id, + }); + } catch (error) { + this.onTaskFailedToRun.post({ + backgroundWorkerId: id, + worker, + completion: { + ok: false, + id: payload.runId, + retry: undefined, + error: + error instanceof Error + ? { + type: "BUILT_IN_ERROR", + name: error.name, + message: error.message, + stackTrace: error.stack ?? 
"", + } + : { + type: "BUILT_IN_ERROR", + name: "UnknownError", + message: String(error), + stackTrace: "", + }, + }, + }); } } -} -class UnexpectedExitError extends Error { - constructor(public code: number) { - super(`Unexpected exit with code ${code}`); + async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) { + const worker = this._backgroundWorkers.get(id); - this.name = "UnexpectedExitError"; - } -} + if (!worker) { + logger.error(`Could not find worker ${id}`); + return; + } -class CleanupProcessError extends Error { - constructor() { - super("Cancelled"); + const record = this._records.get(id); - this.name = "CleanupProcessError"; - } -} + if (!record) { + logger.error(`Could not find worker record ${id}`); + return; + } -class CancelledProcessError extends Error { - constructor() { - super("Cancelled"); + const completion = await worker.executeTaskRun(payload, this.baseURL); - this.name = "CancelledProcessError"; + this.onTaskCompleted.post({ + completion, + execution: payload.execution, + worker, + backgroundWorkerId: id, + }); } } @@ -276,13 +270,18 @@ export class BackgroundWorker { schema: childToWorkerMessages, }); + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); private _onClose: Evt = new Evt(); public tasks: Array = []; public metadata: BackgroundWorkerProperties | undefined; _taskRunProcesses: Map = new Map(); + private _taskRunProcessesBeingKilled: Set = new Set(); private _closed: boolean = false; @@ -290,7 +289,8 @@ export class BackgroundWorker { constructor( public path: string, - public params: BackgroundWorkerParams + public params: BackgroundWorkerParams, + private apiClient: CliApiClient ) {} close() { @@ -301,6 +301,7 @@ export class BackgroundWorker { this._closed = true; this.onTaskHeartbeat.detach(); + this.onTaskRunHeartbeat.detach(); // We need to close all the task run processes for (const taskRunProcess of 
this._taskRunProcesses.values()) { @@ -314,6 +315,10 @@ export class BackgroundWorker { safeDeleteFileSync(`${this.path}.map`); } + get inProgressRuns(): Array { + return Array.from(this._taskRunProcesses.keys()); + } + async initialize() { if (this._initialized) { throw new Error("Worker already initialized"); @@ -408,46 +413,142 @@ export class BackgroundWorker { // We need to notify all the task run processes that a task run has completed, // in case they are waiting for it through triggerAndWait - async taskRunCompletedNotification( - completion: TaskRunExecutionResult, - execution: TaskRunExecution - ) { + async taskRunCompletedNotification(completion: TaskRunExecutionResult) { for (const taskRunProcess of this._taskRunProcesses.values()) { - taskRunProcess.taskRunCompletedNotification(completion, execution); + taskRunProcess.taskRunCompletedNotification(completion); } } - async #initializeTaskRunProcess(payload: TaskRunExecutionPayload): Promise { + #prefixedMessage(payload: TaskRunExecutionPayload, message: string = "") { + return `[${payload.execution.run.id}.${payload.execution.attempt.number}] ${message}`; + } + + async #getFreshTaskRunProcess( + payload: TaskRunExecutionPayload, + messageId?: string + ): Promise { + logger.debug(this.#prefixedMessage(payload, "getFreshTaskRunProcess()")); + if (!this.metadata) { throw new Error("Worker not registered"); } - if (!this._taskRunProcesses.has(payload.execution.run.id)) { - const taskRunProcess = new TaskRunProcess( - payload.execution, - this.path, - { - ...this._fullEnv, - ...(payload.environment ?? 
{}), - }, - this.metadata, - this.params - ); + this._closed = false; + + logger.debug(this.#prefixedMessage(payload, "killing current task run process before attempt")); + + await this.#killCurrentTaskRunProcessBeforeAttempt(payload.execution.run.id); - taskRunProcess.onExit.attach(() => { + logger.debug(this.#prefixedMessage(payload, "creating new task run process")); + + const taskRunProcess = new TaskRunProcess( + payload.execution.run.id, + payload.execution.run.isTest, + this.path, + { + ...this._fullEnv, + ...(payload.environment ?? {}), + ...this.#readEnvVars(), + }, + this.metadata, + this.params, + messageId + ); + + taskRunProcess.onExit.attach(({ pid }) => { + logger.debug(this.#prefixedMessage(payload, "onExit()"), { pid }); + + const taskRunProcess = this._taskRunProcesses.get(payload.execution.run.id); + + // Only delete the task run process if the pid matches + if (taskRunProcess?.pid === pid) { this._taskRunProcesses.delete(payload.execution.run.id); - }); + } - taskRunProcess.onTaskHeartbeat.attach((id) => { - this.onTaskHeartbeat.post(id); - }); + if (pid) { + this._taskRunProcessesBeingKilled.delete(pid); + } + }); + + taskRunProcess.onIsBeingKilled.attach((pid) => { + if (pid) { + this._taskRunProcessesBeingKilled.add(pid); + } + }); + + taskRunProcess.onTaskHeartbeat.attach((id) => { + this.onTaskHeartbeat.post(id); + }); - await taskRunProcess.initialize(); + taskRunProcess.onTaskRunHeartbeat.attach((id) => { + this.onTaskRunHeartbeat.post(id); + }); + + await taskRunProcess.initialize(); + + this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess); + + return taskRunProcess; + } - this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess); + async #killCurrentTaskRunProcessBeforeAttempt(runId: string) { + const taskRunProcess = this._taskRunProcesses.get(runId); + + if (!taskRunProcess) { + logger.debug(`[${runId}] no current task process to kill`); + return; } - return 
this._taskRunProcesses.get(payload.execution.run.id) as TaskRunProcess; + logger.debug(`[${runId}] killing current task process`, { + pid: taskRunProcess.pid, + }); + + if (taskRunProcess.isBeingKilled) { + if (this._taskRunProcessesBeingKilled.size > 1) { + await this.#tryGracefulExit(taskRunProcess); + } else { + // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel + } + } else { + // It's not being killed, so kill it + if (this._taskRunProcessesBeingKilled.size > 0) { + await this.#tryGracefulExit(taskRunProcess); + } else { + // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around. + taskRunProcess.kill("SIGTERM", 5_000).catch(() => {}); + } + } + } + + async #tryGracefulExit( + taskRunProcess: TaskRunProcess, + kill = false, + initialSignal: number | NodeJS.Signals = "SIGTERM" + ) { + try { + const initialExit = taskRunProcess.onExit.waitFor(5_000); + + if (kill) { + taskRunProcess.kill(initialSignal); + } + + await initialExit; + } catch (error) { + logger.error("TaskRunProcess graceful kill timeout exceeded", error); + + this.#tryForcefulExit(taskRunProcess); + } + } + + async #tryForcefulExit(taskRunProcess: TaskRunProcess) { + try { + const forcedKill = taskRunProcess.onExit.waitFor(5_000); + taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + logger.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } } async cancelRun(taskRunId: string) { @@ -460,14 +561,113 @@ export class BackgroundWorker { await taskRunProcess.cancel(); } + async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload, baseURL: string) { + const attemptResponse = await this.apiClient.createTaskRunAttempt(payload.runId); + + if (!attemptResponse.success) { + throw new Error(`Failed to create task run attempt: ${attemptResponse.error}`); + } + + const execution 
= attemptResponse.data; + + const completion = await this.executeTaskRun( + { execution, traceContext: payload.traceContext, environment: payload.environment }, + baseURL, + payload.messageId + ); + + return { execution, completion }; + } + // We need to fork the process before we can execute any tasks - async executeTaskRun(payload: TaskRunExecutionPayload): Promise { + async executeTaskRun( + payload: TaskRunExecutionPayload, + baseURL: string, + messageId?: string + ): Promise { + if (this._closed) { + throw new Error("Worker is closed"); + } + + if (!this.metadata) { + throw new Error("Worker not registered"); + } + + const { execution } = payload; + // ○ Mar 27 09:17:25.653 -> View logs | 20240326.20 | create-avatar | run_slufhjdfiv8ejnrkw9dsj.1 + + const logsUrl = `${baseURL}/runs/${execution.run.id}`; + + const pipe = chalkGrey("|"); + const bullet = chalkGrey("○"); + const link = chalkLink(terminalLink("View logs", logsUrl)); + let timestampPrefix = chalkGrey(prettyPrintDate(payload.execution.attempt.startedAt)); + const workerPrefix = chalkWorker(this.metadata.version); + const taskPrefix = chalkTask(execution.task.id); + const runId = chalkRun(`${execution.run.id}.${execution.attempt.number}`); + + logger.log( + `${bullet} ${timestampPrefix} ${chalkGrey( + "->" + )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId}` + ); + + const now = performance.now(); + + const completion = await this.#doExecuteTaskRun(payload, messageId); + + const elapsed = performance.now() - now; + + const retryingText = chalkGrey( + !completion.ok && completion.skippedRetrying + ? " (retrying skipped)" + : !completion.ok && completion.retry !== undefined + ? ` (retrying in ${completion.retry.delay}ms)` + : "" + ); + + const resultText = !completion.ok + ? completion.error.type === "INTERNAL_ERROR" && + (completion.error.code === TaskRunErrorCodes.TASK_EXECUTION_ABORTED || + completion.error.code === TaskRunErrorCodes.TASK_RUN_CANCELLED) + ? 
chalkWarning("Cancelled") + : `${chalkError("Error")}${retryingText}` + : chalkSuccess("Success"); + + const errorText = !completion.ok + ? formatErrorLog(completion.error) + : "retry" in completion + ? `retry in ${completion.retry}ms` + : ""; + + const elapsedText = chalkGrey(`(${formatDurationMilliseconds(elapsed, { style: "short" })})`); + + timestampPrefix = chalkGrey(prettyPrintDate()); + + logger.log( + `${bullet} ${timestampPrefix} ${chalkGrey( + "->" + )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId} ${pipe} ${resultText} ${elapsedText}${errorText}` + ); + + return completion; + } + + async #doExecuteTaskRun( + payload: TaskRunExecutionPayload, + messageId?: string + ): Promise { try { - const taskRunProcess = await this.#initializeTaskRunProcess(payload); + const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId); + + logger.debug(this.#prefixedMessage(payload, "executing task run"), { + pid: taskRunProcess.pid, + }); + const result = await taskRunProcess.executeTaskRun(payload); - // Kill the worker if the task was successful or if it's not going to be retried); - await taskRunProcess.cleanup(result.ok || result.retry === undefined); + // Always kill the worker + await taskRunProcess.cleanup(true); if (result.ok) { return result; @@ -568,6 +768,7 @@ class TaskRunProcess { }); private _sender: ZodMessageSender; private _child: ChildProcess | undefined; + private _childPid?: number; private _attemptPromises: Map< string, { resolver: (value: TaskRunExecutionResult) => void; rejecter: (err?: any) => void } @@ -576,15 +777,23 @@ class TaskRunProcess { private _currentExecution: TaskRunExecution | undefined; private _isBeingKilled: boolean = false; private _isBeingCancelled: boolean = false; + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); - public onExit: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); + public onExit: Evt<{ code: number | null; 
signal: NodeJS.Signals | null; pid?: number }> = + new Evt(); + public onIsBeingKilled: Evt = new Evt(); constructor( - private execution: TaskRunExecution, + private runId: string, + private isTest: boolean, private path: string, private env: NodeJS.ProcessEnv, private metadata: BackgroundWorkerProperties, - private worker: BackgroundWorkerParams + private worker: BackgroundWorkerParams, + private messageId?: string ) { this._sender = new ZodMessageSender({ schema: workerToChildMessages, @@ -604,7 +813,7 @@ class TaskRunProcess { async initialize() { const fullEnv = { - ...(this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), + ...(this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), ...this.env, OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({ [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir, @@ -615,7 +824,7 @@ class TaskRunProcess { const cwd = dirname(this.path); - logger.debug(`[${this.execution.run.id}] initializing task run process`, { + logger.debug(`[${this.runId}] initializing task run process`, { env: fullEnv, path: this.path, cwd, @@ -629,6 +838,7 @@ class TaskRunProcess { ? ["--inspect-brk", "--trace-uncaught", "--no-warnings=ExperimentalWarning"] : ["--trace-uncaught", "--no-warnings=ExperimentalWarning"], }); + this._childPid = this._child?.pid; this._child.on("message", this.#handleMessage.bind(this)); this._child.on("exit", this.#handleExit.bind(this)); @@ -641,19 +851,28 @@ class TaskRunProcess { return; } - logger.debug(`[${this.execution.run.id}] cleaning up task run process`, { kill }); + if (kill) { + this._isBeingKilled = true; + this.onIsBeingKilled.post(this._child?.pid); + } + + logger.debug(`[${this.runId}] cleaning up task run process`, { kill, pid: this.pid }); await this._sender.send("CLEANUP", { flush: true, kill, }); - this._isBeingKilled = kill; + // FIXME: Something broke READY_TO_DISPOSE. We never receive it, so we always have to kill the process after the timeout below. 
+ + if (!kill) { + return; + } // Set a timeout to kill the child process if it hasn't been killed within 5 seconds setTimeout(() => { if (this._child && !this._child.killed) { - logger.debug(`[${this.execution.run.id}] killing task run process after timeout`); + logger.debug(`[${this.runId}] killing task run process after timeout`, { pid: this.pid }); this._child.kill(); } @@ -691,24 +910,23 @@ class TaskRunProcess { return result; } - taskRunCompletedNotification(completion: TaskRunExecutionResult, execution: TaskRunExecution) { + taskRunCompletedNotification(completion: TaskRunExecutionResult) { if (!completion.ok && typeof completion.retry !== "undefined") { return; } - if (execution.run.id === this.execution.run.id) { + if (completion.id === this.runId) { // We don't need to notify the task run process if it's the same as the one we're running return; } - logger.debug(`[${this.execution.run.id}] task run completed notification`, { + logger.debug(`[${this.runId}] task run completed notification`, { completion, - execution, }); this._sender.send("TASK_RUN_COMPLETED_NOTIFICATION", { + version: "v2", completion, - execution, }); } @@ -740,14 +958,18 @@ class TaskRunProcess { break; } case "READY_TO_DISPOSE": { - logger.debug(`[${this.execution.run.id}] task run process is ready to dispose`); + logger.debug(`[${this.runId}] task run process is ready to dispose`); this.#kill(); break; } case "TASK_HEARTBEAT": { - this.onTaskHeartbeat.post(message.payload.id); + if (this.messageId) { + this.onTaskRunHeartbeat.post(this.messageId); + } else { + this.onTaskHeartbeat.post(message.payload.id); + } break; } @@ -757,8 +979,8 @@ class TaskRunProcess { } } - async #handleExit(code: number) { - logger.debug(`[${this.execution.run.id}] task run process exiting`, { code }); + async #handleExit(code: number | null, signal: NodeJS.Signals | null) { + logger.debug(`[${this.runId}] handle task run process exit`, { code, signal, pid: this.pid }); // Go through all the attempts 
currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { @@ -778,12 +1000,12 @@ class TaskRunProcess { } else if (this._isBeingKilled) { rejecter(new CleanupProcessError()); } else { - rejecter(new UnexpectedExitError(code)); + rejecter(new UnexpectedExitError(code ?? -1)); } } } - this.onExit.post(code); + this.onExit.post({ code, signal, pid: this.pid }); } #handleLog(data: Buffer) { @@ -823,10 +1045,54 @@ class TaskRunProcess { } #kill() { - if (this._child && !this._child.killed) { - logger.debug(`[${this.execution.run.id}] killing task run process`); + logger.debug(`[${this.runId}] #kill()`, { pid: this.pid }); + if (this._child && !this._child.killed) { this._child?.kill(); } } + + async kill(signal?: number | NodeJS.Signals, timeoutInMs?: number) { + logger.debug(`[${this.runId}] killing task run process`, { + signal, + timeoutInMs, + pid: this.pid, + }); + + this._isBeingKilled = true; + + const killTimeout = this.onExit.waitFor(timeoutInMs); + + this.onIsBeingKilled.post(this._child?.pid); + this._child?.kill(signal); + + if (timeoutInMs) { + await killTimeout; + } + } + + get isBeingKilled() { + return this._isBeingKilled || this._child?.killed; + } + + get pid() { + return this._childPid; + } +} + +function formatErrorLog(error: TaskRunError) { + switch (error.type) { + case "INTERNAL_ERROR": { + return ""; + } + case "STRING_ERROR": { + return `\n\n${chalkError("X Error:")} ${error.raw}\n`; + } + case "CUSTOM_ERROR": { + return `\n\n${chalkError("X Error:")} ${error.raw}\n`; + } + case "BUILT_IN_ERROR": { + return `\n\n${error.stackTrace.replace(/^Error: /, chalkError("X Error: "))}\n`; + } + } } diff --git a/packages/cli-v3/src/workers/dev/worker-facade.ts b/packages/cli-v3/src/workers/dev/worker-facade.ts index efc65c93ddb..703ea93a12e 100644 --- a/packages/cli-v3/src/workers/dev/worker-facade.ts +++ b/packages/cli-v3/src/workers/dev/worker-facade.ts @@ -182,8 +182,17 @@ const handler = new ZodMessageHandler({ 
_isRunning = false; } }, - TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion, execution }) => { - devRuntimeManager.resumeTask(completion, execution); + TASK_RUN_COMPLETED_NOTIFICATION: async (payload) => { + switch (payload.version) { + case "v1": { + devRuntimeManager.resumeTask(payload.completion, payload.execution.run.id); + break; + } + case "v2": { + devRuntimeManager.resumeTask(payload.completion, payload.completion.id); + break; + } + } }, CLEANUP: async ({ flush, kill }) => { if (kill) { @@ -215,7 +224,7 @@ sender.send("TASKS_READY", { tasks: TASK_METADATA }).catch((err) => { process.title = "trigger-dev-worker"; -async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 5) { +async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 30) { async function _doHeartbeat() { while (true) { if (_isRunning && _execution) { diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 8f2bbe897a1..2b0c523e7b4 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -11,6 +11,7 @@ import { TaskRunBuiltInError, TaskRunErrorCodes, TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, WaitReason, @@ -20,31 +21,15 @@ import { ZodIpcConnection } from "@trigger.dev/core/v3/zodIpc"; import type { InferSocketMessageSchema } from "@trigger.dev/core/v3/zodSocket"; import { Evt } from "evt"; import { ChildProcess, fork } from "node:child_process"; -import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors"; - -class UnexpectedExitError extends Error { - constructor(public code: number) { - super(`Unexpected exit with code ${code}`); - - this.name = "UnexpectedExitError"; - } -} - -class CleanupProcessError extends Error { - constructor() { - super("Cancelled"); - - this.name = 
"CleanupProcessError"; - } -} - -class CancelledProcessError extends Error { - constructor() { - super("Cancelled"); - - this.name = "CancelledProcessError"; - } -} +import { + CancelledProcessError, + CleanupProcessError, + GracefulExitTimeoutError, + SigKillTimeoutProcessError, + TaskMetadataParseError, + UncaughtExceptionError, + UnexpectedExitError, +} from "../common/errors"; type BackgroundWorkerParams = { env: Record; @@ -56,7 +41,11 @@ type BackgroundWorkerParams = { export class ProdBackgroundWorker { private _initialized: boolean = false; + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); public onWaitForBatch: Evt< InferSocketMessageSchema @@ -74,11 +63,24 @@ export class ProdBackgroundWorker { public onReadyForCheckpoint = Evt.create<{ version?: "v1" }>(); public onCancelCheckpoint = Evt.create<{ version?: "v1" | "v2"; reason?: WaitReason }>(); + public onCreateTaskRunAttempt = Evt.create<{ version?: "v1"; runId: string }>(); + public attemptCreatedNotification = Evt.create< + | { + success: false; + reason?: string; + } + | { + success: true; + execution: ProdTaskRunExecution; + } + >(); + private _onClose: Evt = new Evt(); public tasks: Array = []; _taskRunProcess: TaskRunProcess | undefined; + private _taskRunProcessesBeingKilled: Map = new Map(); private _closed: boolean = false; @@ -87,7 +89,9 @@ export class ProdBackgroundWorker { private params: BackgroundWorkerParams ) {} - async close() { + async close(gracefulExitTimeoutElapsed = false) { + console.log("Closing worker", { gracefulExitTimeoutElapsed, closed: this._closed }); + if (this._closed) { return; } @@ -95,9 +99,35 @@ export class ProdBackgroundWorker { this._closed = true; this.onTaskHeartbeat.detach(); + this.onTaskRunHeartbeat.detach(); // We need to close the task run process - await this._taskRunProcess?.cleanup(true); + await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed); 
+ } + + async #killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") { + console.log("Killing task run process", { flush, initialSignal, closed: this._closed }); + + if (this._closed || !this._taskRunProcess) { + return; + } + + if (flush) { + await this.flushTelemetry(); + } + + const currentTaskRunProcess = this._taskRunProcess; + + // Try graceful exit but don't wait. We limit the amount of processes during creation instead. + this.#tryGracefulExit(currentTaskRunProcess, true, initialSignal).catch((error) => { + console.error("Error while trying graceful exit", error); + }); + + console.log("Killed task run process, setting closed to true", { + closed: this._closed, + pid: currentTaskRunProcess.pid, + }); + this._closed = true; } async flushTelemetry() { @@ -193,83 +223,191 @@ export class ProdBackgroundWorker { // We need to notify all the task run processes that a task run has completed, // in case they are waiting for it through triggerAndWait - async taskRunCompletedNotification( - completion: TaskRunExecutionResult, - execution: TaskRunExecution - ) { - this._taskRunProcess?.taskRunCompletedNotification(completion, execution); + async taskRunCompletedNotification(completion: TaskRunExecutionResult) { + this._taskRunProcess?.taskRunCompletedNotification(completion); } async waitCompletedNotification() { this._taskRunProcess?.waitCompletedNotification(); } - async #initializeTaskRunProcess(payload: ProdTaskRunExecutionPayload): Promise { + async #getFreshTaskRunProcess( + payload: ProdTaskRunExecutionPayload, + messageId?: string + ): Promise { const metadata = this.getMetadata( payload.execution.worker.id, payload.execution.worker.version ); - if (!this._taskRunProcess) { - const taskRunProcess = new TaskRunProcess( - payload.execution, - this.path, - { - ...this.params.env, - ...(payload.environment ?? 
{}), - }, - metadata, - this.params - ); + console.log("Getting fresh task run process, setting closed to false", { + closed: this._closed, + }); + this._closed = false; + + await this.#killCurrentTaskRunProcessBeforeAttempt(); + + const taskRunProcess = new TaskRunProcess( + payload.execution.run.id, + payload.execution.run.isTest, + this.path, + { + ...this.params.env, + ...(payload.environment ?? {}), + }, + metadata, + this.params, + messageId + ); - taskRunProcess.onExit.attach(() => { + taskRunProcess.onExit.attach(({ pid }) => { + console.log("Task run process exited", { pid }); + + // Only delete the task run process if the pid matches + if (this._taskRunProcess?.pid === pid) { this._taskRunProcess = undefined; - }); + } - taskRunProcess.onTaskHeartbeat.attach((id) => { - this.onTaskHeartbeat.post(id); - }); + if (pid) { + this._taskRunProcessesBeingKilled.delete(pid); + } + }); - taskRunProcess.onWaitForBatch.attach((message) => { - this.onWaitForBatch.post(message); - }); + taskRunProcess.onIsBeingKilled.attach((taskRunProcess) => { + if (taskRunProcess?.pid) { + this._taskRunProcessesBeingKilled.set(taskRunProcess.pid, taskRunProcess); + } + }); - taskRunProcess.onWaitForDuration.attach((message) => { - this.onWaitForDuration.post(message); - }); + taskRunProcess.onTaskHeartbeat.attach((id) => { + this.onTaskHeartbeat.post(id); + }); - taskRunProcess.onWaitForTask.attach((message) => { - this.onWaitForTask.post(message); - }); + taskRunProcess.onTaskRunHeartbeat.attach((id) => { + this.onTaskRunHeartbeat.post(id); + }); - taskRunProcess.onReadyForCheckpoint.attach((message) => { - this.onReadyForCheckpoint.post(message); - }); + taskRunProcess.onWaitForBatch.attach((message) => { + this.onWaitForBatch.post(message); + }); - taskRunProcess.onCancelCheckpoint.attach((message) => { - this.onCancelCheckpoint.post(message); - }); + taskRunProcess.onWaitForDuration.attach((message) => { + this.onWaitForDuration.post(message); + }); - // Notify down the chain - 
this.preCheckpointNotification.attach((message) => { - taskRunProcess.preCheckpointNotification.post(message); - }); - this.checkpointCanceledNotification.attach((message) => { - taskRunProcess.checkpointCanceledNotification.post(message); - }); + taskRunProcess.onWaitForTask.attach((message) => { + this.onWaitForTask.post(message); + }); - await taskRunProcess.initialize(); + taskRunProcess.onReadyForCheckpoint.attach((message) => { + this.onReadyForCheckpoint.post(message); + }); - this._taskRunProcess = taskRunProcess; - } + taskRunProcess.onCancelCheckpoint.attach((message) => { + this.onCancelCheckpoint.post(message); + }); + + // Notify down the chain + this.preCheckpointNotification.attach((message) => { + taskRunProcess.preCheckpointNotification.post(message); + }); + this.checkpointCanceledNotification.attach((message) => { + taskRunProcess.checkpointCanceledNotification.post(message); + }); + + await taskRunProcess.initialize(); + + this._taskRunProcess = taskRunProcess; return this._taskRunProcess; } - // We need to fork the process before we can execute any tasks - async executeTaskRun(payload: ProdTaskRunExecutionPayload): Promise { + async forceKillOldTaskRunProcesses() { + for (const taskRunProcess of this._taskRunProcessesBeingKilled.values()) { + try { + await taskRunProcess.kill("SIGKILL"); + } catch (error) { + console.error("Error while force killing old task run processes", error); + } + } + } + + async #killCurrentTaskRunProcessBeforeAttempt() { + console.log("killCurrentTaskRunProcessBeforeAttempt()", { + hasTaskRunProcess: !!this._taskRunProcess, + }); + + if (!this._taskRunProcess) { + return; + } + + const currentTaskRunProcess = this._taskRunProcess; + + console.log("Killing current task run process", { + isBeingKilled: currentTaskRunProcess?.isBeingKilled, + totalBeingKilled: this._taskRunProcessesBeingKilled.size, + }); + + if (currentTaskRunProcess.isBeingKilled) { + if (this._taskRunProcessesBeingKilled.size > 1) { + await 
this.#tryGracefulExit(currentTaskRunProcess); + } else { + // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel + } + } else { + // It's not being killed, so kill it + if (this._taskRunProcessesBeingKilled.size > 0) { + await this.#tryGracefulExit(currentTaskRunProcess); + } else { + // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around. + currentTaskRunProcess.kill("SIGTERM", 5_000).catch(() => {}); + } + } + } + + async #tryGracefulExit( + taskRunProcess: TaskRunProcess, + kill = false, + initialSignal: number | NodeJS.Signals = "SIGTERM" + ) { + try { + const initialExit = taskRunProcess.onExit.waitFor(5_000); + + if (kill) { + taskRunProcess.kill(initialSignal); + } + + await initialExit; + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + this.#tryForcefulExit(taskRunProcess); + } + } + + async #tryForcefulExit(taskRunProcess: TaskRunProcess) { + try { + const forcedKill = taskRunProcess.onExit.waitFor(5_000); + taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + + // We need to fork the process before we can execute any tasks, use a fresh process for each execution + async executeTaskRun( + payload: ProdTaskRunExecutionPayload, + messageId?: string + ): Promise { try { - const taskRunProcess = await this.#initializeTaskRunProcess(payload); + const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId); + + console.log("executing task run", { + attempt: payload.execution.attempt.id, + taskRunPid: taskRunProcess.pid, + }); const result = await taskRunProcess.executeTaskRun(payload); @@ -326,6 +464,31 @@ export class ProdBackgroundWorker { }; } + if (e instanceof SigKillTimeoutProcessError) { + return { + id: 
payload.execution.attempt.id, + ok: false, + retry: undefined, + error: { + type: "INTERNAL_ERROR", + code: TaskRunErrorCodes.TASK_PROCESS_SIGKILL_TIMEOUT, + }, + }; + } + + if (e instanceof GracefulExitTimeoutError) { + return { + id: payload.execution.attempt.id, + ok: false, + retry: undefined, + error: { + type: "INTERNAL_ERROR", + code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT, + message: "Worker process killed while attempt in progress.", + }, + }; + } + return { id: payload.execution.attempt.id, ok: false, @@ -335,11 +498,52 @@ export class ProdBackgroundWorker { code: TaskRunErrorCodes.TASK_EXECUTION_FAILED, }, }; + } finally { + await this.#killTaskRunProcess(); } } async cancelAttempt(attemptId: string) { - await this._taskRunProcess?.cancel(); + if (!this._taskRunProcess) { + console.error("No task run process to cancel attempt", { attemptId }); + return; + } + + await this._taskRunProcess.cancel(); + } + + async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload) { + // Post to coordinator + this.onCreateTaskRunAttempt.post({ runId: payload.runId }); + + let execution: ProdTaskRunExecution; + + try { + // ..and wait for response + const attemptCreated = await this.attemptCreatedNotification.waitFor(30_000); + + if (!attemptCreated.success) { + throw new Error( + `Failed to create attempt${attemptCreated.reason ? 
`: ${attemptCreated.reason}` : ""}` + ); + } + + execution = attemptCreated.execution; + } catch (error) { + console.error("Error while creating attempt", error); + throw new Error(`Failed to create task run attempt: ${error}`); + } + + const completion = await this.executeTaskRun( + { + execution, + traceContext: payload.traceContext, + environment: payload.environment, + }, + payload.messageId + ); + + return { execution, completion }; } async #correctError( @@ -359,6 +563,7 @@ class TaskRunProcess { typeof ProdWorkerToChildMessages >; private _child?: ChildProcess; + private _childPid?: number; private _attemptPromises: Map< string, @@ -368,9 +573,16 @@ class TaskRunProcess { private _currentExecution: TaskRunExecution | undefined; private _isBeingKilled: boolean = false; private _isBeingCancelled: boolean = false; + private _gracefulExitTimeoutElapsed: boolean = false; + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); - public onExit: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); + public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null; pid?: number }> = + new Evt(); + public onIsBeingKilled: Evt = new Evt(); public onWaitForBatch: Evt< InferSocketMessageSchema @@ -389,18 +601,20 @@ class TaskRunProcess { public onCancelCheckpoint = Evt.create<{ version?: "v1" | "v2"; reason?: WaitReason }>(); constructor( - private execution: ProdTaskRunExecution, + private runId: string, + private isTest: boolean, private path: string, private env: NodeJS.ProcessEnv, private metadata: BackgroundWorkerProperties, - private worker: BackgroundWorkerParams + private worker: BackgroundWorkerParams, + private messageId?: string ) {} async initialize() { this._child = fork(this.path, { stdio: [/*stdin*/ "ignore", /*stdout*/ "pipe", /*stderr*/ "pipe", "ipc"], env: { - ...(this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), + ...(this.isTest ? 
{ TRIGGER_LOG_LEVEL: "debug" } : {}), ...this.env, OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({ [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir, @@ -408,6 +622,7 @@ class TaskRunProcess { ...(this.worker.debugOtel ? { OTEL_LOG_LEVEL: "debug" } : {}), }, }); + this._childPid = this._child?.pid; this._ipc = new ZodIpcConnection({ listenSchema: ProdChildToWorkerMessages, @@ -439,7 +654,11 @@ class TaskRunProcess { process.exit(0); }, TASK_HEARTBEAT: async (message) => { - this.onTaskHeartbeat.post(message.id); + if (this.messageId) { + this.onTaskRunHeartbeat.post(this.messageId); + } else { + this.onTaskHeartbeat.post(message.id); + } }, TASKS_READY: async (message) => {}, WAIT_FOR_TASK: async (message) => { @@ -513,17 +732,38 @@ class TaskRunProcess { await this.cleanup(true); } - async cleanup(kill: boolean = false) { + async cleanup(kill = false, gracefulExitTimeoutElapsed = false) { + console.log("cleanup()", { kill, gracefulExitTimeoutElapsed }); + if (kill && this._isBeingKilled) { return; } - this._isBeingKilled = kill; + if (kill) { + this._isBeingKilled = true; + this.onIsBeingKilled.post(this); + } + + const killChildProcess = gracefulExitTimeoutElapsed && !!this._currentExecution; + + // Kill parent unless graceful exit timeout has elapsed and we're in the middle of an execution + const killParentProcess = kill && !killChildProcess; + + console.log("Cleaning up task run process", { + killChildProcess, + killParentProcess, + }); await this._ipc?.sendWithAck("CLEANUP", { flush: true, - kill, + kill: killParentProcess, }); + + if (killChildProcess) { + this._gracefulExitTimeoutElapsed = true; + // Kill the child process + await this.kill("SIGKILL"); + } } async executeTaskRun(payload: TaskRunExecutionPayload): Promise { @@ -559,15 +799,15 @@ class TaskRunProcess { return result; } - taskRunCompletedNotification(completion: TaskRunExecutionResult, execution: TaskRunExecution) { + taskRunCompletedNotification(completion: 
TaskRunExecutionResult) { if (!completion.ok && typeof completion.retry !== "undefined") { return; } if (this._child?.connected && !this._isBeingKilled && !this._child.killed) { this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", { + version: "v2", completion, - execution, }); } } @@ -578,10 +818,14 @@ class TaskRunProcess { } } - async #handleExit(code: number) { + async #handleExit(code: number | null, signal: NodeJS.Signals | null) { + console.log("handling child exit", { code, signal }); + // Go through all the attempts currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { if (status === "PENDING") { + console.log("found pending attempt", { id }); + this._attemptStatuses.set(id, "REJECTED"); const attemptPromise = this._attemptPromises.get(id); @@ -594,15 +838,18 @@ class TaskRunProcess { if (this._isBeingCancelled) { rejecter(new CancelledProcessError()); + } else if (this._gracefulExitTimeoutElapsed) { + // Order matters, this has to be before the graceful exit timeout + rejecter(new GracefulExitTimeoutError()); } else if (this._isBeingKilled) { rejecter(new CleanupProcessError()); } else { - rejecter(new UnexpectedExitError(code)); + rejecter(new UnexpectedExitError(code ?? 
-1)); } } } - this.onExit.post(code); + this.onExit.post({ code, signal, pid: this.pid }); } #handleLog(data: Buffer) { @@ -635,9 +882,24 @@ class TaskRunProcess { ); } - #kill() { - if (this._child && !this._child.killed) { - this._child?.kill(); + async kill(signal?: number | NodeJS.Signals, timeoutInMs?: number) { + this._isBeingKilled = true; + + const killTimeout = this.onExit.waitFor(timeoutInMs); + + this.onIsBeingKilled.post(this); + this._child?.kill(signal); + + if (timeoutInMs) { + await killTimeout; } } + + get isBeingKilled() { + return this._isBeingKilled || this._child?.killed; + } + + get pid() { + return this._childPid; + } } diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index e8b35d61e45..d59132e83d5 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -5,6 +5,7 @@ import { PreStopCauses, ProdWorkerToCoordinatorMessages, TaskResource, + TaskRunFailedExecutionResult, WaitReason, } from "@trigger.dev/core/v3"; import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket"; @@ -60,8 +61,89 @@ class ProdWorker { process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM")); this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST); + this.#backgroundWorker = this.#createBackgroundWorker(); - this.#backgroundWorker = new ProdBackgroundWorker("worker.js", { + this.#httpPort = port; + this.#httpServer = this.#createHttpServer(); + } + + async #handleSignal(signal: NodeJS.Signals) { + logger.log("Received signal", { signal }); + + if (signal === "SIGTERM") { + let gracefulExitTimeoutElapsed = false; + + if (this.executing) { + const terminationGracePeriodSeconds = 60 * 60; + + logger.log("Waiting for attempt to complete before exiting", { + terminationGracePeriodSeconds, + }); + + // Wait for termination grace period minus 5s to give cleanup a chance to complete + await 
setTimeout(terminationGracePeriodSeconds * 1000 - 5000); + gracefulExitTimeoutElapsed = true; + + logger.log("Termination timeout reached, exiting gracefully."); + } else { + logger.log("Not executing, exiting immediately."); + } + + await this.#exitGracefully(gracefulExitTimeoutElapsed); + return; + } + + logger.log("Unhandled signal", { signal }); + } + + async #exitGracefully(gracefulExitTimeoutElapsed = false) { + await this.#backgroundWorker.close(gracefulExitTimeoutElapsed); + + if (!gracefulExitTimeoutElapsed) { + // TODO: Maybe add a sensible timeout instead of a conditional to avoid zombies + process.exit(0); + } + } + + async #reconnect(isPostStart = false, reconnectImmediately = false) { + if (isPostStart) { + this.waitForPostStart = false; + } + + this.#coordinatorSocket.close(); + + if (!reconnectImmediately) { + await setTimeout(1000); + } + + let coordinatorHost = COORDINATOR_HOST; + + try { + if (this.runningInKubernetes) { + coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace( + "\n", + "" + ); + + logger.log("reconnecting", { + coordinatorHost: { + fromEnv: COORDINATOR_HOST, + fromVolume: coordinatorHost, + current: this.#coordinatorSocket.socket.io.opts.hostname, + }, + }); + } + } catch (error) { + logger.error("taskinfo read error during reconnect", { + error: error instanceof Error ? 
error.message : error, + }); + } finally { + this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost); + } + } + + #createBackgroundWorker() { + const backgroundWorker = new ProdBackgroundWorker("worker.js", { projectConfig: __PROJECT_CONFIG__, env: { ...gatherProcessEnv(), @@ -73,19 +155,24 @@ class ProdWorker { contentHash: this.contentHash, }); - this.#backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => { + backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => { // TODO: Switch to .send() once coordinator uses zod handler for all messages this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId }); }); - this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => { - // Flush before checkpointing so we don't flush the same spans again after restore - await this.#backgroundWorker.flushTelemetry(); + backgroundWorker.onTaskRunHeartbeat.attach((runId) => { + this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId }); + }); + + // Currently, this is only used for duration waits + backgroundWorker.onReadyForCheckpoint.attach(async (message) => { + await this.#prepareForCheckpoint(); + this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); }); // Currently, this is only used for duration waits. Might need adjusting for other use cases. 
- this.#backgroundWorker.onCancelCheckpoint.attach(async (message) => { + backgroundWorker.onCancelCheckpoint.attach(async (message) => { logger.log("onCancelCheckpoint", { message }); const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck( @@ -96,6 +183,8 @@ class ProdWorker { } ); + logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled }); + if (checkpointCanceled) { if (message.reason === "WAIT_FOR_DURATION") { // Worker will resume immediately @@ -105,12 +194,52 @@ class ProdWorker { } } - this.#backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled }); + backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled }); + }); + + backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => { + logger.log("onCreateTaskRunAttempt()", { message }); + + const createAttempt = await this.#coordinatorSocket.socket.emitWithAck( + "CREATE_TASK_RUN_ATTEMPT", + { + version: "v1", + runId: message.runId, + } + ); + + if (!createAttempt.success) { + backgroundWorker.attemptCreatedNotification.post({ + success: false, + reason: createAttempt.reason, + }); + return; + } + + backgroundWorker.attemptCreatedNotification.post({ + success: true, + execution: createAttempt.executionPayload.execution, + }); + }); + + backgroundWorker.attemptCreatedNotification.attach((message) => { + if (!message.success) { + return; + } + + // Workers with lazy attempt support set their friendly ID here + this.attemptFriendlyId = message.execution.attempt.id; }); - this.#backgroundWorker.onWaitForDuration.attach(async (message) => { + backgroundWorker.onWaitForDuration.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); + + this.#emitUnrecoverableError( + "NoAttemptId", + "Attempt ID not set before waiting for duration" + ); + return; } @@ -125,9 +254,12 @@ class ProdWorker { this.#prepareForWait("WAIT_FOR_DURATION", 
willCheckpointAndRestore); }); - this.#backgroundWorker.onWaitForTask.attach(async (message) => { + backgroundWorker.onWaitForTask.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); + + this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for task"); + return; } @@ -142,9 +274,12 @@ class ProdWorker { this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore); }); - this.#backgroundWorker.onWaitForBatch.attach(async (message) => { + backgroundWorker.onWaitForBatch.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); + + this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch"); + return; } @@ -159,73 +294,7 @@ class ProdWorker { this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore); }); - this.#httpPort = port; - this.#httpServer = this.#createHttpServer(); - } - - async #handleSignal(signal: NodeJS.Signals) { - logger.log("Received signal", { signal }); - - if (signal === "SIGTERM") { - if (this.executing) { - const terminationGracePeriodSeconds = 60 * 60; - - logger.log("Waiting for attempt to complete before exiting", { - terminationGracePeriodSeconds, - }); - - // Wait for termination grace period minus 5s to give cleanup a chance to complete - await setTimeout(terminationGracePeriodSeconds * 1000 - 5000); - - logger.log("Termination timeout reached, exiting gracefully."); - } else { - logger.log("Not executing, exiting immediately."); - } - - await this.#exitGracefully(); - } - - logger.log("Unhandled signal", { signal }); - } - - async #exitGracefully() { - await this.#backgroundWorker.close(); - process.exit(0); - } - - async #reconnect(isPostStart = false, reconnectImmediately = false) { - if (isPostStart) { - this.waitForPostStart = false; - } - - this.#coordinatorSocket.close(); - - if 
(!reconnectImmediately) { - await setTimeout(1000); - } - - let coordinatorHost = COORDINATOR_HOST; - - try { - if (this.runningInKubernetes) { - coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace( - "\n", - "" - ); - - logger.log("reconnecting", { - coordinatorHost: { - fromEnv: COORDINATOR_HOST, - fromVolume: coordinatorHost, - current: this.#coordinatorSocket.socket.io.opts.hostname, - }, - }); - } - } catch (error) { - logger.error("taskinfo read error during reconnect", { error }); - } finally { - this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost); - } + return backgroundWorker; } async #prepareForWait(reason: WaitReason, willCheckpointAndRestore: boolean) { @@ -239,9 +308,8 @@ class ProdWorker { this.waitForPostStart = true; if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") { - // Flush before checkpointing so we don't flush the same spans again after restore // Duration waits do this via the "ready for checkpoint" event instead - await this.#backgroundWorker.flushTelemetry(); + await this.#prepareForCheckpoint(); } } } @@ -256,18 +324,36 @@ class ProdWorker { } await this.#exitGracefully(); + return; } + // Clear state for next execution + this.paused = false; + this.waitForPostStart = false; this.executing = false; this.attemptFriendlyId = undefined; if (willCheckpointAndRestore) { this.waitForPostStart = true; + + // We already flush after completion, so we don't need to do it here + this.#prepareForCheckpoint(false); + this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); return; } } + async #prepareForCheckpoint(flush = true) { + if (flush) { + // Flush before checkpointing so we don't flush the same spans again after restore + await this.#backgroundWorker.flushTelemetry(); + } + + // Kill the previous worker process to prevent large checkpoints + await this.#backgroundWorker.forceKillOldTaskRunProcesses(); + } + #resumeAfterDuration() { this.paused = false; 
this.nextResumeAfter = undefined; @@ -303,11 +389,8 @@ class ProdWorker { extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId; } - logger.log("connecting to coordinator", { - host, - port: COORDINATOR_PORT, - extraHeaders, - }); + logger.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`); + logger.debug(`connecting with extra headers`, { extraHeaders }); const coordinatorConnection = new ZodSocketConnection({ namespace: "prod-worker", @@ -317,28 +400,14 @@ class ProdWorker { serverMessages: CoordinatorToProdWorkerMessages, extraHeaders, handlers: { - RESUME_AFTER_DEPENDENCY: async (message) => { + RESUME_AFTER_DEPENDENCY: async ({ completions }) => { if (!this.paused) { - logger.error("worker not paused", { - completions: message.completions, - executions: message.executions, - }); - return; - } - - if (message.completions.length !== message.executions.length) { - logger.error("did not receive the same number of completions and executions", { - completions: message.completions, - executions: message.executions, - }); + logger.error("Failed to resume after dependency: Worker not paused"); return; } - if (message.completions.length === 0 || message.executions.length === 0) { - logger.error("no completions or executions", { - completions: message.completions, - executions: message.executions, - }); + if (completions.length === 0) { + logger.error("Failed to resume after dependency: No completions"); return; } @@ -346,17 +415,19 @@ class ProdWorker { this.nextResumeAfter !== "WAIT_FOR_TASK" && this.nextResumeAfter !== "WAIT_FOR_BATCH" ) { - logger.error("not waiting to resume after dependency", { + logger.error("Failed to resume after dependency: Invalid next resume", { nextResumeAfter: this.nextResumeAfter, }); return; } - if (this.nextResumeAfter === "WAIT_FOR_TASK" && message.completions.length > 1) { - logger.error("waiting for single task but got multiple completions", { - completions: message.completions, - executions: 
message.executions, - }); + if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) { + logger.error( + "Failed to resume after dependency: Waiting for single task but got multiple completions", + { + completions: completions, + } + ); return; } @@ -364,13 +435,12 @@ class ProdWorker { this.nextResumeAfter = undefined; this.waitForPostStart = false; - for (let i = 0; i < message.completions.length; i++) { - const completion = message.completions[i]; - const execution = message.executions[i]; + for (let i = 0; i < completions.length; i++) { + const completion = completions[i]; - if (!completion || !execution) continue; + if (!completion) continue; - this.#backgroundWorker.taskRunCompletedNotification(completion, execution); + this.#backgroundWorker.taskRunCompletedNotification(completion); } }, RESUME_AFTER_DURATION: async (message) => { @@ -420,14 +490,75 @@ class ProdWorker { this.#prepareForRetry(willCheckpointAndRestore, shouldExit); }, + EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => { + if (this.executing) { + logger.error("dropping execute request, already executing"); + return; + } + + this.executing = true; + + try { + const { completion, execution } = + await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload); + + logger.log("completed", completion); + + this.completed.add(execution.attempt.id); + + const { willCheckpointAndRestore, shouldExit } = + await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", { + version: "v1", + execution, + completion, + }); + + logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit }); + + this.#prepareForRetry(willCheckpointAndRestore, shouldExit); + } catch (error) { + const completion: TaskRunFailedExecutionResult = { + ok: false, + id: message.lazyPayload.runId, + retry: undefined, + error: + error instanceof Error + ? { + type: "BUILT_IN_ERROR", + name: error.name, + message: error.message, + stackTrace: error.stack ?? 
"", + } + : { + type: "BUILT_IN_ERROR", + name: "UnknownError", + message: String(error), + stackTrace: "", + }, + }; + + this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", { + version: "v1", + completion, + }); + } + }, REQUEST_ATTEMPT_CANCELLATION: async (message) => { if (!this.executing) { + logger.log("dropping cancel request, not executing", { status: this.#status }); return; } + logger.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status }); + await this.#backgroundWorker.cancelAttempt(message.attemptId); }, - REQUEST_EXIT: async () => { + REQUEST_EXIT: async (message) => { + if (message.version === "v2" && message.delayInMs) { + logger.log("exit requested with delay", { delayInMs: message.delayInMs }); + await setTimeout(message.delayInMs); + } + this.#coordinatorSocket.close(); process.exit(0); }, @@ -436,7 +567,7 @@ class ProdWorker { return; } - this.#coordinatorSocket.socket.emit("READY_FOR_EXECUTION", { + this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", { version: "v1", runId: this.runId, totalCompletions: this.completed.size, @@ -444,6 +575,8 @@ class ProdWorker { }, }, onConnection: async (socket, handler, sender, logger) => { + logger.log("connected to coordinator", { status: this.#status }); + if (this.waitForPostStart) { logger.log("skip connection handler, waiting for post start hook"); return; @@ -451,11 +584,24 @@ class ProdWorker { if (this.paused) { if (!this.nextResumeAfter) { + logger.error("Missing next resume reason", { status: this.#status }); + + this.#emitUnrecoverableError( + "NoNextResume", + "Next resume reason not set while resuming from paused state" + ); + return; } if (!this.attemptFriendlyId) { - logger.error("Missing friendly ID"); + logger.error("Missing friendly ID", { status: this.#status }); + + this.#emitUnrecoverableError( + "NoAttemptId", + "Attempt ID not set while resuming from paused state" + ); + return; } @@ -473,9 +619,10 @@ class ProdWorker { const taskResources = 
await this.#initializeWorker(); const { success } = await socket.emitWithAck("INDEX_TASKS", { - version: "v1", + version: "v2", deploymentId: this.deploymentId, ...taskResources, + supportsLazyAttempts: true, }); if (success) { @@ -563,7 +710,7 @@ class ProdWorker { return; } - socket.emit("READY_FOR_EXECUTION", { + socket.emit("READY_FOR_LAZY_ATTEMPT", { version: "v1", runId: this.runId, totalCompletions: this.completed.size, @@ -601,12 +748,7 @@ class ProdWorker { } case "/status": { - return reply.json({ - executing: this.executing, - paused: this.paused, - completed: this.completed.size, - nextResumeAfter: this.nextResumeAfter, - }); + return reply.json(this.#status); } case "/connect": { @@ -768,6 +910,27 @@ class ProdWorker { return data?.variables ?? {}; } + get #status() { + return { + executing: this.executing, + paused: this.paused, + completed: this.completed.size, + nextResumeAfter: this.nextResumeAfter, + waitForPostStart: this.waitForPostStart, + attemptFriendlyId: this.attemptFriendlyId, + }; + } + + #emitUnrecoverableError(name: string, message: string) { + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name, + message, + }, + }); + } + start() { this.#httpServer.listen(this.#httpPort, this.host); } diff --git a/packages/cli-v3/src/workers/prod/worker-facade.ts b/packages/cli-v3/src/workers/prod/worker-facade.ts index 77271b80162..516f7035331 100644 --- a/packages/cli-v3/src/workers/prod/worker-facade.ts +++ b/packages/cli-v3/src/workers/prod/worker-facade.ts @@ -170,8 +170,8 @@ const zodIpc = new ZodIpcConnection({ _isRunning = false; } }, - TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion, execution }) => { - prodRuntimeManager.resumeTask(completion, execution); + TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion }) => { + prodRuntimeManager.resumeTask(completion); }, WAIT_COMPLETED_NOTIFICATION: async () => { prodRuntimeManager.resumeAfterDuration(); @@ -179,23 +179,6 @@ const zodIpc = new 
ZodIpcConnection({ CLEANUP: async ({ flush, kill }, sender) => { if (kill) { await tracingSDK.flush(); - - if (_execution) { - // Fail currently executing attempt - await sender.send("TASK_RUN_COMPLETED", { - execution: _execution, - result: { - ok: false, - id: _execution.run.id, - error: { - type: "INTERNAL_ERROR", - code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT, - message: "Worker process killed while attempt in progress.", - }, - }, - }); - } - // Now we need to exit the process await sender.send("READY_TO_DISPOSE", undefined); } else { @@ -228,7 +211,7 @@ zodIpc.send("TASKS_READY", { tasks: TASK_METADATA }).catch((err) => { process.title = "trigger-prod-worker"; -async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 5) { +async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 20) { async function _doHeartbeat() { while (true) { if (_isRunning && _execution) { diff --git a/packages/core-apps/src/provider.ts b/packages/core-apps/src/provider.ts index cb1b0c4d574..159c81dbc37 100644 --- a/packages/core-apps/src/provider.ts +++ b/packages/core-apps/src/provider.ts @@ -46,7 +46,6 @@ export interface TaskOperationsCreateOptions { orgId: string; projectId: string; runId: string; - attemptId: string; } export interface TaskOperationsRestoreOptions { @@ -129,7 +128,6 @@ export class ProviderShell implements Provider { orgId: message.data.orgId, projectId: message.data.projectId, runId: message.data.runId, - attemptId: message.data.id, }); } catch (error) { logger.error("create failed", error); diff --git a/packages/core/package.json b/packages/core/package.json index 0b49d753f60..222191a7a2e 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -37,6 +37,14 @@ "require": "./dist/v3/otel/index.js", "types": "./dist/v3/otel/index.d.ts" }, + "./v3/zodfetch": { + "import": { + "types": "./dist/v3/zodfetch.d.mts", + "default": "./dist/v3/zodfetch.mjs" + }, + "require": 
"./dist/v3/zodfetch.js", + "types": "./dist/v3/zodfetch.d.ts" + }, "./v3/zodMessageHandler": { "import": { "types": "./dist/v3/zodMessageHandler.d.mts", diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index 61ee0575b20..bc12e75d782 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -84,13 +84,13 @@ export function createJsonErrorObject(error: TaskRunError) { export function correctErrorStackTrace( stackTrace: string, projectDir?: string, - options?: { removeFirstLine?: boolean } + options?: { removeFirstLine?: boolean; isDev?: boolean } ) { const [errorLine, ...traceLines] = stackTrace.split("\n"); return [ options?.removeFirstLine ? undefined : errorLine, - ...traceLines.map((line) => correctStackTraceLine(line, projectDir)), + ...traceLines.map((line) => correctStackTraceLine(line, projectDir, options?.isDev)), ] .filter(Boolean) .join("\n"); @@ -102,17 +102,21 @@ const LINES_TO_IGNORE = [ /TaskExecutor/, /EXECUTE_TASK_RUN/, /@trigger.dev\/core/, + /packages\/core\/src\/v3/, /safeJsonProcess/, /__entryPoint.ts/, + /ZodIpc/, + /startActiveSpan/, + /processTicksAndRejections/, ]; -function correctStackTraceLine(line: string, projectDir?: string) { +function correctStackTraceLine(line: string, projectDir?: string, isDev?: boolean) { if (LINES_TO_IGNORE.some((regex) => regex.test(line))) { return; } // Check to see if the path is inside the project directory - if (projectDir && !line.includes(projectDir)) { + if (isDev && projectDir && !line.includes(projectDir)) { return; } diff --git a/packages/core/src/v3/runtime/devRuntimeManager.ts b/packages/core/src/v3/runtime/devRuntimeManager.ts index 7df2c9335cf..6209fdcb203 100644 --- a/packages/core/src/v3/runtime/devRuntimeManager.ts +++ b/packages/core/src/v3/runtime/devRuntimeManager.ts @@ -80,18 +80,18 @@ export class DevRuntimeManager implements RuntimeManager { }; } - resumeTask(completion: TaskRunExecutionResult, execution: TaskRunExecution): void { - 
const wait = this._taskWaits.get(execution.run.id); + resumeTask(completion: TaskRunExecutionResult, runId: string): void { + const wait = this._taskWaits.get(runId); if (!wait) { // We need to store the completion in case the task is awaited later - this._pendingCompletionNotifications.set(execution.run.id, completion); + this._pendingCompletionNotifications.set(runId, completion); return; } wait.resolve(completion); - this._taskWaits.delete(execution.run.id); + this._taskWaits.delete(runId); } } diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts index 622a44ed7db..19dc0fd8833 100644 --- a/packages/core/src/v3/runtime/prodRuntimeManager.ts +++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts @@ -55,10 +55,14 @@ export class ProdRuntimeManager implements RuntimeManager { this._waitForDuration = { resolve, reject }; }); - const { willCheckpointAndRestore } = await this.ipc.sendWithAck("WAIT_FOR_DURATION", { - ms, - now, - }); + const { willCheckpointAndRestore } = await this.ipc.sendWithAck( + "WAIT_FOR_DURATION", + { + ms, + now, + }, + 31_000 + ); if (!willCheckpointAndRestore) { await internalTimeout; @@ -74,18 +78,24 @@ export class ProdRuntimeManager implements RuntimeManager { // Resets the clock to the current time clock.reset(); - // The coordinator should cancel any in-progress checkpoints - const { checkpointCanceled, version } = await this.ipc.sendWithAck( - "CANCEL_CHECKPOINT", - { - version: "v2", - reason: "WAIT_FOR_DURATION", - }, - 31_000 - ); - - if (checkpointCanceled) { - // There won't be a checkpoint or external resume and we've already completed our internal timeout + try { + // The coordinator should cancel any in-progress checkpoints + const { checkpointCanceled, version } = await this.ipc.sendWithAck( + "CANCEL_CHECKPOINT", + { + version: "v2", + reason: "WAIT_FOR_DURATION", + }, + 31_000 + ); + + if (checkpointCanceled) { + // There won't be a checkpoint or external 
resume and we've already completed our internal timeout + return; + } + } catch (error) { + // If the cancellation times out, we will proceed as if the checkpoint was canceled + logger.debug("Checkpoint cancellation timed out", { error }); return; } @@ -98,19 +108,9 @@ export class ProdRuntimeManager implements RuntimeManager { return; } - process.stdout.write("pre"); - process.stdout.write(JSON.stringify(clock.preciseNow())); - - console.log("pre", clock.preciseNow()); - // Resets the clock to the current time clock.reset(); - console.log("post", clock.preciseNow()); - - process.stdout.write("post"); - process.stdout.write(JSON.stringify(clock.preciseNow())); - this._waitForDuration.resolve("external"); this._waitForDuration = undefined; } @@ -167,8 +167,8 @@ export class ProdRuntimeManager implements RuntimeManager { }; } - resumeTask(completion: TaskRunExecutionResult, execution: TaskRunExecution): void { - const wait = this._taskWaits.get(execution.run.id); + resumeTask(completion: TaskRunExecutionResult): void { + const wait = this._taskWaits.get(completion.id); if (!wait) { return; @@ -176,7 +176,7 @@ export class ProdRuntimeManager implements RuntimeManager { wait.resolve(completion); - this._taskWaits.delete(execution.run.id); + this._taskWaits.delete(completion.id); } private get waitThresholdInMs(): number { diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 23a32737c09..0a14bb2f7e7 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -41,6 +41,7 @@ export type GetProjectEnvResponse = z.infer; export const CreateBackgroundWorkerRequestBody = z.object({ localOnly: z.boolean(), metadata: BackgroundWorkerMetadata, + supportsLazyAttempts: z.boolean().optional(), }); export type CreateBackgroundWorkerRequestBody = z.infer; diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts index 58ae633c737..2362748d705 100644 --- 
a/packages/core/src/v3/schemas/common.ts +++ b/packages/core/src/v3/schemas/common.ts @@ -31,6 +31,7 @@ export const TaskRunErrorCodes = { TASK_EXECUTION_FAILED: "TASK_EXECUTION_FAILED", TASK_EXECUTION_ABORTED: "TASK_EXECUTION_ABORTED", TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", + TASK_PROCESS_SIGKILL_TIMEOUT: "TASK_PROCESS_SIGKILL_TIMEOUT", TASK_RUN_CANCELLED: "TASK_RUN_CANCELLED", TASK_OUTPUT_ERROR: "TASK_OUTPUT_ERROR", HANDLE_ERROR_ERROR: "HANDLE_ERROR_ERROR", @@ -47,10 +48,12 @@ export const TaskRunInternalError = z.object({ "TASK_EXECUTION_FAILED", "TASK_EXECUTION_ABORTED", "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", + "TASK_PROCESS_SIGKILL_TIMEOUT", "TASK_RUN_CANCELLED", "TASK_OUTPUT_ERROR", "HANDLE_ERROR_ERROR", "GRACEFUL_EXIT_TIMEOUT", + "TASK_RUN_HEARTBEAT_TIMEOUT", ]), message: z.string().optional(), }); diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index 32c66c8082e..10c20912a10 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -1,11 +1,13 @@ import { z } from "zod"; -import { TaskRunExecution, TaskRunExecutionResult } from "./common"; +import { TaskRunExecution, TaskRunExecutionResult, TaskRunFailedExecutionResult } from "./common"; + import { EnvironmentType, Machine, ProdTaskRunExecution, ProdTaskRunExecutionPayload, TaskMetadataWithFilePath, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, WaitReason, } from "./schemas"; @@ -27,13 +29,17 @@ export const BackgroundWorkerServerMessages = z.discriminatedUnion("type", [ version: z.string(), machine: Machine, // identifiers - id: z.string(), // attempt + id: z.string().optional(), // TODO: Remove this completely in a future release envId: z.string(), envType: EnvironmentType, orgId: z.string(), projectId: z.string(), runId: z.string(), }), + z.object({ + type: z.literal("EXECUTE_RUN_LAZY_ATTEMPT"), + payload: TaskRunExecutionLazyAttemptPayload, + }), 
]); export type BackgroundWorkerServerMessages = z.infer; @@ -57,11 +63,21 @@ export const BackgroundWorkerClientMessages = z.discriminatedUnion("type", [ completion: TaskRunExecutionResult, execution: TaskRunExecution, }), + z.object({ + version: z.literal("v1").default("v1"), + type: z.literal("TASK_RUN_FAILED_TO_RUN"), + completion: TaskRunFailedExecutionResult, + }), z.object({ version: z.literal("v1").default("v1"), type: z.literal("TASK_HEARTBEAT"), id: z.string(), }), + z.object({ + version: z.literal("v1").default("v1"), + type: z.literal("TASK_RUN_HEARTBEAT"), + id: z.string(), + }), ]); export type BackgroundWorkerClientMessages = z.infer; @@ -78,6 +94,7 @@ export const clientWebsocketMessages = { READY_FOR_TASKS: z.object({ version: z.literal("v1").default("v1"), backgroundWorkerId: z.string(), + inProgressRuns: z.string().array().optional(), }), BACKGROUND_WORKER_DEPRECATED: z.object({ version: z.literal("v1").default("v1"), @@ -97,11 +114,17 @@ export const workerToChildMessages = { traceContext: z.record(z.unknown()), metadata: BackgroundWorkerProperties, }), - TASK_RUN_COMPLETED_NOTIFICATION: z.object({ - version: z.literal("v1").default("v1"), - completion: TaskRunExecutionResult, - execution: TaskRunExecution, - }), + TASK_RUN_COMPLETED_NOTIFICATION: z.discriminatedUnion("version", [ + z.object({ + version: z.literal("v1"), + completion: TaskRunExecutionResult, + execution: TaskRunExecution, + }), + z.object({ + version: z.literal("v2"), + completion: TaskRunExecutionResult, + }), + ]), CLEANUP: z.object({ version: z.literal("v1").default("v1"), flush: z.boolean().default(false), @@ -142,6 +165,10 @@ export const childToWorkerMessages = { version: z.literal("v1").default("v1"), id: z.string(), }), + TASK_RUN_HEARTBEAT: z.object({ + version: z.literal("v1").default("v1"), + id: z.string(), + }), READY_TO_DISPOSE: z.undefined(), WAIT_FOR_DURATION: z.object({ version: z.literal("v1").default("v1"), @@ -182,6 +209,12 @@ export const 
ProdChildToWorkerMessages = { id: z.string(), }), }, + TASK_RUN_HEARTBEAT: { + message: z.object({ + version: z.literal("v1").default("v1"), + id: z.string(), + }), + }, READY_TO_DISPOSE: { message: z.undefined(), }, @@ -247,11 +280,17 @@ export const ProdWorkerToChildMessages = { }), }, TASK_RUN_COMPLETED_NOTIFICATION: { - message: z.object({ - version: z.literal("v1").default("v1"), - completion: TaskRunExecutionResult, - execution: TaskRunExecution, - }), + message: z.discriminatedUnion("version", [ + z.object({ + version: z.literal("v1"), + completion: TaskRunExecutionResult, + execution: TaskRunExecution, + }), + z.object({ + version: z.literal("v2"), + completion: TaskRunExecutionResult, + }), + ]), }, CLEANUP: { message: z.object({ @@ -379,6 +418,18 @@ export const PlatformToProviderMessages = { }, }; +const CreateWorkerMessage = z.object({ + projectRef: z.string(), + envId: z.string(), + deploymentId: z.string(), + metadata: z.object({ + cliPackageVersion: z.string().optional(), + contentHash: z.string(), + packageVersion: z.string(), + tasks: TaskResource.array(), + }), +}); + export const CoordinatorToPlatformMessages = { LOG: { message: z.object({ @@ -388,24 +439,38 @@ export const CoordinatorToPlatformMessages = { }), }, CREATE_WORKER: { + message: z.discriminatedUnion("version", [ + CreateWorkerMessage.extend({ + version: z.literal("v1"), + }), + CreateWorkerMessage.extend({ + version: z.literal("v2"), + supportsLazyAttempts: z.boolean(), + }), + ]), + callback: z.discriminatedUnion("success", [ + z.object({ + success: z.literal(false), + }), + z.object({ + success: z.literal(true), + }), + ]), + }, + CREATE_TASK_RUN_ATTEMPT: { message: z.object({ version: z.literal("v1").default("v1"), - projectRef: z.string(), + runId: z.string(), envId: z.string(), - deploymentId: z.string(), - metadata: z.object({ - cliPackageVersion: z.string().optional(), - contentHash: z.string(), - packageVersion: z.string(), - tasks: TaskResource.array(), - }), }), callback: 
z.discriminatedUnion("success", [ z.object({ success: z.literal(false), + reason: z.string().optional(), }), z.object({ success: z.literal(true), + executionPayload: ProdTaskRunExecutionPayload, }), ]), }, @@ -425,6 +490,24 @@ export const CoordinatorToPlatformMessages = { }), ]), }, + READY_FOR_LAZY_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + envId: z.string(), + totalCompletions: z.number(), + }), + callback: z.discriminatedUnion("success", [ + z.object({ + success: z.literal(false), + reason: z.string().optional(), + }), + z.object({ + success: z.literal(true), + lazyPayload: TaskRunExecutionLazyAttemptPayload, + }), + ]), + }, READY_FOR_RESUME: { message: z.object({ version: z.literal("v1").default("v1"), @@ -445,12 +528,24 @@ export const CoordinatorToPlatformMessages = { .optional(), }), }, + TASK_RUN_FAILED_TO_RUN: { + message: z.object({ + version: z.literal("v1").default("v1"), + completion: TaskRunFailedExecutionResult, + }), + }, TASK_HEARTBEAT: { message: z.object({ version: z.literal("v1").default("v1"), attemptFriendlyId: z.string(), }), }, + TASK_RUN_HEARTBEAT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + }), + }, CHECKPOINT_CREATED: { message: z.object({ version: z.literal("v1").default("v1"), @@ -490,6 +585,17 @@ export const CoordinatorToPlatformMessages = { }), }), }, + RUN_CRASHED: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + error: z.object({ + name: z.string(), + message: z.string(), + stack: z.string().optional(), + }), + }), + }, }; export const PlatformToCoordinatorMessages = { @@ -517,6 +623,13 @@ export const PlatformToCoordinatorMessages = { attemptFriendlyId: z.string(), }), }, + REQUEST_RUN_CANCELLATION: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + delayInMs: z.number().optional(), + }), + }, READY_FOR_RETRY: { message: z.object({ version: 
z.literal("v1").default("v1"), @@ -563,6 +676,13 @@ export const SharedQueueToClientMessages = { }, }; +const IndexTasksMessage = z.object({ + version: z.literal("v1"), + deploymentId: z.string(), + tasks: TaskResource.array(), + packageVersion: z.string(), +}); + export const ProdWorkerToCoordinatorMessages = { LOG: { message: z.object({ @@ -572,12 +692,15 @@ export const ProdWorkerToCoordinatorMessages = { callback: z.void(), }, INDEX_TASKS: { - message: z.object({ - version: z.literal("v1").default("v1"), - deploymentId: z.string(), - tasks: TaskResource.array(), - packageVersion: z.string(), - }), + message: z.discriminatedUnion("version", [ + IndexTasksMessage.extend({ + version: z.literal("v1"), + }), + IndexTasksMessage.extend({ + version: z.literal("v2"), + supportsLazyAttempts: z.boolean(), + }), + ]), callback: z.discriminatedUnion("success", [ z.object({ success: z.literal(false), @@ -594,6 +717,13 @@ export const ProdWorkerToCoordinatorMessages = { totalCompletions: z.number(), }), }, + READY_FOR_LAZY_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + totalCompletions: z.number(), + }), + }, READY_FOR_RESUME: { message: z.object({ version: z.literal("v1").default("v1"), @@ -630,6 +760,12 @@ export const ProdWorkerToCoordinatorMessages = { attemptFriendlyId: z.string(), }), }, + TASK_RUN_HEARTBEAT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + }), + }, TASK_RUN_COMPLETED: { message: z.object({ version: z.literal("v1").default("v1"), @@ -641,6 +777,12 @@ export const ProdWorkerToCoordinatorMessages = { shouldExit: z.boolean(), }), }, + TASK_RUN_FAILED_TO_RUN: { + message: z.object({ + version: z.literal("v1").default("v1"), + completion: TaskRunFailedExecutionResult, + }), + }, WAIT_FOR_DURATION: { message: z.object({ version: z.literal("v1").default("v1"), @@ -686,8 +828,35 @@ export const ProdWorkerToCoordinatorMessages = { }), }), }, + CREATE_TASK_RUN_ATTEMPT: { + 
message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + }), + callback: z.discriminatedUnion("success", [ + z.object({ + success: z.literal(false), + reason: z.string().optional(), + }), + z.object({ + success: z.literal(true), + executionPayload: ProdTaskRunExecutionPayload, + }), + ]), + }, + UNRECOVERABLE_ERROR: { + message: z.object({ + version: z.literal("v1").default("v1"), + error: z.object({ + name: z.string(), + message: z.string(), + stack: z.string().optional(), + }), + }), + }, }; +// TODO: The coordinator can only safely use v1 worker messages, higher versions will need a new flag, e.g. SUPPORTS_VERSIONED_MESSAGES export const CoordinatorToProdWorkerMessages = { RESUME_AFTER_DEPENDENCY: { message: z.object({ @@ -709,17 +878,29 @@ export const CoordinatorToProdWorkerMessages = { executionPayload: ProdTaskRunExecutionPayload, }), }, - REQUEST_ATTEMPT_CANCELLATION: { + EXECUTE_TASK_RUN_LAZY_ATTEMPT: { message: z.object({ version: z.literal("v1").default("v1"), - attemptId: z.string(), + lazyPayload: TaskRunExecutionLazyAttemptPayload, }), }, - REQUEST_EXIT: { + REQUEST_ATTEMPT_CANCELLATION: { message: z.object({ version: z.literal("v1").default("v1"), + attemptId: z.string(), }), }, + REQUEST_EXIT: { + message: z.discriminatedUnion("version", [ + z.object({ + version: z.literal("v1"), + }), + z.object({ + version: z.literal("v2"), + delayInMs: z.number().optional(), + }), + ]), + }, READY_FOR_RETRY: { message: z.object({ version: z.literal("v1").default("v1"), diff --git a/packages/core/src/v3/schemas/schemas.ts b/packages/core/src/v3/schemas/schemas.ts index af72652d7ae..2d73b74dd50 100644 --- a/packages/core/src/v3/schemas/schemas.ts +++ b/packages/core/src/v3/schemas/schemas.ts @@ -224,3 +224,13 @@ export type ResolvedConfig = RequireKeys< export const WaitReason = z.enum(["WAIT_FOR_DURATION", "WAIT_FOR_TASK", "WAIT_FOR_BATCH"]); export type WaitReason = z.infer; + +export const TaskRunExecutionLazyAttemptPayload = 
z.object({ + runId: z.string(), + messageId: z.string(), + isTest: z.boolean(), + traceContext: z.record(z.unknown()), + environment: z.record(z.string()).optional(), +}); + +export type TaskRunExecutionLazyAttemptPayload = z.infer; diff --git a/packages/core/src/v3/zodSocket.ts b/packages/core/src/v3/zodSocket.ts index 1e2ae1e9e5d..964318586a4 100644 --- a/packages/core/src/v3/zodSocket.ts +++ b/packages/core/src/v3/zodSocket.ts @@ -1,8 +1,9 @@ import type { Socket } from "socket.io-client"; import { io } from "socket.io-client"; -import { z } from "zod"; +import { ZodError, z } from "zod"; import { EventEmitterLike, ZodMessageValueSchema } from "./zodMessageHandler"; import { LogLevel, SimpleStructuredLogger, StructuredLogger } from "./utils/structuredLogger"; +import { fromZodError } from "zod-validation-error"; export interface ZodSocketMessageCatalogSchema { [key: string]: @@ -81,7 +82,7 @@ export type MessagesFromSocketCatalog { + logger.info("Log something", { payload }); + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyWait = task({ + id: "lazy-wait", + run: async (payload: { forceError?: boolean; delayInSeconds?: number }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 
1 }); + + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazySingleDependency = task({ + id: "lazy-single-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyBatchDependency = task({ + id: "lazy-batch-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyConsecutiveWaits = task({ + id: "lazy-consecutive-waits", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 
1 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyConsecutiveDependencies = task({ + id: "lazy-consecutive-dependencies", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result #1", { result }); + + logger.info("Log something else", { payload }); + + const result2 = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result #2", { result2 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyConsecutiveBatchDependencies = task({ + id: "lazy-consecutive-batch-dependencies", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results #1", { results }); + + logger.info("Log something else", { payload }); + + const results2 = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results #2", { results2 
}); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyWaitThenSingleDependency = task({ + id: "lazy-wait-then-single-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyWaitThenBatchDependency = task({ + id: "lazy-wait-then-batch-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 
1 }); + + logger.info("Log something else", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazySingleDependencyThenWait = task({ + id: "lazy-single-dependency-then-wait", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 
1 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazySingleDependencyThenBatch = task({ + id: "lazy-single-dependency-then-batch", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyBatchDependencyThenWait = task({ + id: "lazy-batch-dependency-then-wait", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 
1 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyBatchDependencyThenSingle = task({ + id: "lazy-batch-dependency-then-single", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); diff --git a/references/v3-catalog/src/trigger/longRunning.ts b/references/v3-catalog/src/trigger/longRunning.ts index 09462a1f192..e8119dfcdc4 100644 --- a/references/v3-catalog/src/trigger/longRunning.ts +++ b/references/v3-catalog/src/trigger/longRunning.ts @@ -3,7 +3,7 @@ import { logger, task } from "@trigger.dev/sdk/v3"; export const longRunning = task({ id: "long-running", run: async (payload: { message: string }) => { - logger.info("Long running payloadd", { payload }); + logger.info("Long running payloadddd", { payload }); // Wait for 3 minutes await new Promise((resolve) => setTimeout(resolve, 3 * 60 * 1000)); @@ -19,7 +19,22 @@ export const longRunningParent = task({ run: async (payload: { message: string }) => { logger.info("Long running parent", { payload }); - await 
longRunning.triggerAndWait({ message: "child" }); + const result = await longRunning.triggerAndWait({ message: "child" }); + + return { + finished: new Date().toISOString(), + result, + }; + }, +}); + +export const longRunningWithDotInName = task({ + id: "long.running.with.dot", + run: async (payload: { message: string }) => { + logger.info("Long running payloadd", { payload }); + + // Wait for 3 minutes + await new Promise((resolve) => setTimeout(resolve, 3 * 60 * 1000)); return { finished: new Date().toISOString(), diff --git a/references/v3-catalog/trigger.config.ts b/references/v3-catalog/trigger.config.ts index 649a349a66f..c1f504f2eb9 100644 --- a/references/v3-catalog/trigger.config.ts +++ b/references/v3-catalog/trigger.config.ts @@ -37,7 +37,7 @@ export const config: TriggerConfig = { retries: { enabledInDev: true, default: { - maxAttempts: 3, + maxAttempts: 4, minTimeoutInMs: 1000, maxTimeoutInMs: 10000, factor: 2,