From 5ed700dad8d30d75968155fc12c806123397105a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 30 Apr 2024 13:27:03 +0100 Subject: [PATCH 01/57] WIP worker TaskRunAttempt creation --- .../routes/api.v1.runs.$runParam.attempts.ts | 45 +++ .../authenticatedSocketConnection.server.ts | 9 +- .../app/v3/marqs/devQueueConsumer.server.ts | 327 ++++++++---------- .../app/v3/services/baseService.server.ts | 2 +- .../services/createBackgroundWorker.server.ts | 1 + .../services/createTaskRunAttempt.server.ts | 131 +++++++ packages/cli-v3/src/apiClient.ts | 21 +- packages/cli-v3/src/commands/dev.tsx | 57 +-- .../src/workers/dev/backgroundWorker.ts | 293 +++++++++++----- packages/core/src/v3/schemas/api.ts | 1 + packages/core/src/v3/schemas/common.ts | 2 +- packages/core/src/v3/schemas/messages.ts | 12 + packages/core/src/v3/schemas/schemas.ts | 10 + .../migrations/20240430101936_/migration.sql | 2 + packages/database/prisma/schema.prisma | 2 + .../v3-catalog/src/trigger/longRunning.ts | 3 +- 16 files changed, 615 insertions(+), 303 deletions(-) create mode 100644 apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts create mode 100644 apps/webapp/app/v3/services/createTaskRunAttempt.server.ts create mode 100644 packages/database/prisma/migrations/20240430101936_/migration.sql diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts new file mode 100644 index 00000000000..ecc59815b4e --- /dev/null +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts @@ -0,0 +1,45 @@ +import type { ActionFunctionArgs } from "@remix-run/server-runtime"; +import { json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; +import { CreateTaskRunAttemptService } from "~/v3/services/createTaskRunAttempt.server"; + +const ParamsSchema = z.object({ + /* This is the run friendly ID */ + runParam: z.string(), +}); + +export async function action({ request, params }: ActionFunctionArgs) { + // Authenticate the request + const authenticationResult = await authenticateApiRequest(request); + + if (!authenticationResult) { + return json({ error: "Invalid or Missing API Key" }, { status: 401 }); + } + + const parsed = ParamsSchema.safeParse(params); + + if (!parsed.success) { + return json({ error: "Invalid or missing run ID" }, { status: 400 }); + } + + const { runParam } = parsed.data; + + const service = new CreateTaskRunAttemptService(); + + try { + const execution = await service.call(runParam, authenticationResult.environment); + + return json(execution, { status: 200 }); + } catch (error) { + if (error instanceof ServiceValidationError) { + return json({ error: error.message }, { status: error.status ?? 422 }); + } + + return json( + { error: error instanceof Error ? error.message : "Internal Server Error" }, + { status: 500 } + ); + } +} diff --git a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts index 209954f34e6..33b9038194c 100644 --- a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts +++ b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts @@ -54,7 +54,10 @@ export class AuthenticatedSocketConnection { schema: clientWebsocketMessages, messages: { READY_FOR_TASKS: async (payload) => { - await this._consumer.registerBackgroundWorker(payload.backgroundWorkerId); + await this._consumer.registerBackgroundWorker( + payload.backgroundWorkerId, + payload.inProgressRuns ?? [] + ); }, BACKGROUND_WORKER_DEPRECATED: async (payload) => { await this._consumer.deprecateBackgroundWorker(payload.backgroundWorkerId); @@ -73,6 +76,10 @@ export class AuthenticatedSocketConnection { await this._consumer.taskHeartbeat(payload.backgroundWorkerId, payload.data.id); break; } + case "TASK_RUN_HEARTBEAT": { + await this._consumer.taskRunHeartbeat(payload.backgroundWorkerId, payload.data.id); + break; + } } }, }, diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts index d136838514d..98b6a298ce7 100644 --- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts @@ -1,6 +1,7 @@ import { Context, ROOT_CONTEXT, Span, SpanKind, context, trace } from "@opentelemetry/api"; import { TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, serverWebsocketMessages, @@ -14,10 +15,9 @@ import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { marqs } from "~/v3/marqs/index.server"; import { EnvironmentVariablesRepository } from "../environmentVariables/environmentVariablesRepository.server"; -import { generateFriendlyId } from "../friendlyIdentifiers"; -import { CancelAttemptService } from "../services/cancelAttempt.server"; import { CancelTaskRunService } from "../services/cancelTaskRun.server"; import { CompleteAttemptService } from "../services/completeAttempt.server"; +import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server"; import { SEMINTATTRS_FORCE_RECORDING, attributesFromAuthenticatedEnv, @@ -54,7 +54,6 @@ export class DevQueueConsumer { private _taskSuccesses: number = 0; private _currentSpan: Span | undefined; private _endSpanInNextIteration = false; - private _inProgressAttempts: Map = new Map(); // Keys are task attempt friendly IDs, values are TaskRun ids/queue message ids private _inProgressRuns: Map = new Map(); // Keys are task run friendly IDs, values are TaskRun internal ids/queue message ids constructor( @@ -78,7 +77,7 @@ export class DevQueueConsumer { this._backgroundWorkers.delete(id); } - public async registerBackgroundWorker(id: string) { + public async registerBackgroundWorker(id: string, inProgressRuns: string[] = []) { const backgroundWorker = await prisma.backgroundWorker.findUnique({ where: { friendlyId: id, runtimeEnvironmentId: this.env.id }, include: { @@ -92,7 +91,10 @@ export class DevQueueConsumer { this._backgroundWorkers.set(backgroundWorker.id, backgroundWorker); - logger.debug("Registered background worker", { backgroundWorker: backgroundWorker.id }); + logger.debug("Registered background worker", { + backgroundWorker: backgroundWorker.id, + inProgressRuns, + }); const subscriber = await devPubSub.subscribe(`backgroundWorker:${backgroundWorker.id}:*`); @@ -109,6 +111,10 @@ export class DevQueueConsumer { this._backgroundWorkerSubscriber.set(backgroundWorker.id, subscriber); + for (const runId of inProgressRuns) { + this._inProgressRuns.set(runId, runId); + } + // Start reading from the queue if we haven't already await this.#enable(); } @@ -118,15 +124,16 @@ export class DevQueueConsumer { completion: TaskRunExecutionResult, execution: TaskRunExecution ) { - this._inProgressAttempts.delete(execution.attempt.id); - if (completion.ok) { this._taskSuccesses++; } else { this._taskFailures++; } - logger.debug("Task run completed", { taskRunCompletion: completion, execution }); + logger.debug("[DevQueueConsumer] taskAttemptCompleted()", { + taskRunCompletion: completion, + execution, + }); const service = new CompleteAttemptService(); const result = await service.call({ completion, execution, env: this.env }); @@ -136,7 +143,12 @@ export class DevQueueConsumer { } } + /** + * @deprecated Use `taskRunHeartbeat` instead + */ public async taskHeartbeat(workerId: string, id: string, seconds: number = 60) { + logger.debug("[DevQueueConsumer] taskHeartbeat()", { id, seconds }); + const taskRunAttempt = await prisma.taskRunAttempt.findUnique({ where: { friendlyId: id }, }); @@ -148,6 +160,12 @@ export class DevQueueConsumer { await marqs?.heartbeatMessage(taskRunAttempt.taskRunId, seconds); } + public async taskRunHeartbeat(workerId: string, id: string, seconds: number = 60) { + logger.debug("[DevQueueConsumer] taskRunHeartbeat()", { id, seconds }); + + await marqs?.heartbeatMessage(id, seconds); + } + public async stop(reason: string = "CLI disconnected") { if (!this._enabled) { return; @@ -180,66 +198,23 @@ export class DevQueueConsumer { } async #cancelInProgressRunsAndAttempts(reason: string) { - const cancelAttemptService = new CancelAttemptService(); const cancelTaskRunService = new CancelTaskRunService(); const cancelledAt = new Date(); - const inProgressAttempts = new Map(this._inProgressAttempts); const inProgressRuns = new Map(this._inProgressRuns); - this._inProgressAttempts.clear(); this._inProgressRuns.clear(); - const inProgressRunsWithNoInProgressAttempts: string[] = []; - const inProgressAttemptRunIds = new Set(inProgressAttempts.values()); - - for (const [runId, messageId] of inProgressRuns) { - if (!inProgressAttemptRunIds.has(messageId)) { - inProgressRunsWithNoInProgressAttempts.push(messageId); - } - } - logger.debug("Cancelling in progress runs and attempts", { - attempts: Array.from(inProgressAttempts.keys()), runs: Array.from(inProgressRuns.keys()), }); - for (const [attemptId, messageId] of inProgressAttempts) { - await this.#cancelInProgressAttempt( - attemptId, - messageId, - cancelAttemptService, - cancelledAt, - reason - ); - } - - for (const runId of inProgressRunsWithNoInProgressAttempts) { + for (const [_, runId] of inProgressRuns) { await this.#cancelInProgressRun(runId, cancelTaskRunService, cancelledAt, reason); } } - async #cancelInProgressAttempt( - attemptId: string, - messageId: string, - cancelAttemptService: CancelAttemptService, - cancelledAt: Date, - reason: string - ) { - logger.debug("Cancelling in progress attempt", { attemptId, messageId }); - - try { - await cancelAttemptService.call(attemptId, messageId, cancelledAt, reason, this.env); - } catch (e) { - logger.error("Failed to cancel in progress attempt", { - attemptId, - messageId, - error: e, - }); - } - } - async #cancelInProgressRun( runId: string, service: CancelTaskRunService, @@ -248,16 +223,20 @@ export class DevQueueConsumer { ) { logger.debug("Cancelling in progress run", { runId }); - const taskRun = await prisma.taskRun.findUnique({ - where: { id: runId }, - }); + const taskRun = runId.startsWith("run_") + ? await prisma.taskRun.findUnique({ + where: { friendlyId: runId }, + }) + : await prisma.taskRun.findUnique({ + where: { id: runId }, + }); if (!taskRun) { return; } try { - await service.call(taskRun, { reason, cancelAttempts: false, cancelledAt }); + await service.call(taskRun, { reason, cancelAttempts: true, cancelledAt }); } catch (e) { logger.error("Failed to cancel in progress run", { runId, @@ -446,154 +425,132 @@ export class DevQueueConsumer { return; } - const queue = await prisma.taskQueue.findUnique({ - where: { - runtimeEnvironmentId_name: { runtimeEnvironmentId: this.env.id, name: lockedTaskRun.queue }, - }, - }); - - if (!queue) { - await marqs?.nackMessage(message.messageId); - setTimeout(() => this.#doWork(), 1000); - return; - } - if (!this._enabled) { + logger.debug("Dev queue consumer is disabled", { env: this.env, queueMessage: message }); + await marqs?.nackMessage(message.messageId); return; } - const taskRunAttempt = await prisma.taskRunAttempt.create({ - data: { - number: lockedTaskRun.attempts[0] ? lockedTaskRun.attempts[0].number + 1 : 1, - friendlyId: generateFriendlyId("attempt"), - taskRunId: lockedTaskRun.id, - startedAt: new Date(), - backgroundWorkerId: backgroundTask.workerId, - backgroundWorkerTaskId: backgroundTask.id, - status: "EXECUTING" as const, - queueId: queue.id, - runtimeEnvironmentId: this.env.id, - }, - }); - - const execution: TaskRunExecution = { - task: { - id: backgroundTask.slug, - filePath: backgroundTask.filePath, - exportName: backgroundTask.exportName, - }, - attempt: { - id: taskRunAttempt.friendlyId, - number: taskRunAttempt.number, - startedAt: taskRunAttempt.startedAt ?? taskRunAttempt.createdAt, - backgroundWorkerId: backgroundWorker.id, - backgroundWorkerTaskId: backgroundTask.id, - status: "EXECUTING" as const, - }, - run: { - id: lockedTaskRun.friendlyId, - payload: lockedTaskRun.payload, - payloadType: lockedTaskRun.payloadType, - context: lockedTaskRun.context, - createdAt: lockedTaskRun.createdAt, - tags: lockedTaskRun.tags.map((tag) => tag.name), - isTest: lockedTaskRun.isTest, - idempotencyKey: lockedTaskRun.idempotencyKey ?? undefined, - }, - queue: { - id: queue.friendlyId, - name: queue.name, - }, - environment: { - id: this.env.id, - slug: this.env.slug, - type: this.env.type, - }, - organization: { - id: this.env.organization.id, - slug: this.env.organization.slug, - name: this.env.organization.title, - }, - project: { - id: this.env.project.id, - ref: this.env.project.externalRef, - slug: this.env.project.slug, - name: this.env.project.name, - }, - batch: - lockedTaskRun.batchItems[0] && lockedTaskRun.batchItems[0].batchTaskRun - ? { id: lockedTaskRun.batchItems[0].batchTaskRun.friendlyId } - : undefined, - }; - const environmentRepository = new EnvironmentVariablesRepository(); const variables = await environmentRepository.getEnvironmentVariables( this.env.project.id, this.env.id ); - const payload: TaskRunExecutionPayload = { - execution, - traceContext: lockedTaskRun.traceContext as Record, - environment: variables.reduce((acc: Record, curr) => { - acc[curr.key] = curr.value; - return acc; - }, {}), - }; + if (backgroundWorker.supportsLazyAttempts) { + const payload: TaskRunExecutionLazyAttemptPayload = { + traceContext: lockedTaskRun.traceContext as Record, + environment: variables.reduce((acc: Record, curr) => { + acc[curr.key] = curr.value; + return acc; + }, {}), + runId: lockedTaskRun.friendlyId, + messageId: lockedTaskRun.id, + isTest: lockedTaskRun.isTest, + }; - try { - // TODO: send trace context down to the CLI - await this._sender.send("BACKGROUND_WORKER_MESSAGE", { - backgroundWorkerId: backgroundWorker.friendlyId, - data: { - type: "EXECUTE_RUNS", - payloads: [payload], - }, - }); + try { + await this._sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId: backgroundWorker.friendlyId, + data: { + type: "EXECUTE_RUN_LAZY_ATTEMPT", + payload, + }, + }); - logger.debug("Saving the in progress attempt", { - taskRunAttempt: taskRunAttempt.id, - messageId: message.messageId, - }); + logger.debug("Executing the run", { + messageId: message.messageId, + }); - this._inProgressAttempts.set(taskRunAttempt.friendlyId, message.messageId); - this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId); - } catch (e) { - if (e instanceof Error) { - this._currentSpan?.recordException(e); - } else { - this._currentSpan?.recordException(new Error(String(e))); + this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId); + } catch (e) { + if (e instanceof Error) { + this._currentSpan?.recordException(e); + } else { + this._currentSpan?.recordException(new Error(String(e))); + } + + this._endSpanInNextIteration = true; + + // We now need to unlock the task run and delete the task run attempt + await prisma.$transaction([ + prisma.taskRun.update({ + where: { + id: lockedTaskRun.id, + }, + data: { + lockedAt: null, + lockedById: null, + status: "PENDING", + }, + }), + ]); + + this._inProgressRuns.delete(lockedTaskRun.friendlyId); + + // Finally we need to nack the message so it can be retried + await marqs?.nackMessage(message.messageId); + } finally { + setTimeout(() => this.#doWork(), 100); } - - this._endSpanInNextIteration = true; - - // We now need to unlock the task run and delete the task run attempt - await prisma.$transaction([ - prisma.taskRun.update({ - where: { - id: lockedTaskRun.id, - }, + } else { + const service = new CreateTaskRunAttemptService(); + const execution = await service.call(lockedTaskRun.friendlyId, this.env); + + const payload: TaskRunExecutionPayload = { + traceContext: lockedTaskRun.traceContext as Record, + environment: variables.reduce((acc: Record, curr) => { + acc[curr.key] = curr.value; + return acc; + }, {}), + execution, + }; + + try { + await this._sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId: backgroundWorker.friendlyId, data: { - lockedAt: null, - lockedById: null, - status: "PENDING", - }, - }), - prisma.taskRunAttempt.delete({ - where: { - id: taskRunAttempt.id, + type: "EXECUTE_RUNS", + payloads: [payload], }, - }), - ]); + }); - this._inProgressAttempts.delete(taskRunAttempt.friendlyId); - this._inProgressRuns.delete(lockedTaskRun.friendlyId); + logger.debug("Executing the run", { + messageId: message.messageId, + }); - // Finally we need to nack the message so it can be retried - await marqs?.nackMessage(message.messageId); - } finally { - setTimeout(() => this.#doWork(), 100); + this._inProgressRuns.set(lockedTaskRun.friendlyId, message.messageId); + } catch (e) { + if (e instanceof Error) { + this._currentSpan?.recordException(e); + } else { + this._currentSpan?.recordException(new Error(String(e))); + } + + this._endSpanInNextIteration = true; + + // We now need to unlock the task run and delete the task run attempt + await prisma.$transaction([ + prisma.taskRun.update({ + where: { + id: lockedTaskRun.id, + }, + data: { + lockedAt: null, + lockedById: null, + status: "PENDING", + }, + }), + ]); + + this._inProgressRuns.delete(lockedTaskRun.friendlyId); + + // Finally we need to nack the message so it can be retried + await marqs?.nackMessage(message.messageId); + } finally { + setTimeout(() => this.#doWork(), 100); + } } } diff --git a/apps/webapp/app/v3/services/baseService.server.ts b/apps/webapp/app/v3/services/baseService.server.ts index 6892118c649..e6b9d0252cb 100644 --- a/apps/webapp/app/v3/services/baseService.server.ts +++ b/apps/webapp/app/v3/services/baseService.server.ts @@ -34,7 +34,7 @@ export abstract class BaseService { } export class ServiceValidationError extends Error { - constructor(message: string) { + constructor(message: string, public status?: number) { super(message); this.name = "ServiceValidationError"; } diff --git a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts index 4137d7a16cc..79a5e287849 100644 --- a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts +++ b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts @@ -63,6 +63,7 @@ export class CreateBackgroundWorkerService extends BaseService { contentHash: body.metadata.contentHash, cliVersion: body.metadata.cliPackageVersion, sdkVersion: body.metadata.packageVersion, + supportsLazyAttempts: body.supportsLazyAttempts, }, }); diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts new file mode 100644 index 00000000000..b72ecfe06e1 --- /dev/null +++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts @@ -0,0 +1,131 @@ +import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { BaseService, ServiceValidationError } from "./baseService.server"; +import { TaskRunExecution } from "@trigger.dev/core/v3"; +import { prisma } from "~/db.server"; +import { generateFriendlyId } from "../friendlyIdentifiers"; +import { logger } from "~/services/logger.server"; + +export class CreateTaskRunAttemptService extends BaseService { + public async call( + runFriendlyId: string, + environment: AuthenticatedEnvironment + ): Promise { + return await this.traceWithEnv("call()", environment, async (span) => { + span.setAttribute("taskRunId", runFriendlyId); + + const taskRun = await this._prisma.taskRun.findUnique({ + where: { + friendlyId: runFriendlyId, + runtimeEnvironmentId: environment.id, + }, + include: { + tags: true, + attempts: true, + lockedBy: { + include: { + worker: true, + }, + }, + batchItems: { + include: { + batchTaskRun: true, + }, + }, + }, + }); + + logger.debug("Creating a task run attempt", { taskRun }); + + if (!taskRun) { + throw new ServiceValidationError("Task run not found", 404); + } + + if (taskRun.status === "CANCELED") { + throw new ServiceValidationError("Task run is cancelled", 400); + } + + if (!taskRun.lockedBy) { + throw new ServiceValidationError("Task run is not locked", 400); + } + + const queue = await this._prisma.taskQueue.findUnique({ + where: { + runtimeEnvironmentId_name: { + runtimeEnvironmentId: environment.id, + name: taskRun.queue, + }, + }, + }); + + if (!queue) { + throw new ServiceValidationError("Queue not found", 404); + } + + const taskRunAttempt = await prisma.taskRunAttempt.create({ + data: { + number: taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1, + friendlyId: generateFriendlyId("attempt"), + taskRunId: taskRun.id, + startedAt: new Date(), + backgroundWorkerId: taskRun.lockedBy.worker.id, + backgroundWorkerTaskId: taskRun.lockedBy.id, + status: "EXECUTING" as const, + queueId: queue.id, + runtimeEnvironmentId: environment.id, + }, + }); + + const execution: TaskRunExecution = { + task: { + id: taskRun.lockedBy.slug, + filePath: taskRun.lockedBy.filePath, + exportName: taskRun.lockedBy.exportName, + }, + attempt: { + id: taskRunAttempt.friendlyId, + number: taskRunAttempt.number, + startedAt: taskRunAttempt.startedAt ?? taskRunAttempt.createdAt, + backgroundWorkerId: taskRun.lockedBy.worker.id, + backgroundWorkerTaskId: taskRun.lockedBy.id, + status: "EXECUTING" as const, + }, + run: { + id: taskRun.friendlyId, + payload: taskRun.payload, + payloadType: taskRun.payloadType, + context: taskRun.context, + createdAt: taskRun.createdAt, + tags: taskRun.tags.map((tag) => tag.name), + isTest: taskRun.isTest, + idempotencyKey: taskRun.idempotencyKey ?? undefined, + }, + queue: { + id: queue.friendlyId, + name: queue.name, + }, + environment: { + id: environment.id, + slug: environment.slug, + type: environment.type, + }, + organization: { + id: environment.organization.id, + slug: environment.organization.slug, + name: environment.organization.title, + }, + project: { + id: environment.project.id, + ref: environment.project.externalRef, + slug: environment.project.slug, + name: environment.project.name, + }, + batch: + taskRun.batchItems[0] && taskRun.batchItems[0].batchTaskRun + ? { id: taskRun.batchItems[0].batchTaskRun.friendlyId } + : undefined, + }; + + return execution; + }); + } +} diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts index 7c775181398..347ca1d8252 100644 --- a/packages/cli-v3/src/apiClient.ts +++ b/packages/cli-v3/src/apiClient.ts @@ -14,6 +14,7 @@ import { GetDeploymentResponseBody, GetProjectsResponseBody, GetProjectResponseBody, + TaskRunExecution, } from "@trigger.dev/core/v3"; export class CliApiClient { @@ -103,6 +104,20 @@ export class CliApiClient { ); } + async createTaskRunAttempt(runFriendlyId: string) { + if (!this.accessToken) { + throw new Error("creatTaskRunAttempt: No access token"); + } + + return zodfetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { + method: "POST", + headers: { + Authorization: `Bearer ${this.accessToken}`, + "Content-Type": "application/json", + }, + }); + } + async getProjectEnv({ projectRef, env, @@ -198,11 +213,11 @@ type ApiResult = error: string; }; -async function zodfetch( - schema: z.Schema, +async function zodfetch( + schema: T, url: string, requestInit?: RequestInit -): Promise> { +): Promise>> { try { const response = await fetch(url, requestInit); diff --git a/packages/cli-v3/src/commands/dev.tsx b/packages/cli-v3/src/commands/dev.tsx index c9e684847a0..adc61a3e3a1 100644 --- a/packages/cli-v3/src/commands/dev.tsx +++ b/packages/cli-v3/src/commands/dev.tsx @@ -278,6 +278,7 @@ function useDev({ websocket.addEventListener("close", (event) => {}); websocket.addEventListener("error", (event) => {}); + // This is the deprecated task heart beat that uses the friendly attempt ID backgroundWorkerCoordinator.onWorkerTaskHeartbeat.attach( async ({ worker, backgroundWorkerId, id }) => { await sender.send("BACKGROUND_WORKER_MESSAGE", { @@ -290,6 +291,19 @@ function useDev({ } ); + // "Task Run Heartbeat" id is the actual run ID that corresponds to the MarQS message ID + backgroundWorkerCoordinator.onWorkerTaskRunHeartbeat.attach( + async ({ worker, backgroundWorkerId, id }) => { + await sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId, + data: { + type: "TASK_RUN_HEARTBEAT", + id, + }, + }); + } + ); + backgroundWorkerCoordinator.onTaskCompleted.attach( async ({ backgroundWorkerId, completion, execution }) => { await sender.send("BACKGROUND_WORKER_MESSAGE", { @@ -327,6 +341,7 @@ function useDev({ for (const worker of backgroundWorkerCoordinator.currentWorkers) { await sender.send("READY_FOR_TASKS", { backgroundWorkerId: worker.id, + inProgressRuns: worker.worker.inProgressRuns, }); } }, @@ -495,20 +510,24 @@ function useDev({ const processEnv = await gatherProcessEnv(); - const backgroundWorker = new BackgroundWorker(fullPath, { - projectConfig: config, - dependencies, - env: { - ...processEnv, - TRIGGER_API_URL: apiUrl, - TRIGGER_SECRET_KEY: apiKey, - ...(environmentVariablesResponse.success - ? environmentVariablesResponse.data.variables - : {}), + const backgroundWorker = new BackgroundWorker( + fullPath, + { + projectConfig: config, + dependencies, + env: { + ...processEnv, + TRIGGER_API_URL: apiUrl, + TRIGGER_SECRET_KEY: apiKey, + ...(environmentVariablesResponse.success + ? environmentVariablesResponse.data.variables + : {}), + }, + debuggerOn, + debugOtel, }, - debuggerOn, - debugOtel, - }); + environmentClient + ); try { await backgroundWorker.initialize(); @@ -565,6 +584,7 @@ function useDev({ tasks: taskResources, contentHash: contentHash, }, + supportsLazyAttempts: true, }; const backgroundWorkerRecord = await environmentClient.createBackgroundWorker( @@ -816,18 +836,9 @@ function createDuplicateTaskIdOutputErrorMessage( async function gatherProcessEnv() { const env = { + ...process.env, NODE_ENV: process.env.NODE_ENV ?? "development", - PATH: process.env.PATH, - USER: process.env.USER, - SHELL: process.env.SHELL, - NVM_INC: process.env.NVM_INC, - NVM_DIR: process.env.NVM_DIR, - NVM_BIN: process.env.NVM_BIN, - LANG: process.env.LANG, - TERM: process.env.TERM, NODE_PATH: await amendNodePathWithPnpmNodeModules(process.env.NODE_PATH), - HOME: process.env.HOME, - BUN_INSTALL: process.env.BUN_INSTALL, }; // Filter out undefined values diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index 0b2b2d715ec..c0b75fdbadb 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -9,6 +9,7 @@ import { TaskRunError, TaskRunErrorCodes, TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, childToWorkerMessages, @@ -37,6 +38,7 @@ import { safeDeleteFileSync } from "../../utilities/fileSystem.js"; import { installPackages } from "../../utilities/installPackages.js"; import { logger } from "../../utilities/logger.js"; import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors.js"; +import { CliApiClient } from "../../apiClient.js"; export type CurrentWorkers = BackgroundWorkerCoordinator["currentWorkers"]; export class BackgroundWorkerCoordinator { @@ -51,11 +53,20 @@ export class BackgroundWorkerCoordinator { id: string; record: CreateBackgroundWorkerResponse; }> = new Evt(); + + /** + * @deprecated use onWorkerTaskRunHeartbeat instead + */ public onWorkerTaskHeartbeat: Evt<{ id: string; backgroundWorkerId: string; worker: BackgroundWorker; }> = new Evt(); + public onWorkerTaskRunHeartbeat: Evt<{ + id: string; + backgroundWorkerId: string; + worker: BackgroundWorker; + }> = new Evt(); public onWorkerDeprecated: Evt<{ worker: BackgroundWorker; id: string }> = new Evt(); private _backgroundWorkers: Map = new Map(); private _records: Map = new Map(); @@ -106,6 +117,10 @@ export class BackgroundWorkerCoordinator { worker.onTaskHeartbeat.attach((id) => { this.onWorkerTaskHeartbeat.post({ id, backgroundWorkerId: record.id, worker }); }); + + worker.onTaskRunHeartbeat.attach((id) => { + this.onWorkerTaskRunHeartbeat.post({ id, backgroundWorkerId: record.id, worker }); + }); } close() { @@ -135,11 +150,15 @@ export class BackgroundWorkerCoordinator { } await worker.cancelRun(message.taskRunId); + break; + } + case "EXECUTE_RUN_LAZY_ATTEMPT": { + await this.#executeTaskRunLazyAttempt(id, message.payload); } } } - async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) { + async #executeTaskRunLazyAttempt(id: string, payload: TaskRunExecutionLazyAttemptPayload) { const worker = this._backgroundWorkers.get(id); if (!worker) { @@ -154,82 +173,39 @@ export class BackgroundWorkerCoordinator { return; } - const { execution } = payload; - - // ○ Mar 27 09:17:25.653 -> View logs | 20240326.20 | create-avatar | run_slufhjdfiv8ejnrkw9dsj.1 - - const logsUrl = `${this.baseURL}/runs/${execution.run.id}`; - - const pipe = chalkGrey("|"); - const bullet = chalkGrey("○"); - const link = chalkLink(terminalLink("View logs", logsUrl)); - let timestampPrefix = chalkGrey(prettyPrintDate(payload.execution.attempt.startedAt)); - const workerPrefix = chalkWorker(record.version); - const taskPrefix = chalkTask(execution.task.id); - const runId = chalkRun(`${execution.run.id}.${execution.attempt.number}`); - - logger.log( - `${bullet} ${timestampPrefix} ${chalkGrey( - "->" - )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId}` - ); - - const now = performance.now(); - - const completion = await worker.executeTaskRun(payload); - - const elapsed = performance.now() - now; + const { completion, execution } = await worker.executeTaskRunLazyAttempt(payload, this.baseURL); - const retryingText = chalkGrey( - !completion.ok && completion.skippedRetrying - ? " (retrying skipped)" - : !completion.ok && completion.retry !== undefined - ? ` (retrying in ${completion.retry.delay}ms)` - : "" - ); - - const resultText = !completion.ok - ? completion.error.type === "INTERNAL_ERROR" && - (completion.error.code === TaskRunErrorCodes.TASK_EXECUTION_ABORTED || - completion.error.code === TaskRunErrorCodes.TASK_RUN_CANCELLED) - ? chalkWarning("Cancelled") - : `${chalkError("Error")}${retryingText}` - : chalkSuccess("Success"); + this.onTaskCompleted.post({ + completion, + execution, + worker, + backgroundWorkerId: id, + }); + } - const errorText = !completion.ok - ? this.#formatErrorLog(completion.error) - : "retry" in completion - ? `retry in ${completion.retry}ms` - : ""; + async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) { + const worker = this._backgroundWorkers.get(id); - const elapsedText = chalkGrey(`(${formatDurationMilliseconds(elapsed, { style: "short" })})`); + if (!worker) { + logger.error(`Could not find worker ${id}`); + return; + } - timestampPrefix = chalkGrey(prettyPrintDate()); + const record = this._records.get(id); - logger.log( - `${bullet} ${timestampPrefix} ${chalkGrey( - "->" - )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId} ${pipe} ${resultText} ${elapsedText}${errorText}` - ); + if (!record) { + logger.error(`Could not find worker record ${id}`); + return; + } - this.onTaskCompleted.post({ completion, execution, worker, backgroundWorkerId: id }); - } + const completion = await worker.executeTaskRun(payload, this.baseURL); - #formatErrorLog(error: TaskRunError) { - switch (error.type) { - case "INTERNAL_ERROR": { - return ""; - } - case "STRING_ERROR": { - return `\n\n${chalkError("X Error:")} ${error.raw}\n`; - } - case "CUSTOM_ERROR": { - return `\n\n${chalkError("X Error:")} ${error.raw}\n`; - } - case "BUILT_IN_ERROR": { - return `\n\n${error.stackTrace.replace(/^Error: /, chalkError("X Error: "))}\n`; - } - } + this.onTaskCompleted.post({ + completion, + execution: payload.execution, + worker, + backgroundWorkerId: id, + }); } } @@ -264,13 +240,18 @@ export type BackgroundWorkerParams = { debuggerOn: boolean; debugOtel?: boolean; }; + export class BackgroundWorker { private _initialized: boolean = false; private _handler = new ZodMessageHandler({ schema: childToWorkerMessages, }); + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); private _onClose: Evt = new Evt(); public tasks: Array = []; @@ -282,7 +263,8 @@ export class BackgroundWorker { constructor( public path: string, - private params: BackgroundWorkerParams + private params: BackgroundWorkerParams, + private apiClient: CliApiClient ) {} close() { @@ -293,6 +275,7 @@ export class BackgroundWorker { this._closed = true; this.onTaskHeartbeat.detach(); + this.onTaskRunHeartbeat.detach(); // We need to close all the task run processes for (const taskRunProcess of this._taskRunProcesses.values()) { @@ -306,6 +289,10 @@ export class BackgroundWorker { safeDeleteFileSync(`${this.path}.map`); } + get inProgressRuns(): Array { + return Array.from(this._taskRunProcesses.keys()); + } + async initialize() { if (this._initialized) { throw new Error("Worker already initialized"); @@ -393,14 +380,18 @@ export class BackgroundWorker { } } - async #initializeTaskRunProcess(payload: TaskRunExecutionPayload): Promise { + async #initializeTaskRunProcess( + payload: TaskRunExecutionPayload, + messageId?: string + ): Promise { if (!this.metadata) { throw new Error("Worker not registered"); } if (!this._taskRunProcesses.has(payload.execution.run.id)) { const taskRunProcess = new TaskRunProcess( - payload.execution, + payload.execution.run.id, + payload.execution.run.isTest, this.path, { ...this.params.env, @@ -408,7 +399,8 @@ export class BackgroundWorker { ...this.#readEnvVars(), }, this.metadata, - this.params + this.params, + messageId ); taskRunProcess.onExit.attach(() => { @@ -419,6 +411,10 @@ export class BackgroundWorker { this.onTaskHeartbeat.post(id); }); + taskRunProcess.onTaskRunHeartbeat.attach((id) => { + this.onTaskRunHeartbeat.post(id); + }); + await taskRunProcess.initialize(); this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess); @@ -437,10 +433,104 @@ export class BackgroundWorker { await taskRunProcess.cancel(); } + async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload, baseURL: string) { + const attemptResponse = await this.apiClient.createTaskRunAttempt(payload.runId); + + if (!attemptResponse.success) { + throw new Error(`Failed to create task run attempt: ${attemptResponse.error}`); + } + + const execution = attemptResponse.data; + + const completion = await this.executeTaskRun( + { execution, traceContext: payload.traceContext, environment: payload.environment }, + baseURL, + payload.messageId + ); + + return { execution, completion }; + } + // We need to fork the process before we can execute any tasks - async executeTaskRun(payload: TaskRunExecutionPayload): Promise { + async executeTaskRun( + payload: TaskRunExecutionPayload, + baseURL: string, + messageId?: string + ): Promise { + if (this._closed) { + throw new Error("Worker is closed"); + } + + if (!this.metadata) { + throw new Error("Worker not registered"); + } + + const { execution } = payload; + // ○ Mar 27 09:17:25.653 -> View logs | 20240326.20 | create-avatar | run_slufhjdfiv8ejnrkw9dsj.1 + + const logsUrl = `${baseURL}/runs/${execution.run.id}`; + + const pipe = chalkGrey("|"); + const bullet = chalkGrey("○"); + const link = chalkLink(terminalLink("View logs", logsUrl)); + let timestampPrefix = chalkGrey(prettyPrintDate(payload.execution.attempt.startedAt)); + const workerPrefix = chalkWorker(this.metadata.version); + const taskPrefix = chalkTask(execution.task.id); + const runId = chalkRun(`${execution.run.id}.${execution.attempt.number}`); + + logger.log( + `${bullet} ${timestampPrefix} ${chalkGrey( + "->" + )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId}` + ); + + const now = performance.now(); + + const completion = await this.#doExecuteTaskRun(payload, messageId); + + const elapsed = performance.now() - now; + + const retryingText = chalkGrey( + !completion.ok && completion.skippedRetrying + ? " (retrying skipped)" + : !completion.ok && completion.retry !== undefined + ? ` (retrying in ${completion.retry.delay}ms)` + : "" + ); + + const resultText = !completion.ok + ? completion.error.type === "INTERNAL_ERROR" && + (completion.error.code === TaskRunErrorCodes.TASK_EXECUTION_ABORTED || + completion.error.code === TaskRunErrorCodes.TASK_RUN_CANCELLED) + ? chalkWarning("Cancelled") + : `${chalkError("Error")}${retryingText}` + : chalkSuccess("Success"); + + const errorText = !completion.ok + ? formatErrorLog(completion.error) + : "retry" in completion + ? `retry in ${completion.retry}ms` + : ""; + + const elapsedText = chalkGrey(`(${formatDurationMilliseconds(elapsed, { style: "short" })})`); + + timestampPrefix = chalkGrey(prettyPrintDate()); + + logger.log( + `${bullet} ${timestampPrefix} ${chalkGrey( + "->" + )} ${link} ${pipe} ${workerPrefix} ${pipe} ${taskPrefix} ${pipe} ${runId} ${pipe} ${resultText} ${elapsedText}${errorText}` + ); + + return completion; + } + + async #doExecuteTaskRun( + payload: TaskRunExecutionPayload, + messageId?: string + ): Promise { try { - const taskRunProcess = await this.#initializeTaskRunProcess(payload); + const taskRunProcess = await this.#initializeTaskRunProcess(payload, messageId); const result = await taskRunProcess.executeTaskRun(payload); // Kill the worker if the task was successful or if it's not going to be retried); @@ -553,15 +643,21 @@ class TaskRunProcess { private _currentExecution: TaskRunExecution | undefined; private _isBeingKilled: boolean = false; private _isBeingCancelled: boolean = false; + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); public onExit: Evt = new Evt(); constructor( - private execution: TaskRunExecution, + private runId: string, + private isTest: boolean, private path: string, private env: NodeJS.ProcessEnv, private metadata: BackgroundWorkerProperties, - private worker: BackgroundWorkerParams + private worker: BackgroundWorkerParams, + private messageId?: string ) { this._sender = new ZodMessageSender({ schema: workerToChildMessages, @@ -581,7 +677,7 @@ class TaskRunProcess { async initialize() { const fullEnv = { - ...(this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), + ...(this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), ...this.env, OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({ [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir, @@ -592,7 +688,7 @@ class TaskRunProcess { const cwd = dirname(this.path); - logger.debug(`[${this.execution.run.id}] initializing task run process`, { + logger.debug(`[${this.runId}] initializing task run process`, { env: fullEnv, path: this.path, cwd, @@ -618,7 +714,7 @@ class TaskRunProcess { return; } - logger.debug(`[${this.execution.run.id}] cleaning up task run process`, { kill }); + logger.debug(`[${this.runId}] cleaning up task run process`, { kill }); await this._sender.send("CLEANUP", { flush: true, @@ -630,7 +726,7 @@ class TaskRunProcess { // Set a timeout to kill the child process if it hasn't been killed within 5 seconds setTimeout(() => { if (this._child && !this._child.killed) { - logger.debug(`[${this.execution.run.id}] killing task run process after timeout`); + logger.debug(`[${this.runId}] killing task run process after timeout`); this._child.kill(); } @@ -673,12 +769,12 @@ class TaskRunProcess { return; } - if (execution.run.id === this.execution.run.id) { + if (execution.run.id === this.runId) { // We don't need to notify the task run process if it's the same as the one we're running return; } - logger.debug(`[${this.execution.run.id}] task run completed notification`, { + logger.debug(`[${this.runId}] task run completed notification`, { completion, execution, }); @@ -717,14 +813,18 @@ class TaskRunProcess { break; } case "READY_TO_DISPOSE": { - logger.debug(`[${this.execution.run.id}] task run process is ready to dispose`); + logger.debug(`[${this.runId}] task run process is ready to dispose`); this.#kill(); break; } case "TASK_HEARTBEAT": { - this.onTaskHeartbeat.post(message.payload.id); + if (this.messageId) { + this.onTaskRunHeartbeat.post(this.messageId); + } else { + this.onTaskHeartbeat.post(message.payload.id); + } break; } @@ -735,7 +835,7 @@ class TaskRunProcess { } async #handleExit(code: number) { - logger.debug(`[${this.execution.run.id}] task run process exiting`, { code }); + logger.debug(`[${this.runId}] task run process exiting`, { code }); // Go through all the attempts currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { @@ -801,9 +901,26 @@ class TaskRunProcess { #kill() { if (this._child && !this._child.killed) { - logger.debug(`[${this.execution.run.id}] killing task run process`); + logger.debug(`[${this.runId}] killing task run process`); this._child?.kill(); } } } + +function formatErrorLog(error: TaskRunError) { + switch (error.type) { + case "INTERNAL_ERROR": { + return ""; + } + case "STRING_ERROR": { + return `\n\n${chalkError("X Error:")} ${error.raw}\n`; + } + case "CUSTOM_ERROR": { + return `\n\n${chalkError("X Error:")} ${error.raw}\n`; + } + case "BUILT_IN_ERROR": { + return `\n\n${error.stackTrace.replace(/^Error: /, chalkError("X Error: "))}\n`; + } + } +} diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 78013422c9b..30101309cd9 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -41,6 +41,7 @@ export type GetProjectEnvResponse = z.infer; export const CreateBackgroundWorkerRequestBody = z.object({ localOnly: z.boolean(), metadata: BackgroundWorkerMetadata, + supportsLazyAttempts: z.boolean().optional(), }); export type CreateBackgroundWorkerRequestBody = z.infer; diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts index c190d163169..58ae633c737 100644 --- a/packages/core/src/v3/schemas/common.ts +++ b/packages/core/src/v3/schemas/common.ts @@ -50,7 +50,7 @@ export const TaskRunInternalError = z.object({ "TASK_RUN_CANCELLED", "TASK_OUTPUT_ERROR", "HANDLE_ERROR_ERROR", - "GRACEFUL_EXIT_TIMEOUT" + "GRACEFUL_EXIT_TIMEOUT", ]), message: z.string().optional(), }); diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index 32c66c8082e..becdc9fda76 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -1,11 +1,13 @@ import { z } from "zod"; import { TaskRunExecution, TaskRunExecutionResult } from "./common"; + import { EnvironmentType, Machine, ProdTaskRunExecution, ProdTaskRunExecutionPayload, TaskMetadataWithFilePath, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, WaitReason, } from "./schemas"; @@ -34,6 +36,10 @@ export const BackgroundWorkerServerMessages = z.discriminatedUnion("type", [ projectId: z.string(), runId: z.string(), }), + z.object({ + type: z.literal("EXECUTE_RUN_LAZY_ATTEMPT"), + payload: TaskRunExecutionLazyAttemptPayload, + }), ]); export type BackgroundWorkerServerMessages = z.infer; @@ -62,6 +68,11 @@ export const BackgroundWorkerClientMessages = z.discriminatedUnion("type", [ type: z.literal("TASK_HEARTBEAT"), id: z.string(), }), + z.object({ + version: z.literal("v1").default("v1"), + type: z.literal("TASK_RUN_HEARTBEAT"), + id: z.string(), + }), ]); export type BackgroundWorkerClientMessages = z.infer; @@ -78,6 +89,7 @@ export const clientWebsocketMessages = { READY_FOR_TASKS: z.object({ version: z.literal("v1").default("v1"), backgroundWorkerId: z.string(), + inProgressRuns: z.string().array().optional(), }), BACKGROUND_WORKER_DEPRECATED: z.object({ version: z.literal("v1").default("v1"), diff --git a/packages/core/src/v3/schemas/schemas.ts b/packages/core/src/v3/schemas/schemas.ts index fe82a1be3d7..5a609a21db8 100644 --- a/packages/core/src/v3/schemas/schemas.ts +++ b/packages/core/src/v3/schemas/schemas.ts @@ -223,3 +223,13 @@ export type ResolvedConfig = RequireKeys< export const WaitReason = z.enum(["WAIT_FOR_DURATION", "WAIT_FOR_TASK", "WAIT_FOR_BATCH"]); export type WaitReason = z.infer; + +export const TaskRunExecutionLazyAttemptPayload = z.object({ + runId: z.string(), + messageId: z.string(), + isTest: z.boolean(), + traceContext: z.record(z.unknown()), + environment: z.record(z.string()).optional(), +}); + +export type TaskRunExecutionLazyAttemptPayload = z.infer; diff --git a/packages/database/prisma/migrations/20240430101936_/migration.sql b/packages/database/prisma/migrations/20240430101936_/migration.sql new file mode 100644 index 00000000000..28b3b39095a --- /dev/null +++ b/packages/database/prisma/migrations/20240430101936_/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "BackgroundWorker" ADD COLUMN "supportsLazyAttempts" BOOLEAN NOT NULL DEFAULT false; diff --git a/packages/database/prisma/schema.prisma b/packages/database/prisma/schema.prisma index d36c3061ec9..da920765212 100644 --- a/packages/database/prisma/schema.prisma +++ b/packages/database/prisma/schema.prisma @@ -1539,6 +1539,8 @@ model BackgroundWorker { deployment WorkerDeployment? + supportsLazyAttempts Boolean @default(false) + @@unique([projectId, runtimeEnvironmentId, version]) } diff --git a/references/v3-catalog/src/trigger/longRunning.ts b/references/v3-catalog/src/trigger/longRunning.ts index 09462a1f192..17b761a850e 100644 --- a/references/v3-catalog/src/trigger/longRunning.ts +++ b/references/v3-catalog/src/trigger/longRunning.ts @@ -19,10 +19,11 @@ export const longRunningParent = task({ run: async (payload: { message: string }) => { logger.info("Long running parent", { payload }); - await longRunning.triggerAndWait({ message: "child" }); + const result = await longRunning.triggerAndWait({ message: "child" }); return { finished: new Date().toISOString(), + result, }; }, }); From d1bdd0cf5df4549cbc22dabf7bef387a023138fd Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 30 Apr 2024 15:20:09 +0100 Subject: [PATCH 02/57] Handling failing task runs that cannot create an attempt for whatever reason --- .../authenticatedSocketConnection.server.ts | 8 ++ apps/webapp/app/v3/eventRepository.server.ts | 3 +- apps/webapp/app/v3/failedTaskRun.server.ts | 109 ++++++++++++++++++ .../app/v3/marqs/devQueueConsumer.server.ts | 18 +++ packages/cli-v3/src/apiClient.ts | 96 +++++++-------- packages/cli-v3/src/commands/deploy.ts | 4 +- packages/cli-v3/src/commands/dev.tsx | 18 ++- packages/cli-v3/src/commands/whoami.ts | 2 +- .../src/workers/dev/backgroundWorker.ts | 79 +++++++++---- .../cli-v3/src/workers/dev/worker-facade.ts | 13 ++- packages/core/package.json | 8 ++ .../core/src/v3/runtime/devRuntimeManager.ts | 8 +- packages/core/src/v3/schemas/messages.ts | 23 +++- packages/core/src/v3/zodfetch.ts | 6 +- packages/core/tsup.config.ts | 1 + .../v3-catalog/src/trigger/longRunning.ts | 14 +++ 16 files changed, 316 insertions(+), 94 deletions(-) create mode 100644 apps/webapp/app/v3/failedTaskRun.server.ts diff --git a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts index 33b9038194c..79ce1716127 100644 --- a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts +++ b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts @@ -72,6 +72,14 @@ export class AuthenticatedSocketConnection { ); break; } + case "TASK_RUN_FAILED_TO_RUN": { + await this._consumer.taskRunFailed( + payload.backgroundWorkerId, + payload.data.completion + ); + + break; + } case "TASK_HEARTBEAT": { await this._consumer.taskHeartbeat(payload.backgroundWorkerId, payload.data.id); break; diff --git a/apps/webapp/app/v3/eventRepository.server.ts b/apps/webapp/app/v3/eventRepository.server.ts index adfea613367..bfac9b700a7 100644 --- a/apps/webapp/app/v3/eventRepository.server.ts +++ b/apps/webapp/app/v3/eventRepository.server.ts @@ -147,6 +147,7 @@ export type UpdateEventOptions = { attributes: TraceAttributes; endTime?: Date; immediate?: boolean; + events?: SpanEvents; }; export class EventRepository { @@ -217,7 +218,7 @@ export class EventRepository { isCancelled: false, status: options?.attributes.isError ? "ERROR" : "OK", links: event.links ?? [], - events: event.events ?? [], + events: event.events ?? (options?.events as any) ?? [], duration: calculateDurationFromStart(event.startTime, options?.endTime), properties: event.properties as Attributes, metadata: event.metadata as Attributes, diff --git a/apps/webapp/app/v3/failedTaskRun.server.ts b/apps/webapp/app/v3/failedTaskRun.server.ts new file mode 100644 index 00000000000..7e8141ef4e5 --- /dev/null +++ b/apps/webapp/app/v3/failedTaskRun.server.ts @@ -0,0 +1,109 @@ +import { + ExceptionEventProperties, + TaskRunError, + TaskRunFailedExecutionResult, +} from "@trigger.dev/core/v3"; +import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { logger } from "~/services/logger.server"; +import { marqs } from "~/v3/marqs/index.server"; + +import { eventRepository } from "./eventRepository.server"; +import { BaseService } from "./services/baseService.server"; +import { TaskRunStatus } from "@trigger.dev/database"; + +const FAILABLE_TASK_RUN_STATUSES: TaskRunStatus[] = ["EXECUTING", "PENDING", "WAITING_FOR_DEPLOY"]; + +export class FailedTaskRunService extends BaseService { + public async call({ + runFriendlyId, + completion, + env, + }: { + runFriendlyId: string; + completion: TaskRunFailedExecutionResult; + env: AuthenticatedEnvironment; + }) { + const taskRun = await this._prisma.taskRun.findUnique({ + where: { friendlyId: runFriendlyId }, + }); + + if (!taskRun) { + logger.error("[FailedTaskRunService] Task run not found", { + runFriendlyId, + completion, + }); + + return; + } + + if (!FAILABLE_TASK_RUN_STATUSES.includes(taskRun.status)) { + logger.error("[FailedTaskRunService] Task run is not in a failable state", { + taskRun, + completion, + }); + + return; + } + + // No more retries, we need to fail the task run + logger.debug("[FailedTaskRunService] Failing task run", { taskRun, completion }); + + await marqs?.acknowledgeMessage(taskRun.id); + + // Now we need to "complete" the task run event/span + await eventRepository.completeEvent(taskRun.spanId, { + endTime: new Date(), + attributes: { + isError: true, + }, + events: [ + { + name: "exception", + time: new Date(), + properties: { + exception: createExceptionPropertiesFromError(completion.error), + }, + }, + ], + }); + + await this._prisma.taskRun.update({ + where: { + id: taskRun.id, + }, + data: { + status: "SYSTEM_FAILURE", + }, + }); + } +} + +function createExceptionPropertiesFromError(error: TaskRunError): ExceptionEventProperties { + switch (error.type) { + case "BUILT_IN_ERROR": { + return { + type: error.name, + message: error.message, + stacktrace: error.stackTrace, + }; + } + case "CUSTOM_ERROR": { + return { + type: "Error", + message: error.raw, + }; + } + case "INTERNAL_ERROR": { + return { + type: "Internal error", + message: [error.code, error.message].filter(Boolean).join(": "), + }; + } + case "STRING_ERROR": { + return { + type: "Error", + message: error.raw, + }; + } + } +} diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts index 98b6a298ce7..6124b54d15e 100644 --- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts @@ -4,6 +4,7 @@ import { TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, + TaskRunFailedExecutionResult, serverWebsocketMessages, } from "@trigger.dev/core/v3"; import { ZodMessageSender } from "@trigger.dev/core/v3/zodMessageHandler"; @@ -24,6 +25,7 @@ import { tracer, } from "../tracer.server"; import { DevSubscriber, devPubSub } from "./devPubSub.server"; +import { FailedTaskRunService } from "../failedTaskRun.server"; const MessageBody = z.discriminatedUnion("type", [ z.object({ @@ -143,6 +145,22 @@ export class DevQueueConsumer { } } + public async taskRunFailed(workerId: string, completion: TaskRunFailedExecutionResult) { + this._taskFailures++; + + logger.debug("[DevQueueConsumer] taskRunFailed()", { completion }); + + this._inProgressRuns.delete(completion.id); + + const service = new FailedTaskRunService(); + + await service.call({ + runFriendlyId: completion.id, + completion, + env: this.env, + }); + } + /** * @deprecated Use `taskRunHeartbeat` instead */ diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts index 347ca1d8252..20fd16d4d7b 100644 --- a/packages/cli-v3/src/apiClient.ts +++ b/packages/cli-v3/src/apiClient.ts @@ -15,7 +15,9 @@ import { GetProjectsResponseBody, GetProjectResponseBody, TaskRunExecution, + APIError, } from "@trigger.dev/core/v3"; +import { zodfetch } from "@trigger.dev/core/v3/zodfetch"; export class CliApiClient { private readonly apiURL: string; @@ -28,7 +30,7 @@ export class CliApiClient { } async createAuthorizationCode() { - return zodfetch( + return wrapZodFetch( CreateAuthorizationCodeResponseSchema, `${this.apiURL}/api/v1/authorization-code`, { @@ -38,7 +40,7 @@ export class CliApiClient { } async getPersonalAccessToken(authorizationCode: string) { - return zodfetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { + return wrapZodFetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { method: "POST", body: JSON.stringify({ authorizationCode, @@ -51,7 +53,7 @@ export class CliApiClient { throw new Error("whoAmI: No access token"); } - return zodfetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { + return wrapZodFetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -64,7 +66,7 @@ export class CliApiClient { throw new Error("getProject: No access token"); } - return zodfetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { + return wrapZodFetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -77,7 +79,7 @@ export class CliApiClient { throw new Error("getProjects: No access token"); } - return zodfetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { + return wrapZodFetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -90,7 +92,7 @@ export class CliApiClient { throw new Error("createBackgroundWorker: No access token"); } - return zodfetch( + return wrapZodFetch( CreateBackgroundWorkerResponse, `${this.apiURL}/api/v1/projects/${projectRef}/background-workers`, { @@ -109,7 +111,7 @@ export class CliApiClient { throw new Error("creatTaskRunAttempt: No access token"); } - return zodfetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { + return wrapZodFetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -129,12 +131,16 @@ export class CliApiClient { throw new Error("getProjectDevEnv: No access token"); } - return zodfetch(GetProjectEnvResponse, `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, { - headers: { - Authorization: `Bearer ${this.accessToken}`, - "Content-Type": "application/json", - }, - }); + return wrapZodFetch( + GetProjectEnvResponse, + `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, + { + headers: { + Authorization: `Bearer ${this.accessToken}`, + "Content-Type": "application/json", + }, + } + ); } async getEnvironmentVariables(projectRef: string) { @@ -142,7 +148,7 @@ export class CliApiClient { throw new Error("getEnvironmentVariables: No access token"); } - return zodfetch( + return wrapZodFetch( GetEnvironmentVariablesResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}/envvars`, { @@ -159,7 +165,7 @@ export class CliApiClient { throw new Error("initializeDeployment: No access token"); } - return zodfetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { + return wrapZodFetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -174,7 +180,7 @@ export class CliApiClient { throw new Error("startDeploymentIndexing: No access token"); } - return zodfetch( + return wrapZodFetch( StartDeploymentIndexingResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}/start-indexing`, { @@ -193,7 +199,7 @@ export class CliApiClient { throw new Error("getDeployment: No access token"); } - return zodfetch( + return wrapZodFetch( GetDeploymentResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}`, { @@ -213,56 +219,42 @@ type ApiResult = error: string; }; -async function zodfetch( +async function wrapZodFetch( schema: T, url: string, requestInit?: RequestInit ): Promise>> { try { - const response = await fetch(url, requestInit); + const response = await zodfetch(schema, url, requestInit, { + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 5000, + maxAttempts: 3, + factor: 2, + randomize: false, + }, + }); - if ((!requestInit || requestInit.method === "GET") && response.status === 404) { + return { + success: true, + data: response, + }; + } catch (error) { + if (error instanceof APIError) { return { success: false, - error: `404: ${response.statusText}`, + error: error.message, }; - } - - if (response.status >= 400 && response.status < 500) { - const body = await response.json(); - if (!body.error) { - return { success: false, error: "Something went wrong" }; - } - - return { success: false, error: body.error }; - } - - if (response.status !== 200) { + } else if (error instanceof Error) { return { success: false, - error: `Failed to fetch ${url}, got status code ${response.status}`, + error: error.message, }; - } - - const jsonBody = await response.json(); - const parsedResult = schema.safeParse(jsonBody); - - if (parsedResult.success) { - return { success: true, data: parsedResult.data }; - } - - if ("error" in jsonBody) { + } else { return { success: false, - error: typeof jsonBody.error === "string" ? jsonBody.error : JSON.stringify(jsonBody.error), + error: String(error), }; } - - return { success: false, error: parsedResult.error.message }; - } catch (error) { - return { - success: false, - error: error instanceof Error ? error.message : JSON.stringify(error), - }; } } diff --git a/packages/cli-v3/src/commands/deploy.ts b/packages/cli-v3/src/commands/deploy.ts index dc841c11ece..2c1d94654d3 100644 --- a/packages/cli-v3/src/commands/deploy.ts +++ b/packages/cli-v3/src/commands/deploy.ts @@ -187,7 +187,9 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) { `Failed to connect to ${authorization.auth?.apiUrl}. Are you sure it's the correct URL?` ); } else { - throw new Error("You must login first. Use `trigger.dev login` to login."); + throw new Error( + `You must login first. Use the \`login\` CLI command.\n\n${authorization.error}` + ); } } diff --git a/packages/cli-v3/src/commands/dev.tsx b/packages/cli-v3/src/commands/dev.tsx index adc61a3e3a1..51f31cd8420 100644 --- a/packages/cli-v3/src/commands/dev.tsx +++ b/packages/cli-v3/src/commands/dev.tsx @@ -110,7 +110,11 @@ export async function devCommand(dir: string, options: DevCommandOptions) { )} Connecting to the server failed. Please check your internet connection or contact eric@trigger.dev for help.` ); } else { - logger.log(`${chalkError("X Error:")} You must login first. Use the \`login\` CLI command.`); + logger.log( + `${chalkError("X Error:")} You must login first. Use the \`login\` CLI command.\n\n${ + authorization.error + }` + ); } process.exitCode = 1; return; @@ -317,6 +321,18 @@ function useDev({ } ); + backgroundWorkerCoordinator.onTaskFailedToRun.attach( + async ({ backgroundWorkerId, completion }) => { + await sender.send("BACKGROUND_WORKER_MESSAGE", { + backgroundWorkerId, + data: { + type: "TASK_RUN_FAILED_TO_RUN", + completion, + }, + }); + } + ); + backgroundWorkerCoordinator.onWorkerRegistered.attach(async ({ id, worker, record }) => { await sender.send("READY_FOR_TASKS", { backgroundWorkerId: id, diff --git a/packages/cli-v3/src/commands/whoami.ts b/packages/cli-v3/src/commands/whoami.ts index c46445d6462..73c740c94b9 100644 --- a/packages/cli-v3/src/commands/whoami.ts +++ b/packages/cli-v3/src/commands/whoami.ts @@ -78,7 +78,7 @@ export async function whoAmI( options?.profile ?? "default" }\` to login.` ); - outro("Whoami failed"); + outro(`Whoami failed: ${authentication.error}`); } } diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index c0b75fdbadb..2ded486b1c5 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -1,4 +1,5 @@ import { + APIError, BackgroundWorkerProperties, BackgroundWorkerServerMessages, CreateBackgroundWorkerResponse, @@ -12,6 +13,7 @@ import { TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, + TaskRunFailedExecutionResult, childToWorkerMessages, correctErrorStackTrace, formatDurationMilliseconds, @@ -48,6 +50,11 @@ export class BackgroundWorkerCoordinator { worker: BackgroundWorker; execution: TaskRunExecution; }> = new Evt(); + public onTaskFailedToRun: Evt<{ + backgroundWorkerId: string; + worker: BackgroundWorker; + completion: TaskRunFailedExecutionResult; + }> = new Evt(); public onWorkerRegistered: Evt<{ worker: BackgroundWorker; id: string; @@ -73,21 +80,22 @@ export class BackgroundWorkerCoordinator { private _deprecatedWorkers: Set = new Set(); constructor(private baseURL: string) { - this.onTaskCompleted.attach(async ({ completion, execution }) => { + this.onTaskCompleted.attach(async ({ completion }) => { if (!completion.ok && typeof completion.retry !== "undefined") { return; } - await this.#notifyWorkersOfTaskCompletion(completion, execution); + await this.#notifyWorkersOfTaskCompletion(completion); + }); + + this.onTaskFailedToRun.attach(async ({ completion }) => { + await this.#notifyWorkersOfTaskCompletion(completion); }); } - async #notifyWorkersOfTaskCompletion( - completion: TaskRunExecutionResult, - execution: TaskRunExecution - ) { + async #notifyWorkersOfTaskCompletion(completion: TaskRunExecutionResult) { for (const worker of this._backgroundWorkers.values()) { - await worker.taskRunCompletedNotification(completion, execution); + await worker.taskRunCompletedNotification(completion); } } @@ -173,14 +181,43 @@ export class BackgroundWorkerCoordinator { return; } - const { completion, execution } = await worker.executeTaskRunLazyAttempt(payload, this.baseURL); + try { + const { completion, execution } = await worker.executeTaskRunLazyAttempt( + payload, + this.baseURL + ); - this.onTaskCompleted.post({ - completion, - execution, - worker, - backgroundWorkerId: id, - }); + this.onTaskCompleted.post({ + completion, + execution, + worker, + backgroundWorkerId: id, + }); + } catch (error) { + this.onTaskFailedToRun.post({ + backgroundWorkerId: id, + worker, + completion: { + ok: false, + id: payload.runId, + retry: undefined, + error: + error instanceof Error + ? { + type: "BUILT_IN_ERROR", + name: error.name, + message: error.message, + stackTrace: error.stack ?? "", + } + : { + type: "BUILT_IN_ERROR", + name: "UnknownError", + message: String(error), + stackTrace: "", + }, + }, + }); + } } async #executeTaskRun(id: string, payload: TaskRunExecutionPayload) { @@ -371,12 +408,9 @@ export class BackgroundWorker { // We need to notify all the task run processes that a task run has completed, // in case they are waiting for it through triggerAndWait - async taskRunCompletedNotification( - completion: TaskRunExecutionResult, - execution: TaskRunExecution - ) { + async taskRunCompletedNotification(completion: TaskRunExecutionResult) { for (const taskRunProcess of this._taskRunProcesses.values()) { - taskRunProcess.taskRunCompletedNotification(completion, execution); + taskRunProcess.taskRunCompletedNotification(completion); } } @@ -764,24 +798,23 @@ class TaskRunProcess { return result; } - taskRunCompletedNotification(completion: TaskRunExecutionResult, execution: TaskRunExecution) { + taskRunCompletedNotification(completion: TaskRunExecutionResult) { if (!completion.ok && typeof completion.retry !== "undefined") { return; } - if (execution.run.id === this.runId) { + if (completion.id === this.runId) { // We don't need to notify the task run process if it's the same as the one we're running return; } logger.debug(`[${this.runId}] task run completed notification`, { completion, - execution, }); this._sender.send("TASK_RUN_COMPLETED_NOTIFICATION", { + version: "v2", completion, - execution, }); } diff --git a/packages/cli-v3/src/workers/dev/worker-facade.ts b/packages/cli-v3/src/workers/dev/worker-facade.ts index 3102c23231d..f2ff50e4d48 100644 --- a/packages/cli-v3/src/workers/dev/worker-facade.ts +++ b/packages/cli-v3/src/workers/dev/worker-facade.ts @@ -180,8 +180,17 @@ const handler = new ZodMessageHandler({ _isRunning = false; } }, - TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion, execution }) => { - devRuntimeManager.resumeTask(completion, execution); + TASK_RUN_COMPLETED_NOTIFICATION: async (payload) => { + switch (payload.version) { + case "v1": { + devRuntimeManager.resumeTask(payload.completion, payload.execution.run.id); + break; + } + case "v2": { + devRuntimeManager.resumeTask(payload.completion, payload.completion.id); + break; + } + } }, CLEANUP: async ({ flush, kill }) => { if (kill) { diff --git a/packages/core/package.json b/packages/core/package.json index 122f1d08f4f..772d7e23560 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -37,6 +37,14 @@ "require": "./dist/v3/otel/index.js", "types": "./dist/v3/otel/index.d.ts" }, + "./v3/zodfetch": { + "import": { + "types": "./dist/v3/zodfetch.d.mts", + "default": "./dist/v3/zodfetch.mjs" + }, + "require": "./dist/v3/zodfetch.js", + "types": "./dist/v3/zodfetch.d.ts" + }, "./v3/zodMessageHandler": { "import": { "types": "./dist/v3/zodMessageHandler.d.mts", diff --git a/packages/core/src/v3/runtime/devRuntimeManager.ts b/packages/core/src/v3/runtime/devRuntimeManager.ts index 7df2c9335cf..6209fdcb203 100644 --- a/packages/core/src/v3/runtime/devRuntimeManager.ts +++ b/packages/core/src/v3/runtime/devRuntimeManager.ts @@ -80,18 +80,18 @@ export class DevRuntimeManager implements RuntimeManager { }; } - resumeTask(completion: TaskRunExecutionResult, execution: TaskRunExecution): void { - const wait = this._taskWaits.get(execution.run.id); + resumeTask(completion: TaskRunExecutionResult, runId: string): void { + const wait = this._taskWaits.get(runId); if (!wait) { // We need to store the completion in case the task is awaited later - this._pendingCompletionNotifications.set(execution.run.id, completion); + this._pendingCompletionNotifications.set(runId, completion); return; } wait.resolve(completion); - this._taskWaits.delete(execution.run.id); + this._taskWaits.delete(runId); } } diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index becdc9fda76..819b8c3a491 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { TaskRunExecution, TaskRunExecutionResult } from "./common"; +import { TaskRunExecution, TaskRunExecutionResult, TaskRunFailedExecutionResult } from "./common"; import { EnvironmentType, @@ -63,6 +63,11 @@ export const BackgroundWorkerClientMessages = z.discriminatedUnion("type", [ completion: TaskRunExecutionResult, execution: TaskRunExecution, }), + z.object({ + version: z.literal("v1").default("v1"), + type: z.literal("TASK_RUN_FAILED_TO_RUN"), + completion: TaskRunFailedExecutionResult, + }), z.object({ version: z.literal("v1").default("v1"), type: z.literal("TASK_HEARTBEAT"), @@ -109,11 +114,17 @@ export const workerToChildMessages = { traceContext: z.record(z.unknown()), metadata: BackgroundWorkerProperties, }), - TASK_RUN_COMPLETED_NOTIFICATION: z.object({ - version: z.literal("v1").default("v1"), - completion: TaskRunExecutionResult, - execution: TaskRunExecution, - }), + TASK_RUN_COMPLETED_NOTIFICATION: z.discriminatedUnion("version", [ + z.object({ + version: z.literal("v1"), + completion: TaskRunExecutionResult, + execution: TaskRunExecution, + }), + z.object({ + version: z.literal("v2"), + completion: TaskRunExecutionResult, + }), + ]), CLEANUP: z.object({ version: z.literal("v1").default("v1"), flush: z.boolean().default(false), diff --git a/packages/core/src/v3/zodfetch.ts b/packages/core/src/v3/zodfetch.ts index f1fecf4f5f3..b57f3d197f2 100644 --- a/packages/core/src/v3/zodfetch.ts +++ b/packages/core/src/v3/zodfetch.ts @@ -16,12 +16,12 @@ export type ZodFetchOptions = { retry?: RetryOptions; }; -export async function zodfetch( - schema: z.Schema, +export async function zodfetch( + schema: T, url: string, requestInit?: RequestInit, options?: ZodFetchOptions -): Promise { +): Promise> { return await _doZodFetch(schema, url, requestInit, options); } diff --git a/packages/core/tsup.config.ts b/packages/core/tsup.config.ts index edb33fc2dc2..8d6389e241b 100644 --- a/packages/core/tsup.config.ts +++ b/packages/core/tsup.config.ts @@ -15,5 +15,6 @@ export default defineConfig({ "./src/v3/dev/index.ts", "./src/v3/prod/index.ts", "./src/v3/workers/index.ts", + "./src/v3/zodfetch.ts", ], }); diff --git a/references/v3-catalog/src/trigger/longRunning.ts b/references/v3-catalog/src/trigger/longRunning.ts index 17b761a850e..870a35ec492 100644 --- a/references/v3-catalog/src/trigger/longRunning.ts +++ b/references/v3-catalog/src/trigger/longRunning.ts @@ -27,3 +27,17 @@ export const longRunningParent = task({ }; }, }); + +export const longRunningWithDotInName = task({ + id: "long.running.with.dot", + run: async (payload: { message: string }) => { + logger.info("Long running payloadd", { payload }); + + // Wait for 3 minutes + await new Promise((resolve) => setTimeout(resolve, 3 * 60 * 1000)); + + return { + finished: new Date().toISOString(), + }; + }, +}); From e24631e2dd3fc4e4e3216bbba9bfcd6d00cb01bc Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 30 Apr 2024 17:29:38 +0100 Subject: [PATCH 03/57] Move the visibility queue stuff into a graphile job --- apps/webapp/app/services/worker.server.ts | 13 +++ apps/webapp/app/v3/failedTaskRun.server.ts | 13 +-- .../app/v3/marqs/devQueueConsumer.server.ts | 7 +- apps/webapp/app/v3/marqs/index.server.ts | 77 ++++++++++++------ apps/webapp/app/v3/requeueTaskRun.server.ts | 81 +++++++++++++++++++ .../services/createTaskRunAttempt.server.ts | 49 +++++++---- .../cli-v3/src/workers/dev/worker-facade.ts | 2 +- packages/core/src/v3/schemas/common.ts | 1 + .../v3-catalog/src/trigger/longRunning.ts | 2 +- 9 files changed, 184 insertions(+), 61 deletions(-) create mode 100644 apps/webapp/app/v3/requeueTaskRun.server.ts diff --git a/apps/webapp/app/services/worker.server.ts b/apps/webapp/app/services/worker.server.ts index 98fdd80b039..daade7460b3 100644 --- a/apps/webapp/app/services/worker.server.ts +++ b/apps/webapp/app/services/worker.server.ts @@ -37,6 +37,7 @@ import { TimeoutDeploymentService } from "~/v3/services/timeoutDeployment.server import { eventRepository } from "~/v3/eventRepository.server"; import { ExecuteTasksWaitingForDeployService } from "~/v3/services/executeTasksWaitingForDeploy"; import { TriggerScheduledTaskService } from "~/v3/services/triggerScheduledTask.server"; +import { RequeueTaskRunService } from "~/v3/requeueTaskRun.server"; const workerCatalog = { indexEndpoint: z.object({ @@ -136,6 +137,9 @@ const workerCatalog = { "v3.triggerScheduledTask": z.object({ instanceId: z.string(), }), + "v3.requeueTaskRun": z.object({ + runId: z.string(), + }), }; const executionWorkerCatalog = { @@ -533,6 +537,15 @@ function getWorkerQueue() { return await service.call(payload.instanceId); }, }, + "v3.requeueTaskRun": { + priority: 0, + maxAttempts: 3, + handler: async (payload, job) => { + const service = new RequeueTaskRunService(); + + await service.call(payload.runId); + }, + }, }, }); } diff --git a/apps/webapp/app/v3/failedTaskRun.server.ts b/apps/webapp/app/v3/failedTaskRun.server.ts index 7e8141ef4e5..b9cffeaf670 100644 --- a/apps/webapp/app/v3/failedTaskRun.server.ts +++ b/apps/webapp/app/v3/failedTaskRun.server.ts @@ -3,26 +3,17 @@ import { TaskRunError, TaskRunFailedExecutionResult, } from "@trigger.dev/core/v3"; -import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { marqs } from "~/v3/marqs/index.server"; +import { TaskRunStatus } from "@trigger.dev/database"; import { eventRepository } from "./eventRepository.server"; import { BaseService } from "./services/baseService.server"; -import { TaskRunStatus } from "@trigger.dev/database"; const FAILABLE_TASK_RUN_STATUSES: TaskRunStatus[] = ["EXECUTING", "PENDING", "WAITING_FOR_DEPLOY"]; export class FailedTaskRunService extends BaseService { - public async call({ - runFriendlyId, - completion, - env, - }: { - runFriendlyId: string; - completion: TaskRunFailedExecutionResult; - env: AuthenticatedEnvironment; - }) { + public async call(runFriendlyId: string, completion: TaskRunFailedExecutionResult) { const taskRun = await this._prisma.taskRun.findUnique({ where: { friendlyId: runFriendlyId }, }); diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts index 6124b54d15e..8a56981d3dd 100644 --- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts @@ -154,11 +154,7 @@ export class DevQueueConsumer { const service = new FailedTaskRunService(); - await service.call({ - runFriendlyId: completion.id, - completion, - env: this.env, - }); + await service.call(completion.id, completion); } /** @@ -413,7 +409,6 @@ export class DevQueueConsumer { data: { lockedAt: new Date(), lockedById: backgroundTask.id, - status: "EXECUTING", }, include: { attempts: { diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts index b46628d07a3..f4658326a81 100644 --- a/apps/webapp/app/v3/marqs/index.server.ts +++ b/apps/webapp/app/v3/marqs/index.server.ts @@ -20,6 +20,7 @@ import { MessagePayload, QueueCapacities, } from "./types"; +import { workerQueue } from "~/services/worker.server"; const tracer = trace.getTracer("marqs"); @@ -258,6 +259,17 @@ export class MarQS { }); } + await workerQueue.enqueue( + "v3.requeueTaskRun", + { + runId: messageData.messageId, + }, + { + runAt: new Date(Date.now() + this.visibilityTimeoutInMs), + jobKey: `requeueTaskRun:${messageData.messageId}`, + } + ); + return message; }, { @@ -349,6 +361,8 @@ export class MarQS { [SemanticAttributes.PARENT_QUEUE]: message.parentQueue, }); + workerQueue.dequeue(`requeueTaskRun:${messageId}`); + await this.#callAcknowledgeMessage({ parentQueue: message.parentQueue, messageKey: this.keys.messageKey(messageId), @@ -505,16 +519,28 @@ export class MarQS { // This should increment by the number of seconds, but with a max value of Date.now() + visibilityTimeoutInMs public async heartbeatMessage(messageId: string, seconds: number = 30) { + // We are still calling this for backwards compatibility, but we should be using the v3.requeueTaskRun job await this.#callHeartbeatMessage({ visibilityQueue: constants.MESSAGE_VISIBILITY_TIMEOUT_QUEUE, messageId, milliseconds: seconds * 1000, maxVisibilityTimeout: Date.now() + this.visibilityTimeoutInMs, }); + + await workerQueue.enqueue( + "v3.requeueTaskRun", + { + runId: messageId, + }, + { + runAt: new Date(Date.now() + seconds * 1000), + jobKey: `requeueTaskRun:${messageId}`, + } + ); } get visibilityTimeoutInMs() { - return this.options.visibilityTimeoutInMs ?? 300000; + return this.options.visibilityTimeoutInMs ?? 300000; // 5 minutes } async readMessage(messageId: string) { @@ -861,7 +887,6 @@ export class MarQS { const result = await this.redis.dequeueMessage( messageQueue, parentQueue, - visibilityQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, @@ -869,7 +894,6 @@ export class MarQS { envCurrentConcurrencyKey, orgCurrentConcurrencyKey, messageQueue, - String(this.options.visibilityTimeoutInMs ?? 300000), // 5 minutes String(Date.now()), String(this.options.defaultEnvConcurrency), String(this.options.defaultOrgConcurrency) @@ -995,6 +1019,9 @@ export class MarQS { ); } + /** + * @deprecated This is being replaced by the v3.requeueTaskRun graphile worker job + */ #callHeartbeatMessage({ visibilityQueue, messageId, @@ -1133,25 +1160,23 @@ end }); this.redis.defineCommand("dequeueMessage", { - numberOfKeys: 9, + numberOfKeys: 8, lua: ` --- Keys: childQueue, parentQueue, visibilityQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, currentConcurrencyKey, envCurrentConcurrencyKey, orgCurrentConcurrencyKey +-- Keys: childQueue, parentQueue, concurrencyLimitKey, envConcurrencyLimitKey, orgConcurrencyLimitKey, currentConcurrencyKey, envCurrentConcurrencyKey, orgCurrentConcurrencyKey local childQueue = KEYS[1] local parentQueue = KEYS[2] -local visibilityQueue = KEYS[3] -local concurrencyLimitKey = KEYS[4] -local envConcurrencyLimitKey = KEYS[5] -local orgConcurrencyLimitKey = KEYS[6] -local currentConcurrencyKey = KEYS[7] -local envCurrentConcurrencyKey = KEYS[8] -local orgCurrentConcurrencyKey = KEYS[9] - --- Args: childQueueName, visibilityQueue, currentTime, defaultEnvConcurrencyLimit, defaultOrgConcurrencyLimit +local concurrencyLimitKey = KEYS[3] +local envConcurrencyLimitKey = KEYS[4] +local orgConcurrencyLimitKey = KEYS[5] +local currentConcurrencyKey = KEYS[6] +local envCurrentConcurrencyKey = KEYS[7] +local orgCurrentConcurrencyKey = KEYS[8] + +-- Args: childQueueName, currentTime, defaultEnvConcurrencyLimit, defaultOrgConcurrencyLimit local childQueueName = ARGV[1] -local visibilityTimeout = tonumber(ARGV[2]) -local currentTime = tonumber(ARGV[3]) -local defaultEnvConcurrencyLimit = ARGV[4] -local defaultOrgConcurrencyLimit = ARGV[5] +local currentTime = tonumber(ARGV[2]) +local defaultEnvConcurrencyLimit = ARGV[3] +local defaultOrgConcurrencyLimit = ARGV[4] -- Check current org concurrency against the limit local orgCurrentConcurrency = tonumber(redis.call('SCARD', orgCurrentConcurrencyKey) or '0') @@ -1187,11 +1212,9 @@ end local messageId = messages[1] local messageScore = tonumber(messages[2]) -local timeoutScore = currentTime + visibilityTimeout -- Move message to timeout queue and update concurrency redis.call('ZREM', childQueue, messageId) -redis.call('ZADD', visibilityQueue, timeoutScore, messageId) redis.call('SADD', currentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) redis.call('SADD', orgCurrentConcurrencyKey, messageId) @@ -1257,7 +1280,7 @@ else redis.call('ZADD', parentQueue, earliestMessage[2], messageQueueName) end --- Remove the message from the timeout queue +-- Remove the message from the timeout queue (deprecated, will eventually remove this) redis.call('ZREM', visibilityQueue, messageId) -- Update the concurrency keys @@ -1297,7 +1320,7 @@ redis.call('SREM', concurrencyKey, messageId) redis.call('SREM', envConcurrencyKey, messageId) redis.call('SREM', orgConcurrencyKey, messageId) --- Remove the message from the timeout queue +-- Remove the message from the timeout queue (deprecated, will eventually remove this) redis.call('ZREM', visibilityQueue, messageId) -- Enqueue the message into the queue @@ -1325,12 +1348,16 @@ local milliseconds = tonumber(ARGV[2]) local maxVisibilityTimeout = tonumber(ARGV[3]) -- Get the current visibility timeout -local currentVisibilityTimeout = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0 +local zscoreResult = redis.call('ZSCORE', visibilityQueue, messageId) -if currentVisibilityTimeout == 0 then +-- If there's no currentVisibilityTimeout, return and do not execute ZADD +if zscoreResult == false then return end +local currentVisibilityTimeout = tonumber(zscoreResult) + + -- Calculate the new visibility timeout local newVisibilityTimeout = math.min(currentVisibilityTimeout + milliseconds * 1000, maxVisibilityTimeout) @@ -1433,7 +1460,6 @@ declare module "ioredis" { dequeueMessage( childQueue: string, parentQueue: string, - visibilityQueue: string, concurrencyLimitKey: string, envConcurrencyLimitKey: string, orgConcurrencyLimitKey: string, @@ -1441,7 +1467,6 @@ declare module "ioredis" { envCurrentConcurrencyKey: string, orgCurrentConcurrencyKey: string, childQueueName: string, - visibilityTimeout: string, currentTime: string, defaultEnvConcurrencyLimit: string, defaultOrgConcurrencyLimit: string, diff --git a/apps/webapp/app/v3/requeueTaskRun.server.ts b/apps/webapp/app/v3/requeueTaskRun.server.ts new file mode 100644 index 00000000000..3673e16bdb5 --- /dev/null +++ b/apps/webapp/app/v3/requeueTaskRun.server.ts @@ -0,0 +1,81 @@ +import { logger } from "~/services/logger.server"; +import { marqs } from "~/v3/marqs/index.server"; + +import assertNever from "assert-never"; +import { FailedTaskRunService } from "./failedTaskRun.server"; +import { BaseService } from "./services/baseService.server"; + +export class RequeueTaskRunService extends BaseService { + public async call(runId: string) { + const taskRun = await this._prisma.taskRun.findUnique({ + where: { id: runId }, + }); + + if (!taskRun) { + logger.error("[RequeueTaskRunService] Task run not found", { + runId, + }); + + return; + } + + switch (taskRun.status) { + case "PENDING": { + logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun }); + + await marqs?.nackMessage(taskRun.id); + + break; + } + case "EXECUTING": + case "RETRYING_AFTER_FAILURE": { + logger.debug("[RequeueTaskRunService] Failing task run", { taskRun }); + + const service = new FailedTaskRunService(); + + await service.call(taskRun.friendlyId, { + ok: false, + id: taskRun.friendlyId, + retry: undefined, + error: { + type: "INTERNAL_ERROR", + code: "TASK_RUN_HEARTBEAT_TIMEOUT", + message: "Did not receive a heartbeat from the worker in time", + }, + }); + + break; + } + case "WAITING_FOR_DEPLOY": { + logger.debug("[RequeueTaskRunService] Removing task run from queue", { taskRun }); + + await marqs?.acknowledgeMessage(taskRun.id); + + break; + } + case "WAITING_TO_RESUME": + case "PAUSED": { + logger.debug("[RequeueTaskRunService] Requeueing task run", { taskRun }); + + await marqs?.nackMessage(taskRun.id); + + break; + } + case "SYSTEM_FAILURE": + case "INTERRUPTED": + case "CRASHED": + case "COMPLETED_WITH_ERRORS": + case "COMPLETED_SUCCESSFULLY": + case "CANCELED": { + logger.debug("[RequeueTaskRunService] Task run is completed", { taskRun }); + + await marqs?.acknowledgeMessage(taskRun.id); + + break; + } + default: { + assertNever(taskRun.status); + } + } + } +} diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts index b72ecfe06e1..7e24985404d 100644 --- a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts +++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts @@ -1,9 +1,9 @@ -import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; -import { BaseService, ServiceValidationError } from "./baseService.server"; import { TaskRunExecution } from "@trigger.dev/core/v3"; -import { prisma } from "~/db.server"; -import { generateFriendlyId } from "../friendlyIdentifiers"; +import { $transaction } from "~/db.server"; +import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; +import { generateFriendlyId } from "../friendlyIdentifiers"; +import { BaseService, ServiceValidationError } from "./baseService.server"; export class CreateTaskRunAttemptService extends BaseService { public async call( @@ -61,20 +61,37 @@ export class CreateTaskRunAttemptService extends BaseService { throw new ServiceValidationError("Queue not found", 404); } - const taskRunAttempt = await prisma.taskRunAttempt.create({ - data: { - number: taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1, - friendlyId: generateFriendlyId("attempt"), - taskRunId: taskRun.id, - startedAt: new Date(), - backgroundWorkerId: taskRun.lockedBy.worker.id, - backgroundWorkerTaskId: taskRun.lockedBy.id, - status: "EXECUTING" as const, - queueId: queue.id, - runtimeEnvironmentId: environment.id, - }, + const taskRunAttempt = await $transaction(this._prisma, async (tx) => { + const taskRunAttempt = await tx.taskRunAttempt.create({ + data: { + number: taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1, + friendlyId: generateFriendlyId("attempt"), + taskRunId: taskRun.id, + startedAt: new Date(), + backgroundWorkerId: taskRun.lockedBy!.worker.id, + backgroundWorkerTaskId: taskRun.lockedBy!.id, + status: "EXECUTING" as const, + queueId: queue.id, + runtimeEnvironmentId: environment.id, + }, + }); + + await tx.taskRun.update({ + where: { + id: taskRun.id, + }, + data: { + status: "EXECUTING", + }, + }); + + return taskRunAttempt; }); + if (!taskRunAttempt) { + throw new ServiceValidationError("Failed to create task run attempt", 500); + } + const execution: TaskRunExecution = { task: { id: taskRun.lockedBy.slug, diff --git a/packages/cli-v3/src/workers/dev/worker-facade.ts b/packages/cli-v3/src/workers/dev/worker-facade.ts index f2ff50e4d48..60a1697c258 100644 --- a/packages/cli-v3/src/workers/dev/worker-facade.ts +++ b/packages/cli-v3/src/workers/dev/worker-facade.ts @@ -222,7 +222,7 @@ sender.send("TASKS_READY", { tasks: TASK_METADATA }).catch((err) => { process.title = "trigger-dev-worker"; -async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 5) { +async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 30) { async function _doHeartbeat() { while (true) { if (_isRunning && _execution) { diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts index 58ae633c737..4f3052ea196 100644 --- a/packages/core/src/v3/schemas/common.ts +++ b/packages/core/src/v3/schemas/common.ts @@ -51,6 +51,7 @@ export const TaskRunInternalError = z.object({ "TASK_OUTPUT_ERROR", "HANDLE_ERROR_ERROR", "GRACEFUL_EXIT_TIMEOUT", + "TASK_RUN_HEARTBEAT_TIMEOUT", ]), message: z.string().optional(), }); diff --git a/references/v3-catalog/src/trigger/longRunning.ts b/references/v3-catalog/src/trigger/longRunning.ts index 870a35ec492..e8119dfcdc4 100644 --- a/references/v3-catalog/src/trigger/longRunning.ts +++ b/references/v3-catalog/src/trigger/longRunning.ts @@ -3,7 +3,7 @@ import { logger, task } from "@trigger.dev/sdk/v3"; export const longRunning = task({ id: "long-running", run: async (payload: { message: string }) => { - logger.info("Long running payloadd", { payload }); + logger.info("Long running payloadddd", { payload }); // Wait for 3 minutes await new Promise((resolve) => setTimeout(resolve, 3 * 60 * 1000)); From f124d94efc938c090ab91bce1df491878fbf2484 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 30 Apr 2024 21:17:23 +0100 Subject: [PATCH 04/57] Fixed task runs with unsanitized queue names --- apps/webapp/app/v3/services/triggerTask.server.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts index ddacf1b1b96..38ec94bddd4 100644 --- a/apps/webapp/app/v3/services/triggerTask.server.ts +++ b/apps/webapp/app/v3/services/triggerTask.server.ts @@ -10,7 +10,7 @@ import { $transaction } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { eventRepository } from "../eventRepository.server"; import { generateFriendlyId } from "../friendlyIdentifiers"; -import { marqs } from "~/v3/marqs/index.server"; +import { marqs, sanitizeQueueName } from "~/v3/marqs/index.server"; import { uploadToObjectStore } from "../r2.server"; import { BaseService } from "./baseService.server"; @@ -109,7 +109,7 @@ export class TriggerTaskService extends BaseService { select: { lastNumber: true }, }); - const queueName = body.options?.queue?.name ?? `task/${taskId}`; + const queueName = sanitizeQueueName(body.options?.queue?.name ?? `task/${taskId}`); event.setAttribute("queueName", queueName); span.setAttribute("queueName", queueName); From 3b3b07aec21f8dfbe66f6d4b6899cc9bc4070091 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 30 Apr 2024 22:12:19 +0100 Subject: [PATCH 05/57] =?UTF-8?q?=E2=80=9CBorrow=E2=80=9D=20the=20code=20f?= =?UTF-8?q?rom=20alerts=20PR=20to=20get=20self=20hosted=20deployments=20wo?= =?UTF-8?q?rking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/v3/services/startDeploymentIndexing.server.ts | 7 ++++--- packages/cli-v3/src/commands/deploy.ts | 1 + packages/core/src/v3/schemas/api.ts | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/v3/services/startDeploymentIndexing.server.ts b/apps/webapp/app/v3/services/startDeploymentIndexing.server.ts index 9a0156dba9c..7d03147e7b5 100644 --- a/apps/webapp/app/v3/services/startDeploymentIndexing.server.ts +++ b/apps/webapp/app/v3/services/startDeploymentIndexing.server.ts @@ -15,9 +15,10 @@ export class StartDeploymentIndexing extends BaseService { friendlyId: deploymentId, }, data: { - imageReference: registryProxy - ? registryProxy.rewriteImageReference(body.imageReference) - : body.imageReference, + imageReference: + registryProxy && body.selfHosted !== true + ? registryProxy.rewriteImageReference(body.imageReference) + : body.imageReference, status: "DEPLOYING", }, }); diff --git a/packages/cli-v3/src/commands/deploy.ts b/packages/cli-v3/src/commands/deploy.ts index 2c1d94654d3..5eaf7895c77 100644 --- a/packages/cli-v3/src/commands/deploy.ts +++ b/packages/cli-v3/src/commands/deploy.ts @@ -387,6 +387,7 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) { deploymentResponse.data.id, { imageReference, + selfHosted: options.selfHosted, } ); diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index 30101309cd9..c5e5d2011a3 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -116,6 +116,7 @@ export type GetEnvironmentVariablesResponseBody = z.infer< export const StartDeploymentIndexingRequestBody = z.object({ imageReference: z.string(), + selfHosted: z.boolean().optional(), }); export type StartDeploymentIndexingRequestBody = z.infer; From 5ca9e5667c826e894f1407460bd33f5b4e280ffa Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 1 May 2024 10:09:44 +0100 Subject: [PATCH 06/57] Add an admin API endpoint to get info about the shared marqs queue --- apps/webapp/app/routes/admin.api.v1.marqs.ts | 31 ++++++++++++++++++++ apps/webapp/app/v3/marqs/index.server.ts | 29 ++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 apps/webapp/app/routes/admin.api.v1.marqs.ts diff --git a/apps/webapp/app/routes/admin.api.v1.marqs.ts b/apps/webapp/app/routes/admin.api.v1.marqs.ts new file mode 100644 index 00000000000..14a9fd409e8 --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.marqs.ts @@ -0,0 +1,31 @@ +import { LoaderFunctionArgs, json } from "@remix-run/server-runtime"; +import { prisma } from "~/db.server"; +import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server"; +import { marqs } from "~/v3/marqs/index.server"; + +export async function loader({ request, params }: LoaderFunctionArgs) { + // Next authenticate the request + const authenticationResult = await authenticateApiRequestWithPersonalAccessToken(request); + + if (!authenticationResult) { + return json({ error: "Invalid or Missing API key" }, { status: 401 }); + } + + const user = await prisma.user.findUnique({ + where: { + id: authenticationResult.userId, + }, + }); + + if (!user) { + return json({ error: "Invalid or Missing API key" }, { status: 401 }); + } + + if (!user.admin) { + return json({ error: "You must be an admin to perform this action" }, { status: 403 }); + } + + const details = await marqs?.getSharedQueueDetails(); + + return json(details); +} diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts index f4658326a81..bfb3cd82dec 100644 --- a/apps/webapp/app/v3/marqs/index.server.ts +++ b/apps/webapp/app/v3/marqs/index.server.ts @@ -283,6 +283,35 @@ export class MarQS { ); } + public async getSharedQueueDetails() { + const parentQueue = constants.SHARED_QUEUE; + + const { range, selectionId } = await this.queuePriorityStrategy.nextCandidateSelection( + parentQueue + ); + const queues = await this.#zrangeWithScores(parentQueue, range[0], range[1]); + + const queuesWithScores = await this.#calculateQueueScores(queues, (queue) => + this.#calculateMessageQueueCapacities(queue) + ); + + // We need to priority shuffle here to ensure all workers aren't just working on the highest priority queue + const choice = this.queuePriorityStrategy.chooseQueue( + queuesWithScores, + parentQueue, + selectionId + ); + + return { + selectionId, + queues, + queuesWithScores, + nextRange: range, + queueCount: queues.length, + queueChoice: choice, + }; + } + /** * Dequeue a message from the shared queue (this should be used in production environments) */ From 1bba5d561756a516a15c21d5ad2027db20843a0c Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 1 May 2024 10:30:59 +0100 Subject: [PATCH 07/57] Allow admins to view any project metrics --- .../routes/projects.v3.$projectRef.metrics.ts | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/apps/webapp/app/routes/projects.v3.$projectRef.metrics.ts b/apps/webapp/app/routes/projects.v3.$projectRef.metrics.ts index 16168361b66..b95b058a68c 100644 --- a/apps/webapp/app/routes/projects.v3.$projectRef.metrics.ts +++ b/apps/webapp/app/routes/projects.v3.$projectRef.metrics.ts @@ -20,22 +20,41 @@ export async function loader({ params, request }: LoaderFunctionArgs) { const validatedParams = ParamsSchema.parse(params); - const project = await prisma.project.findFirst({ + const user = await prisma.user.findUnique({ where: { - externalRef: validatedParams.projectRef, - organization: { - members: { - some: { - userId: authenticationResult.userId, - }, - }, - }, - }, - include: { - organization: true, + id: authenticationResult.userId, }, }); + if (!user) { + return json({ error: "Invalid or Missing Access Token" }, { status: 401 }); + } + + const project = user.admin + ? await prisma.project.findFirst({ + where: { + externalRef: validatedParams.projectRef, + }, + include: { + organization: true, + }, + }) + : await prisma.project.findFirst({ + where: { + externalRef: validatedParams.projectRef, + organization: { + members: { + some: { + userId: authenticationResult.userId, + }, + }, + }, + }, + include: { + organization: true, + }, + }); + if (!project) { return new Response("Not found", { status: 404 }); } From 14992f43ae5c743d7d20b27660a08c7db7357c43 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 1 May 2024 15:38:01 +0100 Subject: [PATCH 08/57] start adding lazy attempts to prod --- apps/coordinator/src/index.ts | 3 +- .../routes/api.v1.runs.$runParam.attempts.ts | 2 +- apps/webapp/app/v3/handleSocketIo.server.ts | 4 + .../app/v3/marqs/devQueueConsumer.server.ts | 2 +- .../v3/marqs/sharedQueueConsumer.server.ts | 116 +++++++++--------- .../createDeployedBackgroundWorker.server.ts | 1 + .../services/createTaskRunAttempt.server.ts | 55 +++++++-- .../cli-v3/src/workers/prod/entry-point.ts | 3 +- packages/core/src/v3/schemas/messages.ts | 69 ++++++++--- 9 files changed, 160 insertions(+), 95 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index e49aa04ebd0..6fd842efaaf 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -890,7 +890,7 @@ class TaskCoordinator { logger.log("[INDEX_TASKS]", message); const workerAck = await this.#platformSocket?.sendWithAck("CREATE_WORKER", { - version: "v1", + version: "v2", projectRef: socket.data.projectRef, envId: socket.data.envId, deploymentId: message.deploymentId, @@ -899,6 +899,7 @@ class TaskCoordinator { packageVersion: message.packageVersion, tasks: message.tasks, }, + supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts, }); if (!workerAck) { diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts index ecc59815b4e..9c2845f6a52 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts @@ -29,7 +29,7 @@ export async function action({ request, params }: ActionFunctionArgs) { const service = new CreateTaskRunAttemptService(); try { - const execution = await service.call(runParam, authenticationResult.environment); + const { execution } = await service.call(runParam, authenticationResult.environment); return json(execution, { status: 200 }); } catch (error) { diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts index 6c9bb340207..81e8c468324 100644 --- a/apps/webapp/app/v3/handleSocketIo.server.ts +++ b/apps/webapp/app/v3/handleSocketIo.server.ts @@ -106,6 +106,9 @@ function createCoordinatorNamespace(io: Server) { TASK_HEARTBEAT: async (message) => { await sharedQueueTasks.taskHeartbeat(message.attemptFriendlyId); }, + TASK_RUN_HEARTBEAT: async (message) => { + await sharedQueueTasks.taskRunHeartbeat(message.runId); + }, CHECKPOINT_CREATED: async (message) => { const createCheckpoint = new CreateCheckpointService(); await createCheckpoint.call(message); @@ -123,6 +126,7 @@ function createCoordinatorNamespace(io: Server) { const worker = await service.call(message.projectRef, environment, message.deploymentId, { localOnly: false, metadata: message.metadata, + supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts, }); return { success: !!worker }; diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts index 8a56981d3dd..f459b00854b 100644 --- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts @@ -509,7 +509,7 @@ export class DevQueueConsumer { } } else { const service = new CreateTaskRunAttemptService(); - const execution = await service.call(lockedTaskRun.friendlyId, this.env); + const { execution } = await service.call(lockedTaskRun.friendlyId, this.env); const payload: TaskRunExecutionPayload = { traceContext: lockedTaskRun.traceContext as Record, diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts index d79cbd0df7f..f88e7a3bca7 100644 --- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts @@ -1,6 +1,5 @@ import { Context, ROOT_CONTEXT, Span, SpanKind, context, trace } from "@opentelemetry/api"; import { - Machine, ProdTaskRunExecution, ProdTaskRunExecutionPayload, TaskRunError, @@ -29,6 +28,8 @@ import { findCurrentWorkerDeployment } from "../models/workerDeployment.server"; import { RestoreCheckpointService } from "../services/restoreCheckpoint.server"; import { tracer } from "../tracer.server"; import { CrashTaskRunService } from "../services/crashTaskRun.server"; +import { FailedTaskRunService } from "../failedTaskRun.server"; +import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server"; const WithTraceContext = z.object({ traceparent: z.string().optional(), @@ -408,26 +409,6 @@ export class SharedQueueConsumer { return; } - const queue = await prisma.taskQueue.findUnique({ - where: { - runtimeEnvironmentId_name: { - runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId, - name: lockedTaskRun.queue, - }, - }, - }); - - if (!queue) { - logger.debug("SharedQueueConsumer queue not found, so nacking message", { - queueMessage: message, - taskRunQueue: lockedTaskRun.queue, - runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId, - }); - - await this.#nackAndDoMoreWork(message.messageId, this._options.nextTickInterval); - return; - } - if (!this._enabled) { logger.debug("SharedQueueConsumer not enabled, so nacking message", { queueMessage: message, @@ -437,39 +418,12 @@ export class SharedQueueConsumer { return; } - const taskRunAttempt = await prisma.taskRunAttempt.create({ - data: { - number: lockedTaskRun.attempts[0] ? lockedTaskRun.attempts[0].number + 1 : 1, - friendlyId: generateFriendlyId("attempt"), - taskRunId: lockedTaskRun.id, - startedAt: new Date(), - backgroundWorkerId: backgroundTask.workerId, - backgroundWorkerTaskId: backgroundTask.id, - status: "PENDING" as const, - queueId: queue.id, - runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId, - }, - include: { - backgroundWorkerTask: true, - }, - }); - - const isRetry = taskRunAttempt.number > 1; + const nextAttemptNumber = lockedTaskRun.attempts[0] + ? lockedTaskRun.attempts[0].number + 1 + : 1; - const { machineConfig } = taskRunAttempt.backgroundWorkerTask; - const machine = Machine.safeParse(machineConfig ?? {}); + const isRetry = nextAttemptNumber > 1; - if (!machine.success) { - logger.error("Failed to parse machine config", { - queueMessage: message.data, - messageId: message.messageId, - attemptId: taskRunAttempt.id, - machineConfig, - }); - - await this.#ackAndDoMoreWork(message.messageId); - return; - } try { if (messageBody.data.checkpointEventId) { const restoreService = new RestoreCheckpointService(); @@ -491,23 +445,51 @@ export class SharedQueueConsumer { } else if (isRetry) { socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { version: "v1", - runId: taskRunAttempt.taskRunId, + runId: lockedTaskRun.id, }); } else { + const environment = await prisma.runtimeEnvironment.findUniqueOrThrow({ + where: { + id: lockedTaskRun.runtimeEnvironmentId, + }, + include: { + project: true, + organization: true, + }, + }); + + const service = new CreateTaskRunAttemptService(); + const { attempt, machine } = await service.call( + lockedTaskRun.friendlyId, + environment, + false + ); + + if (!machine) { + logger.error("Missing machine config", { + queueMessage: message.data, + messageId: message.messageId, + attemptId: attempt.id, + }); + + await this.#ackAndDoMoreWork(message.messageId); + return; + } + await this._sender.send("BACKGROUND_WORKER_MESSAGE", { backgroundWorkerId: deployment.worker.friendlyId, data: { type: "SCHEDULE_ATTEMPT", image: deployment.imageReference, version: deployment.version, - machine: machine.data, + machine: machine, // identifiers - id: taskRunAttempt.id, + id: attempt.id, envId: lockedTaskRun.runtimeEnvironment.id, envType: lockedTaskRun.runtimeEnvironment.type, orgId: lockedTaskRun.runtimeEnvironment.organizationId, projectId: lockedTaskRun.runtimeEnvironment.projectId, - runId: taskRunAttempt.taskRunId, + runId: lockedTaskRun.id, }, }); } @@ -529,11 +511,7 @@ export class SharedQueueConsumer { data: { lockedAt: null, lockedById: null, - }, - }), - prisma.taskRunAttempt.delete({ - where: { - id: taskRunAttempt.id, + status: lockedTaskRun.status, }, }), ]); @@ -1086,6 +1064,8 @@ class SharedQueueTasks { } async taskHeartbeat(attemptFriendlyId: string, seconds: number = 60) { + logger.debug("[SharedQueueConsumer] taskHeartbeat()", { id: attemptFriendlyId, seconds }); + const taskRunAttempt = await prisma.taskRunAttempt.findUnique({ where: { friendlyId: attemptFriendlyId }, }); @@ -1096,6 +1076,20 @@ class SharedQueueTasks { await marqs?.heartbeatMessage(taskRunAttempt.taskRunId, seconds); } + + async taskRunHeartbeat(runId: string, seconds: number = 60) { + logger.debug("[SharedQueueConsumer] taskRunHeartbeat()", { runId, seconds }); + + await marqs?.heartbeatMessage(runId, seconds); + } + + public async taskRunFailed(completion: TaskRunFailedExecutionResult) { + logger.debug("[SharedQueueConsumer] taskRunFailed()", { completion }); + + const service = new FailedTaskRunService(); + + await service.call(completion.id, completion); + } } export const sharedQueueTasks = singleton("sharedQueueTasks", () => new SharedQueueTasks()); diff --git a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts index 9ef6333f337..127a50324d6 100644 --- a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts +++ b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts @@ -44,6 +44,7 @@ export class CreateDeployedBackgroundWorkerService extends BaseService { contentHash: body.metadata.contentHash, cliVersion: body.metadata.cliPackageVersion, sdkVersion: body.metadata.packageVersion, + supportsLazyAttempts: body.supportsLazyAttempts, }, }); diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts index 7e24985404d..cdb2b8a9c28 100644 --- a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts +++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts @@ -1,15 +1,22 @@ -import { TaskRunExecution } from "@trigger.dev/core/v3"; +import { Machine, TaskRunExecution } from "@trigger.dev/core/v3"; import { $transaction } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { generateFriendlyId } from "../friendlyIdentifiers"; import { BaseService, ServiceValidationError } from "./baseService.server"; +import { TaskRun, TaskRunAttempt } from "@trigger.dev/database"; export class CreateTaskRunAttemptService extends BaseService { public async call( runFriendlyId: string, - environment: AuthenticatedEnvironment - ): Promise { + environment: AuthenticatedEnvironment, + setToExecuting = true + ): Promise<{ + execution: TaskRunExecution; + run: TaskRun; + attempt: TaskRunAttempt; + machine?: Machine; + }> { return await this.traceWithEnv("call()", environment, async (span) => { span.setAttribute("taskRunId", runFriendlyId); @@ -70,21 +77,26 @@ export class CreateTaskRunAttemptService extends BaseService { startedAt: new Date(), backgroundWorkerId: taskRun.lockedBy!.worker.id, backgroundWorkerTaskId: taskRun.lockedBy!.id, - status: "EXECUTING" as const, + status: setToExecuting ? "EXECUTING" : "PENDING", queueId: queue.id, runtimeEnvironmentId: environment.id, }, - }); - - await tx.taskRun.update({ - where: { - id: taskRun.id, - }, - data: { - status: "EXECUTING", + include: { + backgroundWorkerTask: true, }, }); + if (setToExecuting) { + await tx.taskRun.update({ + where: { + id: taskRun.id, + }, + data: { + status: "EXECUTING", + }, + }); + } + return taskRunAttempt; }); @@ -142,7 +154,24 @@ export class CreateTaskRunAttemptService extends BaseService { : undefined, }; - return execution; + const { machineConfig } = taskRunAttempt.backgroundWorkerTask; + const machine = Machine.safeParse(machineConfig ?? {}); + + if (!machine.success) { + logger.error("Failed to parse machine config", { + run: taskRun.id, + attempt: taskRunAttempt.id, + backgroundWorkerTask: taskRunAttempt.backgroundWorkerTask.id, + machineConfig, + }); + } + + return { + execution, + run: taskRun, + attempt: taskRunAttempt, + machine: machine.success ? machine.data : undefined, + }; }); } } diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 457ada131dc..641414c7236 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -473,9 +473,10 @@ class ProdWorker { const taskResources = await this.#initializeWorker(); const { success } = await socket.emitWithAck("INDEX_TASKS", { - version: "v1", + version: "v2", deploymentId: this.deploymentId, ...taskResources, + supportsLazyAttempts: true, }); if (success) { diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index 819b8c3a491..b81425487d0 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -165,6 +165,10 @@ export const childToWorkerMessages = { version: z.literal("v1").default("v1"), id: z.string(), }), + TASK_RUN_HEARTBEAT: z.object({ + version: z.literal("v1").default("v1"), + id: z.string(), + }), READY_TO_DISPOSE: z.undefined(), WAIT_FOR_DURATION: z.object({ version: z.literal("v1").default("v1"), @@ -205,6 +209,12 @@ export const ProdChildToWorkerMessages = { id: z.string(), }), }, + TASK_RUN_HEARTBEAT: { + message: z.object({ + version: z.literal("v1").default("v1"), + id: z.string(), + }), + }, READY_TO_DISPOSE: { message: z.undefined(), }, @@ -402,6 +412,18 @@ export const PlatformToProviderMessages = { }, }; +const CreateWorkerMessage = z.object({ + projectRef: z.string(), + envId: z.string(), + deploymentId: z.string(), + metadata: z.object({ + cliPackageVersion: z.string().optional(), + contentHash: z.string(), + packageVersion: z.string(), + tasks: TaskResource.array(), + }), +}); + export const CoordinatorToPlatformMessages = { LOG: { message: z.object({ @@ -411,18 +433,15 @@ export const CoordinatorToPlatformMessages = { }), }, CREATE_WORKER: { - message: z.object({ - version: z.literal("v1").default("v1"), - projectRef: z.string(), - envId: z.string(), - deploymentId: z.string(), - metadata: z.object({ - cliPackageVersion: z.string().optional(), - contentHash: z.string(), - packageVersion: z.string(), - tasks: TaskResource.array(), + message: z.discriminatedUnion("version", [ + CreateWorkerMessage.extend({ + version: z.literal("v1"), }), - }), + CreateWorkerMessage.extend({ + version: z.literal("v2"), + supportsLazyAttempts: z.boolean(), + }), + ]), callback: z.discriminatedUnion("success", [ z.object({ success: z.literal(false), @@ -474,6 +493,12 @@ export const CoordinatorToPlatformMessages = { attemptFriendlyId: z.string(), }), }, + TASK_RUN_HEARTBEAT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + }), + }, CHECKPOINT_CREATED: { message: z.object({ version: z.literal("v1").default("v1"), @@ -586,6 +611,13 @@ export const SharedQueueToClientMessages = { }, }; +const IndexTasksMessage = z.object({ + version: z.literal("v1"), + deploymentId: z.string(), + tasks: TaskResource.array(), + packageVersion: z.string(), +}); + export const ProdWorkerToCoordinatorMessages = { LOG: { message: z.object({ @@ -595,12 +627,15 @@ export const ProdWorkerToCoordinatorMessages = { callback: z.void(), }, INDEX_TASKS: { - message: z.object({ - version: z.literal("v1").default("v1"), - deploymentId: z.string(), - tasks: TaskResource.array(), - packageVersion: z.string(), - }), + message: z.discriminatedUnion("version", [ + IndexTasksMessage.extend({ + version: z.literal("v1"), + }), + IndexTasksMessage.extend({ + version: z.literal("v2"), + supportsLazyAttempts: z.boolean(), + }), + ]), callback: z.discriminatedUnion("success", [ z.object({ success: z.literal(false), From c75bbfdf2c53e9c76f1f5308dbbe096737474147 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 2 May 2024 15:58:02 +0100 Subject: [PATCH 09/57] lazy attempt creation for prod workers --- apps/coordinator/src/index.ts | 76 +++++++++++++++ apps/webapp/app/v3/eventRepository.server.ts | 31 ++++++ apps/webapp/app/v3/failedTaskRun.server.ts | 38 +------- apps/webapp/app/v3/handleSocketIo.server.ts | 48 ++++++++++ .../v3/marqs/sharedQueueConsumer.server.ts | 84 +++++++++++----- .../services/createTaskRunAttempt.server.ts | 84 +++++++++++----- .../src/workers/prod/backgroundWorker.ts | 84 ++++++++++++++-- .../cli-v3/src/workers/prod/entry-point.ts | 96 ++++++++++++++++++- packages/core-apps/src/provider.ts | 2 - packages/core/src/v3/schemas/messages.ts | 84 +++++++++++++++- 10 files changed, 536 insertions(+), 91 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 6fd842efaaf..7277d8c13a0 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -624,6 +624,44 @@ class TaskCoordinator { } }); + socket.on("READY_FOR_LAZY_ATTEMPT", async (message) => { + logger.log("[READY_FOR_LAZY_ATTEMPT]", message); + + try { + const lazyAttempt = await this.#platformSocket?.sendWithAck("READY_FOR_LAZY_ATTEMPT", { + ...message, + envId: socket.data.envId, + }); + + if (!lazyAttempt) { + logger.error("no lazy attempt ack", { runId: socket.data.runId }); + + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + + return; + } + + if (!lazyAttempt.success) { + logger.error("failed to get lazy attempt payload", { runId: socket.data.runId }); + + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + + return; + } + + socket.emit("EXECUTE_TASK_RUN_LAZY_ATTEMPT", { + version: "v1", + lazyPayload: lazyAttempt.lazyPayload, + }); + } catch (error) { + logger.error("Error", { error }); + } + }); + socket.on("READY_FOR_RESUME", async (message) => { logger.log("[READY_FOR_RESUME]", message); @@ -714,6 +752,19 @@ class TaskCoordinator { } }); + socket.on("TASK_RUN_FAILED_TO_RUN", async ({ completion }) => { + logger.log("completed task", { completionId: completion.id }); + + this.#platformSocket?.send("TASK_RUN_FAILED_TO_RUN", { + version: "v1", + completion, + }); + + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + }); + socket.on("READY_FOR_CHECKPOINT", async (message) => { logger.log("[READY_FOR_CHECKPOINT]", message); @@ -918,6 +969,28 @@ class TaskCoordinator { error: message.error, }); }); + + socket.on("CREATE_TASK_RUN_ATTEMPT", async (message, callback) => { + logger.log("[CREATE_TASK_RUN_ATTEMPT]", message); + + const createAttempt = await this.#platformSocket?.sendWithAck("CREATE_TASK_RUN_ATTEMPT", { + runId: message.runId, + envId: socket.data.envId, + }); + + if (!createAttempt?.success) { + logger.debug("no ack while creating attempt", message); + callback({ success: false }); + return; + } + + socket.data.attemptFriendlyId = createAttempt.executionPayload.execution.attempt.id; + + callback({ + success: true, + executionPayload: createAttempt.executionPayload, + }); + }); }, onDisconnect: async (socket, handler, sender, logger) => { this.#platformSocket?.send("LOG", { @@ -929,6 +1002,9 @@ class TaskCoordinator { TASK_HEARTBEAT: async (message) => { this.#platformSocket?.send("TASK_HEARTBEAT", message); }, + TASK_RUN_HEARTBEAT: async (message) => { + this.#platformSocket?.send("TASK_RUN_HEARTBEAT", message); + }, }, }); diff --git a/apps/webapp/app/v3/eventRepository.server.ts b/apps/webapp/app/v3/eventRepository.server.ts index bfac9b700a7..8c94f44e26c 100644 --- a/apps/webapp/app/v3/eventRepository.server.ts +++ b/apps/webapp/app/v3/eventRepository.server.ts @@ -10,6 +10,7 @@ import { SpanEvents, SpanMessagingEvent, TaskEventStyle, + TaskRunError, correctErrorStackTrace, createPacketAttributesAsJson, flattenAttributes, @@ -864,6 +865,36 @@ export function stripAttributePrefix(attributes: Attributes, prefix: string) { return result; } +export function createExceptionPropertiesFromError(error: TaskRunError): ExceptionEventProperties { + switch (error.type) { + case "BUILT_IN_ERROR": { + return { + type: error.name, + message: error.message, + stacktrace: error.stackTrace, + }; + } + case "CUSTOM_ERROR": { + return { + type: "Error", + message: error.raw, + }; + } + case "INTERNAL_ERROR": { + return { + type: "Internal error", + message: [error.code, error.message].filter(Boolean).join(": "), + }; + } + case "STRING_ERROR": { + return { + type: "Error", + message: error.raw, + }; + } + } +} + /** * Filters out partial events from a batch of creatable events, excluding those that have a corresponding full event. * @param batch - The batch of creatable events to filter. diff --git a/apps/webapp/app/v3/failedTaskRun.server.ts b/apps/webapp/app/v3/failedTaskRun.server.ts index b9cffeaf670..79594e73cba 100644 --- a/apps/webapp/app/v3/failedTaskRun.server.ts +++ b/apps/webapp/app/v3/failedTaskRun.server.ts @@ -1,13 +1,9 @@ -import { - ExceptionEventProperties, - TaskRunError, - TaskRunFailedExecutionResult, -} from "@trigger.dev/core/v3"; +import { TaskRunFailedExecutionResult } from "@trigger.dev/core/v3"; import { logger } from "~/services/logger.server"; import { marqs } from "~/v3/marqs/index.server"; import { TaskRunStatus } from "@trigger.dev/database"; -import { eventRepository } from "./eventRepository.server"; +import { createExceptionPropertiesFromError, eventRepository } from "./eventRepository.server"; import { BaseService } from "./services/baseService.server"; const FAILABLE_TASK_RUN_STATUSES: TaskRunStatus[] = ["EXECUTING", "PENDING", "WAITING_FOR_DEPLOY"]; @@ -68,33 +64,3 @@ export class FailedTaskRunService extends BaseService { }); } } - -function createExceptionPropertiesFromError(error: TaskRunError): ExceptionEventProperties { - switch (error.type) { - case "BUILT_IN_ERROR": { - return { - type: error.name, - message: error.message, - stacktrace: error.stackTrace, - }; - } - case "CUSTOM_ERROR": { - return { - type: "Error", - message: error.raw, - }; - } - case "INTERNAL_ERROR": { - return { - type: "Internal error", - message: [error.code, error.message].filter(Boolean).join(": "), - }; - } - case "STRING_ERROR": { - return { - type: "Error", - message: error.raw, - }; - } - } -} diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts index 81e8c468324..2b917e45ad0 100644 --- a/apps/webapp/app/v3/handleSocketIo.server.ts +++ b/apps/webapp/app/v3/handleSocketIo.server.ts @@ -22,6 +22,7 @@ import { DeploymentIndexFailed } from "./services/deploymentIndexFailed.server"; import { Redis } from "ioredis"; import { createAdapter } from "@socket.io/redis-adapter"; import { CrashTaskRunService } from "./services/crashTaskRun.server"; +import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server"; export const socketIo = singleton("socketIo", initalizeIoServer); @@ -91,6 +92,23 @@ function createCoordinatorNamespace(io: Server) { return { success: true, payload }; } }, + READY_FOR_LAZY_ATTEMPT: async (message) => { + try { + const payload = await sharedQueueTasks.getLazyAttemptPayload( + message.envId, + message.runId + ); + + if (!payload) { + logger.error("Failed to retrieve lazy attempt payload", message); + return { success: false, reason: "Failed to retrieve payload" }; + } + + return { success: true, lazyPayload: payload }; + } catch (error) { + return { success: false }; + } + }, READY_FOR_RESUME: async (message) => { const resumeAttempt = new ResumeAttemptService(); await resumeAttempt.call(message); @@ -103,6 +121,9 @@ function createCoordinatorNamespace(io: Server) { checkpoint: message.checkpoint, }); }, + TASK_RUN_FAILED_TO_RUN: async (message) => { + await sharedQueueTasks.taskRunFailed(message.completion); + }, TASK_HEARTBEAT: async (message) => { await sharedQueueTasks.taskHeartbeat(message.attemptFriendlyId); }, @@ -135,6 +156,33 @@ function createCoordinatorNamespace(io: Server) { return { success: false }; } }, + CREATE_TASK_RUN_ATTEMPT: async (message) => { + try { + const environment = await findEnvironmentById(message.envId); + + if (!environment) { + logger.error("Environment not found", { id: message.envId }); + return { success: false, reason: "Environment not found" }; + } + + const service = new CreateTaskRunAttemptService(); + const { attempt } = await service.call(message.runId, environment, false); + + const payload = await sharedQueueTasks.getExecutionPayloadFromAttempt(attempt.id, true); + + if (!payload) { + logger.error("Failed to retrieve payload after attempt creation", { + id: message.envId, + }); + return { success: false, reason: "Failed to retrieve payload" }; + } + + return { success: true, executionPayload: payload }; + } catch (error) { + logger.error("Error while creating attempt", { error }); + return { success: false }; + } + }, INDEXING_FAILED: async (message) => { try { const service = new DeploymentIndexFailed(); diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts index f88e7a3bca7..4e75c20477b 100644 --- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts @@ -1,9 +1,11 @@ import { Context, ROOT_CONTEXT, Span, SpanKind, context, trace } from "@opentelemetry/api"; import { + Machine, ProdTaskRunExecution, ProdTaskRunExecutionPayload, TaskRunError, TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionResult, TaskRunFailedExecutionResult, TaskRunSuccessfulExecutionResult, @@ -30,6 +32,7 @@ import { tracer } from "../tracer.server"; import { CrashTaskRunService } from "../services/crashTaskRun.server"; import { FailedTaskRunService } from "../failedTaskRun.server"; import { CreateTaskRunAttemptService } from "../services/createTaskRunAttempt.server"; +import { findEnvironmentById } from "~/models/runtimeEnvironment.server"; const WithTraceContext = z.object({ traceparent: z.string().optional(), @@ -393,6 +396,7 @@ export class SharedQueueConsumer { createdAt: "desc", }, }, + lockedBy: true, }, }); @@ -442,34 +446,29 @@ export class SharedQueueConsumer { await this.#ackAndDoMoreWork(message.messageId); return; } - } else if (isRetry) { + + break; + } + + if (!deployment.worker.supportsLazyAttempts) { + const service = new CreateTaskRunAttemptService(); + await service.call(lockedTaskRun.friendlyId, undefined, false); + } + + if (isRetry) { socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { version: "v1", runId: lockedTaskRun.id, }); } else { - const environment = await prisma.runtimeEnvironment.findUniqueOrThrow({ - where: { - id: lockedTaskRun.runtimeEnvironmentId, - }, - include: { - project: true, - organization: true, - }, - }); + const machineConfig = lockedTaskRun.lockedBy?.machineConfig; + const machine = Machine.safeParse(machineConfig ?? {}); - const service = new CreateTaskRunAttemptService(); - const { attempt, machine } = await service.call( - lockedTaskRun.friendlyId, - environment, - false - ); - - if (!machine) { - logger.error("Missing machine config", { + if (!machine.success) { + logger.error("Failed to parse machine config", { queueMessage: message.data, messageId: message.messageId, - attemptId: attempt.id, + machineConfig, }); await this.#ackAndDoMoreWork(message.messageId); @@ -482,9 +481,9 @@ export class SharedQueueConsumer { type: "SCHEDULE_ATTEMPT", image: deployment.imageReference, version: deployment.version, - machine: machine, + machine: machine.data, // identifiers - id: attempt.id, + id: "placeholder", // TODO: Remove this completely in a future release envId: lockedTaskRun.runtimeEnvironment.id, envType: lockedTaskRun.runtimeEnvironment.type, orgId: lockedTaskRun.runtimeEnvironment.organizationId, @@ -1063,6 +1062,47 @@ class SharedQueueTasks { return this.getExecutionPayloadFromAttempt(latestAttempt.id, setToExecuting, isRetrying); } + async getLazyAttemptPayload( + envId: string, + runId: string + ): Promise { + const environment = await findEnvironmentById(envId); + + if (!environment) { + logger.error("Environment not found", { id: envId }); + return; + } + + const run = await prisma.taskRun.findUnique({ + where: { + id: runId, + runtimeEnvironmentId: environment.id, + }, + }); + + if (!run) { + logger.error("Run not found", { id: runId, envId }); + return; + } + + const environmentRepository = new EnvironmentVariablesRepository(); + const variables = await environmentRepository.getEnvironmentVariables( + environment.projectId, + environment.id + ); + + return { + traceContext: run.traceContext as Record, + environment: variables.reduce((acc: Record, curr) => { + acc[curr.key] = curr.value; + return acc; + }, {}), + runId: run.friendlyId, + messageId: run.id, + isTest: run.isTest, + } satisfies TaskRunExecutionLazyAttemptPayload; + } + async taskHeartbeat(attemptFriendlyId: string, seconds: number = 60) { logger.debug("[SharedQueueConsumer] taskHeartbeat()", { id: attemptFriendlyId, seconds }); diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts index cdb2b8a9c28..90ad4927352 100644 --- a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts +++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts @@ -1,5 +1,5 @@ -import { Machine, TaskRunExecution } from "@trigger.dev/core/v3"; -import { $transaction } from "~/db.server"; +import { TaskRunExecution } from "@trigger.dev/core/v3"; +import { $transaction, PrismaClientOrTransaction, prisma } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { generateFriendlyId } from "../friendlyIdentifiers"; @@ -8,26 +8,47 @@ import { TaskRun, TaskRunAttempt } from "@trigger.dev/database"; export class CreateTaskRunAttemptService extends BaseService { public async call( - runFriendlyId: string, - environment: AuthenticatedEnvironment, + runId: string, + env?: AuthenticatedEnvironment, setToExecuting = true ): Promise<{ execution: TaskRunExecution; run: TaskRun; attempt: TaskRunAttempt; - machine?: Machine; }> { + let environment: AuthenticatedEnvironment | undefined = env; + + if (!environment) { + environment = await getAuthenticatedEnvironmentFromRun(runId, this._prisma); + + if (!environment) { + throw new ServiceValidationError("Environment not found", 404); + } + } + + const isFriendlyId = runId.startsWith("run_"); + return await this.traceWithEnv("call()", environment, async (span) => { - span.setAttribute("taskRunId", runFriendlyId); + if (isFriendlyId) { + span.setAttribute("taskRunFriendlyId", runId); + } else { + span.setAttribute("taskRunId", runId); + } const taskRun = await this._prisma.taskRun.findUnique({ where: { - friendlyId: runFriendlyId, + id: !isFriendlyId ? runId : undefined, + friendlyId: isFriendlyId ? runId : undefined, runtimeEnvironmentId: environment.id, }, include: { tags: true, - attempts: true, + attempts: { + take: 1, + orderBy: { + number: "desc", + }, + }, lockedBy: { include: { worker: true, @@ -47,6 +68,9 @@ export class CreateTaskRunAttemptService extends BaseService { throw new ServiceValidationError("Task run not found", 404); } + span.setAttribute("taskRunId", taskRun.id); + span.setAttribute("taskRunFriendlyId", taskRun.friendlyId); + if (taskRun.status === "CANCELED") { throw new ServiceValidationError("Task run is cancelled", 400); } @@ -68,10 +92,12 @@ export class CreateTaskRunAttemptService extends BaseService { throw new ServiceValidationError("Queue not found", 404); } + const nextAttemptNumber = taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1; + const taskRunAttempt = await $transaction(this._prisma, async (tx) => { const taskRunAttempt = await tx.taskRunAttempt.create({ data: { - number: taskRun.attempts[0] ? taskRun.attempts[0].number + 1 : 1, + number: nextAttemptNumber, friendlyId: generateFriendlyId("attempt"), taskRunId: taskRun.id, startedAt: new Date(), @@ -82,6 +108,7 @@ export class CreateTaskRunAttemptService extends BaseService { runtimeEnvironmentId: environment.id, }, include: { + backgroundWorker: true, backgroundWorkerTask: true, }, }); @@ -101,6 +128,7 @@ export class CreateTaskRunAttemptService extends BaseService { }); if (!taskRunAttempt) { + logger.error("Failed to create task run attempt", { runId: taskRun.id, nextAttemptNumber }); throw new ServiceValidationError("Failed to create task run attempt", 500); } @@ -154,24 +182,36 @@ export class CreateTaskRunAttemptService extends BaseService { : undefined, }; - const { machineConfig } = taskRunAttempt.backgroundWorkerTask; - const machine = Machine.safeParse(machineConfig ?? {}); - - if (!machine.success) { - logger.error("Failed to parse machine config", { - run: taskRun.id, - attempt: taskRunAttempt.id, - backgroundWorkerTask: taskRunAttempt.backgroundWorkerTask.id, - machineConfig, - }); - } - return { execution, run: taskRun, attempt: taskRunAttempt, - machine: machine.success ? machine.data : undefined, }; }); } } + +async function getAuthenticatedEnvironmentFromRun( + friendlyId: string, + prismaClient?: PrismaClientOrTransaction +) { + const taskRun = await (prismaClient ?? prisma).taskRun.findUnique({ + where: { + friendlyId, + }, + include: { + runtimeEnvironment: { + include: { + organization: true, + project: true, + }, + }, + }, + }); + + if (!taskRun) { + return; + } + + return taskRun?.runtimeEnvironment; +} diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 8f2bbe897a1..7766b980146 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -11,6 +11,7 @@ import { TaskRunBuiltInError, TaskRunErrorCodes, TaskRunExecution, + TaskRunExecutionLazyAttemptPayload, TaskRunExecutionPayload, TaskRunExecutionResult, WaitReason, @@ -56,7 +57,11 @@ type BackgroundWorkerParams = { export class ProdBackgroundWorker { private _initialized: boolean = false; + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); public onWaitForBatch: Evt< InferSocketMessageSchema @@ -74,6 +79,18 @@ export class ProdBackgroundWorker { public onReadyForCheckpoint = Evt.create<{ version?: "v1" }>(); public onCancelCheckpoint = Evt.create<{ version?: "v1" | "v2"; reason?: WaitReason }>(); + public onCreateTaskRunAttempt = Evt.create<{ version?: "v1"; runId: string }>(); + public attemptCreatedNotification = Evt.create< + | { + success: false; + reason?: string; + } + | { + success: true; + execution: ProdTaskRunExecution; + } + >(); + private _onClose: Evt = new Evt(); public tasks: Array = []; @@ -95,6 +112,7 @@ export class ProdBackgroundWorker { this._closed = true; this.onTaskHeartbeat.detach(); + this.onTaskRunHeartbeat.detach(); // We need to close the task run process await this._taskRunProcess?.cleanup(true); @@ -204,7 +222,10 @@ export class ProdBackgroundWorker { this._taskRunProcess?.waitCompletedNotification(); } - async #initializeTaskRunProcess(payload: ProdTaskRunExecutionPayload): Promise { + async #initializeTaskRunProcess( + payload: ProdTaskRunExecutionPayload, + messageId?: string + ): Promise { const metadata = this.getMetadata( payload.execution.worker.id, payload.execution.worker.version @@ -219,7 +240,8 @@ export class ProdBackgroundWorker { ...(payload.environment ?? {}), }, metadata, - this.params + this.params, + messageId ); taskRunProcess.onExit.attach(() => { @@ -230,6 +252,10 @@ export class ProdBackgroundWorker { this.onTaskHeartbeat.post(id); }); + taskRunProcess.onTaskRunHeartbeat.attach((id) => { + this.onTaskRunHeartbeat.post(id); + }); + taskRunProcess.onWaitForBatch.attach((message) => { this.onWaitForBatch.post(message); }); @@ -267,9 +293,12 @@ export class ProdBackgroundWorker { } // We need to fork the process before we can execute any tasks - async executeTaskRun(payload: ProdTaskRunExecutionPayload): Promise { + async executeTaskRun( + payload: ProdTaskRunExecutionPayload, + messageId?: string + ): Promise { try { - const taskRunProcess = await this.#initializeTaskRunProcess(payload); + const taskRunProcess = await this.#initializeTaskRunProcess(payload, messageId); const result = await taskRunProcess.executeTaskRun(payload); @@ -342,6 +371,40 @@ export class ProdBackgroundWorker { await this._taskRunProcess?.cancel(); } + async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload) { + // Post to coordinator + this.onCreateTaskRunAttempt.post({ runId: payload.runId }); + + let execution: ProdTaskRunExecution; + + try { + // ..and wait for response + const attemptCreated = await this.attemptCreatedNotification.waitFor(30_000); + + if (!attemptCreated.success) { + throw new Error( + `Failed to create attempt${attemptCreated.reason ? `: ${attemptCreated.reason}` : ""}` + ); + } + + execution = attemptCreated.execution; + } catch (error) { + console.error("Error while creating attempt", error); + throw new Error(`Failed to create task run attempt: ${error}`); + } + + const completion = await this.executeTaskRun( + { + execution, + traceContext: payload.traceContext, + environment: payload.environment, + }, + payload.messageId + ); + + return { execution, completion }; + } + async #correctError( error: TaskRunBuiltInError, execution: TaskRunExecution @@ -369,7 +432,11 @@ class TaskRunProcess { private _isBeingKilled: boolean = false; private _isBeingCancelled: boolean = false; + /** + * @deprecated use onTaskRunHeartbeat instead + */ public onTaskHeartbeat: Evt = new Evt(); + public onTaskRunHeartbeat: Evt = new Evt(); public onExit: Evt = new Evt(); public onWaitForBatch: Evt< @@ -393,7 +460,8 @@ class TaskRunProcess { private path: string, private env: NodeJS.ProcessEnv, private metadata: BackgroundWorkerProperties, - private worker: BackgroundWorkerParams + private worker: BackgroundWorkerParams, + private messageId?: string ) {} async initialize() { @@ -439,7 +507,11 @@ class TaskRunProcess { process.exit(0); }, TASK_HEARTBEAT: async (message) => { - this.onTaskHeartbeat.post(message.id); + if (this.messageId) { + this.onTaskRunHeartbeat.post(this.messageId); + } else { + this.onTaskHeartbeat.post(message.id); + } }, TASKS_READY: async (message) => {}, WAIT_FOR_TASK: async (message) => { diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 641414c7236..f6fe5a0f3ad 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -5,6 +5,7 @@ import { PreStopCauses, ProdWorkerToCoordinatorMessages, TaskResource, + TaskRunFailedExecutionResult, WaitReason, } from "@trigger.dev/core/v3"; import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket"; @@ -78,6 +79,10 @@ class ProdWorker { this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId }); }); + this.#backgroundWorker.onTaskRunHeartbeat.attach((runId) => { + this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId }); + }); + this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => { // Flush before checkpointing so we don't flush the same spans again after restore await this.#backgroundWorker.flushTelemetry(); @@ -108,6 +113,40 @@ class ProdWorker { this.#backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled }); }); + this.#backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => { + logger.log("onCreateTaskRunAttempt()", { message }); + + const createAttempt = await this.#coordinatorSocket.socket.emitWithAck( + "CREATE_TASK_RUN_ATTEMPT", + { + version: "v1", + runId: message.runId, + } + ); + + if (!createAttempt.success) { + this.#backgroundWorker.attemptCreatedNotification.post({ + success: false, + reason: createAttempt.reason, + }); + return; + } + + this.#backgroundWorker.attemptCreatedNotification.post({ + success: true, + execution: createAttempt.executionPayload.execution, + }); + }); + + this.#backgroundWorker.attemptCreatedNotification.attach((message) => { + if (!message.success) { + return; + } + + // Workers with lazy attempt support set their friendly ID here + this.attemptFriendlyId = message.execution.attempt.id; + }); + this.#backgroundWorker.onWaitForDuration.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); @@ -420,6 +459,59 @@ class ProdWorker { this.#prepareForRetry(willCheckpointAndRestore, shouldExit); }, + EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => { + if (this.executing) { + logger.error("dropping execute request, already executing"); + return; + } + + this.executing = true; + + try { + const { completion, execution } = + await this.#backgroundWorker.executeTaskRunLazyAttempt(message.lazyPayload); + + logger.log("completed", completion); + + this.completed.add(execution.attempt.id); + + const { willCheckpointAndRestore, shouldExit } = + await this.#coordinatorSocket.socket.emitWithAck("TASK_RUN_COMPLETED", { + version: "v1", + execution, + completion, + }); + + logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit }); + + this.#prepareForRetry(willCheckpointAndRestore, shouldExit); + } catch (error) { + const completion: TaskRunFailedExecutionResult = { + ok: false, + id: message.lazyPayload.runId, + retry: undefined, + error: + error instanceof Error + ? { + type: "BUILT_IN_ERROR", + name: error.name, + message: error.message, + stackTrace: error.stack ?? "", + } + : { + type: "BUILT_IN_ERROR", + name: "UnknownError", + message: String(error), + stackTrace: "", + }, + }; + + this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", { + version: "v1", + completion, + }); + } + }, REQUEST_ATTEMPT_CANCELLATION: async (message) => { if (!this.executing) { return; @@ -436,7 +528,7 @@ class ProdWorker { return; } - this.#coordinatorSocket.socket.emit("READY_FOR_EXECUTION", { + this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", { version: "v1", runId: this.runId, totalCompletions: this.completed.size, @@ -564,7 +656,7 @@ class ProdWorker { return; } - socket.emit("READY_FOR_EXECUTION", { + socket.emit("READY_FOR_LAZY_ATTEMPT", { version: "v1", runId: this.runId, totalCompletions: this.completed.size, diff --git a/packages/core-apps/src/provider.ts b/packages/core-apps/src/provider.ts index cb1b0c4d574..159c81dbc37 100644 --- a/packages/core-apps/src/provider.ts +++ b/packages/core-apps/src/provider.ts @@ -46,7 +46,6 @@ export interface TaskOperationsCreateOptions { orgId: string; projectId: string; runId: string; - attemptId: string; } export interface TaskOperationsRestoreOptions { @@ -129,7 +128,6 @@ export class ProviderShell implements Provider { orgId: message.data.orgId, projectId: message.data.projectId, runId: message.data.runId, - attemptId: message.data.id, }); } catch (error) { logger.error("create failed", error); diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index b81425487d0..2c31e047f89 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -29,7 +29,7 @@ export const BackgroundWorkerServerMessages = z.discriminatedUnion("type", [ version: z.string(), machine: Machine, // identifiers - id: z.string(), // attempt + id: z.string().optional(), // TODO: Remove this completely in a future release envId: z.string(), envType: EnvironmentType, orgId: z.string(), @@ -451,6 +451,23 @@ export const CoordinatorToPlatformMessages = { }), ]), }, + CREATE_TASK_RUN_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + envId: z.string(), + }), + callback: z.discriminatedUnion("success", [ + z.object({ + success: z.literal(false), + reason: z.string().optional(), + }), + z.object({ + success: z.literal(true), + executionPayload: ProdTaskRunExecutionPayload, + }), + ]), + }, READY_FOR_EXECUTION: { message: z.object({ version: z.literal("v1").default("v1"), @@ -467,6 +484,24 @@ export const CoordinatorToPlatformMessages = { }), ]), }, + READY_FOR_LAZY_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + envId: z.string(), + totalCompletions: z.number(), + }), + callback: z.discriminatedUnion("success", [ + z.object({ + success: z.literal(false), + reason: z.string().optional(), + }), + z.object({ + success: z.literal(true), + lazyPayload: TaskRunExecutionLazyAttemptPayload, + }), + ]), + }, READY_FOR_RESUME: { message: z.object({ version: z.literal("v1").default("v1"), @@ -487,6 +522,12 @@ export const CoordinatorToPlatformMessages = { .optional(), }), }, + TASK_RUN_FAILED_TO_RUN: { + message: z.object({ + version: z.literal("v1").default("v1"), + completion: TaskRunFailedExecutionResult, + }), + }, TASK_HEARTBEAT: { message: z.object({ version: z.literal("v1").default("v1"), @@ -652,6 +693,13 @@ export const ProdWorkerToCoordinatorMessages = { totalCompletions: z.number(), }), }, + READY_FOR_LAZY_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + totalCompletions: z.number(), + }), + }, READY_FOR_RESUME: { message: z.object({ version: z.literal("v1").default("v1"), @@ -688,6 +736,12 @@ export const ProdWorkerToCoordinatorMessages = { attemptFriendlyId: z.string(), }), }, + TASK_RUN_HEARTBEAT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + }), + }, TASK_RUN_COMPLETED: { message: z.object({ version: z.literal("v1").default("v1"), @@ -699,6 +753,12 @@ export const ProdWorkerToCoordinatorMessages = { shouldExit: z.boolean(), }), }, + TASK_RUN_FAILED_TO_RUN: { + message: z.object({ + version: z.literal("v1").default("v1"), + completion: TaskRunFailedExecutionResult, + }), + }, WAIT_FOR_DURATION: { message: z.object({ version: z.literal("v1").default("v1"), @@ -744,6 +804,22 @@ export const ProdWorkerToCoordinatorMessages = { }), }), }, + CREATE_TASK_RUN_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + }), + callback: z.discriminatedUnion("success", [ + z.object({ + success: z.literal(false), + reason: z.string().optional(), + }), + z.object({ + success: z.literal(true), + executionPayload: ProdTaskRunExecutionPayload, + }), + ]), + }, }; export const CoordinatorToProdWorkerMessages = { @@ -767,6 +843,12 @@ export const CoordinatorToProdWorkerMessages = { executionPayload: ProdTaskRunExecutionPayload, }), }, + EXECUTE_TASK_RUN_LAZY_ATTEMPT: { + message: z.object({ + version: z.literal("v1").default("v1"), + lazyPayload: TaskRunExecutionLazyAttemptPayload, + }), + }, REQUEST_ATTEMPT_CANCELLATION: { message: z.object({ version: z.literal("v1").default("v1"), From f53004de6b500307fcb5f37bdb0e2e9244334fd5 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 2 May 2024 16:40:25 +0100 Subject: [PATCH 10/57] resurrect prod stack traces --- apps/webapp/app/v3/eventRepository.server.ts | 21 +++++++++++++------- packages/core/src/v3/errors.ts | 12 +++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/apps/webapp/app/v3/eventRepository.server.ts b/apps/webapp/app/v3/eventRepository.server.ts index 8c94f44e26c..b81474ef49b 100644 --- a/apps/webapp/app/v3/eventRepository.server.ts +++ b/apps/webapp/app/v3/eventRepository.server.ts @@ -415,6 +415,7 @@ export class EventRepository { }, select: { traceId: true, + environmentType: true, }, }); @@ -493,7 +494,11 @@ export class EventRepository { }); } - const events = transformEvents(span.data.events, fullEvent.metadata as Attributes); + const events = transformEvents( + span.data.events, + fullEvent.metadata as Attributes, + traceSearch.environmentType === "DEVELOPMENT" + ); return { ...fullEvent, @@ -1115,16 +1120,16 @@ function removePrivateProperties( return result; } -function transformEvents(events: SpanEvents, properties: Attributes): SpanEvents { - return (events ?? []).map((event) => transformEvent(event, properties)); +function transformEvents(events: SpanEvents, properties: Attributes, isDev: boolean): SpanEvents { + return (events ?? []).map((event) => transformEvent(event, properties, isDev)); } -function transformEvent(event: SpanEvent, properties: Attributes): SpanEvent { +function transformEvent(event: SpanEvent, properties: Attributes, isDev: boolean): SpanEvent { if (isExceptionSpanEvent(event)) { return { ...event, properties: { - exception: transformException(event.properties.exception, properties), + exception: transformException(event.properties.exception, properties, isDev), }, }; } @@ -1134,11 +1139,12 @@ function transformEvent(event: SpanEvent, properties: Attributes): SpanEvent { function transformException( exception: ExceptionEventProperties, - properties: Attributes + properties: Attributes, + isDev: boolean ): ExceptionEventProperties { const projectDirAttributeValue = properties[SemanticInternalAttributes.PROJECT_DIR]; - if (typeof projectDirAttributeValue !== "string") { + if (projectDirAttributeValue !== undefined && typeof projectDirAttributeValue !== "string") { return exception; } @@ -1147,6 +1153,7 @@ function transformException( stacktrace: exception.stacktrace ? correctErrorStackTrace(exception.stacktrace, projectDirAttributeValue, { removeFirstLine: true, + isDev, }) : undefined, }; diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index c9a1e155153..aa2481da9ff 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -57,13 +57,13 @@ export function createErrorTaskError(error: TaskRunError): any { export function correctErrorStackTrace( stackTrace: string, projectDir?: string, - options?: { removeFirstLine?: boolean } + options?: { removeFirstLine?: boolean; isDev?: boolean } ) { const [errorLine, ...traceLines] = stackTrace.split("\n"); return [ options?.removeFirstLine ? undefined : errorLine, - ...traceLines.map((line) => correctStackTraceLine(line, projectDir)), + ...traceLines.map((line) => correctStackTraceLine(line, projectDir, options?.isDev)), ] .filter(Boolean) .join("\n"); @@ -75,17 +75,21 @@ const LINES_TO_IGNORE = [ /TaskExecutor/, /EXECUTE_TASK_RUN/, /@trigger.dev\/core/, + /packages\/core\/src\/v3/, /safeJsonProcess/, /__entryPoint.ts/, + /ZodIpc/, + /startActiveSpan/, + /processTicksAndRejections/, ]; -function correctStackTraceLine(line: string, projectDir?: string) { +function correctStackTraceLine(line: string, projectDir?: string, isDev?: boolean) { if (LINES_TO_IGNORE.some((regex) => regex.test(line))) { return; } // Check to see if the path is inside the project directory - if (projectDir && !line.includes(projectDir)) { + if (isDev && projectDir && !line.includes(projectDir)) { return; } From 1919b6f8666f042ada7781058f8dde667c743a2b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 2 May 2024 16:41:12 +0100 Subject: [PATCH 11/57] add exception event to failed run spans --- apps/webapp/app/v3/services/completeAttempt.server.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index ae16564ea5b..ec4bc6e3562 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -11,7 +11,7 @@ import { PrismaClientOrTransaction } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { safeJsonParse } from "~/utils/json"; -import { eventRepository } from "../eventRepository.server"; +import { createExceptionPropertiesFromError, eventRepository } from "../eventRepository.server"; import { marqs } from "~/v3/marqs/index.server"; import { BaseService } from "./baseService.server"; import { CancelAttemptService } from "./cancelAttempt.server"; @@ -248,6 +248,15 @@ export class CompleteAttemptService extends BaseService { attributes: { isError: true, }, + events: [ + { + name: "exception", + time: new Date(), + properties: { + exception: createExceptionPropertiesFromError(completion.error), + }, + }, + ], }); if ( From a86b2e66f938572c89bf780c5a7e9fa7d431b19a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 2 May 2024 17:05:42 +0100 Subject: [PATCH 12/57] simplify dependency resumes --- .../src/workers/prod/backgroundWorker.ts | 19 ++++---- .../cli-v3/src/workers/prod/entry-point.ts | 45 +++++++------------ .../cli-v3/src/workers/prod/worker-facade.ts | 6 +-- .../core/src/v3/runtime/prodRuntimeManager.ts | 6 +-- packages/core/src/v3/schemas/messages.ts | 16 ++++--- 5 files changed, 42 insertions(+), 50 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 7766b980146..2d367a6d77d 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -211,11 +211,8 @@ export class ProdBackgroundWorker { // We need to notify all the task run processes that a task run has completed, // in case they are waiting for it through triggerAndWait - async taskRunCompletedNotification( - completion: TaskRunExecutionResult, - execution: TaskRunExecution - ) { - this._taskRunProcess?.taskRunCompletedNotification(completion, execution); + async taskRunCompletedNotification(completion: TaskRunExecutionResult) { + this._taskRunProcess?.taskRunCompletedNotification(completion); } async waitCompletedNotification() { @@ -233,7 +230,8 @@ export class ProdBackgroundWorker { if (!this._taskRunProcess) { const taskRunProcess = new TaskRunProcess( - payload.execution, + payload.execution.run.id, + payload.execution.run.isTest, this.path, { ...this.params.env, @@ -456,7 +454,8 @@ class TaskRunProcess { public onCancelCheckpoint = Evt.create<{ version?: "v1" | "v2"; reason?: WaitReason }>(); constructor( - private execution: ProdTaskRunExecution, + private runId: string, + private isTest: boolean, private path: string, private env: NodeJS.ProcessEnv, private metadata: BackgroundWorkerProperties, @@ -468,7 +467,7 @@ class TaskRunProcess { this._child = fork(this.path, { stdio: [/*stdin*/ "ignore", /*stdout*/ "pipe", /*stderr*/ "pipe", "ipc"], env: { - ...(this.execution.run.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), + ...(this.isTest ? { TRIGGER_LOG_LEVEL: "debug" } : {}), ...this.env, OTEL_RESOURCE_ATTRIBUTES: JSON.stringify({ [SemanticInternalAttributes.PROJECT_DIR]: this.worker.projectConfig.projectDir, @@ -631,15 +630,15 @@ class TaskRunProcess { return result; } - taskRunCompletedNotification(completion: TaskRunExecutionResult, execution: TaskRunExecution) { + taskRunCompletedNotification(completion: TaskRunExecutionResult) { if (!completion.ok && typeof completion.retry !== "undefined") { return; } if (this._child?.connected && !this._isBeingKilled && !this._child.killed) { this._ipc?.send("TASK_RUN_COMPLETED_NOTIFICATION", { + version: "v2", completion, - execution, }); } } diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index f6fe5a0f3ad..a3bd67aab9e 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -356,28 +356,14 @@ class ProdWorker { serverMessages: CoordinatorToProdWorkerMessages, extraHeaders, handlers: { - RESUME_AFTER_DEPENDENCY: async (message) => { + RESUME_AFTER_DEPENDENCY: async ({ completions }) => { if (!this.paused) { - logger.error("worker not paused", { - completions: message.completions, - executions: message.executions, - }); + logger.error("Failed to resume after dependency: Worker not paused"); return; } - if (message.completions.length !== message.executions.length) { - logger.error("did not receive the same number of completions and executions", { - completions: message.completions, - executions: message.executions, - }); - return; - } - - if (message.completions.length === 0 || message.executions.length === 0) { - logger.error("no completions or executions", { - completions: message.completions, - executions: message.executions, - }); + if (completions.length === 0) { + logger.error("Failed to resume after dependency: No completions"); return; } @@ -385,17 +371,19 @@ class ProdWorker { this.nextResumeAfter !== "WAIT_FOR_TASK" && this.nextResumeAfter !== "WAIT_FOR_BATCH" ) { - logger.error("not waiting to resume after dependency", { + logger.error("Failed to resume after dependency: Invalid next resume", { nextResumeAfter: this.nextResumeAfter, }); return; } - if (this.nextResumeAfter === "WAIT_FOR_TASK" && message.completions.length > 1) { - logger.error("waiting for single task but got multiple completions", { - completions: message.completions, - executions: message.executions, - }); + if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) { + logger.error( + "Failed to resume after dependency: Waiting for single task but got multiple completions", + { + completions: completions, + } + ); return; } @@ -403,13 +391,12 @@ class ProdWorker { this.nextResumeAfter = undefined; this.waitForPostStart = false; - for (let i = 0; i < message.completions.length; i++) { - const completion = message.completions[i]; - const execution = message.executions[i]; + for (let i = 0; i < completions.length; i++) { + const completion = completions[i]; - if (!completion || !execution) continue; + if (!completion) continue; - this.#backgroundWorker.taskRunCompletedNotification(completion, execution); + this.#backgroundWorker.taskRunCompletedNotification(completion); } }, RESUME_AFTER_DURATION: async (message) => { diff --git a/packages/cli-v3/src/workers/prod/worker-facade.ts b/packages/cli-v3/src/workers/prod/worker-facade.ts index bba267b8ffd..bec24a8a772 100644 --- a/packages/cli-v3/src/workers/prod/worker-facade.ts +++ b/packages/cli-v3/src/workers/prod/worker-facade.ts @@ -170,8 +170,8 @@ const zodIpc = new ZodIpcConnection({ _isRunning = false; } }, - TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion, execution }) => { - prodRuntimeManager.resumeTask(completion, execution); + TASK_RUN_COMPLETED_NOTIFICATION: async ({ completion }) => { + prodRuntimeManager.resumeTask(completion); }, WAIT_COMPLETED_NOTIFICATION: async () => { prodRuntimeManager.resumeAfterDuration(); @@ -228,7 +228,7 @@ zodIpc.send("TASKS_READY", { tasks: TASK_METADATA }).catch((err) => { process.title = "trigger-prod-worker"; -async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 5) { +async function asyncHeartbeat(initialDelayInSeconds: number = 30, intervalInSeconds: number = 20) { async function _doHeartbeat() { while (true) { if (_isRunning && _execution) { diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts index 02a3c8c787e..953f610592c 100644 --- a/packages/core/src/v3/runtime/prodRuntimeManager.ts +++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts @@ -163,8 +163,8 @@ export class ProdRuntimeManager implements RuntimeManager { }; } - resumeTask(completion: TaskRunExecutionResult, execution: TaskRunExecution): void { - const wait = this._taskWaits.get(execution.run.id); + resumeTask(completion: TaskRunExecutionResult): void { + const wait = this._taskWaits.get(completion.id); if (!wait) { return; @@ -172,7 +172,7 @@ export class ProdRuntimeManager implements RuntimeManager { wait.resolve(completion); - this._taskWaits.delete(execution.run.id); + this._taskWaits.delete(completion.id); } private get waitThresholdInMs(): number { diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index 2c31e047f89..d8124546cbb 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -280,11 +280,17 @@ export const ProdWorkerToChildMessages = { }), }, TASK_RUN_COMPLETED_NOTIFICATION: { - message: z.object({ - version: z.literal("v1").default("v1"), - completion: TaskRunExecutionResult, - execution: TaskRunExecution, - }), + message: z.discriminatedUnion("version", [ + z.object({ + version: z.literal("v1"), + completion: TaskRunExecutionResult, + execution: TaskRunExecution, + }), + z.object({ + version: z.literal("v2"), + completion: TaskRunExecutionResult, + }), + ]), }, CLEANUP: { message: z.object({ From dcc97455973bdc3daba83ceafa4b886e38a77238 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 2 May 2024 17:13:27 +0100 Subject: [PATCH 13/57] fix typecheck --- .../webapp/app/v3/services/createTaskRunAttempt.server.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts index 90ad4927352..91dc5ccad39 100644 --- a/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts +++ b/apps/webapp/app/v3/services/createTaskRunAttempt.server.ts @@ -16,14 +16,10 @@ export class CreateTaskRunAttemptService extends BaseService { run: TaskRun; attempt: TaskRunAttempt; }> { - let environment: AuthenticatedEnvironment | undefined = env; + const environment = env ?? (await getAuthenticatedEnvironmentFromRun(runId, this._prisma)); if (!environment) { - environment = await getAuthenticatedEnvironmentFromRun(runId, this._prisma); - - if (!environment) { - throw new ServiceValidationError("Environment not found", 404); - } + throw new ServiceValidationError("Environment not found", 404); } const isFriendlyId = runId.startsWith("run_"); From 90153c31f6b338af008c51b718e86c58527653f4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 2 May 2024 17:32:42 +0100 Subject: [PATCH 14/57] fix merge --- apps/webapp/app/v3/marqs/index.server.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts index 45f5eadfcbf..c0eb62f98e7 100644 --- a/apps/webapp/app/v3/marqs/index.server.ts +++ b/apps/webapp/app/v3/marqs/index.server.ts @@ -290,7 +290,7 @@ export class MarQS { const { range, selectionId } = await this.queuePriorityStrategy.nextCandidateSelection( parentQueue ); - const queues = await this.#zrangeWithScores(parentQueue, range[0], range[1]); + const queues = await this.#getChildQueuesWithScores(parentQueue, range); const queuesWithScores = await this.#calculateQueueScores(queues, (queue) => this.#calculateMessageQueueCapacities(queue) From 0552a8eef59fad51b6718b2439fc6ffd42cba547 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 3 May 2024 12:02:21 +0100 Subject: [PATCH 15/57] fresh process for all attempts --- .../src/workers/prod/backgroundWorker.ts | 77 +++++++- .../cli-v3/src/workers/prod/entry-point.ts | 172 +++++++++--------- packages/core/src/v3/schemas/common.ts | 2 + 3 files changed, 162 insertions(+), 89 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 2d367a6d77d..e5fc35b1a60 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -47,6 +47,14 @@ class CancelledProcessError extends Error { } } +class SigKillTimeoutProcessError extends Error { + constructor() { + super("Process kill timeout"); + + this.name = "SigKillTimeoutProcessError"; + } +} + type BackgroundWorkerParams = { env: Record; projectConfig: Config; @@ -118,6 +126,25 @@ export class ProdBackgroundWorker { await this._taskRunProcess?.cleanup(true); } + async killTaskRunProcess(flush = true, signal: number | NodeJS.Signals = "SIGKILL") { + if (this._closed || !this._taskRunProcess) { + return; + } + + if (flush) { + await this.flushTelemetry(); + } + + const onExit = this._taskRunProcess.onExit.waitFor(5_000); + + this._taskRunProcess.kill(signal); + + // Wait until the process has been killed + await onExit; + + this._closed = true; + } + async flushTelemetry() { await this._taskRunProcess?.cleanup(false); } @@ -228,6 +255,26 @@ export class ProdBackgroundWorker { payload.execution.worker.version ); + this._closed = false; + + // If the child process is currently being killed, we should wait for it to be dead before creating a fresh one (with a sensible timeout) + if (this._taskRunProcess?.isBeingKilled) { + try { + await this._taskRunProcess.onExit.waitFor(5_000); + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + try { + const forcedKill = this._taskRunProcess.onExit.waitFor(5_000); + this._taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + } + if (!this._taskRunProcess) { const taskRunProcess = new TaskRunProcess( payload.execution.run.id, @@ -353,6 +400,18 @@ export class ProdBackgroundWorker { }; } + if (e instanceof SigKillTimeoutProcessError) { + return { + id: payload.execution.attempt.id, + ok: false, + retry: undefined, + error: { + type: "INTERNAL_ERROR", + code: TaskRunErrorCodes.TASK_PROCESS_SIGKILL_TIMEOUT, + }, + }; + } + return { id: payload.execution.attempt.id, ok: false, @@ -435,7 +494,7 @@ class TaskRunProcess { */ public onTaskHeartbeat: Evt = new Evt(); public onTaskRunHeartbeat: Evt = new Evt(); - public onExit: Evt = new Evt(); + public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null }> = new Evt(); public onWaitForBatch: Evt< InferSocketMessageSchema @@ -649,7 +708,7 @@ class TaskRunProcess { } } - async #handleExit(code: number) { + async #handleExit(code: number | null, signal: NodeJS.Signals | null) { // Go through all the attempts currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { if (status === "PENDING") { @@ -668,12 +727,12 @@ class TaskRunProcess { } else if (this._isBeingKilled) { rejecter(new CleanupProcessError()); } else { - rejecter(new UnexpectedExitError(code)); + rejecter(new UnexpectedExitError(code ?? -1)); } } } - this.onExit.post(code); + this.onExit.post({ code, signal }); } #handleLog(data: Buffer) { @@ -706,9 +765,11 @@ class TaskRunProcess { ); } - #kill() { - if (this._child && !this._child.killed) { - this._child?.kill(); - } + kill(signal?: number | NodeJS.Signals) { + this._child?.kill(signal); + } + + get isBeingKilled() { + return this._isBeingKilled || this._child?.killed; } } diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index a39ebcc5832..629a8e3aee6 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -61,8 +61,79 @@ class ProdWorker { process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM")); this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST); + this.#backgroundWorker = this.#createBackgroundWorker(); - this.#backgroundWorker = new ProdBackgroundWorker("worker.js", { + this.#httpPort = port; + this.#httpServer = this.#createHttpServer(); + } + + async #handleSignal(signal: NodeJS.Signals) { + logger.log("Received signal", { signal }); + + if (signal === "SIGTERM") { + if (this.executing) { + const terminationGracePeriodSeconds = 60 * 60; + + logger.log("Waiting for attempt to complete before exiting", { + terminationGracePeriodSeconds, + }); + + // Wait for termination grace period minus 5s to give cleanup a chance to complete + await setTimeout(terminationGracePeriodSeconds * 1000 - 5000); + + logger.log("Termination timeout reached, exiting gracefully."); + } else { + logger.log("Not executing, exiting immediately."); + } + + await this.#exitGracefully(); + } + + logger.log("Unhandled signal", { signal }); + } + + async #exitGracefully() { + await this.#backgroundWorker.close(); + process.exit(0); + } + + async #reconnect(isPostStart = false, reconnectImmediately = false) { + if (isPostStart) { + this.waitForPostStart = false; + } + + this.#coordinatorSocket.close(); + + if (!reconnectImmediately) { + await setTimeout(1000); + } + + let coordinatorHost = COORDINATOR_HOST; + + try { + if (this.runningInKubernetes) { + coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace( + "\n", + "" + ); + + logger.log("reconnecting", { + coordinatorHost: { + fromEnv: COORDINATOR_HOST, + fromVolume: coordinatorHost, + current: this.#coordinatorSocket.socket.io.opts.hostname, + }, + }); + } + } catch (error) { + logger.error("taskinfo read error during reconnect", { error }); + } finally { + this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost); + } + } + + #createBackgroundWorker() { + const backgroundWorker = new ProdBackgroundWorker("worker.js", { projectConfig: __PROJECT_CONFIG__, env: { ...gatherProcessEnv(), @@ -74,23 +145,23 @@ class ProdWorker { contentHash: this.contentHash, }); - this.#backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => { + backgroundWorker.onTaskHeartbeat.attach((attemptFriendlyId) => { // TODO: Switch to .send() once coordinator uses zod handler for all messages this.#coordinatorSocket.socket.emit("TASK_HEARTBEAT", { version: "v1", attemptFriendlyId }); }); - this.#backgroundWorker.onTaskRunHeartbeat.attach((runId) => { + backgroundWorker.onTaskRunHeartbeat.attach((runId) => { this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId }); }); - this.#backgroundWorker.onReadyForCheckpoint.attach(async (message) => { + backgroundWorker.onReadyForCheckpoint.attach(async (message) => { // Flush before checkpointing so we don't flush the same spans again after restore - await this.#backgroundWorker.flushTelemetry(); + await backgroundWorker.flushTelemetry(); this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); }); // Currently, this is only used for duration waits. Might need adjusting for other use cases. - this.#backgroundWorker.onCancelCheckpoint.attach(async (message) => { + backgroundWorker.onCancelCheckpoint.attach(async (message) => { logger.log("onCancelCheckpoint", { message }); const { checkpointCanceled } = await this.#coordinatorSocket.socket.emitWithAck( @@ -110,10 +181,10 @@ class ProdWorker { } } - this.#backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled }); + backgroundWorker.checkpointCanceledNotification.post({ checkpointCanceled }); }); - this.#backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => { + backgroundWorker.onCreateTaskRunAttempt.attach(async (message) => { logger.log("onCreateTaskRunAttempt()", { message }); const createAttempt = await this.#coordinatorSocket.socket.emitWithAck( @@ -125,20 +196,20 @@ class ProdWorker { ); if (!createAttempt.success) { - this.#backgroundWorker.attemptCreatedNotification.post({ + backgroundWorker.attemptCreatedNotification.post({ success: false, reason: createAttempt.reason, }); return; } - this.#backgroundWorker.attemptCreatedNotification.post({ + backgroundWorker.attemptCreatedNotification.post({ success: true, execution: createAttempt.executionPayload.execution, }); }); - this.#backgroundWorker.attemptCreatedNotification.attach((message) => { + backgroundWorker.attemptCreatedNotification.attach((message) => { if (!message.success) { return; } @@ -147,7 +218,7 @@ class ProdWorker { this.attemptFriendlyId = message.execution.attempt.id; }); - this.#backgroundWorker.onWaitForDuration.attach(async (message) => { + backgroundWorker.onWaitForDuration.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); return; @@ -164,7 +235,7 @@ class ProdWorker { this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore); }); - this.#backgroundWorker.onWaitForTask.attach(async (message) => { + backgroundWorker.onWaitForTask.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); return; @@ -181,7 +252,7 @@ class ProdWorker { this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore); }); - this.#backgroundWorker.onWaitForBatch.attach(async (message) => { + backgroundWorker.onWaitForBatch.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); return; @@ -198,73 +269,7 @@ class ProdWorker { this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore); }); - this.#httpPort = port; - this.#httpServer = this.#createHttpServer(); - } - - async #handleSignal(signal: NodeJS.Signals) { - logger.log("Received signal", { signal }); - - if (signal === "SIGTERM") { - if (this.executing) { - const terminationGracePeriodSeconds = 60 * 60; - - logger.log("Waiting for attempt to complete before exiting", { - terminationGracePeriodSeconds, - }); - - // Wait for termination grace period minus 5s to give cleanup a chance to complete - await setTimeout(terminationGracePeriodSeconds * 1000 - 5000); - - logger.log("Termination timeout reached, exiting gracefully."); - } else { - logger.log("Not executing, exiting immediately."); - } - - await this.#exitGracefully(); - } - - logger.log("Unhandled signal", { signal }); - } - - async #exitGracefully() { - await this.#backgroundWorker.close(); - process.exit(0); - } - - async #reconnect(isPostStart = false, reconnectImmediately = false) { - if (isPostStart) { - this.waitForPostStart = false; - } - - this.#coordinatorSocket.close(); - - if (!reconnectImmediately) { - await setTimeout(1000); - } - - let coordinatorHost = COORDINATOR_HOST; - - try { - if (this.runningInKubernetes) { - coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace( - "\n", - "" - ); - - logger.log("reconnecting", { - coordinatorHost: { - fromEnv: COORDINATOR_HOST, - fromVolume: coordinatorHost, - current: this.#coordinatorSocket.socket.io.opts.hostname, - }, - }); - } - } catch (error) { - logger.error("taskinfo read error during reconnect", { error }); - } finally { - this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost); - } + return backgroundWorker; } async #prepareForWait(reason: WaitReason, willCheckpointAndRestore: boolean) { @@ -295,11 +300,15 @@ class ProdWorker { } await this.#exitGracefully(); + return; } this.executing = false; this.attemptFriendlyId = undefined; + // Every retry gets a fresh process + await this.#backgroundWorker.killTaskRunProcess(); + if (willCheckpointAndRestore) { this.waitForPostStart = true; this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); @@ -686,6 +695,7 @@ class ProdWorker { paused: this.paused, completed: this.completed.size, nextResumeAfter: this.nextResumeAfter, + waitForPostStart: this.waitForPostStart, }); } diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts index 4f3052ea196..2362748d705 100644 --- a/packages/core/src/v3/schemas/common.ts +++ b/packages/core/src/v3/schemas/common.ts @@ -31,6 +31,7 @@ export const TaskRunErrorCodes = { TASK_EXECUTION_FAILED: "TASK_EXECUTION_FAILED", TASK_EXECUTION_ABORTED: "TASK_EXECUTION_ABORTED", TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", + TASK_PROCESS_SIGKILL_TIMEOUT: "TASK_PROCESS_SIGKILL_TIMEOUT", TASK_RUN_CANCELLED: "TASK_RUN_CANCELLED", TASK_OUTPUT_ERROR: "TASK_OUTPUT_ERROR", HANDLE_ERROR_ERROR: "HANDLE_ERROR_ERROR", @@ -47,6 +48,7 @@ export const TaskRunInternalError = z.object({ "TASK_EXECUTION_FAILED", "TASK_EXECUTION_ABORTED", "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", + "TASK_PROCESS_SIGKILL_TIMEOUT", "TASK_RUN_CANCELLED", "TASK_OUTPUT_ERROR", "HANDLE_ERROR_ERROR", From 1286147ea85770d7c7b7d49d0cda039d0b54e6b4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 3 May 2024 12:22:34 +0100 Subject: [PATCH 16/57] always try sigterm first --- .../src/workers/prod/backgroundWorker.ts | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index e5fc35b1a60..91c898e9879 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -126,7 +126,7 @@ export class ProdBackgroundWorker { await this._taskRunProcess?.cleanup(true); } - async killTaskRunProcess(flush = true, signal: number | NodeJS.Signals = "SIGKILL") { + async killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") { if (this._closed || !this._taskRunProcess) { return; } @@ -135,12 +135,16 @@ export class ProdBackgroundWorker { await this.flushTelemetry(); } - const onExit = this._taskRunProcess.onExit.waitFor(5_000); - - this._taskRunProcess.kill(signal); - - // Wait until the process has been killed - await onExit; + try { + const initialExit = this._taskRunProcess.onExit.waitFor(5_000); + this._taskRunProcess.kill(initialSignal); + await initialExit; + } catch (error) { + // Try again with SIGKILL + const forcedExit = this._taskRunProcess.onExit.waitFor(5_000); + this._taskRunProcess.kill("SIGKILL"); + await forcedExit; + } this._closed = true; } From 30b6c2c35edb2b5a78ca88bec212bf502bd08e20 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 3 May 2024 14:19:08 +0100 Subject: [PATCH 17/57] stop heartbeat timeout on non-inplace replace message --- apps/webapp/app/v3/marqs/index.server.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts index c0eb62f98e7..f0b48393878 100644 --- a/apps/webapp/app/v3/marqs/index.server.ts +++ b/apps/webapp/app/v3/marqs/index.server.ts @@ -458,6 +458,8 @@ export class MarQS { return; } + workerQueue.dequeue(`requeueTaskRun:${messageId}`); + await this.#callAcknowledgeMessage({ parentQueue: oldMessage.parentQueue, messageKey: this.keys.messageKey(messageId), From 4ace3a4bb7870b1d4e590ac1ad742841d416b7b6 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 3 May 2024 14:29:07 +0100 Subject: [PATCH 18/57] add missing ack on checkpoint creation service failure --- apps/webapp/app/v3/services/completeAttempt.server.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index ec4bc6e3562..b2d5dfe9478 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -56,6 +56,8 @@ export class CompleteAttemptService extends BaseService { }, }); + // No attempt, so there's no message to ACK + return "COMPLETED"; } @@ -142,6 +144,8 @@ export class CompleteAttemptService extends BaseService { env ); + // The cancel service handles ACK + return "COMPLETED"; } @@ -226,6 +230,8 @@ export class CompleteAttemptService extends BaseService { }, }); + await marqs?.acknowledgeMessage(taskRunAttempt.taskRunId); + return "COMPLETED"; } From 78a1e57e1b45ed5694c0f32040f23ceb6e8bebdd Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 3 May 2024 15:02:14 +0100 Subject: [PATCH 19/57] bypass dequeue for retries with running worker --- .../app/v3/services/completeAttempt.server.ts | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index b2d5dfe9478..28a302c775c 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -19,6 +19,7 @@ import { ResumeTaskRunDependenciesService } from "./resumeTaskRunDependencies.se import { MAX_TASK_RUN_ATTEMPTS } from "~/consts"; import { CreateCheckpointService } from "./createCheckpoint.server"; import { TaskRun } from "@trigger.dev/database"; +import { socketIo } from "../handleSocketIo.server"; type FoundAttempt = Awaited>; @@ -202,7 +203,7 @@ export class CompleteAttemptService extends BaseService { } if (!checkpoint) { - await this.#enqueueRetry(taskRunAttempt.taskRun, completion.retry.timestamp); + await this.#retryAttempt(taskRunAttempt.taskRun, completion.retry.timestamp); return "RETRIED"; } @@ -235,7 +236,7 @@ export class CompleteAttemptService extends BaseService { return "COMPLETED"; } - await this.#enqueueRetry( + await this.#retryAttempt( taskRunAttempt.taskRun, completion.retry.timestamp, checkpointCreateResult.event.id @@ -320,17 +321,25 @@ export class CompleteAttemptService extends BaseService { } } - async #enqueueRetry(run: TaskRun, retryTimestamp: number, checkpointEventId?: string) { - // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt - return await marqs?.replaceMessage( - run.id, - { - type: "EXECUTE", - taskIdentifier: run.taskIdentifier, - checkpointEventId: checkpointEventId, - }, - retryTimestamp - ); + async #retryAttempt(run: TaskRun, retryTimestamp: number, checkpointEventId?: string) { + if (checkpointEventId) { + // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt + return await marqs?.replaceMessage( + run.id, + { + type: "EXECUTE", + taskIdentifier: run.taskIdentifier, + checkpointEventId: checkpointEventId, + }, + retryTimestamp + ); + } else { + // There's no checkpoint so the worker is still running and waiting for this retry message + return socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { + version: "v1", + runId: run.id, + }); + } } #generateMetadataAttributesForNextAttempt(execution: TaskRunExecution) { From 1f119441e7804c4d27cbf6ba078263d1b1d596e4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 3 May 2024 17:18:33 +0100 Subject: [PATCH 20/57] respect retry delays --- apps/webapp/app/services/worker.server.ts | 13 +++++++ .../app/v3/services/completeAttempt.server.ts | 6 +-- .../app/v3/services/retryAttempt.server.ts | 39 +++++++++++++++++++ 3 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 apps/webapp/app/v3/services/retryAttempt.server.ts diff --git a/apps/webapp/app/services/worker.server.ts b/apps/webapp/app/services/worker.server.ts index daade7460b3..fb09523b68f 100644 --- a/apps/webapp/app/services/worker.server.ts +++ b/apps/webapp/app/services/worker.server.ts @@ -38,6 +38,7 @@ import { eventRepository } from "~/v3/eventRepository.server"; import { ExecuteTasksWaitingForDeployService } from "~/v3/services/executeTasksWaitingForDeploy"; import { TriggerScheduledTaskService } from "~/v3/services/triggerScheduledTask.server"; import { RequeueTaskRunService } from "~/v3/requeueTaskRun.server"; +import { RetryAttemptService } from "~/v3/services/retryAttempt.server"; const workerCatalog = { indexEndpoint: z.object({ @@ -140,6 +141,9 @@ const workerCatalog = { "v3.requeueTaskRun": z.object({ runId: z.string(), }), + "v3.retryAttempt": z.object({ + runId: z.string(), + }), }; const executionWorkerCatalog = { @@ -546,6 +550,15 @@ function getWorkerQueue() { await service.call(payload.runId); }, }, + "v3.retryAttempt": { + priority: 0, + maxAttempts: 3, + handler: async (payload, job) => { + const service = new RetryAttemptService(); + + return await service.call(payload.runId); + }, + }, }, }); } diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index 28a302c775c..bcbfcb7cf56 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -20,6 +20,7 @@ import { MAX_TASK_RUN_ATTEMPTS } from "~/consts"; import { CreateCheckpointService } from "./createCheckpoint.server"; import { TaskRun } from "@trigger.dev/database"; import { socketIo } from "../handleSocketIo.server"; +import { RetryAttemptService } from "./retryAttempt.server"; type FoundAttempt = Awaited>; @@ -335,10 +336,7 @@ export class CompleteAttemptService extends BaseService { ); } else { // There's no checkpoint so the worker is still running and waiting for this retry message - return socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { - version: "v1", - runId: run.id, - }); + RetryAttemptService.enqueue(run.id, this._prisma, new Date(retryTimestamp)); } } diff --git a/apps/webapp/app/v3/services/retryAttempt.server.ts b/apps/webapp/app/v3/services/retryAttempt.server.ts new file mode 100644 index 00000000000..86844b53496 --- /dev/null +++ b/apps/webapp/app/v3/services/retryAttempt.server.ts @@ -0,0 +1,39 @@ +import { BaseService } from "./baseService.server"; +import { logger } from "~/services/logger.server"; +import { socketIo } from "../handleSocketIo.server"; +import { PrismaClientOrTransaction } from "~/db.server"; +import { workerQueue } from "~/services/worker.server"; + +export class RetryAttemptService extends BaseService { + public async call(runId: string) { + const taskRun = await this._prisma.taskRun.findFirst({ + where: { + id: runId, + }, + }); + + if (!taskRun) { + logger.error("Task run not found", { runId }); + return; + } + + socketIo.coordinatorNamespace.emit("READY_FOR_RETRY", { + version: "v1", + runId, + }); + } + + static async enqueue(runId: string, tx: PrismaClientOrTransaction, runAt?: Date) { + return await workerQueue.enqueue( + "v3.retryAttempt", + { + runId, + }, + { + tx, + runAt, + jobKey: `retryAttempt:${runId}`, + } + ); + } +} From ba72219455a02d6f5a3837f904bd65a886492da9 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 7 May 2024 14:52:19 +0100 Subject: [PATCH 21/57] crash runs with invalid run status for execution --- apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts index 12004c6d896..718f91e3c34 100644 --- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts @@ -291,7 +291,7 @@ export class SharedQueueConsumer { (!retryingFromCheckpoint && !EXECUTABLE_RUN_STATUSES.withoutCheckpoint.includes(existingTaskRun.status)) ) { - logger.debug("Task run has invalid status for execution", { + logger.error("Task run has invalid status for execution", { queueMessage: message.data, messageId: message.messageId, taskRun: existingTaskRun.id, @@ -299,6 +299,12 @@ export class SharedQueueConsumer { retryingFromCheckpoint, }); + const service = new CrashTaskRunService(); + await service.call(existingTaskRun.id, { + crashAttempts: true, + reason: `Invalid run status for execution: ${existingTaskRun.status}`, + }); + await this.#ackAndDoMoreWork(message.messageId); return; } From 5dfaf99f71cd8cad7ccb2e970fb5af5cb6c40d9f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 7 May 2024 22:49:24 +0100 Subject: [PATCH 22/57] remove debug logs --- packages/core/src/v3/runtime/prodRuntimeManager.ts | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts index 953f610592c..ed5521fd124 100644 --- a/packages/core/src/v3/runtime/prodRuntimeManager.ts +++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts @@ -94,19 +94,9 @@ export class ProdRuntimeManager implements RuntimeManager { return; } - process.stdout.write("pre"); - process.stdout.write(JSON.stringify(clock.preciseNow())); - - console.log("pre", clock.preciseNow()); - // Resets the clock to the current time clock.reset(); - console.log("post", clock.preciseNow()); - - process.stdout.write("post"); - process.stdout.write(JSON.stringify(clock.preciseNow())); - this._waitForDuration.resolve("external"); this._waitForDuration = undefined; } From 93dca36f0c86e39ffdb337cb0958bc8cd455c20b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 7 May 2024 22:51:37 +0100 Subject: [PATCH 23/57] fix nack message --- apps/webapp/app/v3/marqs/index.server.ts | 45 +++++++------------ apps/webapp/app/v3/requeueTaskRun.server.ts | 14 ++++++ .../app/v3/services/completeAttempt.server.ts | 6 ++- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/apps/webapp/app/v3/marqs/index.server.ts b/apps/webapp/app/v3/marqs/index.server.ts index f0b48393878..0816bab3b01 100644 --- a/apps/webapp/app/v3/marqs/index.server.ts +++ b/apps/webapp/app/v3/marqs/index.server.ts @@ -21,7 +21,7 @@ import { QueueCapacities, QueueRange, } from "./types"; -import { workerQueue } from "~/services/worker.server"; +import { RequeueTaskRunService } from "../requeueTaskRun.server"; const tracer = trace.getTracer("marqs"); @@ -260,15 +260,9 @@ export class MarQS { }); } - await workerQueue.enqueue( - "v3.requeueTaskRun", - { - runId: messageData.messageId, - }, - { - runAt: new Date(Date.now() + this.visibilityTimeoutInMs), - jobKey: `requeueTaskRun:${messageData.messageId}`, - } + await RequeueTaskRunService.enqueue( + messageData.messageId, + new Date(Date.now() + this.visibilityTimeoutInMs) ); return message; @@ -391,7 +385,7 @@ export class MarQS { [SemanticAttributes.PARENT_QUEUE]: message.parentQueue, }); - workerQueue.dequeue(`requeueTaskRun:${messageId}`); + await RequeueTaskRunService.dequeue(messageId); await this.#callAcknowledgeMessage({ parentQueue: message.parentQueue, @@ -458,7 +452,7 @@ export class MarQS { return; } - workerQueue.dequeue(`requeueTaskRun:${messageId}`); + await RequeueTaskRunService.dequeue(messageId); await this.#callAcknowledgeMessage({ parentQueue: oldMessage.parentQueue, @@ -526,6 +520,8 @@ export class MarQS { [SemanticAttributes.PARENT_QUEUE]: message.parentQueue, }); + await RequeueTaskRunService.dequeue(messageId); + await this.#callNackMessage({ messageKey: this.keys.messageKey(messageId), messageQueue: message.queue, @@ -559,16 +555,7 @@ export class MarQS { maxVisibilityTimeout: Date.now() + this.visibilityTimeoutInMs, }); - await workerQueue.enqueue( - "v3.requeueTaskRun", - { - runId: messageId, - }, - { - runAt: new Date(Date.now() + seconds * 1000), - jobKey: `requeueTaskRun:${messageId}`, - } - ); + await RequeueTaskRunService.enqueue(messageId, new Date(Date.now() + seconds * 1000)); } get visibilityTimeoutInMs() { @@ -1351,20 +1338,18 @@ local messageId = ARGV[2] local currentTime = tonumber(ARGV[3]) local messageScore = tonumber(ARGV[4]) --- Check to see if the message is still in the visibilityQueue -local messageVisibility = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0 - -if messageVisibility == 0 then - return -end - -- Update the concurrency keys redis.call('SREM', concurrencyKey, messageId) redis.call('SREM', envConcurrencyKey, messageId) redis.call('SREM', orgConcurrencyKey, messageId) +-- Check to see if the message is still in the visibilityQueue +local messageVisibility = tonumber(redis.call('ZSCORE', visibilityQueue, messageId)) or 0 + +if messageVisibility > 0 then -- Remove the message from the timeout queue (deprecated, will eventually remove this) -redis.call('ZREM', visibilityQueue, messageId) + redis.call('ZREM', visibilityQueue, messageId) +end -- Enqueue the message into the queue redis.call('ZADD', childQueueKey, messageScore, messageId) diff --git a/apps/webapp/app/v3/requeueTaskRun.server.ts b/apps/webapp/app/v3/requeueTaskRun.server.ts index 3673e16bdb5..e2b904998fd 100644 --- a/apps/webapp/app/v3/requeueTaskRun.server.ts +++ b/apps/webapp/app/v3/requeueTaskRun.server.ts @@ -4,6 +4,8 @@ import { marqs } from "~/v3/marqs/index.server"; import assertNever from "assert-never"; import { FailedTaskRunService } from "./failedTaskRun.server"; import { BaseService } from "./services/baseService.server"; +import { PrismaClientOrTransaction } from "~/db.server"; +import { workerQueue } from "~/services/worker.server"; export class RequeueTaskRunService extends BaseService { public async call(runId: string) { @@ -78,4 +80,16 @@ export class RequeueTaskRunService extends BaseService { } } } + + public static async enqueue(runId: string, runAt?: Date, tx?: PrismaClientOrTransaction) { + return await workerQueue.enqueue( + "v3.requeueTaskRun", + { runId }, + { runAt, jobKey: `requeueTaskRun:${runId}` } + ); + } + + public static async dequeue(runId: string, tx?: PrismaClientOrTransaction) { + return await workerQueue.dequeue(`requeueTaskRun:${runId}`, { tx }); + } } diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index bcbfcb7cf56..3952b06aa07 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -19,7 +19,6 @@ import { ResumeTaskRunDependenciesService } from "./resumeTaskRunDependencies.se import { MAX_TASK_RUN_ATTEMPTS } from "~/consts"; import { CreateCheckpointService } from "./createCheckpoint.server"; import { TaskRun } from "@trigger.dev/database"; -import { socketIo } from "../handleSocketIo.server"; import { RetryAttemptService } from "./retryAttempt.server"; type FoundAttempt = Awaited>; @@ -186,7 +185,10 @@ export class CompleteAttemptService extends BaseService { endTime: retryAt, }); - logger.debug("Retrying", { taskRun: taskRunAttempt.taskRun.friendlyId }); + logger.debug("Retrying", { + taskRun: taskRunAttempt.taskRun.friendlyId, + retry: completion.retry, + }); await this._prisma.taskRun.update({ where: { From bf79e6b9d30f23eb4e1c5342f314c74cfd100f70 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 8 May 2024 08:36:19 +0100 Subject: [PATCH 24/57] fix version locking --- .../app/v3/marqs/devQueueConsumer.server.ts | 1 + .../app/v3/marqs/sharedQueueConsumer.server.ts | 16 +++++++++++++++- .../webapp/app/v3/services/triggerTask.server.ts | 1 - 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts index f459b00854b..ef6086c0901 100644 --- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts @@ -409,6 +409,7 @@ export class DevQueueConsumer { data: { lockedAt: new Date(), lockedById: backgroundTask.id, + lockedToVersionId: backgroundWorker.id, }, include: { attempts: { diff --git a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts index 718f91e3c34..57ddafb31a5 100644 --- a/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts +++ b/apps/webapp/app/v3/marqs/sharedQueueConsumer.server.ts @@ -260,6 +260,14 @@ export class SharedQueueConsumer { where: { id: message.messageId, }, + include: { + lockedToVersion: { + include: { + deployment: true, + tasks: true, + }, + }, + }, }); if (!existingTaskRun) { @@ -309,7 +317,12 @@ export class SharedQueueConsumer { return; } - const deployment = await findCurrentWorkerDeployment(existingTaskRun.runtimeEnvironmentId); + const deployment = existingTaskRun.lockedToVersion?.deployment + ? { + ...existingTaskRun.lockedToVersion.deployment, + worker: existingTaskRun.lockedToVersion, + } + : await findCurrentWorkerDeployment(existingTaskRun.runtimeEnvironmentId); if (!deployment || !deployment.worker) { logger.error("No matching deployment found for task run", { @@ -384,6 +397,7 @@ export class SharedQueueConsumer { data: { lockedAt: new Date(), lockedById: backgroundTask.id, + lockedToVersionId: deployment.worker.id, }, include: { runtimeEnvironment: true, diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts index 38ec94bddd4..8c535d4fc36 100644 --- a/apps/webapp/app/v3/services/triggerTask.server.ts +++ b/apps/webapp/app/v3/services/triggerTask.server.ts @@ -4,7 +4,6 @@ import { TriggerTaskRequestBody, packetRequiresOffloading, } from "@trigger.dev/core/v3"; -import { nanoid } from "nanoid"; import { createHash } from "node:crypto"; import { $transaction } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; From 6ad28b6818bb83e7ef686d2609becf4d72ccf808 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 8 May 2024 08:38:01 +0100 Subject: [PATCH 25/57] fresh attempt processes in dev and prod --- packages/cli-v3/src/workers/common/errors.ts | 32 +++ .../src/workers/dev/backgroundWorker.ts | 206 +++++++++----- .../src/workers/prod/backgroundWorker.ts | 253 ++++++++++-------- 3 files changed, 325 insertions(+), 166 deletions(-) diff --git a/packages/cli-v3/src/workers/common/errors.ts b/packages/cli-v3/src/workers/common/errors.ts index 4017d3cc65a..4ba1a8ef3bd 100644 --- a/packages/cli-v3/src/workers/common/errors.ts +++ b/packages/cli-v3/src/workers/common/errors.ts @@ -21,3 +21,35 @@ export class TaskMetadataParseError extends Error { this.name = "TaskMetadataParseError"; } } + +export class UnexpectedExitError extends Error { + constructor(public code: number) { + super(`Unexpected exit with code ${code}`); + + this.name = "UnexpectedExitError"; + } +} + +export class CleanupProcessError extends Error { + constructor() { + super("Cancelled"); + + this.name = "CleanupProcessError"; + } +} + +export class CancelledProcessError extends Error { + constructor() { + super("Cancelled"); + + this.name = "CancelledProcessError"; + } +} + +export class SigKillTimeoutProcessError extends Error { + constructor() { + super("Process kill timeout"); + + this.name = "SigKillTimeoutProcessError"; + } +} diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index 2ded486b1c5..b1867355b3c 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -1,5 +1,4 @@ import { - APIError, BackgroundWorkerProperties, BackgroundWorkerServerMessages, CreateBackgroundWorkerResponse, @@ -39,7 +38,14 @@ import { import { safeDeleteFileSync } from "../../utilities/fileSystem.js"; import { installPackages } from "../../utilities/installPackages.js"; import { logger } from "../../utilities/logger.js"; -import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors.js"; +import { + CancelledProcessError, + CleanupProcessError, + SigKillTimeoutProcessError, + TaskMetadataParseError, + UncaughtExceptionError, + UnexpectedExitError, +} from "../common/errors.js"; import { CliApiClient } from "../../apiClient.js"; export type CurrentWorkers = BackgroundWorkerCoordinator["currentWorkers"]; @@ -246,30 +252,6 @@ export class BackgroundWorkerCoordinator { } } -class UnexpectedExitError extends Error { - constructor(public code: number) { - super(`Unexpected exit with code ${code}`); - - this.name = "UnexpectedExitError"; - } -} - -class CleanupProcessError extends Error { - constructor() { - super("Cancelled"); - - this.name = "CleanupProcessError"; - } -} - -class CancelledProcessError extends Error { - constructor() { - super("Cancelled"); - - this.name = "CancelledProcessError"; - } -} - export type BackgroundWorkerParams = { env: Record; dependencies?: Record; @@ -295,6 +277,7 @@ export class BackgroundWorker { public metadata: BackgroundWorkerProperties | undefined; _taskRunProcesses: Map = new Map(); + private _taskRunProcessesBeingKilled: Set = new Set(); private _closed: boolean = false; @@ -422,39 +405,106 @@ export class BackgroundWorker { throw new Error("Worker not registered"); } - if (!this._taskRunProcesses.has(payload.execution.run.id)) { - const taskRunProcess = new TaskRunProcess( - payload.execution.run.id, - payload.execution.run.isTest, - this.path, - { - ...this.params.env, - ...(payload.environment ?? {}), - ...this.#readEnvVars(), - }, - this.metadata, - this.params, - messageId - ); + this._closed = false; - taskRunProcess.onExit.attach(() => { - this._taskRunProcesses.delete(payload.execution.run.id); - }); + if (this._taskRunProcesses.has(payload.execution.run.id)) { + return this._taskRunProcesses.get(payload.execution.run.id) as TaskRunProcess; + } - taskRunProcess.onTaskHeartbeat.attach((id) => { - this.onTaskHeartbeat.post(id); - }); + await this.#killCurrentTaskRunProcessBeforeAttempt(payload.execution.run.id); - taskRunProcess.onTaskRunHeartbeat.attach((id) => { - this.onTaskRunHeartbeat.post(id); - }); + const taskRunProcess = new TaskRunProcess( + payload.execution.run.id, + payload.execution.run.isTest, + this.path, + { + ...this.params.env, + ...(payload.environment ?? {}), + ...this.#readEnvVars(), + }, + this.metadata, + this.params, + messageId + ); + + taskRunProcess.onExit.attach(({ pid }) => { + this._taskRunProcesses.delete(payload.execution.run.id); + if (pid) { + this._taskRunProcessesBeingKilled.delete(pid); + } + }); + + taskRunProcess.onIsBeingKilled.attach((pid) => { + if (pid) { + this._taskRunProcessesBeingKilled.add(pid); + } + }); + + taskRunProcess.onTaskHeartbeat.attach((id) => { + this.onTaskHeartbeat.post(id); + }); + + taskRunProcess.onTaskRunHeartbeat.attach((id) => { + this.onTaskRunHeartbeat.post(id); + }); + + await taskRunProcess.initialize(); + + this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess); + + return taskRunProcess; + } - await taskRunProcess.initialize(); + async #killCurrentTaskRunProcessBeforeAttempt(runId: string) { + const taskRunProcess = this._taskRunProcesses.get(runId); - this._taskRunProcesses.set(payload.execution.run.id, taskRunProcess); + if (!taskRunProcess) { + return; } - return this._taskRunProcesses.get(payload.execution.run.id) as TaskRunProcess; + if (taskRunProcess.isBeingKilled) { + if (this._taskRunProcessesBeingKilled.size > 1) { + // If there's more than one being killed, wait for graceful exit + try { + await taskRunProcess.onExit.waitFor(5_000); + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + try { + const forcedKill = taskRunProcess.onExit.waitFor(5_000); + taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + } else { + // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel + } + } else { + // It's not being killed, so kill it + if (this._taskRunProcessesBeingKilled.size > 0) { + // If there's one being killed already, wait for graceful exit + try { + await taskRunProcess.onExit.waitFor(5_000); + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + try { + const forcedKill = taskRunProcess.onExit.waitFor(5_000); + taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + } else { + // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around. + taskRunProcess.kill("SIGTERM", 5_000).catch(() => {}); + } + } } async cancelRun(taskRunId: string) { @@ -567,8 +617,8 @@ export class BackgroundWorker { const taskRunProcess = await this.#initializeTaskRunProcess(payload, messageId); const result = await taskRunProcess.executeTaskRun(payload); - // Kill the worker if the task was successful or if it's not going to be retried); - await taskRunProcess.cleanup(result.ok || result.retry === undefined); + // Always kill the worker + await taskRunProcess.cleanup(true); if (result.ok) { return result; @@ -669,6 +719,7 @@ class TaskRunProcess { }); private _sender: ZodMessageSender; private _child: ChildProcess | undefined; + private _childPid?: number; private _attemptPromises: Map< string, { resolver: (value: TaskRunExecutionResult) => void; rejecter: (err?: any) => void } @@ -682,7 +733,9 @@ class TaskRunProcess { */ public onTaskHeartbeat: Evt = new Evt(); public onTaskRunHeartbeat: Evt = new Evt(); - public onExit: Evt = new Evt(); + public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null; pid?: number }> = + new Evt(); + public onIsBeingKilled: Evt = new Evt(); constructor( private runId: string, @@ -736,6 +789,7 @@ class TaskRunProcess { ? ["--inspect-brk", "--trace-uncaught", "--no-warnings=ExperimentalWarning"] : ["--trace-uncaught", "--no-warnings=ExperimentalWarning"], }); + this._childPid = this._child?.pid; this._child.on("message", this.#handleMessage.bind(this)); this._child.on("exit", this.#handleExit.bind(this)); @@ -748,6 +802,11 @@ class TaskRunProcess { return; } + if (kill) { + this._isBeingKilled = true; + this.onIsBeingKilled.post(this._child?.pid); + } + logger.debug(`[${this.runId}] cleaning up task run process`, { kill }); await this._sender.send("CLEANUP", { @@ -755,7 +814,7 @@ class TaskRunProcess { kill, }); - this._isBeingKilled = kill; + // FIXME: Something broke READY_TO_DISPOSE. We never receive it, so we always have to kill the process after the timeout below. // Set a timeout to kill the child process if it hasn't been killed within 5 seconds setTimeout(() => { @@ -867,8 +926,8 @@ class TaskRunProcess { } } - async #handleExit(code: number) { - logger.debug(`[${this.runId}] task run process exiting`, { code }); + async #handleExit(code: number | null, signal: NodeJS.Signals | null) { + logger.debug(`[${this.runId}] task run process exiting`, { code, signal }); // Go through all the attempts currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { @@ -888,12 +947,12 @@ class TaskRunProcess { } else if (this._isBeingKilled) { rejecter(new CleanupProcessError()); } else { - rejecter(new UnexpectedExitError(code)); + rejecter(new UnexpectedExitError(code ?? -1)); } } } - this.onExit.post(code); + this.onExit.post({ code, signal, pid: this.pid }); } #handleLog(data: Buffer) { @@ -939,6 +998,33 @@ class TaskRunProcess { this._child?.kill(); } } + + async kill(signal?: number | NodeJS.Signals, timeoutInMs?: number) { + logger.debug(`[${this.runId}] killing task run process`, { + signal, + timeoutInMs, + pid: this.pid, + }); + + this._isBeingKilled = true; + + const killTimeout = this.onExit.waitFor(timeoutInMs); + + this.onIsBeingKilled.post(this._child?.pid); + this._child?.kill(signal); + + if (timeoutInMs) { + await killTimeout; + } + } + + get isBeingKilled() { + return this._isBeingKilled || this._child?.killed; + } + + get pid() { + return this._childPid; + } } function formatErrorLog(error: TaskRunError) { diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 91c898e9879..7536777a202 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -21,39 +21,14 @@ import { ZodIpcConnection } from "@trigger.dev/core/v3/zodIpc"; import type { InferSocketMessageSchema } from "@trigger.dev/core/v3/zodSocket"; import { Evt } from "evt"; import { ChildProcess, fork } from "node:child_process"; -import { TaskMetadataParseError, UncaughtExceptionError } from "../common/errors"; - -class UnexpectedExitError extends Error { - constructor(public code: number) { - super(`Unexpected exit with code ${code}`); - - this.name = "UnexpectedExitError"; - } -} - -class CleanupProcessError extends Error { - constructor() { - super("Cancelled"); - - this.name = "CleanupProcessError"; - } -} - -class CancelledProcessError extends Error { - constructor() { - super("Cancelled"); - - this.name = "CancelledProcessError"; - } -} - -class SigKillTimeoutProcessError extends Error { - constructor() { - super("Process kill timeout"); - - this.name = "SigKillTimeoutProcessError"; - } -} +import { + CancelledProcessError, + CleanupProcessError, + SigKillTimeoutProcessError, + TaskMetadataParseError, + UncaughtExceptionError, + UnexpectedExitError, +} from "../common/errors"; type BackgroundWorkerParams = { env: Record; @@ -104,6 +79,7 @@ export class ProdBackgroundWorker { public tasks: Array = []; _taskRunProcess: TaskRunProcess | undefined; + private _taskRunProcessesBeingKilled: Set = new Set(); private _closed: boolean = false; @@ -135,14 +111,16 @@ export class ProdBackgroundWorker { await this.flushTelemetry(); } + const currentTaskRunProcess = this._taskRunProcess; + try { - const initialExit = this._taskRunProcess.onExit.waitFor(5_000); - this._taskRunProcess.kill(initialSignal); + const initialExit = currentTaskRunProcess.onExit.waitFor(5_000); + currentTaskRunProcess.kill(initialSignal); await initialExit; } catch (error) { // Try again with SIGKILL - const forcedExit = this._taskRunProcess.onExit.waitFor(5_000); - this._taskRunProcess.kill("SIGKILL"); + const forcedExit = currentTaskRunProcess.onExit.waitFor(5_000); + currentTaskRunProcess.kill("SIGKILL"); await forcedExit; } @@ -250,7 +228,7 @@ export class ProdBackgroundWorker { this._taskRunProcess?.waitCompletedNotification(); } - async #initializeTaskRunProcess( + async #getFreshTaskRunProcess( payload: ProdTaskRunExecutionPayload, messageId?: string ): Promise { @@ -261,93 +239,136 @@ export class ProdBackgroundWorker { this._closed = false; - // If the child process is currently being killed, we should wait for it to be dead before creating a fresh one (with a sensible timeout) - if (this._taskRunProcess?.isBeingKilled) { - try { - await this._taskRunProcess.onExit.waitFor(5_000); - } catch (error) { - console.error("TaskRunProcess graceful kill timeout exceeded", error); + await this.#killCurrentTaskRunProcessBeforeAttempt(); - try { - const forcedKill = this._taskRunProcess.onExit.waitFor(5_000); - this._taskRunProcess.kill("SIGKILL"); - await forcedKill; - } catch (error) { - console.error("TaskRunProcess forced kill timeout exceeded", error); - throw new SigKillTimeoutProcessError(); - } + const taskRunProcess = new TaskRunProcess( + payload.execution.run.id, + payload.execution.run.isTest, + this.path, + { + ...this.params.env, + ...(payload.environment ?? {}), + }, + metadata, + this.params, + messageId + ); + + taskRunProcess.onExit.attach(({ pid }) => { + this._taskRunProcess = undefined; + if (pid) { + this._taskRunProcessesBeingKilled.delete(pid); } - } + }); - if (!this._taskRunProcess) { - const taskRunProcess = new TaskRunProcess( - payload.execution.run.id, - payload.execution.run.isTest, - this.path, - { - ...this.params.env, - ...(payload.environment ?? {}), - }, - metadata, - this.params, - messageId - ); + taskRunProcess.onIsBeingKilled.attach((pid) => { + if (pid) { + this._taskRunProcessesBeingKilled.add(pid); + } + }); - taskRunProcess.onExit.attach(() => { - this._taskRunProcess = undefined; - }); + taskRunProcess.onTaskHeartbeat.attach((id) => { + this.onTaskHeartbeat.post(id); + }); - taskRunProcess.onTaskHeartbeat.attach((id) => { - this.onTaskHeartbeat.post(id); - }); + taskRunProcess.onTaskRunHeartbeat.attach((id) => { + this.onTaskRunHeartbeat.post(id); + }); - taskRunProcess.onTaskRunHeartbeat.attach((id) => { - this.onTaskRunHeartbeat.post(id); - }); + taskRunProcess.onWaitForBatch.attach((message) => { + this.onWaitForBatch.post(message); + }); - taskRunProcess.onWaitForBatch.attach((message) => { - this.onWaitForBatch.post(message); - }); + taskRunProcess.onWaitForDuration.attach((message) => { + this.onWaitForDuration.post(message); + }); - taskRunProcess.onWaitForDuration.attach((message) => { - this.onWaitForDuration.post(message); - }); + taskRunProcess.onWaitForTask.attach((message) => { + this.onWaitForTask.post(message); + }); - taskRunProcess.onWaitForTask.attach((message) => { - this.onWaitForTask.post(message); - }); + taskRunProcess.onReadyForCheckpoint.attach((message) => { + this.onReadyForCheckpoint.post(message); + }); - taskRunProcess.onReadyForCheckpoint.attach((message) => { - this.onReadyForCheckpoint.post(message); - }); + taskRunProcess.onCancelCheckpoint.attach((message) => { + this.onCancelCheckpoint.post(message); + }); - taskRunProcess.onCancelCheckpoint.attach((message) => { - this.onCancelCheckpoint.post(message); - }); + // Notify down the chain + this.preCheckpointNotification.attach((message) => { + taskRunProcess.preCheckpointNotification.post(message); + }); + this.checkpointCanceledNotification.attach((message) => { + taskRunProcess.checkpointCanceledNotification.post(message); + }); - // Notify down the chain - this.preCheckpointNotification.attach((message) => { - taskRunProcess.preCheckpointNotification.post(message); - }); - this.checkpointCanceledNotification.attach((message) => { - taskRunProcess.checkpointCanceledNotification.post(message); - }); + await taskRunProcess.initialize(); - await taskRunProcess.initialize(); + this._taskRunProcess = taskRunProcess; - this._taskRunProcess = taskRunProcess; + return this._taskRunProcess; + } + + async #killCurrentTaskRunProcessBeforeAttempt() { + if (!this._taskRunProcess) { + return; } - return this._taskRunProcess; + const currentTaskRunProcess = this._taskRunProcess; + + if (currentTaskRunProcess.isBeingKilled) { + if (this._taskRunProcessesBeingKilled.size > 1) { + // If there's more than one being killed, wait for graceful exit + try { + await currentTaskRunProcess.onExit.waitFor(5_000); + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + try { + const forcedKill = currentTaskRunProcess.onExit.waitFor(5_000); + currentTaskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + } else { + // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel + } + } else { + // It's not being killed, so kill it + if (this._taskRunProcessesBeingKilled.size > 0) { + // If there's one being killed already, wait for graceful exit + try { + await currentTaskRunProcess.onExit.waitFor(5_000); + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + try { + const forcedKill = currentTaskRunProcess.onExit.waitFor(5_000); + currentTaskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + } else { + // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around. + currentTaskRunProcess.kill("SIGTERM", 5_000).catch(() => {}); + } + } } - // We need to fork the process before we can execute any tasks + // We need to fork the process before we can execute any tasks, use a fresh process for each execution async executeTaskRun( payload: ProdTaskRunExecutionPayload, messageId?: string ): Promise { try { - const taskRunProcess = await this.#initializeTaskRunProcess(payload, messageId); + const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId); const result = await taskRunProcess.executeTaskRun(payload); @@ -483,6 +504,7 @@ class TaskRunProcess { typeof ProdWorkerToChildMessages >; private _child?: ChildProcess; + private _childPid?: number; private _attemptPromises: Map< string, @@ -498,7 +520,9 @@ class TaskRunProcess { */ public onTaskHeartbeat: Evt = new Evt(); public onTaskRunHeartbeat: Evt = new Evt(); - public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null }> = new Evt(); + public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null; pid?: number }> = + new Evt(); + public onIsBeingKilled: Evt = new Evt(); public onWaitForBatch: Evt< InferSocketMessageSchema @@ -538,6 +562,7 @@ class TaskRunProcess { ...(this.worker.debugOtel ? { OTEL_LOG_LEVEL: "debug" } : {}), }, }); + this._childPid = this._child?.pid; this._ipc = new ZodIpcConnection({ listenSchema: ProdChildToWorkerMessages, @@ -652,7 +677,10 @@ class TaskRunProcess { return; } - this._isBeingKilled = kill; + if (kill) { + this._isBeingKilled = true; + this.onIsBeingKilled.post(this._child?.pid); + } await this._ipc?.sendWithAck("CLEANUP", { flush: true, @@ -736,7 +764,7 @@ class TaskRunProcess { } } - this.onExit.post({ code, signal }); + this.onExit.post({ code, signal, pid: this.pid }); } #handleLog(data: Buffer) { @@ -769,11 +797,24 @@ class TaskRunProcess { ); } - kill(signal?: number | NodeJS.Signals) { + async kill(signal?: number | NodeJS.Signals, timeoutInMs?: number) { + this._isBeingKilled = true; + + const killTimeout = this.onExit.waitFor(timeoutInMs); + + this.onIsBeingKilled.post(this._child?.pid); this._child?.kill(signal); + + if (timeoutInMs) { + await killTimeout; + } } get isBeingKilled() { return this._isBeingKilled || this._child?.killed; } + + get pid() { + return this._childPid; + } } From 0c0eb021ec4ed9e5cbfc081083a16b45f1399df4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 8 May 2024 12:57:28 +0100 Subject: [PATCH 26/57] improve handling of ipc timeouts --- .../core/src/v3/runtime/prodRuntimeManager.ts | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts index 02a3c8c787e..3aa42040910 100644 --- a/packages/core/src/v3/runtime/prodRuntimeManager.ts +++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts @@ -55,10 +55,14 @@ export class ProdRuntimeManager implements RuntimeManager { this._waitForDuration = { resolve, reject }; }); - const { willCheckpointAndRestore } = await this.ipc.sendWithAck("WAIT_FOR_DURATION", { - ms, - now, - }); + const { willCheckpointAndRestore } = await this.ipc.sendWithAck( + "WAIT_FOR_DURATION", + { + ms, + now, + }, + 10_000 + ); if (!willCheckpointAndRestore) { await internalTimeout; @@ -74,14 +78,24 @@ export class ProdRuntimeManager implements RuntimeManager { // Resets the clock to the current time clock.reset(); - // The coordinator should cancel any in-progress checkpoints - const { checkpointCanceled, version } = await this.ipc.sendWithAck("CANCEL_CHECKPOINT", { - version: "v2", - reason: "WAIT_FOR_DURATION", - }); - - if (checkpointCanceled) { - // There won't be a checkpoint or external resume and we've already completed our internal timeout + try { + // The coordinator should cancel any in-progress checkpoints + const { checkpointCanceled, version } = await this.ipc.sendWithAck( + "CANCEL_CHECKPOINT", + { + version: "v2", + reason: "WAIT_FOR_DURATION", + }, + 10_000 + ); + + if (checkpointCanceled) { + // There won't be a checkpoint or external resume and we've already completed our internal timeout + return; + } + } catch (error) { + // If the cancellation times out, we will proceed as if the checkpoint was canceled + logger.debug("Checkpoint cancellation timed out", { error }); return; } From 091f1d80621eec70c4b79b87770b32e6385ccc65 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 8 May 2024 12:58:36 +0100 Subject: [PATCH 27/57] consider checkpoint failures on cancellation --- apps/coordinator/src/index.ts | 121 +++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 40 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index e49aa04ebd0..51d2dbc9638 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -1,5 +1,5 @@ import { createServer } from "node:http"; -import { $ } from "execa"; +import { $, type ExecaChildProcess } from "execa"; import { nanoid } from "nanoid"; import { Server } from "socket.io"; import { @@ -19,6 +19,7 @@ collectDefaultMetrics(); const HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || 8020); const NODE_NAME = process.env.NODE_NAME || "coordinator"; const DEFAULT_RETRY_DELAY_THRESHOLD_IN_MS = 30_000; +const CHAOS_MONKEY_ENABLED = !!process.env.CHAOS_MONKEY_ENABLED; const REGISTRY_HOST = process.env.REGISTRY_HOST || "localhost:5000"; const CHECKPOINT_PATH = process.env.CHECKPOINT_PATH || "/checkpoints"; @@ -49,6 +50,10 @@ type CheckpointData = { docker: boolean; }; +function isExecaChildProcess(maybeExeca: unknown): maybeExeca is Awaited { + return typeof maybeExeca === "object" && maybeExeca !== null && "escapedCommand" in maybeExeca; +} + class Checkpointer { #initialized = false; #canCheckpoint = false; @@ -56,6 +61,7 @@ class Checkpointer { #logger = new SimpleLogger("[checkptr]"); #abortControllers = new Map(); + #failedCheckpoints = new Map(); constructor(private opts = { forceSimulate: false }) {} @@ -150,7 +156,11 @@ class Checkpointer { success: !!result, }); - return result; + if (!result.success) { + return; + } + + return result.checkpoint; } isCheckpointing(runId: string) { @@ -158,6 +168,13 @@ class Checkpointer { } cancelCheckpoint(runId: string): boolean { + // If the last checkpoint failed, pretend we canceled it + // This ensures tasks don't wait for external resume messages to continue + if (this.#hasFailedCheckpoint(runId)) { + this.#clearFailedCheckpoint(runId); + return true; + } + const controller = this.#abortControllers.get(runId); if (!controller) { @@ -176,26 +193,31 @@ class Checkpointer { leaveRunning = true, // This mirrors kubernetes behaviour more accurately projectRef, deploymentVersion, - }: CheckpointAndPushOptions): Promise { + }: CheckpointAndPushOptions): Promise< + { success: true; checkpoint: CheckpointData } | { success: false; reason?: "CANCELED" } + > { await this.initialize(); + const options = { + runId, + leaveRunning, + projectRef, + deploymentVersion, + }; + if (!this.#dockerMode && !this.#canCheckpoint) { this.#logger.error("No checkpoint support. Simulation requires docker."); - return; + return { success: false }; } if (this.#abortControllers.has(runId)) { - logger.error("Checkpoint procedure already in progress", { - options: { - runId, - leaveRunning, - projectRef, - deploymentVersion, - }, - }); - return; + logger.error("Checkpoint procedure already in progress", { options }); + return { success: false }; } + // This is a new checkpoint, clear any last failure for this run + this.#clearFailedCheckpoint(runId); + const controller = new AbortController(); this.#abortControllers.set(runId, controller); @@ -206,14 +228,7 @@ class Checkpointer { const imageRef = this.#getImageRef(projectRef, deploymentVersion, shortCode); const exportLocation = this.#getExportLocation(projectRef, deploymentVersion, shortCode); - this.#logger.log("Checkpointing:", { - options: { - runId, - leaveRunning, - projectRef, - deploymentVersion, - }, - }); + this.#logger.log("Checkpointing:", { options }); const containterName = this.#getRunContainerName(runId); @@ -234,9 +249,9 @@ class Checkpointer { ); } } - } catch (error: any) { - this.#logger.error(error.stderr); - return; + } catch (error) { + this.#logger.error("Failed while creating docker checkpoint", { exportLocation }); + throw error; } this.#logger.log("checkpoint created:", { @@ -245,8 +260,11 @@ class Checkpointer { }); return { - location: exportLocation, - docker: true, + success: true, + checkpoint: { + location: exportLocation, + docker: true, + }, }; } @@ -291,29 +309,52 @@ class Checkpointer { // this.#logger.log("Deleted checkpoint image", { imageRef }); } catch (error) { this.#logger.error("Failed during checkpoint cleanup", { exportLocation }); - this.#logger.debug(error); + throw error; } return { - location: imageRef, - docker: false, + success: true, + checkpoint: { + location: imageRef, + docker: false, + }, }; } catch (error) { - this.#logger.error("checkpoint failed", { - options: { - runId, - leaveRunning, - projectRef, - deploymentVersion, - }, - error, - }); - return; + if (isExecaChildProcess(error)) { + if (error.isCanceled) { + this.#logger.error("Checkpoint canceled", { options, error }); + + return { success: false, reason: "CANCELED" }; + } + + // Everything that's not a cancellation is a failure + this.#failCheckpoint(runId, error); + this.#logger.error("Checkpoint command error", { options, error }); + + return { success: false }; + } + + this.#failCheckpoint(runId, error); + this.#logger.error("Unhandled checkpoint error", { options, error }); + + return { success: false }; } finally { this.#abortControllers.delete(runId); } } + #failCheckpoint(runId: string, error: unknown) { + this.#failedCheckpoints.set(runId, error); + } + + #clearFailedCheckpoint(runId: string) { + this.#failedCheckpoints.delete(runId); + } + + #hasFailedCheckpoint(runId: string) { + return this.#failedCheckpoints.has(runId); + } + #getRunContainerName(suffix: string) { return `task-run-${suffix}`; } @@ -934,7 +975,7 @@ class TaskCoordinator { return provider; } - #cancelCheckpoint(runId: string) { + #cancelCheckpoint(runId: string): boolean { const checkpointWait = this.#checkpointableTasks.get(runId); if (checkpointWait) { From 2a43a216b83923dc842b70fd46396af7caac3a52 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 8 May 2024 12:59:24 +0100 Subject: [PATCH 28/57] add basic chaos monkey to checkpointer --- apps/coordinator/src/index.ts | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 51d2dbc9638..31fd33bb8bb 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -33,6 +33,10 @@ const SECURE_CONNECTION = ["1", "true"].includes(process.env.SECURE_CONNECTION ? const logger = new SimpleLogger(`[${NODE_NAME}]`); +if (CHAOS_MONKEY_ENABLED) { + logger.log("🍌 Chaos monkey enabled"); +} + type CheckpointerInitializeReturn = { canCheckpoint: boolean; willSimulate: boolean; @@ -224,6 +228,22 @@ class Checkpointer { const $$ = $({ signal: controller.signal }); try { + if (CHAOS_MONKEY_ENABLED) { + console.log("🍌 Chaos monkey wreaking havoc"); + + const random = Math.random(); + + if (random < 0.33) { + // Fake long checkpoint duration + await $$`sleep 300`; + } else if (random < 0.66) { + // Fake checkpoint error + await $$`false`; + } else { + // no-op + } + } + const shortCode = nanoid(8); const imageRef = this.#getImageRef(projectRef, deploymentVersion, shortCode); const exportLocation = this.#getExportLocation(projectRef, deploymentVersion, shortCode); From e6cea796453ca7c7a4af8264554d7866f642beb8 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 8 May 2024 13:01:19 +0100 Subject: [PATCH 29/57] changeset --- .changeset/warm-olives-provide.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/warm-olives-provide.md diff --git a/.changeset/warm-olives-provide.md b/.changeset/warm-olives-provide.md new file mode 100644 index 00000000000..b57d242a52b --- /dev/null +++ b/.changeset/warm-olives-provide.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Improve handling of IPC timeouts and fix checkpoint cancellation after failures From 7a9cd8d13715927108d5c1f8de6a84bb9d16658f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 26 Apr 2024 22:37:37 +0100 Subject: [PATCH 30/57] control forced checkpoint simulation via env var --- apps/coordinator/src/index.ts | 6 +++++- apps/docker-provider/src/index.ts | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 46ee3e3e135..a1dbfcc11d9 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -21,6 +21,10 @@ const NODE_NAME = process.env.NODE_NAME || "coordinator"; const DEFAULT_RETRY_DELAY_THRESHOLD_IN_MS = 30_000; const CHAOS_MONKEY_ENABLED = !!process.env.CHAOS_MONKEY_ENABLED; +const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes( + process.env.FORCE_CHECKPOINT_SIMULATION ?? "true" +); + const REGISTRY_HOST = process.env.REGISTRY_HOST || "localhost:5000"; const CHECKPOINT_PATH = process.env.CHECKPOINT_PATH || "/checkpoints"; const REGISTRY_TLS_VERIFY = process.env.REGISTRY_TLS_VERIFY === "false" ? "false" : "true"; @@ -382,7 +386,7 @@ class Checkpointer { class TaskCoordinator { #httpServer: ReturnType; - #checkpointer = new Checkpointer({ forceSimulate: true }); + #checkpointer = new Checkpointer({ forceSimulate: FORCE_CHECKPOINT_SIMULATION }); #prodWorkerNamespace: ZodNamespace< typeof ProdWorkerToCoordinatorMessages, diff --git a/apps/docker-provider/src/index.ts b/apps/docker-provider/src/index.ts index f1f945853c4..a5e588956c8 100644 --- a/apps/docker-provider/src/index.ts +++ b/apps/docker-provider/src/index.ts @@ -13,9 +13,14 @@ import { PostStartCauses, PreStopCauses } from "@trigger.dev/core/v3"; const MACHINE_NAME = process.env.MACHINE_NAME || "local"; const COORDINATOR_PORT = process.env.COORDINATOR_PORT || 8020; const COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1"; + const OTEL_EXPORTER_OTLP_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || "http://0.0.0.0:4318"; +const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes( + process.env.FORCE_CHECKPOINT_SIMULATION ?? "true" +); + const logger = new SimpleLogger(`[${MACHINE_NAME}]`); type InitializeReturn = { @@ -278,7 +283,7 @@ class DockerTaskOperations implements TaskOperations { } const provider = new ProviderShell({ - tasks: new DockerTaskOperations({ forceSimulate: true }), + tasks: new DockerTaskOperations({ forceSimulate: FORCE_CHECKPOINT_SIMULATION }), type: "docker", }); From 19f65688ec0302a4be8c304ac83b54d59c9944d0 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 21 May 2024 10:38:32 +0100 Subject: [PATCH 31/57] fix merge --- .../core/src/v3/runtime/prodRuntimeManager.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts index b356aedb228..89b4afddafe 100644 --- a/packages/core/src/v3/runtime/prodRuntimeManager.ts +++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts @@ -78,15 +78,16 @@ export class ProdRuntimeManager implements RuntimeManager { // Resets the clock to the current time clock.reset(); - // The coordinator should cancel any in-progress checkpoints - const { checkpointCanceled, version } = await this.ipc.sendWithAck( - "CANCEL_CHECKPOINT", - { - version: "v2", - reason: "WAIT_FOR_DURATION", - }, - 31_000 - ); + try { + // The coordinator should cancel any in-progress checkpoints + const { checkpointCanceled, version } = await this.ipc.sendWithAck( + "CANCEL_CHECKPOINT", + { + version: "v2", + reason: "WAIT_FOR_DURATION", + }, + 31_000 + ); if (checkpointCanceled) { // There won't be a checkpoint or external resume and we've already completed our internal timeout From 181641d7f180db12f9771a7181c70909ed1b5dce Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 21 May 2024 17:11:15 +0100 Subject: [PATCH 32/57] kill old attempt processes before checkpointing --- .../src/workers/prod/backgroundWorker.ts | 24 +++++++++++++------ .../cli-v3/src/workers/prod/entry-point.ts | 16 +++++++++---- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 7536777a202..24bd528fd40 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -79,7 +79,7 @@ export class ProdBackgroundWorker { public tasks: Array = []; _taskRunProcess: TaskRunProcess | undefined; - private _taskRunProcessesBeingKilled: Set = new Set(); + private _taskRunProcessesBeingKilled: Map = new Map(); private _closed: boolean = false; @@ -261,9 +261,9 @@ export class ProdBackgroundWorker { } }); - taskRunProcess.onIsBeingKilled.attach((pid) => { - if (pid) { - this._taskRunProcessesBeingKilled.add(pid); + taskRunProcess.onIsBeingKilled.attach((taskRunProcess) => { + if (taskRunProcess?.pid) { + this._taskRunProcessesBeingKilled.set(taskRunProcess.pid, taskRunProcess); } }); @@ -310,6 +310,16 @@ export class ProdBackgroundWorker { return this._taskRunProcess; } + async forceKillOldTaskRunProcesses() { + for (const taskRunProcess of this._taskRunProcessesBeingKilled.values()) { + try { + await taskRunProcess.kill("SIGKILL"); + } catch (error) { + console.error("Error while force killing old task run processes", error); + } + } + } + async #killCurrentTaskRunProcessBeforeAttempt() { if (!this._taskRunProcess) { return; @@ -522,7 +532,7 @@ class TaskRunProcess { public onTaskRunHeartbeat: Evt = new Evt(); public onExit: Evt<{ code: number | null; signal: NodeJS.Signals | null; pid?: number }> = new Evt(); - public onIsBeingKilled: Evt = new Evt(); + public onIsBeingKilled: Evt = new Evt(); public onWaitForBatch: Evt< InferSocketMessageSchema @@ -679,7 +689,7 @@ class TaskRunProcess { if (kill) { this._isBeingKilled = true; - this.onIsBeingKilled.post(this._child?.pid); + this.onIsBeingKilled.post(this); } await this._ipc?.sendWithAck("CLEANUP", { @@ -802,7 +812,7 @@ class TaskRunProcess { const killTimeout = this.onExit.waitFor(timeoutInMs); - this.onIsBeingKilled.post(this._child?.pid); + this.onIsBeingKilled.post(this); this._child?.kill(signal); if (timeoutInMs) { diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 629a8e3aee6..39f38841a38 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -154,9 +154,10 @@ class ProdWorker { this.#coordinatorSocket.socket.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId }); }); + // Currently, this is only used for duration waits backgroundWorker.onReadyForCheckpoint.attach(async (message) => { - // Flush before checkpointing so we don't flush the same spans again after restore - await backgroundWorker.flushTelemetry(); + await this.#prepareForCheckpoint(); + this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); }); @@ -283,9 +284,8 @@ class ProdWorker { this.waitForPostStart = true; if (reason === "WAIT_FOR_TASK" || reason === "WAIT_FOR_BATCH") { - // Flush before checkpointing so we don't flush the same spans again after restore // Duration waits do this via the "ready for checkpoint" event instead - await this.#backgroundWorker.flushTelemetry(); + await this.#prepareForCheckpoint(); } } } @@ -316,6 +316,14 @@ class ProdWorker { } } + async #prepareForCheckpoint() { + // Flush before checkpointing so we don't flush the same spans again after restore + await this.#backgroundWorker.flushTelemetry(); + + // Kill the previous worker process to prevent large checkpoints + await this.#backgroundWorker.forceKillOldTaskRunProcesses(); + } + #resumeAfterDuration() { this.paused = false; this.nextResumeAfter = undefined; From 08378259be2039f5ffd21b46b7a74b8addfe99c5 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 21 May 2024 17:12:17 +0100 Subject: [PATCH 33/57] detailed perf logging for checkpointing --- apps/coordinator/src/index.ts | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index a1dbfcc11d9..611b1ee04f1 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -308,21 +308,45 @@ class Checkpointer { throw new Error("could not find container id"); } + const start = performance.now(); + + // Create checkpoint this.#logger.debug(await $$`crictl checkpoint --export=${exportLocation} ${containerId}`); + const postCheckpoint = performance.now(); // Create image from checkpoint const container = this.#logger.debug(await $$`buildah from scratch`); + const postFrom = performance.now(); + this.#logger.debug(await $$`buildah add ${container} ${exportLocation} /`); + const postAdd = performance.now(); + this.#logger.debug( await $$`buildah config --annotation=io.kubernetes.cri-o.annotations.checkpoint.name=counter ${container}` ); + const postConfig = performance.now(); + this.#logger.debug(await $$`buildah commit ${container} ${imageRef}`); + const postCommit = performance.now(); + this.#logger.debug(await $$`buildah rm ${container}`); + const postRm = performance.now(); // Push checkpoint image this.#logger.debug(await $$`buildah push --tls-verify=${REGISTRY_TLS_VERIFY} ${imageRef}`); + const postPush = performance.now(); + + const perf = { + "crictl checkpoint": postCheckpoint - start, + "buildah from": postFrom - postCheckpoint, + "buildah add": postAdd - postFrom, + "buildah config": postConfig - postAdd, + "buildah commit": postCommit - postConfig, + "buildah rm": postRm - postCommit, + "buildah push": postPush - postRm, + }; - this.#logger.log("Checkpointed and pushed image to:", { location: imageRef }); + this.#logger.log("Checkpointed and pushed image to:", { location: imageRef, perf }); try { await $$`rm ${exportLocation}`; From 59a6476254eb818a037ce6de8c9ebc301a494337 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 21 May 2024 17:13:39 +0100 Subject: [PATCH 34/57] add coordinator otlp endpoint example --- apps/docker-provider/.env.example | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/docker-provider/.env.example b/apps/docker-provider/.env.example index 2d24f79c8ca..75c54083d1a 100644 --- a/apps/docker-provider/.env.example +++ b/apps/docker-provider/.env.example @@ -4,6 +4,8 @@ PLATFORM_WS_PORT=3030 PLATFORM_SECRET=provider-secret SECURE_CONNECTION=false +OTEL_EXPORTER_OTLP_ENDPOINT=http://0.0.0.0:3030/otel + # Use this if you are on macOS # COORDINATOR_HOST="host.docker.internal" # OTEL_EXPORTER_OTLP_ENDPOINT="http://host.docker.internal:4318" \ No newline at end of file From 833a0f1688eb72bf9c7bd1255a9da637f8be359c Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 16:06:47 +0100 Subject: [PATCH 35/57] improve prod run cancellation --- apps/coordinator/src/index.ts | 22 ++++++++++++ .../app/v3/services/cancelTaskRun.server.ts | 34 ++++++++++++++++--- .../cli-v3/src/workers/prod/entry-point.ts | 29 +++++++++++----- packages/core/src/v3/schemas/messages.ts | 19 +++++++++-- packages/core/src/v3/zodSocket.ts | 31 ++++++++++++++--- 5 files changed, 114 insertions(+), 21 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 611b1ee04f1..170db5bd625 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -531,6 +531,28 @@ class TaskCoordinator { taskSocket.emit("REQUEST_ATTEMPT_CANCELLATION", message); }, + REQUEST_RUN_CANCELLATION: async (message) => { + const taskSocket = await this.#getRunSocket(message.runId); + + if (!taskSocket) { + logger.log("Socket for run not found", { + runId: message.runId, + }); + return; + } + + if (message.delayInMs) { + taskSocket.emit("REQUEST_EXIT", { + version: "v2", + delayInMs: message.delayInMs, + }); + } else { + // If there's no delay, assume the worker doesn't support non-v1 messages + taskSocket.emit("REQUEST_EXIT", { + version: "v1", + }); + } + }, READY_FOR_RETRY: async (message) => { const taskSocket = await this.#getRunSocket(message.runId); diff --git a/apps/webapp/app/v3/services/cancelTaskRun.server.ts b/apps/webapp/app/v3/services/cancelTaskRun.server.ts index 72179a0dce7..a4822d854aa 100644 --- a/apps/webapp/app/v3/services/cancelTaskRun.server.ts +++ b/apps/webapp/app/v3/services/cancelTaskRun.server.ts @@ -24,9 +24,15 @@ const CANCELLABLE_ATTEMPT_STATUSES: Array = [ "PENDING", ]; -type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{ +type ExtendedTaskRun = Prisma.TaskRunGetPayload<{ include: { runtimeEnvironment: true; + lockedToVersion: true; + }; +}>; + +type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{ + include: { backgroundWorker: true; }; }>; @@ -71,11 +77,10 @@ export class CancelTaskRunService extends BaseService { }, include: { backgroundWorker: true, - runtimeEnvironment: true, }, }, - dependency: true, runtimeEnvironment: true, + lockedToVersion: true, }, }); @@ -96,6 +101,7 @@ export class CancelTaskRunService extends BaseService { // Cancel any in progress attempts if (opts.cancelAttempts) { await this.#cancelPotentiallyRunningAttempts(cancelledTaskRun, cancelledTaskRun.attempts); + await this.#cancelRemainingRunWorkers(cancelledTaskRun); } return { @@ -103,9 +109,12 @@ export class CancelTaskRunService extends BaseService { }; } - async #cancelPotentiallyRunningAttempts(run: TaskRun, attempts: ExtendedTaskRunAttempt[]) { + async #cancelPotentiallyRunningAttempts( + run: ExtendedTaskRun, + attempts: ExtendedTaskRunAttempt[] + ) { for (const attempt of attempts) { - if (attempt.runtimeEnvironment.type === "DEVELOPMENT") { + if (run.runtimeEnvironment.type === "DEVELOPMENT") { // Signal the task run attempt to stop await devPubSub.publish( `backgroundWorker:${attempt.backgroundWorkerId}:${attempt.id}`, @@ -158,4 +167,19 @@ export class CancelTaskRunService extends BaseService { } } } + + async #cancelRemainingRunWorkers(run: ExtendedTaskRun) { + if (run.runtimeEnvironment.type === "DEVELOPMENT") { + // Nothing to do + return; + } + + // Broadcast cancel message to all coordinators + socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", { + version: "v1", + runId: run.id, + // Give the attempts some time to exit gracefully. If the runs supports lazy attempts, it also supports exit delays. + delayInMs: run.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined, + }); + } } diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 39f38841a38..542a535306d 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -518,12 +518,20 @@ class ProdWorker { }, REQUEST_ATTEMPT_CANCELLATION: async (message) => { if (!this.executing) { + logger.log("dropping cancel request, not executing", { status: this.#status }); return; } + logger.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status }); + await this.#backgroundWorker.cancelAttempt(message.attemptId); }, - REQUEST_EXIT: async () => { + REQUEST_EXIT: async (message) => { + if (message.version === "v2" && message.delayInMs) { + logger.log("exit requested with delay", { delayInMs: message.delayInMs }); + await setTimeout(message.delayInMs); + } + this.#coordinatorSocket.close(); process.exit(0); }, @@ -698,13 +706,7 @@ class ProdWorker { } case "/status": { - return reply.json({ - executing: this.executing, - paused: this.paused, - completed: this.completed.size, - nextResumeAfter: this.nextResumeAfter, - waitForPostStart: this.waitForPostStart, - }); + return reply.json(this.#status); } case "/connect": { @@ -866,6 +868,17 @@ class ProdWorker { return data?.variables ?? {}; } + get #status() { + return { + executing: this.executing, + paused: this.paused, + completed: this.completed.size, + nextResumeAfter: this.nextResumeAfter, + waitForPostStart: this.waitForPostStart, + attemptFriendlyId: this.attemptFriendlyId, + }; + } + start() { this.#httpServer.listen(this.#httpPort, this.host); } diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index d8124546cbb..d1b559e5d9d 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -612,6 +612,13 @@ export const PlatformToCoordinatorMessages = { attemptFriendlyId: z.string(), }), }, + REQUEST_RUN_CANCELLATION: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + delayInMs: z.number().optional(), + }), + }, READY_FOR_RETRY: { message: z.object({ version: z.literal("v1").default("v1"), @@ -862,9 +869,15 @@ export const CoordinatorToProdWorkerMessages = { }), }, REQUEST_EXIT: { - message: z.object({ - version: z.literal("v1").default("v1"), - }), + message: z.discriminatedUnion("version", [ + z.object({ + version: z.literal("v1"), + }), + z.object({ + version: z.literal("v2"), + delayInMs: z.number().optional(), + }), + ]), }, READY_FOR_RETRY: { message: z.object({ diff --git a/packages/core/src/v3/zodSocket.ts b/packages/core/src/v3/zodSocket.ts index 1e2ae1e9e5d..964318586a4 100644 --- a/packages/core/src/v3/zodSocket.ts +++ b/packages/core/src/v3/zodSocket.ts @@ -1,8 +1,9 @@ import type { Socket } from "socket.io-client"; import { io } from "socket.io-client"; -import { z } from "zod"; +import { ZodError, z } from "zod"; import { EventEmitterLike, ZodMessageValueSchema } from "./zodMessageHandler"; import { LogLevel, SimpleStructuredLogger, StructuredLogger } from "./utils/structuredLogger"; +import { fromZodError } from "zod-validation-error"; export interface ZodSocketMessageCatalogSchema { [key: string]: @@ -81,7 +82,7 @@ export type MessagesFromSocketCatalog Date: Fri, 24 May 2024 16:07:27 +0100 Subject: [PATCH 36/57] rename supports lazy attempts migration --- .../migration.sql | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename packages/database/prisma/migrations/{20240430101936_ => 20240430101936_add_lazy_attempt_support_flag_to_workers}/migration.sql (100%) diff --git a/packages/database/prisma/migrations/20240430101936_/migration.sql b/packages/database/prisma/migrations/20240430101936_add_lazy_attempt_support_flag_to_workers/migration.sql similarity index 100% rename from packages/database/prisma/migrations/20240430101936_/migration.sql rename to packages/database/prisma/migrations/20240430101936_add_lazy_attempt_support_flag_to_workers/migration.sql From 1e8743d4ceb0c65b84ad0849e8cf1e648992dcb9 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 16:14:17 +0100 Subject: [PATCH 37/57] fix graceful exit --- packages/cli-v3/src/workers/common/errors.ts | 8 ++ .../src/workers/prod/backgroundWorker.ts | 114 +++++++++++------- .../cli-v3/src/workers/prod/entry-point.ts | 16 ++- .../cli-v3/src/workers/prod/worker-facade.ts | 17 --- 4 files changed, 90 insertions(+), 65 deletions(-) diff --git a/packages/cli-v3/src/workers/common/errors.ts b/packages/cli-v3/src/workers/common/errors.ts index 4ba1a8ef3bd..053ab8d19ba 100644 --- a/packages/cli-v3/src/workers/common/errors.ts +++ b/packages/cli-v3/src/workers/common/errors.ts @@ -53,3 +53,11 @@ export class SigKillTimeoutProcessError extends Error { this.name = "SigKillTimeoutProcessError"; } } + +export class GracefulExitTimeoutError extends Error { + constructor() { + super("Graceful exit timeout"); + + this.name = "GracefulExitTimeoutError"; + } +} diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 24bd528fd40..5431d9d10cb 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -24,6 +24,7 @@ import { ChildProcess, fork } from "node:child_process"; import { CancelledProcessError, CleanupProcessError, + GracefulExitTimeoutError, SigKillTimeoutProcessError, TaskMetadataParseError, UncaughtExceptionError, @@ -88,7 +89,7 @@ export class ProdBackgroundWorker { private params: BackgroundWorkerParams ) {} - async close() { + async close(gracefulExitTimeoutElapsed = false) { if (this._closed) { return; } @@ -99,7 +100,7 @@ export class ProdBackgroundWorker { this.onTaskRunHeartbeat.detach(); // We need to close the task run process - await this._taskRunProcess?.cleanup(true); + await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed); } async killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") { @@ -113,16 +114,10 @@ export class ProdBackgroundWorker { const currentTaskRunProcess = this._taskRunProcess; - try { - const initialExit = currentTaskRunProcess.onExit.waitFor(5_000); - currentTaskRunProcess.kill(initialSignal); - await initialExit; - } catch (error) { - // Try again with SIGKILL - const forcedExit = currentTaskRunProcess.onExit.waitFor(5_000); - currentTaskRunProcess.kill("SIGKILL"); - await forcedExit; - } + // Try graceful exit but don't wait. We limit the amount of processes during creation instead. + this.#tryGracefulExit(currentTaskRunProcess, true, initialSignal).catch((error) => { + console.error("Error while trying graceful exit", error); + }); this._closed = true; } @@ -329,42 +324,14 @@ export class ProdBackgroundWorker { if (currentTaskRunProcess.isBeingKilled) { if (this._taskRunProcessesBeingKilled.size > 1) { - // If there's more than one being killed, wait for graceful exit - try { - await currentTaskRunProcess.onExit.waitFor(5_000); - } catch (error) { - console.error("TaskRunProcess graceful kill timeout exceeded", error); - - try { - const forcedKill = currentTaskRunProcess.onExit.waitFor(5_000); - currentTaskRunProcess.kill("SIGKILL"); - await forcedKill; - } catch (error) { - console.error("TaskRunProcess forced kill timeout exceeded", error); - throw new SigKillTimeoutProcessError(); - } - } + await this.#tryGracefulExit(currentTaskRunProcess); } else { // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel } } else { // It's not being killed, so kill it if (this._taskRunProcessesBeingKilled.size > 0) { - // If there's one being killed already, wait for graceful exit - try { - await currentTaskRunProcess.onExit.waitFor(5_000); - } catch (error) { - console.error("TaskRunProcess graceful kill timeout exceeded", error); - - try { - const forcedKill = currentTaskRunProcess.onExit.waitFor(5_000); - currentTaskRunProcess.kill("SIGKILL"); - await forcedKill; - } catch (error) { - console.error("TaskRunProcess forced kill timeout exceeded", error); - throw new SigKillTimeoutProcessError(); - } - } + await this.#tryGracefulExit(currentTaskRunProcess); } else { // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around. currentTaskRunProcess.kill("SIGTERM", 5_000).catch(() => {}); @@ -372,6 +339,37 @@ export class ProdBackgroundWorker { } } + async #tryGracefulExit( + taskRunProcess: TaskRunProcess, + kill = false, + initialSignal: number | NodeJS.Signals = "SIGTERM" + ) { + try { + const initialExit = taskRunProcess.onExit.waitFor(5_000); + + if (kill) { + taskRunProcess.kill(initialSignal); + } + + await initialExit; + } catch (error) { + console.error("TaskRunProcess graceful kill timeout exceeded", error); + + this.#tryForcefulExit(taskRunProcess); + } + } + + async #tryForcefulExit(taskRunProcess: TaskRunProcess) { + try { + const forcedKill = taskRunProcess.onExit.waitFor(5_000); + taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + console.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + // We need to fork the process before we can execute any tasks, use a fresh process for each execution async executeTaskRun( payload: ProdTaskRunExecutionPayload, @@ -447,6 +445,19 @@ export class ProdBackgroundWorker { }; } + if (e instanceof GracefulExitTimeoutError) { + return { + id: payload.execution.attempt.id, + ok: false, + retry: undefined, + error: { + type: "INTERNAL_ERROR", + code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT, + message: "Worker process killed while attempt in progress.", + }, + }; + } + return { id: payload.execution.attempt.id, ok: false, @@ -524,6 +535,7 @@ class TaskRunProcess { private _currentExecution: TaskRunExecution | undefined; private _isBeingKilled: boolean = false; private _isBeingCancelled: boolean = false; + private _gracefulExitTimeoutElapsed: boolean = false; /** * @deprecated use onTaskRunHeartbeat instead @@ -682,7 +694,7 @@ class TaskRunProcess { await this.cleanup(true); } - async cleanup(kill: boolean = false) { + async cleanup(kill = false, gracefulExitTimeoutElapsed = false) { if (kill && this._isBeingKilled) { return; } @@ -692,10 +704,21 @@ class TaskRunProcess { this.onIsBeingKilled.post(this); } + const killChildProcess = gracefulExitTimeoutElapsed && !!this._currentExecution; + + // Kill parent unless graceful exit timeout has elapsed and we're in the middle of an execution + const killParentProcess = kill && !killChildProcess; + await this._ipc?.sendWithAck("CLEANUP", { flush: true, - kill, + kill: killParentProcess, }); + + if (killChildProcess) { + this._gracefulExitTimeoutElapsed = true; + // Kill the child process + await this.kill("SIGKILL"); + } } async executeTaskRun(payload: TaskRunExecutionPayload): Promise { @@ -766,6 +789,9 @@ class TaskRunProcess { if (this._isBeingCancelled) { rejecter(new CancelledProcessError()); + } else if (this._gracefulExitTimeoutElapsed) { + // Order matters, this has to be before the graceful exit timeout + rejecter(new GracefulExitTimeoutError()); } else if (this._isBeingKilled) { rejecter(new CleanupProcessError()); } else { diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 542a535306d..517dbe4b434 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -71,6 +71,8 @@ class ProdWorker { logger.log("Received signal", { signal }); if (signal === "SIGTERM") { + let gracefulExitTimeoutElapsed = false; + if (this.executing) { const terminationGracePeriodSeconds = 60 * 60; @@ -80,21 +82,27 @@ class ProdWorker { // Wait for termination grace period minus 5s to give cleanup a chance to complete await setTimeout(terminationGracePeriodSeconds * 1000 - 5000); + gracefulExitTimeoutElapsed = true; logger.log("Termination timeout reached, exiting gracefully."); } else { logger.log("Not executing, exiting immediately."); } - await this.#exitGracefully(); + await this.#exitGracefully(gracefulExitTimeoutElapsed); + return; } logger.log("Unhandled signal", { signal }); } - async #exitGracefully() { - await this.#backgroundWorker.close(); - process.exit(0); + async #exitGracefully(gracefulExitTimeoutElapsed = false) { + await this.#backgroundWorker.close(gracefulExitTimeoutElapsed); + + if (!gracefulExitTimeoutElapsed) { + // TODO: Maybe add a sensible timeout instead of a conditional to avoid zombies + process.exit(0); + } } async #reconnect(isPostStart = false, reconnectImmediately = false) { diff --git a/packages/cli-v3/src/workers/prod/worker-facade.ts b/packages/cli-v3/src/workers/prod/worker-facade.ts index d4cd7a91c2e..e16496d3a02 100644 --- a/packages/cli-v3/src/workers/prod/worker-facade.ts +++ b/packages/cli-v3/src/workers/prod/worker-facade.ts @@ -179,23 +179,6 @@ const zodIpc = new ZodIpcConnection({ CLEANUP: async ({ flush, kill }, sender) => { if (kill) { await tracingSDK.flush(); - - if (_execution) { - // Fail currently executing attempt - await sender.send("TASK_RUN_COMPLETED", { - execution: _execution, - result: { - ok: false, - id: _execution.run.id, - error: { - type: "INTERNAL_ERROR", - code: TaskRunErrorCodes.GRACEFUL_EXIT_TIMEOUT, - message: "Worker process killed while attempt in progress.", - }, - }, - }); - } - // Now we need to exit the process await sender.send("READY_TO_DISPOSE", undefined); } else { From e913c41ac6b397823dbc5f00ce07dbd04d48b1bc Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 17:21:43 +0100 Subject: [PATCH 38/57] fix retry mechanics --- .../cli-v3/src/workers/prod/backgroundWorker.ts | 10 ++++++++-- packages/cli-v3/src/workers/prod/entry-point.ts | 15 +++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 5431d9d10cb..0b8483c1aaf 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -103,7 +103,7 @@ export class ProdBackgroundWorker { await this._taskRunProcess?.cleanup(true, gracefulExitTimeoutElapsed); } - async killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") { + async #killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") { if (this._closed || !this._taskRunProcess) { return; } @@ -250,7 +250,11 @@ export class ProdBackgroundWorker { ); taskRunProcess.onExit.attach(({ pid }) => { - this._taskRunProcess = undefined; + // Only delete the task run process if the pid matches + if (this._taskRunProcess?.pid === pid) { + this._taskRunProcess = undefined; + } + if (pid) { this._taskRunProcessesBeingKilled.delete(pid); } @@ -467,6 +471,8 @@ export class ProdBackgroundWorker { code: TaskRunErrorCodes.TASK_EXECUTION_FAILED, }, }; + } finally { + await this.#killTaskRunProcess(); } } diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 517dbe4b434..568b836fc55 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -314,19 +314,22 @@ class ProdWorker { this.executing = false; this.attemptFriendlyId = undefined; - // Every retry gets a fresh process - await this.#backgroundWorker.killTaskRunProcess(); - if (willCheckpointAndRestore) { this.waitForPostStart = true; + + // We already flush after completion, so we don't need to do it here + this.#prepareForCheckpoint(false); + this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); return; } } - async #prepareForCheckpoint() { - // Flush before checkpointing so we don't flush the same spans again after restore - await this.#backgroundWorker.flushTelemetry(); + async #prepareForCheckpoint(flush = true) { + if (flush) { + // Flush before checkpointing so we don't flush the same spans again after restore + await this.#backgroundWorker.flushTelemetry(); + } // Kill the previous worker process to prevent large checkpoints await this.#backgroundWorker.forceKillOldTaskRunProcesses(); From 6aba34726835bb689556aa7532f3d68cb5103a89 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 9 May 2024 08:16:44 +0100 Subject: [PATCH 39/57] clear paused state before retry --- packages/cli-v3/src/workers/prod/entry-point.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 568b836fc55..bf83608739d 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -311,6 +311,9 @@ class ProdWorker { return; } + // Clear state for next execution + this.paused = false; + this.waitForPostStart = false; this.executing = false; this.attemptFriendlyId = undefined; From 40a99f8ed17ce989ddc67de72f36f09be4af955d Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 9 May 2024 11:20:14 +0100 Subject: [PATCH 40/57] remove checkpoint image after push --- apps/coordinator/src/index.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 170db5bd625..113ef6a1082 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -352,9 +352,8 @@ class Checkpointer { await $$`rm ${exportLocation}`; this.#logger.log("Deleted checkpoint archive", { exportLocation }); - // Disabled for now as this will increase restore time by having to pull the image again - // await $`buildah rmi ${imageRef}`; - // this.#logger.log("Deleted checkpoint image", { imageRef }); + await $`buildah rmi ${imageRef}`; + this.#logger.log("Deleted checkpoint image", { imageRef }); } catch (error) { this.#logger.error("Failed during checkpoint cleanup", { exportLocation }); throw error; From 5e4b4a3ad0ce969dcae69f4f94dadd9a1ff7f988 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 9 May 2024 12:14:08 +0100 Subject: [PATCH 41/57] crash worker on unrecoverable errors --- apps/coordinator/src/index.ts | 14 ++++++ apps/webapp/app/v3/handleSocketIo.server.ts | 14 +++++- .../cli-v3/src/workers/prod/entry-point.ts | 46 +++++++++++++++++++ packages/core/src/v3/schemas/messages.ts | 21 +++++++++ 4 files changed, 94 insertions(+), 1 deletion(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 113ef6a1082..0a5332423e3 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -1101,6 +1101,20 @@ class TaskCoordinator { executionPayload: createAttempt.executionPayload, }); }); + + socket.on("UNRECOVERABLE_ERROR", async (message) => { + logger.log("[UNRECOVERABLE_ERROR]", message); + + this.#platformSocket?.send("RUN_CRASHED", { + version: "v1", + runId: socket.data.runId, + error: message.error, + }); + + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + }); }, onDisconnect: async (socket, handler, sender, logger) => { this.#platformSocket?.send("LOG", { diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts index 2b917e45ad0..ca731f202d9 100644 --- a/apps/webapp/app/v3/handleSocketIo.server.ts +++ b/apps/webapp/app/v3/handleSocketIo.server.ts @@ -189,7 +189,19 @@ function createCoordinatorNamespace(io: Server) { await service.call(message.deploymentId, message.error); } catch (e) { - logger.error("Error while indexing", { error: e }); + logger.error("Error while processing index failure", { error: e }); + } + }, + RUN_CRASHED: async (message) => { + try { + const service = new CrashTaskRunService(); + + await service.call(message.runId, { + reason: `${message.error.name}: ${message.error.message}`, + logs: message.error.stack, + }); + } catch (e) { + logger.error("Error while processing run failure", { error: e }); } }, }, diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index bf83608739d..157b1881e5e 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -230,6 +230,15 @@ class ProdWorker { backgroundWorker.onWaitForDuration.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); + + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name: "NoAttemptId", + message: "Attempt ID not set before waiting for duration", + }, + }); + return; } @@ -247,6 +256,15 @@ class ProdWorker { backgroundWorker.onWaitForTask.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); + + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name: "NoAttemptId", + message: "Attempt ID not set before waiting for task", + }, + }); + return; } @@ -264,6 +282,15 @@ class ProdWorker { backgroundWorker.onWaitForBatch.attach(async (message) => { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); + + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name: "NoAttemptId", + message: "Attempt ID not set before waiting for batch", + }, + }); + return; } @@ -569,11 +596,30 @@ class ProdWorker { if (this.paused) { if (!this.nextResumeAfter) { + logger.error("Missing next resume reason"); + + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name: "NoNextResume", + message: "Next resume reason not set while resuming from paused state", + }, + }); + return; } if (!this.attemptFriendlyId) { logger.error("Missing friendly ID"); + + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name: "NoAttemptId", + message: "Attempt ID not set while resuming from paused state", + }, + }); + return; } diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index d1b559e5d9d..4cd16df1ea5 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -585,6 +585,17 @@ export const CoordinatorToPlatformMessages = { }), }), }, + RUN_CRASHED: { + message: z.object({ + version: z.literal("v1").default("v1"), + runId: z.string(), + error: z.object({ + name: z.string(), + message: z.string(), + stack: z.string().optional(), + }), + }), + }, }; export const PlatformToCoordinatorMessages = { @@ -833,6 +844,16 @@ export const ProdWorkerToCoordinatorMessages = { }), ]), }, + UNRECOVERABLE_ERROR: { + message: z.object({ + version: z.literal("v1").default("v1"), + error: z.object({ + name: z.string(), + message: z.string(), + stack: z.string().optional(), + }), + }), + }, }; export const CoordinatorToProdWorkerMessages = { From bc71e2c14c8604601e8b9f9a9aa17781a8863137 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 17:38:55 +0100 Subject: [PATCH 42/57] refactor unrecoverable error emit --- .../cli-v3/src/workers/prod/entry-point.ts | 59 ++++++++----------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 157b1881e5e..71d36eea67c 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -231,13 +231,10 @@ class ProdWorker { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); - this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { - version: "v1", - error: { - name: "NoAttemptId", - message: "Attempt ID not set before waiting for duration", - }, - }); + this.#emitUnrecoverableError( + "NoAttemptId", + "Attempt ID not set before waiting for duration" + ); return; } @@ -257,13 +254,7 @@ class ProdWorker { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); - this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { - version: "v1", - error: { - name: "NoAttemptId", - message: "Attempt ID not set before waiting for task", - }, - }); + this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for task"); return; } @@ -283,13 +274,7 @@ class ProdWorker { if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); - this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { - version: "v1", - error: { - name: "NoAttemptId", - message: "Attempt ID not set before waiting for batch", - }, - }); + this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set before waiting for batch"); return; } @@ -598,13 +583,10 @@ class ProdWorker { if (!this.nextResumeAfter) { logger.error("Missing next resume reason"); - this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { - version: "v1", - error: { - name: "NoNextResume", - message: "Next resume reason not set while resuming from paused state", - }, - }); + this.#emitUnrecoverableError( + "NoNextResume", + "Next resume reason not set while resuming from paused state" + ); return; } @@ -612,13 +594,10 @@ class ProdWorker { if (!this.attemptFriendlyId) { logger.error("Missing friendly ID"); - this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { - version: "v1", - error: { - name: "NoAttemptId", - message: "Attempt ID not set while resuming from paused state", - }, - }); + this.#emitUnrecoverableError( + "NoAttemptId", + "Attempt ID not set while resuming from paused state" + ); return; } @@ -939,6 +918,16 @@ class ProdWorker { }; } + #emitUnrecoverableError(name: string, message: string) { + this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", { + version: "v1", + error: { + name, + message, + }, + }); + } + start() { this.#httpServer.listen(this.#httpPort, this.host); } From 48aadea3207dbd97ee52a88d2c2f0fa74ec37744 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 17:39:31 +0100 Subject: [PATCH 43/57] switch to do hosted busybox image --- apps/kubernetes-provider/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/kubernetes-provider/src/index.ts b/apps/kubernetes-provider/src/index.ts index 8981c85d928..53870d36d5b 100644 --- a/apps/kubernetes-provider/src/index.ts +++ b/apps/kubernetes-provider/src/index.ts @@ -212,7 +212,7 @@ class KubernetesTaskOperations implements TaskOperations { }, { name: "populate-taskinfo", - image: "docker.io/library/busybox", + image: "registry.digitalocean.com/trigger/busybox", imagePullPolicy: "IfNotPresent", command: ["/bin/sh", "-c"], args: ["printenv COORDINATOR_HOST | tee /etc/taskinfo/coordinator-host"], From 127d1aa65cfd160c874af60b53fe952d71e1e33b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 17:48:04 +0100 Subject: [PATCH 44/57] increase wait for duration ipc timeout --- packages/core/src/v3/runtime/prodRuntimeManager.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/v3/runtime/prodRuntimeManager.ts b/packages/core/src/v3/runtime/prodRuntimeManager.ts index 89b4afddafe..19dc0fd8833 100644 --- a/packages/core/src/v3/runtime/prodRuntimeManager.ts +++ b/packages/core/src/v3/runtime/prodRuntimeManager.ts @@ -61,7 +61,7 @@ export class ProdRuntimeManager implements RuntimeManager { ms, now, }, - 10_000 + 31_000 ); if (!willCheckpointAndRestore) { From 02ae3f8edaa5f50c7284e998880865e47f224392 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 17:49:59 +0100 Subject: [PATCH 45/57] add changeset for misc fixes --- .changeset/tricky-keys-attack.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .changeset/tricky-keys-attack.md diff --git a/.changeset/tricky-keys-attack.md b/.changeset/tricky-keys-attack.md new file mode 100644 index 00000000000..271096497d7 --- /dev/null +++ b/.changeset/tricky-keys-attack.md @@ -0,0 +1,14 @@ +--- +"trigger.dev": patch +"@trigger.dev/core": patch +--- + +- Clear paused states before retry +- Detect and handle unrecoverable worker errors +- Remove checkpoints after successful push +- Permanently switch to DO hosted busybox image +- Fix IPC timeout issue, or at least handle it more gracefully +- Handle checkpoint failures +- Basic chaos monkey for checkpoint testing +- Stack traces are back in the dashboard +- Display final errors on root span From 0ad7b839df75e49c092d9c5c787063ad33fa09aa Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 24 May 2024 18:59:32 +0100 Subject: [PATCH 46/57] fix merge --- packages/cli-v3/src/commands/dev.tsx | 32 +++++++++++-------- .../src/workers/dev/backgroundWorker.ts | 4 +-- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/packages/cli-v3/src/commands/dev.tsx b/packages/cli-v3/src/commands/dev.tsx index 6a6a3e0038b..a692b3bd1d9 100644 --- a/packages/cli-v3/src/commands/dev.tsx +++ b/packages/cli-v3/src/commands/dev.tsx @@ -536,21 +536,25 @@ function useDev({ const processEnv = await gatherProcessEnv(); - const backgroundWorker = new BackgroundWorker(fullPath, { - projectConfig: config, - dependencies, - env: { - ...processEnv, - TRIGGER_API_URL: apiUrl, - TRIGGER_SECRET_KEY: apiKey, - ...(environmentVariablesResponse.success - ? environmentVariablesResponse.data.variables - : {}), + const backgroundWorker = new BackgroundWorker( + fullPath, + { + projectConfig: config, + dependencies, + env: { + ...processEnv, + TRIGGER_API_URL: apiUrl, + TRIGGER_SECRET_KEY: apiKey, + ...(environmentVariablesResponse.success + ? environmentVariablesResponse.data.variables + : {}), + }, + debuggerOn, + debugOtel, + resolveEnvVariables: createResolveEnvironmentVariablesFunction(configModule), }, - debuggerOn, - debugOtel, - resolveEnvVariables: createResolveEnvironmentVariablesFunction(configModule), - }); + environmentClient + ); try { await backgroundWorker.initialize(); diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index bfb95cda207..6b6b2158746 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -46,7 +46,6 @@ import { UncaughtExceptionError, UnexpectedExitError, } from "../common/errors.js"; -import { env } from "node:process"; import { CliApiClient } from "../../apiClient.js"; export type CurrentWorkers = BackgroundWorkerCoordinator["currentWorkers"]; @@ -290,7 +289,8 @@ export class BackgroundWorker { constructor( public path: string, - public params: BackgroundWorkerParams + public params: BackgroundWorkerParams, + private apiClient: CliApiClient ) {} close() { From 8e5b71dedbe8cc65107f74d2ffad1e8db1e335f0 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 27 May 2024 11:57:27 +0100 Subject: [PATCH 47/57] fix retry delay span runId --- apps/webapp/app/v3/services/completeAttempt.server.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index 912174d5644..bd4e6a94d22 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -178,7 +178,7 @@ export class CompleteAttemptService extends BaseService { properties: { retryAt: retryAt.toISOString(), }, - runId: taskRunAttempt.taskRunId, + runId: taskRunAttempt.taskRun.friendlyId, style: { icon: "schedule-attempt", }, From ee660a3a9540acd9ccd17e7a920a5467dac0cde2 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 28 May 2024 09:09:21 +0100 Subject: [PATCH 48/57] fix dev retries --- .../src/workers/dev/backgroundWorker.ts | 109 +++++++++++------- 1 file changed, 68 insertions(+), 41 deletions(-) diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index 6b6b2158746..b27617a5780 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -419,22 +419,28 @@ export class BackgroundWorker { } } - async #initializeTaskRunProcess( + #prefixedMessage(payload: TaskRunExecutionPayload, message: string = "") { + return `[${payload.execution.run.id}.${payload.execution.attempt.number}] ${message}`; + } + + async #getFreshTaskRunProcess( payload: TaskRunExecutionPayload, messageId?: string ): Promise { + logger.debug(this.#prefixedMessage(payload, "getFreshTaskRunProcess()")); + if (!this.metadata) { throw new Error("Worker not registered"); } this._closed = false; - if (this._taskRunProcesses.has(payload.execution.run.id)) { - return this._taskRunProcesses.get(payload.execution.run.id) as TaskRunProcess; - } + logger.debug(this.#prefixedMessage(payload, "killing current task run process before attempt")); await this.#killCurrentTaskRunProcessBeforeAttempt(payload.execution.run.id); + logger.debug(this.#prefixedMessage(payload, "creating new task run process")); + const taskRunProcess = new TaskRunProcess( payload.execution.run.id, payload.execution.run.isTest, @@ -450,7 +456,15 @@ export class BackgroundWorker { ); taskRunProcess.onExit.attach(({ pid }) => { - this._taskRunProcesses.delete(payload.execution.run.id); + logger.debug(this.#prefixedMessage(payload, "onExit()"), { pid }); + + const taskRunProcess = this._taskRunProcesses.get(payload.execution.run.id); + + // Only delete the task run process if the pid matches + if (taskRunProcess?.pid === pid) { + this._taskRunProcesses.delete(payload.execution.run.id); + } + if (pid) { this._taskRunProcessesBeingKilled.delete(pid); } @@ -481,47 +495,24 @@ export class BackgroundWorker { const taskRunProcess = this._taskRunProcesses.get(runId); if (!taskRunProcess) { + logger.debug(`[${runId}] no current task process to kill`); return; } + logger.debug(`[${runId}] killing current task process`, { + pid: taskRunProcess.pid, + }); + if (taskRunProcess.isBeingKilled) { if (this._taskRunProcessesBeingKilled.size > 1) { - // If there's more than one being killed, wait for graceful exit - try { - await taskRunProcess.onExit.waitFor(5_000); - } catch (error) { - console.error("TaskRunProcess graceful kill timeout exceeded", error); - - try { - const forcedKill = taskRunProcess.onExit.waitFor(5_000); - taskRunProcess.kill("SIGKILL"); - await forcedKill; - } catch (error) { - console.error("TaskRunProcess forced kill timeout exceeded", error); - throw new SigKillTimeoutProcessError(); - } - } + await this.#tryGracefulExit(taskRunProcess); } else { // If there's only one or none being killed, don't do anything so we can create a fresh one in parallel } } else { // It's not being killed, so kill it if (this._taskRunProcessesBeingKilled.size > 0) { - // If there's one being killed already, wait for graceful exit - try { - await taskRunProcess.onExit.waitFor(5_000); - } catch (error) { - console.error("TaskRunProcess graceful kill timeout exceeded", error); - - try { - const forcedKill = taskRunProcess.onExit.waitFor(5_000); - taskRunProcess.kill("SIGKILL"); - await forcedKill; - } catch (error) { - console.error("TaskRunProcess forced kill timeout exceeded", error); - throw new SigKillTimeoutProcessError(); - } - } + await this.#tryGracefulExit(taskRunProcess); } else { // There's none being killed yet, so we can kill it without waiting. We still set a timeout to kill it forcefully just in case it sticks around. taskRunProcess.kill("SIGTERM", 5_000).catch(() => {}); @@ -529,6 +520,37 @@ export class BackgroundWorker { } } + async #tryGracefulExit( + taskRunProcess: TaskRunProcess, + kill = false, + initialSignal: number | NodeJS.Signals = "SIGTERM" + ) { + try { + const initialExit = taskRunProcess.onExit.waitFor(5_000); + + if (kill) { + taskRunProcess.kill(initialSignal); + } + + await initialExit; + } catch (error) { + logger.error("TaskRunProcess graceful kill timeout exceeded", error); + + this.#tryForcefulExit(taskRunProcess); + } + } + + async #tryForcefulExit(taskRunProcess: TaskRunProcess) { + try { + const forcedKill = taskRunProcess.onExit.waitFor(5_000); + taskRunProcess.kill("SIGKILL"); + await forcedKill; + } catch (error) { + logger.error("TaskRunProcess forced kill timeout exceeded", error); + throw new SigKillTimeoutProcessError(); + } + } + async cancelRun(taskRunId: string) { const taskRunProcess = this._taskRunProcesses.get(taskRunId); @@ -636,7 +658,12 @@ export class BackgroundWorker { messageId?: string ): Promise { try { - const taskRunProcess = await this.#initializeTaskRunProcess(payload, messageId); + const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId); + + logger.debug(this.#prefixedMessage(payload, "executing task run"), { + pid: taskRunProcess.pid, + }); + const result = await taskRunProcess.executeTaskRun(payload); // Always kill the worker @@ -829,7 +856,7 @@ class TaskRunProcess { this.onIsBeingKilled.post(this._child?.pid); } - logger.debug(`[${this.runId}] cleaning up task run process`, { kill }); + logger.debug(`[${this.runId}] cleaning up task run process`, { kill, pid: this.pid }); await this._sender.send("CLEANUP", { flush: true, @@ -841,7 +868,7 @@ class TaskRunProcess { // Set a timeout to kill the child process if it hasn't been killed within 5 seconds setTimeout(() => { if (this._child && !this._child.killed) { - logger.debug(`[${this.runId}] killing task run process after timeout`); + logger.debug(`[${this.runId}] killing task run process after timeout`, { pid: this.pid }); this._child.kill(); } @@ -949,7 +976,7 @@ class TaskRunProcess { } async #handleExit(code: number | null, signal: NodeJS.Signals | null) { - logger.debug(`[${this.runId}] task run process exiting`, { code, signal }); + logger.debug(`[${this.runId}] handle task run process exit`, { code, signal, pid: this.pid }); // Go through all the attempts currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { @@ -1014,9 +1041,9 @@ class TaskRunProcess { } #kill() { - if (this._child && !this._child.killed) { - logger.debug(`[${this.runId}] killing task run process`); + logger.debug(`[${this.runId}] #kill()`, { pid: this.pid }); + if (this._child && !this._child.killed) { this._child?.kill(); } } From 8f378b2a6e14dfa29d6e0997a0fd8597f7fd1035 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 28 May 2024 14:06:49 +0100 Subject: [PATCH 49/57] improve prod worker logging --- .../src/workers/prod/backgroundWorker.ts | 45 ++++++++++++++++++- .../cli-v3/src/workers/prod/entry-point.ts | 19 ++++---- packages/core/src/v3/schemas/messages.ts | 1 + 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/packages/cli-v3/src/workers/prod/backgroundWorker.ts b/packages/cli-v3/src/workers/prod/backgroundWorker.ts index 0b8483c1aaf..2b0c523e7b4 100644 --- a/packages/cli-v3/src/workers/prod/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/prod/backgroundWorker.ts @@ -90,6 +90,8 @@ export class ProdBackgroundWorker { ) {} async close(gracefulExitTimeoutElapsed = false) { + console.log("Closing worker", { gracefulExitTimeoutElapsed, closed: this._closed }); + if (this._closed) { return; } @@ -104,6 +106,8 @@ export class ProdBackgroundWorker { } async #killTaskRunProcess(flush = true, initialSignal: number | NodeJS.Signals = "SIGTERM") { + console.log("Killing task run process", { flush, initialSignal, closed: this._closed }); + if (this._closed || !this._taskRunProcess) { return; } @@ -119,6 +123,10 @@ export class ProdBackgroundWorker { console.error("Error while trying graceful exit", error); }); + console.log("Killed task run process, setting closed to true", { + closed: this._closed, + pid: currentTaskRunProcess.pid, + }); this._closed = true; } @@ -232,6 +240,9 @@ export class ProdBackgroundWorker { payload.execution.worker.version ); + console.log("Getting fresh task run process, setting closed to false", { + closed: this._closed, + }); this._closed = false; await this.#killCurrentTaskRunProcessBeforeAttempt(); @@ -250,6 +261,8 @@ export class ProdBackgroundWorker { ); taskRunProcess.onExit.attach(({ pid }) => { + console.log("Task run process exited", { pid }); + // Only delete the task run process if the pid matches if (this._taskRunProcess?.pid === pid) { this._taskRunProcess = undefined; @@ -320,12 +333,21 @@ export class ProdBackgroundWorker { } async #killCurrentTaskRunProcessBeforeAttempt() { + console.log("killCurrentTaskRunProcessBeforeAttempt()", { + hasTaskRunProcess: !!this._taskRunProcess, + }); + if (!this._taskRunProcess) { return; } const currentTaskRunProcess = this._taskRunProcess; + console.log("Killing current task run process", { + isBeingKilled: currentTaskRunProcess?.isBeingKilled, + totalBeingKilled: this._taskRunProcessesBeingKilled.size, + }); + if (currentTaskRunProcess.isBeingKilled) { if (this._taskRunProcessesBeingKilled.size > 1) { await this.#tryGracefulExit(currentTaskRunProcess); @@ -382,6 +404,11 @@ export class ProdBackgroundWorker { try { const taskRunProcess = await this.#getFreshTaskRunProcess(payload, messageId); + console.log("executing task run", { + attempt: payload.execution.attempt.id, + taskRunPid: taskRunProcess.pid, + }); + const result = await taskRunProcess.executeTaskRun(payload); if (result.ok) { @@ -477,7 +504,12 @@ export class ProdBackgroundWorker { } async cancelAttempt(attemptId: string) { - await this._taskRunProcess?.cancel(); + if (!this._taskRunProcess) { + console.error("No task run process to cancel attempt", { attemptId }); + return; + } + + await this._taskRunProcess.cancel(); } async executeTaskRunLazyAttempt(payload: TaskRunExecutionLazyAttemptPayload) { @@ -701,6 +733,8 @@ class TaskRunProcess { } async cleanup(kill = false, gracefulExitTimeoutElapsed = false) { + console.log("cleanup()", { kill, gracefulExitTimeoutElapsed }); + if (kill && this._isBeingKilled) { return; } @@ -715,6 +749,11 @@ class TaskRunProcess { // Kill parent unless graceful exit timeout has elapsed and we're in the middle of an execution const killParentProcess = kill && !killChildProcess; + console.log("Cleaning up task run process", { + killChildProcess, + killParentProcess, + }); + await this._ipc?.sendWithAck("CLEANUP", { flush: true, kill: killParentProcess, @@ -780,9 +819,13 @@ class TaskRunProcess { } async #handleExit(code: number | null, signal: NodeJS.Signals | null) { + console.log("handling child exit", { code, signal }); + // Go through all the attempts currently pending and reject them for (const [id, status] of this._attemptStatuses.entries()) { if (status === "PENDING") { + console.log("found pending attempt", { id }); + this._attemptStatuses.set(id, "REJECTED"); const attemptPromise = this._attemptPromises.get(id); diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 71d36eea67c..d59132e83d5 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -134,7 +134,9 @@ class ProdWorker { }); } } catch (error) { - logger.error("taskinfo read error during reconnect", { error }); + logger.error("taskinfo read error during reconnect", { + error: error instanceof Error ? error.message : error, + }); } finally { this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost); } @@ -181,6 +183,8 @@ class ProdWorker { } ); + logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled }); + if (checkpointCanceled) { if (message.reason === "WAIT_FOR_DURATION") { // Worker will resume immediately @@ -385,11 +389,8 @@ class ProdWorker { extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId; } - logger.log("connecting to coordinator", { - host, - port: COORDINATOR_PORT, - extraHeaders, - }); + logger.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`); + logger.debug(`connecting with extra headers`, { extraHeaders }); const coordinatorConnection = new ZodSocketConnection({ namespace: "prod-worker", @@ -574,6 +575,8 @@ class ProdWorker { }, }, onConnection: async (socket, handler, sender, logger) => { + logger.log("connected to coordinator", { status: this.#status }); + if (this.waitForPostStart) { logger.log("skip connection handler, waiting for post start hook"); return; @@ -581,7 +584,7 @@ class ProdWorker { if (this.paused) { if (!this.nextResumeAfter) { - logger.error("Missing next resume reason"); + logger.error("Missing next resume reason", { status: this.#status }); this.#emitUnrecoverableError( "NoNextResume", @@ -592,7 +595,7 @@ class ProdWorker { } if (!this.attemptFriendlyId) { - logger.error("Missing friendly ID"); + logger.error("Missing friendly ID", { status: this.#status }); this.#emitUnrecoverableError( "NoAttemptId", diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index 4cd16df1ea5..10c20912a10 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -856,6 +856,7 @@ export const ProdWorkerToCoordinatorMessages = { }, }; +// TODO: The coordinator can only safely use v1 worker messages, higher versions will need a new flag, e.g. SUPPORTS_VERSIONED_MESSAGES export const CoordinatorToProdWorkerMessages = { RESUME_AFTER_DEPENDENCY: { message: z.object({ From 839b349945204c6296bf94a54f4de06a1feed8fe Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 28 May 2024 14:26:31 +0100 Subject: [PATCH 50/57] log checkpoint sizes --- apps/coordinator/src/index.ts | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index 0a5332423e3..c371c220128 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -1,4 +1,5 @@ import { createServer } from "node:http"; +import fs from "node:fs/promises"; import { $, type ExecaChildProcess } from "execa"; import { nanoid } from "nanoid"; import { Server } from "socket.io"; @@ -62,6 +63,36 @@ function isExecaChildProcess(maybeExeca: unknown): maybeExeca is Awaited { + try { + const stats = await fs.stat(filePath); + return stats.size; + } catch (error) { + console.error("Error getting file size:", error); + return -1; + } +} + +async function getParsedFileSize(filePath: string) { + const sizeInBytes = await getFileSize(filePath); + + let message = `Size in bytes: ${sizeInBytes}`; + + if (sizeInBytes > 1024 * 1024) { + const sizeInMB = (sizeInBytes / 1024 / 1024).toFixed(2); + message = `Size in MB (rounded): ${sizeInMB}`; + } else if (sizeInBytes > 1024) { + const sizeInKB = (sizeInBytes / 1024).toFixed(2); + message = `Size in KB (rounded): ${sizeInKB}`; + } + + return { + path: filePath, + sizeInBytes, + message, + }; +} + class Checkpointer { #initialized = false; #canCheckpoint = false; @@ -314,6 +345,10 @@ class Checkpointer { this.#logger.debug(await $$`crictl checkpoint --export=${exportLocation} ${containerId}`); const postCheckpoint = performance.now(); + // Print checkpoint size + const size = await getParsedFileSize(exportLocation); + this.#logger.log("checkpoint archive created", { size, options }); + // Create image from checkpoint const container = this.#logger.debug(await $$`buildah from scratch`); const postFrom = performance.now(); From 16a365fd6f63b870d1cebc4e9de6d4f36f4d6089 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 28 May 2024 15:00:15 +0100 Subject: [PATCH 51/57] add lazy attempts catalog entries --- .../v3-catalog/src/trigger/lazyAttempts.ts | 394 ++++++++++++++++++ references/v3-catalog/trigger.config.ts | 2 +- 2 files changed, 395 insertions(+), 1 deletion(-) create mode 100644 references/v3-catalog/src/trigger/lazyAttempts.ts diff --git a/references/v3-catalog/src/trigger/lazyAttempts.ts b/references/v3-catalog/src/trigger/lazyAttempts.ts new file mode 100644 index 00000000000..c84207b601b --- /dev/null +++ b/references/v3-catalog/src/trigger/lazyAttempts.ts @@ -0,0 +1,394 @@ +import { logger, task, wait } from "@trigger.dev/sdk/v3"; + +export const lazyImmediate = task({ + id: "lazy-immediate", + run: async (payload: { forceError?: boolean }) => { + logger.info("Log something", { payload }); + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyWait = task({ + id: "lazy-wait", + run: async (payload: { forceError?: boolean; delayInSeconds?: number }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazySingleDependency = task({ + id: "lazy-single-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyBatchDependency = task({ + id: "lazy-batch-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyConsecutiveWaits = task({ + id: "lazy-consecutive-waits", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyConsecutiveDependencies = task({ + id: "lazy-consecutive-dependencies", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result #1", { result }); + + logger.info("Log something else", { payload }); + + const result2 = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result #2", { result2 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyConsecutiveBatchDependencies = task({ + id: "lazy-consecutive-batch-dependencies", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results #1", { results }); + + logger.info("Log something else", { payload }); + + const results2 = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results #2", { results2 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyWaitThenSingleDependency = task({ + id: "lazy-wait-then-single-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyWaitThenBatchDependency = task({ + id: "lazy-wait-then-batch-dependency", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazySingleDependencyThenWait = task({ + id: "lazy-single-dependency-then-wait", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazySingleDependencyThenBatch = task({ + id: "lazy-single-dependency-then-batch", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyBatchDependencyThenWait = task({ + id: "lazy-batch-dependency-then-wait", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else", { payload }); + + await wait.for({ seconds: payload.delayInSeconds ?? 1 }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); + +export const lazyBatchDependencyThenSingle = task({ + id: "lazy-batch-dependency-then-single", + run: async (payload: { + forceError?: boolean; + forceChildError?: boolean; + delayInSeconds?: number; + }) => { + logger.info("Log something", { payload }); + + const results = await lazyWait.batchTriggerAndWait([ + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + { payload: { delayInSeconds: payload.delayInSeconds, forceError: payload.forceChildError } }, + ]); + logger.info("Batch results", { results }); + + logger.info("Log something else", { payload }); + + const result = await lazyWait.triggerAndWait({ + delayInSeconds: payload.delayInSeconds, + forceError: payload.forceChildError, + }); + logger.info("Single result", { result }); + + logger.info("Log something else again", { payload }); + + if (payload.forceError) { + throw new Error("Forced error"); + } + + return { + message: "This is a message", + payload, + }; + }, +}); diff --git a/references/v3-catalog/trigger.config.ts b/references/v3-catalog/trigger.config.ts index 649a349a66f..c1f504f2eb9 100644 --- a/references/v3-catalog/trigger.config.ts +++ b/references/v3-catalog/trigger.config.ts @@ -37,7 +37,7 @@ export const config: TriggerConfig = { retries: { enabledInDev: true, default: { - maxAttempts: 3, + maxAttempts: 4, minTimeoutInMs: 1000, maxTimeoutInMs: 10000, factor: 2, From d137e4e1fe64e68d44e58bb8f6cf3e2f2dc8b4d4 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 28 May 2024 16:07:31 +0100 Subject: [PATCH 52/57] Fixed merge issue: use zodFetch, not wrapZodFetch --- packages/cli-v3/src/apiClient.ts | 100 +++++++++++++++++-------------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts index 3702ef97291..1e4719dfac4 100644 --- a/packages/cli-v3/src/apiClient.ts +++ b/packages/cli-v3/src/apiClient.ts @@ -17,9 +17,7 @@ import { ImportEnvironmentVariablesRequestBody, EnvironmentVariableResponseBody, TaskRunExecution, - APIError, } from "@trigger.dev/core/v3"; -import { zodfetch } from "@trigger.dev/core/v3/zodfetch"; export class CliApiClient { private readonly apiURL: string; @@ -32,7 +30,7 @@ export class CliApiClient { } async createAuthorizationCode() { - return wrapZodFetch( + return zodfetch( CreateAuthorizationCodeResponseSchema, `${this.apiURL}/api/v1/authorization-code`, { @@ -42,7 +40,7 @@ export class CliApiClient { } async getPersonalAccessToken(authorizationCode: string) { - return wrapZodFetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { + return zodfetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { method: "POST", body: JSON.stringify({ authorizationCode, @@ -55,7 +53,7 @@ export class CliApiClient { throw new Error("whoAmI: No access token"); } - return wrapZodFetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { + return zodfetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -68,7 +66,7 @@ export class CliApiClient { throw new Error("getProject: No access token"); } - return wrapZodFetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { + return zodfetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -81,7 +79,7 @@ export class CliApiClient { throw new Error("getProjects: No access token"); } - return wrapZodFetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { + return zodfetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -94,7 +92,7 @@ export class CliApiClient { throw new Error("createBackgroundWorker: No access token"); } - return wrapZodFetch( + return zodfetch( CreateBackgroundWorkerResponse, `${this.apiURL}/api/v1/projects/${projectRef}/background-workers`, { @@ -113,7 +111,7 @@ export class CliApiClient { throw new Error("creatTaskRunAttempt: No access token"); } - return wrapZodFetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { + return zodfetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -133,16 +131,12 @@ export class CliApiClient { throw new Error("getProjectDevEnv: No access token"); } - return wrapZodFetch( - GetProjectEnvResponse, - `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, - { - headers: { - Authorization: `Bearer ${this.accessToken}`, - "Content-Type": "application/json", - }, - } - ); + return zodfetch(GetProjectEnvResponse, `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, { + headers: { + Authorization: `Bearer ${this.accessToken}`, + "Content-Type": "application/json", + }, + }); } async getEnvironmentVariables(projectRef: string) { @@ -150,7 +144,7 @@ export class CliApiClient { throw new Error("getEnvironmentVariables: No access token"); } - return wrapZodFetch( + return zodfetch( GetEnvironmentVariablesResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}/envvars`, { @@ -190,7 +184,7 @@ export class CliApiClient { throw new Error("initializeDeployment: No access token"); } - return wrapZodFetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { + return zodfetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -205,7 +199,7 @@ export class CliApiClient { throw new Error("startDeploymentIndexing: No access token"); } - return wrapZodFetch( + return zodfetch( StartDeploymentIndexingResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}/start-indexing`, { @@ -224,7 +218,7 @@ export class CliApiClient { throw new Error("getDeployment: No access token"); } - return wrapZodFetch( + return zodfetch( GetDeploymentResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}`, { @@ -244,42 +238,56 @@ type ApiResult = error: string; }; -async function wrapZodFetch( - schema: T, +async function zodfetch( + schema: z.Schema, url: string, requestInit?: RequestInit -): Promise>> { +): Promise> { try { - const response = await zodfetch(schema, url, requestInit, { - retry: { - minTimeoutInMs: 500, - maxTimeoutInMs: 5000, - maxAttempts: 3, - factor: 2, - randomize: false, - }, - }); + const response = await fetch(url, requestInit); - return { - success: true, - data: response, - }; - } catch (error) { - if (error instanceof APIError) { + if ((!requestInit || requestInit.method === "GET") && response.status === 404) { return { success: false, - error: error.message, + error: `404: ${response.statusText}`, }; - } else if (error instanceof Error) { + } + + if (response.status >= 400 && response.status < 500) { + const body = await response.json(); + if (!body.error) { + return { success: false, error: "Something went wrong" }; + } + + return { success: false, error: body.error }; + } + + if (response.status !== 200) { return { success: false, - error: error.message, + error: `Failed to fetch ${url}, got status code ${response.status}`, }; - } else { + } + + const jsonBody = await response.json(); + const parsedResult = schema.safeParse(jsonBody); + + if (parsedResult.success) { + return { success: true, data: parsedResult.data }; + } + + if ("error" in jsonBody) { return { success: false, - error: String(error), + error: typeof jsonBody.error === "string" ? jsonBody.error : JSON.stringify(jsonBody.error), }; } + + return { success: false, error: parsedResult.error.message }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error.message : JSON.stringify(error), + }; } } From 79b47fd5d1c4f3017e85bcbfbe37baa853d25e27 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 28 May 2024 16:16:34 +0100 Subject: [PATCH 53/57] Revert "Fixed merge issue: use zodFetch, not wrapZodFetch" This reverts commit d137e4e1fe64e68d44e58bb8f6cf3e2f2dc8b4d4. --- packages/cli-v3/src/apiClient.ts | 100 ++++++++++++++----------------- 1 file changed, 46 insertions(+), 54 deletions(-) diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts index 1e4719dfac4..3702ef97291 100644 --- a/packages/cli-v3/src/apiClient.ts +++ b/packages/cli-v3/src/apiClient.ts @@ -17,7 +17,9 @@ import { ImportEnvironmentVariablesRequestBody, EnvironmentVariableResponseBody, TaskRunExecution, + APIError, } from "@trigger.dev/core/v3"; +import { zodfetch } from "@trigger.dev/core/v3/zodfetch"; export class CliApiClient { private readonly apiURL: string; @@ -30,7 +32,7 @@ export class CliApiClient { } async createAuthorizationCode() { - return zodfetch( + return wrapZodFetch( CreateAuthorizationCodeResponseSchema, `${this.apiURL}/api/v1/authorization-code`, { @@ -40,7 +42,7 @@ export class CliApiClient { } async getPersonalAccessToken(authorizationCode: string) { - return zodfetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { + return wrapZodFetch(GetPersonalAccessTokenResponseSchema, `${this.apiURL}/api/v1/token`, { method: "POST", body: JSON.stringify({ authorizationCode, @@ -53,7 +55,7 @@ export class CliApiClient { throw new Error("whoAmI: No access token"); } - return zodfetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { + return wrapZodFetch(WhoAmIResponseSchema, `${this.apiURL}/api/v2/whoami`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -66,7 +68,7 @@ export class CliApiClient { throw new Error("getProject: No access token"); } - return zodfetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { + return wrapZodFetch(GetProjectResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -79,7 +81,7 @@ export class CliApiClient { throw new Error("getProjects: No access token"); } - return zodfetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { + return wrapZodFetch(GetProjectsResponseBody, `${this.apiURL}/api/v1/projects`, { headers: { Authorization: `Bearer ${this.accessToken}`, "Content-Type": "application/json", @@ -92,7 +94,7 @@ export class CliApiClient { throw new Error("createBackgroundWorker: No access token"); } - return zodfetch( + return wrapZodFetch( CreateBackgroundWorkerResponse, `${this.apiURL}/api/v1/projects/${projectRef}/background-workers`, { @@ -111,7 +113,7 @@ export class CliApiClient { throw new Error("creatTaskRunAttempt: No access token"); } - return zodfetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { + return wrapZodFetch(TaskRunExecution, `${this.apiURL}/api/v1/runs/${runFriendlyId}/attempts`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -131,12 +133,16 @@ export class CliApiClient { throw new Error("getProjectDevEnv: No access token"); } - return zodfetch(GetProjectEnvResponse, `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, { - headers: { - Authorization: `Bearer ${this.accessToken}`, - "Content-Type": "application/json", - }, - }); + return wrapZodFetch( + GetProjectEnvResponse, + `${this.apiURL}/api/v1/projects/${projectRef}/${env}`, + { + headers: { + Authorization: `Bearer ${this.accessToken}`, + "Content-Type": "application/json", + }, + } + ); } async getEnvironmentVariables(projectRef: string) { @@ -144,7 +150,7 @@ export class CliApiClient { throw new Error("getEnvironmentVariables: No access token"); } - return zodfetch( + return wrapZodFetch( GetEnvironmentVariablesResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}/envvars`, { @@ -184,7 +190,7 @@ export class CliApiClient { throw new Error("initializeDeployment: No access token"); } - return zodfetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { + return wrapZodFetch(InitializeDeploymentResponseBody, `${this.apiURL}/api/v1/deployments`, { method: "POST", headers: { Authorization: `Bearer ${this.accessToken}`, @@ -199,7 +205,7 @@ export class CliApiClient { throw new Error("startDeploymentIndexing: No access token"); } - return zodfetch( + return wrapZodFetch( StartDeploymentIndexingResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}/start-indexing`, { @@ -218,7 +224,7 @@ export class CliApiClient { throw new Error("getDeployment: No access token"); } - return zodfetch( + return wrapZodFetch( GetDeploymentResponseBody, `${this.apiURL}/api/v1/deployments/${deploymentId}`, { @@ -238,56 +244,42 @@ type ApiResult = error: string; }; -async function zodfetch( - schema: z.Schema, +async function wrapZodFetch( + schema: T, url: string, requestInit?: RequestInit -): Promise> { +): Promise>> { try { - const response = await fetch(url, requestInit); + const response = await zodfetch(schema, url, requestInit, { + retry: { + minTimeoutInMs: 500, + maxTimeoutInMs: 5000, + maxAttempts: 3, + factor: 2, + randomize: false, + }, + }); - if ((!requestInit || requestInit.method === "GET") && response.status === 404) { + return { + success: true, + data: response, + }; + } catch (error) { + if (error instanceof APIError) { return { success: false, - error: `404: ${response.statusText}`, + error: error.message, }; - } - - if (response.status >= 400 && response.status < 500) { - const body = await response.json(); - if (!body.error) { - return { success: false, error: "Something went wrong" }; - } - - return { success: false, error: body.error }; - } - - if (response.status !== 200) { + } else if (error instanceof Error) { return { success: false, - error: `Failed to fetch ${url}, got status code ${response.status}`, + error: error.message, }; - } - - const jsonBody = await response.json(); - const parsedResult = schema.safeParse(jsonBody); - - if (parsedResult.success) { - return { success: true, data: parsedResult.data }; - } - - if ("error" in jsonBody) { + } else { return { success: false, - error: typeof jsonBody.error === "string" ? jsonBody.error : JSON.stringify(jsonBody.error), + error: String(error), }; } - - return { success: false, error: parsedResult.error.message }; - } catch (error) { - return { - success: false, - error: error instanceof Error ? error.message : JSON.stringify(error), - }; } } From 23eb9183e9a370b7ea594a9942a255d30ecf9fa2 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 28 May 2024 16:18:32 +0100 Subject: [PATCH 54/57] importEnvVars uses wrapZodFetch now --- packages/cli-v3/src/apiClient.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/cli-v3/src/apiClient.ts b/packages/cli-v3/src/apiClient.ts index 3702ef97291..6b6fbcacbd1 100644 --- a/packages/cli-v3/src/apiClient.ts +++ b/packages/cli-v3/src/apiClient.ts @@ -171,7 +171,7 @@ export class CliApiClient { throw new Error("importEnvVars: No access token"); } - return zodfetch( + return wrapZodFetch( EnvironmentVariableResponseBody, `${this.apiURL}/api/v1/projects/${projectRef}/envvars/${slug}/import`, { From 0e7e0df3e0d667d84c21de374ebbbb0ff863d92b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 29 May 2024 12:11:56 +0100 Subject: [PATCH 55/57] add backwards compat for retries without checkpoints --- .../app/v3/services/completeAttempt.server.ts | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/v3/services/completeAttempt.server.ts b/apps/webapp/app/v3/services/completeAttempt.server.ts index bd4e6a94d22..93d491581e5 100644 --- a/apps/webapp/app/v3/services/completeAttempt.server.ts +++ b/apps/webapp/app/v3/services/completeAttempt.server.ts @@ -211,7 +211,12 @@ export class CompleteAttemptService extends BaseService { } if (!checkpoint) { - await this.#retryAttempt(taskRunAttempt.taskRun, completion.retry.timestamp); + await this.#retryAttempt( + taskRunAttempt.taskRun, + completion.retry.timestamp, + undefined, + taskRunAttempt.backgroundWorker.supportsLazyAttempts + ); return "RETRIED"; } @@ -329,8 +334,13 @@ export class CompleteAttemptService extends BaseService { } } - async #retryAttempt(run: TaskRun, retryTimestamp: number, checkpointEventId?: string) { - if (checkpointEventId) { + async #retryAttempt( + run: TaskRun, + retryTimestamp: number, + checkpointEventId?: string, + supportsLazyAttempts?: boolean + ) { + if (checkpointEventId || !supportsLazyAttempts) { // We have to replace a potential RESUME with EXECUTE to correctly retry the attempt return await marqs?.replaceMessage( run.id, @@ -342,7 +352,8 @@ export class CompleteAttemptService extends BaseService { retryTimestamp ); } else { - // There's no checkpoint so the worker is still running and waiting for this retry message + // There's no checkpoint so the worker is still running and waiting for a retry message + // It supports lazy attempts so we can bypass the queue and send the message directly to the worker RetryAttemptService.enqueue(run.id, this._prisma, new Date(retryTimestamp)); } } @@ -377,6 +388,7 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId: include: { taskRun: true, backgroundWorkerTask: true, + backgroundWorker: true, }, }); } From 66c91864bdba4d057a067104597f06681c1890ed Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 29 May 2024 13:35:07 +0100 Subject: [PATCH 56/57] handle more cases of unrecoverable runs --- apps/coordinator/src/index.ts | 44 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts index c371c220128..1bdbc4bc445 100644 --- a/apps/coordinator/src/index.ts +++ b/apps/coordinator/src/index.ts @@ -673,6 +673,20 @@ class TaskCoordinator { onConnection: async (socket, handler, sender) => { const logger = new SimpleLogger(`[prod-worker][${socket.id}]`); + const crashRun = async (error: { name: string; message: string; stack?: string }) => { + try { + this.#platformSocket?.send("RUN_CRASHED", { + version: "v1", + runId: socket.data.runId, + error, + }); + } finally { + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + } + }; + const checkpointInProgress = () => { return this.#checkpointableTasks.has(socket.data.runId); }; @@ -741,8 +755,9 @@ class TaskCoordinator { if (!executionAck) { logger.error("no execution ack", { runId: socket.data.runId }); - socket.emit("REQUEST_EXIT", { - version: "v1", + await crashRun({ + name: "ReadyForExecutionError", + message: "No execution ack", }); return; @@ -751,8 +766,9 @@ class TaskCoordinator { if (!executionAck.success) { logger.error("failed to get execution payload", { runId: socket.data.runId }); - socket.emit("REQUEST_EXIT", { - version: "v1", + await crashRun({ + name: "ReadyForExecutionError", + message: "Failed to get execution payload", }); return; @@ -781,8 +797,9 @@ class TaskCoordinator { if (!lazyAttempt) { logger.error("no lazy attempt ack", { runId: socket.data.runId }); - socket.emit("REQUEST_EXIT", { - version: "v1", + await crashRun({ + name: "ReadyForLazyAttemptError", + message: "No lazy attempt ack", }); return; @@ -791,8 +808,9 @@ class TaskCoordinator { if (!lazyAttempt.success) { logger.error("failed to get lazy attempt payload", { runId: socket.data.runId }); - socket.emit("REQUEST_EXIT", { - version: "v1", + await crashRun({ + name: "ReadyForLazyAttemptError", + message: "Failed to get lazy attempt payload", }); return; @@ -1140,15 +1158,7 @@ class TaskCoordinator { socket.on("UNRECOVERABLE_ERROR", async (message) => { logger.log("[UNRECOVERABLE_ERROR]", message); - this.#platformSocket?.send("RUN_CRASHED", { - version: "v1", - runId: socket.data.runId, - error: message.error, - }); - - socket.emit("REQUEST_EXIT", { - version: "v1", - }); + await crashRun(message.error); }); }, onDisconnect: async (socket, handler, sender, logger) => { From 2099d9162d36605dbb59baab1625bd4a32842d0b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 29 May 2024 16:00:12 +0100 Subject: [PATCH 57/57] don't kill the child process if it shouldn't be killed --- packages/cli-v3/src/workers/dev/backgroundWorker.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/cli-v3/src/workers/dev/backgroundWorker.ts b/packages/cli-v3/src/workers/dev/backgroundWorker.ts index b27617a5780..c912aaa4691 100644 --- a/packages/cli-v3/src/workers/dev/backgroundWorker.ts +++ b/packages/cli-v3/src/workers/dev/backgroundWorker.ts @@ -865,6 +865,10 @@ class TaskRunProcess { // FIXME: Something broke READY_TO_DISPOSE. We never receive it, so we always have to kill the process after the timeout below. + if (!kill) { + return; + } + // Set a timeout to kill the child process if it hasn't been killed within 5 seconds setTimeout(() => { if (this._child && !this._child.killed) {