From 1b3ed0685ac570c98f04a3bdd19e1c4f5b1efc08 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 28 Jun 2024 18:17:02 +0100 Subject: [PATCH 1/7] Await file watcher cleanup in dev --- .changeset/purple-spiders-care.md | 5 +++++ packages/cli-v3/src/commands/dev.tsx | 22 +++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 .changeset/purple-spiders-care.md diff --git a/.changeset/purple-spiders-care.md b/.changeset/purple-spiders-care.md new file mode 100644 index 00000000000..c73d3383b52 --- /dev/null +++ b/.changeset/purple-spiders-care.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Await file watcher cleanup in dev diff --git a/packages/cli-v3/src/commands/dev.tsx b/packages/cli-v3/src/commands/dev.tsx index 7828fcdc9b3..8ca8359de03 100644 --- a/packages/cli-v3/src/commands/dev.tsx +++ b/packages/cli-v3/src/commands/dev.tsx @@ -760,15 +760,23 @@ function useDev({ }); return () => { - logger.debug(`Shutting down dev session for ${config.project}`); + const cleanup = async () => { + logger.debug(`Shutting down dev session for ${config.project}`); - taskFileWatcher.close(); + const start = Date.now(); - websocket?.close(); - backgroundWorkerCoordinator.close(); - ctx?.dispose().catch((error) => { - console.error(error); - }); + await taskFileWatcher.close(); + + websocket?.close(); + backgroundWorkerCoordinator.close(); + ctx?.dispose().catch((error) => { + console.error(error); + }); + + logger.debug(`Shutdown completed in ${Date.now() - start}ms`); + }; + + cleanup(); }; }, [config, apiUrl, apiKey, environmentClient]); } From 907bcbf8cb0d371af0f2fab22fc3aa1b9c598875 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 28 Jun 2024 18:46:25 +0100 Subject: [PATCH 2/7] Fix artifact detection logs --- packages/cli-v3/src/utilities/getUserPackageManager.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/cli-v3/src/utilities/getUserPackageManager.ts b/packages/cli-v3/src/utilities/getUserPackageManager.ts index 2e8d7cd1a05..d50dcf5b83f 100644 --- a/packages/cli-v3/src/utilities/getUserPackageManager.ts +++ b/packages/cli-v3/src/utilities/getUserPackageManager.ts @@ -62,6 +62,7 @@ export async function detectPackageManagerFromArtifacts(path: string): Promise

Date: Sat, 29 Jun 2024 11:01:51 +0100 Subject: [PATCH 3/7] Fix next runs table when schedule disabled --- .../app/components/runs/v3/TaskRunsTable.tsx | 2 +- .../route.tsx | 45 ++++++++++++++----- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/apps/webapp/app/components/runs/v3/TaskRunsTable.tsx b/apps/webapp/app/components/runs/v3/TaskRunsTable.tsx index 7d0e526594f..2a6ac4fe9f5 100644 --- a/apps/webapp/app/components/runs/v3/TaskRunsTable.tsx +++ b/apps/webapp/app/components/runs/v3/TaskRunsTable.tsx @@ -125,7 +125,7 @@ export function TaskRunsTable({ {total === 0 && !hasFilters ? ( - + {!isLoading && } ) : runs.length === 0 ? ( diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.v3.$projectParam.schedules.$scheduleParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.v3.$projectParam.schedules.$scheduleParam/route.tsx index 373d25a0157..f0440799b35 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.v3.$projectParam.schedules.$scheduleParam/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.v3.$projectParam.schedules.$scheduleParam/route.tsx @@ -23,6 +23,7 @@ import { Paragraph } from "~/components/primitives/Paragraph"; import { Property, PropertyTable } from "~/components/primitives/PropertyTable"; import { Table, + TableBlankRow, TableBody, TableCell, TableHeader, @@ -180,6 +181,14 @@ export const action = async ({ request, params }: ActionFunctionArgs) => { } }; +function PlaceholderText({ title }: { title: string }) { + return ( +

+ {title} +
+ ); +} + export default function Page() { const { schedule } = useTypedLoaderData(); const location = useLocation(); @@ -252,18 +261,30 @@ export default function Page() { - {schedule.nextRuns.map((run, index) => ( - - {!isUtc && ( - - - - )} - - - - - ))} + {schedule.active ? ( + schedule.nextRuns.length ? ( + schedule.nextRuns.map((run, index) => ( + + {!isUtc && ( + + + + )} + + + + + )) + ) : ( + + + + ) + ) : ( + + + + )} From 54b73d27fe20d3d81056755e37607c763d3771b6 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Sat, 29 Jun 2024 11:13:16 +0100 Subject: [PATCH 4/7] Improve OOM error messages --- packages/cli-v3/src/workers/common/errors.ts | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/packages/cli-v3/src/workers/common/errors.ts b/packages/cli-v3/src/workers/common/errors.ts index 5be691dfc7d..265caea8843 100644 --- a/packages/cli-v3/src/workers/common/errors.ts +++ b/packages/cli-v3/src/workers/common/errors.ts @@ -69,7 +69,8 @@ export class GracefulExitTimeoutError extends Error { export function getFriendlyErrorMessage( code: number, signal: NodeJS.Signals | null, - stderr: string | undefined + stderr: string | undefined, + dockerMode = true ) { const message = (text: string) => { if (signal) { @@ -79,7 +80,20 @@ export function getFriendlyErrorMessage( } }; - if (code === 137 || stderr?.includes("OOMErrorHandler")) { + if (code === 137) { + if (dockerMode) { + return message( + "Process ran out of memory! Try choosing a machine preset with more memory for this task." + ); + } else { + // Note: containerState reason and message should be checked to clarify the error + return message( + "Process most likely ran out of memory, but we can't be certain. Try choosing a machine preset with more memory for this task." + ); + } + } + + if (stderr?.includes("OOMErrorHandler")) { return message( "Process ran out of memory! Try choosing a machine preset with more memory for this task." ); From 9d20ef7c910546f93b9e6fd4d50b513bd67b8c12 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Sat, 29 Jun 2024 11:14:58 +0100 Subject: [PATCH 5/7] Add test link to completed deployment message --- packages/cli-v3/src/commands/deploy.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/cli-v3/src/commands/deploy.ts b/packages/cli-v3/src/commands/deploy.ts index add7f4e688c..e7bdc096ba8 100644 --- a/packages/cli-v3/src/commands/deploy.ts +++ b/packages/cli-v3/src/commands/deploy.ts @@ -441,6 +441,13 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) { `${authorization.dashboardUrl}/projects/v3/${resolvedConfig.config.project}/deployments/${finishedDeployment.shortCode}` ); + const testLink = cliLink( + "Test tasks", + `${authorization.dashboardUrl}/projects/v3/${resolvedConfig.config.project}/test?environment=${ + options.env === "prod" ? "prod" : "stg" + }` + ); + switch (finishedDeployment.status) { case "DEPLOYED": { if (warnings.warnings.length > 0) { @@ -461,7 +468,7 @@ async function _deployCommand(dir: string, options: DeployCommandOptions) { outro( `Version ${version} deployed with ${taskCount} detected task${ taskCount === 1 ? "" : "s" - } ${deploymentLink}` + } | ${deploymentLink} | ${testLink}` ); } From 0f6e24d3008201064e7cd69025b0732df380f59f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Sat, 29 Jun 2024 12:35:49 +0100 Subject: [PATCH 6/7] Fix OOM detection, again --- apps/kubernetes-provider/src/index.ts | 16 +++---- apps/kubernetes-provider/src/taskMonitor.ts | 42 +++++++++---------- .../app/v3/services/crashTaskRun.server.ts | 3 +- .../services/deploymentIndexFailed.server.ts | 12 +++++- .../cli-v3/src/workers/prod/entry-point.ts | 27 ++++++++---- packages/core-apps/src/process.ts | 2 + packages/core-apps/src/provider.ts | 3 +- packages/core/src/v3/schemas/messages.ts | 2 + 8 files changed, 67 insertions(+), 40 deletions(-) create mode 100644 packages/core-apps/src/process.ts diff --git a/apps/kubernetes-provider/src/index.ts b/apps/kubernetes-provider/src/index.ts index a310f63c01b..0648bc5a842 100644 --- a/apps/kubernetes-provider/src/index.ts +++ b/apps/kubernetes-provider/src/index.ts @@ -529,27 +529,27 @@ provider.listen(); const taskMonitor = new TaskMonitor({ runtimeEnv: RUNTIME_ENV, - onIndexFailure: async (deploymentId, failureInfo) => { - logger.log("Indexing failed", { deploymentId, failureInfo }); + onIndexFailure: async (deploymentId, details) => { + logger.log("Indexing failed", { deploymentId, details }); try { provider.platformSocket.send("INDEXING_FAILED", { deploymentId, error: { - name: `Crashed with exit code ${failureInfo.exitCode}`, - message: failureInfo.reason, - stack: failureInfo.logs, + name: `Crashed with exit code ${details.exitCode}`, + message: details.reason, + stack: details.logs, }, }); } catch (error) { logger.error(error); } }, - onRunFailure: async (runId, failureInfo) => { - logger.log("Run failed:", { runId, failureInfo }); + onRunFailure: async (runId, details) => { + logger.log("Run failed:", { runId, details }); try { - provider.platformSocket.send("WORKER_CRASHED", { runId, ...failureInfo }); + provider.platformSocket.send("WORKER_CRASHED", { runId, ...details }); } catch (error) { logger.error(error); } diff --git a/apps/kubernetes-provider/src/taskMonitor.ts b/apps/kubernetes-provider/src/taskMonitor.ts index 20fdb64f207..1554fd46c85 100644 --- a/apps/kubernetes-provider/src/taskMonitor.ts +++ b/apps/kubernetes-provider/src/taskMonitor.ts @@ -1,25 +1,20 @@ import * as k8s from "@kubernetes/client-node"; import { SimpleLogger } from "@trigger.dev/core-apps"; +import { EXIT_CODE_ALREADY_HANDLED, EXIT_CODE_CHILD_NONZERO } from "@trigger.dev/core-apps/process"; import { setTimeout } from "timers/promises"; import PQueue from "p-queue"; +import type { Prettify } from "@trigger.dev/core/v3"; -type IndexFailureHandler = ( - deploymentId: string, - failureInfo: { - exitCode: number; - reason: string; - logs: string; - } -) => Promise; - -type RunFailureHandler = ( - runId: string, - failureInfo: { - exitCode: number; - reason: string; - logs: string; - } -) => Promise; +type FailureDetails = Prettify<{ + exitCode: number; + reason: string; + logs: string; + overrideCompletion: boolean; +}>; + +type IndexFailureHandler = (deploymentId: string, details: FailureDetails) => Promise; + +type RunFailureHandler = (runId: string, details: FailureDetails) => Promise; type TaskMonitorOptions = { runtimeEnv: "local" | "kubernetes"; @@ -144,8 +139,7 @@ export class TaskMonitor { const containerState = this.#getContainerStateSummary(containerStatus.state); const exitCode = containerState.exitCode ?? -1; - // We use this special exit code to signal any errors were already handled elsewhere - if (exitCode === 111) { + if (exitCode === EXIT_CODE_ALREADY_HANDLED) { return; } @@ -162,6 +156,7 @@ export class TaskMonitor { let reason = rawReason || "Unknown error"; let logs = rawLogs || ""; + let overrideCompletion = false; switch (rawReason) { case "Error": @@ -181,8 +176,10 @@ export class TaskMonitor { } break; case "OOMKilled": - reason = - "Process ran out of memory! Try choosing a machine preset with more memory for this task."; + overrideCompletion = true; + reason = `${ + exitCode === EXIT_CODE_CHILD_NONZERO ? "Child process" : "Parent process" + } ran out of memory! Try choosing a machine preset with more memory for this task.`; break; default: break; @@ -192,7 +189,8 @@ export class TaskMonitor { exitCode, reason, logs, - }; + overrideCompletion, + } satisfies FailureDetails; const app = pod.metadata?.labels?.app; diff --git a/apps/webapp/app/v3/services/crashTaskRun.server.ts b/apps/webapp/app/v3/services/crashTaskRun.server.ts index d7241f5261a..1a26343fafd 100644 --- a/apps/webapp/app/v3/services/crashTaskRun.server.ts +++ b/apps/webapp/app/v3/services/crashTaskRun.server.ts @@ -13,6 +13,7 @@ export type CrashTaskRunServiceOptions = { logs?: string; crashAttempts?: boolean; crashedAt?: Date; + overrideCompletion?: boolean; }; export class CrashTaskRunService extends BaseService { @@ -36,7 +37,7 @@ export class CrashTaskRunService extends BaseService { } // Make sure the task run is in a crashable state - if (!isCrashableRunStatus(taskRun.status)) { + if (!opts.overrideCompletion && !isCrashableRunStatus(taskRun.status)) { logger.error("Task run is not in a crashable state", { runId, status: taskRun.status }); return; } diff --git a/apps/webapp/app/v3/services/deploymentIndexFailed.server.ts b/apps/webapp/app/v3/services/deploymentIndexFailed.server.ts index fc69fa8cace..ef8ac93d612 100644 --- a/apps/webapp/app/v3/services/deploymentIndexFailed.server.ts +++ b/apps/webapp/app/v3/services/deploymentIndexFailed.server.ts @@ -18,7 +18,8 @@ export class DeploymentIndexFailed extends BaseService { message: string; stack?: string; stderr?: string; - } + }, + overrideCompletion = false ) { const isFriendlyId = maybeFriendlyId.startsWith("deployment_"); @@ -38,6 +39,15 @@ export class DeploymentIndexFailed extends BaseService { } if (FINAL_DEPLOYMENT_STATUSES.includes(deployment.status)) { + if (overrideCompletion) { + logger.error("No support for overriding final deployment statuses just yet", { + id: deployment.id, + status: deployment.status, + previousError: deployment.errorData, + incomingError: error, + }); + } + logger.error("Worker deployment already in final state", { id: deployment.id, status: deployment.status, diff --git a/packages/cli-v3/src/workers/prod/entry-point.ts b/packages/cli-v3/src/workers/prod/entry-point.ts index 870899cc79c..53b726d4f19 100644 --- a/packages/cli-v3/src/workers/prod/entry-point.ts +++ b/packages/cli-v3/src/workers/prod/entry-point.ts @@ -5,12 +5,14 @@ import { PreStopCauses, ProdWorkerToCoordinatorMessages, TaskResource, + TaskRunErrorCodes, TaskRunFailedExecutionResult, WaitReason, } from "@trigger.dev/core/v3"; import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket"; import { HttpReply, getRandomPortNumber } from "@trigger.dev/core-apps/http"; import { SimpleLogger } from "@trigger.dev/core-apps/logger"; +import { EXIT_CODE_ALREADY_HANDLED, EXIT_CODE_CHILD_NONZERO } from "@trigger.dev/core-apps/process"; import { readFile } from "node:fs/promises"; import { createServer } from "node:http"; import { ProdBackgroundWorker } from "./backgroundWorker"; @@ -97,12 +99,12 @@ class ProdWorker { logger.log("Unhandled signal", { signal }); } - async #exitGracefully(gracefulExitTimeoutElapsed = false) { + async #exitGracefully(gracefulExitTimeoutElapsed = false, exitCode = 0) { await this.#backgroundWorker.close(gracefulExitTimeoutElapsed); if (!gracefulExitTimeoutElapsed) { // TODO: Maybe add a sensible timeout instead of a conditional to avoid zombies - process.exit(0); + process.exit(exitCode); } } @@ -315,7 +317,11 @@ class ProdWorker { } } - async #prepareForRetry(willCheckpointAndRestore: boolean, shouldExit: boolean) { + async #prepareForRetry( + willCheckpointAndRestore: boolean, + shouldExit: boolean, + exitCode?: number + ) { logger.log("prepare for retry", { willCheckpointAndRestore, shouldExit }); // Graceful shutdown on final attempt @@ -324,7 +330,7 @@ class ProdWorker { logger.log("WARNING: Will checkpoint but also requested exit. This won't end well."); } - await this.#exitGracefully(); + await this.#exitGracefully(false, exitCode); return; } @@ -516,7 +522,14 @@ class ProdWorker { logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit }); - this.#prepareForRetry(willCheckpointAndRestore, shouldExit); + const exitCode = + !completion.ok && + completion.error.type === "INTERNAL_ERROR" && + completion.error.code === TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE + ? EXIT_CODE_CHILD_NONZERO + : 0; + + this.#prepareForRetry(willCheckpointAndRestore, shouldExit, exitCode); } catch (error) { const completion: TaskRunFailedExecutionResult = { ok: false, @@ -709,8 +722,8 @@ class ProdWorker { } await setTimeout(200); - // Use exit code 111 so we can ignore those failures in the task monitor - process.exit(111); + + process.exit(EXIT_CODE_ALREADY_HANDLED); } } diff --git a/packages/core-apps/src/process.ts b/packages/core-apps/src/process.ts new file mode 100644 index 00000000000..5396967a7fa --- /dev/null +++ b/packages/core-apps/src/process.ts @@ -0,0 +1,2 @@ +export const EXIT_CODE_ALREADY_HANDLED = 111; +export const EXIT_CODE_CHILD_NONZERO = 112; \ No newline at end of file diff --git a/packages/core-apps/src/provider.ts b/packages/core-apps/src/provider.ts index 79959a61242..8ce381ba27b 100644 --- a/packages/core-apps/src/provider.ts +++ b/packages/core-apps/src/provider.ts @@ -14,6 +14,7 @@ import { getRandomPortNumber, HttpReply, getTextBody } from "./http"; import { SimpleLogger } from "./logger"; import { isExecaChildProcess } from "./checkpoints"; import { setTimeout } from "node:timers/promises"; +import { EXIT_CODE_ALREADY_HANDLED } from "./process"; const HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || getRandomPortNumber()); const MACHINE_NAME = process.env.MACHINE_NAME || "local"; @@ -198,7 +199,7 @@ export class ProviderShell implements Provider { stderr: error.stderr, }); - if (error.exitCode === 111) { + if (error.exitCode === EXIT_CODE_ALREADY_HANDLED) { logger.error("Index failure already reported by the worker", { socketMessage: message, }); diff --git a/packages/core/src/v3/schemas/messages.ts b/packages/core/src/v3/schemas/messages.ts index ed696edafcd..f2327540e7f 100644 --- a/packages/core/src/v3/schemas/messages.ts +++ b/packages/core/src/v3/schemas/messages.ts @@ -334,6 +334,7 @@ export const ProviderToPlatformMessages = { exitCode: z.number().optional(), message: z.string().optional(), logs: z.string().optional(), + overrideCompletion: z.boolean().optional(), }), }, INDEXING_FAILED: { @@ -346,6 +347,7 @@ export const ProviderToPlatformMessages = { stack: z.string().optional(), stderr: z.string().optional(), }), + overrideCompletion: z.string().optional(), }), }, }; From 7799ef5cf0d6b08ce7cd3a563e2abebe61764df6 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Sat, 29 Jun 2024 12:53:20 +0100 Subject: [PATCH 7/7] Add changeset --- .changeset/tall-masks-repeat.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .changeset/tall-masks-repeat.md diff --git a/.changeset/tall-masks-repeat.md b/.changeset/tall-masks-repeat.md new file mode 100644 index 00000000000..a435652df37 --- /dev/null +++ b/.changeset/tall-masks-repeat.md @@ -0,0 +1,9 @@ +--- +"@trigger.dev/core-apps": patch +"trigger.dev": patch +"@trigger.dev/core": patch +--- + +- Fix artifact detection logs +- Fix OOM detection and error messages +- Add test link to cli deployment completion