From cdfe3343b56d84cbe99000f51655a7495d4cf880 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Mon, 11 May 2026 17:10:15 +0100 Subject: [PATCH] fix(core): retry TASK_PROCESS_SIGSEGV under the user's retry policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SIGSEGV was hard-classified as non-retriable in shouldRetryError on the assumption that it's always a deterministic native crash. For Node tasks that's not reliably true — many production SIGSEGVs are flaky: - Native addon races (sharp, canvas, better-sqlite3, node-rdkafka, bcrypt, etc.) — libuv thread-pool work stepping on V8 handles. Different heap layout / thread schedule on a fresh process, retry often succeeds. - JIT / GC interaction — V8 turbofan deopt or GC during a native callback. Timing-dependent. - Near-OOM in native code — when RSS approaches the cgroup limit, native allocations fail and poorly-written addons dereference NULL → SIGSEGV instead of a clean OOM-kill. A fresh process with cleaner memory often succeeds. - Host / hardware issues — bit flips, kernel quirks. Retry lands on a different host. The codebase was already inconsistent here: shouldLookupRetrySettings listed SIGSEGV as retry-config-aware, but the shouldRetryError gate short-circuited fail_run before that branch could be reached. And we already retry TASK_RUN_UNCAUGHT_EXCEPTION — clearly a user-code bug — under the user's retry policy, so refusing to retry SIGSEGV was the odd one out. Flip TASK_PROCESS_SIGSEGV from the false branch to the true branch in shouldRetryError. The existing retrying.ts pipeline then gates the retry on lockedRetryConfig + maxAttempts — same path SIGTERM and uncaught-exception already use. No new code paths; tasks without a retry policy still fail fast. Tests added in packages/core/test/errors.test.ts lock down the new classification alongside SIGTERM, SIGKILL_TIMEOUT, and the OOM codes (still non-retriable here because OOM has its own machine-bump retry path in retrying.ts that runs before shouldRetryError). Closes TRI-9234. Co-Authored-By: Claude Opus 4.7 (1M context) --- .changeset/retry-sigsegv.md | 5 +++++ packages/core/src/v3/errors.ts | 2 +- packages/core/test/errors.test.ts | 36 ++++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 .changeset/retry-sigsegv.md diff --git a/.changeset/retry-sigsegv.md b/.changeset/retry-sigsegv.md new file mode 100644 index 00000000000..5a53c351efe --- /dev/null +++ b/.changeset/retry-sigsegv.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Retry `TASK_PROCESS_SIGSEGV` task crashes under the user's retry policy instead of failing the run on the first segfault. SIGSEGV in Node tasks is frequently non-deterministic (native addon races, JIT/GC interaction, near-OOM in native code, host issues), so retrying on a fresh process often succeeds. The retry is gated by the task's existing `retry` config + `maxAttempts` — same path `TASK_PROCESS_SIGTERM` and uncaught exceptions already use — so tasks without a retry policy still fail fast. diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index a538ca9357b..90650bbd18f 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -361,7 +361,6 @@ export function shouldRetryError(error: TaskRunError): boolean { case "CONFIGURED_INCORRECTLY": case "TASK_ALREADY_RUNNING": case "TASK_PROCESS_SIGKILL_TIMEOUT": - case "TASK_PROCESS_SIGSEGV": case "TASK_PROCESS_OOM_KILLED": case "TASK_PROCESS_MAYBE_OOM_KILLED": case "TASK_RUN_CANCELLED": @@ -398,6 +397,7 @@ export function shouldRetryError(error: TaskRunError): boolean { case "TASK_RUN_UNCAUGHT_EXCEPTION": case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE": case "TASK_PROCESS_SIGTERM": + case "TASK_PROCESS_SIGSEGV": return true; default: diff --git a/packages/core/test/errors.test.ts b/packages/core/test/errors.test.ts index dee6509d3a2..9a94366d845 100644 --- a/packages/core/test/errors.test.ts +++ b/packages/core/test/errors.test.ts @@ -1,5 +1,13 @@ import { describe, it, expect } from "vitest"; -import { truncateStack, truncateMessage, parseError, sanitizeError } from "../src/v3/errors.js"; +import { + truncateStack, + truncateMessage, + parseError, + sanitizeError, + shouldRetryError, + shouldLookupRetrySettings, +} from "../src/v3/errors.js"; +import type { TaskRunError } from "../src/v3/schemas/common.js"; // Helper: build a fake stack with N frames function buildStack(messageLines: string[], frameCount: number): string { @@ -238,3 +246,29 @@ describe("truncateStack message line bounding", () => { expect(result).toContain("...[truncated]"); }); }); + +describe("shouldRetryError + shouldLookupRetrySettings", () => { + const internal = (code: string): TaskRunError => + ({ type: "INTERNAL_ERROR", code } as TaskRunError); + + it("retries SIGSEGV (changed from non-retriable) and looks up retry settings", () => { + const err = internal("TASK_PROCESS_SIGSEGV"); + expect(shouldRetryError(err)).toBe(true); + expect(shouldLookupRetrySettings(err)).toBe(true); + }); + + it("retries SIGTERM via the same path", () => { + const err = internal("TASK_PROCESS_SIGTERM"); + expect(shouldRetryError(err)).toBe(true); + expect(shouldLookupRetrySettings(err)).toBe(true); + }); + + it("still does not retry SIGKILL timeout", () => { + expect(shouldRetryError(internal("TASK_PROCESS_SIGKILL_TIMEOUT"))).toBe(false); + }); + + it("still does not retry OOM kills (handled by the separate machine-bump path)", () => { + expect(shouldRetryError(internal("TASK_PROCESS_OOM_KILLED"))).toBe(false); + expect(shouldRetryError(internal("TASK_PROCESS_MAYBE_OOM_KILLED"))).toBe(false); + }); +});