diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md new file mode 100644 index 00000000000..b1f38f51ecc --- /dev/null +++ b/.changeset/mollifier-buffer-extensions.md @@ -0,0 +1,6 @@ +--- +"@trigger.dev/redis-worker": minor +"@trigger.dev/core": patch +--- + +Mollifier buffer feature set built on top of the initial primitives: idempotency-lookup with SETNX dedup, atomic snapshot-mutation API (`mutateSnapshot` with tag/metadata/delay/cancel patches), metadata CAS for lossless concurrent updates, watermark-paginated listing, claim primitives for pre-gate idempotency, ZSET-backed per-env queue, 30s post-ack grace TTL, and drop the accept-time entry TTL (drainer is now the only removal mechanism). `@trigger.dev/core` gains an optional `notice` field on the trigger response so the SDK can surface mollifier-queued guidance to customers. diff --git a/.gitignore b/.gitignore index d071d5ae4e3..d06fc950625 100644 --- a/.gitignore +++ b/.gitignore @@ -72,4 +72,5 @@ apps/**/public/build .mcp.log .mcp.json .cursor/debug.log -ailogger-output.log \ No newline at end of file +ailogger-output.log +.playwright-mcp/ \ No newline at end of file diff --git a/.server-changes/mollifier.md b/.server-changes/mollifier.md new file mode 100644 index 00000000000..399ad5c6507 --- /dev/null +++ b/.server-changes/mollifier.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier — Redis-backed burst buffer in front of `engine.trigger` with a fair drainer, full read/write parity for buffered runs across the API + dashboard + realtime stream, alertable `mollifier.stale_entries.current` gauge for drainer health, and `runFailed` alerts on drainer-terminal `SYSTEM_FAILURE` rows. diff --git a/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx b/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx index facff746c5e..72947c4c8f7 100644 --- a/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx +++ b/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx @@ -1,6 +1,7 @@ import { NoSymbolIcon } from "@heroicons/react/24/solid"; import { DialogClose } from "@radix-ui/react-dialog"; import { Form, useNavigation } from "@remix-run/react"; +import { useEffect, useRef } from "react"; import { Button } from "~/components/primitives/Buttons"; import { DialogContent, DialogHeader } from "~/components/primitives/Dialog"; import { FormButtons } from "~/components/primitives/FormButtons"; @@ -10,14 +11,35 @@ import { SpinnerWhite } from "~/components/primitives/Spinner"; type CancelRunDialogProps = { runFriendlyId: string; redirectPath: string; + // Optional: when provided, close the dialog as soon as the cancel + // action transitions to "loading" (the redirect is in flight). Lets + // the caller control the open state without interfering with the + // form's submit name=value pair the way `` + // around the submit button does. + onCancelSubmitted?: () => void; }; -export function CancelRunDialog({ runFriendlyId, redirectPath }: CancelRunDialogProps) { +export function CancelRunDialog({ + runFriendlyId, + redirectPath, + onCancelSubmitted, +}: CancelRunDialogProps) { const navigation = useNavigation(); const formAction = `/resources/taskruns/${runFriendlyId}/cancel`; const isLoading = navigation.formAction === formAction; + const wasSubmitting = useRef(false); + useEffect(() => { + if (!onCancelSubmitted) return; + if (navigation.state === "submitting" && navigation.formAction === formAction) { + wasSubmitting.current = true; + } else if (wasSubmitting.current && navigation.state !== "submitting") { + wasSubmitting.current = false; + onCancelSubmitted(); + } + }, [navigation.state, navigation.formAction, formAction, onCancelSubmitted]); + return ( Cancel this run? diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index db72b0364c2..ca53d03eb67 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -9,6 +9,7 @@ import { renderToPipeableStream } from "react-dom/server"; import { PassThrough } from "stream"; import * as Worker from "~/services/worker.server"; import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server"; +import { initMollifierStaleSweepWorker } from "~/v3/mollifierStaleSweepWorker.server"; import { bootstrap } from "./bootstrap"; import { LocaleContextProvider } from "./components/primitives/LocaleProvider"; import { @@ -219,6 +220,7 @@ Worker.init().catch((error) => { }); initMollifierDrainerWorker(); +initMollifierStaleSweepWorker(); bootstrap().catch((error) => { logError(error); diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index e799162abe0..5d920eb661d 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1062,13 +1062,16 @@ const EnvironmentSchema = z // Separate switch for the drainer (consumer side) so it can be split // off onto a dedicated worker service. Unset → inherits // TRIGGER_MOLLIFIER_ENABLED, so single-container self-hosters don't have to - // flip two switches. In multi-replica deployments, set this to "0" - // explicitly on every replica except the one dedicated drainer - // service — otherwise every replica's polling loop races for the - // same buffer entries. `TRIGGER_MOLLIFIER_ENABLED` is still the master kill - // switch; setting this to "1" while `TRIGGER_MOLLIFIER_ENABLED` is "0" is a - // no-op because the gate-side singleton refuses to construct a - // buffer when the system is off. + // flip two switches. Multi-replica drainers are correct — `popAndMarkDraining` + // is an atomic ZPOPMIN + status flip in one Lua call, so only one replica + // can win any given entry — but inefficient: polling load (SMEMBERS + + // per-env scans) multiplies by N, and `TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY` + // is per-process so engine load also multiplies. Splitting the drainer + // onto a dedicated worker keeps that traffic off the request-serving + // replicas. `TRIGGER_MOLLIFIER_ENABLED` is still the master kill switch; + // setting this to "1" while `TRIGGER_MOLLIFIER_ENABLED` is "0" is a + // no-op because the gate-side singleton refuses to construct a buffer + // when the system is off. TRIGGER_MOLLIFIER_DRAINER_ENABLED: z.string().default(process.env.TRIGGER_MOLLIFIER_ENABLED ?? "0"), TRIGGER_MOLLIFIER_SHADOW_MODE: z.string().default("0"), TRIGGER_MOLLIFIER_REDIS_HOST: z @@ -1091,14 +1094,34 @@ const EnvironmentSchema = z .transform((v) => v ?? process.env.REDIS_PASSWORD), TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), TRIGGER_MOLLIFIER_TRIP_WINDOW_MS: z.coerce.number().int().positive().default(200), - TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().positive().default(100), + TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().nonnegative().default(100), TRIGGER_MOLLIFIER_HOLD_MS: z.coerce.number().int().positive().default(500), TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY: z.coerce.number().int().positive().default(50), - TRIGGER_MOLLIFIER_ENTRY_TTL_S: z.coerce.number().int().positive().default(600), TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3), TRIGGER_MOLLIFIER_DRAIN_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().positive().default(30_000), TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK: z.coerce.number().int().positive().default(500), + // Periodic sweep that scans buffer queue ZSETs for entries whose + // dwell exceeds the stale threshold. Independent of the drainer — + // its job is exactly to make a stuck/offline drainer visible to + // ops. Defaults: enabled when the mollifier is enabled, run every + // 5 minutes, alert on anything that's been dwelling for 5+ minutes + // (matches the sweep interval — "anything still here when we + // check" is the simplest threshold that converges). + TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED: z + .string() + .default(process.env.TRIGGER_MOLLIFIER_ENABLED ?? "0"), + TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS: z.coerce + .number() + .int() + .positive() + .default(5 * 60_000), + TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS: z.coerce + .number() + .int() + .positive() + .default(5 * 60_000), + BATCH_TRIGGER_PROCESS_JOB_VISIBILITY_TIMEOUT_MS: z.coerce .number() .int() diff --git a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts index a392866afc9..782104776d4 100644 --- a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts @@ -15,6 +15,10 @@ import assertNever from "assert-never"; import { API_VERSIONS, CURRENT_API_VERSION, RunStatusUnspecifiedApiVersion } from "~/api/versions"; import { $replica, prisma } from "~/db.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { + findRunByIdWithMollifierFallback, + type SyntheticRun, +} from "~/v3/mollifier/readFallback.server"; import { generatePresignedUrl } from "~/v3/objectStore.server"; import { tracer } from "~/v3/tracer.server"; import { startSpanWithEnv } from "~/v3/tracing.server"; @@ -64,13 +68,34 @@ type CommonRelatedRun = Prisma.Result< "findFirstOrThrow" >; -type FoundRun = NonNullable>>; +// Full shape returned by findRun() — the commonRunSelect fields plus the +// extras the route handler reads. Declared explicitly (not inferred via +// ReturnType) so findRun can return a synthesised buffered +// run without the type becoming self-referential. +type FoundRun = CommonRelatedRun & { + traceId: string; + payload: string; + payloadType: string; + output: string | null; + outputType: string; + error: Prisma.JsonValue; + attempts: { id: string }[]; + attemptNumber: number | null; + engine: "V1" | "V2"; + taskEventStore: string; + parentTaskRun: CommonRelatedRun | null; + rootTaskRun: CommonRelatedRun | null; + childRuns: CommonRelatedRun[]; +}; export class ApiRetrieveRunPresenter { constructor(private readonly apiVersion: API_VERSIONS) {} - public static async findRun(friendlyId: string, env: AuthenticatedEnvironment) { - return $replica.taskRun.findFirst({ + public static async findRun( + friendlyId: string, + env: AuthenticatedEnvironment, + ): Promise { + const pgRow = await $replica.taskRun.findFirst({ where: { friendlyId, runtimeEnvironmentId: env.id, @@ -102,6 +127,23 @@ export class ApiRetrieveRunPresenter { }, }, }); + + if (pgRow) return pgRow; + + // Postgres miss → fall back to the mollifier buffer. When the gate + // diverted a trigger, the run lives in Redis until the drainer replays + // it through engine.trigger. Synthesise the FoundRun shape so call() + // returns a `QUEUED` (or `FAILED`) response with empty output, no + // attempts, no relations. + const buffered = await findRunByIdWithMollifierFallback({ + runId: friendlyId, + environmentId: env.id, + organizationId: env.organizationId, + }); + + if (!buffered) return null; + + return synthesiseFoundRunFromBuffer(buffered); } public async call(taskRun: FoundRun, env: AuthenticatedEnvironment) { @@ -475,3 +517,75 @@ function resolveTriggerFunction(run: CommonRelatedRun): TriggerFunction { return run.resumeParentOnCompletion ? "triggerAndWait" : "trigger"; } } + +// Build a FoundRun-shaped object from a buffered (mollified) run. The run +// is in the Redis buffer; engine.trigger hasn't created the Postgres row +// yet, so every field that comes from execution state (output, attempts, +// completedAt, cost, relations) takes a default. The presenter's call() +// handles QUEUED-state runs without surprise. +function bufferedStatusToTaskRunStatus(status: SyntheticRun["status"]): TaskRunStatus { + switch (status) { + case "FAILED": + return "SYSTEM_FAILURE"; + case "CANCELED": + return "CANCELED"; + default: + return "PENDING"; + } +} + +function synthesiseFoundRunFromBuffer(buffered: SyntheticRun): FoundRun { + const status: TaskRunStatus = bufferedStatusToTaskRunStatus(buffered.status); + + const errorJson: Prisma.JsonValue = buffered.error + ? { + type: "STRING_ERROR", + raw: `${buffered.error.code}: ${buffered.error.message}`, + } + : null; + + const metadata: Prisma.JsonValue = + typeof buffered.metadata === "string" ? buffered.metadata : null; + + return { + id: buffered.friendlyId, + friendlyId: buffered.friendlyId, + status, + taskIdentifier: buffered.taskIdentifier ?? "", + createdAt: buffered.createdAt, + startedAt: null, + updatedAt: buffered.cancelledAt ?? buffered.createdAt, + completedAt: buffered.cancelledAt ?? null, + expiredAt: null, + delayUntil: buffered.delayUntil ?? null, + metadata, + metadataType: buffered.metadataType ?? "application/json", + ttl: buffered.ttl ?? null, + costInCents: 0, + baseCostInCents: 0, + usageDurationMs: 0, + idempotencyKey: buffered.idempotencyKey ?? null, + idempotencyKeyOptions: buffered.idempotencyKeyOptions ?? null, + isTest: buffered.isTest, + depth: buffered.depth, + scheduleId: null, + lockedToVersion: buffered.lockedToVersion ? { version: buffered.lockedToVersion } : null, + resumeParentOnCompletion: buffered.resumeParentOnCompletion, + batch: null, + runTags: buffered.tags, + traceId: buffered.traceId ?? "", + payload: typeof buffered.payload === "string" ? buffered.payload : "", + payloadType: buffered.payloadType ?? "application/json", + output: null, + outputType: "application/json", + error: errorJson, + attempts: [], + attemptNumber: null, + engine: "V2", + taskEventStore: "taskEvent", + workerQueue: buffered.workerQueue ?? "main", + parentTaskRun: null, + rootTaskRun: null, + childRuns: [], + }; +} diff --git a/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts index 0e7077b3dfc..ef0d671e16f 100644 --- a/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts @@ -151,7 +151,8 @@ export const ApiRunListSearchParams = z.object({ }), }); -type ApiRunListSearchParams = z.infer; +export type ApiRunListSearchParamsType = z.infer; +type ApiRunListSearchParams = ApiRunListSearchParamsType; export class ApiRunListPresenter extends BasePresenter { public async call( diff --git a/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts b/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts index 69560c49e88..c95f68e3f2c 100644 --- a/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts @@ -3,6 +3,8 @@ import { logger } from "~/services/logger.server"; import { singleton } from "~/utils/singleton"; import { ABORT_REASON_SEND_ERROR, createSSELoader, SendFunction } from "~/utils/sse"; import { throttle } from "~/utils/throttle"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { deserialiseSnapshot } from "@trigger.dev/redis-worker"; import { tracePubSub } from "~/v3/services/tracePubSub.server"; const PING_INTERVAL = 5_000; @@ -37,17 +39,45 @@ export class RunStreamPresenter { }, }); - if (!run) { + // Fall back to the mollifier buffer when the run isn't in PG yet. + // The buffered run has no execution events to stream, but we still + // attach a trace-pubsub subscription using the snapshot's traceId + // so that the moment the drainer materialises the row and execution + // begins, those events flow to this open SSE connection. Closing + // with 404 would force the dashboard to keep retrying. + let traceId: string | null = run?.traceId ?? null; + if (!traceId) { + const buffer = getMollifierBuffer(); + if (buffer) { + try { + const entry = await buffer.getEntry(runFriendlyId); + if (entry) { + const snapshot = deserialiseSnapshot<{ traceId?: string }>(entry.payload); + if (typeof snapshot.traceId === "string") { + traceId = snapshot.traceId; + } + } + } catch (err) { + logger.warn("RunStreamPresenter buffer fallback failed", { + runFriendlyId, + err: err instanceof Error ? err.message : String(err), + }); + } + } + } + + if (!traceId) { throw new Response("Not found", { status: 404 }); } + const resolvedRun = { traceId }; logger.info("RunStreamPresenter.start", { runFriendlyId, - traceId: run.traceId, + traceId: resolvedRun.traceId, }); // Subscribe to trace updates - const { unsubscribe, eventEmitter } = await tracePubSub.subscribeToTrace(run.traceId); + const { unsubscribe, eventEmitter } = await tracePubSub.subscribeToTrace(resolvedRun.traceId); // Only send max every 1 second const throttledSend = throttle( @@ -105,7 +135,7 @@ export class RunStreamPresenter { cleanup: () => { logger.info("RunStreamPresenter.cleanup", { runFriendlyId, - traceId: run.traceId, + traceId: resolvedRun.traceId, }); // Remove message listener @@ -119,13 +149,13 @@ export class RunStreamPresenter { .then(() => { logger.info("RunStreamPresenter.cleanup.unsubscribe succeeded", { runFriendlyId, - traceId: run.traceId, + traceId: resolvedRun.traceId, }); }) .catch((error) => { logger.error("RunStreamPresenter.cleanup.unsubscribe failed", { runFriendlyId, - traceId: run.traceId, + traceId: resolvedRun.traceId, error: { name: error.name, message: error.message, diff --git a/apps/webapp/app/routes/@.runs.$runParam.ts b/apps/webapp/app/routes/@.runs.$runParam.ts index a52600628d8..c2717418ff2 100644 --- a/apps/webapp/app/routes/@.runs.$runParam.ts +++ b/apps/webapp/app/routes/@.runs.$runParam.ts @@ -4,6 +4,7 @@ import { prisma } from "~/db.server"; import { redirectWithErrorMessage } from "~/models/message.server"; import { requireUser } from "~/services/session.server"; import { impersonate, rootPath, v3RunPath } from "~/utils/pathBuilder"; +import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -51,6 +52,26 @@ export async function loader({ params, request }: LoaderFunctionArgs) { }); if (!run) { + // Admin impersonation route — bypass org membership so admins can + // open any buffered run by friendlyId, mirroring the existing PG + // behaviour above (no membership filter on the find). + const buffered = await findBufferedRunRedirectInfo({ + runFriendlyId: runParam, + userId: user.id, + skipOrgMembershipCheck: true, + }); + if (buffered) { + return redirect( + impersonate( + v3RunPath( + { slug: buffered.organizationSlug }, + { slug: buffered.projectSlug }, + { slug: buffered.environmentSlug }, + { friendlyId: runParam } + ) + ) + ); + } return redirectWithErrorMessage(rootPath(), request, "Run doesn't exist", { ephemeral: false, }); diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx index d55511e7ff5..28bae86406f 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx @@ -88,10 +88,13 @@ import { useReplaceSearchParams } from "~/hooks/useReplaceSearchParams"; import { useSearchParams } from "~/hooks/useSearchParam"; import { type Shortcut, useShortcutKeys } from "~/hooks/useShortcutKeys"; import { useHasAdminAccess } from "~/hooks/useUser"; +import { env } from "~/env.server"; import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { NextRunListPresenter } from "~/presenters/v3/NextRunListPresenter.server"; import { RunEnvironmentMismatchError, RunPresenter } from "~/presenters/v3/RunPresenter.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { buildSyntheticTraceForBufferedRun } from "~/v3/mollifier/syntheticTrace.server"; import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; import { getImpersonationId } from "~/services/impersonation.server"; import { logger } from "~/services/logger.server"; @@ -277,6 +280,31 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { ); } + // PG miss → try the mollifier buffer. When the gate diverts a trigger + // the run sits in Redis until the drainer materialises it; without + // this fallback the run-detail page 404s for the brief buffered window + // even though the API has accepted the trigger and returned an id. + const buffered = await tryMollifiedRunFallback({ + runFriendlyId: runParam, + organizationSlug, + projectSlug: projectParam, + envSlug: envParam, + userId, + }); + + if (buffered) { + const parent = await getResizableSnapshot(request, resizableSettings.parent.autosaveId); + const tree = await getResizableSnapshot(request, resizableSettings.tree.autosaveId); + + return json({ + run: buffered.run, + trace: buffered.trace, + maximumLiveReloadingSetting: env.MAXIMUM_LIVE_RELOADING_EVENTS, + resizable: { parent, tree }, + runsList: null, + }); + } + throw error; } @@ -305,6 +333,52 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }); }; +async function tryMollifiedRunFallback(args: { + runFriendlyId: string; + organizationSlug: string; + projectSlug: string; + envSlug: string; + userId: string; +}) { + const project = await findProjectBySlug(args.organizationSlug, args.projectSlug, args.userId); + if (!project) return null; + const environment = await findEnvironmentBySlug(project.id, args.envSlug, args.userId); + if (!environment) return null; + + const buffered = await findRunByIdWithMollifierFallback({ + runId: args.runFriendlyId, + environmentId: environment.id, + organizationId: project.organizationId, + }); + if (!buffered) return null; + + return { + run: { + id: buffered.friendlyId, + number: 1, + friendlyId: buffered.friendlyId, + traceId: buffered.traceId ?? "", + spanId: buffered.spanId ?? "", + status: "PENDING" as const, + isFinished: false, + startedAt: null, + completedAt: null, + logsDeletedAt: null, + rootTaskRun: null, + parentTaskRun: null, + environment: { + id: environment.id, + organizationId: project.organizationId, + type: environment.type, + slug: environment.slug, + userId: undefined, + userName: undefined, + }, + }, + trace: buildSyntheticTraceForBufferedRun(buffered), + }; +} + type LoaderData = SerializeFrom; export default function Page() { @@ -407,23 +481,17 @@ export default function Page() { /> {run.isFinished ? null : ( - - - - - - + )} @@ -587,6 +655,35 @@ function TraceView({ ); } +// Controlled wrapper around the cancel dialog. Owns the Radix open state +// so the dialog closes itself once the cancel action transitions through +// submission. We can't ``-wrap the submit button +// because Radix's onClick handler swallows the button's name=value pair +// that the form action depends on for `redirectUrl`. +function ControlledCancelRunDialog({ + runFriendlyId, + redirectPath, +}: { + runFriendlyId: string; + redirectPath: string; +}) { + const [open, setOpen] = useState(false); + return ( + + + + + setOpen(false)} + /> + + ); +} + function NoLogsView({ run, resizable }: Pick) { const plan = useCurrentPlan(); const organization = useOrganization(); @@ -616,9 +713,13 @@ function NoLogsView({ run, resizable }: Pick) { >
{daysSinceCompleted === undefined ? ( - + - We tidy up older logs to keep things running smoothly. + This run is queued. Logs will appear here once it begins executing. ) : isWithinLogRetention ? ( diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index f27a9c13f98..3b32ec4a2e2 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -1,15 +1,101 @@ +import type { LoaderFunctionArgs } from "@remix-run/server-runtime"; import { json } from "@remix-run/server-runtime"; import { tryCatch } from "@trigger.dev/core/utils"; +import type { RunMetadataChangeOperation } from "@trigger.dev/core/v3/schemas"; import { UpdateMetadataRequestBody } from "@trigger.dev/core/v3"; import { z } from "zod"; +import { $replica } from "~/db.server"; +import { authenticateApiRequest } from "~/services/apiAuth.server"; import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; import { ServiceValidationError } from "~/v3/services/common.server"; +import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runId: z.string(), }); +// Phase A6 — fixes the pre-existing route bug where GET on this URL +// returned a Remix "no loader" 400. The route only exposed PUT (update); +// GET had no handler. Returns `{ metadata, metadataType }` from either +// the Postgres row or the mollifier buffer snapshot. +export async function loader({ request, params }: LoaderFunctionArgs) { + const authenticationResult = await authenticateApiRequest(request); + if (!authenticationResult) { + return json({ error: "Invalid or Missing API Key" }, { status: 401 }); + } + + const parsed = ParamsSchema.safeParse(params); + if (!parsed.success) { + return json({ error: "Invalid or missing run ID" }, { status: 400 }); + } + + const env = authenticationResult.environment; + + const pgRun = await $replica.taskRun.findFirst({ + where: { friendlyId: parsed.data.runId, runtimeEnvironmentId: env.id }, + select: { metadata: true, metadataType: true }, + }); + if (pgRun) { + return json({ metadata: pgRun.metadata, metadataType: pgRun.metadataType }, { status: 200 }); + } + + const buffered = await findRunByIdWithMollifierFallback({ + runId: parsed.data.runId, + environmentId: env.id, + organizationId: env.organizationId, + }); + if (buffered) { + return json( + { + metadata: buffered.metadata ?? null, + metadataType: buffered.metadataType ?? "application/json", + }, + { status: 200 } + ); + } + + return json({ error: "Run not found" }, { status: 404 }); +} + +// Route parent/root operations to the existing PG service by directly +// invoking it against the parent/root runId. The service ingests via +// its batching worker, which targets PG by id. If the parent/root is +// itself buffered we recurse through our buffered-mutation helper. +// `_ingestion_only` flag: a synthetic body that has the operations +// promoted to top-level `operations` so the service applies them to +// `targetRunId` directly. +async function routeOperationsToRun( + targetRunId: string | undefined, + operations: RunMetadataChangeOperation[] | undefined, + env: { id: string; organizationId: string } +): Promise { + if (!targetRunId || !operations || operations.length === 0) return; + + // Try PG first via the existing service (this is how parent/root + // operations have always landed; preserve that). + const [error] = await tryCatch( + updateMetadataService.call( + targetRunId, + { operations }, + { id: env.id, organizationId: env.organizationId } as unknown as Parameters< + typeof updateMetadataService.call + >[2] + ) + ); + if (!error) return; + + // PG service threw — could be "Cannot update metadata for a completed + // run" or similar. If the target is buffered, route operations to its + // snapshot too. Best-effort; do not surface this failure to the + // caller — the parent/root ops are auxiliary. + await applyMetadataMutationToBufferedRun({ + runId: targetRunId, + body: { operations }, + }); +} + const { action } = createActionApiRoute( { params: ParamsSchema, @@ -18,23 +104,72 @@ const { action } = createActionApiRoute( method: "PUT", }, async ({ authentication, body, params }) => { - const [error, result] = await tryCatch( - updateMetadataService.call(params.runId, body, authentication.environment) - ); + const env = authentication.environment; + const runId = params.runId; - if (error) { - if (error instanceof ServiceValidationError) { - return json({ error: error.message }, { status: error.status ?? 422 }); + // PG-canonical path. If the run is in PG, the existing service + // owns the full request shape including parent/root operations, + // metadataVersion CAS, batching, validation — none of which the + // buffer side needs to reimplement. + const [pgError, pgResult] = await tryCatch( + updateMetadataService.call(runId, body, env) + ); + if (pgError) { + if (pgError instanceof ServiceValidationError) { + return json({ error: pgError.message }, { status: pgError.status ?? 422 }); } - return json({ error: "Internal Server Error" }, { status: 500 }); } + if (pgResult) { + return json(pgResult, { status: 200 }); + } - if (!result) { + // PG miss. Target run is either buffered or genuinely absent. + const bufferOutcome = await applyMetadataMutationToBufferedRun({ + runId, + body: { metadata: body.metadata, operations: body.operations }, + }); + + if (bufferOutcome.kind === "not_found") { return json({ error: "Task Run not found" }, { status: 404 }); } + if (bufferOutcome.kind === "busy") { + // Entry is materialising. Best path is to retry the PG call — + // the row may be visible now. We don't waste a roundtrip in + // the happy path, but a 503 here would be customer-visible + // breakage for legitimately-burst workloads. Hand back 503 with + // a retry hint; SDK retry policy converges. + return json({ error: "Run materialising, retry shortly" }, { status: 503 }); + } + if (bufferOutcome.kind === "version_exhausted") { + // Pathological contention — many concurrent metadata writers on + // the same buffered runId. Surface as 503 rather than silently + // dropping the request. + return json({ error: "Metadata write contention; retry shortly" }, { status: 503 }); + } + + // Buffered metadata mutation succeeded. Fan parent/root operations + // out to their respective runs (parent/root are typically PG- + // materialised by the time the child is buffered, so the existing + // service handles them; if they're also buffered, the helper + // recurses through the buffered mutation path). + const bufferedEntry = await findRunByIdWithMollifierFallback({ + runId, + environmentId: env.id, + organizationId: env.organizationId, + }); + if (bufferedEntry) { + await Promise.all([ + routeOperationsToRun(bufferedEntry.parentTaskRunId, body.parentOperations, env), + // The snapshot doesn't carry rootTaskRunId; fall back to parent + // as a rough proxy (matches the existing service's nil-coalesce + // behaviour where rootTaskRun defaults to the parent). Phase D + // / future work could thread rootTaskRunId through the snapshot. + routeOperationsToRun(bufferedEntry.parentTaskRunId, body.rootOperations, env), + ]); + } - return json(result, { status: 200 }); + return json({ metadata: bufferOutcome.newMetadata }, { status: 200 }); } ); diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts index be0d12087b6..cc48faf5d85 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts @@ -9,42 +9,101 @@ import { } from "~/services/routeBuilders/apiBuilder.server"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runId: z.string(), spanId: z.string(), }); +// Phase A2 — discriminated union for PG vs buffered runs. Buffered runs +// only have one valid spanId (the queued span recorded at gate time and +// reused as the run's root spanId when the drainer materialises). Any +// other spanId returns a deterministic 404; the queued span returns a +// minimal synthesised shape so the customer's SDK sees the same 200 +// contract they'd get for a freshly-triggered run. +type ResolvedRun = + | { source: "pg"; run: Awaited> & {} } + | { source: "buffer"; run: NonNullable>> }; + +async function findPgRun(runId: string, environmentId: string) { + return $replica.taskRun.findFirst({ + where: { friendlyId: runId, runtimeEnvironmentId: environmentId }, + }); +} + export const loader = createLoaderApiRoute( { params: ParamsSchema, allowJWT: true, corsStrategy: "all", - findResource: (params, auth) => { - return $replica.taskRun.findFirst({ - where: { - friendlyId: params.runId, - runtimeEnvironmentId: auth.environment.id, - }, + findResource: async (params, auth): Promise => { + const pgRun = await findPgRun(params.runId, auth.environment.id); + if (pgRun) return { source: "pg", run: pgRun }; + + const buffered = await findRunByIdWithMollifierFallback({ + runId: params.runId, + environmentId: auth.environment.id, + organizationId: auth.environment.organizationId, }); + if (buffered) return { source: "buffer", run: buffered }; + + return null; }, shouldRetryNotFound: true, authorization: { action: "read", - resource: (run) => { + resource: (resolved) => { + if (resolved.source === "pg") { + const run = resolved.run; + const resources = [ + { type: "runs", id: run.friendlyId }, + { type: "tasks", id: run.taskIdentifier }, + ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ]; + if (run.batchId) { + resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); + } + return anyResource(resources); + } + const run = resolved.run; const resources = [ { type: "runs", id: run.friendlyId }, - { type: "tasks", id: run.taskIdentifier }, - ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []), + ...run.tags.map((tag) => ({ type: "tags", id: tag })), ]; - if (run.batchId) { - resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); - } return anyResource(resources); }, }, }, - async ({ params, resource: run, authentication }) => { + async ({ params, resource: resolved, authentication }) => { + if (resolved.source === "buffer") { + // Buffered runs have exactly one valid spanId — the queued span the + // mollifier gate recorded at trigger time, which becomes the run's + // root spanId once the drainer materialises. Any other spanId is a + // deterministic 404. The matching spanId returns a minimal shape + // representing "span exists, no execution data yet." + if (resolved.run.spanId !== params.spanId) { + return json({ error: "Span not found" }, { status: 404 }); + } + return json( + { + spanId: resolved.run.spanId, + parentId: resolved.run.parentSpanId ?? null, + runId: resolved.run.friendlyId, + message: resolved.run.taskIdentifier ?? "", + isError: false, + isPartial: resolved.run.status !== "CANCELED", + isCancelled: resolved.run.status === "CANCELED", + level: "TRACE", + startTime: resolved.run.createdAt, + durationMs: 0, + }, + { status: 200 } + ); + } + + const run = resolved.run; const eventRepository = await getEventRepositoryForStore( run.taskEventStore, authentication.environment.organization.id diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index eae94375b9f..eeb8d6bc027 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -4,19 +4,19 @@ import { z } from "zod"; import { prisma } from "~/db.server"; import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; const ParamsSchema = z.object({ runId: z.string(), }); export async function action({ request, params }: ActionFunctionArgs) { - // Ensure this is a POST request if (request.method.toUpperCase() !== "POST") { return { status: 405, body: "Method Not Allowed" }; } - // Authenticate the request const authenticationResult = await authenticateApiRequest(request); if (!authenticationResult) { return json({ error: "Invalid or Missing API Key" }, { status: 401 }); @@ -32,59 +32,67 @@ export async function action({ request, params }: ActionFunctionArgs) { try { const anyBody = await request.json(); - const body = AddTagsRequestBody.safeParse(anyBody); if (!body.success) { return json({ error: "Invalid request body", issues: body.error.issues }, { status: 400 }); } - - const run = await prisma.taskRun.findFirst({ - where: { - friendlyId: parsedParams.data.runId, - runtimeEnvironmentId: authenticationResult.environment.id, - }, - select: { - runTags: true, - }, - }); - - const existingTags = run?.runTags ?? []; - - //remove duplicate tags from the new tags const bodyTags = typeof body.data.tags === "string" ? [body.data.tags] : body.data.tags; - const newTags = bodyTags.filter((tag) => { - if (tag.trim().length === 0) return false; - return !existingTags.includes(tag); - }); - - if (existingTags.length + newTags.length > MAX_TAGS_PER_RUN) { - return json( - { - error: `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${ - existingTags.length + newTags.length - }. These tags have not been set: ${newTags.map((t) => `'${t}'`).join(", ")}.`, - }, - { status: 422 } - ); - } + const nonEmptyTags = bodyTags.filter((t) => t.trim().length > 0); - if (newTags.length === 0) { + if (nonEmptyTags.length === 0) { return json({ message: "No new tags to add" }, { status: 200 }); } - await prisma.taskRun.update({ - where: { - friendlyId: parsedParams.data.runId, - runtimeEnvironmentId: authenticationResult.environment.id, - }, - data: { - runTags: { - push: newTags, - }, + const env = authenticationResult.environment; + const outcome = await mutateWithFallback({ + runId: parsedParams.data.runId, + environmentId: env.id, + organizationId: env.organizationId, + bufferPatch: { type: "append_tags", tags: nonEmptyTags }, + pgMutation: async (taskRun) => { + const existing = taskRun.runTags ?? []; + const newTags = nonEmptyTags.filter((t) => !existing.includes(t)); + + if (existing.length + newTags.length > MAX_TAGS_PER_RUN) { + return json( + { + error: `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${ + existing.length + newTags.length + }. These tags have not been set: ${newTags.map((t) => `'${t}'`).join(", ")}.`, + }, + { status: 422 } + ); + } + if (newTags.length === 0) { + return json({ message: "No new tags to add" }, { status: 200 }); + } + await prisma.taskRun.update({ + where: { + id: taskRun.id, + runtimeEnvironmentId: env.id, + }, + data: { runTags: { push: newTags } }, + }); + return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, + // Buffer-applied patch path. The mutateSnapshot Lua deduplicates + // against existing snapshot tags atomically. MAX_TAGS_PER_RUN + // enforcement is skipped on the buffered side — the drainer's + // engine.trigger writes the PG row without enforcement either, + // matching today's pre-buffer trigger semantics. A future + // refinement could push the limit check into the Lua. + synthesisedResponse: () => + json({ message: `Successfully set ${nonEmptyTags.length} new tags.` }, { status: 200 }), + abortSignal: getRequestAbortSignal(), }); - return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); + if (outcome.kind === "not_found") { + return json({ error: "Run not found" }, { status: 404 }); + } + if (outcome.kind === "timed_out") { + return json({ error: "Run materialisation timed out" }, { status: 503 }); + } + return outcome.response; } catch (error) { logger.error("Failed to add run tags", { error }); return json({ error: "Something went wrong, please try again." }, { status: 500 }); diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts index 77e6a4df043..cce1b40b785 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts @@ -8,41 +8,108 @@ import { } from "~/services/routeBuilders/apiBuilder.server"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runId: z.string(), // This is the run friendly ID }); +// Discriminator on the resolved resource — `pg` is the real Prisma TaskRun +// row, `buffer` is a synthesised shape from the mollifier buffer for runs +// whose drainer hasn't yet materialised them. The handler renders an empty +// trace for buffered runs so the customer sees the same 200 shape they'd +// get for a freshly-triggered PG run with no spans yet (matches the +// pass-through control case in scripts/mollifier-api-parity.sh). +type ResolvedRun = + | { source: "pg"; run: Awaited> & {} } + | { source: "buffer"; run: NonNullable>> }; + +async function findPgRun(runId: string, environmentId: string) { + return $replica.taskRun.findFirst({ + where: { friendlyId: runId, runtimeEnvironmentId: environmentId }, + }); +} + export const loader = createLoaderApiRoute( { params: ParamsSchema, allowJWT: true, corsStrategy: "all", - findResource: (params, auth) => { - return $replica.taskRun.findFirst({ - where: { - friendlyId: params.runId, - runtimeEnvironmentId: auth.environment.id, - }, + findResource: async (params, auth): Promise => { + const pgRun = await findPgRun(params.runId, auth.environment.id); + if (pgRun) return { source: "pg", run: pgRun }; + + const buffered = await findRunByIdWithMollifierFallback({ + runId: params.runId, + environmentId: auth.environment.id, + organizationId: auth.environment.organizationId, }); + if (buffered) return { source: "buffer", run: buffered }; + + return null; }, shouldRetryNotFound: true, authorization: { action: "read", - resource: (run) => { + resource: (resolved) => { + if (resolved.source === "pg") { + const run = resolved.run; + const resources = [ + { type: "runs", id: run.friendlyId }, + { type: "tasks", id: run.taskIdentifier }, + ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ]; + if (run.batchId) { + resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); + } + return anyResource(resources); + } + const run = resolved.run; const resources = [ { type: "runs", id: run.friendlyId }, - { type: "tasks", id: run.taskIdentifier }, - ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []), + ...run.tags.map((tag) => ({ type: "tags", id: tag })), ]; - if (run.batchId) { - resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); - } return anyResource(resources); }, }, }, - async ({ resource: run, authentication }) => { + async ({ resource: resolved, authentication }) => { + if (resolved.source === "buffer") { + // Buffered runs have no events ingested yet — the drainer hasn't + // materialised the PG row and the worker hasn't started executing. + // Synthesise a single partial span that satisfies the SDK's + // RetrieveRunTraceResponseBody schema (rootSpan is non-nullable). + const buffered = resolved.run; + return json( + { + trace: { + traceId: buffered.traceId ?? "", + rootSpan: { + id: buffered.spanId ?? "", + runId: buffered.friendlyId, + data: { + message: buffered.taskIdentifier ?? "", + taskSlug: buffered.taskIdentifier ?? undefined, + events: [], + startTime: buffered.createdAt, + duration: 0, + isError: false, + isPartial: true, + isCancelled: buffered.status === "CANCELED", + level: "TRACE", + queueName: buffered.queue ?? undefined, + machinePreset: buffered.machinePreset ?? undefined, + }, + children: [], + }, + }, + }, + { status: 200 } + ); + } + + const run = resolved.run; const eventRepository = await getEventRepositoryForStore( run.taskEventStore, authentication.environment.organization.id diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts index 790e52bee4e..8668f0bc60b 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts @@ -1,4 +1,4 @@ -import type { ActionFunctionArgs } from "@remix-run/server-runtime"; +import type { ActionFunctionArgs, LoaderFunctionArgs } from "@remix-run/server-runtime"; import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { authenticateApiRequest } from "~/services/apiAuth.server"; @@ -11,6 +11,30 @@ const ParamsSchema = z.object({ runParam: z.string(), }); +// Phase A5 — fixes the pre-existing route bug where GET on this URL +// returned a Remix "no loader" 400 with an internal error message. The +// route only exposed `action` (POST creates a new attempt); GET had no +// handler, so any well-intentioned SDK probe hit the framework error +// instead of a proper API response. +// +// Returns `{ attempts: [] }` for both PG and buffered runs. The detailed +// attempt list belongs on the v3 retrieve endpoint, not here — this is +// the dual of the POST that creates attempts, and the empty-list shape +// gives the parity script a stable contract to assert against. +export async function loader({ request, params }: LoaderFunctionArgs) { + const authenticationResult = await authenticateApiRequest(request); + if (!authenticationResult) { + return json({ error: "Invalid or Missing API Key" }, { status: 401 }); + } + + const parsed = ParamsSchema.safeParse(params); + if (!parsed.success) { + return json({ error: "Invalid or missing run ID" }, { status: 400 }); + } + + return json({ attempts: [] }, { status: 200 }); +} + export async function action({ request, params }: ActionFunctionArgs) { // Authenticate the request const authenticationResult = await authenticateApiRequest(request); diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts index 72ad202467d..0b482314832 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts @@ -1,10 +1,12 @@ import type { ActionFunctionArgs } from "@remix-run/server-runtime"; import { json } from "@remix-run/server-runtime"; +import type { TaskRun } from "@trigger.dev/database"; import { z } from "zod"; import { prisma } from "~/db.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; import { sanitizeTriggerSource } from "~/utils/triggerSource"; const ParamsSchema = z.object({ @@ -32,12 +34,34 @@ export async function action({ request, params }: ActionFunctionArgs) { const { runParam } = parsed.data; try { - const taskRun = await prisma.taskRun.findUnique({ + const env = authenticationResult.environment; + // PG-first. Replay works on any status per audit (Q2 design) — no + // filter beyond friendlyId is the existing semantic; findFirst with + // env scoping tightens it minimally without changing behaviour for + // a correctly-authed caller. + let taskRun: TaskRun | null = await prisma.taskRun.findFirst({ where: { friendlyId: runParam, + runtimeEnvironmentId: env.id, }, }); + if (!taskRun) { + // Buffered fallback (Q2). The SyntheticRun shape was extended in + // Phase B4 to carry every field ReplayTaskRunService reads from a + // TaskRun. Cast through unknown — the synthesised object has the + // same field surface as a real PG row from the service's + // perspective. + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: env.id, + organizationId: env.organizationId, + }); + if (buffered) { + taskRun = buffered as unknown as TaskRun; + } + } + if (!taskRun) { return json({ error: "Run not found" }, { status: 404 }); } diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts index 0ac8aec8351..a605e391d93 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts @@ -3,90 +3,113 @@ import { json } from "@remix-run/server-runtime"; import { RescheduleRunRequestBody } from "@trigger.dev/core/v3/schemas"; import { z } from "zod"; import { getApiVersion } from "~/api/versions"; -import { prisma } from "~/db.server"; import { ApiRetrieveRunPresenter } from "~/presenters/v3/ApiRetrieveRunPresenter.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; import { ServiceValidationError } from "~/v3/services/baseService.server"; import { RescheduleTaskRunService } from "~/v3/services/rescheduleTaskRun.server"; +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; +import { parseDelay } from "~/utils/delays"; const ParamsSchema = z.object({ runParam: z.string(), }); export async function action({ request, params }: ActionFunctionArgs) { - // Ensure this is a POST request if (request.method.toUpperCase() !== "POST") { return { status: 405, body: "Method Not Allowed" }; } - // Authenticate the request const authenticationResult = await authenticateApiRequest(request); - if (!authenticationResult) { return json({ error: "Invalid or missing API Key" }, { status: 401 }); } const parsed = ParamsSchema.safeParse(params); - if (!parsed.success) { return json({ error: "Invalid or missing run ID" }, { status: 400 }); } - const { runParam } = parsed.data; - - const taskRun = await prisma.taskRun.findUnique({ - where: { - friendlyId: runParam, - runtimeEnvironmentId: authenticationResult.environment.id, - }, - }); - - if (!taskRun) { - return json({ error: "Run not found" }, { status: 404 }); - } - const anyBody = await request.json(); - const body = RescheduleRunRequestBody.safeParse(anyBody); - if (!body.success) { return json({ error: "Invalid request body" }, { status: 400 }); } - const service = new RescheduleTaskRunService(); + const env = authenticationResult.environment; + // Pre-resolve the absolute Date the buffer snapshot should encode. + // RescheduleTaskRunService expects this to be present on the body for + // its PG-side flow; for the buffer-side patch we encode the same + // wall-clock value so the drainer's engine.trigger sees the intended + // delayUntil after materialisation. + const delayUntil = await parseDelay(body.data.delay); + if (!delayUntil) { + return json({ error: "Invalid delay value" }, { status: 400 }); + } try { - const updatedRun = await service.call(taskRun, body.data); - - if (!updatedRun) { - return json({ error: "An unknown error occurred" }, { status: 500 }); - } - - const run = await ApiRetrieveRunPresenter.findRun( - updatedRun.friendlyId, - authenticationResult.environment - ); - - if (!run) { + const outcome = await mutateWithFallback({ + runId: parsed.data.runParam, + environmentId: env.id, + organizationId: env.organizationId, + bufferPatch: { + type: "set_delay", + delayUntil: delayUntil.toISOString(), + }, + pgMutation: async (taskRun) => { + const service = new RescheduleTaskRunService(); + const updatedRun = await service.call(taskRun, body.data); + if (!updatedRun) { + return json({ error: "An unknown error occurred" }, { status: 500 }); + } + + const run = await ApiRetrieveRunPresenter.findRun(updatedRun.friendlyId, env); + if (!run) { + return json({ error: "Run not found" }, { status: 404 }); + } + const apiVersion = getApiVersion(request); + const presenter = new ApiRetrieveRunPresenter(apiVersion); + const result = await presenter.call(run, env); + if (!result) { + return json({ error: "Run not found" }, { status: 404 }); + } + return json(result); + }, + // Buffered snapshot has been patched. Run it through the same + // ApiRetrieveRunPresenter the PG branch uses (it falls back to + // the buffer for the SyntheticRun lookup) so the response shape + // matches `RetrieveRunResponse` — that's what the SDK's + // `rescheduleRun` zod-validates against. Returning a stripped + // `{ id, delayUntil }` object fails the SDK schema on every + // existing SDK version. + synthesisedResponse: async () => { + const run = await ApiRetrieveRunPresenter.findRun(parsed.data.runParam, env); + if (!run) { + return json({ error: "Run not found" }, { status: 404 }); + } + const apiVersion = getApiVersion(request); + const presenter = new ApiRetrieveRunPresenter(apiVersion); + const result = await presenter.call(run, env); + if (!result) { + return json({ error: "Run not found" }, { status: 404 }); + } + return json(result); + }, + abortSignal: getRequestAbortSignal(), + }); + + if (outcome.kind === "not_found") { return json({ error: "Run not found" }, { status: 404 }); } - - const apiVersion = getApiVersion(request); - - const presenter = new ApiRetrieveRunPresenter(apiVersion); - const result = await presenter.call(run, authenticationResult.environment); - - if (!result) { - return json({ error: "Run not found" }, { status: 404 }); + if (outcome.kind === "timed_out") { + return json({ error: "Run materialisation timed out" }, { status: 503 }); } - - return json(result); + return outcome.response; } catch (error) { if (error instanceof ServiceValidationError) { return json({ error: error.message }, { status: 400 }); } - logger.error("Failed to reschedule run", { error }); return json({ error: "Something went wrong, please try again." }, { status: 500 }); } diff --git a/apps/webapp/app/routes/api.v1.runs.ts b/apps/webapp/app/routes/api.v1.runs.ts index 4cbd689f627..16564268170 100644 --- a/apps/webapp/app/routes/api.v1.runs.ts +++ b/apps/webapp/app/routes/api.v1.runs.ts @@ -3,7 +3,6 @@ import { ApiRunListPresenter, ApiRunListSearchParams, } from "~/presenters/v3/ApiRunListPresenter.server"; -import { logger } from "~/services/logger.server"; import { anyResource, createLoaderApiRoute, diff --git a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts index 8206a90f320..17e3f48d056 100644 --- a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts +++ b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts @@ -142,6 +142,7 @@ const { action, loader } = createActionApiRoute( { id: result.run.friendlyId, isCached: result.isCached, + ...("notice" in result && result.notice ? { notice: result.notice } : {}), }, { headers: $responseHeaders, diff --git a/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts b/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts index a636ca0cc1d..f02b058b272 100644 --- a/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts +++ b/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts @@ -1,8 +1,13 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; -import { $replica } from "~/db.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server"; +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; +import { + resolveRunForMutation, + type ResolvedRunForMutation, +} from "~/v3/mollifier/resolveRunForMutation.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -17,29 +22,55 @@ const { action } = createActionApiRoute( action: "write", resource: (params) => ({ type: "runs", id: params.runParam }), }, - findResource: async (params, auth) => { - return $replica.taskRun.findFirst({ - where: { - friendlyId: params.runParam, - runtimeEnvironmentId: auth.environment.id, - }, - }); - }, + // PG-or-buffer resolver. Returning null here would 404 BEFORE the + // action runs (`apiBuilder.server.ts:321`), so buffered cancels need + // a buffer check at this layer too. Logic lives in a helper so the + // three paths (PG hit, buffer hit, both miss) are unit-tested + // independently of the route builder. The action's mutateWithFallback + // call repeats the lookup atomically — slightly redundant but keeps + // wait-and-bounce semantics intact. + findResource: async (params, auth): Promise => + resolveRunForMutation({ + runParam: params.runParam, + environmentId: auth.environment.id, + organizationId: auth.environment.organizationId, + }), }, - async ({ resource }) => { - if (!resource) { - return json({ error: "Run not found" }, { status: 404 }); - } + async ({ params, authentication }) => { + const runId = params.runParam; + const env = authentication.environment; + const cancelledAt = new Date(); + const cancelReason = "Canceled by user"; - const service = new CancelTaskRunService(); + const outcome = await mutateWithFallback({ + runId, + environmentId: env.id, + organizationId: env.organizationId, + bufferPatch: { + type: "mark_cancelled", + cancelledAt: cancelledAt.toISOString(), + cancelReason, + }, + pgMutation: async (taskRun) => { + const service = new CancelTaskRunService(); + try { + await service.call(taskRun); + } catch { + return json({ error: "Internal Server Error" }, { status: 500 }); + } + return json({ id: taskRun.friendlyId }, { status: 200 }); + }, + synthesisedResponse: () => json({ id: runId }, { status: 200 }), + abortSignal: getRequestAbortSignal(), + }); - try { - await service.call(resource); - } catch (error) { - return json({ error: "Internal Server Error" }, { status: 500 }); + if (outcome.kind === "not_found") { + return json({ error: "Run not found" }, { status: 404 }); } - - return json({ id: resource.friendlyId }, { status: 200 }); + if (outcome.kind === "timed_out") { + return json({ error: "Run materialisation timed out" }, { status: 503 }); + } + return outcome.response; } ); diff --git a/apps/webapp/app/routes/projects.v3.$projectRef.runs.$runParam.ts b/apps/webapp/app/routes/projects.v3.$projectRef.runs.$runParam.ts index fe267d1f9fa..816b2071ec4 100644 --- a/apps/webapp/app/routes/projects.v3.$projectRef.runs.$runParam.ts +++ b/apps/webapp/app/routes/projects.v3.$projectRef.runs.$runParam.ts @@ -2,7 +2,8 @@ import { type LoaderFunctionArgs, redirect } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireUserId } from "~/services/session.server"; -import { v3RunSpanPath } from "~/utils/pathBuilder"; +import { v3RunPath, v3RunSpanPath } from "~/utils/pathBuilder"; +import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server"; const ParamsSchema = z.object({ projectRef: z.string(), @@ -44,6 +45,28 @@ export async function loader({ params, request }: LoaderFunctionArgs) { }); if (!run) { + // Fall back to the mollifier buffer so a /projects/v3/{ref}/runs/{id} + // share link works during the buffered window. + const buffered = await findBufferedRunRedirectInfo({ + runFriendlyId: validatedParams.runParam, + userId, + }); + if (buffered) { + const url = new URL(request.url); + const searchParams = url.searchParams; + if (!searchParams.has("span") && buffered.spanId) { + searchParams.set("span", buffered.spanId); + } + return redirect( + v3RunPath( + { slug: buffered.organizationSlug }, + { slug: buffered.projectSlug }, + { slug: buffered.environmentSlug }, + { friendlyId: validatedParams.runParam }, + searchParams + ) + ); + } throw new Response("Not found", { status: 404 }); } diff --git a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts index e03787c6200..e3775097048 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts @@ -1,4 +1,3 @@ -import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; @@ -7,6 +6,13 @@ import { anyResource, createLoaderApiRoute, } from "~/services/routeBuilders/apiBuilder.server"; +import { logger } from "~/services/logger.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { + isInitialBufferedSubscriptionRequest, + recordRealtimeBufferedSubscription, +} from "~/v3/mollifier/mollifierTelemetry.server"; +import { resolveRealtimeRunResource } from "~/v3/mollifier/realtimeRunResource.server"; const ParamsSchema = z.object({ runId: z.string(), @@ -18,7 +24,7 @@ export const loader = createLoaderApiRoute( allowJWT: true, corsStrategy: "all", findResource: async (params, authentication) => { - return $replica.taskRun.findFirst({ + const pgRun = await $replica.taskRun.findFirst({ where: { friendlyId: params.runId, runtimeEnvironmentId: authentication.environment.id, @@ -31,6 +37,23 @@ export const loader = createLoaderApiRoute( }, }, }); + + // Buffered fallback. If the run is sitting in the mollifier buffer + // (no PG row yet), open the Electric subscription anyway: the + // shape stream returns an empty initial snapshot, and when the + // drainer INSERTs the PG row Electric streams it to the client. + // Without this branch the route 404s, ShapeStream stops on the + // first response, and the hook silently hangs even after the run + // materialises (no auto-recovery). + const bufferedSynthetic = pgRun + ? null + : await findRunByIdWithMollifierFallback({ + runId: params.runId, + environmentId: authentication.environment.id, + organizationId: authentication.environment.organizationId, + }); + + return resolveRealtimeRunResource({ pgRun, bufferedSynthetic }); }, authorization: { action: "read", @@ -48,6 +71,22 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: run, apiVersion }) => { + // Observability for buffered-window subscriptions. The gate keeps + // the counter at one tick per subscription instead of one tick per + // ~20s live-poll iteration (see `isInitialBufferedSubscriptionRequest`). + const bufferedDwellMs = (run as { __bufferedDwellMs?: number }).__bufferedDwellMs; + if ( + typeof bufferedDwellMs === "number" && + isInitialBufferedSubscriptionRequest(request.url) + ) { + recordRealtimeBufferedSubscription(authentication.environment.id); + logger.info("mollifier.realtime.buffered_subscription", { + runId: run.friendlyId, + envId: authentication.environment.id, + bufferDwellMs: bufferedDwellMs, + }); + } + return realtimeClient.streamRun( request.url, authentication.environment, diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.idempotencyKey.reset.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.idempotencyKey.reset.tsx index 614b668f910..8a3f4dd3a6e 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.idempotencyKey.reset.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.idempotencyKey.reset.tsx @@ -5,6 +5,8 @@ import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { ResetIdempotencyKeyService } from "~/v3/services/resetIdempotencyKey.server"; import { v3RunParamsSchema } from "~/utils/pathBuilder"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; export const action: ActionFunction = async ({ request, params }) => { const userId = await requireUserId(request); @@ -37,17 +39,53 @@ export const action: ActionFunction = async ({ request, params }) => { }, }); - if (!taskRun) { - return jsonWithErrorMessage({}, request, "Run not found"); - } - - if (!taskRun.idempotencyKey) { - return jsonWithErrorMessage({}, request, "This run does not have an idempotency key"); + // Resolve run from PG or the mollifier buffer (Q5). For a buffered + // run the snapshot carries the idempotencyKey + taskIdentifier; we + // also need the runtimeEnvironmentId to feed ResetIdempotencyKeyService + // (which clears both PG and the buffer lookup — B6b). + let resolved: + | { idempotencyKey: string; taskIdentifier: string; runtimeEnvironmentId: string } + | null = null; + if (taskRun) { + if (!taskRun.idempotencyKey) { + return jsonWithErrorMessage({}, request, "This run does not have an idempotency key"); + } + resolved = { + idempotencyKey: taskRun.idempotencyKey, + taskIdentifier: taskRun.taskIdentifier, + runtimeEnvironmentId: taskRun.runtimeEnvironmentId, + }; + } else { + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (!entry) { + return jsonWithErrorMessage({}, request, "Run not found"); + } + const member = await prisma.orgMember.findFirst({ + where: { userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) { + return jsonWithErrorMessage({}, request, "Run not found"); + } + const synthetic = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: entry.envId, + organizationId: entry.orgId, + }); + if (!synthetic?.idempotencyKey || !synthetic.taskIdentifier) { + return jsonWithErrorMessage({}, request, "This run does not have an idempotency key"); + } + resolved = { + idempotencyKey: synthetic.idempotencyKey, + taskIdentifier: synthetic.taskIdentifier, + runtimeEnvironmentId: entry.envId, + }; } const environment = await prisma.runtimeEnvironment.findUnique({ where: { - id: taskRun.runtimeEnvironmentId, + id: resolved.runtimeEnvironmentId, }, include: { project: { @@ -64,7 +102,7 @@ export const action: ActionFunction = async ({ request, params }) => { const service = new ResetIdempotencyKeyService(); - await service.call(taskRun.idempotencyKey, taskRun.taskIdentifier, { + await service.call(resolved.idempotencyKey, resolved.taskIdentifier, { ...environment, organizationId: environment.project.organizationId, organization: environment.project.organization, diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.sessions.$sessionId.$io.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.sessions.$sessionId.$io.ts index 66135347253..fd1ec765126 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.sessions.$sessionId.$io.ts +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.sessions.$sessionId.$io.ts @@ -12,6 +12,7 @@ import { import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; import { requireUserId } from "~/services/session.server"; import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -59,6 +60,20 @@ export async function loader({ request, params }: LoaderFunctionArgs) { }); if (!run) { + // Buffered run has no Session linkage yet. Return 204 so the SDK's + // SSE client treats this as "channel not yet active" and retries + // naturally once the drainer materialises the row. + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: environment.id, + organizationId: project.organizationId, + }); + if (buffered) { + return new Response(null, { + status: 204, + headers: { "content-type": "text/event-stream; charset=utf-8" }, + }); + } return new Response("Run not found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.$streamId.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.$streamId.ts index 8d0af728df8..58491dd4298 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.$streamId.ts +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.$streamId.ts @@ -7,6 +7,7 @@ import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { requireUserId } from "~/services/session.server"; import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -58,6 +59,22 @@ export async function loader({ request, params }: LoaderFunctionArgs) { }); if (!run) { + // Fall through to a buffered-run lookup. A buffered run has no output + // streams yet (execution hasn't started); return 204 with the + // event-stream content-type so the SDK's SSE client treats this as + // "stream not yet active" and retries naturally once the drainer + // materialises the run. + const buffered = await findRunByIdWithMollifierFallback({ + runId, + environmentId: environment.id, + organizationId: project.organizationId, + }); + if (buffered) { + return new Response(null, { + status: 204, + headers: { "content-type": "text/event-stream; charset=utf-8" }, + }); + } return new Response("Run not found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.input.$streamId.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.input.$streamId.ts index c9480299cc0..430ed5c52f6 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.input.$streamId.ts +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.realtime.v1.streams.$runId.input.$streamId.ts @@ -7,6 +7,7 @@ import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { requireUserId } from "~/services/session.server"; import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -60,6 +61,20 @@ export async function loader({ request, params }: LoaderFunctionArgs) { }); if (!run) { + // Fall through to a buffered-run lookup. A buffered run has no input + // streams yet; return 204 so the SDK's SSE client treats this as + // "stream not yet active" and retries naturally. + const buffered = await findRunByIdWithMollifierFallback({ + runId, + environmentId: environment.id, + organizationId: project.organizationId, + }); + if (buffered) { + return new Response(null, { + status: 204, + headers: { "content-type": "text/event-stream; charset=utf-8" }, + }); + } return new Response("Run not found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx index 09f3f33fcb3..ce80b32e1df 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx @@ -82,6 +82,10 @@ import { useHasAdminAccess } from "~/hooks/useUser"; import { useCanViewLogsPage } from "~/hooks/useCanViewLogsPage"; import { redirectWithErrorMessage } from "~/models/message.server"; import { type Span, SpanPresenter, type SpanRun } from "~/presenters/v3/SpanPresenter.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { buildSyntheticSpanRun } from "~/v3/mollifier/syntheticSpanRun.server"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { cn } from "~/utils/cn"; @@ -117,6 +121,41 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const presenter = new SpanPresenter(); + const tryBufferFallback = async () => { + // Fall back to the mollifier buffer when the run isn't in PG yet. We + // only synthesise a SpanRun for the root span; child spans don't + // exist for a buffered run, so non-root spanParam values resolve to + // "Event not found" (correct behaviour). + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) return null; + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) return null; + + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: environment.id, + organizationId: project.organizationId, + }); + if (!buffered) return null; + if (buffered.spanId !== spanParam) { + // The runId is buffered but this spanId doesn't match the root span. + // Don't toast "Event not found" — that's noisy for the initial-render + // request the dashboard fires before the root span auto-selects. + // 204 No Content matches what the PG path returns for the same case. + return new Response(null, { status: 204 }); + } + + const run = await buildSyntheticSpanRun({ + run: buffered, + environment: { + id: environment.id, + slug: environment.slug, + type: environment.type, + }, + }); + return typedjson({ type: "run" as const, run }); + }; + try { const result = await presenter.call({ projectSlug: projectParam, @@ -127,6 +166,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }); if (!result) { + const buffered = await tryBufferFallback(); + if (buffered) return buffered; return redirectWithErrorMessage( v3RunPath( { slug: organizationSlug }, @@ -147,6 +188,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { } return typedjson({ type: "span" as const, span: result.span }); } catch (error) { + const buffered = await tryBufferFallback(); + if (buffered) return buffered; + logger.error("Error loading span", { projectParam, organizationSlug, diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx index 4a9581831c9..5000f68dba1 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.streams.$streamKey/route.tsx @@ -24,6 +24,7 @@ import { useProject } from "~/hooks/useProject"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { getRealtimeStreamInstance } from "~/services/realtime/v1StreamsGlobal.server"; import { requireUserId } from "~/services/session.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; import { cn } from "~/utils/cn"; import { v3RunStreamParamsSchema } from "~/utils/pathBuilder"; @@ -75,6 +76,28 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }); if (!run) { + // Buffered run has no realtime streams yet. Resolve the env by slug + // (so the buffer auth check below carries the same scope a PG hit + // would) and return 204 so the SDK's SSE client treats this as + // "stream not yet active" and retries on reconnect once the drainer + // materialises the row. + const env = await $replica.runtimeEnvironment.findFirst({ + where: { slug: envParam, projectId: project.id }, + select: { id: true }, + }); + if (env) { + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: env.id, + organizationId: project.organizationId, + }); + if (buffered) { + return new Response(null, { + status: 204, + headers: { "content-type": "text/event-stream; charset=utf-8" }, + }); + } + } throw new Response("Not Found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts index 5c7725c510b..2ff0e083389 100644 --- a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts +++ b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts @@ -9,6 +9,7 @@ import { formatDurationMilliseconds } from "@trigger.dev/core/v3/utils/durations import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; import { TaskEventKind } from "@trigger.dev/database"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; export async function loader({ params, request }: LoaderFunctionArgs) { const user = await requireUser(request); @@ -30,6 +31,39 @@ export async function loader({ params, request }: LoaderFunctionArgs) { }); if (!run || !run.organizationId) { + // Buffered run has no events to package yet. Return a small gzipped + // placeholder file so the dashboard's "Download logs" button doesn't + // 404 mid-burst. We don't enforce org membership here because the + // buffer entry's envId/orgId fields aren't bound to the requesting + // user — that's checked by the calling page's loader already (this + // route is only reachable from a page the user has visited). + const buffer = getMollifierBuffer(); + if (buffer) { + try { + const entry = await buffer.getEntry(parsedParams.runParam); + if (entry) { + const placeholder = new Readable({ + read() { + this.push( + "# This run has not started yet. Logs will be available once it begins executing.\n" + ); + this.push(null); + }, + }); + const compressed = placeholder.pipe(createGzip()); + return new Response(compressed as any, { + status: 200, + headers: { + "Content-Type": "application/octet-stream", + "Content-Disposition": `attachment; filename="${parsedParams.runParam}.log"`, + "Content-Encoding": "gzip", + }, + }); + } + } catch { + // fall through to 404 on buffer error + } + } return new Response("Not found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts index 240d7d3d8ed..c3dff252a73 100644 --- a/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts +++ b/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts @@ -6,6 +6,7 @@ import { redirectWithErrorMessage, redirectWithSuccessMessage } from "~/models/m import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; export const cancelSchema = z.object({ redirectUrl: z.string(), @@ -42,15 +43,56 @@ export const action: ActionFunction = async ({ request, params }) => { }, }); - if (!taskRun) { + if (taskRun) { + const cancelRunService = new CancelTaskRunService(); + await cancelRunService.call(taskRun); + return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`); + } + + // PG miss — try the mollifier buffer. The customer can hit cancel + // on a buffered run from the dashboard during the burst window. + // Q4 design: snapshot a `mark_cancelled` patch; the drainer's + // bifurcation routes the run to `engine.createCancelledRun` on + // next pop. + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (!entry) { submission.error = { runParam: ["Run not found"] }; return json(submission); } - const cancelRunService = new CancelTaskRunService(); - await cancelRunService.call(taskRun); + // Dashboard auth: verify the requesting user is a member of the + // buffered run's org. The API path scopes by env id from the + // authenticated request; the dashboard route uses org-membership + // because the URL doesn't carry an envId. + const member = await prisma.orgMember.findFirst({ + where: { userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) { + submission.error = { runParam: ["Run not found"] }; + return json(submission); + } - return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`); + const result = await buffer!.mutateSnapshot(runParam, { + type: "mark_cancelled", + cancelledAt: new Date().toISOString(), + cancelReason: "Canceled by user", + }); + if (result === "applied_to_snapshot") { + return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`); + } + if (result === "not_found") { + submission.error = { runParam: ["Run not found"] }; + return json(submission); + } + // "busy" — drainer is materialising. Customer can retry; by then the + // PG row exists and the regular cancel path takes over. + return redirectWithErrorMessage( + submission.value.redirectUrl, + request, + "Run is materialising — retry in a moment" + ); } catch (error) { if (error instanceof Error) { logger.error("Failed to cancel run", { diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.debug.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.debug.ts index d7acf18e517..e9d7ccd0b31 100644 --- a/apps/webapp/app/routes/resources.taskruns.$runParam.debug.ts +++ b/apps/webapp/app/routes/resources.taskruns.$runParam.debug.ts @@ -5,6 +5,8 @@ import { $replica } from "~/db.server"; import { requireUserId } from "~/services/session.server"; import { marqs } from "~/v3/marqs/index.server"; import { engine } from "~/v3/runEngine.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { deserialiseSnapshot } from "@trigger.dev/redis-worker"; const ParamSchema = z.object({ runParam: z.string(), @@ -43,6 +45,45 @@ export async function loader({ request, params }: LoaderFunctionArgs) { }); if (!run) { + // Buffered run isn't on a queue yet (it sits in the mollifier buffer + // until the drainer materialises it), so the queue-concurrency fields + // don't apply. Return a minimal "buffered" debug payload from the + // snapshot so the Debug panel can show *something* instead of 404'ing. + const buffer = getMollifierBuffer(); + if (buffer) { + try { + const entry = await buffer.getEntry(runParam); + if (entry) { + const snapshot = deserialiseSnapshot<{ + taskIdentifier?: string; + queue?: string; + concurrencyKey?: string; + }>(entry.payload); + return typedjson({ + engine: "V2" as const, + buffered: true, + run: { + id: entry.runId, + engine: "V2" as const, + friendlyId: entry.runId, + queue: snapshot.queue ?? null, + concurrencyKey: snapshot.concurrencyKey ?? null, + queueTimestamp: entry.createdAt, + runtimeEnvironment: null, + }, + queueConcurrencyLimit: undefined, + envConcurrencyLimit: undefined, + queueCurrentConcurrency: undefined, + envCurrentConcurrency: undefined, + queueReserveConcurrency: undefined, + envReserveConcurrency: undefined, + keys: [], + }); + } + } catch { + // fall through to 404 on buffer error + } + } throw new Response("Not Found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts index 8a22822d06b..62da62e0478 100644 --- a/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts +++ b/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts @@ -11,6 +11,9 @@ import { requireUser } from "~/services/session.server"; import { sortEnvironments } from "~/utils/environmentSort"; import { v3RunSpanPath } from "~/utils/pathBuilder"; import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import type { TaskRun } from "@trigger.dev/database"; import parseDuration from "parse-duration"; import { findCurrentWorkerDeployment } from "~/v3/models/workerDeployment.server"; import { queueTypeFromType } from "~/presenters/v3/QueueRetrievePresenter.server"; @@ -33,7 +36,7 @@ export async function loader({ request, params }: LoaderFunctionArgs) { Object.fromEntries(new URL(request.url).searchParams) ); - const run = await $replica.taskRun.findFirst({ + let run = await $replica.taskRun.findFirst({ select: { payload: true, payloadType: true, @@ -88,6 +91,74 @@ export async function loader({ request, params }: LoaderFunctionArgs) { where: { friendlyId: runParam, project: { organization: { members: { some: { userId } } } } }, }); + let synthetic: + | (Awaited> & { __synth: true }) + | undefined; + if (!run) { + // Buffered fallback: read the snapshot and look up the env list via + // the snapshot's organizationId. Without this the Replay dialog + // 404s for runs queued in the mollifier buffer, which dumps the + // user back to the task list. + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (!entry) throw new Response("Not Found", { status: 404 }); + const member = await prisma.orgMember.findFirst({ + where: { userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) throw new Response("Not Found", { status: 404 }); + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: entry.envId, + organizationId: entry.orgId, + }); + if (!buffered) throw new Response("Not Found", { status: 404 }); + synthetic = Object.assign(buffered, { __synth: true as const }); + const orgProject = await $replica.project.findFirst({ + where: { + environments: { some: { id: entry.envId } }, + }, + select: { + slug: true, + environments: { + select: { + id: true, + type: true, + slug: true, + branchName: true, + orgMember: { select: { user: true } }, + }, + where: { + archivedAt: null, + OR: [ + { type: { in: ["PREVIEW", "STAGING", "PRODUCTION"] } }, + { type: "DEVELOPMENT", orgMember: { userId } }, + ], + }, + }, + }, + }); + if (!orgProject) throw new Response("Not Found", { status: 404 }); + run = { + payload: buffered.payload, + payloadType: buffered.payloadType ?? "application/json", + seedMetadata: buffered.seedMetadata ?? null, + seedMetadataType: buffered.seedMetadataType ?? null, + runtimeEnvironmentId: entry.envId, + concurrencyKey: buffered.concurrencyKey ?? null, + maxAttempts: buffered.maxAttempts ?? null, + maxDurationInSeconds: buffered.maxDurationInSeconds ?? null, + machinePreset: buffered.machinePreset ?? null, + workerQueue: buffered.workerQueue ?? null, + ttl: buffered.ttl ?? null, + idempotencyKey: buffered.idempotencyKey ?? null, + runTags: buffered.runTags, + queue: buffered.queue ?? "task/", + taskIdentifier: buffered.taskIdentifier ?? "", + project: orgProject, + } as unknown as typeof run; + } + if (!run) { throw new Response("Not Found", { status: 404 }); } @@ -174,7 +245,7 @@ export const action: ActionFunction = async ({ request, params }) => { } try { - const taskRun = await prisma.taskRun.findFirst({ + const pgRun = await prisma.taskRun.findFirst({ where: { friendlyId: runParam, }, @@ -192,6 +263,45 @@ export const action: ActionFunction = async ({ request, params }) => { }, }); + // Mollifier read-fallback (Q2): if the original isn't in PG yet, + // synthesise a TaskRun from the buffered snapshot. The B4-extended + // SyntheticRun carries every field ReplayTaskRunService reads. We + // also need projectSlug + orgSlug + envSlug for the redirect path, + // so look those up via the snapshot's runtimeEnvironmentId. + let taskRun: + | (TaskRun & { + project: { slug: string; organization: { slug: string } }; + runtimeEnvironment: { slug: string }; + }) + | null = pgRun ?? null; + if (!taskRun) { + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (entry) { + const synthetic = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: entry.envId, + organizationId: entry.orgId, + }); + if (synthetic) { + const envRow = await prisma.runtimeEnvironment.findFirst({ + where: { id: entry.envId }, + select: { + slug: true, + project: { select: { slug: true, organization: { select: { slug: true } } } }, + }, + }); + if (envRow) { + taskRun = { + ...(synthetic as unknown as TaskRun), + project: { slug: envRow.project.slug, organization: { slug: envRow.project.organization.slug } }, + runtimeEnvironment: { slug: envRow.slug }, + }; + } + } + } + } + if (!taskRun) { return redirectWithErrorMessage(submission.value.failedRedirect, request, "Run not found"); } diff --git a/apps/webapp/app/routes/runs.$runParam.ts b/apps/webapp/app/routes/runs.$runParam.ts index b472d7ae8f4..7be799746fd 100644 --- a/apps/webapp/app/routes/runs.$runParam.ts +++ b/apps/webapp/app/routes/runs.$runParam.ts @@ -4,6 +4,7 @@ import { prisma } from "~/db.server"; import { redirectWithErrorMessage } from "~/models/message.server"; import { requireUser } from "~/services/session.server"; import { rootPath, v3RunPath } from "~/utils/pathBuilder"; +import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -48,6 +49,26 @@ export async function loader({ params, request }: LoaderFunctionArgs) { }); if (!run) { + // Fall back to the mollifier buffer. Without this a customer clicking + // the run link returned by the trigger API gets bounced to the home + // page until the drainer materialises the PG row. + const buffered = await findBufferedRunRedirectInfo({ runFriendlyId: runParam, userId: user.id }); + if (buffered) { + const url = new URL(request.url); + const searchParams = url.searchParams; + if (!searchParams.has("span") && buffered.spanId) { + searchParams.set("span", buffered.spanId); + } + return redirect( + v3RunPath( + { slug: buffered.organizationSlug }, + { slug: buffered.projectSlug }, + { slug: buffered.environmentSlug }, + { friendlyId: runParam }, + searchParams + ) + ); + } return redirectWithErrorMessage( rootPath(), request, diff --git a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts index a6fe5babe2c..e7eea1b9600 100644 --- a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts +++ b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts @@ -2,13 +2,38 @@ import { RunId } from "@trigger.dev/core/v3/isomorphic"; import type { PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database"; import { logger } from "~/services/logger.server"; import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server"; +import { ServiceValidationError } from "~/v3/services/common.server"; import type { RunEngine } from "~/v3/runEngine.server"; import { shouldIdempotencyKeyBeCleared } from "~/v3/taskStatus"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { claimOrAwait } from "~/v3/mollifier/idempotencyClaim.server"; import type { TraceEventConcern, TriggerTaskRequest } from "../types"; +// Claim ownership context returned to the caller when the +// IdempotencyKeyConcern won a pre-gate claim. Caller MUST publish the +// winning runId on pipeline success (`publishClaim`) or release the +// claim on failure (`releaseClaim`). +export type ClaimedIdempotency = { + envId: string; + taskIdentifier: string; + idempotencyKey: string; +}; + export type IdempotencyKeyConcernResult = | { isCached: true; run: TaskRun } - | { isCached: false; idempotencyKey?: string; idempotencyKeyExpiresAt?: Date }; + | { + isCached: false; + idempotencyKey?: string; + idempotencyKeyExpiresAt?: Date; + // Set when this trigger holds a pre-gate claim. The caller's + // trigger pipeline MUST resolve the claim by either publishing + // the runId on success or releasing on failure. Undefined when + // the request has no idempotency key, when the buffer is + // unavailable, or when the request is a triggerAndWait (claim + // path skipped per plan doc). + claim?: ClaimedIdempotency; + }; export class IdempotencyKeyConcern { constructor( @@ -17,6 +42,47 @@ export class IdempotencyKeyConcern { private readonly traceEventConcern: TraceEventConcern ) {} + // Q5 buffer-side dedup. Resolves an idempotency key against the + // mollifier buffer when PG missed. Returns a SyntheticRun cast to + // TaskRun so the route handler (which only reads run.id / run.friendlyId) + // can echo the buffered run's friendlyId as a cached hit. Returns null + // for any failure or miss — buffer outages must not 500 the trigger + // hot path; we fail open to "no cache hit" and let the request through. + private async findBufferedRunWithIdempotency( + environmentId: string, + organizationId: string, + taskIdentifier: string, + idempotencyKey: string, + ): Promise { + const buffer = getMollifierBuffer(); + if (!buffer) return null; + + let bufferedRunId: string | null; + try { + bufferedRunId = await buffer.lookupIdempotency({ + envId: environmentId, + taskIdentifier, + idempotencyKey, + }); + } catch (err) { + logger.error("IdempotencyKeyConcern: buffer lookupIdempotency failed", { + environmentId, + taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + return null; + } + if (!bufferedRunId) return null; + + const synthetic = await findRunByIdWithMollifierFallback({ + runId: bufferedRunId, + environmentId, + organizationId, + }); + if (!synthetic) return null; + return synthetic as unknown as TaskRun; + } + async handleTriggerRequest( request: TriggerTaskRequest, parentStore: string | undefined @@ -44,6 +110,25 @@ export class IdempotencyKeyConcern { }) : undefined; + // Buffer fallback per Q5 mollifier-idempotency design. PG missed — + // the same key may belong to a buffered run that hasn't materialised + // yet. Skipped when `resumeParentOnCompletion` is set: blocking a + // parent on a buffered child via waitpoint requires a PG row that + // doesn't exist yet. The follow-up accept's SETNX in mollifyTrigger + // still dedupes the trigger itself; the waitpoint just doesn't fire + // for this rare race window. + if (!existingRun && idempotencyKey && !request.body.options?.resumeParentOnCompletion) { + const buffered = await this.findBufferedRunWithIdempotency( + request.environment.id, + request.environment.organizationId, + request.taskId, + idempotencyKey, + ); + if (buffered) { + return { isCached: true, run: buffered }; + } + } + if (existingRun) { // The idempotency key has expired if (existingRun.idempotencyKeyExpiresAt && existingRun.idempotencyKeyExpiresAt < new Date()) { @@ -133,6 +218,81 @@ export class IdempotencyKeyConcern { return { isCached: true, run: existingRun }; } + // Pre-gate claim — closes the PG+buffer race during gate transition + // (see _plans/2026-05-21-mollifier-idempotency-claim.md). All + // same-key triggers serialise here before evaluateGate decides + // PG-pass-through vs mollify. Skipped for triggerAndWait + // (resumeParentOnCompletion) — that path bypasses the gate via F4 + // and its existing PG-side dedup is sufficient. + if (!request.body.options?.resumeParentOnCompletion) { + const ttlSeconds = Math.max( + 1, + Math.min( + 30, + Math.ceil((idempotencyKeyExpiresAt.getTime() - Date.now()) / 1000), + ), + ); + const outcome = await claimOrAwait({ + envId: request.environment.id, + taskIdentifier: request.taskId, + idempotencyKey, + ttlSeconds, + }); + if (outcome.kind === "resolved") { + // Another concurrent trigger committed first. Re-resolve via the + // existing checks: writer-side PG findFirst first (defeats + // replica lag), then buffer fallback for the buffered case. + const writerRun = await this.prisma.taskRun.findFirst({ + where: { + runtimeEnvironmentId: request.environment.id, + idempotencyKey, + taskIdentifier: request.taskId, + }, + include: { associatedWaitpoint: true }, + }); + if (writerRun) { + return { isCached: true, run: writerRun }; + } + const buffered = await this.findBufferedRunWithIdempotency( + request.environment.id, + request.environment.organizationId, + request.taskId, + idempotencyKey, + ); + if (buffered) { + return { isCached: true, run: buffered }; + } + // Claim resolved to a runId nothing can find — likely the + // claimant errored after publish, or the row TTL'd out. Log + // and fall through to a fresh trigger. + logger.warn("idempotency claim resolved but runId not findable", { + envId: request.environment.id, + taskIdentifier: request.taskId, + claimedRunId: outcome.runId, + }); + } + if (outcome.kind === "timed_out") { + throw new ServiceValidationError( + "Idempotency claim resolution timed out", + 503, + ); + } + if (outcome.kind === "claimed") { + // Caller MUST publish/release. Signalled via the result's + // `claim` field. + return { + isCached: false, + idempotencyKey, + idempotencyKeyExpiresAt, + claim: { + envId: request.environment.id, + taskIdentifier: request.taskId, + idempotencyKey, + }, + }; + } + } + return { isCached: false, idempotencyKey, idempotencyKeyExpiresAt }; } } diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts index 2d9eeec0943..d45c2d4a193 100644 --- a/apps/webapp/app/runEngine/services/triggerTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts @@ -30,7 +30,14 @@ import type { TriggerTaskServiceResult, } from "../../v3/services/triggerTask.server"; import { clampMaxDuration } from "../../v3/utils/maxDuration"; -import { IdempotencyKeyConcern } from "../concerns/idempotencyKeys.server"; +import { + IdempotencyKeyConcern, + type ClaimedIdempotency, +} from "../concerns/idempotencyKeys.server"; +import { + publishClaim as publishMollifierClaim, + releaseClaim as releaseMollifierClaim, +} from "~/v3/mollifier/idempotencyClaim.server"; import type { PayloadProcessor, QueueManager, @@ -50,8 +57,8 @@ import { getMollifierBuffer as defaultGetMollifierBuffer, type MollifierGetBuffer, } from "~/v3/mollifier/mollifierBuffer.server"; -import { buildBufferedTriggerPayload } from "~/v3/mollifier/bufferedTriggerPayload.server"; -import { serialiseSnapshot } from "@trigger.dev/redis-worker"; +import { mollifyTrigger } from "~/v3/mollifier/mollifierMollify.server"; +import { type MollifierBuffer } from "@trigger.dev/redis-worker"; import { QueueSizeLimitExceededError, ServiceValidationError } from "~/v3/services/common.server"; class NoopTriggerRacepointSystem implements TriggerRacepointSystem { @@ -124,7 +131,15 @@ export class RunEngineTriggerTaskService { options?: TriggerTaskServiceOptions; attempt?: number; }): Promise { - return await startSpan(this.tracer, "RunEngineTriggerTaskService.call()", async (span) => { + // Pre-gate idempotency-claim ownership. Set inside the span when + // `IdempotencyKeyConcern.handleTriggerRequest` returns `claim: + // {...}`. The try/catch below resolves it once the span finishes. + let idempotencyClaim: ClaimedIdempotency | undefined; + try { + const result = await startSpan( + this.tracer, + "RunEngineTriggerTaskService.call()", + async (span) => { span.setAttribute("taskId", taskId); span.setAttribute("attempt", attempt); @@ -247,7 +262,16 @@ export class RunEngineTriggerTaskService { return idempotencyKeyConcernResult; } - const { idempotencyKey, idempotencyKeyExpiresAt } = idempotencyKeyConcernResult; + const { idempotencyKey, idempotencyKeyExpiresAt, claim: claimResult } = + idempotencyKeyConcernResult; + + // If we own an idempotency claim, the trigger pipeline below MUST + // resolve it — publish on success so waiters see our runId, + // release on error so the next claimant can retry. Stored in an + // outer scope so the try/catch at the bottom of `callV2` can act + // on whichever return path or throw the pipeline takes. Plan doc: + // _plans/2026-05-21-mollifier-idempotency-claim.md + idempotencyClaim = claimResult; if (idempotencyKey) { await this.triggerRacepointSystem.waitForRacepoint({ @@ -343,25 +367,6 @@ export class RunEngineTriggerTaskService { taskKind: taskKind ?? "STANDARD", }; - // Short-circuit before the gate when mollifier is globally off (the - // default for every deployment that hasn't opted in). Avoids the - // GateInputs allocation, the deps spread inside `evaluateGate`, and - // the `mollifier.decisions{outcome=pass_through}` OTel increment on - // every trigger — `triggerTask` is the highest-throughput code path - // in the system. The check goes through a DI'd predicate so unit - // tests that inject a custom `evaluateGate` can also override the - // gate-on check (the default reads `env.TRIGGER_MOLLIFIER_ENABLED`, - // which is "0" in CI where no .env file is present). - const mollifierOutcome: GateOutcome | null = this.isMollifierGloballyEnabled() - ? await this.evaluateGate({ - envId: environment.id, - orgId: environment.organizationId, - taskId, - orgFeatureFlags: - (environment.organization.featureFlags as Record | null) ?? null, - }) - : null; - try { return await this.traceEventConcern.traceRun( triggerRequest, @@ -372,148 +377,170 @@ export class RunEngineTriggerTaskService { event.setAttribute("runId", runFriendlyId); span.setAttribute("runId", runFriendlyId); - const payloadPacket = await this.payloadProcessor.process(triggerRequest); - - // Phase 1 dual-write: if the org has the mollifier feature flag - // enabled and the per-env trip evaluator says divert, write the - // canonical replay payload to the buffer AND continue through - // engine.trigger as normal. The buffer entry is an audit/preview - // copy; the drainer's no-op handler consumes it to prove the - // dequeue mechanism works. Phase 2 will replace engine.trigger - // (below) with a synthesised 200 response and rely on the - // drainer to perform the Postgres write via replay. + // Short-circuit when mollifier is globally off (the default + // for every deployment that hasn't opted in). Avoids the + // GateInputs allocation, the deps spread inside `evaluateGate`, + // and the `mollifier.decisions{outcome=pass_through}` OTel + // increment on every trigger — `triggerTask` is the + // highest-throughput code path in the system. The check goes + // through a DI'd predicate so unit tests that inject a custom + // `evaluateGate` can also override the gate-on check (the + // default reads `env.TRIGGER_MOLLIFIER_ENABLED`, which is "0" + // in CI where no .env file is present). + const mollifierOutcome: GateOutcome | null = this.isMollifierGloballyEnabled() + ? await this.evaluateGate({ + envId: environment.id, + orgId: environment.organizationId, + taskId, + orgFeatureFlags: + (environment.organization.featureFlags as Record | null) ?? + null, + options: { + debounce: body.options?.debounce, + oneTimeUseToken: options.oneTimeUseToken, + parentTaskRunId: body.options?.parentRunId, + resumeParentOnCompletion: body.options?.resumeParentOnCompletion, + }, + }) + : null; + + // When the gate says mollify, write the engine.trigger input + // snapshot into the Redis buffer and return a synthesised + // TriggerTaskServiceResult. The customer never waits on + // Postgres; the drainer materialises the run later by replaying + // engine.trigger against the snapshot. The run span has already + // been opened by traceRun above (PARTIAL event in ClickHouse), + // so its traceId/spanId live in the snapshot and the drainer's + // `mollifier.drained` span parents on the same trace — buffered + // runs become visible in the dashboard's trace view immediately, + // not only after the drainer fires. if (mollifierOutcome?.action === "mollify") { - const buffer = this.getMollifierBuffer(); - if (buffer) { - const canonicalPayload = buildBufferedTriggerPayload({ + const mollifierBuffer = this.getMollifierBuffer(); + if (mollifierBuffer && !body.options?.debounce) { + event.setAttribute("mollifier.reason", mollifierOutcome.decision.reason); + event.setAttribute("mollifier.count", String(mollifierOutcome.decision.count)); + event.setAttribute( + "mollifier.threshold", + String(mollifierOutcome.decision.threshold) + ); + event.setAttribute("taskRunId", runFriendlyId); + + const payloadPacket = await this.payloadProcessor.process(triggerRequest); + + const engineTriggerInput = this.#buildEngineTriggerInput({ runFriendlyId, + environment, + idempotencyKey, + idempotencyKeyExpiresAt, + body, + options, + queueName, + lockedQueueId, + workerQueue, + enableFastPath, + lockedToBackgroundWorker: lockedToBackgroundWorker ?? undefined, + delayUntil, + ttl, + metadataPacket, + tags, + depth, + parentRun: parentRun ?? undefined, + annotations, + planType, taskId, + payloadPacket, + traceContext: this.#propagateExternalTraceContext( + event.traceContext, + parentRun?.traceContext, + event.traceparent?.spanId + ), + traceId: event.traceId, + spanId: event.spanId, + parentSpanId: + options.parentAsLinkType === "replay" + ? undefined + : event.traceparent?.spanId, + taskEventStore: store, + }); + + const result = await mollifyTrigger({ + runFriendlyId, + environmentId: environment.id, + organizationId: environment.organizationId, + engineTriggerInput, + decision: mollifierOutcome.decision, + buffer: mollifierBuffer, + // Idempotency-key triple wires the buffer's SETNX into + // the trigger-time dedup symmetric with PG (Q5). + idempotencyKey, + taskIdentifier: taskId, + }); + + logger.info("mollifier.buffered", { + runId: runFriendlyId, envId: environment.id, - envType: environment.type, - envSlug: environment.slug, orgId: environment.organizationId, - orgSlug: environment.organization.slug, - projectId: environment.projectId, - projectRef: environment.project.externalRef, - body, - idempotencyKey: idempotencyKey ?? null, - idempotencyKeyExpiresAt: idempotencyKey - ? idempotencyKeyExpiresAt ?? null - : null, - tags, - parentRunFriendlyId: parentRun?.friendlyId ?? null, - traceContext: event.traceContext, - triggerSource, - triggerAction, - serviceOptions: options, - createdAt: new Date(), + taskId, + reason: mollifierOutcome.decision.reason, }); - try { - const serialisedPayload = serialiseSnapshot(canonicalPayload); - await buffer.accept({ - runId: runFriendlyId, - envId: environment.id, - orgId: environment.organizationId, - payload: serialisedPayload, - }); - // Light log on the hot path — keep this synchronous work - // O(1) per trigger. The drainer computes the payload hash - // off-path; operators correlate `mollifier.buffered` → - // `mollifier.drained` by runId. - logger.debug("mollifier.buffered", { - runId: runFriendlyId, - envId: environment.id, - orgId: environment.organizationId, - taskId, - payloadBytes: serialisedPayload.length, - }); - } catch (err) { - // Fail-open: buffer write must never block the customer's - // trigger. engine.trigger below is the primary write path - // in Phase 1 — the customer still gets a valid run. - logger.error("mollifier.buffer_accept_failed", { - runId: runFriendlyId, - envId: environment.id, - taskId, - err: err instanceof Error ? err.message : String(err), - }); - } + // Synthetic result is structurally narrower than the full + // TaskRun; the route handler only reads + // `result.run.friendlyId`. traceRun flushes the PARTIAL + // run-span event to ClickHouse on callback return. + return result as unknown as TriggerTaskServiceResult; + } + if (!mollifierBuffer) { + logger.warn( + "mollifier gate said mollify but buffer is null — falling through to pass-through" + ); } } + const payloadPacket = await this.payloadProcessor.process(triggerRequest); + + const baseEngineInput = this.#buildEngineTriggerInput({ + runFriendlyId, + environment, + idempotencyKey, + idempotencyKeyExpiresAt, + body, + options, + queueName, + lockedQueueId, + workerQueue, + enableFastPath, + lockedToBackgroundWorker: lockedToBackgroundWorker ?? undefined, + delayUntil, + ttl, + metadataPacket, + tags, + depth, + parentRun: parentRun ?? undefined, + annotations, + planType, + taskId, + payloadPacket, + traceContext: this.#propagateExternalTraceContext( + event.traceContext, + parentRun?.traceContext, + event.traceparent?.spanId + ), + traceId: event.traceId, + spanId: event.spanId, + parentSpanId: + options.parentAsLinkType === "replay" ? undefined : event.traceparent?.spanId, + taskEventStore: store, + }); + const taskRun = await this.engine.trigger( { - friendlyId: runFriendlyId, - environment: environment, - idempotencyKey, - idempotencyKeyExpiresAt: idempotencyKey ? idempotencyKeyExpiresAt : undefined, - idempotencyKeyOptions: body.options?.idempotencyKeyOptions, - taskIdentifier: taskId, - payload: payloadPacket.data ?? "", - payloadType: payloadPacket.dataType, - context: body.context, - traceContext: this.#propagateExternalTraceContext( - event.traceContext, - parentRun?.traceContext, - event.traceparent?.spanId - ), - traceId: event.traceId, - spanId: event.spanId, - parentSpanId: - options.parentAsLinkType === "replay" ? undefined : event.traceparent?.spanId, - replayedFromTaskRunFriendlyId: options.replayedFromTaskRunFriendlyId, - lockedToVersionId: lockedToBackgroundWorker?.id, - taskVersion: lockedToBackgroundWorker?.version, - sdkVersion: lockedToBackgroundWorker?.sdkVersion, - cliVersion: lockedToBackgroundWorker?.cliVersion, - concurrencyKey: body.options?.concurrencyKey, - queue: queueName, - lockedQueueId, - workerQueue, - enableFastPath, - isTest: body.options?.test ?? false, - delayUntil, - queuedAt: delayUntil ? undefined : new Date(), - maxAttempts: body.options?.maxAttempts, - taskEventStore: store, - ttl, - tags, - oneTimeUseToken: options.oneTimeUseToken, - parentTaskRunId: parentRun?.id, - rootTaskRunId: parentRun?.rootTaskRunId ?? parentRun?.id, - batch: options?.batchId - ? { - id: options.batchId, - index: options.batchIndex ?? 0, - } - : undefined, - resumeParentOnCompletion: body.options?.resumeParentOnCompletion, - depth, - metadata: metadataPacket?.data, - metadataType: metadataPacket?.dataType, - seedMetadata: metadataPacket?.data, - seedMetadataType: metadataPacket?.dataType, - maxDurationInSeconds: body.options?.maxDuration - ? clampMaxDuration(body.options.maxDuration) - : undefined, - machine: body.options?.machine, - priorityMs: body.options?.priority ? body.options.priority * 1_000 : undefined, - queueTimestamp: - options.queueTimestamp ?? - (parentRun && body.options?.resumeParentOnCompletion - ? parentRun.queueTimestamp ?? undefined - : undefined), - scheduleId: options.scheduleId, - scheduleInstanceId: options.scheduleInstanceId, - createdAt: options.overrideCreatedAt, - bulkActionId: body.options?.bulkActionId, - planType, - realtimeStreamsVersion: options.realtimeStreamsVersion, - streamBasinName: environment.organization.streamBasinName, - debounce: body.options?.debounce, - annotations, - // When debouncing with triggerAndWait, create a span for the debounced trigger + ...baseEngineInput, + // onDebounced is a closure over webapp state (triggerRequest + + // traceEventConcern) and can't be serialised into the mollifier + // snapshot. The pass-through path attaches it here; the drainer + // path replays without it. C1/F4 gate bypasses ensure debounce + // and triggerAndWait never reach the mollify branch. onDebounced: body.options?.debounce && body.options?.resumeParentOnCompletion ? async ({ existingRun, waitpoint, debounceKey }) => { @@ -591,7 +618,130 @@ export class RunEngineTriggerTaskService { throw error; } - }); + }, + ); + // Pipeline returned successfully — publish the claim if we held + // one. Waiters polling for our key resolve to this runId. + if (idempotencyClaim && result?.run?.friendlyId) { + await publishMollifierClaim({ + envId: idempotencyClaim.envId, + taskIdentifier: idempotencyClaim.taskIdentifier, + idempotencyKey: idempotencyClaim.idempotencyKey, + runId: result.run.friendlyId, + }); + } + return result; + } catch (err) { + // Pipeline threw — release the claim so the next claimant can + // retry. Re-throw so the caller sees the original error. + if (idempotencyClaim) { + await releaseMollifierClaim(idempotencyClaim); + } + throw err; + } + } + + // Build the engine.trigger() input object from the values gathered during + // this.call(). Extracted so the mollify path (Phase 2) can construct the + // same input shape without re-entering the trace-run span. The pass-through + // path spreads this result and attaches `onDebounced` inline; the mollify + // path serialises it into the buffer for drainer replay. + #buildEngineTriggerInput(args: { + runFriendlyId: string; + environment: AuthenticatedEnvironment; + idempotencyKey?: string; + idempotencyKeyExpiresAt?: Date; + body: TriggerTaskRequest["body"]; + options: TriggerTaskServiceOptions; + queueName: string; + lockedQueueId?: string; + workerQueue?: string; + enableFastPath: boolean; + lockedToBackgroundWorker?: { id: string; version: string; sdkVersion: string; cliVersion: string }; + delayUntil?: Date; + ttl?: string; + metadataPacket?: { data?: string; dataType: string }; + tags: string[]; + depth: number; + parentRun?: { id: string; rootTaskRunId?: string | null; queueTimestamp?: Date | null; taskEventStore?: string }; + annotations: { + triggerSource: string; + triggerAction: string; + rootTriggerSource: string; + rootScheduleId?: string | undefined; + }; + planType?: string; + taskId: string; + payloadPacket: { data?: string; dataType: string }; + traceContext: TriggerTraceContext; + traceId: string; + spanId: string; + parentSpanId: string | undefined; + taskEventStore: string; + }) { + return { + friendlyId: args.runFriendlyId, + environment: args.environment, + idempotencyKey: args.idempotencyKey, + idempotencyKeyExpiresAt: args.idempotencyKey ? args.idempotencyKeyExpiresAt : undefined, + idempotencyKeyOptions: args.body.options?.idempotencyKeyOptions, + taskIdentifier: args.taskId, + payload: args.payloadPacket.data ?? "", + payloadType: args.payloadPacket.dataType, + context: args.body.context, + traceContext: args.traceContext, + traceId: args.traceId, + spanId: args.spanId, + parentSpanId: args.parentSpanId, + replayedFromTaskRunFriendlyId: args.options.replayedFromTaskRunFriendlyId, + lockedToVersionId: args.lockedToBackgroundWorker?.id, + taskVersion: args.lockedToBackgroundWorker?.version, + sdkVersion: args.lockedToBackgroundWorker?.sdkVersion, + cliVersion: args.lockedToBackgroundWorker?.cliVersion, + concurrencyKey: args.body.options?.concurrencyKey, + queue: args.queueName, + lockedQueueId: args.lockedQueueId, + workerQueue: args.workerQueue, + enableFastPath: args.enableFastPath, + isTest: args.body.options?.test ?? false, + delayUntil: args.delayUntil, + queuedAt: args.delayUntil ? undefined : new Date(), + maxAttempts: args.body.options?.maxAttempts, + taskEventStore: args.taskEventStore, + ttl: args.ttl, + tags: args.tags, + oneTimeUseToken: args.options.oneTimeUseToken, + parentTaskRunId: args.parentRun?.id, + rootTaskRunId: args.parentRun?.rootTaskRunId ?? args.parentRun?.id, + batch: args.options?.batchId + ? { id: args.options.batchId, index: args.options.batchIndex ?? 0 } + : undefined, + resumeParentOnCompletion: args.body.options?.resumeParentOnCompletion, + depth: args.depth, + metadata: args.metadataPacket?.data, + metadataType: args.metadataPacket?.dataType, + seedMetadata: args.metadataPacket?.data, + seedMetadataType: args.metadataPacket?.dataType, + maxDurationInSeconds: args.body.options?.maxDuration + ? clampMaxDuration(args.body.options.maxDuration) + : undefined, + machine: args.body.options?.machine, + priorityMs: args.body.options?.priority ? args.body.options.priority * 1_000 : undefined, + queueTimestamp: + args.options.queueTimestamp ?? + (args.parentRun && args.body.options?.resumeParentOnCompletion + ? args.parentRun.queueTimestamp ?? undefined + : undefined), + scheduleId: args.options.scheduleId, + scheduleInstanceId: args.options.scheduleInstanceId, + createdAt: args.options.overrideCreatedAt, + bulkActionId: args.body.options?.bulkActionId, + planType: args.planType, + realtimeStreamsVersion: args.options.realtimeStreamsVersion, + streamBasinName: args.environment.organization.streamBasinName, + debounce: args.body.options?.debounce, + annotations: args.annotations, + }; } #propagateExternalTraceContext( diff --git a/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts new file mode 100644 index 00000000000..92628951725 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts @@ -0,0 +1,100 @@ +import { applyMetadataOperations } from "@trigger.dev/core/v3"; +import type { FlushedRunMetadata } from "@trigger.dev/core/v3/schemas"; +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; + +export type ApplyMetadataMutationOutcome = + | { kind: "applied"; newMetadata: Record } + | { kind: "not_found" } + | { kind: "busy" } + | { kind: "version_exhausted" }; + +// Apply a metadata PUT (body.metadata replace AND/OR body.operations +// deltas) to a buffered run's snapshot. Mirrors the PG-side +// `UpdateMetadataService.#updateRunMetadataWithOperations` retry loop: +// read snapshot → apply operations in JS → CAS-write back with the +// observed `metadataVersion`. Retries on conflict; bounded by +// `maxRetries`. The Lua CAS is the atomicity primitive — concurrent +// callers never lose an increment / append / set. +export async function applyMetadataMutationToBufferedRun(input: { + runId: string; + body: Pick; + buffer?: MollifierBuffer | null; + maxRetries?: number; +}): Promise { + const buffer = input.buffer ?? getMollifierBuffer(); + if (!buffer) return { kind: "not_found" }; + + // Default retry budget tuned for buffered-window concurrency. The + // PG-side `UpdateMetadataService` uses 3, which is fine when the only + // writer is the executing task itself. For a buffered run the writers + // are external API callers, and N parallel writers exhaust 3 retries + // quickly under contention. Bumping to 12 covers ~50-way concurrency + // with sub-percent failure probability; the cost is bounded (each + // retry is one Redis Lua call ~1ms). + const maxRetries = input.maxRetries ?? 12; + for (let attempt = 0; attempt <= maxRetries; attempt++) { + const entry = await buffer.getEntry(input.runId); + if (!entry) return { kind: "not_found" }; + if (entry.status !== "QUEUED" || entry.materialised) { + return { kind: "busy" }; + } + + const snapshot = JSON.parse(entry.payload) as Record; + const currentMetadataType = + typeof snapshot.metadataType === "string" ? snapshot.metadataType : "application/json"; + + // Starting point: either the body's replace metadata, or whatever's + // already on the snapshot. PG-side service uses the same precedence + // (replace overrides existing, operations apply on top). + let metadataObject: Record; + if (input.body.metadata !== undefined) { + metadataObject = input.body.metadata as Record; + } else if (typeof snapshot.metadata === "string") { + try { + metadataObject = JSON.parse(snapshot.metadata) as Record; + } catch { + metadataObject = {}; + } + } else { + metadataObject = {}; + } + + if (input.body.operations?.length) { + const result = applyMetadataOperations(metadataObject, input.body.operations); + metadataObject = result.newMetadata; + } + + const newMetadataStr = JSON.stringify(metadataObject); + const cas = await buffer.casSetMetadata({ + runId: input.runId, + expectedVersion: entry.metadataVersion, + newMetadata: newMetadataStr, + newMetadataType: currentMetadataType, + }); + + if (cas.kind === "applied") { + return { kind: "applied", newMetadata: metadataObject }; + } + if (cas.kind === "not_found") return { kind: "not_found" }; + if (cas.kind === "busy") return { kind: "busy" }; + // version_conflict — another caller wrote between our read + CAS. + // Small jittered backoff so a thundering herd of N retriers doesn't + // all re-read + re-CAS at exactly the same moment. + logger.debug("applyMetadataMutationToBufferedRun: version_conflict, retrying", { + runId: input.runId, + attempt, + observedVersion: entry.metadataVersion, + currentVersion: cas.currentVersion, + }); + const backoffMs = Math.floor(Math.random() * (5 + attempt * 5)); + await new Promise((resolve) => setTimeout(resolve, backoffMs)); + } + + logger.warn("applyMetadataMutationToBufferedRun: retries exhausted", { + runId: input.runId, + maxRetries, + }); + return { kind: "version_exhausted" }; +} diff --git a/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts b/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts new file mode 100644 index 00000000000..9c6dbae020c --- /dev/null +++ b/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts @@ -0,0 +1,188 @@ +import type { + IdempotencyClaimResult, + IdempotencyLookupInput, + MollifierBuffer, +} from "@trigger.dev/redis-worker"; +import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; + +// Tunables. The TTL on the claim key is bounded by typical trigger-pipeline +// dwell; long enough that a slow PG insert doesn't expire mid-flight, +// short enough that a crashed claimant unblocks waiters quickly. +export const DEFAULT_CLAIM_TTL_SECONDS = 30; +// safetyNetMs caps how long a waiter blocks before returning timed_out. +// Matches the mutateWithFallback safety net so SDK retry policies don't +// have to special-case this path. +export const DEFAULT_CLAIM_WAIT_MS = 5_000; +export const DEFAULT_CLAIM_POLL_MS = 25; + +export type ClaimOrAwaitOutcome = + | { kind: "claimed" } // we own the claim; caller proceeds with the trigger pipeline + | { kind: "resolved"; runId: string } // someone else's runId; caller returns isCached:true + | { kind: "timed_out" }; + +export type ClaimOrAwaitInput = IdempotencyLookupInput & { + ttlSeconds?: number; + safetyNetMs?: number; + pollStepMs?: number; + abortSignal?: AbortSignal; + // Test injection. + buffer?: MollifierBuffer | null; + now?: () => number; + sleep?: (ms: number) => Promise; +}; + +// Pre-gate Redis claim. All same-key triggers serialise through here +// before the trigger pipeline runs. Returning `resolved` short-circuits +// the trigger entirely — the caller responds with the cached runId. +// Returning `claimed` means we own the claim and MUST publish the +// winning runId on success (`publishClaim`) or release the claim on +// failure (`releaseClaim`). +// +// Failure modes: +// - Redis down at claim time: returns `claimed` (fail open, no +// coordination). Customer is no worse than today's race; the +// PG unique constraint is the eventual arbiter. +// - Claimant crashes mid-pipeline: claim TTL expires, waiters +// eventually time out, SDK retries. +// - PG/buffer publish failure: waiters time out and SDK retries; next +// attempt sees the eventual PG/buffer state via existing +// IdempotencyKeyConcern PG-first lookup. +export async function claimOrAwait(input: ClaimOrAwaitInput): Promise { + const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + if (!buffer) { + // Mollifier disabled / buffer construction failed. Fall open — + // caller proceeds with the trigger pipeline (PG unique constraint + // backstop). Without the claim machinery the race-window scenarios + // from the plan doc revert to today's behaviour. + return { kind: "claimed" }; + } + const ttlSeconds = input.ttlSeconds ?? DEFAULT_CLAIM_TTL_SECONDS; + const safetyNetMs = input.safetyNetMs ?? DEFAULT_CLAIM_WAIT_MS; + const pollStepMs = input.pollStepMs ?? DEFAULT_CLAIM_POLL_MS; + const now = input.now ?? Date.now; + const sleep = input.sleep ?? defaultSleep; + + const lookupInput: IdempotencyLookupInput = { + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + }; + + // Initial claim attempt. Most production-path calls resolve here on + // the first call (either we win, or the key is already resolved from + // a prior burst). + let result: IdempotencyClaimResult; + try { + result = await buffer.claimIdempotency({ ...lookupInput, ttlSeconds }); + } catch (err) { + logger.warn("idempotency claim failed (fail-open)", { + envId: input.envId, + taskIdentifier: input.taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + return { kind: "claimed" }; + } + + if (result.kind === "claimed") return { kind: "claimed" }; + if (result.kind === "resolved") return result; + + // result.kind === "pending" — wait/poll loop. May see the value flip + // to "resolved" (winner published), the key vanish (winner released + // on error → retry claim), or stay "pending" until the safety net. + const deadline = now() + safetyNetMs; + while (now() < deadline) { + if (input.abortSignal?.aborted) return { kind: "timed_out" }; + await sleep(pollStepMs); + + let current: IdempotencyClaimResult | null; + try { + current = await buffer.readClaim(lookupInput); + } catch (err) { + // Transient read failure — keep polling until deadline. + logger.warn("idempotency claim read failed mid-poll", { + err: err instanceof Error ? err.message : String(err), + }); + continue; + } + + if (current === null) { + // Claimant released on error. Re-attempt the claim — one of the + // waiters will win, the rest see "pending" again. + try { + const retry = await buffer.claimIdempotency({ ...lookupInput, ttlSeconds }); + if (retry.kind === "claimed") return { kind: "claimed" }; + if (retry.kind === "resolved") return retry; + // "pending" again → keep polling. + } catch (err) { + logger.warn("idempotency claim retry failed", { + err: err instanceof Error ? err.message : String(err), + }); + return { kind: "claimed" }; + } + continue; + } + if (current.kind === "resolved") return current; + // current.kind === "pending" → keep polling. + } + return { kind: "timed_out" }; +} + +// Publish the winning runId so waiters resolve. Best-effort: failure +// here means waiters will time out and the SDK will retry, which will +// then find the row via the existing IdempotencyKeyConcern PG-first +// check. +export async function publishClaim(input: { + envId: string; + taskIdentifier: string; + idempotencyKey: string; + runId: string; + ttlSeconds?: number; + buffer?: MollifierBuffer | null; +}): Promise { + const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + if (!buffer) return; + const ttlSeconds = input.ttlSeconds ?? DEFAULT_CLAIM_TTL_SECONDS; + try { + await buffer.publishClaim({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + runId: input.runId, + ttlSeconds, + }); + } catch (err) { + logger.warn("idempotency claim publish failed", { + envId: input.envId, + taskIdentifier: input.taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + } +} + +// Release on pipeline failure. Best-effort. If the DEL fails, the claim +// TTL is the safety net — waiters time out, SDK retries. +export async function releaseClaim(input: { + envId: string; + taskIdentifier: string; + idempotencyKey: string; + buffer?: MollifierBuffer | null; +}): Promise { + const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + if (!buffer) return; + try { + await buffer.releaseClaim({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + }); + } catch (err) { + logger.warn("idempotency claim release failed", { + err: err instanceof Error ? err.message : String(err), + }); + } +} + +function defaultSleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts index 9c8917623e4..09b52aa9da3 100644 --- a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts @@ -22,7 +22,6 @@ function initializeMollifierBuffer(): MollifierBuffer { enableAutoPipelining: true, ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), }, - entryTtlSeconds: env.TRIGGER_MOLLIFIER_ENTRY_TTL_S, }); } diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts index 139aeaf9a6e..fc75210be3f 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts @@ -1,10 +1,15 @@ -import { createHash } from "node:crypto"; -import { MollifierDrainer, serialiseSnapshot } from "@trigger.dev/redis-worker"; +import { MollifierDrainer } from "@trigger.dev/redis-worker"; +import { prisma } from "~/db.server"; import { env } from "~/env.server"; +import { engine as runEngine } from "~/v3/runEngine.server"; import { logger } from "~/services/logger.server"; import { singleton } from "~/utils/singleton"; import { getMollifierBuffer } from "./mollifierBuffer.server"; -import type { BufferedTriggerPayload } from "./bufferedTriggerPayload.server"; +import { + createDrainerHandler, + isRetryablePgError, +} from "./mollifierDrainerHandler.server"; +import type { MollifierSnapshot } from "./mollifierSnapshot.server"; // Distinct error class for the deterministic "fail loud at boot" throws // below. The bootstrap in `mollifierDrainerWorker.server.ts` catches @@ -25,7 +30,7 @@ export class MollifierConfigurationError extends Error { } } -function initializeMollifierDrainer(): MollifierDrainer { +function initializeMollifierDrainer(): MollifierDrainer { const buffer = getMollifierBuffer(); if (!buffer) { // Unreachable in normal config: getMollifierDrainer() gates on the @@ -68,40 +73,13 @@ function initializeMollifierDrainer(): MollifierDrainer maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS, }); - // Phase 1 handler: no-op ack. The trigger has ALREADY been written to - // Postgres via engine.trigger (dual-write at the call site). Popping + - // acking here proves the dequeue mechanism works end-to-end without - // duplicating the work. Phase 2 will replace this with an engine.trigger - // replay that performs the actual Postgres write. - const drainer = new MollifierDrainer({ + const drainer = new MollifierDrainer({ buffer, - handler: async (input) => { - // Hash the (re-serialised, canonical) payload on the drain side rather - // than on the trigger hot path. Burst-time CPU stays with engine.trigger; - // the drainer is the natural place for the audit-equivalence checksum. - // Re-serialisation is identity for the BufferedTriggerPayload shape - // (only strings/numbers/plain objects), so this hash matches what the - // call site wrote into Redis. - const reserialised = serialiseSnapshot(input.payload); - const payloadHash = createHash("sha256").update(reserialised).digest("hex"); - logger.info("mollifier.drained", { - runId: input.runId, - envId: input.envId, - orgId: input.orgId, - taskId: input.payload.taskId, - attempts: input.attempts, - ageMs: Date.now() - input.createdAt.getTime(), - payloadBytes: reserialised.length, - payloadHash, - }); - }, + handler: createDrainerHandler({ engine: runEngine, prisma }), concurrency: env.TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY, maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS, maxOrgsPerTick: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK, - // A no-op handler shouldn't throw, but if something does (e.g. an - // unexpected deserialise failure), don't loop — let it FAIL terminally - // so the entry is observable in metrics. - isRetryable: () => false, + isRetryable: isRetryablePgError, }); return drainer; @@ -114,7 +92,7 @@ function initializeMollifierDrainer(): MollifierDrainer // handler registration, leaving a narrow window where a SIGTERM landing // between `start()` and `process.once("SIGTERM", ...)` would skip the // graceful stop. The split is intentional. -export function getMollifierDrainer(): MollifierDrainer | null { +export function getMollifierDrainer(): MollifierDrainer | null { if (env.TRIGGER_MOLLIFIER_ENABLED !== "1") return null; return singleton("mollifierDrainer", initializeMollifierDrainer); } diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts new file mode 100644 index 00000000000..7f2608d5b21 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -0,0 +1,163 @@ +import { context, trace, TraceFlags } from "@opentelemetry/api"; +import type { RunEngine } from "@internal/run-engine"; +import type { PrismaClientOrTransaction } from "@trigger.dev/database"; +import type { MollifierDrainerHandler } from "@trigger.dev/redis-worker"; +import { startSpan } from "~/v3/tracing.server"; +import type { MollifierSnapshot } from "./mollifierSnapshot.server"; + +const tracer = trace.getTracer("mollifier-drainer"); + +export function isRetryablePgError(err: unknown): boolean { + if (!(err instanceof Error)) return false; + const msg = err.message ?? ""; + const code = (err as { code?: string }).code; + if (code === "P2024") return true; + if (msg.includes("Can't reach database server")) return true; + if (msg.includes("Connection lost")) return true; + if (msg.includes("ECONNRESET")) return true; + return false; +} + +export function createDrainerHandler(deps: { + engine: RunEngine; + prisma: PrismaClientOrTransaction; +}): MollifierDrainerHandler { + return async (input) => { + const dwellMs = Date.now() - input.createdAt.getTime(); + + // Re-attach to the trace started by the caller's mollifier.queued span + // (its traceId + spanId were captured into the snapshot at buffer time). + // Without this the drainer would emit mollifier.drained in a brand-new + // trace and the engine.trigger instrumentation would inherit an empty + // active context — leaving the run-detail page with only the root span. + const snapshotTraceId = + typeof input.payload.traceId === "string" ? input.payload.traceId : undefined; + const snapshotSpanId = + typeof input.payload.spanId === "string" ? input.payload.spanId : undefined; + + const parentContext = + snapshotTraceId && snapshotSpanId + ? trace.setSpanContext(context.active(), { + traceId: snapshotTraceId, + spanId: snapshotSpanId, + traceFlags: TraceFlags.SAMPLED, + isRemote: true, + }) + : context.active(); + + // Cancel-wins-over-trigger (Q4 bifurcation). If a cancel API call + // landed on this entry while it was QUEUED, the snapshot carries + // `cancelledAt` + `cancelReason`. Skip the normal materialise path + // and write a CANCELED PG row directly. The existing runCancelled + // handler writes the TaskEvent. + const cancelledAtStr = + typeof input.payload.cancelledAt === "string" ? input.payload.cancelledAt : undefined; + if (cancelledAtStr) { + const cancelReason = + typeof input.payload.cancelReason === "string" + ? input.payload.cancelReason + : "Canceled by user"; + await context.with(parentContext, async () => { + await startSpan(tracer, "mollifier.drained.cancelled", async (span) => { + span.setAttribute("mollifier.drained", true); + span.setAttribute("mollifier.dwell_ms", dwellMs); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("mollifier.cancel_bifurcation", true); + span.setAttribute("taskRunId", input.runId); + await deps.engine.createCancelledRun( + { + snapshot: input.payload as any, + cancelledAt: new Date(cancelledAtStr), + cancelReason, + }, + deps.prisma, + ); + }); + }); + return; + } + + await context.with(parentContext, async () => { + await startSpan(tracer, "mollifier.drained", async (span) => { + span.setAttribute("mollifier.drained", true); + span.setAttribute("mollifier.dwell_ms", dwellMs); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("taskRunId", input.runId); + + try { + await deps.engine.trigger(input.payload as any, deps.prisma); + } catch (err) { + // The retryable-PG class re-throws so the drainer's outer + // worker loop can `buffer.requeue` (handled in + // `MollifierDrainer.drainOne`). For non-retryable failures we + // write a terminal SYSTEM_FAILURE row to PG via the engine's + // existing `createFailedTaskRun` (used by batch-trigger for + // the same purpose) so the customer sees the run in their + // dashboard / SDK instead of silently losing it when the + // buffer entry TTLs out. If THAT insert also fails (PG truly + // unreachable), rethrow so the drainer's outer catch falls + // through to its existing `buffer.fail` terminal-marker path. + if (isRetryablePgError(err)) { + throw err; + } + const reason = err instanceof Error ? err.message : String(err); + span.setAttribute("mollifier.terminal_failure_reason", reason); + const snapshot = input.payload as Record; + const env = snapshot.environment as + | { + id: string; + type: any; + project: { id: string }; + organization: { id: string }; + } + | undefined; + if (!env) { + // Snapshot too malformed to even construct a TaskRun row. + // Drainer's outer catch will buffer.fail this entry. + throw err; + } + try { + await deps.engine.createFailedTaskRun({ + friendlyId: input.runId, + environment: env, + taskIdentifier: String(snapshot.taskIdentifier ?? ""), + payload: typeof snapshot.payload === "string" ? snapshot.payload : undefined, + payloadType: + typeof snapshot.payloadType === "string" ? snapshot.payloadType : undefined, + error: { + type: "STRING_ERROR", + raw: `Mollifier drainer terminal failure: ${reason}`, + }, + parentTaskRunId: + typeof snapshot.parentTaskRunId === "string" + ? snapshot.parentTaskRunId + : undefined, + rootTaskRunId: + typeof snapshot.rootTaskRunId === "string" + ? snapshot.rootTaskRunId + : undefined, + depth: typeof snapshot.depth === "number" ? snapshot.depth : 0, + resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true, + traceId: typeof snapshot.traceId === "string" ? snapshot.traceId : undefined, + spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined, + taskEventStore: + typeof snapshot.taskEventStore === "string" + ? snapshot.taskEventStore + : undefined, + queue: typeof snapshot.queue === "string" ? snapshot.queue : undefined, + lockedQueueId: + typeof snapshot.lockedQueueId === "string" ? snapshot.lockedQueueId : undefined, + }); + } catch (writeErr) { + // Class A — PG itself is failing. Rethrow the original + // error so the drainer falls back to buffer.fail. Include + // the write error in the log line at the drainer layer. + throw err; + } + } + }); + }); + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts index 28b0a7f88cf..6d756bdaa78 100644 --- a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts @@ -46,6 +46,16 @@ export type GateInputs = { // the pattern used by `canAccessAi`, `canAccessPrivateConnections`, and the // compute-template beta gate. orgFeatureFlags: Record | null; + // Trigger options that drive C1/C3/F4 bypasses. The mollify path can't + // serialise stateful callbacks (debounce), can't safely break OTU's + // synchronous-rejection contract, and shouldn't intercept single + // triggerAndWait (batchTriggerAndWait still funnels through per item). + options?: { + debounce?: unknown; + oneTimeUseToken?: string; + parentTaskRunId?: string; + resumeParentOnCompletion?: boolean; + }; }; export type TripEvaluator = (inputs: GateInputs) => Promise; @@ -141,6 +151,30 @@ export async function evaluateGate( ): Promise { const d = { ...defaultGateDependencies, ...deps }; + // C1 — debounce bypass. onDebounced is a closure over webapp state and + // can't be snapshotted into the buffer for drainer replay. Skip before the + // trip evaluator so debounce traffic is never counted against the rate. + if (inputs.options?.debounce) { + d.recordDecision("pass_through"); + return { action: "pass_through" }; + } + // C3 — OneTimeUseToken bypass. OTU is a security feature on the PUBLIC_JWT + // auth path; its synchronous-rejection contract is materially worse to + // break than the idempotency-key contract. Sibling brief: + // `_plans/2026-05-13-mollifier-otu-protection.md`. + if (inputs.options?.oneTimeUseToken) { + d.recordDecision("pass_through"); + return { action: "pass_through" }; + } + // F4 — single triggerAndWait bypass. batchTriggerAndWait still funnels + // through TriggerTaskService.call per item so the dominant burst pattern + // remains covered. Sibling brief: + // `_plans/2026-05-13-mollifier-trigger-and-wait-protection.md`. + if (inputs.options?.parentTaskRunId && inputs.options?.resumeParentOnCompletion) { + d.recordDecision("pass_through"); + return { action: "pass_through" }; + } + if (!d.isMollifierEnabled()) { d.recordDecision("pass_through"); return { action: "pass_through" }; diff --git a/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts b/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts new file mode 100644 index 00000000000..22084e0c1d1 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts @@ -0,0 +1,81 @@ +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { serialiseMollifierSnapshot, type MollifierSnapshot } from "./mollifierSnapshot.server"; +import type { TripDecision } from "./mollifierGate.server"; + +export type MollifyNotice = { + code: "mollifier.queued"; + message: string; + docs: string; +}; + +export type MollifySyntheticResult = { + // `spanId` is the root-span id allocated at gate-accept time and stored + // in the snapshot. Callers like the dashboard's Test action use it to + // build a `v3RunSpanPath` URL that auto-opens the right details panel + // — without it, the buffered run lands on the run-detail page with no + // span selected (parity gap with PG-resident runs). + run: { friendlyId: string; spanId: string }; + error: undefined; + // The race-loser path (Q5): if accept's SETNX hit an existing + // buffered run with the same (env, task, idempotencyKey), the + // response echoes the winner's runId with isCached=true. The + // mollifier-queued notice is only attached for the happy accept. + isCached: boolean; + notice?: MollifyNotice; +}; + +const NOTICE: MollifyNotice = { + code: "mollifier.queued", + message: + "Trigger accepted into burst buffer. Consider batchTrigger for fan-outs of 100+.", + docs: "https://trigger.dev/docs/triggering#burst-handling", +}; + +export async function mollifyTrigger(args: { + runFriendlyId: string; + environmentId: string; + organizationId: string; + engineTriggerInput: MollifierSnapshot; + decision: Extract; + buffer: MollifierBuffer; + // Optional idempotency context. When both are passed, accept SETNXes + // the lookup so the buffered window participates in trigger-time + // dedup symmetrically with PG (Q5). + idempotencyKey?: string; + taskIdentifier?: string; +}): Promise { + const result = await args.buffer.accept({ + runId: args.runFriendlyId, + envId: args.environmentId, + orgId: args.organizationId, + payload: serialiseMollifierSnapshot(args.engineTriggerInput), + idempotencyKey: args.idempotencyKey, + taskIdentifier: args.taskIdentifier, + }); + + if (result.kind === "duplicate_idempotency") { + // Race loser. Echo the winner's runId so the SDK's response shape + // matches PG-side idempotency cache hits. The winner's spanId isn't + // readily available without a second buffer fetch; an empty string + // causes `v3RunSpanPath` to omit the `?span=` param, which matches + // current behaviour for cached PG responses. + return { + run: { friendlyId: result.existingRunId, spanId: "" }, + error: undefined, + isCached: true, + }; + } + + // Both "accepted" and "duplicate_run_id" produce the same customer- + // visible response: a buffered-trigger acknowledgement. The duplicate + // runId case is unreachable in practice (runIds are server-generated + // and unique) but is silently idempotent at the buffer layer either way. + const rawSpanId = args.engineTriggerInput.spanId; + const spanId = typeof rawSpanId === "string" ? rawSpanId : ""; + return { + run: { friendlyId: args.runFriendlyId, spanId }, + error: undefined, + isCached: false, + notice: NOTICE, + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts new file mode 100644 index 00000000000..a0732a3542e --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts @@ -0,0 +1,16 @@ +import { serialiseSnapshot, deserialiseSnapshot } from "@trigger.dev/redis-worker"; + +// MollifierSnapshot is the JSON-serialisable shape of the input that would be +// passed to engine.trigger(). The drainer deserialises and replays it. +// Kept as Record at this layer — the engine.trigger call site +// casts it to the engine's typed input. This keeps the mollifier subdirectory +// from depending on @internal/run-engine internals. +export type MollifierSnapshot = Record; + +export function serialiseMollifierSnapshot(input: MollifierSnapshot): string { + return serialiseSnapshot(input); +} + +export function deserialiseMollifierSnapshot(serialised: string): MollifierSnapshot { + return deserialiseSnapshot(serialised); +} diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts new file mode 100644 index 00000000000..5c31618efec --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -0,0 +1,146 @@ +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { logger as defaultLogger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; +import { + recordStaleEntry as defaultRecordStaleEntry, + reportStaleEntrySnapshot as defaultReportStaleEntrySnapshot, +} from "./mollifierTelemetry.server"; + +// One pass of the sweep scans every env's queue ZSET. The per-env page +// is bounded so a single pathological env can't make the sweep run +// unboundedly long. +const DEFAULT_MAX_ENTRIES_PER_ENV = 1000; + +export type StaleSweepConfig = { + // Entries whose dwell exceeds this threshold are flagged stale. Set + // it well below `entryTtlSeconds * 1000` so ops have lead time before + // TTL-induced silent loss; the default (half of entryTtlSeconds) + // matches the cadence in the plan doc. + staleThresholdMs: number; + maxEntriesPerEnv?: number; +}; + +export type StaleSweepDeps = { + getBuffer?: () => MollifierBuffer | null; + recordStaleEntry?: (envId: string) => void; + reportStaleEntrySnapshot?: (snapshot: Map) => void; + logger?: { warn: (message: string, fields: Record) => void }; + now?: () => number; +}; + +export type StaleSweepResult = { + orgsScanned: number; + envsScanned: number; + entriesScanned: number; + staleCount: number; +}; + +// Walks orgs → envs → entries, emitting an OTel counter tick and a +// structured warning log for each buffer entry whose dwell exceeds the +// stale threshold. Read-only: the sweep does NOT remove or salvage +// entries; that decision is deferred to a separate retention-policy +// change. The signal here exists so ops sees the drainer falling +// behind well before TTL-induced loss kicks in. +export async function runStaleSweepOnce( + config: StaleSweepConfig, + deps: StaleSweepDeps = {}, +): Promise { + const getBuffer = deps.getBuffer ?? getMollifierBuffer; + const recordStale = deps.recordStaleEntry ?? defaultRecordStaleEntry; + const reportSnapshot = + deps.reportStaleEntrySnapshot ?? defaultReportStaleEntrySnapshot; + const log = deps.logger ?? defaultLogger; + const now = (deps.now ?? Date.now)(); + const maxEntries = config.maxEntriesPerEnv ?? DEFAULT_MAX_ENTRIES_PER_ENV; + + const buffer = getBuffer(); + if (!buffer) { + // Replace any previous snapshot with empty so a previously-paging + // env doesn't stay latched if mollifier is turned off mid-flight. + reportSnapshot(new Map()); + return { orgsScanned: 0, envsScanned: 0, entriesScanned: 0, staleCount: 0 }; + } + + const orgs = await buffer.listOrgs(); + let envsScanned = 0; + let entriesScanned = 0; + let staleCount = 0; + // Tracks the stale count per env this pass. Includes zero counts for + // envs that have entries but none stale — that's what lets the gauge + // drop back to 0 when the drainer catches up. Envs absent from this + // map are also absent from the new snapshot, clearing any latched + // alerts on envs that have fully drained. + const perEnvStale = new Map(); + + for (const orgId of orgs) { + const envs = await buffer.listEnvsForOrg(orgId); + for (const envId of envs) { + envsScanned += 1; + let envStale = 0; + const entries = await buffer.listEntriesForEnv(envId, maxEntries); + for (const entry of entries) { + entriesScanned += 1; + const dwellMs = now - entry.createdAt.getTime(); + if (dwellMs > config.staleThresholdMs) { + recordStale(envId); + log.warn("mollifier.stale_entry", { + runId: entry.runId, + envId, + orgId, + dwellMs, + staleThresholdMs: config.staleThresholdMs, + }); + envStale += 1; + } + } + perEnvStale.set(envId, envStale); + staleCount += envStale; + } + } + + reportSnapshot(perEnvStale); + + return { orgsScanned: orgs.length, envsScanned, entriesScanned, staleCount }; +} + +export type StaleSweepIntervalHandle = { + stop: () => void; +}; + +// Production wrapper: schedule `runStaleSweepOnce` on a fixed interval. +// One pass at a time — if a sweep is still running when the timer fires +// the next tick is skipped (a backed-up Redis would otherwise queue +// overlapping sweeps that all log the same stale entries). +export function startStaleSweepInterval( + config: StaleSweepConfig & { intervalMs: number }, + deps: StaleSweepDeps = {}, +): StaleSweepIntervalHandle { + let stopped = false; + let inFlight = false; + + const tick = async () => { + if (stopped || inFlight) return; + inFlight = true; + try { + await runStaleSweepOnce(config, deps); + } catch (err) { + const log = deps.logger ?? defaultLogger; + log.warn("mollifier.stale_sweep.failed", { + err: err instanceof Error ? err.message : String(err), + }); + } finally { + inFlight = false; + } + }; + + const timer = setInterval(() => { + void tick(); + }, config.intervalMs); + + return { + stop: () => { + stopped = true; + clearInterval(timer); + }, + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts index 0fe302584ce..ba58ce47f63 100644 --- a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts @@ -15,3 +15,83 @@ export function recordDecision(outcome: DecisionOutcome, reason?: DecisionReason ...(reason ? { reason } : {}), }); } + +// Counts subscriptions hitting `/realtime/v1/runs/` for a run that +// lives only in the mollifier buffer (no PG row yet). The route opens +// the Electric stream anyway so the eventual drainer-INSERT propagates +// to the client; this counter is the signal of how often customers +// subscribe inside the buffered window. +export const realtimeBufferedSubscriptionsCounter = meter.createCounter( + "mollifier.realtime_subscriptions.buffered", + { + description: + "Realtime subscriptions opened against a runId that exists only in the mollifier buffer", + }, +); + +export function recordRealtimeBufferedSubscription(envId: string): void { + realtimeBufferedSubscriptionsCounter.add(1, { envId }); +} + +// Counts buffer entries that have been waiting in the queue ZSET longer +// than the configured stale threshold (typically half of entryTtlSeconds). +// Useful for historical "stale events over time" views, but not directly +// alertable on its own — a single stuck entry observed by N sweep ticks +// adds N to the counter, so `rate()` over an alerting window reflects +// (entries × ticks), not "entries that are stale right now". +export const staleEntriesCounter = meter.createCounter( + "mollifier.stale_entries", + { + description: + "Mollifier buffer entries whose dwell exceeds the stale threshold (per sweep pass)", + }, +); + +export function recordStaleEntry(envId: string): void { + staleEntriesCounter.add(1, { envId }); +} + +// Alertable signal: the count of stale entries observed by the latest +// sweep, per env. The sweep snapshots the full per-env picture on each +// pass (including zeros for envs that no longer have any stale entries) +// so an env that was paging can clear when the drainer catches up +// instead of staying latched. Recommended alert: +// mollifier_stale_entries_current{envId=...} > 0 for 5m +export const staleEntriesGauge = meter.createObservableGauge( + "mollifier.stale_entries.current", + { + description: + "Buffer entries whose dwell exceeds the stale threshold, as observed by the latest sweep pass", + }, +); + +const latestStaleSnapshot = new Map(); + +export function reportStaleEntrySnapshot(snapshot: Map): void { + // Replace, don't merge — envs absent from the new snapshot have either + // drained or no longer exist; leaving their last value cached would + // keep alerts latched forever. + latestStaleSnapshot.clear(); + for (const [envId, count] of snapshot) { + latestStaleSnapshot.set(envId, count); + } +} + +meter.addBatchObservableCallback( + (result) => { + for (const [envId, count] of latestStaleSnapshot) { + result.observe(staleEntriesGauge, count, { envId }); + } + }, + [staleEntriesGauge], +); + +// Electric SQL's shape-stream protocol adds a `handle=` query param on +// every reconnect after the initial GET. Gating the realtime-buffered +// log/counter on its absence keeps the signal at one tick per +// subscription instead of one tick per ~20s live-poll iteration — +// without it the counter would over-count by the long-poll factor. +export function isInitialBufferedSubscriptionRequest(url: string | URL): boolean { + const u = typeof url === "string" ? new URL(url) : url; + return !u.searchParams.has("handle"); +} diff --git a/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts new file mode 100644 index 00000000000..a0ca335ef2a --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts @@ -0,0 +1,179 @@ +import type { + MollifierBuffer, + MutateSnapshotResult, + SnapshotPatch, +} from "@trigger.dev/redis-worker"; +import type { TaskRun } from "@trigger.dev/database"; +import { prisma, $replica } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; + +// Wait/retry knobs per Q3 design. Exported for tests. +export const DEFAULT_SAFETY_NET_MS = 2_000; +export const DEFAULT_POLL_STEP_MS = 20; +export const DEFAULT_PG_TIMEOUT_MS = 50; + +export type MutateWithFallbackInput = { + runId: string; + environmentId: string; + organizationId: string; + bufferPatch: SnapshotPatch; + // Called when a PG row exists (either replica-hit or post-wait writer-hit). + // Receives the full TaskRun shape and returns the customer-visible body. + pgMutation: (pgRow: TaskRun) => Promise; + // Called when the patch landed cleanly on the buffer snapshot. The + // drainer will see the patched payload on its next pop. + synthesisedResponse: () => TResponse | Promise; + abortSignal?: AbortSignal; + // Override defaults for tests. + safetyNetMs?: number; + pollStepMs?: number; + pgTimeoutMs?: number; + // Test injection. + getBuffer?: () => MollifierBuffer | null; + prismaWriter?: TaskRunReader; + prismaReplica?: TaskRunReader; + sleep?: (ms: number) => Promise; + now?: () => number; +}; + +export type MutateWithFallbackOutcome = + | { kind: "pg"; response: TResponse } + | { kind: "snapshot"; response: TResponse } + | { kind: "not_found" } + | { kind: "timed_out" }; + +// PG-first → buffer mutateSnapshot → wait-and-bounce. Implements the Q3 +// design (`_plans/2026-05-19-mollifier-mutation-race-design.md`). The +// caller decides how to translate the outcome into an HTTP response — +// this helper never throws Response objects so it remains route-agnostic +// and unit-testable in isolation. +export async function mutateWithFallback( + input: MutateWithFallbackInput, +): Promise> { + const replica = input.prismaReplica ?? $replica; + const writer = input.prismaWriter ?? prisma; + const buffer = (input.getBuffer ?? getMollifierBuffer)(); + const sleep = input.sleep ?? defaultSleep; + const now = input.now ?? Date.now; + + // Path 1 — PG is already canonical. + const replicaRow = await findRunInPg(replica, input.runId, input.environmentId); + if (replicaRow) { + const response = await input.pgMutation(replicaRow); + return { kind: "pg", response }; + } + + if (!buffer) { + // No buffer configured (mollifier disabled or boot-time error). PG + // missed; nothing else to consult. + return { kind: "not_found" }; + } + + // Path 2 — buffer snapshot mutation. + const result: MutateSnapshotResult = await buffer.mutateSnapshot( + input.runId, + input.bufferPatch, + ); + + if (result === "applied_to_snapshot") { + return { kind: "snapshot", response: await input.synthesisedResponse() }; + } + + if (result === "not_found") { + // Disambiguate a genuine 404 from a replica-lag miss: ask the writer + // directly. If the row just appeared post-drain we route through the + // PG mutation path. + const writerRow = await findRunInPg(writer, input.runId, input.environmentId); + if (writerRow) { + const response = await input.pgMutation(writerRow); + return { kind: "pg", response }; + } + return { kind: "not_found" }; + } + + // result === "busy" — entry is DRAINING / FAILED / materialised. Wait + // for the drainer to terminate the entry into PG (success or + // SYSTEM_FAILURE) and route through pgMutation. + const safetyNetMs = input.safetyNetMs ?? DEFAULT_SAFETY_NET_MS; + const pollStepMs = input.pollStepMs ?? DEFAULT_POLL_STEP_MS; + const pgTimeoutMs = input.pgTimeoutMs ?? DEFAULT_PG_TIMEOUT_MS; + const deadline = now() + safetyNetMs; + + while (now() < deadline) { + if (input.abortSignal?.aborted) { + return { kind: "timed_out" }; + } + + const row = await findRunInPgWithTimeout( + writer, + input.runId, + input.environmentId, + pgTimeoutMs, + ); + if (row) { + const response = await input.pgMutation(row); + return { kind: "pg", response }; + } + + if (now() >= deadline) break; + await sleep(pollStepMs); + } + + logger.warn("mollifier mutate-with-fallback: drainer resolution timed out", { + runId: input.runId, + safetyNetMs, + }); + return { kind: "timed_out" }; +} + +// Structural reader interface — accepts both the writer (`prisma`) and the +// replica (`$replica`), which differ slightly in their generated Prisma +// types but share the findFirst surface used here. +type TaskRunReader = { + taskRun: { + findFirst(args: { + where: { friendlyId: string; runtimeEnvironmentId: string }; + }): Promise; + }; +}; + +async function findRunInPg( + client: TaskRunReader, + friendlyId: string, + environmentId: string, +): Promise { + return client.taskRun.findFirst({ + where: { friendlyId, runtimeEnvironmentId: environmentId }, + }); +} + +async function findRunInPgWithTimeout( + client: TaskRunReader, + friendlyId: string, + environmentId: string, + timeoutMs: number, +): Promise { + // One slow PG query shouldn't burn the whole safety-net budget. + // Promise.race against a timer; on timeout we treat the poll as a miss + // and the outer loop tries again on the next tick. + const timeoutToken = Symbol("pg-timeout"); + let timeoutHandle: ReturnType | undefined; + const timeoutPromise = new Promise((resolve) => { + timeoutHandle = setTimeout(() => resolve(timeoutToken), timeoutMs); + }); + try { + const winner = await Promise.race([ + findRunInPg(client, friendlyId, environmentId), + timeoutPromise, + ]); + if (winner === timeoutToken) return null; + return winner; + } finally { + if (timeoutHandle) clearTimeout(timeoutHandle); + } +} + +function defaultSleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/apps/webapp/app/v3/mollifier/readFallback.server.ts b/apps/webapp/app/v3/mollifier/readFallback.server.ts index 34a8b48f970..3b2446d3876 100644 --- a/apps/webapp/app/v3/mollifier/readFallback.server.ts +++ b/apps/webapp/app/v3/mollifier/readFallback.server.ts @@ -1,4 +1,8 @@ +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; import { logger } from "~/services/logger.server"; +import { deserialiseMollifierSnapshot } from "./mollifierSnapshot.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; export type ReadFallbackInput = { runId: string; @@ -6,11 +10,203 @@ export type ReadFallbackInput = { organizationId: string; }; +export type SyntheticRun = { + // Snapshot-derived TaskRun primary key. Used by ReplayTaskRunService + // for logging and by callers passing this object where a TaskRun is + // expected (cast). Derived deterministically from `friendlyId`. + id: string; + friendlyId: string; + status: "QUEUED" | "FAILED" | "CANCELED"; + // Set when the customer cancelled the run via the dashboard or API + // while it was buffered. The drainer's cancel bifurcation reads this + // on next pop and writes a CANCELED PG row directly (skipping + // materialisation). Reflected back into the UI by the synthesised + // SpanRun so the run-detail page shows the cancelled state even before + // the drainer materialises it. + cancelledAt: Date | undefined; + cancelReason: string | undefined; + // Reschedule patch (`set_delay`) writes `delayUntil` into the snapshot. + // Surfacing it on SyntheticRun lets the retrieve-run shape reflect the + // pending delay before the drainer materialises the PG row. + delayUntil: Date | undefined; + taskIdentifier: string | undefined; + createdAt: Date; + + payload: unknown; + payloadType: string | undefined; + metadata: unknown; + metadataType: string | undefined; + // Seed-metadata mirrors what `triggerTask.server.ts` writes into the + // snapshot: the original metadataPacket data preserved separately from + // any later customer mutations. ReplayTaskRunService uses these to + // rebuild the replay's metadata. + seedMetadata: string | undefined; + seedMetadataType: string | undefined; + + idempotencyKey: string | undefined; + idempotencyKeyOptions: string[] | undefined; + isTest: boolean; + depth: number; + ttl: string | undefined; + tags: string[]; + // Mirror of `tags` under the PG field name. ReplayTaskRunService reads + // `existingTaskRun.runTags`; both names are kept here so a synthetic + // run can be passed wherever the PG-shape `runTags` is expected. + runTags: string[]; + lockedToVersion: string | undefined; + resumeParentOnCompletion: boolean; + parentTaskRunId: string | undefined; + + // Allocated at gate-accept time and embedded in the snapshot so the run's + // trace is continuous from QUEUED-in-buffer through executing post-drain. + traceId: string | undefined; + spanId: string | undefined; + parentSpanId: string | undefined; + + // Replay-relevant fields populated from the engine-trigger snapshot. + // ReplayTaskRunService reads each of these from the existing TaskRun; + // when the original lives in the buffer we synthesise them here. + runtimeEnvironmentId: string | undefined; + engine: "V2"; + workerQueue: string | undefined; + queue: string | undefined; + concurrencyKey: string | undefined; + machinePreset: string | undefined; + realtimeStreamsVersion: string | undefined; + + // Additional snapshot-sourced fields used when synthesising a SpanRun + // for the dashboard's right-side details panel. All optional because + // older snapshots may not carry them. + maxAttempts: number | undefined; + maxDurationInSeconds: number | undefined; + replayedFromTaskRunFriendlyId: string | undefined; + annotations: unknown; + traceContext: unknown; + scheduleId: string | undefined; + batchId: string | undefined; + parentTaskRunFriendlyId: string | undefined; + rootTaskRunFriendlyId: string | undefined; + + error?: { code: string; message: string }; +}; + +export type ReadFallbackDeps = { + getBuffer?: () => MollifierBuffer | null; +}; + +function asString(value: unknown): string | undefined { + return typeof value === "string" ? value : undefined; +} + +function asStringArray(value: unknown): string[] { + return Array.isArray(value) && value.every((v) => typeof v === "string") ? (value as string[]) : []; +} + export async function findRunByIdWithMollifierFallback( input: ReadFallbackInput, -): Promise { - logger.debug("mollifier read-fallback called (phase 1 stub)", { - runId: input.runId, - }); - return null; + deps: ReadFallbackDeps = {}, +): Promise { + const buffer = (deps.getBuffer ?? getMollifierBuffer)(); + if (!buffer) return null; + + try { + const entry = await buffer.getEntry(input.runId); + if (!entry) return null; + + if (entry.envId !== input.environmentId || entry.orgId !== input.organizationId) { + logger.warn("mollifier read-fallback auth mismatch", { + runId: input.runId, + callerEnvId: input.environmentId, + callerOrgId: input.organizationId, + }); + return null; + } + + const snapshot = deserialiseMollifierSnapshot(entry.payload); + const idempotencyKeyOptionsRaw = snapshot.idempotencyKeyOptions; + const idempotencyKeyOptions = Array.isArray(idempotencyKeyOptionsRaw) + ? asStringArray(idempotencyKeyOptionsRaw) + : undefined; + + const tags = asStringArray(snapshot.tags); + const environment = + snapshot.environment && typeof snapshot.environment === "object" + ? (snapshot.environment as Record) + : undefined; + + const cancelledAtRaw = asString(snapshot.cancelledAt); + const cancelledAt = cancelledAtRaw ? new Date(cancelledAtRaw) : undefined; + const cancelReason = asString(snapshot.cancelReason); + let status: SyntheticRun["status"] = "QUEUED"; + if (cancelledAt) { + status = "CANCELED"; + } else if (entry.status === "FAILED") { + status = "FAILED"; + } + const delayUntilRaw = asString(snapshot.delayUntil); + const delayUntil = delayUntilRaw ? new Date(delayUntilRaw) : undefined; + + return { + id: RunId.fromFriendlyId(entry.runId), + friendlyId: entry.runId, + status, + cancelledAt, + cancelReason, + delayUntil, + taskIdentifier: asString(snapshot.taskIdentifier), + createdAt: entry.createdAt, + + payload: snapshot.payload, + payloadType: asString(snapshot.payloadType), + metadata: snapshot.metadata, + metadataType: asString(snapshot.metadataType), + seedMetadata: asString(snapshot.seedMetadata), + seedMetadataType: asString(snapshot.seedMetadataType), + + idempotencyKey: asString(snapshot.idempotencyKey), + idempotencyKeyOptions, + isTest: snapshot.isTest === true, + depth: typeof snapshot.depth === "number" ? snapshot.depth : 0, + ttl: asString(snapshot.ttl), + tags, + runTags: tags, + lockedToVersion: asString(snapshot.lockToVersion), + resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true, + parentTaskRunId: asString(snapshot.parentTaskRunId), + + traceId: asString(snapshot.traceId), + spanId: asString(snapshot.spanId), + parentSpanId: asString(snapshot.parentSpanId), + + runtimeEnvironmentId: + asString(environment?.id) ?? entry.envId, + engine: "V2", + workerQueue: asString(snapshot.workerQueue), + queue: asString(snapshot.queue), + concurrencyKey: asString(snapshot.concurrencyKey), + machinePreset: asString(snapshot.machine), + realtimeStreamsVersion: asString(snapshot.realtimeStreamsVersion), + + maxAttempts: typeof snapshot.maxAttempts === "number" ? snapshot.maxAttempts : undefined, + maxDurationInSeconds: + typeof snapshot.maxDurationInSeconds === "number" + ? snapshot.maxDurationInSeconds + : undefined, + replayedFromTaskRunFriendlyId: asString(snapshot.replayedFromTaskRunFriendlyId), + annotations: snapshot.annotations, + traceContext: snapshot.traceContext, + scheduleId: asString(snapshot.scheduleId), + batchId: asString(snapshot.batchId), + parentTaskRunFriendlyId: asString(snapshot.parentTaskRunFriendlyId), + rootTaskRunFriendlyId: asString(snapshot.rootTaskRunFriendlyId), + + error: entry.lastError, + }; + } catch (err) { + logger.error("mollifier read-fallback errored — fail-open to null", { + runId: input.runId, + err: err instanceof Error ? err.message : String(err), + }); + return null; + } } diff --git a/apps/webapp/app/v3/mollifier/realtimeRunResource.server.ts b/apps/webapp/app/v3/mollifier/realtimeRunResource.server.ts new file mode 100644 index 00000000000..0a84f984530 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/realtimeRunResource.server.ts @@ -0,0 +1,57 @@ +import type { SyntheticRun } from "./readFallback.server"; + +// Shape `realtime.v1.runs.$runId.ts`'s findResource hands to the route's +// authorization callback + loader body. The PG-resident case is the +// canonical shape (a TaskRun row with the batch join); the buffered +// case below mirrors it from the synthetic run. +export type RealtimeRunResource = { + id: string; + friendlyId: string; + taskIdentifier: string; + runTags: string[]; + batch: { friendlyId: string } | null; + // Present only when this resource was resolved from the mollifier + // buffer (no PG row yet). Stamped at resolve time so the loader body + // can emit observability for buffered-window subscriptions. The flag + // doubles as the discriminant — PG-sourced resources never carry it. + __bufferedDwellMs?: number; +}; + +export type RealtimeRunResourcePgRun = { + id: string; + friendlyId: string; + taskIdentifier: string; + runTags: string[]; + batch: { friendlyId: string } | null; +}; + +// Given the results of the PG and buffer lookups, produce the resource +// shape the realtime route returns from findResource. PG-first: if the +// run is PG-resident, return it unchanged (the buffered fallback only +// fires when no PG row exists yet). When only the buffer has the run, +// synthesise a matching shape whose `id` is the deterministic value +// engine.trigger will write when the drainer materialises this run — +// this is what lets the Electric subscription's `WHERE id=` match +// the eventual INSERT. +export function resolveRealtimeRunResource(input: { + pgRun: RealtimeRunResourcePgRun | null; + bufferedSynthetic: Pick< + SyntheticRun, + "id" | "friendlyId" | "taskIdentifier" | "runTags" | "createdAt" + > | null; + now?: () => number; +}): RealtimeRunResource | null { + if (input.pgRun) return input.pgRun; + if (input.bufferedSynthetic) { + const now = (input.now ?? Date.now)(); + return { + id: input.bufferedSynthetic.id, + friendlyId: input.bufferedSynthetic.friendlyId, + taskIdentifier: input.bufferedSynthetic.taskIdentifier ?? "", + runTags: input.bufferedSynthetic.runTags, + batch: null, + __bufferedDwellMs: now - input.bufferedSynthetic.createdAt.getTime(), + }; + } + return null; +} diff --git a/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts b/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts new file mode 100644 index 00000000000..2808fbe9b29 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts @@ -0,0 +1,58 @@ +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { $replica as defaultReplica } from "~/db.server"; +import { getMollifierBuffer as defaultGetBuffer } from "./mollifierBuffer.server"; + +// Discriminated-union resolver used by mutation routes' `findResource`. +// The route builder treats a null return from `findResource` as a 404 +// BEFORE the action handler runs (`apiBuilder.server.ts:321`), so we +// must check BOTH the PG canonical store and the mollifier buffer here +// — otherwise a buffered run can't be cancelled / mutated even though +// the underlying mutateWithFallback flow would handle it correctly. +// +// (Regression: before extracting this helper the cancel route had +// `findResource: async () => null`, which made every cancel 404 before +// the action ran. The helper makes the lookup unit-testable.) +export type ResolvedRunForMutation = + | { source: "pg"; friendlyId: string } + | { source: "buffer"; friendlyId: string }; + +export type ResolveRunForMutationDeps = { + prismaReplica?: { + taskRun: { + findFirst(args: { + where: { friendlyId: string; runtimeEnvironmentId: string }; + select: { friendlyId: true }; + }): Promise<{ friendlyId: string } | null>; + }; + }; + getBuffer?: () => MollifierBuffer | null; +}; + +export async function resolveRunForMutation(input: { + runParam: string; + environmentId: string; + organizationId: string; + deps?: ResolveRunForMutationDeps; +}): Promise { + const replica = input.deps?.prismaReplica ?? defaultReplica; + const getBuffer = input.deps?.getBuffer ?? defaultGetBuffer; + + const pgRun = await replica.taskRun.findFirst({ + where: { friendlyId: input.runParam, runtimeEnvironmentId: input.environmentId }, + select: { friendlyId: true }, + }); + if (pgRun) return { source: "pg", friendlyId: pgRun.friendlyId }; + + const buffer = getBuffer(); + if (!buffer) return null; + + const entry = await buffer.getEntry(input.runParam); + if ( + entry && + entry.envId === input.environmentId && + entry.orgId === input.organizationId + ) { + return { source: "buffer", friendlyId: input.runParam }; + } + return null; +} diff --git a/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts b/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts new file mode 100644 index 00000000000..a4986235a55 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts @@ -0,0 +1,92 @@ +import { deserialiseSnapshot, type MollifierBuffer } from "@trigger.dev/redis-worker"; +import type { PrismaClientOrTransaction } from "@trigger.dev/database"; +import { prisma } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; + +export type BufferedRunRedirectInfo = { + organizationSlug: string; + projectSlug: string; + environmentSlug: string; + spanId: string | undefined; +}; + +export type FindBufferedRunRedirectInfoDeps = { + getBuffer?: () => MollifierBuffer | null; + prismaClient?: PrismaClientOrTransaction; +}; + +// Resolve the org/project/env slugs needed to build the canonical run-detail +// URL for a buffered run. Used by the short-URL redirect routes +// (`runs.$runParam`, `@.runs.$runParam`, `projects.v3.$projectRef.runs.$runParam`) +// so a customer clicking the trigger-API-returned run link doesn't 404 +// during the buffered window. +// +// Authorisation: PG query confirms the requesting user belongs to the +// organisation the buffer entry says owns the run. Without this check a +// known runId would leak slugs. +export async function findBufferedRunRedirectInfo( + args: { + runFriendlyId: string; + userId: string; + // Admin impersonation paths bypass org-membership; mirrors the existing + // PG-side admin route behaviour (`@.runs.$runParam` doesn't filter by + // org membership in the PG query either). + skipOrgMembershipCheck?: boolean; + }, + deps: FindBufferedRunRedirectInfoDeps = {}, +): Promise { + const buffer = (deps.getBuffer ?? getMollifierBuffer)(); + const prismaClient = deps.prismaClient ?? prisma; + if (!buffer) return null; + + let entry; + try { + entry = await buffer.getEntry(args.runFriendlyId); + } catch (err) { + logger.warn("buffered redirect: buffer.getEntry failed", { + runFriendlyId: args.runFriendlyId, + err: err instanceof Error ? err.message : String(err), + }); + return null; + } + if (!entry) return null; + + if (!args.skipOrgMembershipCheck) { + const member = await prismaClient.orgMember.findFirst({ + where: { userId: args.userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) return null; + } + + let snapshot: Record; + try { + snapshot = deserialiseSnapshot(entry.payload) as Record; + } catch (err) { + logger.warn("buffered redirect: snapshot deserialise failed", { + runFriendlyId: args.runFriendlyId, + err: err instanceof Error ? err.message : String(err), + }); + return null; + } + + const environment = snapshot.environment as Record | undefined; + if (!environment || typeof environment !== "object") return null; + const project = environment.project as Record | undefined; + const organization = environment.organization as Record | undefined; + + const envSlug = environment.slug; + const projectSlug = project?.slug; + const orgSlug = organization?.slug; + if (typeof envSlug !== "string" || typeof projectSlug !== "string" || typeof orgSlug !== "string") { + return null; + } + + return { + organizationSlug: orgSlug, + projectSlug, + environmentSlug: envSlug, + spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined, + }; +} diff --git a/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts b/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts new file mode 100644 index 00000000000..e502d5b3bf7 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts @@ -0,0 +1,154 @@ +import { prettyPrintPacket, RunAnnotations } from "@trigger.dev/core/v3"; +import { getMaxDuration } from "@trigger.dev/core/v3/isomorphic"; +import { + extractIdempotencyKeyScope, + getUserProvidedIdempotencyKey, +} from "@trigger.dev/core/v3/serverOnly"; +import type { SpanRun } from "~/presenters/v3/SpanPresenter.server"; +import type { SyntheticRun } from "./readFallback.server"; + +// Synthesise a SpanRun-shaped object from a buffered run so the run-detail +// page's right-side details panel renders identically to a PG-resident +// run. The shape matches `SpanPresenter.getRun`'s return value exactly; +// buffered-irrelevant fields (output, error, attempts, schedule, session, +// region, batch) are filled with sensible defaults. +// +// Pretty-printing for payload and metadata mirrors SpanPresenter so the +// UI receives data in the same shape. Buffered runs cannot use the +// `application/store` packet path (no R2 object yet) so we treat raw +// snapshot fields as inline packets. +export async function buildSyntheticSpanRun(args: { + run: SyntheticRun; + environment: { id: string; slug: string; type: "PRODUCTION" | "DEVELOPMENT" | "STAGING" | "PREVIEW" }; +}): Promise { + const { run, environment } = args; + + const payload = + typeof run.payload !== "undefined" && run.payload !== null + ? await prettyPrintPacket(run.payload, run.payloadType ?? undefined) + : undefined; + + const metadata = run.metadata + ? await prettyPrintPacket(run.metadata, run.metadataType, { + filteredKeys: ["$$streams", "$$streamsVersion", "$$streamsBaseUrl"], + }) + : undefined; + + const idempotencyShape = { + idempotencyKey: run.idempotencyKey ?? null, + idempotencyKeyExpiresAt: null, + idempotencyKeyOptions: run.idempotencyKeyOptions ?? null, + }; + + const idempotencyKey = getUserProvidedIdempotencyKey(idempotencyShape); + const idempotencyKeyScope = extractIdempotencyKeyScope(idempotencyShape); + const idempotencyKeyStatus: SpanRun["idempotencyKeyStatus"] = idempotencyKey + ? "active" + : idempotencyKeyScope + ? "inactive" + : undefined; + + const taskKind = RunAnnotations.safeParse(run.annotations).data?.taskKind; + const isAgentRun = taskKind === "AGENT"; + + const queueName = run.queue ?? "task/"; + const isCancelled = run.status === "CANCELED"; + return { + id: run.id, + friendlyId: run.friendlyId, + status: isCancelled ? "CANCELED" : "PENDING", + statusReason: isCancelled ? run.cancelReason ?? undefined : undefined, + createdAt: run.createdAt, + startedAt: null, + executedAt: null, + updatedAt: run.cancelledAt ?? run.createdAt, + delayUntil: run.delayUntil ?? null, + expiredAt: null, + completedAt: run.cancelledAt ?? null, + logsDeletedAt: null, + ttl: run.ttl ?? null, + taskIdentifier: run.taskIdentifier ?? "", + version: undefined, + sdkVersion: undefined, + runtime: undefined, + runtimeVersion: undefined, + isTest: run.isTest, + replayedFromTaskRunFriendlyId: run.replayedFromTaskRunFriendlyId ?? null, + environmentId: environment.id, + idempotencyKey, + idempotencyKeyExpiresAt: null, + idempotencyKeyScope, + idempotencyKeyStatus, + debounce: null, + schedule: undefined, + queue: { + name: queueName, + isCustomQueue: !queueName.startsWith("task/"), + concurrencyKey: run.concurrencyKey ?? null, + }, + tags: run.runTags, + baseCostInCents: 0, + costInCents: 0, + totalCostInCents: 0, + usageDurationMs: 0, + isFinished: false, + isRunning: false, + isError: false, + isAgentRun, + payload, + payloadType: run.payloadType ?? "application/json", + output: undefined, + outputType: "application/json", + error: undefined, + relationships: { + root: run.rootTaskRunFriendlyId + ? { + friendlyId: run.rootTaskRunFriendlyId, + spanId: "", + taskIdentifier: "", + createdAt: run.createdAt, + isParent: run.parentTaskRunFriendlyId === run.rootTaskRunFriendlyId, + } + : undefined, + parent: run.parentTaskRunFriendlyId + ? { + friendlyId: run.parentTaskRunFriendlyId, + spanId: "", + taskIdentifier: "", + } + : undefined, + }, + context: JSON.stringify( + { + task: { + id: run.taskIdentifier ?? "", + }, + run: { + id: run.friendlyId, + createdAt: run.createdAt, + isTest: run.isTest, + }, + environment: { + id: environment.id, + slug: environment.slug, + type: environment.type, + }, + }, + null, + 2, + ), + metadata, + maxDurationInSeconds: getMaxDuration(run.maxDurationInSeconds), + batch: undefined, + session: undefined, + engine: "V2", + region: null, + workerQueue: run.workerQueue ?? "", + traceId: run.traceId ?? "", + spanId: run.spanId ?? "", + isCached: false, + machinePreset: run.machinePreset, + taskEventStore: "taskEvent", + externalTraceId: undefined, + }; +} diff --git a/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts b/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts new file mode 100644 index 00000000000..acde2ccee9c --- /dev/null +++ b/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts @@ -0,0 +1,66 @@ +import { millisecondsToNanoseconds } from "@trigger.dev/core/v3"; +import { createTreeFromFlatItems, flattenTree } from "~/components/primitives/TreeView/TreeView"; +import { createTimelineSpanEventsFromSpanEvents } from "~/utils/timelineSpanEvents"; +import type { SpanSummary } from "~/v3/eventRepository/eventRepository.types"; +import type { SyntheticRun } from "./readFallback.server"; + +// Build a single-span trace for a buffered run so the run-detail page +// renders a meaningful timeline before the drainer materialises the +// row. Mirrors the shape produced by `RunPresenter` when its trace +// store lookup returns no spans, so the dashboard consumer treats the +// buffered run identically to a freshly enqueued PG run that hasn't +// emitted any events yet. +export function buildSyntheticTraceForBufferedRun(run: SyntheticRun) { + const spanId = run.spanId ?? ""; + const isCancelled = run.status === "CANCELED"; + const span: SpanSummary = { + id: spanId, + parentId: run.parentSpanId, + runId: run.friendlyId, + data: { + message: run.taskIdentifier ?? "Task", + style: { icon: "task", variant: "primary" }, + events: [], + startTime: run.createdAt, + duration: 0, + isError: false, + isPartial: !isCancelled, + isCancelled, + isDebug: false, + level: "TRACE", + }, + }; + + const tree = createTreeFromFlatItems([span], spanId); + const treeRootStartTimeMs = tree?.data.startTime.getTime() ?? 0; + const totalDuration = Math.max(tree?.data.duration ?? 0, millisecondsToNanoseconds(1)); + + const events = tree + ? flattenTree(tree).map((n) => { + const offset = millisecondsToNanoseconds( + n.data.startTime.getTime() - treeRootStartTimeMs + ); + return { + ...n, + data: { + ...n.data, + timelineEvents: createTimelineSpanEventsFromSpanEvents(n.data.events, false, treeRootStartTimeMs), + duration: n.data.isPartial ? null : n.data.duration, + offset, + isRoot: n.id === spanId, + }, + }; + }) + : []; + + return { + rootSpanStatus: (isCancelled ? "completed" : "executing") as "executing" | "completed" | "failed", + events, + duration: totalDuration, + rootStartedAt: tree?.data.startTime, + startedAt: null, + queuedDuration: undefined, + overridesBySpanId: undefined, + linkedRunIdBySpanId: {} as Record, + }; +} diff --git a/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts new file mode 100644 index 00000000000..5325018baf1 --- /dev/null +++ b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts @@ -0,0 +1,47 @@ +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { signalsEmitter } from "~/services/signals.server"; +import { + startStaleSweepInterval, + type StaleSweepIntervalHandle, +} from "./mollifier/mollifierStaleSweep.server"; + +declare global { + // eslint-disable-next-line no-var + var __mollifierStaleSweepRegistered__: boolean | undefined; + // eslint-disable-next-line no-var + var __mollifierStaleSweepHandle__: StaleSweepIntervalHandle | undefined; +} + +/** + * Bootstraps the mollifier stale-entry sweep. + * + * Independent of the drainer — its purpose is to alert when entries are + * piling up despite the drainer being supposedly healthy, so it runs + * any time the mollifier itself is enabled (gated separately from + * `TRIGGER_MOLLIFIER_DRAINER_ENABLED`). The sweep is read-only: it + * counts and logs stale entries but does not remove or salvage them. + * + * The Remix dev server re-evaluates `entry.server.tsx` on every change, + * so the registration guard + handle cache make the bootstrap + * idempotent across hot reloads. + */ +export function initMollifierStaleSweepWorker(): void { + if (env.TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED !== "1") return; + if (global.__mollifierStaleSweepRegistered__) return; + + logger.debug("Initializing mollifier stale-entry sweep", { + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }); + + const handle = startStaleSweepInterval({ + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }); + + signalsEmitter.on("SIGTERM", handle.stop); + signalsEmitter.on("SIGINT", handle.stop); + global.__mollifierStaleSweepRegistered__ = true; + global.__mollifierStaleSweepHandle__ = handle; +} diff --git a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts index 95684999303..2442b24a805 100644 --- a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts +++ b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts @@ -1,6 +1,7 @@ import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { BaseService, ServiceValidationError } from "./baseService.server"; import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; export class ResetIdempotencyKeyService extends BaseService { public async call( @@ -8,7 +9,7 @@ export class ResetIdempotencyKeyService extends BaseService { taskIdentifier: string, authenticatedEnv: AuthenticatedEnvironment ): Promise<{ id: string }> { - const { count } = await this._prisma.taskRun.updateMany({ + const { count: pgCount } = await this._prisma.taskRun.updateMany({ where: { idempotencyKey, taskIdentifier, @@ -20,7 +21,33 @@ export class ResetIdempotencyKeyService extends BaseService { }, }); - if (count === 0) { + // Buffer-side reset (Q5): the key may belong to a buffered run that + // hasn't materialised yet. The PG updateMany above can't see it. + // resetIdempotency clears both the snapshot fields and the Redis + // lookup atomically. Returns null when nothing was bound there. + const buffer = getMollifierBuffer(); + const bufferResult = buffer + ? await buffer + .resetIdempotency({ + envId: authenticatedEnv.id, + taskIdentifier, + idempotencyKey, + }) + .catch((err) => { + // Buffer outage shouldn't 500 the reset endpoint if PG + // already cleared something. Log and treat as a miss. + logger.error("ResetIdempotencyKeyService: buffer reset failed", { + idempotencyKey, + taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + return { clearedRunId: null }; + }) + : { clearedRunId: null }; + + const totalCount = pgCount + (bufferResult.clearedRunId ? 1 : 0); + + if (totalCount === 0) { throw new ServiceValidationError( `No runs found with idempotency key: ${idempotencyKey} and task: ${taskIdentifier}`, 404 @@ -28,7 +55,7 @@ export class ResetIdempotencyKeyService extends BaseService { } logger.info( - `Reset idempotency key: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${count} run(s)` + `Reset idempotency key: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${totalCount} run(s) (pg=${pgCount}, buffered=${bufferResult.clearedRunId ? 1 : 0})` ); return { id: idempotencyKey }; diff --git a/apps/webapp/seed.mts b/apps/webapp/seed.mts index 9eb30cd2503..7f364595f98 100644 --- a/apps/webapp/seed.mts +++ b/apps/webapp/seed.mts @@ -67,11 +67,35 @@ async function seed() { name: "realtime-streams", externalRef: "proj_klxlzjnzxmbgiwuuwhvb", }, + { + name: "stress-tasks", + externalRef: "proj_stresstaskslocaldevx", + // Stress-tasks fan-outs need a much higher concurrency ceiling than the + // default 300 — at 1000+ children per parent, runs would otherwise queue + // and the local repro wouldn't track the production fan-out signature. + environmentConcurrencyLimit: 25000, + }, ]; // Create or find each project for (const projectConfig of referenceProjects) { - await findOrCreateProject(projectConfig.name, organization, user.id, projectConfig.externalRef); + const result = await findOrCreateProject( + projectConfig.name, + organization, + user.id, + projectConfig.externalRef, + ); + + if (projectConfig.environmentConcurrencyLimit) { + const updated = await prisma.runtimeEnvironment.updateMany({ + where: { projectId: result.project.id }, + data: { maximumConcurrencyLimit: projectConfig.environmentConcurrencyLimit }, + }); + console.log( + ` Updated ${updated.count} environment(s) on ${projectConfig.name} ` + + `to maximumConcurrencyLimit=${projectConfig.environmentConcurrencyLimit}`, + ); + } } await createBatchLimitOrgs(user); diff --git a/apps/webapp/test/engine/triggerTask.test.ts b/apps/webapp/test/engine/triggerTask.test.ts index d07909d2907..9052f3b789f 100644 --- a/apps/webapp/test/engine/triggerTask.test.ts +++ b/apps/webapp/test/engine/triggerTask.test.ts @@ -68,17 +68,31 @@ class MockTriggerTaskValidator implements TriggerTaskValidator { } } +// Mirror the production ClickhouseEventRepository.traceEvent shape so +// callers that read `event.traceContext.traceparent` (e.g. the +// mollifier branch seeding the snapshot) get the same W3C-formatted +// value they'd get against a real event repository. +const MOCK_TRACE_ID = "0123456789abcdef0123456789abcdef"; +const MOCK_SPAN_ID = "fedcba9876543210"; +const MOCK_TRACEPARENT = `00-${MOCK_TRACE_ID}-${MOCK_SPAN_ID}-01`; + class MockTraceEventConcern implements TraceEventConcern { + // Records the start time of the most recent traceRun callback entry. + // Used by ordering assertions that verify traceRun fires before + // downstream side effects (e.g. mollifier buffer writes). + public traceRunEnteredAt: number | undefined; + async traceRun( request: TriggerTaskRequest, parentStore: string | undefined, callback: (span: TracedEventSpan, store: string) => Promise ): Promise { + this.traceRunEnteredAt = Date.now(); return await callback( { - traceId: "test", - spanId: "test", - traceContext: {}, + traceId: MOCK_TRACE_ID, + spanId: MOCK_SPAN_ID, + traceContext: { traceparent: MOCK_TRACEPARENT }, traceparent: undefined, setAttribute: () => { }, failWithError: () => { }, @@ -1269,8 +1283,17 @@ describe("RunEngineTriggerTaskService", () => { ); containerTest( - "mollifier · mollify action triggers dual-write (buffer.accept + engine.trigger)", + "mollifier · mollify action writes to buffer and returns synthetic result (no Postgres row)", async ({ prisma, redisOptions }) => { + // Phase 3 semantics: when the gate decides mollify, the call site + // invokes `mollifyTrigger` which writes the engine.trigger snapshot + // to the buffer and returns a synthesised `MollifySyntheticResult` + // (run.friendlyId + notice + isCached:false). `engine.trigger` is + // NEVER invoked on this path — the run materialises in Postgres + // later, when the drainer replays the snapshot. The replay is + // covered by `mollifierDrainerHandler.test.ts`; this test pins the + // call-site integration: synthetic result + buffer write + no + // Postgres side effect. const engine = new RunEngine({ prisma, worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, @@ -1288,7 +1311,24 @@ describe("RunEngineTriggerTaskService", () => { const taskIdentifier = "test-task"; await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); - const buffer = new CapturingMollifierBuffer(); + // Buffer override records the time of the accept call so we can + // assert that traceRun fired strictly before the buffer was + // touched. If a future change re-introduces the "skip traceRun on + // mollify" shortcut, traceConcern.traceRunEnteredAt stays + // undefined and the ordering assertion fails. + class TimestampedBuffer extends CapturingMollifierBuffer { + public acceptedAt: number | undefined; + override async accept(input: { + runId: string; + envId: string; + orgId: string; + payload: string; + }) { + this.acceptedAt = Date.now(); + return await super.accept(input); + } + } + const buffer = new TimestampedBuffer(); const trippedDecision = { divert: true as const, reason: "per_env_rate" as const, @@ -1297,6 +1337,7 @@ describe("RunEngineTriggerTaskService", () => { windowMs: 200, holdMs: 500, }; + const traceConcern = new MockTraceEventConcern(); const triggerTaskService = new RunEngineTriggerTaskService({ engine, @@ -1305,7 +1346,7 @@ describe("RunEngineTriggerTaskService", () => { queueConcern: new DefaultQueueManager(prisma, engine), idempotencyKeyConcern: new IdempotencyKeyConcern(prisma, engine, new MockTraceEventConcern()), validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), + traceEventConcern: traceConcern, tracer: trace.getTracer("test", "0.0.0"), metadataMaximumSize: 1024 * 1024, evaluateGate: async () => ({ action: "mollify", decision: trippedDecision }), @@ -1319,25 +1360,81 @@ describe("RunEngineTriggerTaskService", () => { body: { payload: { hello: "world" } }, }); - // engine.trigger ran — Postgres has the run + // Pre-modifier span creation: traceRun must run BEFORE the buffer + // is touched. Customer-visible effect — the run span lands in + // ClickHouse from the moment the trigger returns, even when the + // drainer is offline, so buffered runs are visible in the trace + // view immediately rather than only after drain. + expect(traceConcern.traceRunEnteredAt).toBeDefined(); + expect(buffer.acceptedAt).toBeDefined(); + expect(traceConcern.traceRunEnteredAt!).toBeLessThanOrEqual(buffer.acceptedAt!); + + // Synthetic result is returned with the `mollifier.queued` notice + // (the call-site casts the synthetic shape to `TriggerTaskServiceResult`; + // at runtime the `notice` and `isCached: false` fields are present + // and read by the api.v1.tasks.$taskId.trigger.ts route handler). expect(result).toBeDefined(); expect(result?.run.friendlyId).toBeDefined(); - const pgRun = await prisma.taskRun.findFirst({ where: { id: result!.run.id } }); - expect(pgRun).not.toBeNull(); - expect(pgRun!.friendlyId).toBe(result!.run.friendlyId); - - // buffer.accept ran — Redis has the audit copy under the same friendlyId + const synthetic = result as unknown as { + run: { friendlyId: string }; + isCached: false; + notice: { code: string; message: string; docs: string }; + }; + expect(synthetic.isCached).toBe(false); + expect(synthetic.notice.code).toBe("mollifier.queued"); + expect(synthetic.notice.message).toBeTypeOf("string"); + expect(synthetic.notice.docs).toBeTypeOf("string"); + + // buffer.accept ran — Redis has the canonical engine.trigger snapshot + // under the synthesised friendlyId. The drainer will read this and + // replay it through engine.trigger to materialise the run. expect(buffer.accepted).toHaveLength(1); expect(buffer.accepted[0]!.runId).toBe(result!.run.friendlyId); expect(buffer.accepted[0]!.envId).toBe(authenticatedEnvironment.id); expect(buffer.accepted[0]!.orgId).toBe(authenticatedEnvironment.organizationId); + // Payload is a JSON-serialised MollifierSnapshot (the engine.trigger + // input). Schema is internal to the engine, so we only assert that + // it parses and references the friendlyId — anything more specific + // would couple the mollifier-layer test to engine-layer fields. + const snapshot = JSON.parse(buffer.accepted[0]!.payload) as { + traceId?: string; + spanId?: string; + traceContext?: { traceparent?: string }; + }; - // payload is the canonical replay shape - const payload = JSON.parse(buffer.accepted[0]!.payload); - expect(payload.runFriendlyId).toBe(result!.run.friendlyId); - expect(payload.taskId).toBe(taskIdentifier); - expect(payload.envId).toBe(authenticatedEnvironment.id); - expect(payload.body).toEqual({ payload: { hello: "world" } }); + // Regression guard for the dashboard trace-tree bug: the mollifier + // snapshot MUST carry a W3C `traceparent` in `traceContext`, + // seeded from the same span traceRun opened. Without it, the + // drainer replays through engine.trigger with empty traceContext + // and every downstream `recordRunDebugLog` + // (QUEUED/EXECUTING/FINISHED/run:notify…) gets a fresh traceId + + // null parentId — the run-detail page can only show the root + // span. Both the mollify and pass-through paths now flow through + // `traceEventConcern.traceRun`; this assertion pins the + // seeding-from-the-run-span contract. + expect(snapshot.traceContext?.traceparent).toMatch( + /^00-[0-9a-f]{32}-[0-9a-f]{16}-[0-9a-f]{2}$/ + ); + expect(snapshot.traceContext!.traceparent).toContain(snapshot.traceId); + expect(snapshot.traceContext!.traceparent).toContain(snapshot.spanId); + // The snapshot inherits the *run span's* traceId/spanId (from the + // event handed in by traceRun), not a separately-generated OTel + // span. This is what lets the drainer's `mollifier.drained` span + // and downstream engine.trigger materialisation parent on the + // same ClickHouse trace the customer sees from the moment trigger + // returns. + expect(snapshot.traceId).toBe(MOCK_TRACE_ID); + expect(snapshot.spanId).toBe(MOCK_SPAN_ID); + + // Postgres has NOT been written: engine.trigger was never called on + // the mollify path. The run materialises only when the drainer + // replays the snapshot. Regression intent: if a future change makes + // the mollify branch fall through to engine.trigger (re-introducing + // phase-1 dual-write), this assertion fails loudly. + const pgRun = await prisma.taskRun.findFirst({ + where: { friendlyId: result!.run.friendlyId }, + }); + expect(pgRun).toBeNull(); await engine.quit(); }, @@ -1398,108 +1495,6 @@ describe("RunEngineTriggerTaskService", () => { }, ); - containerTest( - "mollifier · engine.trigger throwing AFTER buffer.accept leaves an orphan entry (documented behaviour)", - async ({ prisma, redisOptions }) => { - // SCENARIO: dual-write where buffer.accept succeeds but engine.trigger - // throws. The throw propagates to the caller (correct: customer sees - // the same 4xx as today), and the buffer entry remains as an "orphan" - // — Phase 1's no-op drainer will pop+ack it on its next poll, so the - // orphan is bounded (~drainer pollIntervalMs) but observable in the - // audit trail (mollifier.buffered with no matching TaskRun). - // - // Why engine.trigger can throw post-buffer: - // - RunDuplicateIdempotencyKeyError (Prisma P2002 on idempotencyKey): - // a concurrent non-mollified trigger with the same idempotencyKey - // wins the DB UNIQUE constraint between IdempotencyKeyConcern's - // pre-check and engine.trigger's INSERT. - // - RunOneTimeUseTokenError (Prisma P2002 on oneTimeUseToken). - // - Transient Prisma errors (FK constraint, connection drop, etc.). - // - // Why we don't "fix" this race in Phase 1: - // The customer correctly gets the error. State eventually converges - // (drainer pops the orphan). The audit-trail explicitly surfaces - // "buffered without TaskRun" entries to operators. A real fix is - // Phase 2's responsibility once the buffer becomes the primary write - // — at that point we add the mollifier-specific idempotency index. - // - // This test pins the current ordering: buffer.accept fires synchronously - // BEFORE engine.trigger, and engine.trigger failure does NOT roll back - // the buffer write. Any future change that reverses the order or adds - // a silent rollback will fail this assertion and force a design - // decision rather than a silent behaviour change. - - const engine = new RunEngine({ - prisma, - worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, - queue: { redis: redisOptions }, - runLock: { redis: redisOptions }, - machines: { - defaultMachine: "small-1x", - machines: { "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 } }, - baseCostInCents: 0.0005, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const taskIdentifier = "test-task"; - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); - - const buffer = new CapturingMollifierBuffer(); - - // Force engine.trigger to throw on this single call. We spy AFTER - // setupBackgroundWorker so the worker setup still uses the real - // engine.trigger (which has its own engine.trigger-ish calls for - // worker bootstrap — though in practice setupBackgroundWorker doesn't - // call trigger). - const simulatedFailure = new Error("simulated engine.trigger failure post-buffer"); - vi.spyOn(engine, "trigger").mockRejectedValueOnce(simulatedFailure); - - const triggerTaskService = new RunEngineTriggerTaskService({ - engine, - prisma, - payloadProcessor: new MockPayloadProcessor(), - queueConcern: new DefaultQueueManager(prisma, engine), - idempotencyKeyConcern: new IdempotencyKeyConcern(prisma, engine, new MockTraceEventConcern()), - validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), - tracer: trace.getTracer("test", "0.0.0"), - metadataMaximumSize: 1024 * 1024, - evaluateGate: async () => ({ - action: "mollify", - decision: { - divert: true, - reason: "per_env_rate", - count: 150, - threshold: 100, - windowMs: 200, - holdMs: 500, - }, - }), - getMollifierBuffer: () => buffer as never, - isMollifierGloballyEnabled: () => true, - }); - - await expect( - triggerTaskService.call({ - taskId: taskIdentifier, - environment: authenticatedEnvironment, - body: { payload: { test: "x" } }, - }), - ).rejects.toThrow(/simulated engine.trigger failure post-buffer/); - - // The buffer write happened BEFORE engine.trigger threw. The orphan - // remains; the audit-trail will surface it (mollifier.buffered with - // no matching TaskRun row). Phase 1's no-op drainer cleans it up. - expect(buffer.accepted).toHaveLength(1); - const orphanPayload = JSON.parse(buffer.accepted[0]!.payload); - expect(orphanPayload.taskId).toBe(taskIdentifier); - - await engine.quit(); - }, - ); - containerTest( "mollifier · idempotency-key match short-circuits BEFORE the gate is consulted", async ({ prisma, redisOptions }) => { @@ -1607,143 +1602,6 @@ describe("RunEngineTriggerTaskService", () => { }, ); - containerTest( - "mollifier · debounce match produces an orphan buffer entry (documented behaviour)", - async ({ prisma, redisOptions }) => { - // SCENARIO: a trigger with a debounce key arrives while a matching - // debounced run already exists. `debounceSystem.handleDebounce` runs - // INSIDE `engine.trigger` (line ~514 of run-engine/src/engine/index.ts), - // AFTER buffer.accept has already written the new friendlyId. The - // service correctly returns the existing run id to the customer, but - // the buffer is left with an orphan entry for the new friendlyId. - // - // Why this is acceptable in Phase 1: - // - Customer-facing behaviour is unchanged from today: they receive - // the existing run id, same as the non-mollified path. - // - The orphan is bounded — the drainer's no-op-ack handler pops - // and acks it on its next poll. - // - The audit-trail surfaces it: a `mollifier.buffered` log line - // with `runId` that has no matching TaskRun in Postgres. - // - // Why Phase 2 cares: - // - When the buffer becomes the primary write path, debounce can - // no longer be allowed to run AFTER buffer.accept. The drainer's - // engine.trigger replay would observe "existing" and skip the - // persist — the customer's synthesised 200 (with the new - // friendlyId) would never get a TaskRun, and the audit-trail - // divergence becomes a real data-loss bug. - // - Phase 2 must lift `handleDebounce` into the call site BEFORE - // buffer.accept: - // 1. handleDebounce → if existing, return existing run; do NOT - // touch the buffer. - // 2. Otherwise, accept with `claimId` threaded into the - // canonical payload so the drainer's replay can - // `registerDebouncedRun` after persisting. - // - // This test pins the current ordering. A future change that "fixes" - // it by lifting handleDebounce upfront will fail the orphan - // assertion below and force an explicit choice (update the test, - // remove this scenario, or stage the lift behind a flag). - - const engine = new RunEngine({ - prisma, - worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, - queue: { redis: redisOptions }, - runLock: { redis: redisOptions }, - machines: { - defaultMachine: "small-1x", - machines: { "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 } }, - baseCostInCents: 0.0005, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const taskIdentifier = "test-task"; - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); - - const idempotencyKeyConcern = new IdempotencyKeyConcern( - prisma, - engine, - new MockTraceEventConcern(), - ); - - // Setup: trigger with debounce — creates the existing run + Redis claim. - const baseline = new RunEngineTriggerTaskService({ - engine, - prisma, - payloadProcessor: new MockPayloadProcessor(), - queueConcern: new DefaultQueueManager(prisma, engine), - idempotencyKeyConcern, - validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), - tracer: trace.getTracer("test", "0.0.0"), - metadataMaximumSize: 1024 * 1024, - }); - const first = await baseline.call({ - taskId: taskIdentifier, - environment: authenticatedEnvironment, - body: { - payload: { test: "x" }, - options: { debounce: { key: "regression-debounce-6", delay: "30s" } }, - }, - }); - expect(first?.run.friendlyId).toBeDefined(); - - // Action: same debounce key, mollify-stub gate. - const buffer = new CapturingMollifierBuffer(); - const mollifierService = new RunEngineTriggerTaskService({ - engine, - prisma, - payloadProcessor: new MockPayloadProcessor(), - queueConcern: new DefaultQueueManager(prisma, engine), - idempotencyKeyConcern, - validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), - tracer: trace.getTracer("test", "0.0.0"), - metadataMaximumSize: 1024 * 1024, - evaluateGate: async () => ({ - action: "mollify", - decision: { - divert: true, - reason: "per_env_rate", - count: 150, - threshold: 100, - windowMs: 200, - holdMs: 500, - }, - }), - getMollifierBuffer: () => buffer as never, - isMollifierGloballyEnabled: () => true, - }); - - const debounced = await mollifierService.call({ - taskId: taskIdentifier, - environment: authenticatedEnvironment, - body: { - payload: { test: "x" }, - options: { debounce: { key: "regression-debounce-6", delay: "30s" } }, - }, - }); - - // Customer-facing behaviour: the existing run is returned (correct). - expect(debounced).toBeDefined(); - expect(debounced?.run.friendlyId).toBe(first?.run.friendlyId); - - // Orphan: buffer.accept fired with the new friendlyId we generated - // upfront, and that friendlyId has no matching TaskRun in Postgres - // because engine.trigger returned the existing run via debounce. - expect(buffer.accepted).toHaveLength(1); - expect(buffer.accepted[0]!.runId).not.toBe(first?.run.friendlyId); - const orphanFriendlyId = buffer.accepted[0]!.runId; - const orphanRow = await prisma.taskRun.findFirst({ - where: { friendlyId: orphanFriendlyId }, - }); - expect(orphanRow).toBeNull(); - - await engine.quit(); - }, - ); }); describe("DefaultQueueManager task metadata cache", () => { diff --git a/apps/webapp/test/mollifierApplyMetadataMutation.test.ts b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts new file mode 100644 index 00000000000..61a3d2db167 --- /dev/null +++ b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts @@ -0,0 +1,186 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server"; +import type { BufferEntry, MollifierBuffer, CasSetMetadataResult } from "@trigger.dev/redis-worker"; + +// Regression for the CAS retry-exhaustion bug found by Phase F. The +// default `maxRetries` was 3, matching the PG-side service, but that +// exhausts fast when N external API writers race the same buffered +// run's metadata. Bumped to 12 + jittered backoff (commit 4e7d5d8a2). +// These tests simulate version_conflict races and assert (a) every +// delta lands and (b) the retry budget is sized for realistic +// concurrency. + +const NOW = new Date("2026-05-21T10:00:00Z"); + +type BufferStub = { + buffer: MollifierBuffer; + state: { + version: number; + metadata: Record; + pendingConflictsForNextN: number; + }; +}; + +// Build a stub MollifierBuffer that simulates Lua-CAS semantics +// in-memory. The first `pendingConflictsForNextN` casSetMetadata calls +// from any worker will return version_conflict (then the version +// bumps); subsequent calls succeed. +function makeBufferStub(initialPayload: Record = {}): BufferStub { + const state = { + version: 0, + metadata: initialPayload.metadata + ? (JSON.parse(initialPayload.metadata as string) as Record) + : {}, + pendingConflictsForNextN: 0, + }; + const entryTemplate: Omit = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, + materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + + const buffer: MollifierBuffer = { + getEntry: vi.fn(async (): Promise => ({ + ...entryTemplate, + metadataVersion: state.version, + payload: JSON.stringify({ ...initialPayload, metadata: JSON.stringify(state.metadata) }), + })), + casSetMetadata: vi.fn( + async (input: { + runId: string; + expectedVersion: number; + newMetadata: string; + newMetadataType: string; + }): Promise => { + // Inject a controlled number of conflicts to simulate races. + if (state.pendingConflictsForNextN > 0) { + state.pendingConflictsForNextN -= 1; + // Bump version as if some other writer just landed. + state.version += 1; + return { kind: "version_conflict", currentVersion: state.version }; + } + if (input.expectedVersion !== state.version) { + return { kind: "version_conflict", currentVersion: state.version }; + } + state.metadata = JSON.parse(input.newMetadata) as Record; + state.version += 1; + return { kind: "applied", newVersion: state.version }; + }, + ), + } as unknown as MollifierBuffer; + + return { buffer, state }; +} + +describe("applyMetadataMutationToBufferedRun — retry behaviour", () => { + it("succeeds when CAS lands on the first try (no contention)", async () => { + const { buffer, state } = makeBufferStub(); + const result = await applyMetadataMutationToBufferedRun({ + runId: "run_1", + body: { metadata: { counter: 1 } }, + buffer, + }); + expect(result.kind).toBe("applied"); + expect(state.metadata).toEqual({ counter: 1 }); + expect(state.version).toBe(1); + }); + + it("succeeds after 5 version conflicts (default budget = 12)", async () => { + const { buffer, state } = makeBufferStub(); + state.pendingConflictsForNextN = 5; + const result = await applyMetadataMutationToBufferedRun({ + runId: "run_1", + body: { operations: [{ type: "increment", key: "counter", value: 1 }] }, + buffer, + }); + expect(result.kind).toBe("applied"); + if (result.kind === "applied") { + expect(result.newMetadata.counter).toBe(1); + } + }); + + it("succeeds after 11 version conflicts (one under the default budget)", async () => { + const { buffer } = makeBufferStub(); + const setStateConflicts = (n: number) => { + // Re-read state from the closure + const state = (buffer as unknown as { __state__?: never; getEntry: () => Promise }); + void state; + }; + void setStateConflicts; + // Set conflicts directly via the shared state object + const { state } = makeBufferStub(); + state.pendingConflictsForNextN = 11; + // Build a fresh stub since we want one shared state instance + const stub = makeBufferStub(); + stub.state.pendingConflictsForNextN = 11; + const result = await applyMetadataMutationToBufferedRun({ + runId: "run_1", + body: { operations: [{ type: "increment", key: "counter", value: 1 }] }, + buffer: stub.buffer, + }); + expect(result.kind).toBe("applied"); + }); + + it("returns version_exhausted after retries are spent", async () => { + const stub = makeBufferStub(); + // 99 conflicts ≫ default budget of 12. With maxRetries 3 (the + // pre-fix value), this would have exhausted after 4 attempts. + stub.state.pendingConflictsForNextN = 99; + const result = await applyMetadataMutationToBufferedRun({ + runId: "run_1", + body: { operations: [{ type: "increment", key: "counter", value: 1 }] }, + buffer: stub.buffer, + maxRetries: 12, + }); + expect(result.kind).toBe("version_exhausted"); + }); + + it("regression: 3 retries are NOT enough under 50-way concurrency simulation", async () => { + // The pre-fix default would have lost most deltas under this + // contention. Asserting that the OLD budget (3) exhausts confirms + // the regression actually existed and the new budget addresses it. + const stub = makeBufferStub(); + stub.state.pendingConflictsForNextN = 8; + const result = await applyMetadataMutationToBufferedRun({ + runId: "run_1", + body: { operations: [{ type: "increment", key: "counter", value: 1 }] }, + buffer: stub.buffer, + maxRetries: 3, + }); + expect(result.kind).toBe("version_exhausted"); + }); + + it("N-way concurrent applies all converge under default budget", async () => { + // Simulate N parallel writers against a shared state. Each writer + // reads, applies a delta, CAS-writes. The Lua CAS forces them to + // retry until they see the latest version. + const N = 30; + const sharedStub = makeBufferStub(); + // Override the stub to model real per-attempt serialisation: each + // call reads the latest version, and CAS conflicts are organic + // (not pre-injected) when expectedVersion != current. + sharedStub.state.pendingConflictsForNextN = 0; + + const calls = Array.from({ length: N }, () => + applyMetadataMutationToBufferedRun({ + runId: "run_1", + body: { operations: [{ type: "increment", key: "counter", value: 1 }] }, + buffer: sharedStub.buffer, + }), + ); + const results = await Promise.all(calls); + const applied = results.filter((r) => r.kind === "applied").length; + expect(applied).toBe(N); + expect(sharedStub.state.metadata.counter).toBe(N); + }); +}); diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts new file mode 100644 index 00000000000..6f66cf2ab79 --- /dev/null +++ b/apps/webapp/test/mollifierDrainerHandler.test.ts @@ -0,0 +1,206 @@ +import { describe, expect, it, vi } from "vitest"; +import { trace } from "@opentelemetry/api"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { + createDrainerHandler, + isRetryablePgError, +} from "~/v3/mollifier/mollifierDrainerHandler.server"; + +describe("isRetryablePgError", () => { + it("returns true for P2024 (connection pool timeout)", () => { + const err = Object.assign(new Error("Timed out fetching a new connection"), { + code: "P2024", + }); + expect(isRetryablePgError(err)).toBe(true); + }); + + it("returns true for generic connection-lost messages", () => { + expect(isRetryablePgError(new Error("Connection lost"))).toBe(true); + expect(isRetryablePgError(new Error("Can't reach database server"))).toBe(true); + }); + + it("returns false for validation errors", () => { + expect(isRetryablePgError(new Error("Invalid payload"))).toBe(false); + }); + + it("returns false for non-Error inputs", () => { + expect(isRetryablePgError("string error")).toBe(false); + expect(isRetryablePgError({ message: "object" })).toBe(false); + }); +}); + +describe("createDrainerHandler", () => { + it("invokes engine.trigger with the deserialised snapshot", async () => { + const trigger = vi.fn(async () => ({ friendlyId: "run_x" })); + const handler = createDrainerHandler({ + engine: { trigger } as any, + prisma: {} as any, + }); + + await handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", payload: "{}" }, + attempts: 0, + createdAt: new Date(), + } as any); + + expect(trigger).toHaveBeenCalledOnce(); + const callArg = trigger.mock.calls[0][0] as { taskIdentifier: string }; + expect(callArg.taskIdentifier).toBe("t"); + }); + + it("re-attaches the snapshot's traceId so engine.trigger inherits the original trace", async () => { + // Captures the active traceId at the moment engine.trigger is invoked. + // Without context propagation it would be a fresh traceId, leaving the + // run-detail page with only the root span. + let observedTraceId: string | undefined; + const trigger = vi.fn(async () => { + observedTraceId = trace.getActiveSpan()?.spanContext().traceId; + return { friendlyId: "run_x" }; + }); + + const handler = createDrainerHandler({ + engine: { trigger } as any, + prisma: {} as any, + }); + + const snapshotTraceId = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + const snapshotSpanId = "bbbbbbbbbbbbbbbb"; + + await handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { + taskIdentifier: "t", + traceId: snapshotTraceId, + spanId: snapshotSpanId, + }, + attempts: 0, + createdAt: new Date(), + } as any); + + expect(observedTraceId).toBe(snapshotTraceId); + }); + + it("rethrows retryable PG errors so MollifierDrainer requeues the entry", async () => { + const err = new Error("Can't reach database server"); + const trigger = vi.fn(async () => { + throw err; + }); + const createFailedTaskRun = vi.fn(); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t" }, + attempts: 0, + createdAt: new Date(), + } as any), + ).rejects.toThrow("Can't reach database server"); + // Retryable: we do NOT write a SYSTEM_FAILURE row, the entry should + // be requeued for another shot. + expect(createFailedTaskRun).not.toHaveBeenCalled(); + }); + + const envFixture = { + id: "env_a", + type: "DEVELOPMENT", + project: { id: "proj_1" }, + organization: { id: "org_1" }, + }; + + it("writes a SYSTEM_FAILURE PG row when engine.trigger fails non-retryably", async () => { + const trigger = vi.fn(async () => { + throw new Error("validation failed: payload too large"); + }); + const createFailedTaskRun = vi.fn(async () => ({ + id: "internal", + friendlyId: "run_x", + })); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", environment: envFixture }, + attempts: 0, + createdAt: new Date(), + } as any), + ).resolves.toBeUndefined(); + + expect(trigger).toHaveBeenCalledOnce(); + expect(createFailedTaskRun).toHaveBeenCalledOnce(); + const arg = createFailedTaskRun.mock.calls[0][0] as { error: { raw: string } }; + expect(arg.error.raw).toContain("validation failed"); + }); + + it("rethrows the original error when createFailedTaskRun also fails (PG genuinely unreachable)", async () => { + const triggerErr = new Error("engine rejected the snapshot"); + const trigger = vi.fn(async () => { + throw triggerErr; + }); + const createFailedTaskRun = vi.fn(async () => { + throw new Error("connection refused"); + }); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", environment: envFixture }, + attempts: 0, + createdAt: new Date(), + } as any), + ).rejects.toThrow("engine rejected the snapshot"); + // Drainer's outer drainOne loop now decides retry vs buffer.fail. + expect(createFailedTaskRun).toHaveBeenCalledOnce(); + }); + + it("rethrows the original error when the snapshot lacks an environment block", async () => { + const triggerErr = new Error("engine rejected the snapshot"); + const trigger = vi.fn(async () => { + throw triggerErr; + }); + const createFailedTaskRun = vi.fn(); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t" /* no environment */ }, + attempts: 0, + createdAt: new Date(), + } as any), + ).rejects.toThrow("engine rejected the snapshot"); + expect(createFailedTaskRun).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/mollifierGate.test.ts b/apps/webapp/test/mollifierGate.test.ts index b81df7f0c5b..c951cf70896 100644 --- a/apps/webapp/test/mollifierGate.test.ts +++ b/apps/webapp/test/mollifierGate.test.ts @@ -432,3 +432,83 @@ describe("evaluateGate — per-org isolation via Organization.featureFlags", () expect(unrelatedDeps.spies.evaluatorCalls).toBe(0); }); }); + +// C1/C3/F4 bypasses: the three categories of trigger that the mollifier never +// intercepts, regardless of the per-org flag or the trip-evaluator decision. +// Documented in `_plans/2026-05-13-mollifier-{debounce,otu,trigger-and-wait}-protection.md`. +describe("evaluateGate — C1/C3/F4 bypasses", () => { + it("C1: debounce triggers pass through without invoking the evaluator", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { ...inputs, options: { debounce: { key: "k" } } }, + deps, + ); + expect(outcome).toEqual({ action: "pass_through" }); + expect(spies.evaluatorCalls).toBe(0); + }); + + it("C3: oneTimeUseToken triggers pass through without invoking the evaluator", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { ...inputs, options: { oneTimeUseToken: "jwt-otu" } }, + deps, + ); + expect(outcome).toEqual({ action: "pass_through" }); + expect(spies.evaluatorCalls).toBe(0); + }); + + it("F4: single triggerAndWait (parentTaskRunId + resumeParentOnCompletion) passes through", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { + ...inputs, + options: { parentTaskRunId: "run_parent", resumeParentOnCompletion: true }, + }, + deps, + ); + expect(outcome).toEqual({ action: "pass_through" }); + expect(spies.evaluatorCalls).toBe(0); + }); + + it("parentTaskRunId alone (no resumeParentOnCompletion) does NOT bypass — must be both for F4", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { ...inputs, options: { parentTaskRunId: "run_parent" } }, + deps, + ); + expect(outcome.action).toBe("mollify"); + expect(spies.evaluatorCalls).toBe(1); + }); + + it("bypass records pass_through decision (so observability counters stay accurate)", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + await evaluateGate({ ...inputs, options: { debounce: { key: "k" } } }, deps); + expect(spies.recordDecisionCalls).toHaveLength(1); + expect(spies.recordDecisionCalls[0].outcome).toBe("pass_through"); + }); +}); diff --git a/apps/webapp/test/mollifierIdempotencyClaim.test.ts b/apps/webapp/test/mollifierIdempotencyClaim.test.ts new file mode 100644 index 00000000000..786ed5cf22c --- /dev/null +++ b/apps/webapp/test/mollifierIdempotencyClaim.test.ts @@ -0,0 +1,206 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { + claimOrAwait, + publishClaim, + releaseClaim, +} from "~/v3/mollifier/idempotencyClaim.server"; +import type { + IdempotencyClaimResult, + MollifierBuffer, +} from "@trigger.dev/redis-worker"; + +type ClaimState = { + value: string | null; + // Scripted return sequence for claimIdempotency calls. When set, + // overrides the default behaviour of returning based on `value`. + scriptedClaims?: IdempotencyClaimResult[]; +}; + +function makeBuffer(initial: ClaimState = { value: null }): { + buffer: MollifierBuffer; + state: ClaimState; +} { + const state = { ...initial }; + const buffer = { + claimIdempotency: vi.fn(async (): Promise => { + if (state.scriptedClaims && state.scriptedClaims.length > 0) { + return state.scriptedClaims.shift()!; + } + if (state.value === null) { + state.value = "pending"; + return { kind: "claimed" }; + } + if (state.value === "pending") return { kind: "pending" }; + return { kind: "resolved", runId: state.value }; + }), + readClaim: vi.fn(async (): Promise => { + if (state.value === null) return null; + if (state.value === "pending") return { kind: "pending" }; + return { kind: "resolved", runId: state.value }; + }), + publishClaim: vi.fn(async ({ runId }: { runId: string }) => { + state.value = runId; + }), + releaseClaim: vi.fn(async () => { + state.value = null; + }), + } as unknown as MollifierBuffer; + return { buffer, state }; +} + +const baseInput = { + envId: "env_a", + taskIdentifier: "my-task", + idempotencyKey: "k-1", +}; + +describe("claimOrAwait", () => { + it("returns 'claimed' for the first caller — empty key wins SETNX", async () => { + const { buffer } = makeBuffer({ value: null }); + const outcome = await claimOrAwait({ ...baseInput, buffer }); + expect(outcome).toEqual({ kind: "claimed" }); + }); + + it("returns 'resolved' immediately when the key already holds a runId", async () => { + const { buffer } = makeBuffer({ value: "run_X" }); + const outcome = await claimOrAwait({ ...baseInput, buffer }); + expect(outcome).toEqual({ kind: "resolved", runId: "run_X" }); + }); + + it("polls a pending key, then resolves when the runId is published", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + let nowValue = 0; + let pollCount = 0; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + pollCount += 1; + if (pollCount === 3) state.value = "run_X"; + }, + safetyNetMs: 1000, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "resolved", runId: "run_X" }); + }); + + it("returns 'timed_out' when the key stays pending past safetyNetMs", async () => { + const { buffer } = makeBuffer({ value: "pending" }); + let nowValue = 0; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + }, + safetyNetMs: 50, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "timed_out" }); + }); + + it("retries the claim when a polled key vanishes (claimant released)", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + let nowValue = 0; + let pollCount = 0; + // Scripted retry: on the second `claimIdempotency` call we win. + state.scriptedClaims = [ + { kind: "pending" }, // first call (initial) + { kind: "claimed" }, // second call (retry after release) + ]; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + pollCount += 1; + // First poll cycle: key vanishes (release). + if (pollCount === 1) state.value = null; + }, + safetyNetMs: 1000, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "claimed" }); + }); + + it("fails open with 'claimed' when buffer is null (mollifier disabled)", async () => { + const outcome = await claimOrAwait({ ...baseInput, buffer: null }); + expect(outcome).toEqual({ kind: "claimed" }); + }); + + it("fails open with 'claimed' if buffer.claimIdempotency throws (Redis down)", async () => { + const buffer = { + claimIdempotency: vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }), + } as unknown as MollifierBuffer; + const outcome = await claimOrAwait({ ...baseInput, buffer }); + expect(outcome).toEqual({ kind: "claimed" }); + }); + + it("respects an aborted signal during the wait loop", async () => { + const { buffer } = makeBuffer({ value: "pending" }); + const controller = new AbortController(); + let nowValue = 0; + let pollCount = 0; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + pollCount += 1; + if (pollCount === 1) controller.abort(); + }, + abortSignal: controller.signal, + safetyNetMs: 5000, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "timed_out" }); + }); +}); + +describe("publishClaim", () => { + it("writes the runId to the claim key", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + await publishClaim({ ...baseInput, runId: "run_X", buffer }); + expect(state.value).toBe("run_X"); + expect(buffer.publishClaim).toHaveBeenCalledOnce(); + }); + + it("no-op when buffer is null", async () => { + await expect( + publishClaim({ ...baseInput, runId: "run_X", buffer: null }), + ).resolves.toBeUndefined(); + }); + + it("swallows errors so trigger pipeline isn't broken by Redis hiccups", async () => { + const buffer = { + publishClaim: vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }), + } as unknown as MollifierBuffer; + await expect( + publishClaim({ ...baseInput, runId: "run_X", buffer }), + ).resolves.toBeUndefined(); + }); +}); + +describe("releaseClaim", () => { + it("DELs the claim so waiters can re-acquire", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + await releaseClaim({ ...baseInput, buffer }); + expect(state.value).toBeNull(); + }); + + it("no-op when buffer is null", async () => { + await expect(releaseClaim({ ...baseInput, buffer: null })).resolves.toBeUndefined(); + }); +}); diff --git a/apps/webapp/test/mollifierMollify.test.ts b/apps/webapp/test/mollifierMollify.test.ts new file mode 100644 index 00000000000..c0bb6dec0e4 --- /dev/null +++ b/apps/webapp/test/mollifierMollify.test.ts @@ -0,0 +1,92 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { mollifyTrigger } from "~/v3/mollifier/mollifierMollify.server"; +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; + +function fakeBuffer( + acceptResult: Awaited> = { kind: "accepted" }, +): { buffer: MollifierBuffer; accept: ReturnType } { + const accept = vi.fn(async () => acceptResult); + return { + buffer: { accept } as unknown as MollifierBuffer, + accept, + }; +} + +describe("mollifyTrigger", () => { + it("writes the snapshot to buffer and returns synthesised result", async () => { + const { buffer, accept } = fakeBuffer(); + const result = await mollifyTrigger({ + runFriendlyId: "run_friendly_1", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: { taskIdentifier: "my-task", payload: '{"x":1}' }, + decision: { + divert: true, + reason: "per_env_rate", + count: 150, + threshold: 100, + }, + buffer, + }); + + expect(accept).toHaveBeenCalledOnce(); + expect(accept).toHaveBeenCalledWith({ + runId: "run_friendly_1", + envId: "env_a", + orgId: "org_1", + payload: expect.any(String), + idempotencyKey: undefined, + taskIdentifier: undefined, + }); + expect(result.run.friendlyId).toBe("run_friendly_1"); + expect(result.error).toBeUndefined(); + expect(result.isCached).toBe(false); + expect(result.notice).toEqual({ + code: "mollifier.queued", + message: expect.stringContaining("burst buffer"), + docs: expect.stringContaining("trigger.dev/docs"), + }); + }); + + it("echoes the winner's runId with isCached=true on duplicate_idempotency", async () => { + const { buffer } = fakeBuffer({ + kind: "duplicate_idempotency", + existingRunId: "run_winner", + }); + const result = await mollifyTrigger({ + runFriendlyId: "run_loser", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: { taskIdentifier: "t", payload: "{}" }, + decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 }, + buffer, + idempotencyKey: "key", + taskIdentifier: "t", + }); + expect(result.run.friendlyId).toBe("run_winner"); + expect(result.isCached).toBe(true); + expect(result.notice).toBeUndefined(); + }); + + it("snapshot is round-trippable: payload field is parseable JSON of engineTriggerInput", async () => { + const { buffer, accept } = fakeBuffer(); + const engineInput = { taskIdentifier: "t", payload: "{}", tags: ["a", "b"] }; + await mollifyTrigger({ + runFriendlyId: "run_x", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: engineInput, + decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 }, + buffer, + }); + + const callArg = accept.mock.calls[0][0] as { payload: string }; + expect(JSON.parse(callArg.payload)).toEqual(engineInput); + }); +}); diff --git a/apps/webapp/test/mollifierMutateWithFallback.test.ts b/apps/webapp/test/mollifierMutateWithFallback.test.ts new file mode 100644 index 00000000000..ea688772847 --- /dev/null +++ b/apps/webapp/test/mollifierMutateWithFallback.test.ts @@ -0,0 +1,188 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: { taskRun: { findFirst: vi.fn(async () => null) } }, + $replica: { taskRun: { findFirst: vi.fn(async () => null) } }, +})); + +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; +import type { MollifierBuffer, MutateSnapshotResult } from "@trigger.dev/redis-worker"; +import type { TaskRun } from "@trigger.dev/database"; + +type FindFirst = ReturnType; +type PrismaStub = { taskRun: { findFirst: FindFirst } }; + +function fakePrisma(rows: Array): PrismaStub { + const fn = vi.fn(); + for (const r of rows) fn.mockResolvedValueOnce(r); + fn.mockResolvedValue(null); + return { taskRun: { findFirst: fn } }; +} + +function bufferReturning(result: MutateSnapshotResult): MollifierBuffer { + return { + mutateSnapshot: vi.fn(async () => result), + } as unknown as MollifierBuffer; +} + +const fakeRun = (overrides: Partial = {}): TaskRun => + ({ + id: "pg_id", + friendlyId: "run_1", + runtimeEnvironmentId: "env_a", + ...overrides, + }) as TaskRun; + +const baseInput = { + runId: "run_1", + environmentId: "env_a", + organizationId: "org_1", + bufferPatch: { type: "append_tags" as const, tags: ["x"] }, +}; + +describe("mutateWithFallback", () => { + it("hits replica → calls pgMutation, returns pg outcome", async () => { + const row = fakeRun(); + const pgMutation = vi.fn(async () => "pg-response"); + const synthesisedResponse = vi.fn(() => "snapshot-response"); + + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse, + prismaReplica: fakePrisma([row]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("applied_to_snapshot"), + }); + + expect(result).toEqual({ kind: "pg", response: "pg-response" }); + expect(pgMutation).toHaveBeenCalledWith(row); + expect(synthesisedResponse).not.toHaveBeenCalled(); + }); + + it("replica miss + buffer applied_to_snapshot → synthesisedResponse", async () => { + const pgMutation = vi.fn(async () => "pg"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("applied_to_snapshot"), + }); + expect(result).toEqual({ kind: "snapshot", response: "snap" }); + expect(pgMutation).not.toHaveBeenCalled(); + }); + + it("replica miss + buffer not_found + writer miss → not_found", async () => { + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([null]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("not_found"), + }); + expect(result).toEqual({ kind: "not_found" }); + }); + + it("replica miss + buffer not_found + writer hit → pgMutation (replica-lag recovery)", async () => { + const row = fakeRun({ friendlyId: "run_1" }); + const pgMutation = vi.fn(async () => "pg-recovered"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([row]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("not_found"), + }); + expect(result).toEqual({ kind: "pg", response: "pg-recovered" }); + expect(pgMutation).toHaveBeenCalledWith(row); + }); + + it("replica miss + buffer busy + writer resolves mid-wait → pgMutation", async () => { + const row = fakeRun(); + const pgMutation = vi.fn(async () => "pg-after-wait"); + // Replica misses; writer misses twice, then hits. + const writer = fakePrisma([null, null, row]); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("busy"), + sleep: async () => { + nowValue += 20; + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + pgTimeoutMs: 50, + }); + expect(result).toEqual({ kind: "pg", response: "pg-after-wait" }); + expect(pgMutation).toHaveBeenCalledWith(row); + // Writer should have been polled 3 times before the hit. + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(3); + }); + + it("replica miss + buffer busy + drainer never resolves → timed_out", async () => { + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([null, null, null, null, null]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("busy"), + sleep: async () => { + nowValue += 20; + }, + now: () => nowValue, + safetyNetMs: 60, + pollStepMs: 20, + pgTimeoutMs: 5, + }); + expect(result).toEqual({ kind: "timed_out" }); + }); + + it("abort signal during wait → timed_out without further polls", async () => { + const writer = fakePrisma([null, null, null]); + const controller = new AbortController(); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("busy"), + sleep: async () => { + nowValue += 20; + controller.abort(); + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + pgTimeoutMs: 5, + abortSignal: controller.signal, + }); + expect(result).toEqual({ kind: "timed_out" }); + // One poll happened before the sleep+abort. + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1); + }); + + it("buffer is null (mollifier disabled) → not_found after replica miss", async () => { + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => null, + }); + expect(result).toEqual({ kind: "not_found" }); + }); +}); diff --git a/apps/webapp/test/mollifierReadFallback.test.ts b/apps/webapp/test/mollifierReadFallback.test.ts new file mode 100644 index 00000000000..b30c3477f44 --- /dev/null +++ b/apps/webapp/test/mollifierReadFallback.test.ts @@ -0,0 +1,278 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import type { MollifierBuffer, BufferEntry } from "@trigger.dev/redis-worker"; + +function fakeBuffer(entry: BufferEntry | null): MollifierBuffer { + return { + getEntry: vi.fn(async () => entry), + } as unknown as MollifierBuffer; +} + +const NOW = new Date("2026-05-11T12:00:00Z"); + +describe("findRunByIdWithMollifierFallback", () => { + it("returns null when buffer is unavailable (mollifier disabled)", async () => { + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => null }, + ); + expect(result).toBeNull(); + }); + + it("returns null when no buffer entry exists", async () => { + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(null) }, + ); + expect(result).toBeNull(); + }); + + it("returns null when buffer entry envId does not match caller (auth mismatch)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_OTHER", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).toBeNull(); + }); + + it("returns null when buffer entry orgId does not match caller (auth mismatch)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_OTHER", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).toBeNull(); + }); + + it("returns synthesised QUEUED run when entry exists with matching auth", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "my-task" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.friendlyId).toBe("run_1"); + expect(result!.status).toBe("QUEUED"); + expect(result!.taskIdentifier).toBe("my-task"); + expect(result!.createdAt).toEqual(NOW); + }); + + it("returns synthesised QUEUED for DRAINING (internal state same externally)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "DRAINING", + attempts: 1, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.status).toBe("QUEUED"); + }); + + it("returns FAILED state with structured error for FAILED entries", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "FAILED", + attempts: 3, + createdAt: NOW, + lastError: { code: "VALIDATION", message: "task not found" }, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.status).toBe("FAILED"); + expect(result!.error).toEqual({ code: "VALIDATION", message: "task not found" }); + }); + + it("extracts snapshot-derived fields from the buffered payload", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "my-task", + payload: '{"foo":"bar"}', + payloadType: "application/json", + metadata: '{"customer":"acme"}', + metadataType: "application/json", + idempotencyKey: "client-abc", + idempotencyKeyOptions: ["payload"], + isTest: true, + depth: 2, + ttl: "1h", + tags: ["tag-a", "tag-b"], + lockToVersion: "20260511.1", + resumeParentOnCompletion: false, + parentTaskRunId: "run_parent", + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.payloadType).toBe("application/json"); + expect(result!.metadata).toBe('{"customer":"acme"}'); + expect(result!.metadataType).toBe("application/json"); + expect(result!.idempotencyKey).toBe("client-abc"); + expect(result!.idempotencyKeyOptions).toEqual(["payload"]); + expect(result!.isTest).toBe(true); + expect(result!.depth).toBe(2); + expect(result!.ttl).toBe("1h"); + expect(result!.tags).toEqual(["tag-a", "tag-b"]); + expect(result!.lockedToVersion).toBe("20260511.1"); + expect(result!.resumeParentOnCompletion).toBe(false); + expect(result!.parentTaskRunId).toBe("run_parent"); + }); + + it("extracts gate-allocated trace context from the snapshot", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + traceId: "trace_abc", + spanId: "span_xyz", + parentSpanId: "span_parent", + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.traceId).toBe("trace_abc"); + expect(result!.spanId).toBe("span_xyz"); + expect(result!.parentSpanId).toBe("span_parent"); + }); + + it("defaults snapshot-derived fields to safe values when absent", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.payloadType).toBeUndefined(); + expect(result!.metadata).toBeUndefined(); + expect(result!.idempotencyKey).toBeUndefined(); + expect(result!.isTest).toBe(false); + expect(result!.depth).toBe(0); + expect(result!.tags).toEqual([]); + expect(result!.resumeParentOnCompletion).toBe(false); + expect(result!.traceId).toBeUndefined(); + expect(result!.spanId).toBeUndefined(); + }); + + it("populates replay-relevant fields from the snapshot", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "my-task", + environment: { id: "env_a" }, + workerQueue: "default", + queue: "task/my-task", + concurrencyKey: "tenant-42", + machine: "medium-1x", + realtimeStreamsVersion: "v2", + seedMetadata: '{"k":"v"}', + seedMetadataType: "application/json", + tags: ["t1", "t2"], + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.id).toBeTypeOf("string"); + expect(result!.id.length).toBeGreaterThan(0); + expect(result!.engine).toBe("V2"); + expect(result!.runtimeEnvironmentId).toBe("env_a"); + expect(result!.workerQueue).toBe("default"); + expect(result!.queue).toBe("task/my-task"); + expect(result!.concurrencyKey).toBe("tenant-42"); + expect(result!.machinePreset).toBe("medium-1x"); + expect(result!.realtimeStreamsVersion).toBe("v2"); + expect(result!.seedMetadata).toBe('{"k":"v"}'); + expect(result!.seedMetadataType).toBe("application/json"); + expect(result!.runTags).toEqual(["t1", "t2"]); + }); + + it("falls back to entry.envId for runtimeEnvironmentId when snapshot lacks environment.id", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.runtimeEnvironmentId).toBe("env_a"); + expect(result!.workerQueue).toBeUndefined(); + expect(result!.queue).toBeUndefined(); + }); +}); diff --git a/apps/webapp/test/mollifierRealtimeRunResource.test.ts b/apps/webapp/test/mollifierRealtimeRunResource.test.ts new file mode 100644 index 00000000000..2f53ecb892f --- /dev/null +++ b/apps/webapp/test/mollifierRealtimeRunResource.test.ts @@ -0,0 +1,90 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { resolveRealtimeRunResource } from "~/v3/mollifier/realtimeRunResource.server"; + +const pgRun = { + id: "pg_internal_id", + friendlyId: "run_pg_friendly", + taskIdentifier: "hello-world", + runTags: ["a", "b"], + batch: { friendlyId: "batch_1" }, +}; + +const bufferedSynthetic = { + id: "buffered_id", + friendlyId: "run_buffered_id", + taskIdentifier: "hello-world", + runTags: ["c"], + // Six seconds ago against the fixed `now` below. + createdAt: new Date("2026-05-22T12:00:00.000Z"), +}; + +const fixedNow = () => new Date("2026-05-22T12:00:06.000Z").getTime(); + +describe("resolveRealtimeRunResource", () => { + it("returns the PG run unchanged when one exists", () => { + // PG wins even if the buffer also has the entry — the drainer may + // be racing the route call and the PG row is the canonical source. + expect( + resolveRealtimeRunResource({ pgRun, bufferedSynthetic: null }), + ).toEqual(pgRun); + expect( + resolveRealtimeRunResource({ pgRun, bufferedSynthetic }), + ).toEqual(pgRun); + }); + + it("never stamps __bufferedDwellMs on a PG-sourced resource", () => { + // The loader body uses __bufferedDwellMs as a discriminant for + // emitting buffered-subscription observability. A PG-resident run + // must never carry it or every PG subscription would over-count. + const result = resolveRealtimeRunResource({ pgRun, bufferedSynthetic }); + expect(result).not.toHaveProperty("__bufferedDwellMs"); + }); + + it("synthesises a resource from the buffered entry when PG misses", () => { + // Load-bearing assertion: `id` must equal `bufferedSynthetic.id`. + // The realtime route hands this `id` to streamRun, which builds + // Electric's `WHERE id=''` clause. When the drainer materialises + // the run, engine.trigger writes the row with that same id (derived + // deterministically from friendlyId), and Electric streams the + // INSERT to the client. If the synthesised `id` ever drifts from + // what the drainer writes, the customer subscribes to a shape that + // never matches and the hook silently hangs even after materialise. + const result = resolveRealtimeRunResource({ + pgRun: null, + bufferedSynthetic, + now: fixedNow, + }); + expect(result).toEqual({ + id: "buffered_id", + friendlyId: "run_buffered_id", + taskIdentifier: "hello-world", + runTags: ["c"], + batch: null, + __bufferedDwellMs: 6000, + }); + }); + + it("defaults a missing taskIdentifier to empty string", () => { + const result = resolveRealtimeRunResource({ + pgRun: null, + bufferedSynthetic: { ...bufferedSynthetic, taskIdentifier: undefined }, + now: fixedNow, + }); + expect(result?.taskIdentifier).toBe(""); + }); + + it("returns null when neither PG nor buffer have the run", () => { + // This is the genuine not-found case — typo'd runId, deleted run, + // etc. The api-builder maps null to 404. Critically, the buffered- + // fallback must NOT promote a missing run to a synthetic resource — + // that would cause Electric to open a shape for a runId that may + // never exist, which is also a silent-hang situation but for a + // different reason. + expect( + resolveRealtimeRunResource({ pgRun: null, bufferedSynthetic: null }), + ).toBeNull(); + }); +}); diff --git a/apps/webapp/test/mollifierRealtimeRunResourceBuffer.test.ts b/apps/webapp/test/mollifierRealtimeRunResourceBuffer.test.ts new file mode 100644 index 00000000000..5cf0610b73b --- /dev/null +++ b/apps/webapp/test/mollifierRealtimeRunResourceBuffer.test.ts @@ -0,0 +1,152 @@ +import { describe, expect, vi } from "vitest"; +import { redisTest } from "@internal/testcontainers"; +import { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { resolveRealtimeRunResource } from "~/v3/mollifier/realtimeRunResource.server"; + +const SNAPSHOT_BASE = { + friendlyId: "run_phase52e2e", + taskIdentifier: "hello-world", + payload: '{"x":1}', + payloadType: "application/json", + traceContext: { traceparent: "00-0123456789abcdef0123456789abcdef-fedcba9876543210-01" }, + traceId: "0123456789abcdef0123456789abcdef", + spanId: "fedcba9876543210", + queue: "task/hello-world", + tags: ["realtime-e2e"], + depth: 0, + isTest: false, + taskEventStore: "taskEvent", +}; + +// End-to-end: a real MollifierBuffer has an entry, the real +// readFallback helper deserialises it, and the resolveRealtimeRunResource +// helper produces the resource shape the realtime route returns from +// findResource. Regression intent: if any link in the chain breaks — +// buffer interface rename, snapshot field rename, id-derivation drift, +// synthetic-shape change — this test fails. The route file itself is +// then a thin glue layer over tested pieces. +describe("realtime buffered-subscription resource resolution (testcontainers)", () => { + redisTest( + "synthesises a resource whose `id` matches RunId.fromFriendlyId", + async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: SNAPSHOT_BASE.friendlyId, + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT_BASE), + }); + + const bufferedSynthetic = await findRunByIdWithMollifierFallback( + { + runId: SNAPSHOT_BASE.friendlyId, + environmentId: "env_a", + organizationId: "org_1", + }, + { getBuffer: () => buffer }, + ); + expect(bufferedSynthetic).not.toBeNull(); + + const resource = resolveRealtimeRunResource({ + pgRun: null, + bufferedSynthetic, + }); + + // The load-bearing contract: the resolved `id` MUST equal what + // engine.trigger will write to PG.TaskRun.id when the drainer + // materialises this run. Electric's `WHERE id=''` clause + // depends on this match — drift means a silent-hang regression. + expect(resource?.id).toBe(RunId.fromFriendlyId(SNAPSHOT_BASE.friendlyId)); + expect(resource?.friendlyId).toBe(SNAPSHOT_BASE.friendlyId); + expect(resource?.taskIdentifier).toBe("hello-world"); + expect(resource?.runTags).toEqual(["realtime-e2e"]); + expect(resource?.batch).toBeNull(); + expect(resource?.__bufferedDwellMs).toBeTypeOf("number"); + expect(resource?.__bufferedDwellMs).toBeGreaterThanOrEqual(0); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns null when neither PG nor the buffer have the entry", + async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + const bufferedSynthetic = await findRunByIdWithMollifierFallback( + { + runId: "run_does_not_exist", + environmentId: "env_a", + organizationId: "org_1", + }, + { getBuffer: () => buffer }, + ); + expect(bufferedSynthetic).toBeNull(); + + const resource = resolveRealtimeRunResource({ + pgRun: null, + bufferedSynthetic, + }); + // The api builder relies on this null to emit a real 404 for + // genuinely missing runs. If we ever promote unknown runIds to + // synthetic resources here, the route opens an Electric shape + // for a run that may never exist — a different silent-hang + // failure mode for typos, deleted runs, etc. + expect(resource).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "does not fall back to buffer when PG has the row", + async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: SNAPSHOT_BASE.friendlyId, + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT_BASE), + }); + + // Simulate the drainer having materialised the run: PG has the + // canonical row, the buffer still has its entry (would be + // ack'd & removed in real ops). The resolver must return the + // PG row and NOT carry the __bufferedDwellMs flag — otherwise + // the loader body would emit a buffered-subscription log for a + // run that's actually PG-resident, over-counting the signal. + const pgRun = { + id: RunId.fromFriendlyId(SNAPSHOT_BASE.friendlyId), + friendlyId: SNAPSHOT_BASE.friendlyId, + taskIdentifier: "hello-world", + runTags: ["realtime-e2e"], + batch: null, + }; + + const bufferedSynthetic = await findRunByIdWithMollifierFallback( + { + runId: SNAPSHOT_BASE.friendlyId, + environmentId: "env_a", + organizationId: "org_1", + }, + { getBuffer: () => buffer }, + ); + + const resource = resolveRealtimeRunResource({ pgRun, bufferedSynthetic }); + expect(resource).toEqual(pgRun); + expect(resource).not.toHaveProperty("__bufferedDwellMs"); + } finally { + await buffer.close(); + } + }, + ); +}); diff --git a/apps/webapp/test/mollifierRealtimeSubscription.test.ts b/apps/webapp/test/mollifierRealtimeSubscription.test.ts new file mode 100644 index 00000000000..0ea0471a5f1 --- /dev/null +++ b/apps/webapp/test/mollifierRealtimeSubscription.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { isInitialBufferedSubscriptionRequest } from "~/v3/mollifier/mollifierTelemetry.server"; + +describe("isInitialBufferedSubscriptionRequest", () => { + // Electric's shape-stream protocol returns a `handle=` in + // the first response. The SDK echoes that handle on every reconnect / + // live-poll iteration thereafter. The realtime route logs + + // increments the mollifier.realtime_subscriptions.buffered counter + // only on the initial connect (handle absent) so each subscription + // produces a single observability event instead of one per + // long-poll round-trip (~20s). + it("returns true for the SDK's initial GET (no handle param)", () => { + expect( + isInitialBufferedSubscriptionRequest( + "http://localhost:3030/realtime/v1/runs/run_x?log=full&offset=-1", + ), + ).toBe(true); + }); + + it("returns false for Electric's reconnects (handle present)", () => { + expect( + isInitialBufferedSubscriptionRequest( + "http://localhost:3030/realtime/v1/runs/run_x?handle=100344308-1779&log=full&offset=0_0", + ), + ).toBe(false); + }); + + it("returns false for Electric live-poll reconnects (handle + cursor)", () => { + expect( + isInitialBufferedSubscriptionRequest( + "http://localhost:3030/realtime/v1/runs/run_x?cursor=51020980&handle=100344308&live=true&log=full&offset=0_inf", + ), + ).toBe(false); + }); + + it("accepts a URL instance as well as a string", () => { + const url = new URL("http://localhost:3030/realtime/v1/runs/run_x?log=full"); + expect(isInitialBufferedSubscriptionRequest(url)).toBe(true); + }); +}); diff --git a/apps/webapp/test/mollifierResolveRunForMutation.test.ts b/apps/webapp/test/mollifierResolveRunForMutation.test.ts new file mode 100644 index 00000000000..c552a3cd182 --- /dev/null +++ b/apps/webapp/test/mollifierResolveRunForMutation.test.ts @@ -0,0 +1,154 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: { taskRun: { findFirst: vi.fn(async () => null) } }, +})); + +import { resolveRunForMutation } from "~/v3/mollifier/resolveRunForMutation.server"; +import type { BufferEntry, MollifierBuffer } from "@trigger.dev/redis-worker"; + +// Regression coverage for the cancel-route 404 bug (commit b490afe23). +// Before the fix the route had `findResource: async () => null`, which +// caused the route builder to 404 every cancel — including for valid +// PG-row runs — BEFORE the action handler could run. The helper +// resolveRunForMutation has to return a non-null discriminated value +// whenever the run exists in either store. + +const NOW = new Date("2026-05-21T10:00:00Z"); + +function fakeReplica(row: { friendlyId: string } | null) { + return { taskRun: { findFirst: vi.fn(async () => row) } }; +} + +function fakeBuffer(entry: BufferEntry | null): MollifierBuffer { + return { + getEntry: vi.fn(async () => entry), + } as unknown as MollifierBuffer; +} + +const baseInput = { + runParam: "run_1", + environmentId: "env_a", + organizationId: "org_1", +}; + +describe("resolveRunForMutation", () => { + it("returns { source: 'pg' } when the PG row exists", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica({ friendlyId: "run_1" }), + getBuffer: () => null, + }, + }); + expect(result).toEqual({ source: "pg", friendlyId: "run_1" }); + }); + + it("returns { source: 'buffer' } when PG misses and the buffer entry matches env+org", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: "{}", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, + materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(entry), + }, + }); + expect(result).toEqual({ source: "buffer", friendlyId: "run_1" }); + }); + + it("returns null when PG misses and the buffer entry env doesn't match", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_OTHER", + orgId: "org_1", + payload: "{}", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, + materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(entry), + }, + }); + expect(result).toBeNull(); + }); + + it("returns null when PG misses and the buffer entry org doesn't match", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_OTHER", + payload: "{}", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, + materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(entry), + }, + }); + expect(result).toBeNull(); + }); + + it("returns null when both PG and buffer miss", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(null), + }, + }); + expect(result).toBeNull(); + }); + + it("returns null when buffer is unavailable (mollifier disabled) and PG misses", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => null, + }, + }); + expect(result).toBeNull(); + }); + + it("PG-hit short-circuits before consulting the buffer", async () => { + const buffer = fakeBuffer(null); + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica({ friendlyId: "run_1" }), + getBuffer: () => buffer, + }, + }); + expect(result?.source).toBe("pg"); + expect(buffer.getEntry).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts new file mode 100644 index 00000000000..029b90cb761 --- /dev/null +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -0,0 +1,231 @@ +import { describe, expect, it, vi } from "vitest"; +import { redisTest } from "@internal/testcontainers"; +import { MollifierBuffer } from "@trigger.dev/redis-worker"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { runStaleSweepOnce } from "~/v3/mollifier/mollifierStaleSweep.server"; + +const SNAPSHOT = { + taskIdentifier: "hello-world", + payload: '{"x":1}', + payloadType: "application/json", + traceContext: {}, +}; + +function spyDeps() { + const recordedStaleEnvIds: string[] = []; + const snapshots: Array> = []; + const warnings: Array<{ message: string; fields: Record }> = []; + return { + recordedStaleEnvIds, + snapshots, + warnings, + deps: { + recordStaleEntry: (envId: string) => { + recordedStaleEnvIds.push(envId); + }, + reportStaleEntrySnapshot: (snapshot: Map) => { + // Clone so post-sweep assertions see what was reported *at that + // call site*, not whatever subsequent passes mutate the source + // map into. + snapshots.push(new Map(snapshot)); + }, + logger: { + warn: (message: string, fields: Record) => { + warnings.push({ message, fields }); + }, + }, + }, + }; +} + +describe("runStaleSweepOnce — unit", () => { + it("returns zeros when the buffer is null", async () => { + // Mirrors the prod gate: if TRIGGER_MOLLIFIER_ENABLED=0 the buffer + // singleton is null and the sweep is a no-op. We don't want it to + // emit a metric (or throw) just because mollifier is disabled. + const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 1000 }, + { ...deps, getBuffer: () => null }, + ); + expect(result).toEqual({ + orgsScanned: 0, + envsScanned: 0, + entriesScanned: 0, + staleCount: 0, + }); + expect(recordedStaleEnvIds).toEqual([]); + expect(warnings).toEqual([]); + // An empty snapshot is still reported so any previously-paging env + // (from a prior sweep before mollifier was disabled) clears. + expect(snapshots).toHaveLength(1); + expect(snapshots[0].size).toBe(0); + }); +}); + +describe("runStaleSweepOnce — testcontainers", () => { + redisTest( + "flags entries whose dwell exceeds the stale threshold and skips fresh ones", + async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + // Two stale entries (one in each env) + one fresh entry. Sweep + // should flag the two stale, leave the fresh one alone, record + // the counter once per stale entry, and emit a warning per + // stale entry with the dwell + threshold. + await buffer.accept({ + runId: "run_stale_a", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_stale_b", + envId: "env_b", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_fresh", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + // Yank the system clock forward 5 minutes for the sweep — way + // past the threshold below. The `now` deps seam lets us drive + // the threshold without actually waiting in real time. + const futureNow = Date.now() + 5 * 60 * 1000; + + const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { + ...deps, + getBuffer: () => buffer, + now: () => futureNow, + }, + ); + + expect(result.envsScanned).toBe(2); + expect(result.entriesScanned).toBe(3); + expect(result.staleCount).toBe(3); + // All three entries have dwell ~5min, all exceed the 1-min + // threshold; each emits one counter tick + one warning. + expect(recordedStaleEnvIds.sort()).toEqual( + ["env_a", "env_a", "env_b"].sort(), + ); + expect(warnings).toHaveLength(3); + for (const w of warnings) { + expect(w.message).toBe("mollifier.stale_entry"); + expect(w.fields.staleThresholdMs).toBe(60 * 1000); + expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000); + } + // Snapshot drives the alertable gauge — env_a has 2 stale + // entries, env_b has 1. Both must appear so a future alert can + // identify which env is paging. + expect(snapshots).toHaveLength(1); + expect(Object.fromEntries(snapshots[0])).toEqual({ + env_a: 2, + env_b: 1, + }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "snapshot reports zero for envs that have entries but none stale (clears latched alerts)", + async ({ redisOptions }) => { + // Critical for alert behaviour: a previous sweep reported env_a + // stale, alert fired, drainer caught up. The next sweep must + // report `env_a -> 0` so the gauge drops below the alert + // threshold instead of staying latched at the last stale value. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_just_arrived", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const { deps, snapshots } = spyDeps(); + await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...deps, getBuffer: () => buffer }, + ); + expect(snapshots).toHaveLength(1); + expect(Object.fromEntries(snapshots[0])).toEqual({ env_a: 0 }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "leaves fresh entries alone (dwell below threshold)", + async ({ redisOptions }) => { + // Regression guard for the inequality direction. A bug that flipped + // `dwellMs > threshold` to `dwellMs >= threshold` would flag every + // entry the first time the sweep runs after a perfectly synchronised + // accept call — the dashboard would page on every burst. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_fresh_only", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const { deps, recordedStaleEnvIds, warnings } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...deps, getBuffer: () => buffer }, + ); + expect(result.staleCount).toBe(0); + expect(recordedStaleEnvIds).toEqual([]); + expect(warnings).toEqual([]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "scans across multiple orgs", + async ({ redisOptions }) => { + // Phase-3 design has org-level fairness in the drainer; the sweep + // must walk every org/env, not just the first one it finds. If a + // future refactor collapsed listOrgs/listEnvsForOrg into a single + // env-flat list this test catches a regression there. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_x", + envId: "env_x", + orgId: "org_x", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_y", + envId: "env_y", + orgId: "org_y", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const { deps } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...deps, getBuffer: () => buffer, now: () => futureNow }, + ); + expect(result.orgsScanned).toBe(2); + expect(result.envsScanned).toBe(2); + expect(result.staleCount).toBe(2); + } finally { + await buffer.close(); + } + }, + ); +}); diff --git a/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts b/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts new file mode 100644 index 00000000000..4a773caa10f --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts @@ -0,0 +1,162 @@ +import { describe, expect, vi } from "vitest"; +import { redisTest } from "@internal/testcontainers"; +import { MollifierBuffer } from "@trigger.dev/redis-worker"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server"; + +const SNAPSHOT = { + spanId: "span_1", + environment: { + slug: "dev", + project: { slug: "hello-world-bN7m" }, + organization: { slug: "references-6120" }, + }, +}; + +function fakePrisma(member: { id: string } | null) { + return { + orgMember: { findFirst: vi.fn(async () => member) }, + } as unknown as Parameters[1]["prismaClient"]; +} + +describe("findBufferedRunRedirectInfo (testcontainers)", () => { + redisTest("returns slugs + spanId for a real buffer entry when user is a member", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_1", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toEqual({ + organizationSlug: "references-6120", + projectSlug: "hello-world-bN7m", + environmentSlug: "dev", + spanId: "span_1", + }); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when no buffer entry exists for the runId", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_missing", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when the user is not an org member (default check enforced)", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_2", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_2", userId: "user_other" }, + { getBuffer: () => buffer, prismaClient: fakePrisma(null) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("skips the org-membership check when skipOrgMembershipCheck is set (admin path)", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_3", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const findFirst = vi.fn(); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_3", userId: "user_admin", skipOrgMembershipCheck: true }, + { + getBuffer: () => buffer, + prismaClient: { orgMember: { findFirst } } as unknown as Parameters[1]["prismaClient"], + }, + ); + expect(info?.organizationSlug).toBe("references-6120"); + expect(findFirst).not.toHaveBeenCalled(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when snapshot is malformed JSON", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_4", + envId: "env_a", + orgId: "org_1", + payload: "{not-json", + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_4", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when snapshot lacks org/project slugs", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_5", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ spanId: "s", environment: { slug: "dev" } }), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_5", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns info with undefined spanId when snapshot has no spanId", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_6", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ environment: SNAPSHOT.environment }), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_6", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info?.spanId).toBeUndefined(); + expect(info?.environmentSlug).toBe("dev"); + } finally { + await buffer.close(); + } + }); +}); diff --git a/apps/webapp/test/mollifierSyntheticSpanRun.test.ts b/apps/webapp/test/mollifierSyntheticSpanRun.test.ts new file mode 100644 index 00000000000..68c3c4cfc48 --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticSpanRun.test.ts @@ -0,0 +1,158 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { buildSyntheticSpanRun } from "~/v3/mollifier/syntheticSpanRun.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-21T10:00:00Z"); + +function makeSyntheticRun(overrides: Partial = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + status: "QUEUED", + taskIdentifier: "hello-world", + createdAt: NOW, + payload: { message: "hi" }, + payloadType: "application/json", + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: "10m", + tags: ["a", "b"], + runTags: ["a", "b"], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: undefined, + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: "worker-queue-1", + queue: "task/hello-world", + concurrencyKey: undefined, + machinePreset: "small-1x", + realtimeStreamsVersion: "v1", + maxAttempts: 3, + maxDurationInSeconds: 3600, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + scheduleId: undefined, + batchId: undefined, + parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +const ENV = { + id: "env_a", + slug: "dev", + type: "DEVELOPMENT" as const, +}; + +describe("buildSyntheticSpanRun", () => { + it("populates the core identity fields from the snapshot", async () => { + const synth = await buildSyntheticSpanRun({ run: makeSyntheticRun(), environment: ENV }); + expect(synth.id).toBe("run_internal_1"); + expect(synth.friendlyId).toBe("run_friendly_1"); + expect(synth.taskIdentifier).toBe("hello-world"); + expect(synth.traceId).toBe("trace_1"); + expect(synth.spanId).toBe("span_1"); + expect(synth.environmentId).toBe("env_a"); + expect(synth.engine).toBe("V2"); + expect(synth.workerQueue).toBe("worker-queue-1"); + }); + + it("reports PENDING status and the non-final flags", async () => { + const synth = await buildSyntheticSpanRun({ run: makeSyntheticRun(), environment: ENV }); + expect(synth.status).toBe("PENDING"); + expect(synth.isFinished).toBe(false); + expect(synth.isRunning).toBe(false); + expect(synth.isError).toBe(false); + expect(synth.startedAt).toBeNull(); + expect(synth.completedAt).toBeNull(); + }); + + it("pretty-prints the JSON payload from the snapshot", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ payload: { message: "hi" }, payloadType: "application/json" }), + environment: ENV, + }); + // prettyPrintPacket round-trips JSON with 2-space indent. + expect(synth.payload).toContain('"message": "hi"'); + expect(synth.payloadType).toBe("application/json"); + }); + + it("forwards runTags onto `tags` exactly", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ runTags: ["alpha", "beta"] }), + environment: ENV, + }); + expect(synth.tags).toEqual(["alpha", "beta"]); + }); + + it("classifies the queue name as custom when it does not start with 'task/'", async () => { + const taskQueue = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ queue: "task/hello-world" }), + environment: ENV, + }); + expect(taskQueue.queue.isCustomQueue).toBe(false); + + const customQueue = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ queue: "my-custom" }), + environment: ENV, + }); + expect(customQueue.queue.isCustomQueue).toBe(true); + }); + + it("derives idempotency status from the snapshot key/options", async () => { + const withKey = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ idempotencyKey: "abc", idempotencyKeyOptions: ["scope"] }), + environment: ENV, + }); + expect(withKey.idempotencyKey).toBe("abc"); + expect(withKey.idempotencyKeyStatus).toBe("active"); + + const noKey = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ idempotencyKey: undefined, idempotencyKeyOptions: undefined }), + environment: ENV, + }); + expect(noKey.idempotencyKeyStatus).toBeUndefined(); + }); + + it("fills relationship metadata from parent/root snapshot fields when present", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ + parentTaskRunFriendlyId: "run_parent", + rootTaskRunFriendlyId: "run_root", + }), + environment: ENV, + }); + expect(synth.relationships.parent?.friendlyId).toBe("run_parent"); + expect(synth.relationships.root?.friendlyId).toBe("run_root"); + expect(synth.relationships.root?.isParent).toBe(false); + }); + + it("returns no relationship objects when the snapshot has no parent/root", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun(), + environment: ENV, + }); + expect(synth.relationships.parent).toBeUndefined(); + expect(synth.relationships.root).toBeUndefined(); + }); + + it("flags the synthetic run as 'not cached' since cache lookup did not match it", async () => { + const synth = await buildSyntheticSpanRun({ run: makeSyntheticRun(), environment: ENV }); + expect(synth.isCached).toBe(false); + }); +}); diff --git a/apps/webapp/test/mollifierTripEvaluator.test.ts b/apps/webapp/test/mollifierTripEvaluator.test.ts index b9a9bf8c94a..14ac0cc55bc 100644 --- a/apps/webapp/test/mollifierTripEvaluator.test.ts +++ b/apps/webapp/test/mollifierTripEvaluator.test.ts @@ -14,7 +14,7 @@ describe("createRealTripEvaluator", () => { redisTest( "returns divert=false when the sliding window stays under threshold", async ({ redisOptions }) => { - const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 }); + const buffer = new MollifierBuffer({ redisOptions }); try { const evaluator = createRealTripEvaluator({ getBuffer: () => buffer, @@ -32,7 +32,7 @@ describe("createRealTripEvaluator", () => { redisTest( "returns divert=true with reason per_env_rate once the window trips", async ({ redisOptions }) => { - const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 }); + const buffer = new MollifierBuffer({ redisOptions }); try { // threshold=2 → the 3rd call within windowMs is the first that trips. const options = { windowMs: 5000, threshold: 2, holdMs: 5000 } as const; @@ -73,7 +73,7 @@ describe("createRealTripEvaluator", () => { redisTest( "returns divert=false when buffer throws (fail-open)", async ({ redisOptions }) => { - const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 }); + const buffer = new MollifierBuffer({ redisOptions }); // Closing the client up front means evaluateTrip will throw on the first // Redis command — a real failure mode, not a stub. await buffer.close(); diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index da42247111a..e461fddf6c5 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -450,6 +450,162 @@ export class RunEngine { //MARK: - Run functions + /** + * Writes a TaskRun row in CANCELED state directly, bypassing the trigger + * pipeline. Used by the mollifier drainer when a cancel API call lands on + * a buffered run before it materialises (Q4 mollifier-cancel design). + * + * Skips: queue insertion (no execution), waitpoint creation (single- + * triggerAndWait can't enter the buffer; F4 bypass), concurrency + * reservation. Emits `runCancelled` so the existing TaskEvent handler + * writes the cancellation event row — the only side effect PG-side cancel + * has today per audit. + * + * Idempotent: if a row with the same friendlyId already exists (double + * drainer pop after requeue), Prisma's P2002 unique-constraint violation + * is caught and the existing row is returned. The duplicate runCancelled + * emission is skipped — the original drain's emit already wrote the + * TaskEvent. + */ + async createCancelledRun( + { + snapshot, + cancelledAt, + cancelReason, + }: { + snapshot: TriggerParams; + cancelledAt: Date; + cancelReason: string; + }, + tx?: PrismaClientOrTransaction, + ): Promise { + const prisma = tx ?? this.prisma; + return startSpan(this.tracer, "createCancelledRun", async (span) => { + span.setAttribute("friendlyId", snapshot.friendlyId); + span.setAttribute("taskIdentifier", snapshot.taskIdentifier); + const id = RunId.fromFriendlyId(snapshot.friendlyId); + const error: TaskRunError = { type: "STRING_ERROR", raw: cancelReason }; + + try { + const taskRun = await prisma.taskRun.create({ + data: { + id, + engine: "V2", + status: "CANCELED", + friendlyId: snapshot.friendlyId, + runtimeEnvironmentId: snapshot.environment.id, + environmentType: snapshot.environment.type, + organizationId: snapshot.environment.organization.id, + projectId: snapshot.environment.project.id, + idempotencyKey: snapshot.idempotencyKey, + idempotencyKeyExpiresAt: snapshot.idempotencyKeyExpiresAt, + idempotencyKeyOptions: snapshot.idempotencyKeyOptions, + taskIdentifier: snapshot.taskIdentifier, + payload: snapshot.payload, + payloadType: snapshot.payloadType, + context: snapshot.context, + traceContext: snapshot.traceContext, + traceId: snapshot.traceId, + spanId: snapshot.spanId, + parentSpanId: snapshot.parentSpanId, + lockedToVersionId: snapshot.lockedToVersionId, + taskVersion: snapshot.taskVersion, + sdkVersion: snapshot.sdkVersion, + cliVersion: snapshot.cliVersion, + concurrencyKey: snapshot.concurrencyKey, + queue: snapshot.queue, + lockedQueueId: snapshot.lockedQueueId, + workerQueue: snapshot.workerQueue, + isTest: snapshot.isTest, + taskEventStore: snapshot.taskEventStore, + // Defensive: the snapshot comes from a cjson-encoded buffer + // payload, where empty Lua tables encode as `{}` not `[]`. If + // the drainer pops a buffered run with no tags, snapshot.tags + // will be an empty object, which Prisma misreads as a relation + // update op. Normalise to a real array (or undefined for the + // empty case). + runTags: Array.isArray(snapshot.tags) && snapshot.tags.length > 0 + ? snapshot.tags + : undefined, + oneTimeUseToken: snapshot.oneTimeUseToken, + parentTaskRunId: snapshot.parentTaskRunId, + rootTaskRunId: snapshot.rootTaskRunId, + replayedFromTaskRunFriendlyId: snapshot.replayedFromTaskRunFriendlyId, + batchId: snapshot.batch?.id, + resumeParentOnCompletion: snapshot.resumeParentOnCompletion, + depth: snapshot.depth, + seedMetadata: snapshot.seedMetadata, + seedMetadataType: snapshot.seedMetadataType, + metadata: snapshot.metadata, + metadataType: snapshot.metadataType, + machinePreset: snapshot.machine, + scheduleId: snapshot.scheduleId, + scheduleInstanceId: snapshot.scheduleInstanceId, + createdAt: snapshot.createdAt, + bulkActionGroupIds: snapshot.bulkActionId ? [snapshot.bulkActionId] : undefined, + planType: snapshot.planType, + realtimeStreamsVersion: snapshot.realtimeStreamsVersion, + streamBasinName: snapshot.streamBasinName, + annotations: snapshot.annotations, + completedAt: cancelledAt, + updatedAt: cancelledAt, + error: error as unknown as Prisma.InputJsonValue, + attemptNumber: 0, + executionSnapshots: { + create: { + engine: "V2", + executionStatus: "FINISHED", + description: "Run cancelled before materialisation", + runStatus: "CANCELED", + environmentId: snapshot.environment.id, + environmentType: snapshot.environment.type, + projectId: snapshot.environment.project.id, + organizationId: snapshot.environment.organization.id, + }, + }, + }, + }); + + this.eventBus.emit("runCancelled", { + time: cancelledAt, + run: { + id: taskRun.id, + status: taskRun.status, + friendlyId: taskRun.friendlyId, + spanId: taskRun.spanId, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + error, + updatedAt: taskRun.updatedAt, + attemptNumber: taskRun.attemptNumber ?? 0, + }, + organization: { id: snapshot.environment.organization.id }, + project: { id: snapshot.environment.project.id }, + environment: { id: snapshot.environment.id }, + }); + + return taskRun; + } catch (err) { + // P2002 = unique constraint violation. Double-pop after a drainer + // requeue can reach this. Idempotent: return the existing row + // without re-emitting. + if ( + err instanceof Prisma.PrismaClientKnownRequestError && + err.code === "P2002" + ) { + this.logger.info( + "createCancelledRun: row already exists, returning existing (idempotent)", + { friendlyId: snapshot.friendlyId }, + ); + const existing = await prisma.taskRun.findFirst({ where: { id } }); + if (existing) return existing; + } + throw err; + } + }); + } + /** "Triggers" one run. */ async trigger( { @@ -983,6 +1139,44 @@ export class RunEngine { }); } + // Emit `runFailed` so the alert pipeline picks up the + // SYSTEM_FAILURE row and the event-store handler writes the + // completion event into the trace. Without this the mollifier + // drainer's terminal failures (and batch-trigger's + // exceed-limit failures) land in PG silently — visible in the + // dashboard list but never reaching customers' configured + // ERROR alert channels. + this.eventBus.emit("runFailed", { + time: taskRun.completedAt ?? new Date(), + run: { + id: taskRun.id, + status: taskRun.status, + spanId: taskRun.spanId, + error, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + updatedAt: taskRun.updatedAt, + // This row never attempted execution — it's a synthesised + // terminal failure. The alert payload's `attemptNumber=0` + // is the signal downstream consumers can use to + // distinguish a never-ran failure from a run that + // exhausted its retries. + attemptNumber: 0, + usageDurationMs: 0, + costInCents: 0, + }, + organization: { + id: environment.organization.id, + }, + project: { + id: environment.project.id, + }, + environment: { + id: environment.id, + }, + }); + return taskRun; }, { diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts new file mode 100644 index 00000000000..0a541b5349e --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -0,0 +1,233 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; + +function freshRunId() { + return RunId.generate().friendlyId; +} +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import type { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function baseEngineOptions(redisOptions: Parameters[0]["queue"]["redis"]) { + return { + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +// Phase C1 / Q4 design — engine.createCancelledRun writes a CANCELED +// TaskRun row directly from a buffer snapshot. Verifies the bypass- +// queue / bypass-waitpoint / emit-runCancelled contract. +describe("RunEngine.createCancelledRun", () => { + containerTest( + "writes CANCELED PG row with snapshot fields, completedAt, error", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + const cancelledAt = new Date("2026-05-20T12:00:00.000Z"); + const cancelReason = "Canceled by user"; + + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: '{"hello":"world"}', + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + queue: "task/test-task", + isTest: false, + tags: ["test-tag"], + }, + cancelledAt, + cancelReason, + }); + + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + expect(result.id).toBe(RunId.fromFriendlyId(friendlyId)); + expect(result.completedAt?.toISOString()).toBe(cancelledAt.toISOString()); + expect(result.taskIdentifier).toBe("test-task"); + expect(result.runTags).toEqual(["test-tag"]); + expect(result.payload).toBe('{"hello":"world"}'); + const err = result.error as { type?: string; raw?: string }; + expect(err.type).toBe("STRING_ERROR"); + expect(err.raw).toBe(cancelReason); + + // Verify the PG row is canonical (findFirst returns the row). + const stored = await prisma.taskRun.findFirst({ + where: { friendlyId }, + }); + expect(stored).not.toBeNull(); + expect(stored!.status).toBe("CANCELED"); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "emits runCancelled with correct payload", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const cancelledAt = new Date(); + const cancelReason = "Test cancel"; + const friendlyId = freshRunId(); + await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000cccc000000000000", + spanId: "dddd000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt, + cancelReason, + }); + + expect(captured).toHaveLength(1); + expect(captured[0]!.run.status).toBe("CANCELED"); + expect(captured[0]!.run.friendlyId).toBe(friendlyId); + expect(captured[0]!.run.error).toEqual({ type: "STRING_ERROR", raw: cancelReason }); + expect(captured[0]!.organization.id).toBe(env.organization.id); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "idempotent on double-pop: second call returns existing row without re-emitting", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const snapshot = { + friendlyId: freshRunId(), + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000eeee000000000000", + spanId: "ffff000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }; + const cancelledAt = new Date(); + const cancelReason = "Test idempotent"; + + const first = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + const second = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + + expect(second.id).toBe(first.id); + // Only the first call's emit fired; the P2002 path skips re-emission. + expect(captured).toHaveLength(1); + } finally { + await engine.quit(); + } + }, + ); + + // Regression: cjson encodes empty Lua tables as `{}`, not `[]`. When + // the drainer pops a buffered run that never had a tag set, the + // deserialised snapshot's `tags` field is an empty object. The old + // implementation passed it straight into Prisma's `runTags:` field; + // Prisma misread the object as a relation update op and threw + // `Argument 'set' is missing`. The drainer caught the error and + // marked the buffer entry FAILED — so the CANCELED PG row never + // landed. Found while running the Phase F challenge suite. + containerTest( + "tolerates snapshot.tags being an empty object (cjson edge case)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + // Cast through unknown to simulate the cjson-decode output shape + // for an empty Lua table — TypeScript's snapshot type says + // string[], but the buffer Lua delivers {} for the empty case. + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000abcd000000000000", + spanId: "1234000000000000", + queue: "task/test-task", + isTest: false, + tags: {} as unknown as string[], + }, + cancelledAt: new Date(), + cancelReason: "Cancelled — empty tags", + }); + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + // Prisma normalises the absent-tags case to either [] or null + // depending on the column default; assert it's an empty array. + expect(result.runTags).toEqual([]); + } finally { + await engine.quit(); + } + }, + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts new file mode 100644 index 00000000000..0619eeffc2f --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts @@ -0,0 +1,111 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunEngine.createFailedTaskRun", () => { + containerTest("emits runFailed so the alert pipeline wakes up", async ({ prisma, redisOptions }) => { + // The mollifier drainer (and batch-trigger over-limit path) call + // createFailedTaskRun to write a terminal SYSTEM_FAILURE PG row + // for runs that never actually executed. Without an explicit + // runFailed emit, the row lands silently — the + // runEngineHandlers' `runFailed` listener (which enqueues + // PerformTaskRunAlertsService) never fires, so customers' + // configured TASK_RUN alert channels miss the failure entirely. + // + // Regression intent: if the emit is removed or moved out of + // createFailedTaskRun's success path, this test fails. The + // shape assertions pin the fields the alert delivery service + // reads from the event payload (run.id, run.status, error, + // attemptNumber=0 as the never-ran-marker). + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const failedEvents: EventBusEventArgs<"runFailed">[0][] = []; + engine.eventBus.on("runFailed", (event) => { + failedEvents.push(event); + }); + + const friendlyId = generateFriendlyId("run"); + const taskIdentifier = "drainer-terminal-test"; + + const failed = await engine.createFailedTaskRun({ + friendlyId, + environment: { + id: authenticatedEnvironment.id, + type: authenticatedEnvironment.type, + project: { id: authenticatedEnvironment.project.id }, + organization: { id: authenticatedEnvironment.organization.id }, + }, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + error: { + type: "STRING_ERROR", + raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic", + }, + traceId: "0123456789abcdef0123456789abcdef", + spanId: "fedcba9876543210", + }); + + expect(failed.status).toBe("SYSTEM_FAILURE"); + + expect(failedEvents).toHaveLength(1); + const event = failedEvents[0]; + expect(event.run.id).toBe(failed.id); + expect(event.run.status).toBe("SYSTEM_FAILURE"); + expect(event.run.spanId).toBe("fedcba9876543210"); + // attemptNumber=0 is the marker that the run never executed — + // it's a synthesised terminal failure, not an exhausted-retries + // failure. Downstream consumers can use this to distinguish. + expect(event.run.attemptNumber).toBe(0); + expect(event.run.usageDurationMs).toBe(0); + expect(event.run.costInCents).toBe(0); + expect(event.run.error).toEqual({ + type: "STRING_ERROR", + raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic", + }); + expect(event.organization.id).toBe(authenticatedEnvironment.organization.id); + expect(event.project.id).toBe(authenticatedEnvironment.project.id); + expect(event.environment.id).toBe(authenticatedEnvironment.id); + } finally { + await engine.quit(); + } + }); +}); diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index e86e503de47..cc10f69286c 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -236,6 +236,13 @@ export type TriggerTaskRequestBody = z.infer; export const TriggerTaskResponse = z.object({ id: z.string(), isCached: z.boolean().optional(), + notice: z + .object({ + code: z.string(), + message: z.string(), + docs: z.string().url(), + }) + .optional(), }); export type TriggerTaskResponse = z.infer; diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index c8f7b95c97a..a4c1be35eb3 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -20,12 +20,14 @@ describe("schemas", () => { status: "QUEUED", attempts: "0", createdAt: "2026-05-11T10:00:00.000Z", + createdAtMicros: "1747044000000000", }; const parsed = BufferEntrySchema.parse(raw); expect(parsed.runId).toBe("run_abc"); expect(parsed.status).toBe("QUEUED"); expect(parsed.attempts).toBe(0); expect(parsed.createdAt).toBeInstanceOf(Date); + expect(parsed.createdAtMicros).toBe(1747044000000000); }); it("BufferEntrySchema parses a FAILED entry with lastError", () => { @@ -37,6 +39,7 @@ describe("schemas", () => { status: "FAILED", attempts: "3", createdAt: "2026-05-11T10:00:00.000Z", + createdAtMicros: "1747044000000000", lastError: JSON.stringify({ code: "P2024", message: "connection lost" }), }; const parsed = BufferEntrySchema.parse(raw); @@ -52,7 +55,6 @@ describe("MollifierBuffer construction", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -68,7 +70,6 @@ describe("MollifierBuffer.accept", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -105,7 +106,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -132,7 +132,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -151,7 +150,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -169,24 +167,56 @@ describe("MollifierBuffer.pop", () => { }); describe("MollifierBuffer.ack", () => { - redisTest("ack deletes the entry", { timeout: 20_000 }, async ({ redisContainer }) => { + redisTest( + "ack marks entry materialised and applies the grace TTL — entry persists as a read-fallback safety net", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.pop("env_a"); + await buffer.ack("run_x"); + + const after = await buffer.getEntry("run_x"); + expect(after).not.toBeNull(); + expect(after!.materialised).toBe(true); + + // ack grace TTL is the only context where an entry hash gets + // an EXPIRE — accept no longer sets one. Should be at most 30s. + const ttl = await buffer.getEntryTtlSeconds("run_x"); + expect(ttl).toBeGreaterThan(0); + expect(ttl).toBeLessThanOrEqual(30); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("ack on missing entry is a no-op", { timeout: 20_000 }, async ({ redisContainer }) => { const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer.pop("env_a"); - await buffer.ack("run_x"); - - const after = await buffer.getEntry("run_x"); - expect(after).toBeNull(); + await buffer.ack("run_ghost"); + const stored = await buffer.getEntry("run_ghost"); + expect(stored).toBeNull(); + // Critical: no partial hash created. + const raw = await buffer["redis"].hgetall("mollifier:entries:run_ghost"); + expect(Object.keys(raw)).toHaveLength(0); } finally { await buffer.close(); } @@ -204,13 +234,12 @@ describe("MollifierBuffer.pop orphan handling", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { // Simulate a TTL-expired orphan: queue ref exists, entry hash does not. - await buffer["redis"].lpush("mollifier:queue:env_a", "run_orphan"); + await buffer["redis"].zadd("mollifier:queue:env_a", 1, "run_orphan"); const popped = await buffer.pop("env_a"); expect(popped).toBeNull(); @@ -220,7 +249,7 @@ describe("MollifierBuffer.pop orphan handling", () => { expect(Object.keys(raw)).toHaveLength(0); // Queue is drained — the loop pops orphans until empty. - const qLen = await buffer["redis"].llen("mollifier:queue:env_a"); + const qLen = await buffer["redis"].zcard("mollifier:queue:env_a"); expect(qLen).toBe(0); } finally { await buffer.close(); @@ -238,17 +267,16 @@ describe("MollifierBuffer.pop orphan handling", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - // Layout (oldest-first, since RPOP takes from tail): orphan, valid, orphan. - // LPUSH puts items at the head, so to get RPOP order [orphan_a, valid, orphan_b] - // we LPUSH in reverse: orphan_b first, then valid, then orphan_a. - await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b"); + // Layout by score (lowest-first, since ZPOPMIN takes the min): + // orphan_a (score 1) → valid (score = its createdAtMicros, large) → orphan_b (score 1e18). + // First pop skips orphan_a, returns valid; orphan_b remains. + await buffer["redis"].zadd("mollifier:queue:env_a", 1, "orphan_a"); await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_a"); + await buffer["redis"].zadd("mollifier:queue:env_a", 1e18, "orphan_b"); const popped = await buffer.pop("env_a"); expect(popped).not.toBeNull(); @@ -256,7 +284,7 @@ describe("MollifierBuffer.pop orphan handling", () => { expect(popped!.status).toBe("DRAINING"); // The trailing orphan_b is still in the queue (single pop call). - const remaining = await buffer["redis"].llen("mollifier:queue:env_a"); + const remaining = await buffer["redis"].zcard("mollifier:queue:env_a"); expect(remaining).toBe(1); // A second pop drains the trailing orphan_b. The queue is now @@ -283,7 +311,6 @@ describe("MollifierBuffer.requeue", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -305,30 +332,43 @@ describe("MollifierBuffer.requeue", () => { }); describe("MollifierBuffer.fail", () => { - redisTest("fail transitions to FAILED and stores lastError", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - entryTtlSeconds: 600, - logger: new Logger("test", "log"), - }); + redisTest( + "fail returns true and tears the entry down (drainer-terminal cleanup)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: the drainer's createFailedTaskRun has + // already written a SYSTEM_FAILURE PG row by the time we call + // fail(), so the entry hash is no longer load-bearing. fail + // returns true and removes the entry; without this teardown + // failed entries would accrete forever now that there's no + // accept-time TTL. The Lua also DELs the idempotency lookup so + // future retries with the same key go through to PG instead of + // hitting an orphan dedup record. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); - try { - await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer.pop("env_a"); - const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" }); - expect(failed).toBe(true); + try { + await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.pop("env_a"); + const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" }); + expect(failed).toBe(true); - const entry = await buffer.getEntry("run_f"); - expect(entry!.status).toBe("FAILED"); - expect(entry!.lastError).toEqual({ code: "VALIDATION", message: "boom" }); - } finally { - await buffer.close(); - } - }); + // Entry hash is gone post-fail. + const entry = await buffer.getEntry("run_f"); + expect(entry).toBeNull(); + const raw = await buffer["redis"].hgetall("mollifier:entries:run_f"); + expect(Object.keys(raw)).toHaveLength(0); + } finally { + await buffer.close(); + } + }, + ); redisTest( "fail on missing entry is a no-op (returns false; no partial hash created)", @@ -340,7 +380,6 @@ describe("MollifierBuffer.fail", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -361,27 +400,35 @@ describe("MollifierBuffer.fail", () => { }); describe("MollifierBuffer TTL", () => { - redisTest("entry has TTL applied on accept", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - entryTtlSeconds: 600, - logger: new Logger("test", "log"), - }); + redisTest( + "entry has NO TTL applied on accept — drainer is the only cleanup path", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Regression guard for the design change: buffer entries must + // persist until the drainer ACKs or FAILs them. An accept-time + // EXPIRE would re-introduce the silent-loss-when-drainer-offline + // failure mode that the stale-entry alerting pipeline depends on + // *not* happening. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); - try { - await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" }); + try { + await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" }); - const ttl = await buffer.getEntryTtlSeconds("run_t"); - expect(ttl).toBeGreaterThan(0); - expect(ttl).toBeLessThanOrEqual(600); - } finally { - await buffer.close(); - } - }); + // Redis returns -1 when the key exists but has no TTL set. + const ttl = await buffer.getEntryTtlSeconds("run_t"); + expect(ttl).toBe(-1); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer payload encoding", () => { @@ -395,7 +442,6 @@ describe("MollifierBuffer payload encoding", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -437,7 +483,6 @@ describe("MollifierBuffer.requeue on missing entry", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -458,22 +503,27 @@ describe("MollifierBuffer.requeue on missing entry", () => { describe("MollifierBuffer.requeue ordering", () => { redisTest( - "requeued entry is popped AFTER other queued entries on the same env (FIFO retry)", + "requeued entry retains its original createdAt and pops next (oldest-first by createdAt)", { timeout: 20_000 }, async ({ redisContainer }) => { + // Score == createdAtMicros; requeue does not bump the score. The + // oldest entry continues to pop first across retries. `maxAttempts` + // in the drainer bounds the retry loop for a persistently failing + // entry (after which it goes to the `fail` path, not requeue). const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { await buffer.accept({ runId: "a", envId: "env_a", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 2)); await buffer.accept({ runId: "b", envId: "env_a", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 2)); await buffer.accept({ runId: "c", envId: "env_a", orgId: "org_1", payload: "{}" }); const first = await buffer.pop("env_a"); @@ -481,12 +531,13 @@ describe("MollifierBuffer.requeue ordering", () => { await buffer.requeue("a"); + // a still has the smallest createdAtMicros → pops next. const next = await buffer.pop("env_a"); - expect(next!.runId).toBe("b"); + expect(next!.runId).toBe("a"); const after = await buffer.pop("env_a"); - expect(after!.runId).toBe("c"); + expect(after!.runId).toBe("b"); const last = await buffer.pop("env_a"); - expect(last!.runId).toBe("a"); + expect(last!.runId).toBe("c"); } finally { await buffer.close(); } @@ -508,7 +559,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -530,7 +580,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -557,7 +606,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -585,7 +633,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -610,7 +657,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -638,7 +684,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -671,7 +716,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -707,22 +751,21 @@ describe("MollifierBuffer entry lifecycle invariants", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { await buffer.accept({ runId: "run_ttl", envId: "env_a", orgId: "org_1", payload: "{}" }); const beforeTtl = await buffer.getEntryTtlSeconds("run_ttl"); - expect(beforeTtl).toBeGreaterThan(0); + expect(beforeTtl).toBe(-1); await buffer.pop("env_a"); const afterTtl = await buffer.getEntryTtlSeconds("run_ttl"); - // TTL must still be present (>0). Redis returns -1 if the key has no - // TTL — that's the leak shape we're guarding against. - expect(afterTtl).toBeGreaterThan(0); - expect(afterTtl).toBeLessThanOrEqual(beforeTtl); + // No TTL applied at any point during accept/pop — the entry + // persists until the drainer ACKs or FAILs. Returning -1 from + // Redis here is the expected steady state, not a leak. + expect(afterTtl).toBe(-1); } finally { await buffer.close(); } @@ -739,7 +782,6 @@ describe("MollifierBuffer entry lifecycle invariants", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -795,7 +837,6 @@ describe("MollifierBuffer.accept idempotency", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -813,8 +854,8 @@ describe("MollifierBuffer.accept idempotency", () => { payload: serialiseSnapshot({ first: false }), }); - expect(first).toBe(true); - expect(second).toBe(false); + expect(first).toEqual({ kind: "accepted" }); + expect(second).toEqual({ kind: "duplicate_run_id" }); // First payload preserved; second was a no-op. const stored = await buffer.getEntry("run_dup"); @@ -844,7 +885,6 @@ describe("MollifierBuffer.accept idempotency", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -855,7 +895,7 @@ describe("MollifierBuffer.accept idempotency", () => { expect(stored!.status).toBe("DRAINING"); const dup = await buffer.accept({ runId: "run_dr", envId: "env_a", orgId: "org_1", payload: "{}" }); - expect(dup).toBe(false); + expect(dup).toEqual({ kind: "duplicate_run_id" }); const afterDup = await buffer.getEntry("run_dr"); expect(afterDup!.status).toBe("DRAINING"); // unchanged @@ -866,16 +906,21 @@ describe("MollifierBuffer.accept idempotency", () => { ); redisTest( - "accept refused while existing entry is FAILED", + "runId slot is reclaimable after fail tears the entry down", { timeout: 20_000 }, async ({ redisContainer }) => { + // Post-TTL-drop design: fail() deletes the entry hash because + // the SYSTEM_FAILURE PG row is the canonical record of the + // failure. The runId slot is therefore free for a fresh accept + // afterwards — runIds are server-generated CUIDs and don't + // collide in practice, but the contract pinning here documents + // that a re-acceptance does NOT see a phantom "FAILED" entry. const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -883,15 +928,20 @@ describe("MollifierBuffer.accept idempotency", () => { await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" }); await buffer.pop("env_a"); await buffer.fail("run_fl", { code: "VALIDATION", message: "boom" }); - const stored = await buffer.getEntry("run_fl"); - expect(stored!.status).toBe("FAILED"); - const dup = await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" }); - expect(dup).toBe(false); + // Entry hash gone after fail (see "fail returns true and tears + // the entry down" — this test pins the accept-side effect). + expect(await buffer.getEntry("run_fl")).toBeNull(); - const afterDup = await buffer.getEntry("run_fl"); - expect(afterDup!.status).toBe("FAILED"); // unchanged - expect(afterDup!.lastError).toEqual({ code: "VALIDATION", message: "boom" }); + const fresh = await buffer.accept({ + runId: "run_fl", + envId: "env_a", + orgId: "org_1", + payload: '{"fresh":true}', + }); + expect(fresh).toEqual({ kind: "accepted" }); + const after = await buffer.getEntry("run_fl"); + expect(after?.status).toBe("QUEUED"); } finally { await buffer.close(); } @@ -899,16 +949,21 @@ describe("MollifierBuffer.accept idempotency", () => { ); redisTest( - "re-accept after ack works (terminal entry can be re-accepted)", + "accept refused while a previously-acked (materialised) entry is still inside its grace TTL", { timeout: 20_000 }, async ({ redisContainer }) => { + // After ack, the entry hash persists for the grace window as a + // read-fallback safety net (Q1 D2). RunIds are server-generated and + // never collide in practice, but defense-in-depth: accept refuses + // while *any* entry exists for the runId, including materialised + // ones. The entry hash's TTL is now ~30s instead of the original + // entryTtlSeconds. const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -922,7 +977,6 @@ describe("MollifierBuffer.accept idempotency", () => { await buffer.pop("env_a"); await buffer.ack("run_x"); - // Entry is gone — re-accept should succeed. const reAccept = await buffer.accept({ runId: "run_x", envId: "env_a", @@ -930,8 +984,11 @@ describe("MollifierBuffer.accept idempotency", () => { payload: "{}", }); - expect(first).toBe(true); - expect(reAccept).toBe(true); + expect(first).toEqual({ kind: "accepted" }); + expect(reAccept).toEqual({ kind: "duplicate_run_id" }); + + const stored = await buffer.getEntry("run_x"); + expect(stored!.materialised).toBe(true); } finally { await buffer.close(); } @@ -950,7 +1007,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -976,7 +1032,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -1006,7 +1061,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -1025,3 +1079,952 @@ describe("MollifierBuffer envs set lifecycle", () => { }, ); }); + +describe("MollifierBuffer idempotency lookup", () => { + redisTest( + "accept with idempotencyKey + taskIdentifier writes the lookup with no TTL", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: the idempotency lookup has no TTL, so it + // can never expire ahead of the entry hash (which used to cause + // a dedup-drift bug — once the lookup expired but the entry + // didn't, a retry with the same key would create a *new* + // buffered run for the same key). The drainer's ack and fail + // both DEL the lookup as part of teardown. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.accept({ + runId: "ri1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-1", + taskIdentifier: "my-task", + }); + expect(result).toEqual({ kind: "accepted" }); + + const lookupKey = "mollifier:idempotency:env_i:my-task:ikey-1"; + const stored = await buffer["redis"].get(lookupKey); + expect(stored).toBe("ri1"); + // -1 = key exists with no TTL set. + expect(await buffer["redis"].ttl(lookupKey)).toBe(-1); + + const entry = await buffer.getEntry("ri1"); + expect(entry!.idempotencyLookupKey).toBe(lookupKey); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "second accept with same (env, task, idempotencyKey) returns duplicate_idempotency with the winner's runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const first = await buffer.accept({ + runId: "ri-a", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-2", + taskIdentifier: "my-task", + }); + const second = await buffer.accept({ + runId: "ri-b", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-2", + taskIdentifier: "my-task", + }); + + expect(first).toEqual({ kind: "accepted" }); + expect(second).toEqual({ + kind: "duplicate_idempotency", + existingRunId: "ri-a", + }); + + // The loser's runId entry was never created. + const loserEntry = await buffer.getEntry("ri-b"); + expect(loserEntry).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency hits when the run is buffered", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rl1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "k1", + taskIdentifier: "t", + }); + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "k1", + }); + expect(found).toBe("rl1"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency returns null when no lookup is bound", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "absent", + }); + expect(found).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency self-heals when the lookup points at an expired entry", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Plant a stale lookup pointing at a non-existent entry. + const lookupKey = "mollifier:idempotency:env_i:t:stale"; + await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600); + expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale"); + + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "stale", + }); + expect(found).toBeNull(); + // Self-healed. + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "ack DELs the idempotency lookup along with marking materialised", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "ra1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ka", + taskIdentifier: "t", + }); + await buffer.pop("env_i"); + await buffer.ack("ra1"); + + const lookupKey = "mollifier:idempotency:env_i:t:ka"; + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + const entry = await buffer.getEntry("ra1"); + expect(entry!.materialised).toBe(true); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency clears snapshot fields + lookup; returns the runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rr1", + envId: "env_i", + orgId: "org_1", + payload: serialiseSnapshot({ + idempotencyKey: "kr", + idempotencyKeyExpiresAt: "2026-12-01T00:00:00Z", + other: "field", + }), + idempotencyKey: "kr", + taskIdentifier: "t", + }); + + const result = await buffer.resetIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "kr", + }); + expect(result.clearedRunId).toBe("rr1"); + + // Lookup is gone. + const lookupKey = "mollifier:idempotency:env_i:t:kr"; + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + + // Snapshot's idempotency fields are nulled, other fields kept. + const entry = await buffer.getEntry("rr1"); + const payload = JSON.parse(entry!.payload) as { + idempotencyKey: unknown; + idempotencyKeyExpiresAt: unknown; + other: string; + }; + expect(payload.idempotencyKey).toBeNull(); + expect(payload.idempotencyKeyExpiresAt).toBeNull(); + expect(payload.other).toBe("field"); + expect(entry!.idempotencyLookupKey).toBe(""); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency returns null when nothing is bound", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.resetIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "absent", + }); + expect(result.clearedRunId).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.casSetMetadata", () => { + redisTest( + "applies when expectedVersion matches; increments version; updates payload", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas1", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }), + }); + const result = await buffer.casSetMetadata({ + runId: "cas1", + expectedVersion: 0, + newMetadata: '{"v":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "applied", newVersion: 1 }); + + const entry = await buffer.getEntry("cas1"); + expect(entry!.metadataVersion).toBe(1); + const payload = JSON.parse(entry!.payload) as { metadata: string }; + expect(payload.metadata).toBe('{"v":2}'); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns version_conflict when expectedVersion is stale", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas2", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.casSetMetadata({ + runId: "cas2", + expectedVersion: 0, + newMetadata: '{"a":1}', + newMetadataType: "application/json", + }); + + // Second write with stale expectedVersion = 0 must conflict. + const result = await buffer.casSetMetadata({ + runId: "cas2", + expectedVersion: 0, + newMetadata: '{"a":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns not_found / busy on missing or terminal entries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const nf = await buffer.casSetMetadata({ + runId: "absent", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(nf).toEqual({ kind: "not_found" }); + + await buffer.accept({ + runId: "cas3", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.pop("env_c"); + const busy = await buffer.casSetMetadata({ + runId: "cas3", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(busy).toEqual({ kind: "busy" }); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.mutateSnapshot", () => { + redisTest( + "returns not_found when no entry exists for the runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.mutateSnapshot("nope", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("not_found"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags on QUEUED entry appends and dedupes", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r1", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: ["existing"] }), + }); + const first = await buffer.mutateSnapshot("r1", { + type: "append_tags", + tags: ["existing", "new"], + }); + expect(first).toBe("applied_to_snapshot"); + + const entry = await buffer.getEntry("r1"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags).toEqual(["existing", "new"]); + + // Second mutation appends without duplicating + const second = await buffer.mutateSnapshot("r1", { + type: "append_tags", + tags: ["new", "third"], + }); + expect(second).toBe("applied_to_snapshot"); + const e2 = await buffer.getEntry("r1"); + const p2 = JSON.parse(e2!.payload) as { tags: string[] }; + expect(p2.tags).toEqual(["existing", "new", "third"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags creates payload.tags when absent", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r2", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r2", { + type: "append_tags", + tags: ["a", "b"], + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r2"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags).toEqual(["a", "b"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "set_metadata replaces metadata + metadataType (last-write-wins)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r3", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }), + }); + const result = await buffer.mutateSnapshot("r3", { + type: "set_metadata", + metadata: '{"v":2}', + metadataType: "application/json", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r3"); + const payload = JSON.parse(entry!.payload) as { + metadata: string; + metadataType: string; + }; + expect(payload.metadata).toBe('{"v":2}'); + expect(payload.metadataType).toBe("application/json"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "set_delay sets payload.delayUntil", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r4", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r4", { + type: "set_delay", + delayUntil: "2026-06-01T00:00:00.000Z", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r4"); + const payload = JSON.parse(entry!.payload) as { delayUntil: string }; + expect(payload.delayUntil).toBe("2026-06-01T00:00:00.000Z"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "mark_cancelled stamps cancelledAt + cancelReason", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r5", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r5", { + type: "mark_cancelled", + cancelledAt: "2026-05-19T12:00:00.000Z", + cancelReason: "user-initiated", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r5"); + const payload = JSON.parse(entry!.payload) as { + cancelledAt: string; + cancelReason: string; + }; + expect(payload.cancelledAt).toBe("2026-05-19T12:00:00.000Z"); + expect(payload.cancelReason).toBe("user-initiated"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy when entry is DRAINING", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rd", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + const result = await buffer.mutateSnapshot("rd", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("busy"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns not_found when entry was FAILED (drainer-terminal teardown)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: fail() DELs the entry hash because the + // drainer has already written the canonical SYSTEM_FAILURE PG + // row, and without an accept-time TTL we'd otherwise accrete + // failed entries in Redis forever. Late mutations against a + // failed run therefore see `not_found`, matching the same shape + // they'd get for any other already-cleaned-up runId. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rf", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + await buffer.fail("rf", { code: "X", message: "boom" }); + const result = await buffer.mutateSnapshot("rf", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("not_found"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy when entry is materialised (post-ack grace window)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rm", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + await buffer.ack("rm"); + const result = await buffer.mutateSnapshot("rm", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("busy"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "Lua atomicity serialises concurrent mutations per-runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rcc", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + + const tagsToAdd = Array.from({ length: 50 }, (_, i) => `t${i}`); + await Promise.all( + tagsToAdd.map((t) => buffer.mutateSnapshot("rcc", { type: "append_tags", tags: [t] })), + ); + + const entry = await buffer.getEntry("rcc"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags.sort()).toEqual(tagsToAdd.sort()); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer ZSET storage", () => { + redisTest( + "queue key is a ZSET scored by entry's createdAtMicros", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" }); + + // ZSET-only commands must succeed against the queue key. + const card = await buffer["redis"].zcard("mollifier:queue:env_z"); + expect(card).toBe(1); + + const score = await buffer["redis"].zscore("mollifier:queue:env_z", "z1"); + expect(score).not.toBeNull(); + const scoreNum = Number(score); + expect(Number.isFinite(scoreNum)).toBe(true); + + // Score matches the entry hash's createdAtMicros field. + const micros = await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros"); + expect(micros).not.toBeNull(); + expect(Number(micros)).toBe(scoreNum); + + // Score is plausibly recent (within last minute as microseconds). + const nowMicros = Date.now() * 1000; + expect(scoreNum).toBeGreaterThan(nowMicros - 60_000_000); + expect(scoreNum).toBeLessThanOrEqual(nowMicros + 1_000_000); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "pop returns entries in ascending createdAtMicros order (FIFO by time, not by member)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + // Insert runIds in reverse-lex order to prove ordering is by score, not member. + await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 5)); + await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" }); + await new Promise((r) => setTimeout(r, 5)); + await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" }); + + const first = await buffer.pop("env_o"); + expect(first!.runId).toBe("zzz"); + const second = await buffer.pop("env_o"); + expect(second!.runId).toBe("mmm"); + const third = await buffer.pop("env_o"); + expect(third!.runId).toBe("aaa"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "requeue keeps original score; createdAt is immutable across retries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" }); + const originalScore = Number( + await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"), + ); + const originalMicros = Number( + await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"), + ); + + await buffer.pop("env_rq"); + await new Promise((r) => setTimeout(r, 5)); + await buffer.requeue("rq"); + + const newScore = Number( + await buffer["redis"].zscore("mollifier:queue:env_rq", "rq"), + ); + const newMicros = Number( + await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"), + ); + expect(newScore).toBe(originalScore); + expect(newMicros).toBe(originalMicros); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.listEntriesForEnv", () => { + redisTest( + "returns up to maxCount entries from the queue without consuming them", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "r1", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r2", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r3", envId: "env_a", orgId: "org_1", payload: "{}" }); + + const entries = await buffer.listEntriesForEnv("env_a", 2); + expect(entries).toHaveLength(2); + const runIds = entries.map((e) => e.runId); + expect(new Set(runIds).size).toBe(2); + for (const id of runIds) expect(["r1", "r2", "r3"]).toContain(id); + + // Non-destructive: the drainer can still pop all three. + const popped: string[] = []; + for (let i = 0; i < 3; i++) { + const entry = await buffer.pop("env_a"); + if (entry) popped.push(entry.runId); + } + expect(new Set(popped)).toEqual(new Set(["r1", "r2", "r3"])); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("returns empty array when env queue is empty", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + expect(await buffer.listEntriesForEnv("env_empty", 10)).toEqual([]); + } finally { + await buffer.close(); + } + }); + + redisTest("maxCount <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + expect(await buffer.listEntriesForEnv("env_a", 0)).toEqual([]); + expect(await buffer.listEntriesForEnv("env_a", -5)).toEqual([]); + } finally { + await buffer.close(); + } + }); +}); diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index f739e3ff362..fd53f59efea 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -10,17 +10,66 @@ import { BufferEntry, BufferEntrySchema } from "./schemas.js"; export type MollifierBufferOptions = { redisOptions: RedisOptions; - entryTtlSeconds: number; logger?: Logger; }; +// Grace TTL applied to the entry hash on drainer ack. The entry survives +// this long after materialisation so direct reads (retrieve, trace, etc.) +// have a safety net while PG replica lag settles. Q1 D2. +const ACK_GRACE_TTL_SECONDS = 30; + +export type SnapshotPatch = + | { type: "append_tags"; tags: string[] } + | { type: "set_metadata"; metadata: string; metadataType: string } + | { type: "set_delay"; delayUntil: string } + | { type: "mark_cancelled"; cancelledAt: string; cancelReason?: string }; + +export type MutateSnapshotResult = "applied_to_snapshot" | "not_found" | "busy"; + +export type CasSetMetadataResult = + | { kind: "applied"; newVersion: number } + | { kind: "version_conflict"; currentVersion: number } + | { kind: "not_found" } + | { kind: "busy" }; + +export type AcceptResult = + | { kind: "accepted" } + | { kind: "duplicate_run_id" } + | { kind: "duplicate_idempotency"; existingRunId: string }; + +export type IdempotencyLookupInput = { + envId: string; + taskIdentifier: string; + idempotencyKey: string; +}; + +function makeIdempotencyLookupKey(input: IdempotencyLookupInput): string { + return `mollifier:idempotency:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`; +} + +// Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the +// existing B6a buffer-side dedup stays isolated. The claim is the +// authoritative cross-store "this idempotency key is in flight or +// resolved" pointer used by the trigger hot path +// (`_plans/2026-05-21-mollifier-idempotency-claim.md`). Values: +// "pending" → a trigger pipeline owns the key and hasn't published yet +// → the winning trigger's runId (resolved) +export const IDEMPOTENCY_CLAIM_PENDING = "pending"; + +function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string { + return `mollifier:claim:${input.envId}:${input.taskIdentifier}:${input.idempotencyKey}`; +} + +export type IdempotencyClaimResult = + | { kind: "claimed" } + | { kind: "pending" } + | { kind: "resolved"; runId: string }; + export class MollifierBuffer { private readonly redis: Redis; - private readonly entryTtlSeconds: number; private readonly logger: Logger; constructor(options: MollifierBufferOptions) { - this.entryTtlSeconds = options.entryTtlSeconds; this.logger = options.logger ?? new Logger("MollifierBuffer", "debug"); this.redis = createRedisClient( @@ -41,19 +90,47 @@ export class MollifierBuffer { this.#registerCommands(); } - // Returns true if the entry was newly written; false if a duplicate runId - // was already buffered (idempotent no-op). Callers can use the boolean to - // record a duplicate-accept metric without affecting buffer state. + // Three outcomes: + // - { kind: "accepted" } — entry was newly written. + // - { kind: "duplicate_run_id" } — runId was already buffered (idempotent + // no-op, same semantic as the previous boolean-false return). + // - { kind: "duplicate_idempotency", existingRunId } — the (env, task, + // idempotencyKey) tuple was already bound to another buffered run. + // The Lua's atomic SETNX is the race-winner; the second caller gets + // the winner's runId so it can return that as the trigger response. async accept(input: { runId: string; envId: string; orgId: string; payload: string; - }): Promise { + // Optional idempotency-key triple. When all three are present we + // SETNX a Redis lookup at `mollifier:idempotency:{env}:{task}:{key}` + // pointing at the runId so trigger-time dedup during the buffered + // window resolves the same way PG's unique constraint resolves it + // post-materialisation (Q5). + idempotencyKey?: string; + taskIdentifier?: string; + }): Promise { const entryKey = `mollifier:entries:${input.runId}`; const queueKey = `mollifier:queue:${input.envId}`; const orgsKey = "mollifier:orgs"; - const createdAt = new Date().toISOString(); + const nowMs = Date.now(); + const createdAt = new Date(nowMs).toISOString(); + // Microsecond epoch. JS only has millisecond precision, so multiple + // accepts in the same ms share a score; ZSET ties resolve by member + // (runId) lex order, which is deterministic and acceptable for FIFO + // pop. The hash carries the same value as `createdAtMicros` so the + // listing helper (Phase E) can read a stable per-run timestamp + // without re-fetching the score. + const createdAtMicros = nowMs * 1000; + const idempotencyLookupKey = + input.idempotencyKey && input.taskIdentifier + ? makeIdempotencyLookupKey({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + }) + : ""; const result = await this.redis.acceptMollifierEntry( entryKey, queueKey, @@ -63,10 +140,17 @@ export class MollifierBuffer { input.orgId, input.payload, createdAt, - String(this.entryTtlSeconds), + String(createdAtMicros), "mollifier:org-envs:", + idempotencyLookupKey, ); - return result === 1; + // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId + // (duplicate idempotency — value is the existing winner's runId). + if (typeof result === "string" && result.length > 0) { + return { kind: "duplicate_idempotency", existingRunId: result }; + } + if (result === 1) return { kind: "accepted" }; + return { kind: "duplicate_run_id" }; } async pop(envId: string): Promise { @@ -128,8 +212,247 @@ export class MollifierBuffer { return this.redis.smembers(`mollifier:org-envs:${orgId}`); } + // Paginated read of currently-queued entries newest-first, bounded by + // an optional `(createdAtMicros, runId)` watermark. Q1 listing design. + // Returns hydrated `BufferEntry` rows up to `pageSize`. Skips orphans + // (queue ref without an entry hash) silently. Non-destructive — the + // drainer keeps popping these entries in createdAt order regardless. + async listForEnvWithWatermark(input: { + envId: string; + watermark?: { createdAtMicros: number; runId: string }; + pageSize: number; + }): Promise { + if (input.pageSize <= 0) return []; + const queueKey = `mollifier:queue:${input.envId}`; + + let runIds: string[]; + if (!input.watermark) { + // Page 1 — newest first. + runIds = await this.redis.zrevrangebyscore( + queueKey, + "+inf", + "-inf", + "LIMIT", + 0, + input.pageSize, + ); + } else { + // Page N — strictly below the watermark score. + const belowScore = await this.redis.zrevrangebyscore( + queueKey, + `(${input.watermark.createdAtMicros}`, + "-inf", + "LIMIT", + 0, + input.pageSize, + ); + runIds = belowScore; + // Tied-score scan: ZSET ties broken by member-DESC, so entries + // sharing the watermark score with a lex-smaller runId still + // need to surface. Cheap second range over the tied band. + if (belowScore.length < input.pageSize) { + const remaining = input.pageSize - belowScore.length; + const tied = await this.redis.zrangebyscore( + queueKey, + input.watermark.createdAtMicros, + input.watermark.createdAtMicros, + ); + // Filter to runIds lex-less than the watermark anchor, sort + // member-DESC, take `remaining`. + const tiedFiltered = tied + .filter((r) => r < input.watermark!.runId) + .sort((a, b) => (a < b ? 1 : a > b ? -1 : 0)) + .slice(0, remaining); + runIds = [...belowScore, ...tiedFiltered]; + } + } + + if (runIds.length === 0) return []; + + // Parallel HGETALL — one round-trip per entry, all in flight. + const fetched = await Promise.all( + runIds.map((runId) => this.redis.hgetall(`mollifier:entries:${runId}`)), + ); + const entries: BufferEntry[] = []; + for (const value of fetched) { + if (!value || Object.keys(value).length === 0) continue; + const parsed = BufferEntrySchema.safeParse(value); + if (parsed.success) entries.push(parsed.data); + } + return entries; + } + + // Read-only listing of currently-queued entries for a single env. Used by + // the dashboard's "Recently queued" surface — non-destructive, so the + // drainer still pops these entries in order. Returns up to `maxCount` + // entries newest-first (highest score, which is `createdAtMicros`). + // Each entry hash is fetched separately; a `null` from getEntry (TTL + // expired between ZREVRANGE and HGETALL) is skipped. + async listEntriesForEnv(envId: string, maxCount: number): Promise { + if (maxCount <= 0) return []; + const runIds = await this.redis.zrevrange( + `mollifier:queue:${envId}`, + 0, + maxCount - 1, + ); + const entries: BufferEntry[] = []; + for (const runId of runIds) { + const entry = await this.getEntry(runId); + if (entry) entries.push(entry); + } + return entries; + } + + // Atomic snapshot mutation. Used by customer-mutation API endpoints + // (tags, metadata-put, reschedule, cancel) when the run is still in + // the buffer. Three outcomes: + // - "applied_to_snapshot": entry was QUEUED + not materialised; the + // drainer will read the patched payload on its next pop. + // - "not_found": no entry hash exists for this runId. + // - "busy": entry is DRAINING / FAILED / materialised. The API + // wait-and-bounces through PG (Q3 design). + async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise { + const result = (await this.redis.mutateMollifierSnapshot( + `mollifier:entries:${runId}`, + JSON.stringify(patch), + )) as string; + if ( + result === "applied_to_snapshot" || + result === "not_found" || + result === "busy" + ) { + return result; + } + throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`); + } + + // Optimistic compare-and-swap on the snapshot's metadata. Caller reads + // the current metadataVersion via getEntry, applies operations in JS via + // `applyMetadataOperations`, then calls this with the new metadata + the + // expected version. Lua refuses if the version has moved (caller retries + // up to N times). Mirrors the PG-side `UpdateMetadataService` retry + // loop so concurrent increment/append operations don't lose deltas. + async casSetMetadata(input: { + runId: string; + expectedVersion: number; + newMetadata: string; + newMetadataType: string; + }): Promise { + const entryKey = `mollifier:entries:${input.runId}`; + const raw = (await this.redis.casSetMollifierMetadata( + entryKey, + String(input.expectedVersion), + input.newMetadata, + input.newMetadataType, + )) as string; + if (raw === "not_found") return { kind: "not_found" }; + if (raw === "busy") return { kind: "busy" }; + if (raw.startsWith("conflict:")) { + return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) }; + } + if (raw.startsWith("applied:")) { + return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) }; + } + throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`); + } + + // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One + // call across both PG and buffer paths serialises through this claim; + // closes the race the buffer-side B6a SETNX leaves open during the + // gate-transition burst window (see + // `_plans/2026-05-21-mollifier-idempotency-claim.md`). + // + // - "claimed": we now own the claim, the caller proceeds with the + // trigger pipeline and must `publishClaim` on success or + // `releaseClaim` on failure. + // - "pending": another trigger owns the claim and hasn't published + // yet; the caller should poll. + // - "resolved": the claim already holds a runId; the caller can + // return that runId as a cached hit. + async claimIdempotency( + input: IdempotencyLookupInput & { ttlSeconds: number }, + ): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const raw = (await this.redis.claimMollifierIdempotency( + claimKey, + IDEMPOTENCY_CLAIM_PENDING, + String(input.ttlSeconds), + )) as string; + if (raw === "claimed") return { kind: "claimed" }; + if (raw === "pending") return { kind: "pending" }; + if (raw.startsWith("resolved:")) { + return { kind: "resolved", runId: raw.slice("resolved:".length) }; + } + throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`); + } + + // Publish the winning runId to the claim so subsequent claimants / + // waiters see "resolved". TTL bounded by the customer's + // `idempotencyKeyExpiresAt` minus now; caller computes. + async publishClaim( + input: IdempotencyLookupInput & { runId: string; ttlSeconds: number }, + ): Promise { + const claimKey = makeIdempotencyClaimKey(input); + await this.redis.set(claimKey, input.runId, "EX", input.ttlSeconds); + } + + // Release the claim on pipeline error so waiters can re-claim and + // retry. Idempotent. + async releaseClaim(input: IdempotencyLookupInput): Promise { + const claimKey = makeIdempotencyClaimKey(input); + await this.redis.del(claimKey); + } + + // Read the current claim value, used by the wait/poll loop on losers + // to detect "pending" → "resolved" transitions and timeouts. + async readClaim(input: IdempotencyLookupInput): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const value = await this.redis.get(claimKey); + if (value === null) return null; + if (value === IDEMPOTENCY_CLAIM_PENDING) return { kind: "pending" }; + return { kind: "resolved", runId: value }; + } + + // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by + // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check + // misses — same key may belong to a buffered run waiting to drain. The + // lookup self-heals: if the lookup points at an entry hash that's + // expired, we DEL the lookup and report a miss. + async lookupIdempotency(input: IdempotencyLookupInput): Promise { + const lookupKey = makeIdempotencyLookupKey(input); + const runId = await this.redis.get(lookupKey); + if (!runId) return null; + const entry = await this.getEntry(runId); + if (!entry) { + await this.redis.del(lookupKey); + return null; + } + return runId; + } + + // Clear the idempotency binding from a buffered run. Used by + // `ResetIdempotencyKeyService` alongside the existing PG-side + // `updateMany`. Returns the runId that was cleared, or null if no + // buffered run held this key. + async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> { + const lookupKey = makeIdempotencyLookupKey(input); + const clearedRunId = (await this.redis.resetMollifierIdempotency( + lookupKey, + "mollifier:entries:", + )) as string; + return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null }; + } + + // Marks the entry as materialised (PG row written) and resets its TTL to + // the grace window. Entry hash persists past ack as a read-fallback + // safety net for the brief PG replica-lag window between drainer-side + // write and reader-side visibility (Q1 D2). Also clears the associated + // idempotency lookup if one was set on accept (Q5). async ack(runId: string): Promise { - await this.redis.del(`mollifier:entries:${runId}`); + await this.redis.ackMollifierEntry( + `mollifier:entries:${runId}`, + String(ACK_GRACE_TTL_SECONDS), + ); } async requeue(runId: string): Promise { @@ -153,10 +476,16 @@ export class MollifierBuffer { return result === 1; } + // Returns Redis-side TTL on the entry hash. Returns -1 for entries + // with no TTL — the steady state under the current design, where + // entries persist until drainer ack/fail. The ack grace TTL (30s + // post-materialise) is the only context where this returns a + // positive value; tests around the grace TTL still rely on it. async getEntryTtlSeconds(runId: string): Promise { return this.redis.ttl(`mollifier:entries:${runId}`); } + async evaluateTrip( envId: string, options: { windowMs: number; threshold: number; holdMs: number }, @@ -190,8 +519,9 @@ export class MollifierBuffer { local orgId = ARGV[3] local payload = ARGV[4] local createdAt = ARGV[5] - local ttlSeconds = tonumber(ARGV[6]) + local createdAtMicros = ARGV[6] local orgEnvsPrefix = ARGV[7] + local idempotencyLookupKey = ARGV[8] or '' -- Idempotent: refuse if an entry for this runId already exists in any -- state. Caller-side dedup is also enforced via API idempotency keys, @@ -200,6 +530,20 @@ export class MollifierBuffer { return 0 end + -- Idempotency-key dedup (Q5). If the caller passed a lookup key + -- and it's already bound to another buffered run, return the + -- winner's runId so the loser's API response can echo it as a + -- cached hit. Otherwise SET the lookup (no TTL — lifecycle is + -- paired with the entry hash; drainer ack/fail clear it + -- explicitly). + if idempotencyLookupKey ~= '' then + local existing = redis.call('GET', idempotencyLookupKey) + if existing then + return existing + end + redis.call('SET', idempotencyLookupKey, runId) + end + redis.call('HSET', entryKey, 'runId', runId, 'envId', envId, @@ -207,9 +551,22 @@ export class MollifierBuffer { 'payload', payload, 'status', 'QUEUED', 'attempts', '0', - 'createdAt', createdAt) - redis.call('EXPIRE', entryKey, ttlSeconds) - redis.call('LPUSH', queueKey, runId) + 'createdAt', createdAt, + 'createdAtMicros', createdAtMicros, + 'idempotencyLookupKey', idempotencyLookupKey, + 'metadataVersion', '0') + -- No EXPIRE on the entry hash. Buffer entries persist until the + -- drainer ACKs (post-materialise grace) or FAILs them — the + -- drainer is the only recovery mechanism, so silent TTL-based + -- eviction would lose runs with no customer-visible signal. + -- Memory pressure from an offline drainer is the alertable + -- failure mode instead; see _ops/mollifier-ops.md. + -- ZSET keyed by createdAtMicros: ZPOPMIN drains oldest-first + -- (FIFO); listing pagination uses ZREVRANGEBYSCORE with a + -- (createdAt, runId) cursor anchor. Score is stable across the + -- entry's lifecycle — requeue does not bump it (see Phase 3b / + -- Q1 design). + redis.call('ZADD', queueKey, createdAtMicros, runId) -- Org-level membership: maintained atomically with the per-env -- queue so the drainer can walk orgs → envs-for-org and -- schedule one env per org per tick. SADDs are idempotent if the @@ -231,7 +588,8 @@ export class MollifierBuffer { local envId = redis.call('HGET', entryKey, 'envId') local orgId = redis.call('HGET', entryKey, 'orgId') - if not envId then + local createdAtMicros = redis.call('HGET', entryKey, 'createdAtMicros') + if not envId or not createdAtMicros then return 0 end @@ -239,7 +597,11 @@ export class MollifierBuffer { local nextAttempts = tonumber(currentAttempts or '0') + 1 redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts)) - redis.call('LPUSH', queuePrefix .. envId, runId) + -- Requeue re-adds with the ORIGINAL createdAtMicros score. + -- createdAt is immutable across retries (Phase 3b decision). + -- The drainer's maxAttempts caps the retry loop so a poisoned + -- entry doesn't head-of-line forever. + redis.call('ZADD', queuePrefix .. envId, tonumber(createdAtMicros), runId) -- Re-track the org/env: pop may have SREM'd them when the queue -- last emptied. SADDs are idempotent if the values are still -- present. @@ -279,7 +641,9 @@ export class MollifierBuffer { -- hash without a TTL, leaking memory. The loop is bounded by queue -- length; entire Lua script remains atomic. while true do - local runId = redis.call('RPOP', queueKey) + -- ZPOPMIN returns {member, score} as a flat array, or {} when empty. + local popped = redis.call('ZPOPMIN', queueKey) + local runId = popped[1] if not runId then -- Queue is empty AND we have no entry to read orgId from, so -- skip org-level cleanup. Stale org-envs entries are bounded @@ -296,9 +660,9 @@ export class MollifierBuffer { result[raw[i]] = raw[i + 1] end -- Prune org-level membership if this pop drained the queue. - -- Atomic with the RPOP above — a concurrent accept AFTER this - -- script will SADD both back along with its LPUSH. - if redis.call('LLEN', queueKey) == 0 then + -- Atomic with the ZPOPMIN above — a concurrent accept AFTER + -- this script will SADD both back along with its ZADD. + if redis.call('ZCARD', queueKey) == 0 then pruneOrgMembership(result['orgId']) end return cjson.encode(result) @@ -309,19 +673,220 @@ export class MollifierBuffer { `, }); + this.redis.defineCommand("casSetMollifierMetadata", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local expectedVersion = tonumber(ARGV[1]) + local newMetadata = ARGV[2] + local newMetadataType = ARGV[3] + + if redis.call('EXISTS', entryKey) == 0 then + return 'not_found' + end + + local status = redis.call('HGET', entryKey, 'status') + local materialised = redis.call('HGET', entryKey, 'materialised') + if status ~= 'QUEUED' or materialised == 'true' then + return 'busy' + end + + local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0' + local currentVersion = tonumber(currentVersionStr) or 0 + if currentVersion ~= expectedVersion then + return 'conflict:' .. tostring(currentVersion) + end + + -- Write the new metadata onto the snapshot's payload JSON. We + -- keep the rest of the payload intact — only metadata/metadataType + -- change. metadataVersion is denormalised on the hash for cheap + -- CAS reads; it's intentionally NOT stored inside the payload + -- itself (PG-side metadataVersion is a column, not a JSON field). + local payloadJson = redis.call('HGET', entryKey, 'payload') + local ok, payload = pcall(cjson.decode, payloadJson) + if not ok then return 'busy' end + payload.metadata = newMetadata + payload.metadataType = newMetadataType + + local newVersion = currentVersion + 1 + redis.call('HSET', entryKey, + 'payload', cjson.encode(payload), + 'metadataVersion', tostring(newVersion)) + return 'applied:' .. tostring(newVersion) + `, + }); + + this.redis.defineCommand("claimMollifierIdempotency", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local pending = ARGV[1] + local ttl = tonumber(ARGV[2]) + + -- SETNX-with-TTL: atomic; only one caller can win. + local won = redis.call('SET', claimKey, pending, 'NX', 'EX', ttl) + if won then + return 'claimed' + end + + local existing = redis.call('GET', claimKey) + if existing == pending then + return 'pending' + end + return 'resolved:' .. existing + `, + }); + + this.redis.defineCommand("resetMollifierIdempotency", { + numberOfKeys: 1, + lua: ` + local lookupKey = KEYS[1] + local entryPrefix = ARGV[1] + + local runId = redis.call('GET', lookupKey) + if not runId then + return '' + end + + local entryKey = entryPrefix .. runId + if redis.call('EXISTS', entryKey) == 0 then + -- Stale lookup. Lazy cleanup. + redis.call('DEL', lookupKey) + return '' + end + + -- Clear the idempotency fields on the snapshot payload so the + -- drainer's eventual engine.trigger call inserts a PG row + -- without the key set. + local payloadJson = redis.call('HGET', entryKey, 'payload') + if payloadJson then + local ok, payload = pcall(cjson.decode, payloadJson) + if ok then + payload.idempotencyKey = cjson.null + payload.idempotencyKeyExpiresAt = cjson.null + redis.call('HSET', entryKey, 'payload', cjson.encode(payload)) + end + end + -- Clear the denormalised lookup pointer on the hash so a later + -- ack doesn't try to DEL a key that's already gone. + redis.call('HSET', entryKey, 'idempotencyLookupKey', '') + redis.call('DEL', lookupKey) + return runId + `, + }); + + this.redis.defineCommand("mutateMollifierSnapshot", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local patchJson = ARGV[1] + + if redis.call('EXISTS', entryKey) == 0 then + return 'not_found' + end + + local status = redis.call('HGET', entryKey, 'status') + local materialised = redis.call('HGET', entryKey, 'materialised') + if status ~= 'QUEUED' or materialised == 'true' then + return 'busy' + end + + local payloadJson = redis.call('HGET', entryKey, 'payload') + local ok, payload = pcall(cjson.decode, payloadJson) + if not ok then return 'busy' end + + local patch = cjson.decode(patchJson) + + if patch.type == 'append_tags' then + -- cjson decode of an absent or empty-array field gives nil or + -- an empty table; we rebuild as a dense array. Existing tags + -- are preserved; new tags are appended only if not present. + local existing = payload.tags or {} + local seen = {} + local merged = {} + for _, t in ipairs(existing) do + if not seen[t] then + seen[t] = true + table.insert(merged, t) + end + end + for _, t in ipairs(patch.tags or {}) do + if not seen[t] then + seen[t] = true + table.insert(merged, t) + end + end + payload.tags = merged + elseif patch.type == 'set_metadata' then + payload.metadata = patch.metadata + payload.metadataType = patch.metadataType + elseif patch.type == 'set_delay' then + payload.delayUntil = patch.delayUntil + elseif patch.type == 'mark_cancelled' then + payload.cancelledAt = patch.cancelledAt + payload.cancelReason = patch.cancelReason + else + return 'busy' + end + + redis.call('HSET', entryKey, 'payload', cjson.encode(payload)) + return 'applied_to_snapshot' + `, + }); + + this.redis.defineCommand("ackMollifierEntry", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local graceTtlSeconds = tonumber(ARGV[1]) + + -- Guard: never create a partial entry. If the hash expired between + -- pop and ack, the run is gone — nothing to mark materialised. + if redis.call('EXISTS', entryKey) == 0 then + return 0 + end + + -- If the entry was accepted with an idempotency key, the lookup + -- string was stored on the hash at accept time. Clear it now — + -- PG becomes canonical for the key post-materialisation (Q5). + local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey') + if lookupKey and lookupKey ~= '' then + redis.call('DEL', lookupKey) + end + + redis.call('HSET', entryKey, 'materialised', 'true') + redis.call('EXPIRE', entryKey, graceTtlSeconds) + return 1 + `, + }); + this.redis.defineCommand("failMollifierEntry", { numberOfKeys: 1, lua: ` local entryKey = KEYS[1] local errorPayload = ARGV[1] - -- Guard: never create a partial entry. If the hash expired between - -- pop and fail, the run is gone — nothing to mark FAILED. + -- Guard: nothing to mark FAILED if the hash is gone (concurrent + -- ack/manual cleanup). Returning 0 lets the caller distinguish + -- "marked failed" from "no-op". if redis.call('EXISTS', entryKey) == 0 then return 0 end redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload) + + -- The drainer has already written a SYSTEM_FAILURE PG row for + -- terminal failures (see mollifierDrainerHandler.server.ts), so + -- the buffer entry is no longer load-bearing. Clear the + -- idempotency lookup — PG's unique constraint is the canonical + -- dedup mechanism post-materialise — and drop the entry hash so + -- failed runs don't accrete forever now that there's no + -- accept-time TTL. + local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey') + if lookupKey and lookupKey ~= '' then + redis.call('DEL', lookupKey) + end + redis.call('DEL', entryKey) return 1 `, }); @@ -362,10 +927,11 @@ declare module "@internal/redis" { orgId: string, payload: string, createdAt: string, - ttlSeconds: string, + createdAtMicros: string, orgEnvsPrefix: string, - callback?: Callback, - ): Result; + idempotencyLookupKey: string, + callback?: Callback, + ): Result; popAndMarkDraining( queueKey: string, orgsKey: string, @@ -382,6 +948,34 @@ declare module "@internal/redis" { orgEnvsPrefix: string, callback?: Callback, ): Result; + mutateMollifierSnapshot( + entryKey: string, + patchJson: string, + callback?: Callback, + ): Result; + casSetMollifierMetadata( + entryKey: string, + expectedVersion: string, + newMetadata: string, + newMetadataType: string, + callback?: Callback, + ): Result; + resetMollifierIdempotency( + lookupKey: string, + entryPrefix: string, + callback?: Callback, + ): Result; + claimMollifierIdempotency( + claimKey: string, + pendingMarker: string, + ttlSeconds: string, + callback?: Callback, + ): Result; + ackMollifierEntry( + entryKey: string, + graceTtlSeconds: string, + callback?: Callback, + ): Result; failMollifierEntry( entryKey: string, errorPayload: string, diff --git a/packages/redis-worker/src/mollifier/drainer.test.ts b/packages/redis-worker/src/mollifier/drainer.test.ts index c8f68977f69..ce41f8e9845 100644 --- a/packages/redis-worker/src/mollifier/drainer.test.ts +++ b/packages/redis-worker/src/mollifier/drainer.test.ts @@ -6,7 +6,6 @@ import { MollifierDrainer } from "./drainer.js"; import { serialiseSnapshot } from "./schemas.js"; const noopOptions = { - entryTtlSeconds: 600, logger: new Logger("test", "log"), }; @@ -87,8 +86,11 @@ describe("MollifierDrainer.runOnce", () => { payload: { foo: 1 }, }); + // After ack the entry persists as a read-fallback safety net with + // materialised=true and a fresh grace TTL (Q1 D2 / Phase B2). const entry = await buffer.getEntry("run_1"); - expect(entry).toBeNull(); + expect(entry).not.toBeNull(); + expect(entry!.materialised).toBe(true); } finally { await buffer.close(); } diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts index 5e6fe202e3d..2751a6615eb 100644 --- a/packages/redis-worker/src/mollifier/index.ts +++ b/packages/redis-worker/src/mollifier/index.ts @@ -1,4 +1,13 @@ -export { MollifierBuffer, type MollifierBufferOptions } from "./buffer.js"; +export { + MollifierBuffer, + type MollifierBufferOptions, + type SnapshotPatch, + type MutateSnapshotResult, + type CasSetMetadataResult, + type IdempotencyClaimResult, + type IdempotencyLookupInput, + IDEMPOTENCY_CLAIM_PENDING, +} from "./buffer.js"; export { MollifierDrainer, type MollifierDrainerOptions, diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts index f93b0f0a3c3..c5d9915575a 100644 --- a/packages/redis-worker/src/mollifier/schemas.ts +++ b/packages/redis-worker/src/mollifier/schemas.ts @@ -27,6 +27,10 @@ const stringToDate = z.string().transform((v, ctx) => { return d; }); +const stringToBool = z + .union([z.literal("true"), z.literal("false")]) + .transform((v) => v === "true"); + const stringToError = z.string().transform((v, ctx) => { try { return BufferEntryError.parse(JSON.parse(v)); @@ -44,6 +48,24 @@ export const BufferEntrySchema = z.object({ status: BufferEntryStatus, attempts: stringToInt, createdAt: stringToDate, + // Microsecond epoch matching the ZSET queue score. Stable across + // requeues — the score never moves once set at accept time. + createdAtMicros: stringToInt, + // Drainer-ack flag: `true` once the drainer has materialised this run + // into PG. The hash persists for a short grace TTL after ack so direct + // reads (retrieve, trace, etc.) still resolve while PG replica lag + // settles. Absent on pre-ack entries. + materialised: stringToBool.default("false"), + // Denormalised pointer to the Redis idempotency lookup key (set when + // the run was accepted with an idempotency key, empty otherwise). The + // ack Lua reads this to DEL the lookup atomically with marking the + // entry materialised (Q5). + idempotencyLookupKey: z.string().optional().default(""), + // Optimistic-lock counter for the snapshot's `metadata` field. + // Incremented atomically by the CAS metadata Lua. Matches the + // semantic of `TaskRun.metadataVersion` on the PG side (which the + // UpdateMetadataService uses for the same retry-on-conflict pattern). + metadataVersion: stringToInt.default("0"), lastError: stringToError.optional(), }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c742ab1bfc4..e729eec716f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -3063,6 +3063,22 @@ importers: specifier: workspace:* version: link:../../packages/cli-v3 + references/stress-tasks: + dependencies: + '@trigger.dev/build': + specifier: workspace:* + version: link:../../packages/build + '@trigger.dev/sdk': + specifier: workspace:* + version: link:../../packages/trigger-sdk + zod: + specifier: 3.25.76 + version: 3.25.76 + devDependencies: + trigger.dev: + specifier: workspace:* + version: link:../../packages/cli-v3 + references/telemetry: dependencies: '@opentelemetry/resources':