From a6db5069278f057d152b4d63f31064a790e31447 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 09:45:41 +0100 Subject: [PATCH 1/8] feat(webapp): add a new backend for the realtime runs feed Adds an opt-in backend for realtime run subscriptions (single runs, tag lists, and batches), selected per organization by a feature flag and gated by a global environment-variable switch, both defaulting off so nothing changes until enabled. Run changes are signalled over Redis pub/sub; a live subscription wakes, refetches the current rows from a read replica, and re-emits them, resolving tag and batch membership from ClickHouse. Concurrent subscribers watching the same runs, tags, or batch share a single resolve-and-hydrate per short window, so read load scales with distinct filters rather than connection count. --- .../realtime-runs-subscription-scalability.md | 6 + apps/webapp/app/entry.server.tsx | 4 + apps/webapp/app/env.server.ts | 25 + .../app/routes/api.v1.runs.$runId.tags.ts | 3 + .../routes/realtime.v1.batches.$batchId.ts | 8 +- .../app/routes/realtime.v1.runs.$runId.ts | 9 +- apps/webapp/app/routes/realtime.v1.runs.ts | 8 +- .../app/services/realtime/boundedTtlCache.ts | 57 ++ .../clickHouseRunListResolver.server.ts | 40 + .../realtime/electricStreamProtocol.server.ts | 301 +++++++ .../realtime/notifierRealtimeClient.server.ts | 752 ++++++++++++++++++ .../notifierRealtimeClientInstance.server.ts | 99 +++ .../realtimeConcurrencyLimiter.server.ts | 111 +++ .../resolveRealtimeStreamClient.server.ts | 86 ++ .../realtime/runChangeNotifier.server.ts | 228 ++++++ .../runChangeNotifierHandlers.server.ts | 73 ++ .../runChangeNotifierInstance.server.ts | 73 ++ .../app/services/realtime/runReader.server.ts | 191 +++++ .../services/realtime/shadowCompare.server.ts | 289 +++++++ .../realtime/shadowRealtimeClient.server.ts | 192 +++++ .../shadowRealtimeClientInstance.server.ts | 66 ++ apps/webapp/app/v3/featureFlags.ts | 5 + .../test/realtime/boundedTtlCache.test.ts 
| 41 + .../realtime/electricStreamProtocol.test.ts | 304 +++++++ .../realtime/notifierRealtimeClient.test.ts | 107 +++ .../test/realtime/notifierRunSetCache.test.ts | 173 ++++ .../test/realtime/runChangeNotifier.test.ts | 211 +++++ .../test/realtime/runReaderProjection.test.ts | 57 ++ .../test/realtime/shadowCompare.test.ts | 212 +++++ 29 files changed, 3725 insertions(+), 6 deletions(-) create mode 100644 .server-changes/realtime-runs-subscription-scalability.md create mode 100644 apps/webapp/app/services/realtime/boundedTtlCache.ts create mode 100644 apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts create mode 100644 apps/webapp/app/services/realtime/electricStreamProtocol.server.ts create mode 100644 apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts create mode 100644 apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts create mode 100644 apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts create mode 100644 apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts create mode 100644 apps/webapp/app/services/realtime/runChangeNotifier.server.ts create mode 100644 apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts create mode 100644 apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts create mode 100644 apps/webapp/app/services/realtime/runReader.server.ts create mode 100644 apps/webapp/app/services/realtime/shadowCompare.server.ts create mode 100644 apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts create mode 100644 apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts create mode 100644 apps/webapp/test/realtime/boundedTtlCache.test.ts create mode 100644 apps/webapp/test/realtime/electricStreamProtocol.test.ts create mode 100644 apps/webapp/test/realtime/notifierRealtimeClient.test.ts create mode 100644 apps/webapp/test/realtime/notifierRunSetCache.test.ts create mode 100644 
apps/webapp/test/realtime/runChangeNotifier.test.ts create mode 100644 apps/webapp/test/realtime/runReaderProjection.test.ts create mode 100644 apps/webapp/test/realtime/shadowCompare.test.ts diff --git a/.server-changes/realtime-runs-subscription-scalability.md b/.server-changes/realtime-runs-subscription-scalability.md new file mode 100644 index 00000000000..5de00aae675 --- /dev/null +++ b/.server-changes/realtime-runs-subscription-scalability.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add a new backend for the realtime runs feed (single runs, tags, and batches) that scales under high concurrency, available behind a feature flag diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 9996eb7b30a..8cc23bff089 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -27,6 +27,7 @@ import { registerRunEngineEventBusHandlers, setupBatchQueueCallbacks, } from "./v3/runEngineHandlers.server"; +import { registerRunChangeNotifierHandlers } from "./services/realtime/runChangeNotifierHandlers.server"; // Touch the sessions replication singleton at entry so it boots deterministically // on webapp startup. The singleton's initializer wires start (gated on // `clickhouseFactory.isReady()`) and SIGTERM/SIGINT shutdown — mirrors @@ -269,6 +270,9 @@ process.on("uncaughtException", (error, origin) => { singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); +// Attach the run-changed notifier delegations to the engine event bus. +// No-ops (registers nothing) unless REALTIME_NOTIFIER_ENABLED=1. 
+singleton("RunChangeNotifierHandlers", registerRunChangeNotifierHandlers); // Wrapped in singleton() so Remix's dev-mode CJS reloads don't append // duplicate copies of the processor — Sentry's processor list lives in diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c55bb424001..3cdfdbf51fc 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -300,6 +300,31 @@ const EnvironmentSchema = z .int() .default(24 * 60 * 60 * 1000), // 1 day in milliseconds + // Master switch for the notifier-backed realtime feed. + // "0" (default) = the existing realtime path serves everything, publishes are + // no-ops, and no notifier Redis connections are opened (zero-overhead off). + // "1" = run-changed signals are published and the per-org `realtimeBackend` + // feature flag selects the backend per request. + REALTIME_NOTIFIER_ENABLED: z.string().default("0"), + // Backstop wait before a live notifier request refetches the run (ms). + REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(5_000), + // Hard cap on the tag-list snapshot size served by the notifier feed. + REALTIME_NOTIFIER_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), + // Short-TTL coalescing cache for the multi-run (tag-list/batch) resolve+hydrate. + // Concurrent same-filter feeds share one ClickHouse resolve + Postgres hydrate + // within this window, so an env-wide wake doesn't fan out into per-feed queries. + // Staleness budget: a newly-matching run is visible within ~ttl + poll interval. + REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS: z.coerce.number().int().default(1_000), + REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // Cap on the per-handle working-set cache (runId -> updatedAt) the notifier keeps + // for diffing multi-run live polls. 
+ REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES: z.coerce.number().int().default(10_000), + // Quantize the tag-list createdAt lower bound to this epoch-aligned bucket (ms) so + // same-tag feeds that pin their window within the same bucket share one resolve+ + // hydrate cache entry. Floored, so the window only ever widens by < bucket. 0 + // disables bucketing (each feed keeps its exact lower bound). + REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), + PUBSUB_REDIS_HOST: z .string() .optional() diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index ef7f3180bf3..9dd184fa25e 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -7,6 +7,7 @@ import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; +import { publishRunChanged } from "~/services/realtime/runChangeNotifierInstance.server"; import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; // Pull the existing tags out of a buffer entry's serialised payload so @@ -90,6 +91,8 @@ export async function action({ request, params }: ActionFunctionArgs) { }, data: { runTags: { push: newTags } }, }); + // Delegate a run-changed notify (no-op unless enabled). + publishRunChanged({ runId: taskRun.id, environmentId: env.id }); return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, // Buffer-applied patch path. 
The mutateSnapshot Lua deduplicates diff --git a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts index 2b8fb106681..973cd5f96cd 100644 --- a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts +++ b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts @@ -1,7 +1,7 @@ import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; const ParamsSchema = z.object({ @@ -33,7 +33,11 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: batchRun, apiVersion }) => { - return realtimeClient.streamBatch( + // Pick the Electric proxy or the notifier-backed batch feed + // per org (defaults to Electric). Both implement streamBatch. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamBatch( request.url, authentication.environment, batchRun.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts index e03787c6200..3e224ddedf2 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts @@ -2,7 +2,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -48,7 +48,12 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: run, apiVersion }) => { - return realtimeClient.streamRun( + // Pick the Electric proxy or the notifier-backed shim per org (defaults to + // Electric; controlled by REALTIME_NOTIFIER_ENABLED + the realtimeBackend + // feature flag). Both implement the same streamRun contract. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRun( request.url, authentication.environment, run.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.ts b/apps/webapp/app/routes/realtime.v1.runs.ts index b04c2d55bbc..436f4ef48d8 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.ts @@ -1,6 +1,6 @@ import { z } from "zod"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -39,7 +39,11 @@ export const loader = createLoaderApiRoute( }, }, async ({ searchParams, authentication, request, apiVersion }) => { - return realtimeClient.streamRuns( + // Pick the Electric proxy or the notifier-backed tag-list feed per org + // (defaults to Electric). Both implement streamRuns. + const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRuns( request.url, authentication.environment, searchParams, diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts new file mode 100644 index 00000000000..643f23607c5 --- /dev/null +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -0,0 +1,57 @@ +/** + * Tiny in-process bounded TTL cache shared by the realtime feeds. + * + * Entries expire after `ttlMs`. An expired entry is evicted when read (`get`); on + * write, if the cache is at `maxEntries`, expired entries are swept and, if it's + * still full (pathologically all live), the oldest insertion is dropped. Node is + * single-threaded so no locking is needed. Used where a miss is cheap and + * correctness-safe (read-through hydration, per-handle working sets, per-org flag + * resolution). 
/**
 * Tiny in-process bounded TTL cache shared by the realtime feeds.
 *
 * Entries expire `ttlMs` after they are written. An expired entry is evicted when
 * it is read (`get`). When a write adds a NEW key while the cache already holds
 * `maxEntries` entries, expired entries are swept first and, if the cache is still
 * full (pathologically all live), the least-recently-written entry is dropped.
 * Overwriting an existing key never evicts another key, and moves the key to the
 * back of the eviction order so a freshly written value is not the next victim.
 *
 * Node is single-threaded so no locking is needed. Used where a miss is cheap and
 * correctness-safe (read-through hydration, per-handle working sets, per-org flag
 * resolution).
 *
 * A stored value of `undefined` cannot be distinguished from a miss; callers that
 * need to cache "absence" should store an explicit sentinel (e.g. `null`).
 *
 * @typeParam V - Type of the cached values.
 */
export class BoundedTtlCache<V> {
  /** Insertion-ordered store; the first key is always the least recently written. */
  readonly #entries = new Map<string, { value: V; expiresAt: number }>();

  /**
   * @param ttlMs - Lifetime of an entry, in milliseconds, counted from its last write.
   * @param maxEntries - Entry count at which a write of a new key starts evicting.
   */
  constructor(
    private readonly ttlMs: number,
    private readonly maxEntries: number
  ) {}

  /**
   * Look up a live entry.
   *
   * @returns The cached value, or `undefined` when the key is absent or expired.
   */
  get(key: string): V | undefined {
    const entry = this.#entries.get(key);
    if (!entry) {
      return undefined;
    }
    if (entry.expiresAt > Date.now()) {
      return entry.value;
    }
    // Evict on read so expired entries don't linger until the next at-capacity
    // sweep — important for read-heavy / low-churn caches (per-handle working sets).
    this.#entries.delete(key);
    return undefined;
  }

  /**
   * Store `value` under `key` with a fresh TTL, evicting only when a new key would
   * push the cache past `maxEntries`.
   */
  set(key: string, value: V): void {
    const now = Date.now();

    // Remove any previous entry first. `Map.set` on an existing key keeps the key's
    // ORIGINAL position, so without this a just-refreshed entry could still be
    // dropped as "oldest"; it also means an overwrite frees its own slot and never
    // needs to evict a different key.
    const replacedExisting = this.#entries.delete(key);

    if (!replacedExisting && this.#entries.size >= this.maxEntries) {
      // Deleting from a Map while iterating it is well-defined.
      for (const [candidateKey, candidate] of this.#entries) {
        if (candidate.expiresAt <= now) {
          this.#entries.delete(candidateKey);
        }
      }
      if (this.#entries.size >= this.maxEntries) {
        const oldestKey = this.#entries.keys().next().value;
        if (oldestKey !== undefined) {
          this.#entries.delete(oldestKey);
        }
      }
    }

    this.#entries.set(key, { value, expiresAt: now + this.ttlMs });
  }

  /** Number of stored entries, including expired ones that have not been swept yet. */
  get size(): number {
    return this.#entries.size;
  }
}
export type ClickHouseRunListResolverOptions = {
  /** Resolves the per-organization ClickHouse client (multi-tenant routing). */
  getClickhouse: (organizationId: string) => Promise<ClickHouse>;
  prisma: PrismaClientOrTransaction;
};

/**
 * Resolves the realtime tag/list filter into matching run ids via ClickHouse
 * `listRunIds`. Tag matching is contains-ANY (OR), the same semantics the dashboard
 * runs list uses. Filter-only: ids only, hydrated from Postgres by id afterward.
 * This keeps the realtime tag feed off the Postgres `runTags` GIN index entirely.
 *
 * (Multi-tag subscribeToRunsWithTag is therefore OR, not the AND that Electric's
 * `runTags @> ARRAY[...]` shape used. Restoring AND is a follow-up: add a
 * `hasAll` mode to the ClickHouse runs filter and use it here.)
 */
export class ClickHouseRunListResolver implements RunListResolver {
  constructor(private readonly options: ClickHouseRunListResolverOptions) {}

  /**
   * Look up the ids of the runs matching `filter`.
   *
   * A `RunsRepository` is built per call around the organization's ClickHouse
   * client, and the query is delegated to its `listRunIds`.
   *
   * NOTE(review): the `Promise<string[]>` return type is reconstructed (the type
   * argument is missing in the patch text) — confirm against `RunListResolver` and
   * `RunsRepository.listRunIds`.
   *
   * @param filter - Organization/project/environment scope plus the optional tag,
   *   batch, createdAt lower bound and result limit.
   * @returns The matching run ids.
   */
  async resolveMatchingRunIds(filter: RunListFilter): Promise<string[]> {
    const clickhouse = await this.options.getClickhouse(filter.organizationId);
    const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma });

    return repository.listRunIds({
      organizationId: filter.organizationId,
      projectId: filter.projectId,
      environmentId: filter.environmentId,
      // An empty tag list is passed as `undefined` rather than `[]`.
      tags: filter.tags && filter.tags.length > 0 ? filter.tags : undefined,
      batchId: filter.batchId,
      // The lower bound is handed over as epoch milliseconds.
      from: filter.createdAtAfter?.getTime(),
      page: { size: filter.limit },
    });
  }
}
/**
 * Electric HTTP shape-stream wire protocol serializer for the single-run feed.
 *
 * This re-emits the exact wire shape that the deployed `@electric-sql/client`
 * (1.0.14 modern + 0.4.0 legacy) and the SDK's `SubscribeRunRawShape` expect,
 * so the notifier-backed realtime feed stays byte-faithful to what those clients
 * already expect.
 *
 * The module is intentionally pure: no DB, Redis, or env access, so the wire
 * contract can be unit-tested by round-tripping through the real client parser
 * + the SDK schema. Header rewrites, tokens, and transport live in the client.
 *
 * Wire facts this encodes (verified against @electric-sql/client@1.0.14):
 *  - Response body is a JSON array of messages; an empty body is treated as `[]`.
 *  - Each column value is wire-encoded as a STRING (or null); the client decodes
 *    it back using the per-column `electric-schema` header. Columns absent from
 *    the schema are passed through unparsed (so text/timestamp stay strings).
 *  - `up-to-date` is the only control message that makes the client emit rows.
 *  - Re-sending the full row each cycle is idempotent: the client merges by `key`.
 */

export type ElectricColumnType =
  | "text"
  | "timestamp"
  | "int4"
  | "int8"
  | "float8"
  | "bool"
  | "jsonb";

type ElectricColumn = {
  name: string;
  type: ElectricColumnType;
  /** Array dimensionality. 1 => `type[]` (Postgres `{a,b}` literal). */
  dims?: number;
  /**
   * Array columns only. True when the Postgres column has NO default, so an
   * empty/absent value is stored as SQL NULL (Electric emits `null`) rather than
   * an empty-array literal `{}`. Prisma erases this distinction — it coerces both
   * NULL and `{}` to `[]` on read — so we re-derive the wire form from the column's
   * known schema. `runTags` has no default; `realtimeStreams` has `@default([])`.
   */
  emptyArrayAsNull?: boolean;
};

/** One entry of the `electric-schema` header: the column type plus optional array dims. */
type ElectricSchemaEntry = { type: ElectricColumnType; dims?: number };

/**
 * The columns the realtime run feed exposes, mirroring `DEFAULT_ELECTRIC_COLUMNS`
 * in `realtimeClient.server.ts` and their Postgres types from the `TaskRun`
 * Prisma model. The `type`/`dims` drive both the `electric-schema` header and
 * the value encoding. Keep in sync with `DEFAULT_ELECTRIC_COLUMNS`.
 */
export const RUN_ELECTRIC_COLUMNS: ReadonlyArray<ElectricColumn> = [
  { name: "id", type: "text" },
  { name: "taskIdentifier", type: "text" },
  { name: "createdAt", type: "timestamp" },
  { name: "updatedAt", type: "timestamp" },
  { name: "startedAt", type: "timestamp" },
  { name: "delayUntil", type: "timestamp" },
  { name: "queuedAt", type: "timestamp" },
  { name: "expiredAt", type: "timestamp" },
  { name: "completedAt", type: "timestamp" },
  { name: "friendlyId", type: "text" },
  { name: "number", type: "int4" },
  { name: "isTest", type: "bool" },
  { name: "status", type: "text" },
  { name: "usageDurationMs", type: "int4" },
  { name: "costInCents", type: "float8" },
  { name: "baseCostInCents", type: "float8" },
  { name: "ttl", type: "text" },
  { name: "payload", type: "text" },
  { name: "payloadType", type: "text" },
  { name: "metadata", type: "text" },
  { name: "metadataType", type: "text" },
  { name: "output", type: "text" },
  { name: "outputType", type: "text" },
  { name: "runTags", type: "text", dims: 1, emptyArrayAsNull: true },
  { name: "error", type: "jsonb" },
  { name: "realtimeStreams", type: "text", dims: 1 },
];

/** Columns that can never be skipped via `skipColumns` (mirrors realtimeClient). */
export const RESERVED_COLUMNS = ["id", "taskIdentifier", "friendlyId", "status", "createdAt"];

/**
 * Shape of a single run hydrated for the realtime feed. Structurally compatible
 * with the Prisma `TaskRun` projection produced by `RunHydrator`.
 */
export type RealtimeRunRow = {
  id: string;
  taskIdentifier: string;
  createdAt: Date;
  updatedAt: Date;
  startedAt: Date | null;
  delayUntil: Date | null;
  queuedAt: Date | null;
  expiredAt: Date | null;
  completedAt: Date | null;
  friendlyId: string;
  number: number;
  isTest: boolean;
  status: string;
  usageDurationMs: number;
  costInCents: number;
  baseCostInCents: number;
  ttl: string | null;
  payload: string;
  payloadType: string;
  metadata: string | null;
  metadataType: string;
  output: string | null;
  outputType: string;
  runTags: string[];
  error: unknown;
  realtimeStreams: string[];
};

type Operation = "insert" | "update" | "delete";

type ChangeMessage = {
  key: string;
  value: Record<string, string | null>;
  headers: { operation: Operation };
};

type ControlMessage = {
  headers: { control: "up-to-date" | "must-refetch" };
};

type ShapeMessage = ChangeMessage | ControlMessage;

const UP_TO_DATE: ControlMessage = { headers: { control: "up-to-date" } };

/** Drop empty names and reserved columns from a caller-supplied skip list. */
function effectiveSkipColumns(skipColumns: string[]): Set<string> {
  return new Set(skipColumns.filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)));
}

/** Double-quote one array element, escaping backslashes first and then quotes. */
function quoteArrayElement(value: string): string {
  return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
}

/** Render values as a one-dimensional Postgres array literal (`{}` when empty). */
function pgArrayLiteral(values: unknown[]): string {
  if (values.length === 0) {
    return "{}";
  }
  return `{${values.map((v) => quoteArrayElement(String(v))).join(",")}}`;
}

/** Encode one column value into its wire form: a string, or `null` for SQL NULL. */
function serializeValue(value: unknown, column: ElectricColumn): string | null {
  if (value === null || value === undefined) {
    return null;
  }

  if (column.dims && column.dims > 0) {
    if (!Array.isArray(value)) {
      return null;
    }
    // A no-default array column stores NULL when empty, so Electric emits `null`
    // (not `{}`); match that here since Prisma handed us `[]` for the NULL value.
    if (value.length === 0 && column.emptyArrayAsNull) {
      return null;
    }
    return pgArrayLiteral(value);
  }

  switch (column.type) {
    case "bool":
      // Postgres text representation; the client's parseBool accepts "t"/"f".
      return value ? "t" : "f";
    case "timestamp":
      // The SDK's RawShapeDate appends "Z" before parsing, so we emit the ISO
      // string WITHOUT the trailing "Z".
      return value instanceof Date ? value.toISOString().slice(0, -1) : String(value);
    case "jsonb":
      return JSON.stringify(value);
    case "int4":
    case "int8":
    case "float8":
    case "text":
    default:
      return String(value);
  }
}

/** The merge key the client uses to reassemble a row across insert/update cycles. */
export function runShapeKey(runId: string): string {
  return `"public"."TaskRun"/"${runId}"`;
}

/** Encode a single run row into the wire `value` object (column -> string|null). */
export function serializeRunRow(
  row: RealtimeRunRow,
  skipColumns: string[] = []
): Record<string, string | null> {
  const skip = effectiveSkipColumns(skipColumns);
  const value: Record<string, string | null> = {};

  for (const column of RUN_ELECTRIC_COLUMNS) {
    if (skip.has(column.name)) {
      continue;
    }
    // The column table, not the row type, decides which properties are read.
    value[column.name] = serializeValue((row as Record<string, unknown>)[column.name], column);
  }

  return value;
}

/** The `electric-schema` response header value for the (optionally trimmed) column set. */
export function buildElectricSchemaHeader(skipColumns: string[] = []): string {
  const skip = effectiveSkipColumns(skipColumns);
  const schema: Record<string, ElectricSchemaEntry> = {};

  for (const column of RUN_ELECTRIC_COLUMNS) {
    if (skip.has(column.name)) {
      continue;
    }
    schema[column.name] = column.dims
      ? { type: column.type, dims: column.dims }
      : { type: column.type };
  }

  return JSON.stringify(schema);
}

/**
 * Initial snapshot body: a single `insert` for the row (if it exists) followed by
 * `up-to-date`. An absent row emits a bare `up-to-date` (an empty shape), which is
 * how Electric represents "no rows match".
 */
export function buildSnapshotBody(row: RealtimeRunRow | null, skipColumns: string[] = []): string {
  const messages: ShapeMessage[] = [];
  if (row) {
    messages.push({
      key: runShapeKey(row.id),
      value: serializeRunRow(row, skipColumns),
      headers: { operation: "insert" },
    });
  }
  messages.push(UP_TO_DATE);
  return JSON.stringify(messages);
}

/** Live body when the row advanced: a full-row `update` followed by `up-to-date`. */
export function buildUpdateBody(row: RealtimeRunRow, skipColumns: string[] = []): string {
  const messages: ShapeMessage[] = [
    {
      key: runShapeKey(row.id),
      value: serializeRunRow(row, skipColumns),
      headers: { operation: "update" },
    },
    UP_TO_DATE,
  ];
  return JSON.stringify(messages);
}

/** Live body when nothing advanced: a bare `up-to-date` (no row emission). */
export function buildUpToDateBody(): string {
  return JSON.stringify([UP_TO_DATE]);
}

export type RowChange = { row: RealtimeRunRow; operation: "insert" | "update" };

/**
 * Multi-row body for the tag-list feed: one change message per row (insert for
 * rows new to the shape, update for rows that advanced) followed by `up-to-date`.
 * An empty `changes` array emits a bare `up-to-date`. The client merges every row
 * by key, so re-emitting a full row is idempotent.
 */
export function buildRowsBody(changes: RowChange[], skipColumns: string[] = []): string {
  const messages: ShapeMessage[] = changes.map((change) => ({
    key: runShapeKey(change.row.id),
    value: serializeRunRow(change.row, skipColumns),
    headers: { operation: change.operation },
  }));
  messages.push(UP_TO_DATE);
  return JSON.stringify(messages);
}

export const INITIAL_OFFSET = "-1";

/**
 * Opaque offset token, formatted to satisfy the client's `${number}_${number}`
 * type. The first segment is the row's `updatedAt` epoch-ms (lets a live request
 * detect whether the replica row has advanced past what the client already has);
 * the second is a per-connection sequence counter. Both are truncated to integers.
 */
export function encodeOffset(updatedAtMs: number, seq: number): string {
  return `${Math.trunc(updatedAtMs)}_${Math.trunc(seq)}`;
}

/**
 * Extract the `updatedAt` epoch-ms a client last saw from its echoed offset.
 * Returns 0 for a missing offset and for any first segment that is not a finite
 * positive number (which includes the initial offset `-1`).
 */
export function parseOffsetUpdatedAtMs(offset: string | null | undefined): number {
  if (!offset) {
    return 0;
  }
  const [first] = offset.split("_");
  const value = Number(first);
  return Number.isFinite(value) && value > 0 ? value : 0;
}

/** Mirror of realtimeClient's DEQUEUED->EXECUTING rewrite for non-current API versions. */
export function rewriteBodyForLegacyApiVersion(body: string): string {
  return body.replace(/"status":"DEQUEUED"/g, '"status":"EXECUTING"');
}
+import { BoundedTtlCache } from "./boundedTtlCache"; +import { type RunChangeNotifier, type RunChangeSubscription } from "./runChangeNotifier.server"; +import { type RunHydrator, type RunListResolver } from "./runReader.server"; +import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; + +/** The tag-list feed resolves ids via ClickHouse, which needs org + project + env. + * `authentication.environment` (AuthenticatedEnvironment) provides projectId, so + * widening here avoids touching the Electric client's RealtimeEnvironment type. */ +export type RealtimeListEnvironment = RealtimeEnvironment & { projectId: string }; + +/** The realtime feeds the run routes depend on (single-run, tag-list, batch). Both + * the Electric client and this notifier client satisfy it, so the routes can switch + * between them behind a flag. */ +export interface RealtimeStreamClient { + streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise; + streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise; + streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise; +} + +export type WakeupReason = "notify" | "timeout" | "abort"; + +export type NotifierRealtimeClientOptions = { + runReader: RunHydrator; + /** Resolves the tag/list filter into the matching id-set (filter-only). 
*/ + runListResolver: RunListResolver; + notifier: RunChangeNotifier; + limiter: RealtimeConcurrencyLimiter; + cachedLimitProvider: CachedLimitProvider; + /** Backstop wait before refetching on a live request (ms). Defaults to 5000. */ + livePollTimeoutMs?: number; + /** Ceiling for the tag-list createdAt lookback window (ms). */ + maximumCreatedAtFilterAgeMs: number; + /** Hard cap on tag-list snapshot size. Defaults to 1000. */ + maxListResults?: number; + /** TTL (ms) for the multi-run resolve+hydrate coalescing cache. Defaults to 1000. */ + runSetResolveCacheTtlMs?: number; + /** Max entries in the resolve+hydrate cache. Defaults to 5000. */ + runSetResolveCacheMaxEntries?: number; + /** Max entries in the per-handle working-set cache. Defaults to 10000. */ + listCacheMaxEntries?: number; + /** Epoch-aligned bucket (ms) the tag-list createdAt lower bound is floored to, so + * same-tag feeds pinned within the same bucket share a cache entry. Defaults to + * 60000. 0 disables bucketing. */ + runSetCreatedAtBucketMs?: number; + /** Observability hook: why a live request woke (notify vs timeout vs abort). */ + onWakeup?: (reason: WakeupReason) => void; + /** Observability hook: whether a multi-run resolve hit the cache, coalesced onto + * an in-flight resolve, or missed (issued fresh ClickHouse + Postgres queries). */ + onRunSetResolve?: (result: "hit" | "miss" | "coalesced") => void; + /** Observability hook: latency (ms) of the ClickHouse resolve / Postgres hydrate. */ + onRunSetQuery?: (stage: "resolve" | "hydrate", ms: number) => void; +}; + +const DEFAULT_CONCURRENCY_LIMIT = 100_000; +const DEFAULT_LIVE_POLL_TIMEOUT_MS = 5_000; +const DEFAULT_MAX_LIST_RESULTS = 1_000; +const LIST_CACHE_TTL_MS = 5 * 60_000; +const LIST_CACHE_MAX_ENTRIES = 10_000; +const DEFAULT_RUNSET_CACHE_TTL_MS = 1_000; +const DEFAULT_RUNSET_CACHE_MAX_ENTRIES = 5_000; +const DEFAULT_RUNSET_CREATED_AT_BUCKET_MS = 60_000; + +/** A multi-run feed's filter. 
Tag-list sets `tags` (+ pinned `createdAtAfter`); + * the batch feed sets `batchId`. Both resolve to an id-set via the resolver. */ +type RunSetFilter = { + tags?: string[]; + batchId?: string; + createdAtAfter?: Date; +}; + +/** Per-handle working set: runId -> last-emitted updatedAt (ms), so live polls + * emit only rows that advanced. */ +type WorkingSet = Map; + +/** Values written to the response headers: `offset` and `handle` are always set; + * `cursor` and `schema` are only emitted when provided. */ +type ResponseHeaderInput = { + offset: string; + handle: string; + cursor?: string; + schema?: string; +}; + +/** + * Notifier-backed implementation of the realtime run feeds: signals run changes + * over Redis pub/sub and refetches the current rows from a read replica. + * + * Single-run (`streamRun`): + * - initial (`offset=-1`): hydrate + emit `insert` + `up-to-date` (with schema) + * - live: race a per-run notification vs a ~5s backstop and the abort signal, + * refetch, and emit a full-row `update` ONLY when `updatedAt` advanced past what + * the client has (a stale replica read never regresses); else a bare `up-to-date`. + * + * Multi-run feeds (`streamRuns` tag-list, `streamBatch`) share one core: + * - initial: resolve the matching id-set via ClickHouse `listRunIds` (filter-only, + * tag-OR or batchId), hydrate by-id from Postgres, emit N `insert`s. + * - live: one per-env subscription wakes the feed; re-resolve the set, hydrate it, + * and emit only new (`insert`) / advanced (`update`) rows — diffed on the + * authoritative Postgres `updatedAt` against a per-handle working set (cache miss + * falls back to the offset floor, merge-safe). ClickHouse supplies membership; + * Postgres supplies fresh row state, so CH ingest lag never stales the rows. + * Tag-list pins its `createdAt` window in the handle; batch needs no window. + * + * Tokens are opaque: `offset` = `<updatedAtMs>_<seq>` (the max row `updatedAt` in ms + * plus a per-process sequence number), `handle` is per-shape, + * `cursor` is a live-only counter. The wire format is produced by + * `electricStreamProtocol`.
+ */ +export class NotifierRealtimeClient implements RealtimeStreamClient { + #seq = 0; + readonly #workingSetCache: BoundedTtlCache; + /** Coalescing cache for the multi-run (resolveIds -> hydrateByIds) pair, keyed by + * (env, filter, columns). Collapses an env-wide wake's per-feed query fan-out into + * one shared resolve+hydrate per filter per short window. */ + readonly #runSetCache: BoundedTtlCache; + readonly #runSetInflight = new Map>(); + + constructor(private readonly options: NotifierRealtimeClientOptions) { + this.#workingSetCache = new BoundedTtlCache( + LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); + this.#runSetCache = new BoundedTtlCache( + options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, + options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES + ); + } + + /** Current size of the per-handle working-set cache (for a metrics gauge). */ + get workingSetCacheSize(): number { + return this.#workingSetCache.size; + } + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + // Initial snapshot — no prior offset/handle. + if (offset === INITIAL_OFFSET || !handle) { + const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion); + } + + if (isLive) { + return this.#liveResponse({ + environment, + runId, + offset, + handle, + skipColumns, + apiVersion, + clientVersion, + signal, + }); + } + + // Non-live catch-up with a handle: re-emit the current snapshot (idempotent). 
+ const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion, handle); + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + const tags = params.tags ?? []; + + // Initial snapshot — pin the createdAt window in a fresh handle. + if (offset === INITIAL_OFFSET || !handle) { + const createdAtFilterMs = this.#computeCreatedAtFilter(params.createdAt).getTime(); + return this.#runSetSnapshotResponse( + environment, + { tags, createdAtAfter: new Date(createdAtFilterMs) }, + this.#mintListHandle(createdAtFilterMs), + skipColumns, + apiVersion, + clientVersion + ); + } + + // Recover the pinned window from the handle so the lower bound never drifts. + const filter: RunSetFilter = { + tags, + createdAtAfter: new Date( + this.#filterMsFromHandle(handle) ?? this.#computeCreatedAtFilter(params.createdAt).getTime() + ), + }; + + if (isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Non-live catch-up under the same handle. 
+ return this.#runSetSnapshotResponse( + environment, + filter, + handle, + skipColumns, + apiVersion, + clientVersion + ); + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + // The batch set is fully defined by batchId (the route resolves it from the + // friendlyId on every request), so the handle is derived and stable and there's + // no createdAt window to pin. + const handle = `batch-${batchId}`; + const filter: RunSetFilter = { batchId }; + + if (offset !== INITIAL_OFFSET && isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Initial snapshot + non-live catch-up. + return this.#runSetSnapshotResponse( + environment, + filter, + handle, + skipColumns, + apiVersion, + clientVersion + ); + } + + #snapshotResponse( + runId: string, + row: Awaited>, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string, + existingHandle?: string + ): Response { + const body = buildSnapshotBody(row, skipColumns); + const offset = row ? encodeOffset(row.updatedAt.getTime(), this.#nextSeq()) : encodeOffset(0, 0); + return this.#buildResponse(body, apiVersion, clientVersion, { + offset, + handle: existingHandle ?? 
this.#mintHandle(runId), + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + async #liveResponse(params: { + environment: RealtimeEnvironment; + runId: string; + offset: string; + handle: string; + skipColumns: string[]; + apiVersion: API_VERSIONS; + clientVersion?: string; + signal?: AbortSignal; + }): Promise { + const { environment, runId, offset, handle, skipColumns, apiVersion, clientVersion, signal } = + params; + + return this.#withConcurrencySlot(environment, async () => { + const reason = await this.#waitForChange(runId, signal); + this.options.onWakeup?.(reason); + + const row = await this.options.runReader.getRunById(environment.id, runId); + const lastSeenMs = parseOffsetUpdatedAtMs(offset); + const seq = this.#nextSeq(); + + // Only-on-advance: emit a full-row update when the replica row moved past + // what the client already has; otherwise a bare up-to-date keeps the offset. + // Live responses carry electric-cursor but NOT electric-schema (the client + // already has the schema from the initial snapshot) — matching real Electric. + if (row && row.updatedAt.getTime() > lastSeenMs) { + return this.#buildResponse(buildUpdateBody(row, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(row.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + }); + } + + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(seq), + }); + }); + } + + /** Initial (and non-live catch-up) snapshot for a multi-run feed: resolve the + * id-set, hydrate, emit every row as an `insert`, and seed the working set. 
*/ + async #runSetSnapshotResponse( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + handle: string, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string + ): Promise { + const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); + + const changes: RowChange[] = rows.map((row) => ({ row, operation: "insert" as const })); + + // updatedAt comes from the authoritative Postgres hydrate, not ClickHouse. + const seen: WorkingSet = new Map(); + let maxUpdatedAt = 0; + for (const row of rows) { + const updatedAtMs = row.updatedAt.getTime(); + seen.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + } + this.#workingSetCache.set(handle, seen); + + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, this.#nextSeq()), + handle, + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + /** Live poll for a multi-run feed: wait, re-resolve the set, and emit only the + * rows that are new or advanced vs the cached working set. */ + async #runSetLiveResponse( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + handle: string, + offset: string, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion: string | undefined, + signal: AbortSignal | undefined + ): Promise { + return this.#withConcurrencySlot(environment, async () => { + // One env-scoped subscription per feed (not one per run): any run change in + // the env wakes us, then we re-resolve the filter. + const reason = await this.#waitForEnvChange(environment.id, signal); + this.options.onWakeup?.(reason); + + const cached = this.#workingSetCache.get(handle); + const offsetFloorMs = parseOffsetUpdatedAtMs(offset); + const seq = this.#nextSeq(); + + // ClickHouse resolves the (possibly stale) membership; Postgres hydrates the + // authoritative current rows, so status is always fresh even if CH lags. 
The + // resolve+hydrate is coalesced + short-TTL cached so a single env-wide wake + // doesn't fan out into one CH+PG query per concurrent same-filter feed. + const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); + + // Diff against what the client already has, using the hydrated updatedAt: + // cache hit => per-row (new = insert, advanced = update); miss => anything + // newer than the offset floor as a merge-safe update. + const changes: RowChange[] = []; + const seen: WorkingSet = new Map(); + let maxUpdatedAt = offsetFloorMs; + for (const row of rows) { + const updatedAtMs = row.updatedAt.getTime(); + seen.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + + if (cached) { + const prior = cached.get(row.id); + if (prior === undefined) { + changes.push({ row, operation: "insert" }); + } else if (updatedAtMs > prior) { + changes.push({ row, operation: "update" }); + } + } else if (updatedAtMs > offsetFloorMs) { + changes.push({ row, operation: "update" }); + } + } + + // Refresh the working set so runs that left the filter stop being tracked + // (the client keeps showing them; the SDK never applies deletes). + this.#workingSetCache.set(handle, seen); + + const body = changes.length === 0 ? buildUpToDateBody() : buildRowsBody(changes, skipColumns); + + return this.#buildResponse(body, apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }); + } + + /** + * Resolve the filter's id-set (ClickHouse) and hydrate the rows (Postgres), + * coalesced + short-TTL cached by (env, filter, columns). Every batch feed for a + * batch, and every tag feed sharing tags+window+columns, shares ONE resolve+hydrate + * instead of each firing its own when the per-env channel wakes them together. + * Concurrent callers await an in-flight resolve; callers within the TTL reuse the + * cached rows (staleness budget: up to the TTL; the next live poll catches up). 
+ */ + async #resolveAndHydrate( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + skipColumns: string[] + ): Promise { + const key = this.#runSetCacheKey(environment.id, filter, skipColumns); + + const cached = this.#runSetCache.get(key); + if (cached) { + this.options.onRunSetResolve?.("hit"); + return cached; + } + + const existing = this.#runSetInflight.get(key); + if (existing) { + this.options.onRunSetResolve?.("coalesced"); + return existing; + } + + this.options.onRunSetResolve?.("miss"); + const promise = this.#resolveAndHydrateUncached(environment, filter, skipColumns) + .then((rows) => { + this.#runSetCache.set(key, rows); + return rows; + }) + .finally(() => { + this.#runSetInflight.delete(key); + }); + + this.#runSetInflight.set(key, promise); + return promise; + } + + async #resolveAndHydrateUncached( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + skipColumns: string[] + ): Promise { + const resolveStart = Date.now(); + const ids = await this.#resolveIds(environment, filter); + this.options.onRunSetQuery?.("resolve", Date.now() - resolveStart); + + const hydrateStart = Date.now(); + const rows = await this.options.runReader.hydrateByIds(environment.id, ids, skipColumns); + this.options.onRunSetQuery?.("hydrate", Date.now() - hydrateStart); + + return rows; + } + + /** Stable cache key for the resolve+hydrate cache. Same key => same id-set and the + * same projected columns, so cached rows always match the requesting feed. */ + #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string { + const tags = filter.tags && filter.tags.length > 0 ? [...filter.tags].sort().join(",") : ""; + const cols = skipColumns.length > 0 ? [...skipColumns].sort().join(",") : ""; + const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; + return `${environmentId}|${tags}|${filter.batchId ?? ""}|${ + filter.createdAtAfter?.getTime() ?? 
"" + }|${maxListResults}|${cols}`; + } + + async #resolveIds(environment: RealtimeListEnvironment, filter: RunSetFilter): Promise { + const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; + const ids = await this.options.runListResolver.resolveMatchingRunIds({ + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + tags: filter.tags, + batchId: filter.batchId, + createdAtAfter: filter.createdAtAfter, + limit: maxListResults, + }); + + if (ids.length >= maxListResults) { + logger.warn("[notifierRealtimeClient] run-set feed hit the result cap", { + environmentId: environment.id, + filter, + cap: maxListResults, + }); + } + + return ids; + } + + #computeCreatedAtFilter(createdAt: string | undefined): Date { + // Clamp to the maximum lookback window, mirroring realtimeClient. + const floor = new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs); + const parsed = safeParseNaturalLanguageDurationAgo(createdAt ?? "24h"); + const resolved = !parsed || parsed < floor ? floor : parsed; + // Quantize the lower bound to a coarse epoch-aligned bucket and pin THAT in the + // handle, so same-tag feeds whose windows land in the same bucket resolve to the + // same filter -> same coalescing cache key -> one shared ClickHouse + Postgres + // query instead of one per feed. Floored (rounds the bound earlier), so the + // window only ever widens by < bucket and never drops a run the client should see. + return new Date(this.#bucketCreatedAtMs(resolved.getTime())); + } + + #bucketCreatedAtMs(ms: number): number { + const bucket = this.options.runSetCreatedAtBucketMs ?? DEFAULT_RUNSET_CREATED_AT_BUCKET_MS; + return bucket > 0 ? Math.floor(ms / bucket) * bucket : ms; + } + + #mintListHandle(createdAtFilterMs: number): string { + // Pins the createdAt threshold in the opaque handle so live polls reuse the + // same lower bound even on a working-set cache miss. 
+ return `runs_${Math.trunc(createdAtFilterMs)}_${this.#nextSeq()}`; + } + + #filterMsFromHandle(handle: string): number | undefined { + const parts = handle.split("_"); + if (parts[0] !== "runs") { + return undefined; + } + const ms = Number(parts[1]); + return Number.isFinite(ms) && ms > 0 ? ms : undefined; + } + + #parseStreamRequest( + url: URL | string, + requestOptions?: RealtimeRequestOptions + ): { offset: string; handle: string | null; isLive: boolean; skipColumns: string[] } { + const $url = new URL(url.toString()); + return { + offset: $url.searchParams.get("offset") ?? INITIAL_OFFSET, + handle: $url.searchParams.get("handle") ?? $url.searchParams.get("shape_id"), + isLive: $url.searchParams.get("live") === "true", + skipColumns: this.#resolveSkipColumns($url, requestOptions), + }; + } + + /** + * Runs `work` inside a per-env concurrency slot: acquires a slot (429 if over the + * org limit, 500 if the limit can't be read) and always attempts to release it + * afterward. + * + * Releasing is best-effort: a limiter failure on release is logged and swallowed so + * it can never replace the response (or the original error) produced by `work`. + * + * @param environment Environment whose id keys the slot and whose organization + * supplies the concurrency limit. + * @param work The request body to run while the slot is held. + * @returns The response from `work`, or a 429/500 JSON response when no slot was + * acquired. + */ + async #withConcurrencySlot( + environment: RealtimeEnvironment, + work: () => Promise + ): Promise { + const requestId = randomUUID(); + const concurrencyLimit = await this.options.cachedLimitProvider.getCachedLimit( + environment.organizationId, + DEFAULT_CONCURRENCY_LIMIT + ); + + if (!concurrencyLimit) { + logger.error("[notifierRealtimeClient] Failed to get concurrency limit", { + organizationId: environment.organizationId, + }); + return json({ error: "Failed to get concurrency limit" }, { status: 500 }); + } + + const canProceed = await this.options.limiter.incrementAndCheck( + environment.id, + requestId, + concurrencyLimit + ); + + if (!canProceed) { + return json({ error: "Too many concurrent requests" }, { status: 429 }); + } + + try { + return await work(); + } finally { + // A throw from `finally` would override the value returned (or the error thrown) + // by `work`, so a failed release must not propagate. A slot left behind here is + // removed later by the limiter's stale-entry sweep. + try { + await this.options.limiter.decrement(environment.id, requestId); + } catch (error) { + logger.error("[notifierRealtimeClient] Failed to release concurrency slot", { + environmentId: environment.id, + requestId, + error, + }); + } + } + } + + #waitForChange(runId: string, signal?: AbortSignal): Promise { + return
this.#waitForSubscription(this.options.notifier.subscribeToRunChanges(runId), signal); + } + + /** Env-scoped counterpart of `#waitForChange`: subscribes via + * `subscribeToEnvChanges(environmentId)` and waits on that subscription. */ + #waitForEnvChange(environmentId: string, signal?: AbortSignal): Promise { + return this.#waitForSubscription( + this.options.notifier.subscribeToEnvChanges(environmentId), + signal + ); + } + + /** Race a notifier subscription against the backstop timeout and the abort signal. + * Resolves with the reason of whichever fires first; a rejected `changed` promise is + * reported as "timeout". An already-aborted signal returns "abort" without waiting. + * The timer, the abort listener and the subscription are released on every path. */ + async #waitForSubscription( + subscription: RunChangeSubscription, + signal?: AbortSignal + ): Promise { + if (signal?.aborted) { + subscription.unsubscribe(); + return "abort"; + } + + let timer: ReturnType | undefined; + let onAbort: (() => void) | undefined; + + try { + return await new Promise((resolve) => { + subscription.changed.then(() => resolve("notify")).catch(() => resolve("timeout")); + + timer = setTimeout(() => resolve("timeout"), this.#jitteredTimeout()); + + if (signal) { + onAbort = () => resolve("abort"); + signal.addEventListener("abort", onAbort, { once: true }); + } + }); + } finally { + if (timer) { + clearTimeout(timer); + } + if (signal && onAbort) { + signal.removeEventListener("abort", onAbort); + } + subscription.unsubscribe(); + } + } + + /** Backstop wait in ms: `livePollTimeoutMs` (or the default) scaled by a random + * factor in [0.85, 1.15) and rounded. */ + #jitteredTimeout(): number { + const base = this.options.livePollTimeoutMs ?? DEFAULT_LIVE_POLL_TIMEOUT_MS; + // +/-15% jitter to avoid synchronized refetch herds. + return Math.round(base * (0.85 + Math.random() * 0.3)); + } + + /** Builds the 200 JSON response: rewrites the body for non-current API versions, + * sets no-store + CORS headers, and picks the offset/handle header names based on + * whether the client sent a version. */ + #buildResponse( + body: string, + apiVersion: API_VERSIONS, + clientVersion: string | undefined, + headers: ResponseHeaderInput + ): Response { + const finalBody = + apiVersion === CURRENT_API_VERSION ? body : rewriteBodyForLegacyApiVersion(body); + + const responseHeaders = new Headers(); + responseHeaders.set("content-type", "application/json"); + responseHeaders.set("cache-control", "no-store"); + + // Carry CORS on the response itself, mirroring how the Electric upstream does + // (apiCors passes a response through untouched once it has allow-origin).
Browsers + // can only read the electric-* headers cross-origin if they're explicitly exposed; + // without this the deployed react-hooks fail with MissingHeadersError. Bearer-token + // requests are non-credentialed, so a wildcard is safe. + responseHeaders.set("access-control-allow-origin", "*"); + responseHeaders.set("access-control-expose-headers", "*"); + + // Modern clients (1.0.14) send `x-trigger-electric-version` and read the + // lowercase `electric-*` headers. Legacy clients (0.4.0) omit the version and + // read `electric-shape-id`/`electric-chunk-last-offset` (case-insensitive), + // matching realtimeClient's rewriteResponseHeaders behavior exactly. + if (clientVersion) { + responseHeaders.set("electric-offset", headers.offset); + responseHeaders.set("electric-handle", headers.handle); + } else { + responseHeaders.set("electric-chunk-last-offset", headers.offset); + responseHeaders.set("electric-shape-id", headers.handle); + } + + if (headers.cursor !== undefined) { + responseHeaders.set("electric-cursor", headers.cursor); + } + if (headers.schema !== undefined) { + responseHeaders.set("electric-schema", headers.schema); + } + + return new Response(finalBody, { status: 200, headers: responseHeaders }); + } + + #mintHandle(runId: string): string { + // Stable per-run handle: the single-run shape never changes columns, so the + // client never needs a must-refetch from a handle change. + return `run-${runId}`; + } + + #nextSeq(): number { + this.#seq = (this.#seq + 1) % Number.MAX_SAFE_INTEGER; + return this.#seq; + } + + #resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] { + const raw = requestOptions?.skipColumns ?? url.searchParams.get("skipColumns")?.split(",") ?? 
[]; + return raw.map((c) => c.trim()).filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)); + } +} diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts new file mode 100644 index 00000000000..2888deec863 --- /dev/null +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -0,0 +1,99 @@ +import { Counter, Gauge, Histogram } from "prom-client"; +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { singleton } from "~/utils/singleton"; +import { getCachedLimit } from "../platform.v3.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { NotifierRealtimeClient } from "./notifierRealtimeClient.server"; +import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; +import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; +import { RunHydrator } from "./runReader.server"; + +/** + * Process-singleton wiring for the notifier-backed realtime client. Only + * constructed when a request actually routes to the + * notifier backend, so a disabled webapp never instantiates it. + */ +function initializeNotifierRealtimeClient(): NotifierRealtimeClient { + const wakeups = new Counter({ + name: "realtime_notifier_wakeups_total", + help: "Live realtime notifier wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishRunChanged delegate.", + labelNames: ["reason"] as const, + registers: [metricsRegister], + }); + + const runSetResolves = new Counter({ + name: "realtime_notifier_runset_resolve_total", + help: "Multi-run (tag-list/batch) resolve+hydrate outcomes. 
'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query under an env-wide wake.", + labelNames: ["result"] as const, + registers: [metricsRegister], + }); + + const runSetQueryMs = new Histogram({ + name: "realtime_notifier_runset_query_ms", + help: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", + labelNames: ["stage"] as const, + buckets: [1, 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000], + registers: [metricsRegister], + }); + + const limiter = new RealtimeConcurrencyLimiter({ + keyPrefix: "tr:realtime:notifier:concurrency", + redis: { + port: env.RATE_LIMIT_REDIS_PORT, + host: env.RATE_LIMIT_REDIS_HOST, + username: env.RATE_LIMIT_REDIS_USERNAME, + password: env.RATE_LIMIT_REDIS_PASSWORD, + tlsDisabled: env.RATE_LIMIT_REDIS_TLS_DISABLED === "true", + clusterMode: env.RATE_LIMIT_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + }); + + const client = new NotifierRealtimeClient({ + runReader: new RunHydrator({ replica: $replica }), + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + prisma: $replica, + }), + notifier: getRunChangeNotifier(), + limiter, + cachedLimitProvider: { + async getCachedLimit(organizationId, defaultValue) { + const result = await getCachedLimit( + organizationId, + "realtimeConcurrentConnections", + defaultValue + ); + return result.val; + }, + }, + livePollTimeoutMs: env.REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, + runSetResolveCacheTtlMs: env.REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS, + runSetResolveCacheMaxEntries: env.REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES, + listCacheMaxEntries: env.REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES, + runSetCreatedAtBucketMs: 
env.REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS, + onWakeup: (reason) => wakeups.inc({ reason }), + onRunSetResolve: (result) => runSetResolves.inc({ result }), + onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), + }); + + new Gauge({ + name: "realtime_notifier_working_set_size", + help: "Entries in the per-handle working-set cache (one per active multi-run feed session).", + registers: [metricsRegister], + collect() { + this.set(client.workingSetCacheSize); + }, + }); + + return client; +} + +export function getNotifierRealtimeClient(): NotifierRealtimeClient { + return singleton("notifierRealtimeClient", initializeNotifierRealtimeClient); +} diff --git a/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts new file mode 100644 index 00000000000..a935858fef0 --- /dev/null +++ b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts @@ -0,0 +1,111 @@ +import { Callback, Result } from "ioredis"; +import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; + +export type RealtimeConcurrencyLimiterOptions = { + redis: RedisWithClusterOptions; + keyPrefix: string; + /** How long a tracked request lives before it's swept as stale (seconds). */ + expiryTimeInSeconds?: number; + connectionName?: string; +}; + +/** + * Per-environment concurrent-connection limiter for realtime long-polls. + * + * This is a standalone copy of the limiter embedded in `realtimeClient.server.ts` + * (Electric path), so the notifier-backed client can enforce the same per-env cap + * WITHOUT modifying the existing Electric client. The Lua + key shape are + * identical; only the key prefix differs, so the two paths track independently. 
+ */ +export class RealtimeConcurrencyLimiter { + private redis: RedisClient; + private expiryTimeInSeconds: number; + + constructor(private options: RealtimeConcurrencyLimiterOptions) { + this.redis = createRedisClient( + options.connectionName ?? "trigger:realtime:notifier:concurrency", + options.redis + ); + this.expiryTimeInSeconds = options.expiryTimeInSeconds ?? 60 * 5; + this.#registerCommands(); + } + + async incrementAndCheck(environmentId: string, requestId: string, limit: number): Promise { + const key = this.#getKey(environmentId); + const now = Date.now(); + + const result = await this.redis.incrementAndCheckRealtimeNotifierConcurrency( + key, + now.toString(), + requestId, + this.expiryTimeInSeconds.toString(), + (now - this.expiryTimeInSeconds * 1000).toString(), + limit.toString() + ); + + return result === 1; + } + + async decrement(environmentId: string, requestId: string): Promise { + const key = this.#getKey(environmentId); + await this.redis.zrem(key, requestId); + } + + #getKey(environmentId: string): string { + return `${this.options.keyPrefix}:${environmentId}`; + } + + #registerCommands() { + this.redis.defineCommand("incrementAndCheckRealtimeNotifierConcurrency", { + numberOfKeys: 1, + lua: /* lua */ ` + local concurrencyKey = KEYS[1] + + local timestamp = tonumber(ARGV[1]) + local requestId = ARGV[2] + local expiryTime = tonumber(ARGV[3]) + local cutoffTime = tonumber(ARGV[4]) + local limit = tonumber(ARGV[5]) + + -- Remove expired entries + redis.call('ZREMRANGEBYSCORE', concurrencyKey, '-inf', cutoffTime) + + -- Add the new request to the sorted set + redis.call('ZADD', concurrencyKey, timestamp, requestId) + + -- Set the expiry time on the key + redis.call('EXPIRE', concurrencyKey, expiryTime) + + -- Get the total number of concurrent requests + local totalRequests = redis.call('ZCARD', concurrencyKey) + + -- Check if the limit has been exceeded + if totalRequests > limit then + redis.call('ZREM', concurrencyKey, requestId) + return 0 
+ end + + return 1 + `, + }); + + this.redis.on("error", (error) => { + logger.error("[realtimeConcurrencyLimiter] redis error", { error }); + }); + } +} + +declare module "ioredis" { + interface RedisCommander { + incrementAndCheckRealtimeNotifierConcurrency( + key: string, + timestamp: string, + requestId: string, + expiryTime: string, + cutoffTime: string, + limit: string, + callback?: Callback + ): Result; + } +} diff --git a/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts new file mode 100644 index 00000000000..220f79f9308 --- /dev/null +++ b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts @@ -0,0 +1,86 @@ +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; +import { logger } from "../logger.server"; +import { type RealtimeEnvironment } from "../realtimeClient.server"; +import { realtimeClient } from "../realtimeClientGlobal.server"; +import { BoundedTtlCache } from "./boundedTtlCache"; +import { type RealtimeStreamClient } from "./notifierRealtimeClient.server"; +import { getNotifierRealtimeClient } from "./notifierRealtimeClientInstance.server"; +import { getShadowRealtimeClient } from "./shadowRealtimeClientInstance.server"; + +type RealtimeBackend = "electric" | "notifier" | "shadow"; + +/** + * Chooses which backend serves a realtime run request. + * + * Two gates, both defaulting to the Electric path: + * 1. `REALTIME_NOTIFIER_ENABLED` (env master switch). When off, this returns the + * Electric client immediately — no flag read, no notifier client construction, + * byte-identical to pre-Electric-Sunset behavior. + * 2. the `realtimeBackend` feature flag (global + per-org, org wins), resolved per + * org and cached in-process for 30s so the long-poll feed doesn't hit the DB + * on every request. 
+ */ +const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; +const BACKEND_CACHE_TTL_MS = 30_000; +// Org count is bounded, but cap to avoid unbounded growth. +const BACKEND_CACHE_MAX_ENTRIES = 50_000; + +const flag = makeFlag($replica); +const backendCache = new BoundedTtlCache( + BACKEND_CACHE_TTL_MS, + BACKEND_CACHE_MAX_ENTRIES +); + +/** + * Returns the realtime client that should serve a request for `environment`. + * + * With the env switch off this is always the Electric client and no flag is read. + * Otherwise the organization's resolved backend picks the notifier client, the + * shadow client, or (for "electric" and any unrecognised value) the Electric client. + */ +export async function resolveRealtimeStreamClient( + environment: RealtimeEnvironment +): Promise { + if (!notifierEnabled) { + return realtimeClient; + } + + switch (await getRealtimeBackend(environment.organizationId)) { + case "notifier": + return getNotifierRealtimeClient(); + case "shadow": + // Client is still served Electric; the notifier path is diffed in the background. + return getShadowRealtimeClient(); + case "electric": + default: + return realtimeClient; + } +} + +/** + * Resolves the `realtimeBackend` flag for an organization, using the org's + * `featureFlags` as overrides, and memoises the answer in `backendCache`. + * A failed lookup resolves to "electric", and that fallback is cached as well, so a + * failing lookup is not retried until the cache entry expires. + */ +async function getRealtimeBackend(organizationId: string): Promise { + const cached = backendCache.get(organizationId); + if (cached !== undefined) { + return cached; + } + + let backend: RealtimeBackend = "electric"; + + try { + const org = await $replica.organization.findFirst({ + where: { id: organizationId }, + select: { featureFlags: true }, + }); + + backend = await flag({ + key: FEATURE_FLAG.realtimeBackend, + defaultValue: "electric", + overrides: (org?.featureFlags as Record) ?? {}, + }); + } catch (error) { + // Never let a flag lookup failure break the realtime feed — fall back to Electric.
+ logger.error("[resolveRealtimeStreamClient] failed to resolve realtimeBackend flag", { + organizationId, + error, + }); + backend = "electric"; + } + + backendCache.set(organizationId, backend); + return backend; +} diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts new file mode 100644 index 00000000000..ba8748c6cf4 --- /dev/null +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -0,0 +1,228 @@ +import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; + +export type RunChangeInput = { + runId: string; + /** + * Optional. The single-run channel is keyed by runId alone; environmentId is + * carried for the per-env channels and metrics. Write sites that don't + * have it cheaply in scope may omit it. + */ + environmentId?: string; + /** Optional monotonic hint; not required since consumers always refetch. */ + version?: number; +}; + +export type RunChangeNotifierOptions = { + redis: RedisWithClusterOptions; + /** Channel name prefix; the runId is appended inside a hash-tag for slot locality. */ + channelPrefix?: string; + connectionName?: string; +}; + +export type RunChangeSubscription = { + /** Resolves the next time a change is published for the subscribed run. */ + changed: Promise; + unsubscribe: () => void; +}; + +const DEFAULT_CHANNEL_PREFIX = "realtime:"; + +/** + * RunChangeNotifier — the single, encapsulated module that carries "run X changed" + * signals from write sites to the realtime feed. + * + * Design constraints baked in here: + * - IDs only on the wire, never row data. Consumers refetch from Postgres. + * - ONE shared, multiplexed subscriber connection per process with a refcounted + * `Map>` (per-run + per-env channels). 
The RunQueue + * pattern, deliberately NOT + * the per-subscribe-connection pattern of ZodPubSub/tracePubSub (which would + * exhaust ElastiCache `maxclients`). + * - Connections are created lazily: a process that never publishes or subscribes + * (the default, flag-off state) opens no Redis connections at all. + * - `publish` is fire-and-forget and never throws; a dropped publish only costs + * latency because the consumer has a timeout backstop. + * + * Channels are hash-tagged (`{}`) so a later move to sharded + * pub/sub (SPUBLISH/SSUBSCRIBE) keeps slot locality without a channel rename. + */ +export class RunChangeNotifier { + #publisher: RedisClient | undefined; + #subscriber: RedisClient | undefined; + readonly #listeners = new Map void>>(); + readonly #channelPrefix: string; + readonly #connectionName: string; + + constructor(private readonly options: RunChangeNotifierOptions) { + this.#channelPrefix = options.channelPrefix ?? DEFAULT_CHANNEL_PREFIX; + this.#connectionName = options.connectionName ?? "trigger:realtime:run-change-notifier"; + } + + /** + * Fire-and-forget publish of a run-changed signal. Never throws. Publishes to + * the per-run channel (single-run feed) and, when environmentId is known, the + * per-env channel (tag/list feed). Payload is the runId so env consumers can + * tell which run moved. IDs only, never row data. 
+ */ + publish(input: RunChangeInput): void { + this.#publishToChannel(this.#channelForRun(input.runId), input.runId); + if (input.environmentId) { + this.#publishToChannel(this.#channelForEnv(input.environmentId), input.runId); + } + } + + #publishToChannel(channel: string, payload: string): void { + try { + const publisher = this.#ensurePublisher(); + const result = publisher.publish(channel, payload); + if (typeof (result as Promise)?.catch === "function") { + (result as Promise).catch((error) => { + logger.debug("[runChangeNotifier] publish failed", { error, channel }); + }); + } + } catch (error) { + logger.debug("[runChangeNotifier] publish threw", { error, channel }); + } + } + + /** Fire-and-forget publish of many run-changed signals. Never throws. */ + publishMany(inputs: RunChangeInput[]): void { + for (const input of inputs) { + this.publish(input); + } + } + + /** + * Subscribe to the next change for a single run (single-run feed). + */ + subscribeToRunChanges(runId: string): RunChangeSubscription { + return this.#subscribe(this.#channelForRun(runId)); + } + + /** + * Subscribe to the next change of ANY run in an environment (tag/list feed). + * The consumer re-resolves its filter on each wake. + */ + subscribeToEnvChanges(environmentId: string): RunChangeSubscription { + return this.#subscribe(this.#channelForEnv(environmentId)); + } + + /** + * Refcounted subscribe over the shared subscriber, keyed by the full channel: + * the first listener for a channel issues SUBSCRIBE, the last one UNSUBSCRIBE. 
+ */ + #subscribe(channel: string): RunChangeSubscription { + const subscriber = this.#ensureSubscriber(); + + let resolveChanged: () => void = () => {}; + const changed = new Promise((resolve) => { + resolveChanged = resolve; + }); + + let listeners = this.#listeners.get(channel); + if (!listeners) { + listeners = new Set(); + this.#listeners.set(channel, listeners); + subscriber.subscribe(channel).catch((error) => { + logger.debug("[runChangeNotifier] subscribe failed", { error, channel }); + }); + } + listeners.add(resolveChanged); + + let unsubscribed = false; + const unsubscribe = () => { + if (unsubscribed) { + return; + } + unsubscribed = true; + + const current = this.#listeners.get(channel); + if (!current) { + return; + } + current.delete(resolveChanged); + if (current.size === 0) { + // Drop the channel from the map only AFTER Redis confirms UNSUBSCRIBE, and + // only if no new listener re-subscribed while it was in flight. The map + // entry's existence mirrors "subscribed (or subscribe in flight) in Redis", + // so the subscribe path safely reuses it without a duplicate SUBSCRIBE. + subscriber + .unsubscribe(channel) + .then(() => { + const latest = this.#listeners.get(channel); + if (!latest) { + return; + } + if (latest.size === 0) { + this.#listeners.delete(channel); + } else { + // A listener arrived during the in-flight UNSUBSCRIBE; the channel is + // now unsubscribed in Redis but has live waiters. Re-subscribe so they + // still receive messages (the long-poll backstop covers the gap). + subscriber.subscribe(channel).catch((error) => { + logger.debug("[runChangeNotifier] resubscribe failed", { error, channel }); + }); + } + }) + .catch((error) => { + // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. + // Keep the (empty) map entry so a future subscriber reuses it without a + // duplicate SUBSCRIBE and #onMessage stays consistent with Redis state. 
+ logger.debug("[runChangeNotifier] unsubscribe failed", { error, channel }); + }); + } + }; + + return { changed, unsubscribe }; + } + + /** Number of distinct channels currently subscribed (for metrics). */ + get activeSubscriptionCount(): number { + return this.#listeners.size; + } + + async quit(): Promise { + await Promise.allSettled([this.#subscriber?.quit(), this.#publisher?.quit()]); + this.#subscriber = undefined; + this.#publisher = undefined; + this.#listeners.clear(); + } + + #ensurePublisher(): RedisClient { + if (!this.#publisher) { + this.#publisher = createRedisClient(`${this.#connectionName}:pub`, this.options.redis); + } + return this.#publisher; + } + + #ensureSubscriber(): RedisClient { + if (!this.#subscriber) { + const subscriber = createRedisClient(`${this.#connectionName}:sub`, this.options.redis); + subscriber.on("message", (channel: string) => this.#onMessage(channel)); + this.#subscriber = subscriber; + } + return this.#subscriber; + } + + #onMessage(channel: string) { + const listeners = this.#listeners.get(channel); + if (!listeners) { + return; + } + // One-shot: each waiter resolves its race and removes itself via unsubscribe(). + for (const resolve of [...listeners]) { + resolve(); + } + } + + // Channels are hash-tagged (`...{}`) so a later move to sharded pub/sub + // keeps slot locality without a rename. 
+ #channelForRun(runId: string): string { + return `${this.#channelPrefix}run:{${runId}}`; + } + + #channelForEnv(environmentId: string): string { + return `${this.#channelPrefix}env:{${environmentId}}`; + } +} diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts new file mode 100644 index 00000000000..791991178e4 --- /dev/null +++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts @@ -0,0 +1,73 @@ +import { env } from "~/env.server"; +import { engine } from "~/v3/runEngine.server"; +import { logger } from "../logger.server"; +import { publishRunChanged } from "./runChangeNotifierInstance.server"; + +/** + * Registers the run-changed delegations as additive listeners on the Run Engine + * 2.0 event bus. All logic lives in the notifier + * module; each listener here is a one-line, fire-and-forget delegate. Because + * they only attach to engine events, they cover V2 runs exclusively (V1/MarQS + * never reach this engine), and they're trivially reversible (delete this file + + * its boot registration). + * + * Coverage is intentionally not exhaustive: a dropped or uncovered transition + * only adds latency because the consumer has a ~5s refetch backstop. We cover the + * high-value, env-cheap transitions here. + */ +export function registerRunChangeNotifierHandlers() { + if (env.REALTIME_NOTIFIER_ENABLED !== "1") { + return; + } + + // Status transitions (checkpoint suspend/resume, pending version, dequeue) — + // environment.id is in the payload. + engine.eventBus.on("runStatusChanged", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + + // Dequeue/lock (sets startedAt) and attempt start (DEQUEUED -> EXECUTING) — the + // most-watched "my run started" transitions. 
+ engine.eventBus.on("runLocked", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + engine.eventBus.on("runAttemptStarted", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + + // Terminal + failure transitions. + engine.eventBus.on("runSucceeded", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + engine.eventBus.on("runFailed", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + engine.eventBus.on("runExpired", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + engine.eventBus.on("runCancelled", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + engine.eventBus.on("runRetryScheduled", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + + // Delay lifecycle (delayUntil / queued-after-delay changes). + engine.eventBus.on("runDelayRescheduled", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + engine.eventBus.on("runEnqueuedAfterDelay", ({ run, environment }) => { + publishRunChanged({ runId: run.id, environmentId: environment.id }); + }); + + // Attempt failures and metadata updates don't carry environmentId, but the + // single-run channel is keyed by runId alone. 
+ engine.eventBus.on("runAttemptFailed", ({ run }) => { + publishRunChanged({ runId: run.id }); + }); + engine.eventBus.on("runMetadataUpdated", ({ run }) => { + publishRunChanged({ runId: run.id }); + }); + + logger.info("[runChangeNotifier] realtime run-change notifier handlers registered"); +} diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts new file mode 100644 index 00000000000..545887abc61 --- /dev/null +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -0,0 +1,73 @@ +import { Gauge } from "prom-client"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { singleton } from "~/utils/singleton"; +import { + RunChangeNotifier, + type RunChangeInput, + type RunChangeSubscription, +} from "./runChangeNotifier.server"; + +/** + * Process-singleton wiring for the RunChangeNotifier plus the thin, gated + * convenience functions that write sites and the realtime route delegate to. + * + * The notifier is constructed lazily (only on the first publish/subscribe when + * enabled), so a webapp running with `REALTIME_NOTIFIER_ENABLED=0` (the default) + * opens no Redis connections and registers no metrics for this subsystem. 
+ */ +const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; + +function initializeRunChangeNotifier(): RunChangeNotifier { + const notifier = new RunChangeNotifier({ + redis: { + host: env.PUBSUB_REDIS_HOST, + port: env.PUBSUB_REDIS_PORT, + username: env.PUBSUB_REDIS_USERNAME, + password: env.PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.PUBSUB_REDIS_TLS_DISABLED === "true", + clusterMode: env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + }); + + new Gauge({ + name: "realtime_run_change_notifier_active_subscriptions", + help: "Distinct runs currently subscribed for realtime change notifications", + collect() { + this.set(notifier.activeSubscriptionCount); + }, + registers: [metricsRegister], + }); + + return notifier; +} + +/** Lazily construct (and memoize) the notifier singleton. */ +export function getRunChangeNotifier(): RunChangeNotifier { + return singleton("runChangeNotifier", initializeRunChangeNotifier); +} + +/** Whether the notifier subsystem is enabled for this process. */ +export function isRunChangeNotifierEnabled(): boolean { + return notifierEnabled; +} + +/** Fire-and-forget run-changed notify. No-op (and no notifier construction) when disabled. */ +export function publishRunChanged(input: RunChangeInput): void { + if (!notifierEnabled) { + return; + } + getRunChangeNotifier().publish(input); +} + +export function publishManyRunChanged(inputs: RunChangeInput[]): void { + if (!notifierEnabled) { + return; + } + getRunChangeNotifier().publishMany(inputs); +} + +/** Subscribe to the next change for a run via the shared subscriber. 
*/ +export function subscribeToRunChanges(runId: string): RunChangeSubscription { + return getRunChangeNotifier().subscribeToRunChanges(runId); +} diff --git a/apps/webapp/app/services/realtime/runReader.server.ts b/apps/webapp/app/services/realtime/runReader.server.ts new file mode 100644 index 00000000000..6fe59c3c059 --- /dev/null +++ b/apps/webapp/app/services/realtime/runReader.server.ts @@ -0,0 +1,191 @@ +import { type Prisma, type PrismaClient } from "@trigger.dev/database"; +import { BoundedTtlCache } from "./boundedTtlCache"; +import { type RealtimeRunRow } from "./electricStreamProtocol.server"; + +/** + * RunReader — the pluggable read half of the notifier-backed realtime feed. + * + * The mandate: ClickHouse is filter-only and resolves IDs, + * Postgres always hydrates row columns. This file owns the Postgres hydration + * half (`RunHydrator`, by-id) and the `RunListResolver` interface (the tag/list + * filter -> id-set seam, implemented over ClickHouse). + * + * Splitting hydration behind this small surface keeps the realtime feed + * decoupled from where runs physically live, ready for a future `TaskRunFast` + * table or a non-Postgres row store. + */ + +/** The TaskRun columns the realtime feed projects (mirrors DEFAULT_ELECTRIC_COLUMNS). 
*/ +export const RUN_HYDRATOR_SELECT = { + id: true, + taskIdentifier: true, + createdAt: true, + updatedAt: true, + startedAt: true, + delayUntil: true, + queuedAt: true, + expiredAt: true, + completedAt: true, + friendlyId: true, + number: true, + isTest: true, + status: true, + usageDurationMs: true, + costInCents: true, + baseCostInCents: true, + ttl: true, + payload: true, + payloadType: true, + metadata: true, + metadataType: true, + output: true, + outputType: true, + runTags: true, + error: true, + realtimeStreams: true, +} satisfies Prisma.TaskRunSelect; + +/** + * Columns the feed needs internally regardless of the client's `skipColumns`: + * `id` keys the row, `updatedAt` drives the offset and the live working-set diff. + * Everything else can be projected away when the client skips it (see + * `buildHydratorSelect`), so the replica doesn't ship large `payload`/`output`/ + * `metadata`/`error` columns the response will drop anyway. + */ +const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt"]); + +/** Project `RUN_HYDRATOR_SELECT` down to the columns the client didn't skip (plus + * the always-needed ones). An empty skip set returns the full select unchanged. */ +export function buildHydratorSelect(skipColumns: string[] = []): Prisma.TaskRunSelect { + if (skipColumns.length === 0) { + return RUN_HYDRATOR_SELECT; + } + const skip = new Set(skipColumns); + const select: Record = {}; + for (const column of Object.keys(RUN_HYDRATOR_SELECT)) { + if (ALWAYS_HYDRATED_COLUMNS.has(column) || !skip.has(column)) { + select[column] = true; + } + } + return select as Prisma.TaskRunSelect; +} + +export type RunListFilter = { + organizationId: string; + projectId: string; + environmentId: string; + /** Contains-ANY tag match (OR). Omit/empty for non-tag feeds. */ + tags?: string[]; + /** Restrict to a single batch (internal batch id) — the batch feed. */ + batchId?: string; + /** Lower bound on createdAt (the tag-list feed pins this; batch omits it). 
*/ + createdAtAfter?: Date; + /** Hard cap on the result set so a broad filter can't unbound the snapshot. */ + limit: number; +}; + +/** + * Resolves a tag/list filter into the matching run id-set, filter-only (no row + * columns; rows are hydrated from Postgres by id afterward). Pluggable so the + * resolution source can change without touching the feed. The ClickHouse + * implementation lives in `clickHouseRunListResolver.server.ts`. + */ +export interface RunListResolver { + resolveMatchingRunIds(filter: RunListFilter): Promise; +} + +export type RunHydratorOptions = { + /** A read-replica Prisma client (`$replica`). Always Postgres. */ + replica: Pick; + /** + * Read-through cache TTL (ms) to collapse duplicate refetches across a burst + * of live polls for the same run. Fan-in is low in practice, so this is + * insurance, not load-bearing. Set to 0 to disable. Defaults to 250ms. + */ + cacheTtlMs?: number; + /** Hard cap on cache entries before expired entries are swept. */ + maxCacheEntries?: number; +}; + +const DEFAULT_CACHE_TTL_MS = 250; +const DEFAULT_MAX_CACHE_ENTRIES = 5_000; + +/** + * Hydrates a single run by id from the read replica, projected to the realtime + * columns. Concurrent refetches for the same (env, run) are single-flighted, and + * a short TTL cache collapses rapid repeats. + */ +export class RunHydrator { + readonly #inflight = new Map>(); + readonly #cache: BoundedTtlCache; + readonly #cacheTtlMs: number; + + constructor(private readonly options: RunHydratorOptions) { + this.#cacheTtlMs = options.cacheTtlMs ?? DEFAULT_CACHE_TTL_MS; + this.#cache = new BoundedTtlCache( + this.#cacheTtlMs, + options.maxCacheEntries ?? DEFAULT_MAX_CACHE_ENTRIES + ); + } + + async getRunById(environmentId: string, runId: string): Promise { + const key = `${environmentId}:${runId}`; + + if (this.#cacheTtlMs > 0) { + // A cached null is a valid "run not found" hit; only undefined is a miss. 
+ const cached = this.#cache.get(key); + if (cached !== undefined) { + return cached; + } + } + + const existing = this.#inflight.get(key); + if (existing) { + return existing; + } + + const promise = this.#fetch(environmentId, runId).finally(() => this.#inflight.delete(key)); + this.#inflight.set(key, promise); + + const row = await promise; + + if (this.#cacheTtlMs > 0) { + this.#cache.set(key, row); + } + + return row; + } + + /** Hydrate many runs by id in one query (tag/list feed). Order is not guaranteed. + * `skipColumns` projects the SELECT so the replica doesn't ship columns the client + * dropped (notably the large `payload`/`output`/`metadata`/`error` columns). */ + async hydrateByIds( + environmentId: string, + ids: string[], + skipColumns: string[] = [] + ): Promise { + if (ids.length === 0) { + return []; + } + const rows = await this.options.replica.taskRun.findMany({ + where: { + runtimeEnvironmentId: environmentId, + id: { in: ids }, + }, + select: buildHydratorSelect(skipColumns), + }); + return rows as unknown as RealtimeRunRow[]; + } + + async #fetch(environmentId: string, runId: string): Promise { + const run = await this.options.replica.taskRun.findFirst({ + where: { + id: runId, + runtimeEnvironmentId: environmentId, + }, + select: RUN_HYDRATOR_SELECT, + }); + + return (run ?? null) as RealtimeRunRow | null; + } +} diff --git a/apps/webapp/app/services/realtime/shadowCompare.server.ts b/apps/webapp/app/services/realtime/shadowCompare.server.ts new file mode 100644 index 00000000000..9a30d93c4da --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowCompare.server.ts @@ -0,0 +1,289 @@ +import { + type ElectricColumnType, + RUN_ELECTRIC_COLUMNS, + serializeRunRow, +} from "./electricStreamProtocol.server"; +import { type RunHydrator, type RunListFilter, type RunListResolver } from "./runReader.server"; + +/** + * Dual-run shadow-compare. 
+ * + * The client is always served the Electric response; in the background this + * re-derives what the notifier path WOULD emit and diffs the two, so we can prove + * parity on real production traffic before any cutover. + * + * Two kinds of divergence are checked: + * - serialization: for each run Electric emitted, re-hydrate it and serialize via + * the notifier serializer, then compare SEMANTICALLY (decode both sides per + * column type) so equivalent-but-differently-encoded wire values (timestamp + * format, bool t/true, number formatting) are not false positives. The compare + * is gated on same-version (matching updatedAt) so a row that changed between + * Electric's emit and our refetch is recorded as "skew", not a divergence. + * - membership (tag/batch initial snapshot only): the set of run ids Electric + * emitted vs the set the notifier resolver returns. This is where the known + * tag OR-vs-AND difference shows up. + * + * Pure except for the injected RunHydrator/RunListResolver, so it's unit-testable. + */ + +export type ShadowFeed = "run" | "runs" | "batch"; + +type WireValue = Record; + +type ShapeMessage = { + key?: string; + value?: WireValue; + headers: { operation?: string; control?: string }; +}; + +const COLUMN_BY_NAME = new Map(RUN_ELECTRIC_COLUMNS.map((column) => [column.name, column])); + +export type ColumnDiff = { + runId: string; + column: string; + electric: string | null; + notifier: string | null; +}; + +export type ShadowCompareOutcome = { + feed: ShadowFeed; + /** Runs whose every emitted column matched (same-version). */ + serializationMatched: number; + /** Runs with at least one semantic column divergence (same-version). */ + serializationDiverged: number; + /** Runs that changed between Electric's emit and our refetch (not a divergence). */ + serializationSkew: number; + /** Per-column divergences (capped) for logging. */ + diffs: ColumnDiff[]; + /** Set membership (tag/batch initial snapshot only). undefined when not checked. 
*/ + membershipMatch?: boolean; + missingInNotifier?: string[]; + extraInNotifier?: string[]; +}; + +export type ShadowCompareInput = { + feed: ShadowFeed; + /** The served Electric response body (a JSON array of messages, or "" / "[]"). */ + electricBody: string; + environment: { id: string }; + skipColumns: string[]; + /** True when this was an initial snapshot request (offset=-1); enables membership compare. */ + isInitialSnapshot: boolean; + /** When set (tag/batch initial snapshot), compare the resolved id-set. */ + membershipFilter?: RunListFilter; +}; + +const MAX_DIFFS = 20; + +export class RealtimeShadowComparator { + constructor( + private readonly options: { runReader: RunHydrator; runListResolver: RunListResolver } + ) {} + + async compare(input: ShadowCompareInput): Promise { + const messages = parseBody(input.electricBody); + const changes = messages.filter( + (m): m is ShapeMessage & { value: WireValue } => + typeof m.headers?.operation === "string" && !!m.value && m.headers.operation !== "delete" + ); + + const outcome: ShadowCompareOutcome = { + feed: input.feed, + serializationMatched: 0, + serializationDiverged: 0, + serializationSkew: 0, + diffs: [], + }; + + for (const message of changes) { + const runId = message.value.id ?? undefined; + if (!runId) { + continue; + } + + const row = await this.options.runReader.getRunById(input.environment.id, runId); + if (!row) { + // Run no longer readable (deleted / replica miss). Not a serialization divergence. + outcome.serializationSkew++; + continue; + } + + const notifierValue = serializeRunRow(row, input.skipColumns); + + // Only compare rows at the same version; otherwise the row advanced between + // Electric's emit and our refetch (timing skew, not a divergence). 
+ if (!sameInstant(message.value.updatedAt, notifierValue.updatedAt)) { + outcome.serializationSkew++; + continue; + } + + let rowDiverged = false; + for (const [column, electricRaw] of Object.entries(message.value)) { + const meta = COLUMN_BY_NAME.get(column); + if (!meta) { + continue; + } + const notifierRaw = notifierValue[column] ?? null; + if (!valuesEqual(electricRaw, notifierRaw, meta.type, meta.dims, column)) { + rowDiverged = true; + if (outcome.diffs.length < MAX_DIFFS) { + outcome.diffs.push({ runId, column, electric: electricRaw, notifier: notifierRaw }); + } + } + } + + if (rowDiverged) { + outcome.serializationDiverged++; + } else { + outcome.serializationMatched++; + } + } + + if (input.isInitialSnapshot && input.membershipFilter) { + const electricIds = new Set( + changes.map((m) => m.value.id).filter((id): id is string => typeof id === "string") + ); + const notifierIds = new Set( + await this.options.runListResolver.resolveMatchingRunIds(input.membershipFilter) + ); + + outcome.missingInNotifier = [...electricIds].filter((id) => !notifierIds.has(id)); + outcome.extraInNotifier = [...notifierIds].filter((id) => !electricIds.has(id)); + outcome.membershipMatch = + outcome.missingInNotifier.length === 0 && outcome.extraInNotifier.length === 0; + } + + return outcome; + } +} + +function parseBody(body: string): ShapeMessage[] { + const text = body.trim(); + if (!text) { + return []; + } + try { + const parsed = JSON.parse(text); + return Array.isArray(parsed) ? (parsed as ShapeMessage[]) : []; + } catch { + return []; + } +} + +/** Status carries a known legacy rewrite (DEQUEUED -> EXECUTING) applied equally to + * both paths for non-current API versions; treat them as equivalent. */ +function normalizeStatus(value: string): string { + return value === "DEQUEUED" ? 
"EXECUTING" : value; +} + +function sameInstant(a: string | null | undefined, b: string | null | undefined): boolean { + if (a == null || b == null) { + return a == null && b == null; + } + // Mirror the SDK's RawShapeDate (`new Date(val + "Z")`). + return new Date(`${a}Z`).getTime() === new Date(`${b}Z`).getTime(); +} + +function valuesEqual( + electricRaw: string | null, + notifierRaw: string | null, + type: ElectricColumnType, + dims: number | undefined, + column: string +): boolean { + if (electricRaw == null || notifierRaw == null) { + return electricRaw == null && notifierRaw == null; + } + + if (dims && dims > 0) { + return arraysEqual(parsePgTextArray(electricRaw), parsePgTextArray(notifierRaw)); + } + + switch (type) { + case "timestamp": + return new Date(`${electricRaw}Z`).getTime() === new Date(`${notifierRaw}Z`).getTime(); + case "bool": + return parseBool(electricRaw) === parseBool(notifierRaw); + case "int4": + case "int8": + case "float8": + return Number(electricRaw) === Number(notifierRaw); + case "jsonb": + return jsonEqual(electricRaw, notifierRaw); + case "text": + default: + if (column === "status") { + return normalizeStatus(electricRaw) === normalizeStatus(notifierRaw); + } + return electricRaw === notifierRaw; + } +} + +function parseBool(value: string): boolean { + return value === "t" || value === "true"; +} + +function jsonEqual(a: string, b: string): boolean { + try { + return deepEqual(JSON.parse(a), JSON.parse(b)); + } catch { + return a === b; + } +} + +function deepEqual(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (typeof a !== typeof b || a === null || b === null) return false; + if (Array.isArray(a) && Array.isArray(b)) { + return a.length === b.length && a.every((v, i) => deepEqual(v, b[i])); + } + if (typeof a === "object" && typeof b === "object") { + const ak = Object.keys(a as object).sort(); + const bk = Object.keys(b as object).sort(); + return ( + ak.length === bk.length && + ak.every((k, i) => k 
=== bk[i]) && + ak.every((k) => deepEqual((a as any)[k], (b as any)[k])) + ); + } + return false; +} + +function arraysEqual(a: string[], b: string[]): boolean { + return a.length === b.length && a.every((v, i) => v === b[i]); +} + +/** Parse a Postgres text-array literal (`{"a","b"}` / `{}`). Mirrors the client's pgArrayParser. */ +function parsePgTextArray(literal: string): string[] { + if (literal === "{}" || literal === "") { + return []; + } + const inner = literal.startsWith("{") && literal.endsWith("}") ? literal.slice(1, -1) : literal; + const result: string[] = []; + let i = 0; + while (i < inner.length) { + if (inner[i] === '"') { + i++; + let s = ""; + while (i < inner.length && inner[i] !== '"') { + if (inner[i] === "\\") { + i++; + } + s += inner[i]; + i++; + } + result.push(s); + i++; + if (inner[i] === ",") i++; + } else { + let s = ""; + while (i < inner.length && inner[i] !== ",") { + s += inner[i]; + i++; + } + result.push(s); + if (inner[i] === ",") i++; + } + } + return result; +} diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts new file mode 100644 index 00000000000..1ddf162fd87 --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts @@ -0,0 +1,192 @@ +import { API_VERSIONS } from "~/api/versions"; +import { logger } from "../logger.server"; +import { + type RealtimeEnvironment, + type RealtimeRequestOptions, + type RealtimeRunsParams, +} from "../realtimeClient.server"; +import { RESERVED_COLUMNS } from "./electricStreamProtocol.server"; +import { + type RealtimeListEnvironment, + type RealtimeStreamClient, +} from "./notifierRealtimeClient.server"; +import { type RunListFilter } from "./runReader.server"; +import { + type RealtimeShadowComparator, + type ShadowCompareOutcome, + type ShadowFeed, +} from "./shadowCompare.server"; + +export type ShadowRealtimeClientOptions = { + /** The path actually served to the client 
(Electric). */ + electric: RealtimeStreamClient; + comparator: RealtimeShadowComparator; + /** createdAt window (ms) used to resolve tag-list membership for the compare. */ + maximumCreatedAtFilterAgeMs: number; + /** Cap for the membership resolve. */ + maxListResults: number; + /** Metrics sink for compare outcomes. */ + onOutcome?: (outcome: ShadowCompareOutcome) => void; +}; + +/** + * Dual-run gate: a transparent wrapper that serves the Electric + * response unchanged and, in the background, diffs what the notifier path would emit + * against it. The shadow work is fire-and-forget — it never blocks or fails the + * client's request — and it exercises the read replica so the notifier's real load + * can be measured before cutover. + */ +export class ShadowRealtimeClient implements RealtimeStreamClient { + constructor(private readonly options: ShadowRealtimeClientOptions) {} + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamRun( + url, + environment, + runId, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("run", response, url, environment, requestOptions); + return response; + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamRuns( + url, + environment, + params, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("runs", response, url, environment, requestOptions, { tags: params.tags ?? 
[] }); + return response; + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise<Response> { + const response = await this.options.electric.streamBatch( + url, + environment, + batchId, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("batch", response, url, environment, requestOptions, { batchId }); + return response; + } + + /** Fire-and-forget; never blocks the served response, never throws into the request. */ + #shadow( + feed: ShadowFeed, + electricResponse: Response, + url: URL | string, + environment: RealtimeEnvironment & { projectId?: string }, + requestOptions?: RealtimeRequestOptions, + membership?: { tags?: string[]; batchId?: string } + ): void { + // Clone synchronously before the client consumes the body. + let bodyClone: Response; + try { + if (electricResponse.status !== 200) { + return; + } + bodyClone = electricResponse.clone(); + } catch { + return; + } + + void this.#runShadow(feed, bodyClone, url, environment, requestOptions, membership).catch( + (error) => logger.debug("[shadowRealtime] compare failed", { feed, error }) + ); + } + + async #runShadow( + feed: ShadowFeed, + bodyClone: Response, + url: URL | string, + environment: RealtimeEnvironment & { projectId?: string }, + requestOptions: RealtimeRequestOptions | undefined, + membership: { tags?: string[]; batchId?: string } | undefined + ): Promise<void> { + const $url = new URL(url.toString()); + const offset = $url.searchParams.get("offset") ?? "-1"; + const handle = $url.searchParams.get("handle") ?? 
$url.searchParams.get("shape_id"); + const isInitialSnapshot = offset === "-1" || !handle; + const skipColumns = resolveSkipColumns($url, requestOptions); + const electricBody = await bodyClone.text(); + + let membershipFilter: RunListFilter | undefined; + if (isInitialSnapshot && membership && environment.projectId) { + membershipFilter = { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + tags: membership.tags, + batchId: membership.batchId, + createdAtAfter: membership.batchId + ? undefined + : new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs), + limit: this.options.maxListResults, + }; + } + + const outcome = await this.options.comparator.compare({ + feed, + electricBody, + environment: { id: environment.id }, + skipColumns, + isInitialSnapshot, + membershipFilter, + }); + + this.options.onOutcome?.(outcome); + + if (outcome.serializationDiverged > 0 || outcome.membershipMatch === false) { + logger.warn("[shadowRealtime] divergence detected", { + feed, + serializationDiverged: outcome.serializationDiverged, + serializationMatched: outcome.serializationMatched, + serializationSkew: outcome.serializationSkew, + membershipMatch: outcome.membershipMatch, + missingInNotifier: outcome.missingInNotifier?.slice(0, 20), + extraInNotifier: outcome.extraInNotifier?.slice(0, 20), + diffs: outcome.diffs, + }); + } + } +} + +function resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] { + const raw = requestOptions?.skipColumns ?? url.searchParams.get("skipColumns")?.split(",") ?? 
[]; + return raw.map((c) => c.trim()).filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)); +} diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts new file mode 100644 index 00000000000..36ce0a4325b --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts @@ -0,0 +1,66 @@ +import { Counter } from "prom-client"; +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { singleton } from "~/utils/singleton"; +import { realtimeClient } from "../realtimeClientGlobal.server"; +import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { RunHydrator } from "./runReader.server"; +import { RealtimeShadowComparator } from "./shadowCompare.server"; +import { ShadowRealtimeClient } from "./shadowRealtimeClient.server"; + +/** + * Process-singleton wiring for the shadow-compare client. Only constructed + * when an org's `realtimeBackend` flag is set to "shadow". + */ +function initializeShadowRealtimeClient(): ShadowRealtimeClient { + const compares = new Counter({ + name: "realtime_shadow_compare_total", + help: "Dual-run shadow-compare outcomes (Electric vs notifier). 
kind=serialization|membership, result=match|diverge|skew.", + labelNames: ["feed", "kind", "result"] as const, + registers: [metricsRegister], + }); + + const comparator = new RealtimeShadowComparator({ + runReader: new RunHydrator({ replica: $replica }), + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + prisma: $replica, + }), + }); + + return new ShadowRealtimeClient({ + electric: realtimeClient, + comparator, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, + onOutcome: (outcome) => { + const { feed } = outcome; + if (outcome.serializationMatched) { + compares.inc({ feed, kind: "serialization", result: "match" }, outcome.serializationMatched); + } + if (outcome.serializationDiverged) { + compares.inc( + { feed, kind: "serialization", result: "diverge" }, + outcome.serializationDiverged + ); + } + if (outcome.serializationSkew) { + compares.inc({ feed, kind: "serialization", result: "skew" }, outcome.serializationSkew); + } + if (outcome.membershipMatch !== undefined) { + compares.inc({ + feed, + kind: "membership", + result: outcome.membershipMatch ? 
"match" : "diverge", + }); + } + }, + }); +} + +export function getShadowRealtimeClient(): ShadowRealtimeClient { + return singleton("shadowRealtimeClient", initializeShadowRealtimeClient); +} diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 9a5d75cfe25..55b30a8396e 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -10,6 +10,7 @@ export const FEATURE_FLAG = { hasPrivateConnections: "hasPrivateConnections", mollifierEnabled: "mollifierEnabled", workerQueueScheduledSplitEnabled: "workerQueueScheduledSplitEnabled", + realtimeBackend: "realtimeBackend", } as const; export const FeatureFlagCatalog = { @@ -22,6 +23,10 @@ export const FeatureFlagCatalog = { [FEATURE_FLAG.hasPrivateConnections]: z.coerce.boolean(), [FEATURE_FLAG.mollifierEnabled]: z.coerce.boolean(), [FEATURE_FLAG.workerQueueScheduledSplitEnabled]: z.coerce.boolean(), + // Which backend serves the realtime run feed. Controllable + // globally and per-org (org wins). Defaults to "electric" when unset. + // "shadow" serves Electric but diffs the notifier path in the background. 
+ [FEATURE_FLAG.realtimeBackend]: z.enum(["electric", "notifier", "shadow"]), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/test/realtime/boundedTtlCache.test.ts b/apps/webapp/test/realtime/boundedTtlCache.test.ts new file mode 100644 index 00000000000..e487798750e --- /dev/null +++ b/apps/webapp/test/realtime/boundedTtlCache.test.ts @@ -0,0 +1,41 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; + +describe("BoundedTtlCache", () => { + afterEach(() => { + vi.useRealTimers(); + }); + + it("returns a live entry within its TTL", () => { + vi.useFakeTimers(); + const cache = new BoundedTtlCache(1_000, 100); + cache.set("k", "v"); + vi.advanceTimersByTime(500); + expect(cache.get("k")).toBe("v"); + expect(cache.size).toBe(1); + }); + + it("evicts an expired entry on read instead of letting it linger", () => { + vi.useFakeTimers(); + const cache = new BoundedTtlCache(1_000, 100); + cache.set("a", 1); + expect(cache.size).toBe(1); + + vi.advanceTimersByTime(1_001); + expect(cache.get("a")).toBeUndefined(); + // The previous bug left expired entries in the map until an at-capacity sweep; + // they must now be removed on read. 
+ expect(cache.size).toBe(0); + }); + + it("drops the oldest entry when full of still-live entries", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + cache.set("c", 3); // over capacity, none expired -> evict oldest insertion (a) + expect(cache.get("a")).toBeUndefined(); + expect(cache.get("b")).toBe(2); + expect(cache.get("c")).toBe(3); + expect(cache.size).toBe(2); + }); +}); diff --git a/apps/webapp/test/realtime/electricStreamProtocol.test.ts b/apps/webapp/test/realtime/electricStreamProtocol.test.ts new file mode 100644 index 00000000000..a48f4f9f8e8 --- /dev/null +++ b/apps/webapp/test/realtime/electricStreamProtocol.test.ts @@ -0,0 +1,304 @@ +import { SubscribeRunRawShape } from "@trigger.dev/core/v3/schemas"; +import { describe, expect, it } from "vitest"; +import { + buildElectricSchemaHeader, + buildRowsBody, + buildSnapshotBody, + buildUpdateBody, + buildUpToDateBody, + encodeOffset, + parseOffsetUpdatedAtMs, + type RealtimeRunRow, + rewriteBodyForLegacyApiVersion, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; + +function sampleRow(overrides: Partial<RealtimeRunRow> = {}): RealtimeRunRow { + return { + id: "run_abc123", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-06T10:00:00.000Z"), + updatedAt: new Date("2026-06-06T10:05:30.123Z"), + startedAt: new Date("2026-06-06T10:01:00.000Z"), + delayUntil: null, + queuedAt: new Date("2026-06-06T10:00:30.000Z"), + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_abc", + number: 42, + isTest: true, + status: "EXECUTING", + usageDurationMs: 1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: '{"step":1}', + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["user:123", "env:prod"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +/** + * Faithful re-implementation of 
the @electric-sql/client value parser rules + * (defaultParser + pgArrayParser), so we can decode our wire `value` object the + * same way the deployed client would, then validate against the real SDK schema. + * Source: @electric-sql/client@1.0.14 src/parser.ts. + */ +function electricParse( + value: Record<string, string | null>, + schema: Record<string, { type: string; dims?: number }> +): Record<string, unknown> { + const out: Record<string, unknown> = {}; + for (const [key, raw] of Object.entries(value)) { + if (raw === null) { + out[key] = null; + continue; + } + const info = schema[key]; + if (!info) { + out[key] = raw; + continue; + } + if (info.dims && info.dims > 0) { + out[key] = parsePgTextArray(raw); + continue; + } + switch (info.type) { + case "bool": + out[key] = raw === "t" || raw === "true"; + break; + case "int8": + out[key] = BigInt(raw); + break; + case "int2": + case "int4": + case "float4": + case "float8": + out[key] = Number(raw); + break; + case "json": + case "jsonb": + out[key] = JSON.parse(raw); + break; + default: + out[key] = raw; // text/timestamp pass through as strings + } + } + return out; +} + +function parsePgTextArray(literal: string): string[] { + if (literal === "{}") { + return []; + } + const inner = literal.slice(1, -1); + const result: string[] = []; + let i = 0; + while (i < inner.length) { + if (inner[i] === '"') { + i++; + let s = ""; + while (i < inner.length && inner[i] !== '"') { + if (inner[i] === "\\") { + i++; + } + s += inner[i]; + i++; + } + result.push(s); + i++; // closing quote + if (inner[i] === ",") i++; + } else { + let s = ""; + while (i < inner.length && inner[i] !== ",") { + s += inner[i]; + i++; + } + result.push(s); + if (inner[i] === ",") i++; + } + } + return result; +} + +describe("electricStreamProtocol serializer", () => { + it("encodes each Postgres type the way the Electric client expects", () => { + const value = serializeRunRow(sampleRow()); + + // text: passed through as-is + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + 
expect(value.payload).toBe('{"hello":"world"}'); + + // int/float: stringified + expect(value.number).toBe("42"); + expect(value.usageDurationMs).toBe("1234"); + expect(value.costInCents).toBe("0.55"); + + // bool: postgres "t"/"f" + expect(value.isTest).toBe("t"); + + // timestamp: ISO without trailing Z (the SDK appends Z before parsing) + expect(value.updatedAt).toBe("2026-06-06T10:05:30.123"); + expect(value.createdAt).toBe("2026-06-06T10:00:00.000"); + + // nullable timestamp: null stays null + expect(value.delayUntil).toBeNull(); + expect(value.completedAt).toBeNull(); + + // text[]: quoted pg array literal; empty realtimeStreams (@default([])) => {} + expect(value.runTags).toBe('{"user:123","env:prod"}'); + expect(value.realtimeStreams).toBe("{}"); + + // jsonb: null stays null + expect(value.error).toBeNull(); + }); + + it("encodes an empty no-default array column (runTags) as null, matching Electric", () => { + // runTags has no Postgres default, so an empty value is stored as SQL NULL and + // Electric emits `null` (not `{}`). realtimeStreams has @default([]), so its + // empty value is `{}`. Prisma hands us `[]` for both; we re-derive the wire form. 
+ const value = serializeRunRow(sampleRow({ runTags: [], realtimeStreams: [] })); + expect(value.runTags).toBeNull(); + expect(value.realtimeStreams).toBe("{}"); + }); + + it("encodes jsonb error as a JSON string", () => { + const value = serializeRunRow(sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } })); + expect(value.error).toBe('{"type":"STRING_ERROR","raw":"boom"}'); + }); + + it("round-trips through the client parser into a valid SubscribeRunRawShape", () => { + const row = sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } }); + const value = serializeRunRow(row); + const schema = JSON.parse(buildElectricSchemaHeader()); + + const decoded = electricParse(value, schema); + const parsed = SubscribeRunRawShape.parse(decoded); + + expect(parsed.id).toBe("run_abc123"); + expect(parsed.friendlyId).toBe("run_friendly_abc"); + expect(parsed.status).toBe("EXECUTING"); + expect(parsed.number).toBe(42); + expect(parsed.isTest).toBe(true); + expect(parsed.usageDurationMs).toBe(1234); + expect(parsed.costInCents).toBeCloseTo(0.55); + expect(parsed.runTags).toEqual(["user:123", "env:prod"]); + expect(parsed.realtimeStreams).toEqual([]); + // RawShapeDate appends "Z" and coerces to a Date equal to the source instant. + expect(parsed.createdAt.toISOString()).toBe("2026-06-06T10:00:00.000Z"); + expect(parsed.updatedAt.toISOString()).toBe("2026-06-06T10:05:30.123Z"); + expect(parsed.startedAt?.toISOString()).toBe("2026-06-06T10:01:00.000Z"); + expect(parsed.delayUntil ?? 
null).toBeNull(); + expect(parsed.error).toEqual({ type: "STRING_ERROR", raw: "boom" }); + }); + + it("honors skipColumns (but never the reserved columns)", () => { + const value = serializeRunRow(sampleRow(), ["payload", "output", "id", "status"]); + expect(value.payload).toBeUndefined(); + expect(value.output).toBeUndefined(); + // reserved columns can't be skipped + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + + const schema = JSON.parse(buildElectricSchemaHeader(["payload"])); + expect(schema.payload).toBeUndefined(); + expect(schema.status).toBeDefined(); + }); +}); + +describe("electricStreamProtocol message bodies", () => { + it("emits insert + up-to-date for an initial snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(sampleRow())); + expect(messages).toHaveLength(2); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_abc123"'); + expect(messages[0].value.status).toBe("EXECUTING"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty (missing) run snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(null)); + expect(messages).toHaveLength(1); + expect(messages[0].headers.control).toBe("up-to-date"); + }); + + it("emits update + up-to-date for a live change", () => { + const messages = JSON.parse(buildUpdateBody(sampleRow())); + expect(messages[0].headers.operation).toBe("update"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date when nothing advanced", () => { + const messages = JSON.parse(buildUpToDateBody()); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("uses the same merge key across insert and update so the client merges by row", () => { + const insert = JSON.parse(buildSnapshotBody(sampleRow()))[0]; + const update = JSON.parse(buildUpdateBody(sampleRow()))[0]; + 
expect(insert.key).toBe(update.key); + }); +}); + +describe("electricStreamProtocol multi-row (tag-list) bodies", () => { + it("emits one change message per row with per-row operation, then up-to-date", () => { + const a = sampleRow({ id: "run_a" }); + const b = sampleRow({ id: "run_b", status: "QUEUED" }); + const messages = JSON.parse( + buildRowsBody([ + { row: a, operation: "insert" }, + { row: b, operation: "update" }, + ]) + ); + expect(messages).toHaveLength(3); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_a"'); + expect(messages[1].headers.operation).toBe("update"); + expect(messages[1].key).toBe('"public"."TaskRun"/"run_b"'); + expect(messages[1].value.status).toBe("QUEUED"); + expect(messages[2].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty change set", () => { + const messages = JSON.parse(buildRowsBody([])); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("honors skipColumns across all rows", () => { + const messages = JSON.parse( + buildRowsBody([{ row: sampleRow(), operation: "insert" }], ["payload"]) + ); + expect(messages[0].value.payload).toBeUndefined(); + expect(messages[0].value.status).toBe("EXECUTING"); + }); +}); + +describe("electricStreamProtocol tokens + legacy rewrite", () => { + it("encodes and parses the offset updatedAt segment", () => { + const offset = encodeOffset(1717667130123, 7); + expect(offset).toBe("1717667130123_7"); + expect(parseOffsetUpdatedAtMs(offset)).toBe(1717667130123); + }); + + it("treats the initial offset (-1) and garbage as zero", () => { + expect(parseOffsetUpdatedAtMs("-1")).toBe(0); + expect(parseOffsetUpdatedAtMs(null)).toBe(0); + expect(parseOffsetUpdatedAtMs("nonsense")).toBe(0); + }); + + it("rewrites DEQUEUED to EXECUTING for legacy API versions", () => { + const body = buildUpdateBody(sampleRow({ status: "DEQUEUED" })); + 
expect(body).toContain('"status":"DEQUEUED"'); + const rewritten = rewriteBodyForLegacyApiVersion(body); + expect(rewritten).not.toContain('"status":"DEQUEUED"'); + expect(rewritten).toContain('"status":"EXECUTING"'); + }); +}); diff --git a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts new file mode 100644 index 00000000000..fb3349e0e62 --- /dev/null +++ b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts @@ -0,0 +1,107 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(): RealtimeRunRow { + return { + id: "run_1", + taskIdentifier: "t", + createdAt: new Date("2026-06-07T10:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:01.000Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_1", + number: 1, + isTest: false, + status: "EXECUTING", + usageDurationMs: 0, + costInCents: 0, + baseCostInCents: 0, + ttl: null, + payload: "{}", + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: [], + error: null, + realtimeStreams: [], + }; +} + +// Only the initial-snapshot path is exercised here, which touches the shared +// #buildResponse — enough to lock the response-header contract. +function makeClient(row: RealtimeRunRow | null) { + const never = { changed: new Promise(() => {}), unsubscribe() {} }; + return new NotifierRealtimeClient({ + runReader: { + getRunById: async () => row, + hydrateByIds: async () => (row ? 
[row] : []), + } as any, + runListResolver: { resolveMatchingRunIds: async () => [] } as any, + notifier: { + subscribeToRunChanges: () => never, + subscribeToEnvChanges: () => never, + } as any, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + }); +} + +const ENV: RealtimeListEnvironment = { + id: "env_1", + organizationId: "org_1", + projectId: "proj_1", +}; + +describe("NotifierRealtimeClient response headers", () => { + it("exposes electric headers cross-origin so browser hooks can read them", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + "1.0.0-beta.1" // modern client => lowercase electric-* headers + ); + + // Without these the deployed @electric-sql/client throws MissingHeadersError + // (it can't read the electric-* headers across origins). This regressed once. + expect(res.headers.get("access-control-allow-origin")).toBe("*"); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + + // Initial (non-live) snapshot requires offset + handle + schema. 
+ expect(res.headers.get("electric-offset")).toBeTruthy(); + expect(res.headers.get("electric-handle")).toBeTruthy(); + expect(res.headers.get("electric-schema")).toBeTruthy(); + expect(res.headers.get("content-type")).toBe("application/json"); + }); + + it("renames headers for legacy (0.4.0) clients", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + undefined // no client version => legacy header names + ); + + expect(res.headers.get("electric-chunk-last-offset")).toBeTruthy(); + expect(res.headers.get("electric-shape-id")).toBeTruthy(); + expect(res.headers.get("electric-offset")).toBeNull(); + expect(res.headers.get("electric-handle")).toBeNull(); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + }); +}); diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts new file mode 100644 index 00000000000..a0beb0fd728 --- /dev/null +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -0,0 +1,173 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +function row(id: string): RealtimeRunRow { + // Only id/createdAt/updatedAt are read directly; the rest serialize to null. 
+ return { + id, + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:00.000Z"), + } as unknown as RealtimeRunRow; +} + +function makeClient(overrides: Record<string, unknown> = {}) { + const resolveSpy = vi.fn(async () => ["run_1", "run_2"]); + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + const never = { changed: new Promise(() => {}), unsubscribe() {} }; + + const client = new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + notifier: { subscribeToRunChanges: () => never, subscribeToEnvChanges: () => never } as any, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 5_000, + ...overrides, + }); + + return { client, resolveSpy, hydrateSpy }; +} + +// streamBatch with offset=-1 takes the snapshot path, which calls the coalescing +// resolve+hydrate directly (no concurrency slot / subscription needed). +function snapshot(client: NotifierRealtimeClient, batchId: string, skipColumns?: string) { + const skip = skipColumns ? `&skipColumns=${skipColumns}` : ""; + return client.streamBatch( + `http://localhost:3030/realtime/v1/batches/${batchId}?offset=-1${skip}`, + ENV, + batchId, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +// Tag-list snapshot (offset=-1) — exercises the createdAt bucketing + cache key. 
+function snapshotTag(client: NotifierRealtimeClient, tags: string[]) { + return client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=-1", + ENV, + { tags }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { + it("coalesces concurrent same-filter resolves into one ClickHouse + Postgres query", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + let release!: (ids: string[]) => void; + const gate = new Promise<string[]>((resolve) => { + release = resolve; + }); + resolveSpy.mockReturnValueOnce(gate); + + const p1 = snapshot(client, "batch_1"); + const p2 = snapshot(client, "batch_1"); + release(["run_1"]); + await Promise.all([p1, p2]); + + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("serves a second same-filter request from the cache within the TTL", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share the cache across different filters", async () => { + const { client, resolveSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_2"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("re-queries after the cache TTL expires", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + try { + const { client, resolveSpy } = makeClient({ runSetResolveCacheTtlMs: 1_000 }); + await snapshot(client, "batch_1"); + vi.advanceTimersByTime(1_001); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + } finally { + vi.useRealTimers(); + } + }); + + it("passes the client's skipColumns through to the hydrator (column projection)", async () => { + const { client, hydrateSpy } = makeClient(); + await snapshot(client, 
"batch_1", "payload,output"); + expect(hydrateSpy).toHaveBeenCalledWith("env_1", expect.any(Array), ["payload", "output"]); + }); + + it("reports resolve outcomes (miss then hit) to the metrics hook", async () => { + const results: string[] = []; + const { client } = makeClient({ onRunSetResolve: (r: string) => results.push(r) }); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(results).toEqual(["miss", "hit"]); + }); +}); + +describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { + it("floors the resolved createdAt lower bound to the bucket boundary", async () => { + // Fix the clock to a non-bucket-aligned instant so the assertion is deterministic. + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60_000 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + expect(passed.getTime() % 60_000).toBe(0); + } finally { + vi.useRealTimers(); + } + }); + + it("lets two same-tag feeds in the same bucket share one resolve", async () => { + // A large bucket guarantees both windows floor to the same boundary regardless of + // the sub-millisecond gap between the two calls. 
+ const { client, resolveSpy, hydrateSpy } = makeClient({ + runSetCreatedAtBucketMs: 60 * 60_000, + }); + await snapshotTag(client, ["critical"]); + await snapshotTag(client, ["critical"]); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share across different tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["critical"]); + await snapshotTag(client, ["debug"]); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("keeps each feed's exact lower bound when bucketing is disabled (0)", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 0 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Exact (now - 24h) lower bound, not floored to a 60s boundary. + expect(passed.getTime() % 60_000).not.toBe(0); + } finally { + vi.useRealTimers(); + } + }); +}); diff --git a/apps/webapp/test/realtime/runChangeNotifier.test.ts b/apps/webapp/test/realtime/runChangeNotifier.test.ts new file mode 100644 index 00000000000..7459c9f5df5 --- /dev/null +++ b/apps/webapp/test/realtime/runChangeNotifier.test.ts @@ -0,0 +1,211 @@ +import { redisTest } from "@internal/testcontainers"; +import { setTimeout as sleep } from "node:timers/promises"; +import { describe, expect, vi } from "vitest"; +import { RunChangeNotifier } from "~/services/realtime/runChangeNotifier.server"; + +function toRedisOptions(redisOptions: { host?: string; port?: number; password?: string }) { + return { + host: redisOptions.host, + port: redisOptions.port, + password: redisOptions.password, + tlsDisabled: true, + clusterMode: false, + }; +} + +// Time for a SUBSCRIBE to register server-side before we publish. 
+const SUBSCRIBE_SETTLE_MS = 250; + +describe("RunChangeNotifier", () => { + redisTest( + "delivers a published change to a subscriber", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const subscription = notifier.subscribeToRunChanges("run_1"); + expect(notifier.activeSubscriptionCount).toBe(1); + + let resolved = false; + void subscription.changed.then(() => { + resolved = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1" }); + + await vi.waitFor(() => expect(resolved).toBe(true), { timeout: 5_000, interval: 50 }); + + subscription.unsubscribe(); + // Cleanup is deferred until Redis confirms UNSUBSCRIBE (avoids a + // subscribe/unsubscribe race), so the count converges to 0 asynchronously. + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "does not wake a subscriber for a different run", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const subscription = notifier.subscribeToRunChanges("run_a"); + let resolved = false; + void subscription.changed.then(() => { + resolved = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_b" }); + await sleep(500); + + expect(resolved).toBe(false); + subscription.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "refcounts subscriptions per run and wakes all waiters", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const first = notifier.subscribeToRunChanges("run_x"); + const second = notifier.subscribeToRunChanges("run_x"); + + // Two waiters, one distinct channel. 
+ expect(notifier.activeSubscriptionCount).toBe(1); + + let firstResolved = false; + let secondResolved = false; + void first.changed.then(() => (firstResolved = true)); + void second.changed.then(() => (secondResolved = true)); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_x" }); + + await vi.waitFor(() => expect(firstResolved && secondResolved).toBe(true), { + timeout: 5_000, + interval: 50, + }); + + // Channel stays until the last waiter unsubscribes. Dropping one waiter only + // shrinks the listener set (no UNSUBSCRIBE), so the count is still 1 synchronously. + first.unsubscribe(); + expect(notifier.activeSubscriptionCount).toBe(1); + // The last unsubscribe issues UNSUBSCRIBE; the channel is dropped once Redis confirms. + second.unsubscribe(); + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "publish with no subscribers is a harmless no-op", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + expect(() => notifier.publish({ runId: "nobody_listening" })).not.toThrow(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "wakes an env subscriber when a run in that env changes (tag-list feed)", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const envSub = notifier.subscribeToEnvChanges("env_1"); + let envWoke = false; + void envSub.changed.then(() => { + envWoke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + // A run change WITH an environmentId fans out to the per-env channel. 
+ notifier.publish({ runId: "run_1", environmentId: "env_1" }); + + await vi.waitFor(() => expect(envWoke).toBe(true), { timeout: 5_000, interval: 50 }); + envSub.unsubscribe(); + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "does not wake an env subscriber for a different env, nor when env is omitted", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const envSub = notifier.subscribeToEnvChanges("env_a"); + let envWoke = false; + void envSub.changed.then(() => { + envWoke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", environmentId: "env_b" }); // different env + notifier.publish({ runId: "run_2" }); // no env -> per-run channel only + await sleep(500); + + expect(envWoke).toBe(false); + envSub.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "re-subscribing right after the last unsubscribe still delivers", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const first = notifier.subscribeToRunChanges("run_race"); + await sleep(SUBSCRIBE_SETTLE_MS); + + // Drop the last waiter (issues UNSUBSCRIBE) and immediately re-subscribe before + // it can settle. The channel must end up subscribed so the new waiter wakes. 
+ first.unsubscribe(); + const second = notifier.subscribeToRunChanges("run_race"); + let woke = false; + void second.changed.then(() => { + woke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_race" }); + + await vi.waitFor(() => expect(woke).toBe(true), { timeout: 5_000, interval: 50 }); + second.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); +}); diff --git a/apps/webapp/test/realtime/runReaderProjection.test.ts b/apps/webapp/test/realtime/runReaderProjection.test.ts new file mode 100644 index 00000000000..22ba3ac72fa --- /dev/null +++ b/apps/webapp/test/realtime/runReaderProjection.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, it, vi } from "vitest"; +import { buildHydratorSelect, RunHydrator } from "~/services/realtime/runReader.server"; + +describe("buildHydratorSelect", () => { + it("returns the full select when nothing is skipped", () => { + const select = buildHydratorSelect([]); + expect(select.id).toBe(true); + expect(select.payload).toBe(true); + expect(select.output).toBe(true); + expect(select.metadata).toBe(true); + expect(select.error).toBe(true); + }); + + it("drops skipped columns but always keeps id + updatedAt", () => { + const select = buildHydratorSelect(["payload", "output", "metadata", "error"]); + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.metadata).toBeUndefined(); + expect(select.error).toBeUndefined(); + // Needed internally regardless of skipColumns (keys the row, drives the diff/offset). + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + // A non-skipped column survives. 
+ expect(select.status).toBe(true); + }); +}); + +describe("RunHydrator.hydrateByIds column projection", () => { + function makeHydrator() { + let capturedSelect: Record<string, boolean> | undefined; + const replica = { + taskRun: { + findMany: vi.fn(async ({ select }: { select: Record<string, boolean> }) => { + capturedSelect = select; + return []; + }), + }, + } as any; + return { hydrator: new RunHydrator({ replica }), getSelect: () => capturedSelect }; + } + + it("projects the SELECT by skipColumns", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"], ["payload", "output"]); + const select = getSelect()!; + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + }); + + it("selects the full column set when no skipColumns are given", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"]); + expect(getSelect()!.payload).toBe(true); + }); +}); diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts new file mode 100644 index 00000000000..31bb527589f --- /dev/null +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -0,0 +1,212 @@ +import { + type RealtimeRunRow, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; +import { type RunListFilter } from "~/services/realtime/runReader.server"; +import { RealtimeShadowComparator } from "~/services/realtime/shadowCompare.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(overrides: Partial<RealtimeRunRow> = {}): RealtimeRunRow { + return { + id: "run_a", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:05:30.123Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_a", + number: 7, + isTest: true, +
status: "EXECUTING", + usageDurationMs: 1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["a", "b"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +const UP_TO_DATE = { headers: { control: "up-to-date" } }; + +function insert(value: Record<string, unknown>) { + return { key: `"public"."TaskRun"/"${value.id}"`, value, headers: { operation: "insert" } }; +} + +function makeComparator( + rowsById: Record<string, RealtimeRunRow>, + resolvedIds: string[] = [] +) { + return new RealtimeShadowComparator({ + runReader: { getRunById: async (_env: string, id: string) => rowsById[id] ?? null } as any, + runListResolver: { resolveMatchingRunIds: async (_f: RunListFilter) => resolvedIds } as any, + }); +} + +describe("RealtimeShadowComparator serialization", () => { + it("counts a faithful re-serialization as a match", async () => { + const row = sampleRow(); + const body = JSON.stringify([insert(serializeRunRow(row)), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + expect(out.serializationSkew).toBe(0); + expect(out.diffs).toEqual([]); + }); + + it("does not flag semantically-equivalent but differently-encoded values", async () => { + const row = sampleRow(); + // Electric encodes bool as "true" (notifier uses "t"), a number with a trailing + // zero, and a timestamp without millis — all equal after decoding.
+ const value = { + ...serializeRunRow(row), + isTest: "true", + costInCents: "0.5500", + createdAt: "2026-06-07T09:00:00", + }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + }); + + it("flags a genuine column divergence (same version)", async () => { + const row = sampleRow(); + const value = { ...serializeRunRow(row), payload: '{"hello":"TAMPERED"}' }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.diffs).toEqual([ + { runId: "run_a", column: "payload", electric: '{"hello":"TAMPERED"}', notifier: '{"hello":"world"}' }, + ]); + }); + + it("treats DEQUEUED/EXECUTING as equivalent (legacy status rewrite)", async () => { + const row = sampleRow({ status: "EXECUTING" }); + const value = { ...serializeRunRow(row), status: "DEQUEUED" }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(0); + expect(out.serializationMatched).toBe(1); + }); + + it("records skew when the row advanced between emit and refetch", async () => { + const row = sampleRow(); + // Electric emitted an older version; the refetched row is newer. 
+ const value = { ...serializeRunRow(sampleRow({ updatedAt: new Date("2026-06-07T10:00:00.000Z") })) }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationSkew).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.serializationDiverged).toBe(0); + }); +}); + +describe("RealtimeShadowComparator membership", () => { + const filter: RunListFilter = { + organizationId: "org_1", + projectId: "proj_1", + environmentId: "env_1", + tags: ["t"], + createdAtAfter: new Date("2026-06-06T00:00:00.000Z"), + limit: 1000, + }; + + function bodyFor(ids: string[]) { + const msgs = ids.map((id) => insert(serializeRunRow(sampleRow({ id })))); + return JSON.stringify([...msgs, UP_TO_DATE]); + } + + it("matches when Electric's set equals the notifier resolver's set", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "b"] + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(true); + expect(out.missingInNotifier).toEqual([]); + expect(out.extraInNotifier).toEqual([]); + }); + + it("reports rows missing from / extra in the notifier resolution", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "c"] // notifier missing b, has extra c + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(false); + expect(out.missingInNotifier).toEqual(["b"]); + 
expect(out.extraInNotifier).toEqual(["c"]); + }); +}); From 023588a7c9d86556d11969e6c010b9fbdf2413b1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 10:39:51 +0100 Subject: [PATCH 2/8] fix(webapp): harden the realtime runs backend Addresses review feedback on the new backend: - skip cache eviction when updating an existing key at capacity - treat a concurrency limit of 0 as valid (enforce it, not a 500) - gate subscribeToRunChanges behind the enable switch - keep protocol-reserved columns in the hydration projection - re-clamp a handle-recovered createdAt to the max-age floor - bulk-hydrate the shadow comparator instead of per-run reads - log only run id and column on divergence, never raw cell values --- .../app/services/realtime/boundedTtlCache.ts | 4 +- .../realtime/notifierRealtimeClient.server.ts | 16 +++++- .../runChangeNotifierInstance.server.ts | 3 ++ .../app/services/realtime/runReader.server.ts | 4 +- .../services/realtime/shadowCompare.server.ts | 10 +++- .../realtime/shadowRealtimeClient.server.ts | 4 +- .../test/realtime/boundedTtlCache.test.ts | 11 ++++ .../test/realtime/notifierRunSetCache.test.ts | 54 +++++++++++++++++++ .../test/realtime/runReaderProjection.test.ts | 18 +++++++ .../test/realtime/shadowCompare.test.ts | 6 ++- 10 files changed, 122 insertions(+), 8 deletions(-) diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts index 643f23607c5..8efcde55609 100644 --- a/apps/webapp/app/services/realtime/boundedTtlCache.ts +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -34,7 +34,9 @@ export class BoundedTtlCache { } set(key: string, value: V): void { - if (this.#entries.size >= this.maxEntries) { + // Only run capacity eviction when inserting a NEW key — updating an existing key + // doesn't grow the map, so it must never drop an unrelated live entry. 
+ if (!this.#entries.has(key) && this.#entries.size >= this.maxEntries) { const now = Date.now(); for (const [key, entry] of this.#entries) { if (entry.expiresAt <= now) { diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index 9c70fd1acb9..38874b2de4b 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -238,10 +238,15 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { } // Recover the pinned window from the handle so the lower bound never drifts. + // Re-clamp the recovered value to the max-age floor so a stale or crafted handle + // can't widen the lookback past the configured ceiling. + const recoveredMs = this.#filterMsFromHandle(handle); const filter: RunSetFilter = { tags, createdAtAfter: new Date( - this.#filterMsFromHandle(handle) ?? this.#computeCreatedAtFilter(params.createdAt).getTime() + recoveredMs !== undefined + ? this.#clampCreatedAtFloor(recoveredMs) + : this.#computeCreatedAtFilter(params.createdAt).getTime() ), }; @@ -573,6 +578,13 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return bucket > 0 ? Math.floor(ms / bucket) * bucket : ms; } + /** Clamp a handle-recovered createdAt lower bound up to the max-age floor (so a + * stale or crafted handle can't widen the window past the ceiling), then re-bucket. */ + #clampCreatedAtFloor(ms: number): number { + const floorMs = Date.now() - this.options.maximumCreatedAtFilterAgeMs; + return this.#bucketCreatedAtMs(Math.max(ms, floorMs)); + } + #mintListHandle(createdAtFilterMs: number): string { // Pins the createdAt threshold in the opaque handle so live polls reuse the // same lower bound even on a working-set cache miss. 
@@ -615,7 +627,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { DEFAULT_CONCURRENCY_LIMIT ); - if (!concurrencyLimit) { + if (concurrencyLimit == null) { logger.error("[notifierRealtimeClient] Failed to get concurrency limit", { organizationId: environment.organizationId, }); diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index 545887abc61..71001192c1a 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -69,5 +69,8 @@ export function publishManyRunChanged(inputs: RunChangeInput[]): void { /** Subscribe to the next change for a run via the shared subscriber. */ export function subscribeToRunChanges(runId: string): RunChangeSubscription { + if (!notifierEnabled) { + throw new Error("Run change notifier is disabled"); + } return getRunChangeNotifier().subscribeToRunChanges(runId); } diff --git a/apps/webapp/app/services/realtime/runReader.server.ts b/apps/webapp/app/services/realtime/runReader.server.ts index 6fe59c3c059..4135e94366b 100644 --- a/apps/webapp/app/services/realtime/runReader.server.ts +++ b/apps/webapp/app/services/realtime/runReader.server.ts @@ -1,6 +1,6 @@ import { type Prisma, type PrismaClient } from "@trigger.dev/database"; import { BoundedTtlCache } from "./boundedTtlCache"; -import { type RealtimeRunRow } from "./electricStreamProtocol.server"; +import { RESERVED_COLUMNS, type RealtimeRunRow } from "./electricStreamProtocol.server"; /** * RunReader — the pluggable read half of the notifier-backed realtime feed. @@ -52,7 +52,7 @@ export const RUN_HYDRATOR_SELECT = { * `buildHydratorSelect`), so the replica doesn't ship large `payload`/`output`/ * `metadata`/`error` columns the response will drop anyway. 
*/ -const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt"]); +const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt", ...RESERVED_COLUMNS]); /** Project `RUN_HYDRATOR_SELECT` down to the columns the client didn't skip (plus * the always-needed ones). An empty skip set returns the full select unchanged. */ diff --git a/apps/webapp/app/services/realtime/shadowCompare.server.ts b/apps/webapp/app/services/realtime/shadowCompare.server.ts index 9a30d93c4da..b24540bfca3 100644 --- a/apps/webapp/app/services/realtime/shadowCompare.server.ts +++ b/apps/webapp/app/services/realtime/shadowCompare.server.ts @@ -95,13 +95,21 @@ export class RealtimeShadowComparator { diffs: [], }; + // Bulk-hydrate every emitted run in one query rather than a per-message round + // trip, so shadow mode doesn't inflate the very replica load it's measuring. + const emittedIds = changes + .map((m) => m.value.id) + .filter((id): id is string => typeof id === "string"); + const hydrated = await this.options.runReader.hydrateByIds(input.environment.id, emittedIds); + const rowsById = new Map(hydrated.map((row) => [row.id, row])); + for (const message of changes) { const runId = message.value.id ?? undefined; if (!runId) { continue; } - const row = await this.options.runReader.getRunById(input.environment.id, runId); + const row = rowsById.get(runId); if (!row) { // Run no longer readable (deleted / replica miss). Not a serialization divergence. 
outcome.serializationSkew++; diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts index 1ddf162fd87..b66b70e7ad5 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts @@ -180,7 +180,9 @@ export class ShadowRealtimeClient implements RealtimeStreamClient { membershipMatch: outcome.membershipMatch, missingInNotifier: outcome.missingInNotifier?.slice(0, 20), extraInNotifier: outcome.extraInNotifier?.slice(0, 20), - diffs: outcome.diffs, + // Log only which run/column diverged, never the raw cell values — they can + // include run payload/output/metadata and must not leak into logs. + diffs: outcome.diffs.map(({ runId, column }) => ({ runId, column })), }); } } diff --git a/apps/webapp/test/realtime/boundedTtlCache.test.ts b/apps/webapp/test/realtime/boundedTtlCache.test.ts index e487798750e..a3fb0b1e425 100644 --- a/apps/webapp/test/realtime/boundedTtlCache.test.ts +++ b/apps/webapp/test/realtime/boundedTtlCache.test.ts @@ -28,6 +28,17 @@ describe("BoundedTtlCache", () => { expect(cache.size).toBe(0); }); + it("does not evict another entry when updating an existing key at capacity", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + // Updating an existing key doesn't grow the map, so it must not drop "b". 
+ cache.set("a", 11); + expect(cache.get("a")).toBe(11); + expect(cache.get("b")).toBe(2); + expect(cache.size).toBe(2); + }); + it("drops the oldest entry when full of still-live entries", () => { const cache = new BoundedTtlCache(60_000, 2); cache.set("a", 1); diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts index a0beb0fd728..90e3446e792 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -171,3 +171,57 @@ describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { } }); }); + +describe("NotifierRealtimeClient review fixes", () => { + const ready = { changed: Promise.resolve(), unsubscribe() {} }; + const liveNotifier = { subscribeToRunChanges: () => ready, subscribeToEnvChanges: () => ready }; + + it("clamps a stale/crafted handle's createdAt up to the max-age floor", async () => { + const maxAge = 24 * 60 * 60 * 1000; + const { client, resolveSpy } = makeClient({ + notifier: liveNotifier, + maximumCreatedAtFilterAgeMs: maxAge, + runSetCreatedAtBucketMs: 0, + livePollTimeoutMs: 50, + }); + const before = Date.now(); + // Handle encodes createdAt = 1ms epoch, far older than the 24h ceiling. + await client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=123_1&live=true&handle=runs_1_7", + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Clamped to ~now - maxAge, not the epoch value encoded in the handle. 
+ expect(passed.getTime()).toBeGreaterThan(before - maxAge - 1_000); + }); + + it("enforces a concurrency limit of 0 instead of failing with a 500", async () => { + let limitCheckedWith: number | undefined; + const { client } = makeClient({ + notifier: liveNotifier, + cachedLimitProvider: { getCachedLimit: async () => 0 }, + limiter: { + incrementAndCheck: async (_env: string, _id: string, limit: number) => { + limitCheckedWith = limit; + return true; + }, + decrement: async () => {}, + }, + livePollTimeoutMs: 50, + }); + const res = await client.streamBatch( + "http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&live=true", + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res.status).toBe(200); + expect(limitCheckedWith).toBe(0); + }); +}); diff --git a/apps/webapp/test/realtime/runReaderProjection.test.ts b/apps/webapp/test/realtime/runReaderProjection.test.ts index 22ba3ac72fa..07aebf92589 100644 --- a/apps/webapp/test/realtime/runReaderProjection.test.ts +++ b/apps/webapp/test/realtime/runReaderProjection.test.ts @@ -11,6 +11,24 @@ describe("buildHydratorSelect", () => { expect(select.error).toBe(true); }); + it("keeps protocol-reserved columns even when asked to skip them", () => { + // Reserved columns are always emitted by the serializer, so hydration must keep + // them regardless of skipColumns or the output is null/incorrect. + const select = buildHydratorSelect([ + "status", + "taskIdentifier", + "createdAt", + "friendlyId", + "payload", + ]); + expect(select.status).toBe(true); + expect(select.taskIdentifier).toBe(true); + expect(select.createdAt).toBe(true); + expect(select.friendlyId).toBe(true); + // A non-reserved skipped column is still dropped. 
+ expect(select.payload).toBeUndefined(); + }); + it("drops skipped columns but always keeps id + updatedAt", () => { const select = buildHydratorSelect(["payload", "output", "metadata", "error"]); expect(select.payload).toBeUndefined(); diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts index 31bb527589f..e6604a02cd6 100644 --- a/apps/webapp/test/realtime/shadowCompare.test.ts +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -49,7 +49,11 @@ function makeComparator( resolvedIds: string[] = [] ) { return new RealtimeShadowComparator({ - runReader: { getRunById: async (_env: string, id: string) => rowsById[id] ?? null } as any, + runReader: { + getRunById: async (_env: string, id: string) => rowsById[id] ?? null, + hydrateByIds: async (_env: string, ids: string[]) => + ids.map((id) => rowsById[id]).filter((row): row is RealtimeRunRow => Boolean(row)), + } as any, runListResolver: { resolveMatchingRunIds: async (_f: RunListFilter) => resolvedIds } as any, }); } From 058311a7acd922dc0e685b5ccc5ddd0f88f042f3 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 10:56:23 +0100 Subject: [PATCH 3/8] fix(webapp): enforce the realtime tag/batch result cap exactly The id resolver returned the repository's has-more overfetch (size + 1), so the feed could emit one run past the configured cap. Trim to the limit. 
--- .../services/realtime/clickHouseRunListResolver.server.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts index 545c4a43211..16dda7838b7 100644 --- a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -27,7 +27,7 @@ export class ClickHouseRunListResolver implements RunListResolver { const clickhouse = await this.options.getClickhouse(filter.organizationId); const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma }); - return repository.listRunIds({ + const ids = await repository.listRunIds({ organizationId: filter.organizationId, projectId: filter.projectId, environmentId: filter.environmentId, @@ -36,5 +36,9 @@ export class ClickHouseRunListResolver implements RunListResolver { from: filter.createdAtAfter?.getTime(), page: { size: filter.limit }, }); + + // listRunIds overfetches by one (size + 1) for has-more detection and doesn't + // trim, so enforce the caller's cap here. + return ids.slice(0, filter.limit); } } From 9a32dd1a2c8f76fd6735556a9b6ee6cdd9fe35ae Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 12:08:21 +0100 Subject: [PATCH 4/8] feat(webapp): give the realtime runs feed its own ClickHouse pool Resolve tag/batch run ids on a dedicated REALTIME_RUNS_CLICKHOUSE_* pool (falling back to CLICKHOUSE_URL) so the feed can't contend with the shared analytics client. 
--- apps/webapp/app/env.server.ts | 14 ++++++ .../clickhouse/clickhouseFactory.server.ts | 49 ++++++++++++++++++- .../notifierRealtimeClientInstance.server.ts | 2 +- .../shadowRealtimeClientInstance.server.ts | 2 +- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 3cdfdbf51fc..c0c61912414 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1633,6 +1633,20 @@ const EnvironmentSchema = z .enum(["log", "error", "warn", "info", "debug"]) .default("info"), RUN_ENGINE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), + // ClickHouse client used by the realtime runs feed for tag/batch id resolution. + // Kept on its own URL + pool so the feed's reads can't contend with the main + // analytics client (CLICKHOUSE_URL). Falls back to the main URL when unset. + REALTIME_RUNS_CLICKHOUSE_URL: z + .string() + .optional() + .transform((v) => v ?? process.env.CLICKHOUSE_URL), + REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), + REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), + REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), + REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL: z + .enum(["log", "error", "warn", "info", "debug"]) + .default("info"), + REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000), EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000), METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000), diff --git a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts index fb7f384fd27..c563621408c 100644 --- a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts +++ b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts @@ -211,6 +211,36 @@ function 
initializeRunEngineClickhouseClient(): ClickHouse { }); } +/** Realtime runs feed tag/batch id resolution (`REALTIME_RUNS_CLICKHOUSE_URL`); + * falls back to the default client if unset. */ +const defaultRealtimeClickhouseClient = singleton( + "realtimeClickhouseClient", + initializeRealtimeClickhouseClient +); + +function initializeRealtimeClickhouseClient(): ClickHouse { + if (!env.REALTIME_RUNS_CLICKHOUSE_URL) { + return defaultClickhouseClient; + } + + const url = new URL(env.REALTIME_RUNS_CLICKHOUSE_URL); + url.searchParams.delete("secure"); + + return new ClickHouse({ + url: url.toString(), + name: "realtime-runs-clickhouse", + keepAlive: { + enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); +} + /** Task events (`EVENTS_CLICKHOUSE_URL`); not exported — accessed via factory. 
*/ const defaultEventsClickhouseClient = singleton( "eventsClickhouseClient", @@ -257,7 +287,8 @@ export type ClientType = | "logs" | "query" | "admin" - | "engine"; + | "engine" + | "realtime"; function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHouse { const parsed = new URL(url); @@ -330,6 +361,20 @@ function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHou }, maxOpenConnections: env.RUN_ENGINE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, }); + case "realtime": + return new ClickHouse({ + url: parsed.toString(), + name, + keepAlive: { + enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); case "standard": case "query": case "admin": @@ -398,6 +443,8 @@ export class ClickhouseFactory { return defaultAdminClickhouseClient; case "engine": return defaultRunEngineClickhouseClient; + case "realtime": + return defaultRealtimeClickhouseClient; } } diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts index 2888deec863..1b645eb5fb8 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -55,7 +55,7 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { runReader: new RunHydrator({ replica: $replica }), runListResolver: new ClickHouseRunListResolver({ getClickhouse: (organizationId) => - clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), prisma: $replica, }), notifier: 
getRunChangeNotifier(), diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts index 36ce0a4325b..95edc82620d 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts @@ -26,7 +26,7 @@ function initializeShadowRealtimeClient(): ShadowRealtimeClient { runReader: new RunHydrator({ replica: $replica }), runListResolver: new ClickHouseRunListResolver({ getClickhouse: (organizationId) => - clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), prisma: $replica, }), }); From 905b7deba67f960a711d393981003eedddb37319 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 12:10:35 +0100 Subject: [PATCH 5/8] fix(webapp): log realtime run-change pub/sub failures at error level Surface publish, subscribe, and unsubscribe failures in the realtime run-change pub/sub at error level with clearer static messages, instead of debug. 
--- .../realtime/runChangeNotifier.server.ts | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts index ba8748c6cf4..9bc0f69d6e6 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -78,11 +78,17 @@ export class RunChangeNotifier { const result = publisher.publish(channel, payload); if (typeof (result as Promise)?.catch === "function") { (result as Promise).catch((error) => { - logger.debug("[runChangeNotifier] publish failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); }); } } catch (error) { - logger.debug("[runChangeNotifier] publish threw", { error, channel }); + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); } } @@ -125,7 +131,10 @@ export class RunChangeNotifier { listeners = new Set(); this.#listeners.set(channel, listeners); subscriber.subscribe(channel).catch((error) => { - logger.debug("[runChangeNotifier] subscribe failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to subscribe to run-change channel", { + error, + channel, + }); }); } listeners.add(resolveChanged); @@ -161,7 +170,10 @@ export class RunChangeNotifier { // now unsubscribed in Redis but has live waiters. Re-subscribe so they // still receive messages (the long-poll backstop covers the gap). subscriber.subscribe(channel).catch((error) => { - logger.debug("[runChangeNotifier] resubscribe failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", { + error, + channel, + }); }); } }) @@ -169,7 +181,10 @@ export class RunChangeNotifier { // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. 
// Keep the (empty) map entry so a future subscriber reuses it without a // duplicate SUBSCRIBE and #onMessage stays consistent with Redis state. - logger.debug("[runChangeNotifier] unsubscribe failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to unsubscribe from run-change channel", { + error, + channel, + }); }); } }; From 3d07c474dd419f161d00a93f396c0713fab5b4f8 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 13:47:43 +0100 Subject: [PATCH 6/8] feat(webapp): give the realtime runs feed its own pub/sub Redis Run the realtime runs feed's run-changed pub/sub on a dedicated REALTIME_RUNS_PUBSUB_REDIS_* connection set (falling back to PUBSUB_REDIS_* / REDIS_*), so its publish/subscribe traffic can be isolated from the shared pub/sub Redis. --- apps/webapp/app/env.server.ts | 31 +++++++++++++++++++ .../runChangeNotifierInstance.server.ts | 12 +++---- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c0c61912414..4920355f68f 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -357,6 +357,37 @@ const EnvironmentSchema = z PUBSUB_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z.string().default("0"), + // Dedicated pub/sub Redis for the realtime runs feed's run-changed notifier, so + // its publish/subscribe traffic can run on its own instance. Each value falls + // back to the shared PUBSUB_REDIS_* (then REDIS_*) when unset, so the default is + // unchanged until explicitly pointed at a dedicated instance. + REALTIME_RUNS_PUBSUB_REDIS_HOST: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_HOST ?? process.env.REDIS_HOST), + REALTIME_RUNS_PUBSUB_REDIS_PORT: z.coerce + .number() + .optional() + .transform((v) => { + if (v !== undefined) return v; + const raw = process.env.PUBSUB_REDIS_PORT ?? 
process.env.REDIS_PORT; + return raw ? parseInt(raw) : undefined; + }), + REALTIME_RUNS_PUBSUB_REDIS_USERNAME: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_USERNAME ?? process.env.REDIS_USERNAME), + REALTIME_RUNS_PUBSUB_REDIS_PASSWORD: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_PASSWORD ?? process.env.REDIS_PASSWORD), + REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED: z + .string() + .default(process.env.PUBSUB_REDIS_TLS_DISABLED ?? process.env.REDIS_TLS_DISABLED ?? "false"), + REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z + .string() + .default(process.env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED ?? "0"), + DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(100), DEFAULT_ENV_EXECUTION_CONCURRENCY_BURST_FACTOR: z.coerce.number().default(1.0), DEFAULT_ORG_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(300), diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index 71001192c1a..78f68537c70 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -21,12 +21,12 @@ const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; function initializeRunChangeNotifier(): RunChangeNotifier { const notifier = new RunChangeNotifier({ redis: { - host: env.PUBSUB_REDIS_HOST, - port: env.PUBSUB_REDIS_PORT, - username: env.PUBSUB_REDIS_USERNAME, - password: env.PUBSUB_REDIS_PASSWORD, - tlsDisabled: env.PUBSUB_REDIS_TLS_DISABLED === "true", - clusterMode: env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", + host: env.REALTIME_RUNS_PUBSUB_REDIS_HOST, + port: env.REALTIME_RUNS_PUBSUB_REDIS_PORT, + username: env.REALTIME_RUNS_PUBSUB_REDIS_USERNAME, + password: env.REALTIME_RUNS_PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED === "true", + clusterMode: 
env.REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", }, }); From d3730d90a1be2a045ebd65b02d6dea9c78d4b561 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 14:55:51 +0100 Subject: [PATCH 7/8] fix(webapp): adapt the realtime run-id resolver to paginated listRunIds listRunIds now returns a keyset page ({ runIds, pagination }); read runIds from it. The page is already capped to the requested size, so the manual trim is gone. Also make the run-change event-bus handler registration return a truthy value so the singleton() wrapper doesn't re-attach listeners on dev reloads. --- .../services/realtime/clickHouseRunListResolver.server.ts | 7 +++---- .../services/realtime/runChangeNotifierHandlers.server.ts | 7 ++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts index 16dda7838b7..003646bb74a 100644 --- a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -27,7 +27,7 @@ export class ClickHouseRunListResolver implements RunListResolver { const clickhouse = await this.options.getClickhouse(filter.organizationId); const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma }); - const ids = await repository.listRunIds({ + const { runIds } = await repository.listRunIds({ organizationId: filter.organizationId, projectId: filter.projectId, environmentId: filter.environmentId, @@ -37,8 +37,7 @@ export class ClickHouseRunListResolver implements RunListResolver { page: { size: filter.limit }, }); - // listRunIds overfetches by one (size + 1) for has-more detection and doesn't - // trim, so enforce the caller's cap here. - return ids.slice(0, filter.limit); + // listRunIds is keyset-paginated; runIds is already capped to page.size (= limit). 
+ return runIds; } } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts index 791991178e4..9ed93e66a4a 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts @@ -16,8 +16,11 @@ import { publishRunChanged } from "./runChangeNotifierInstance.server"; * high-value, env-cheap transitions here. */ export function registerRunChangeNotifierHandlers() { + // Return a truthy value in every path so the singleton() wrapper (which uses + // ??=) caches the result and never re-runs this factory — re-running would + // attach duplicate engine-bus listeners on each Remix dev-mode reload. if (env.REALTIME_NOTIFIER_ENABLED !== "1") { - return; + return true; } // Status transitions (checkpoint suspend/resume, pending version, dequeue) — @@ -70,4 +73,6 @@ export function registerRunChangeNotifierHandlers() { }); logger.info("[runChangeNotifier] realtime run-change notifier handlers registered"); + + return true; } From d2987a1ae718da6c208ba89617d45562b806cb8e Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 15:48:19 +0100 Subject: [PATCH 8/8] fix(webapp): JSON-encode the run-set cache key to avoid separator collisions A tag containing a comma keyed the same as two separate tags, so the resolve+hydrate coalescing cache could serve the wrong runs for up to its TTL. Encode the tag/column arrays instead of joining them. 
--- .../app/services/realtime/notifierRealtimeClient.server.ts | 7 +++++-- apps/webapp/test/realtime/notifierRunSetCache.test.ts | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index 38874b2de4b..9c49e62e4c4 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -529,8 +529,11 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { /** Stable cache key for the resolve+hydrate cache. Same key => same id-set and the * same projected columns, so cached rows always match the requesting feed. */ #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string { - const tags = filter.tags && filter.tags.length > 0 ? [...filter.tags].sort().join(",") : ""; - const cols = skipColumns.length > 0 ? [...skipColumns].sort().join(",") : ""; + // JSON-encode the arrays (not a join) so a value containing the separators — + // e.g. a tag with a comma — can't collide: ["a,b"] must not key the same as + // ["a","b"], which are different ClickHouse filters. + const tags = filter.tags && filter.tags.length > 0 ? JSON.stringify([...filter.tags].sort()) : ""; + const cols = skipColumns.length > 0 ? JSON.stringify([...skipColumns].sort()) : ""; const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; return `${environmentId}|${tags}|${filter.batchId ?? ""}|${ filter.createdAtAfter?.getTime() ?? 
"" diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts index 90e3446e792..2f325296f1c 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -157,6 +157,13 @@ describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { expect(resolveSpy).toHaveBeenCalledTimes(2); }); + it("does not collide a comma-containing tag with two separate tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["a,b"]); // one tag "a,b" + await snapshotTag(client, ["a", "b"]); // two tags a OR b — a different filter + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + it("keeps each feed's exact lower bound when bucketing is disabled (0)", async () => { vi.useFakeTimers({ toFake: ["Date"] }); vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z"));