diff --git a/.changeset/ai-prompt-management.md b/.changeset/ai-prompt-management.md new file mode 100644 index 00000000000..d3250bebda7 --- /dev/null +++ b/.changeset/ai-prompt-management.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/sdk": patch +--- + +Define and manage AI prompts with `prompts.define()`. Create typesafe prompt templates with variables, resolve them at runtime, and manage versions and overrides from the dashboard without redeploying. diff --git a/.changeset/bright-buckets-bow.md b/.changeset/bright-buckets-bow.md deleted file mode 100644 index f9a33470bf8..00000000000 --- a/.changeset/bright-buckets-bow.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/sdk": patch ---- - -[TRI-1449] Display warning message when duplicate job IDs are detected diff --git a/.changeset/chilly-tips-explode.md b/.changeset/chilly-tips-explode.md new file mode 100644 index 00000000000..7a5235904a4 --- /dev/null +++ b/.changeset/chilly-tips-explode.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Add platform notifications support to the CLI. The `trigger dev` and `trigger login` commands now fetch and display platform notifications (info, warn, error, success) from the server. Includes discovery-based filtering to conditionally show notifications based on project file patterns, color markup rendering for styled terminal output, and a non-blocking display flow with a spinner fallback for slow fetches. Use `--skip-platform-notifications` flag with `trigger dev` to disable the notification check. 
diff --git a/.changeset/config.json b/.changeset/config.json index 5e8716336bb..115f54fefee 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -1,22 +1,25 @@ { "$schema": "https://unpkg.com/@changesets/config@2.2.0/schema.json", - "changelog": "@changesets/cli/changelog", - "commit": false, - "fixed": [ - [ - "@trigger.dev/*" - ] + "changelog": [ + "@remix-run/changelog-github", + { + "repo": "triggerdotdev/trigger.dev" + } ], + "commit": false, + "fixed": [["@trigger.dev/*", "trigger.dev"]], "linked": [], "access": "public", "baseBranch": "main", "updateInternalDependencies": "patch", "ignore": [ "webapp", - "emails", - "@trigger.dev/database" + "coordinator", + "docker-provider", + "kubernetes-provider", + "supervisor" ], "___experimentalUnsafeOptions_WILL_CHANGE_IN_PATCH": { "onlyUpdatePeerDependentsWhenOutOfRange": true } -} \ No newline at end of file +} diff --git a/.changeset/fix-dev-build-dir-leak.md b/.changeset/fix-dev-build-dir-leak.md new file mode 100644 index 00000000000..a1e6219c8bb --- /dev/null +++ b/.changeset/fix-dev-build-dir-leak.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Fix dev CLI leaking build directories on rebuild, causing disk space accumulation. Deprecated workers are now pruned (capped at 2 retained) when no active runs reference them. The watchdog process also cleans up `.trigger/tmp/` when the dev CLI is killed ungracefully (e.g. SIGKILL from pnpm). diff --git a/.changeset/fix-list-deploys-nullable.md b/.changeset/fix-list-deploys-nullable.md new file mode 100644 index 00000000000..d9d5e82116a --- /dev/null +++ b/.changeset/fix-list-deploys-nullable.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Fix `list_deploys` MCP tool failing when deployments have null `runtime` or `runtimeVersion` fields. 
diff --git a/.changeset/fix-local-build-load.md b/.changeset/fix-local-build-load.md new file mode 100644 index 00000000000..13f91da9d6a --- /dev/null +++ b/.changeset/fix-local-build-load.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Fix `--load` flag being silently ignored on local/self-hosted builds. diff --git a/.changeset/four-kangaroos-care.md b/.changeset/four-kangaroos-care.md deleted file mode 100644 index ddd43914bdb..00000000000 --- a/.changeset/four-kangaroos-care.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/openai": patch ---- - -Fix a ReferenceError when using backgroundCreateChatCompletion diff --git a/.changeset/good-dolphins-jam.md b/.changeset/good-dolphins-jam.md deleted file mode 100644 index f4cfbcfd53d..00000000000 --- a/.changeset/good-dolphins-jam.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/sdk": patch ---- - -Add `io.random()` which wraps `Math.random()` in a Task with helpful options. diff --git a/.changeset/happy-dryers-guess.md b/.changeset/happy-dryers-guess.md deleted file mode 100644 index 01fe5c1ada3..00000000000 --- a/.changeset/happy-dryers-guess.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"@trigger.dev/sdk": patch -"@trigger.dev/core": patch ---- - -Added invokeTrigger(), which allows jobs to be manually invoked diff --git a/.changeset/lazy-schools-grin.md b/.changeset/lazy-schools-grin.md deleted file mode 100644 index 97d46701d00..00000000000 --- a/.changeset/lazy-schools-grin.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"@trigger.dev/integration-kit": patch -"@trigger.dev/openai": patch ---- - -Allow customizing OpenAI background retries and timeouts diff --git a/.changeset/llm-metadata-run-tags.md b/.changeset/llm-metadata-run-tags.md new file mode 100644 index 00000000000..85f04c363b8 --- /dev/null +++ b/.changeset/llm-metadata-run-tags.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Propagate run tags to span attributes so they can be extracted server-side for LLM cost attribution metadata. 
diff --git a/.changeset/mcp-get-span-details.md b/.changeset/mcp-get-span-details.md new file mode 100644 index 00000000000..e69b7979b07 --- /dev/null +++ b/.changeset/mcp-get-span-details.md @@ -0,0 +1,11 @@ +--- +"@trigger.dev/core": patch +"trigger.dev": patch +--- + +Add `get_span_details` MCP tool for inspecting individual spans within a run trace. + +- New `get_span_details` tool returns full span attributes, timing, events, and AI enrichment (model, tokens, cost, speed) +- Span IDs now shown in `get_run_details` trace output for easy discovery +- New API endpoint `GET /api/v1/runs/:runId/spans/:spanId` +- New `retrieveSpan()` method on the API client diff --git a/.changeset/mcp-query-tools.md b/.changeset/mcp-query-tools.md new file mode 100644 index 00000000000..23e09c1afec --- /dev/null +++ b/.changeset/mcp-query-tools.md @@ -0,0 +1,42 @@ +--- +"@trigger.dev/core": patch +"trigger.dev": patch +--- + +MCP server improvements: new tools, bug fixes, and new flags. + +**New tools:** +- `get_query_schema` — discover available TRQL tables and columns +- `query` — execute TRQL queries against your data +- `list_dashboards` — list built-in dashboards and their widgets +- `run_dashboard_query` — execute a single dashboard widget query +- `whoami` — show current profile, user, and API URL +- `list_profiles` — list all configured CLI profiles +- `switch_profile` — switch active profile for the MCP session +- `start_dev_server` — start `trigger dev` in the background and stream output +- `stop_dev_server` — stop the running dev server +- `dev_server_status` — check dev server status and view recent logs + +**New API endpoints:** +- `GET /api/v1/query/schema` — query table schema discovery +- `GET /api/v1/query/dashboards` — list built-in dashboards + +**New features:** +- `--readonly` flag hides write tools (`deploy`, `trigger_task`, `cancel_run`) so the AI cannot make changes +- `read:query` JWT scope for query endpoint authorization +- `get_run_details` trace output 
is now paginated with cursor support +- MCP tool annotations (`readOnlyHint`, `destructiveHint`) for all tools + +**Bug fixes:** +- Fixed `search_docs` tool failing due to renamed upstream Mintlify tool (`SearchTriggerDev` → `search_trigger_dev`) +- Fixed `list_deploys` failing when deployments have null `runtime`/`runtimeVersion` fields (#3139) +- Fixed `list_preview_branches` crashing due to incorrect response shape access +- Fixed `metrics` table column documented as `value` instead of `metric_value` in query docs +- Fixed dev CLI leaking build directories on rebuild — deprecated workers now clean up their build dirs when their last run completes + +**Context optimizations:** +- `get_query_schema` now requires a table name and returns only one table's schema (was returning all tables) +- `get_current_worker` no longer inlines payload schemas; use new `get_task_schema` tool instead +- Query results formatted as text tables instead of JSON (~50% fewer tokens) +- `cancel_run`, `list_deploys`, `list_preview_branches` formatted as text instead of raw JSON +- Schema and dashboard API responses cached to avoid redundant fetches diff --git a/.changeset/packet-v2-packets-api.md b/.changeset/packet-v2-packets-api.md new file mode 100644 index 00000000000..9ec5fa1b338 --- /dev/null +++ b/.changeset/packet-v2-packets-api.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Large run outputs can use the new API which allows switching object storage providers. 
\ No newline at end of file diff --git a/.changeset/private-networking-dequeue.md b/.changeset/private-networking-dequeue.md new file mode 100644 index 00000000000..4a5bdba6a67 --- /dev/null +++ b/.changeset/private-networking-dequeue.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Add optional `hasPrivateLink` field to the dequeue message organization object for private networking support diff --git a/.changeset/quiet-dogs-fly.md b/.changeset/quiet-dogs-fly.md new file mode 100644 index 00000000000..e6017304760 --- /dev/null +++ b/.changeset/quiet-dogs-fly.md @@ -0,0 +1,7 @@ +--- +"@trigger.dev/sdk": patch +"@trigger.dev/core": patch +"trigger.dev": patch +--- + +Add support for setting TTL (time-to-live) defaults at the task level and globally in trigger.config.ts, with per-trigger overrides still taking precedence diff --git a/.changeset/tame-oranges-change.md b/.changeset/tame-oranges-change.md new file mode 100644 index 00000000000..9755a41a26a --- /dev/null +++ b/.changeset/tame-oranges-change.md @@ -0,0 +1,8 @@ +--- +"@trigger.dev/redis-worker": patch +"@trigger.dev/sdk": patch +"trigger.dev": patch +"@trigger.dev/core": patch +--- + +Adapted the CLI API client to propagate the trigger source via http headers. diff --git a/.claude/rules/database-safety.md b/.claude/rules/database-safety.md new file mode 100644 index 00000000000..14a6523595b --- /dev/null +++ b/.claude/rules/database-safety.md @@ -0,0 +1,13 @@ +--- +paths: + - "internal-packages/database/**" +--- + +# Database Migration Safety + +- When adding indexes to **existing tables**, use `CREATE INDEX CONCURRENTLY IF NOT EXISTS` to avoid table locks. These must be in their own separate migration file (one index per file). +- Indexes on **newly created tables** (same migration as `CREATE TABLE`) do not need CONCURRENTLY. 
+- When indexing a **new column on an existing table**, split into two migrations: first `ADD COLUMN IF NOT EXISTS`, then `CREATE INDEX CONCURRENTLY IF NOT EXISTS` in a separate file. +- After generating a migration with Prisma, remove extraneous lines for: `_BackgroundWorkerToBackgroundWorkerFile`, `_BackgroundWorkerToTaskQueue`, `_TaskRunToTaskRunTag`, `_WaitpointRunConnections`, `_completedWaitpoints`, `SecretStore_key_idx`, and unrelated TaskRun indexes. +- Never drop columns or tables without explicit approval. +- New code should target `RunEngineVersion.V2` only. diff --git a/.claude/rules/docs-writing.md b/.claude/rules/docs-writing.md new file mode 100644 index 00000000000..bbfb471368e --- /dev/null +++ b/.claude/rules/docs-writing.md @@ -0,0 +1,14 @@ +--- +paths: + - "docs/**" +--- + +# Documentation Writing Rules + +- Use Mintlify MDX format. Frontmatter: `title`, `description`, `sidebarTitle` (optional). +- After creating a new page, add it to `docs.json` navigation under the correct group. +- Use Mintlify components: ``, ``, ``, ``, ``, ``, ``/``. +- Code examples should be complete and runnable where possible. +- Always import from `@trigger.dev/sdk`, never `@trigger.dev/sdk/v3`. +- Keep paragraphs short. Use headers to break up content. +- Link to related pages using relative paths (e.g., `[Tasks](/tasks/overview)`). diff --git a/.claude/rules/legacy-v3-code.md b/.claude/rules/legacy-v3-code.md new file mode 100644 index 00000000000..6fd8d9402c2 --- /dev/null +++ b/.claude/rules/legacy-v3-code.md @@ -0,0 +1,33 @@ +--- +paths: + - "apps/webapp/app/v3/**" +--- + +# Legacy V1 Engine Code in `app/v3/` + +The `v3/` directory name is misleading - most code here is actively used by the current V2 engine. Only the specific files below are legacy V1-only code. 
+ +## V1-Only Files - Never Modify + +- `marqs/` directory (entire MarQS queue system: sharedQueueConsumer, devQueueConsumer, fairDequeuingStrategy, devPubSub) +- `legacyRunEngineWorker.server.ts` (V1 background job worker) +- `services/triggerTaskV1.server.ts` (deprecated V1 task triggering) +- `services/cancelTaskRunV1.server.ts` (deprecated V1 cancellation) +- `authenticatedSocketConnection.server.ts` (V1 dev WebSocket using DevQueueConsumer) +- `sharedSocketConnection.ts` (V1 shared queue socket using SharedQueueConsumer) + +## V1/V2 Branching Pattern + +Some services act as routers that branch on `RunEngineVersion`: +- `services/cancelTaskRun.server.ts` - calls V1 service or `engine.cancelRun()` for V2 +- `services/batchTriggerV3.server.ts` - uses marqs for V1 path, run-engine for V2 + +When editing these shared services, only modify V2 code paths. + +## V2 Modern Stack + +- **Run lifecycle**: `@internal/run-engine` (internal-packages/run-engine) +- **Background jobs**: `@trigger.dev/redis-worker` (not graphile-worker/zodworker) +- **Queue operations**: RunQueue inside run-engine (not MarQS) +- **V2 engine singleton**: `runEngine.server.ts`, `runEngineHandlers.server.ts` +- **V2 workers**: `commonWorker.server.ts`, `alertsWorker.server.ts`, `batchTriggerWorker.server.ts` diff --git a/.claude/rules/sdk-packages.md b/.claude/rules/sdk-packages.md new file mode 100644 index 00000000000..549eb341809 --- /dev/null +++ b/.claude/rules/sdk-packages.md @@ -0,0 +1,12 @@ +--- +paths: + - "packages/**" +--- + +# Public Package Rules + +- Changes to `packages/` are **customer-facing**. Always add a changeset: `pnpm run changeset:add` +- Default to **patch**. Get maintainer approval for minor. Never select major without explicit approval. +- `@trigger.dev/core`: **Never import the root**. Always use subpath imports (e.g., `@trigger.dev/core/v3`). +- Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked. 
These are maintained in separate dedicated passes. +- Test changes using `references/hello-world` reference project. diff --git a/.claude/rules/server-apps.md b/.claude/rules/server-apps.md new file mode 100644 index 00000000000..4d46789701c --- /dev/null +++ b/.claude/rules/server-apps.md @@ -0,0 +1,23 @@ +--- +paths: + - "apps/**" +--- + +# Server App Changes + +When modifying server apps (webapp, supervisor, coordinator, etc.) with **no package changes**, add a `.server-changes/` file instead of a changeset: + +```bash +cat > .server-changes/descriptive-name.md << 'EOF' +--- +area: webapp +type: fix +--- + +Brief description of what changed and why. +EOF +``` + +- **area**: `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- **type**: `feature` | `fix` | `improvement` | `breaking` +- If the PR also touches `packages/`, just the changeset is sufficient (no `.server-changes/` needed). diff --git a/.claude/skills/span-timeline-events/SKILL.md b/.claude/skills/span-timeline-events/SKILL.md new file mode 100644 index 00000000000..122f49912d7 --- /dev/null +++ b/.claude/skills/span-timeline-events/SKILL.md @@ -0,0 +1,78 @@ +--- +name: span-timeline-events +description: Use when adding, modifying, or debugging OTel span timeline events in the trace view. Covers event structure, ClickHouse storage constraints, rendering in SpanTimeline component, admin visibility, and the step-by-step process for adding new events. +allowed-tools: Read, Write, Edit, Glob, Grep, Bash +--- + +# Span Timeline Events + +The trace view's right panel shows a timeline of events for the selected span. These are OTel span events rendered by `app/utils/timelineSpanEvents.ts` and the `SpanTimeline` component. + +## How They Work + +1. **Span events** in OTel are attached to a parent span. In ClickHouse, they're stored as separate rows with `kind: "SPAN_EVENT"` sharing the parent span's `span_id`. 
The `#mergeRecordsIntoSpanDetail` method reassembles them into the span's `events` array at query time. +2. The timeline only renders events whose `name` starts with `trigger.dev/` - all others are silently filtered out. +3. The **display name** comes from `properties.event` (not the span event name), mapped through `getFriendlyNameForEvent()`. +4. Events are shown on the **span they belong to** - events on one span don't appear in another span's timeline. + +## ClickHouse Storage Constraint + +When events are written to ClickHouse, `spanEventsToTaskEventV1Input()` filters out events whose `start_time` is not greater than the parent span's `startTime`. Events at or before the span start are silently dropped. This means span events must have timestamps strictly after the span's own `startTimeUnixNano`. + +## Timeline Rendering (SpanTimeline component) + +The `SpanTimeline` component in `app/components/run/RunTimeline.tsx` renders: + +1. **Events** (thin 1px line with hollow dots) - all events from `createTimelineSpanEventsFromSpanEvents()` +2. **"Started"** marker (thick cap) - at the span's `startTime` +3. **Duration bar** (thick 7px line) - from "Started" to "Finished" +4. **"Finished"** marker (thick cap) - at `startTime + duration` + +The thin line before "Started" only appears when there are events with timestamps between the span start and the first child span. For the Attempt span this works well (Dequeued -> Pod scheduled -> Launched -> etc. all happen before execution starts). Events all get `lineVariant: "light"` (thin) while the execution bar gets `variant: "normal"` (thick). + +## Trace View Sort Order + +Sibling spans (same parent) are sorted by `start_time ASC` from the ClickHouse query. The `createTreeFromFlatItems` function preserves this order. Event timestamps don't affect sort order - only the span's own `start_time`. 
+ + ## Event Structure + + ```typescript +// OTel span event format +{ + name: "trigger.dev/run", // Must start with "trigger.dev/" to render + timeUnixNano: "1711200000000000000", + attributes: [ + { key: "event", value: { stringValue: "dequeue" } }, // The actual event type + { key: "duration", value: { intValue: 150 } }, // Optional: duration in ms + ] +} +``` + +## Admin-Only Events + +`getAdminOnlyForEvent()` controls visibility. Events default to **admin-only** (`true`). + +| Event | Admin-only | Friendly name | +|-------|-----------|---------------| +| `dequeue` | No | Dequeued | +| `fork` | No | Launched | +| `import` | No (if no fork event) | Importing task file | +| `create_attempt` | Yes | Attempt created | +| `lazy_payload` | Yes | Lazy attempt initialized | +| `pod_scheduled` | Yes | Pod scheduled | +| (default) | Yes | (raw event name) | + +## Adding New Timeline Events + +1. Add OTLP span event with `name: "trigger.dev/<name>"` and `properties.event: "<event>"` +2. Event timestamp must be strictly after the parent span's `startTimeUnixNano` (ClickHouse drops earlier events) +3. Add friendly name in `getFriendlyNameForEvent()` in `app/utils/timelineSpanEvents.ts` +4. Set admin visibility in `getAdminOnlyForEvent()` +5. 
Optionally add help text in `getHelpTextForEvent()` + +## Key Files + +- `app/utils/timelineSpanEvents.ts` - filtering, naming, admin logic +- `app/components/run/RunTimeline.tsx` - `SpanTimeline` component (thin line + thick bar rendering) +- `app/presenters/v3/SpanPresenter.server.ts` - loads span data including events +- `app/v3/eventRepository/clickhouseEventRepository.server.ts` - `spanEventsToTaskEventV1Input()` (storage filter), `#mergeRecordsIntoSpanDetail` (reassembly) diff --git a/.claude/skills/trigger-dev-tasks/SKILL.md b/.claude/skills/trigger-dev-tasks/SKILL.md new file mode 100644 index 00000000000..791c22c27ed --- /dev/null +++ b/.claude/skills/trigger-dev-tasks/SKILL.md @@ -0,0 +1,200 @@ +--- +name: trigger-dev-tasks +description: Use this skill when writing, designing, or optimizing Trigger.dev background tasks and workflows. This includes creating reliable async tasks, implementing AI workflows, setting up scheduled jobs, structuring complex task hierarchies with subtasks, configuring build extensions for tools like ffmpeg or Puppeteer/Playwright, and handling task schemas with Zod validation. +allowed-tools: Read, Write, Edit, Glob, Grep, Bash +--- + +# Trigger.dev Task Expert + +You are an expert Trigger.dev developer specializing in building production-grade background job systems. Tasks deployed to Trigger.dev run in Node.js 21+ and use the `@trigger.dev/sdk` package. + +## Critical Rules + +1. **Always use `@trigger.dev/sdk`** - Never use `@trigger.dev/sdk/v3` or deprecated `client.defineJob` pattern +2. **Never use `node-fetch`** - Use the built-in `fetch` function +3. **Export all tasks** - Every task must be exported, including subtasks +4. 
**Never wrap wait/trigger calls in Promise.all** - `triggerAndWait`, `batchTriggerAndWait`, and `wait.*` calls cannot be wrapped in `Promise.all` or `Promise.allSettled` + +## Basic Task Pattern + +```ts +import { task } from "@trigger.dev/sdk"; + +export const processData = task({ + id: "process-data", + retry: { + maxAttempts: 10, + factor: 1.8, + minTimeoutInMs: 500, + maxTimeoutInMs: 30_000, + }, + run: async (payload: { userId: string; data: any[] }) => { + console.log(`Processing ${payload.data.length} items`); + return { processed: payload.data.length }; + }, +}); +``` + +## Schema Task (with validation) + +```ts +import { schemaTask } from "@trigger.dev/sdk"; +import { z } from "zod"; + +export const validatedTask = schemaTask({ + id: "validated-task", + schema: z.object({ + name: z.string(), + email: z.string().email(), + }), + run: async (payload) => { + // Payload is automatically validated and typed + return { message: `Hello ${payload.name}` }; + }, +}); +``` + +## Triggering Tasks + +### From Backend Code (type-only import to prevent dependency leakage) + +```ts +import { tasks } from "@trigger.dev/sdk"; +import type { processData } from "./trigger/tasks"; + +const handle = await tasks.trigger("process-data", { + userId: "123", + data: [{ id: 1 }], +}); +``` + +### From Inside Tasks + +```ts +export const parentTask = task({ + id: "parent-task", + run: async (payload) => { + // Trigger and wait - returns Result object, NOT direct output + const result = await childTask.triggerAndWait({ data: "value" }); + if (result.ok) { + console.log("Output:", result.output); + } else { + console.error("Failed:", result.error); + } + + // Or unwrap directly (throws on error) + const output = await childTask.triggerAndWait({ data: "value" }).unwrap(); + }, +}); +``` + +## Idempotency (Critical for Retries) + +Always use idempotency keys when triggering tasks from inside other tasks: + +```ts +import { idempotencyKeys } from "@trigger.dev/sdk"; + +export const 
paymentTask = task({ + id: "process-payment", + run: async (payload: { orderId: string }) => { + // Scoped to current run - survives retries + const key = await idempotencyKeys.create(`payment-${payload.orderId}`); + + await chargeCustomer.trigger(payload, { + idempotencyKey: key, + idempotencyKeyTTL: "24h", + }); + }, +}); +``` + +## Trigger Options + +```ts +await myTask.trigger(payload, { + delay: "1h", // Delay execution + ttl: "10m", // Cancel if not started within TTL + idempotencyKey: key, + queue: "my-queue", + machine: "large-1x", // micro, small-1x, small-2x, medium-1x, medium-2x, large-1x, large-2x + maxAttempts: 3, + tags: ["user_123"], // Max 10 tags + debounce: { // Consolidate rapid triggers + key: "unique-key", + delay: "5s", + mode: "trailing", // "leading" (default) or "trailing" + }, +}); +``` + +## Debouncing + +Consolidate multiple triggers into a single execution: + +```ts +// Rapid triggers with same key = single execution +await myTask.trigger({ userId: "123" }, { + debounce: { + key: "user-123-update", + delay: "5s", + }, +}); + +// Trailing mode: use payload from LAST trigger +await myTask.trigger({ data: "latest" }, { + debounce: { + key: "my-key", + delay: "10s", + mode: "trailing", + }, +}); +``` + +Use cases: user activity updates, webhook deduplication, search indexing, notification batching. + +## Batch Triggering + +Up to 1,000 items per batch, 3MB per payload: + +```ts +const results = await myTask.batchTriggerAndWait([ + { payload: { userId: "1" } }, + { payload: { userId: "2" } }, +]); + +for (const result of results) { + if (result.ok) console.log(result.output); +} +``` + +## Machine Presets + +| Preset | vCPU | Memory | +|-------------|------|--------| +| micro | 0.25 | 0.25GB | +| small-1x | 0.5 | 0.5GB | +| small-2x | 1 | 1GB | +| medium-1x | 1 | 2GB | +| medium-2x | 2 | 4GB | +| large-1x | 4 | 8GB | +| large-2x | 8 | 16GB | + +## Design Principles + +1. 
**Break complex workflows into subtasks** that can be independently retried and made idempotent +2. **Don't over-complicate** - Sometimes `Promise.allSettled` inside a single task is better than many subtasks (each task has dedicated process and is charged by millisecond) +3. **Always configure retries** - Set appropriate `maxAttempts` based on the operation +4. **Use idempotency keys** - Especially for payment/critical operations +5. **Group related subtasks** - Keep subtasks only used by one parent in the same file, don't export them +6. **Use logger** - Log at key execution points with `logger.info()`, `logger.error()`, etc. + +## Reference Documentation + +For detailed documentation on specific topics, read these files: + +- `basic-tasks.md` - Task basics, triggering, waits +- `advanced-tasks.md` - Tags, queues, concurrency, metadata, error handling +- `scheduled-tasks.md` - Cron schedules, declarative and imperative +- `realtime.md` - Real-time subscriptions, streams, React hooks +- `config.md` - trigger.config.ts, build extensions (Prisma, Playwright, FFmpeg, etc.) 
diff --git a/.claude/skills/trigger-dev-tasks/advanced-tasks.md b/.claude/skills/trigger-dev-tasks/advanced-tasks.md new file mode 100644 index 00000000000..32a00337f89 --- /dev/null +++ b/.claude/skills/trigger-dev-tasks/advanced-tasks.md @@ -0,0 +1,485 @@ +# Trigger.dev Advanced Tasks (v4) + +**Advanced patterns and features for writing tasks** + +## Tags & Organization + +```ts +import { task, tags } from "@trigger.dev/sdk"; + +export const processUser = task({ + id: "process-user", + run: async (payload: { userId: string; orgId: string }, { ctx }) => { + // Add tags during execution + await tags.add(`user_${payload.userId}`); + await tags.add(`org_${payload.orgId}`); + + return { processed: true }; + }, +}); + +// Trigger with tags +await processUser.trigger( + { userId: "123", orgId: "abc" }, + { tags: ["priority", "user_123", "org_abc"] } // Max 10 tags per run +); + +// Subscribe to tagged runs +for await (const run of runs.subscribeToRunsWithTag("user_123")) { + console.log(`User task ${run.id}: ${run.status}`); +} +``` + +**Tag Best Practices:** + +- Use prefixes: `user_123`, `org_abc`, `video:456` +- Max 10 tags per run, 1-64 characters each +- Tags don't propagate to child tasks automatically + +## Batch Triggering v2 + +Enhanced batch triggering with larger payloads and streaming ingestion. 
+ +### Limits + +- **Maximum batch size**: 1,000 items (increased from 500) +- **Payload per item**: 3MB each (increased from 1MB combined) +- Payloads > 512KB automatically offload to object storage + +### Rate Limiting (per environment) + +| Tier | Bucket Size | Refill Rate | +|------|-------------|-------------| +| Free | 1,200 runs | 100 runs/10 sec | +| Hobby | 5,000 runs | 500 runs/5 sec | +| Pro | 5,000 runs | 500 runs/5 sec | + +### Concurrent Batch Processing + +| Tier | Concurrent Batches | +|------|-------------------| +| Free | 1 | +| Hobby | 10 | +| Pro | 10 | + +### Usage + +```ts +import { myTask } from "./trigger/myTask"; + +// Basic batch trigger (up to 1,000 items) +const runs = await myTask.batchTrigger([ + { payload: { userId: "user-1" } }, + { payload: { userId: "user-2" } }, + { payload: { userId: "user-3" } }, +]); + +// Batch trigger with wait +const results = await myTask.batchTriggerAndWait([ + { payload: { userId: "user-1" } }, + { payload: { userId: "user-2" } }, +]); + +for (const result of results) { + if (result.ok) { + console.log("Result:", result.output); + } +} + +// With per-item options +const batchHandle = await myTask.batchTrigger([ + { + payload: { userId: "123" }, + options: { + idempotencyKey: "user-123-batch", + tags: ["priority"], + }, + }, + { + payload: { userId: "456" }, + options: { + idempotencyKey: "user-456-batch", + }, + }, +]); +``` + +## Debouncing + +Consolidate multiple triggers into a single execution by debouncing task runs with a unique key and delay window. 
+ +### Use Cases + +- **User activity updates**: Batch rapid user actions into a single run +- **Webhook deduplication**: Handle webhook bursts without redundant processing +- **Search indexing**: Combine document updates instead of processing individually +- **Notification batching**: Group notifications to prevent user spam + +### Basic Usage + +```ts +await myTask.trigger( + { userId: "123" }, + { + debounce: { + key: "user-123-update", // Unique identifier for debounce group + delay: "5s", // Wait duration ("5s", "1m", or milliseconds) + }, + } +); +``` + +### Execution Modes + +**Leading Mode** (default): Uses payload/options from the first trigger; subsequent triggers only reschedule execution time. + +```ts +// First trigger sets the payload +await myTask.trigger({ action: "first" }, { + debounce: { key: "my-key", delay: "10s" } +}); + +// Second trigger only reschedules - payload remains "first" +await myTask.trigger({ action: "second" }, { + debounce: { key: "my-key", delay: "10s" } +}); +// Task executes with { action: "first" } +``` + +**Trailing Mode**: Uses payload/options from the most recent trigger. 
+ +```ts +await myTask.trigger( + { data: "latest-value" }, + { + debounce: { + key: "trailing-example", + delay: "10s", + mode: "trailing", + }, + } +); +``` + +In trailing mode, these options update with each trigger: +- `payload` — task input data +- `metadata` — run metadata +- `tags` — run tags (replaces existing) +- `maxAttempts` — retry attempts +- `maxDuration` — maximum compute time +- `machine` — machine preset + +### Important Notes + +- Idempotency keys take precedence over debounce settings +- Compatible with `triggerAndWait()` — parent runs block correctly on debounced execution +- Debounce key is scoped to the task + +## Concurrency & Queues + +```ts +import { task, queue } from "@trigger.dev/sdk"; + +// Shared queue for related tasks +const emailQueue = queue({ + name: "email-processing", + concurrencyLimit: 5, // Max 5 emails processing simultaneously +}); + +// Task-level concurrency +export const oneAtATime = task({ + id: "sequential-task", + queue: { concurrencyLimit: 1 }, // Process one at a time + run: async (payload) => { + // Critical section - only one instance runs + }, +}); + +// Per-user concurrency +export const processUserData = task({ + id: "process-user-data", + run: async (payload: { userId: string }) => { + // Override queue with user-specific concurrency + await childTask.trigger(payload, { + queue: { + name: `user-${payload.userId}`, + concurrencyLimit: 2, + }, + }); + }, +}); + +export const emailTask = task({ + id: "send-email", + queue: emailQueue, // Use shared queue + run: async (payload: { to: string }) => { + // Send email logic + }, +}); +``` + +## Error Handling & Retries + +```ts +import { task, retry, AbortTaskRunError } from "@trigger.dev/sdk"; + +export const resilientTask = task({ + id: "resilient-task", + retry: { + maxAttempts: 10, + factor: 1.8, // Exponential backoff multiplier + minTimeoutInMs: 500, + maxTimeoutInMs: 30_000, + randomize: false, + }, + catchError: async ({ error, ctx }) => { + // Custom error 
handling + if (error.code === "FATAL_ERROR") { + throw new AbortTaskRunError("Cannot retry this error"); + } + + // Log error details + console.error(`Task ${ctx.task.id} failed:`, error); + + // Allow retry by returning nothing + return { retryAt: new Date(Date.now() + 60000) }; // Retry in 1 minute + }, + run: async (payload) => { + // Retry specific operations + const result = await retry.onThrow( + async () => { + return await unstableApiCall(payload); + }, + { maxAttempts: 3 } + ); + + // Conditional HTTP retries + const response = await retry.fetch("https://api.example.com", { + retry: { + maxAttempts: 5, + condition: (response, error) => { + return response?.status === 429 || response?.status >= 500; + }, + }, + }); + + return result; + }, +}); +``` + +## Machines & Performance + +```ts +export const heavyTask = task({ + id: "heavy-computation", + machine: { preset: "large-2x" }, // 8 vCPU, 16 GB RAM + maxDuration: 1800, // 30 minutes timeout + run: async (payload, { ctx }) => { + // Resource-intensive computation + if (ctx.machine.preset === "large-2x") { + // Use all available cores + return await parallelProcessing(payload); + } + + return await standardProcessing(payload); + }, +}); + +// Override machine when triggering +await heavyTask.trigger(payload, { + machine: { preset: "medium-1x" }, // Override for this run +}); +``` + +**Machine Presets:** + +- `micro`: 0.25 vCPU, 0.25 GB RAM +- `small-1x`: 0.5 vCPU, 0.5 GB RAM (default) +- `small-2x`: 1 vCPU, 1 GB RAM +- `medium-1x`: 1 vCPU, 2 GB RAM +- `medium-2x`: 2 vCPU, 4 GB RAM +- `large-1x`: 4 vCPU, 8 GB RAM +- `large-2x`: 8 vCPU, 16 GB RAM + +## Idempotency + +```ts +import { task, idempotencyKeys } from "@trigger.dev/sdk"; + +export const paymentTask = task({ + id: "process-payment", + retry: { + maxAttempts: 3, + }, + run: async (payload: { orderId: string; amount: number }) => { + // Automatically scoped to this task run, so if the task is retried, the idempotency key will be the same + const 
idempotencyKey = await idempotencyKeys.create(`payment-${payload.orderId}`); + + // Ensure payment is processed only once + await chargeCustomer.trigger(payload, { + idempotencyKey, + idempotencyKeyTTL: "24h", // Key expires in 24 hours + }); + }, +}); + +// Payload-based idempotency +import { createHash } from "node:crypto"; + +function createPayloadHash(payload: any): string { + const hash = createHash("sha256"); + hash.update(JSON.stringify(payload)); + return hash.digest("hex"); +} + +export const deduplicatedTask = task({ + id: "deduplicated-task", + run: async (payload) => { + const payloadHash = createPayloadHash(payload); + const idempotencyKey = await idempotencyKeys.create(payloadHash); + + await processData.trigger(payload, { idempotencyKey }); + }, +}); +``` + +## Metadata & Progress Tracking + +```ts +import { task, metadata } from "@trigger.dev/sdk"; + +export const batchProcessor = task({ + id: "batch-processor", + run: async (payload: { items: any[] }, { ctx }) => { + const totalItems = payload.items.length; + + // Initialize progress metadata + metadata + .set("progress", 0) + .set("totalItems", totalItems) + .set("processedItems", 0) + .set("status", "starting"); + + const results = []; + + for (let i = 0; i < payload.items.length; i++) { + const item = payload.items[i]; + + // Process item + const result = await processItem(item); + results.push(result); + + // Update progress + const progress = ((i + 1) / totalItems) * 100; + metadata + .set("progress", progress) + .increment("processedItems", 1) + .append("logs", `Processed item ${i + 1}/${totalItems}`) + .set("currentItem", item.id); + } + + // Final status + metadata.set("status", "completed"); + + return { results, totalProcessed: results.length }; + }, +}); + +// Update parent metadata from child task +export const childTask = task({ + id: "child-task", + run: async (payload, { ctx }) => { + // Update parent task metadata + metadata.parent.set("childStatus", "processing"); + 
metadata.root.increment("childrenCompleted", 1); + + return { processed: true }; + }, +}); +``` + +## Logging & Tracing + +```ts +import { task, logger } from "@trigger.dev/sdk"; + +export const tracedTask = task({ + id: "traced-task", + run: async (payload, { ctx }) => { + logger.info("Task started", { userId: payload.userId }); + + // Custom trace with attributes + const user = await logger.trace( + "fetch-user", + async (span) => { + span.setAttribute("user.id", payload.userId); + span.setAttribute("operation", "database-fetch"); + + const userData = await database.findUser(payload.userId); + span.setAttribute("user.found", !!userData); + + return userData; + }, + { userId: payload.userId } + ); + + logger.debug("User fetched", { user: user.id }); + + try { + const result = await processUser(user); + logger.info("Processing completed", { result }); + return result; + } catch (error) { + logger.error("Processing failed", { + error: error.message, + userId: payload.userId, + }); + throw error; + } + }, +}); +``` + +## Hidden Tasks + +```ts +// Hidden task - not exported, only used internally +const internalProcessor = task({ + id: "internal-processor", + run: async (payload: { data: string }) => { + return { processed: payload.data.toUpperCase() }; + }, +}); + +// Public task that uses hidden task +export const publicWorkflow = task({ + id: "public-workflow", + run: async (payload: { input: string }) => { + // Use hidden task internally + const result = await internalProcessor.triggerAndWait({ + data: payload.input, + }); + + if (result.ok) { + return { output: result.output.processed }; + } + + throw new Error("Internal processing failed"); + }, +}); +``` + +## Best Practices + +- **Concurrency**: Use queues to prevent overwhelming external services +- **Retries**: Configure exponential backoff for transient failures +- **Idempotency**: Always use for payment/critical operations +- **Metadata**: Track progress for long-running tasks +- **Machines**: Match machine 
size to computational requirements +- **Tags**: Use consistent naming patterns for filtering +- **Debouncing**: Use for user activity, webhooks, and notification batching +- **Batch triggering**: Use for bulk operations up to 1,000 items +- **Error Handling**: Distinguish between retryable and fatal errors + +Design tasks to be stateless, idempotent, and resilient to failures. Use metadata for state tracking and queues for resource management. diff --git a/.claude/skills/trigger-dev-tasks/basic-tasks.md b/.claude/skills/trigger-dev-tasks/basic-tasks.md new file mode 100644 index 00000000000..56bff340761 --- /dev/null +++ b/.claude/skills/trigger-dev-tasks/basic-tasks.md @@ -0,0 +1,199 @@ +# Trigger.dev Basic Tasks (v4) + +**MUST use `@trigger.dev/sdk`, NEVER `client.defineJob`** + +## Basic Task + +```ts +import { task } from "@trigger.dev/sdk"; + +export const processData = task({ + id: "process-data", + retry: { + maxAttempts: 10, + factor: 1.8, + minTimeoutInMs: 500, + maxTimeoutInMs: 30_000, + randomize: false, + }, + run: async (payload: { userId: string; data: any[] }) => { + // Task logic - runs for long time, no timeouts + console.log(`Processing ${payload.data.length} items for user ${payload.userId}`); + return { processed: payload.data.length }; + }, +}); +``` + +## Schema Task (with validation) + +```ts +import { schemaTask } from "@trigger.dev/sdk"; +import { z } from "zod"; + +export const validatedTask = schemaTask({ + id: "validated-task", + schema: z.object({ + name: z.string(), + age: z.number(), + email: z.string().email(), + }), + run: async (payload) => { + // Payload is automatically validated and typed + return { message: `Hello ${payload.name}, age ${payload.age}` }; + }, +}); +``` + +## Triggering Tasks + +### From Backend Code + +```ts +import { tasks } from "@trigger.dev/sdk"; +import type { processData } from "./trigger/tasks"; + +// Single trigger +const handle = await tasks.trigger("process-data", { + userId: "123", + data: [{ id: 1 }, 
{ id: 2 }], +}); + +// Batch trigger (up to 1,000 items, 3MB per payload) +const batchHandle = await tasks.batchTrigger("process-data", [ + { payload: { userId: "123", data: [{ id: 1 }] } }, + { payload: { userId: "456", data: [{ id: 2 }] } }, +]); +``` + +### Debounced Triggering + +Consolidate multiple triggers into a single execution: + +```ts +// Multiple rapid triggers with same key = single execution +await myTask.trigger( + { userId: "123" }, + { + debounce: { + key: "user-123-update", // Unique key for debounce group + delay: "5s", // Wait before executing + }, + } +); + +// Trailing mode: use payload from LAST trigger +await myTask.trigger( + { data: "latest-value" }, + { + debounce: { + key: "trailing-example", + delay: "10s", + mode: "trailing", // Default is "leading" (first payload) + }, + } +); +``` + +**Debounce modes:** +- `leading` (default): Uses payload from first trigger, subsequent triggers only reschedule +- `trailing`: Uses payload from most recent trigger + +### From Inside Tasks (with Result handling) + +```ts +export const parentTask = task({ + id: "parent-task", + run: async (payload) => { + // Trigger and continue + const handle = await childTask.trigger({ data: "value" }); + + // Trigger and wait - returns Result object, NOT task output + const result = await childTask.triggerAndWait({ data: "value" }); + if (result.ok) { + console.log("Task output:", result.output); // Actual task return value + } else { + console.error("Task failed:", result.error); + } + + // Quick unwrap (throws on error) + const output = await childTask.triggerAndWait({ data: "value" }).unwrap(); + + // Batch trigger and wait + const results = await childTask.batchTriggerAndWait([ + { payload: { data: "item1" } }, + { payload: { data: "item2" } }, + ]); + + for (const run of results) { + if (run.ok) { + console.log("Success:", run.output); + } else { + console.log("Failed:", run.error); + } + } + }, +}); + +export const childTask = task({ + id: "child-task", + run: 
async (payload: { data: string }) => { + return { processed: payload.data }; + }, +}); +``` + +> Never wrap triggerAndWait or batchTriggerAndWait calls in a Promise.all or Promise.allSettled as this is not supported in Trigger.dev tasks. + +## Waits + +```ts +import { task, wait } from "@trigger.dev/sdk"; + +export const taskWithWaits = task({ + id: "task-with-waits", + run: async (payload) => { + console.log("Starting task"); + + // Wait for specific duration + await wait.for({ seconds: 30 }); + await wait.for({ minutes: 5 }); + await wait.for({ hours: 1 }); + await wait.for({ days: 1 }); + + // Wait until specific date + await wait.until({ date: new Date("2024-12-25") }); + + // Wait for token (from external system) + await wait.forToken({ + token: "user-approval-token", + timeoutInSeconds: 3600, // 1 hour timeout + }); + + console.log("All waits completed"); + return { status: "completed" }; + }, +}); +``` + +> Never wrap wait calls in a Promise.all or Promise.allSettled as this is not supported in Trigger.dev tasks. + +## Key Points + +- **Result vs Output**: `triggerAndWait()` returns a `Result` object with `ok`, `output`, `error` properties - NOT the direct task output +- **Type safety**: Use `import type` for task references when triggering from backend +- **Waits > 5 seconds**: Automatically checkpointed, don't count toward compute usage +- **Debounce + idempotency**: Idempotency keys take precedence over debounce settings + +## NEVER Use (v2 deprecated) + +```ts +// BREAKS APPLICATION +client.defineJob({ + id: "job-id", + run: async (payload, io) => { + /* ... 
*/ + }, +}); +``` + +Use SDK (`@trigger.dev/sdk`), check `result.ok` before accessing `result.output` diff --git a/.claude/skills/trigger-dev-tasks/config.md b/.claude/skills/trigger-dev-tasks/config.md new file mode 100644 index 00000000000..f6a4db1c4b8 --- /dev/null +++ b/.claude/skills/trigger-dev-tasks/config.md @@ -0,0 +1,346 @@ +# Trigger.dev Configuration + +**Complete guide to configuring `trigger.config.ts` with build extensions** + +## Basic Configuration + +```ts +import { defineConfig } from "@trigger.dev/sdk"; + +export default defineConfig({ + project: "", // Required: Your project reference + dirs: ["./trigger"], // Task directories + runtime: "node", // "node", "node-22", or "bun" + logLevel: "info", // "debug", "info", "warn", "error" + + // Default retry settings + retries: { + enabledInDev: false, + default: { + maxAttempts: 3, + minTimeoutInMs: 1000, + maxTimeoutInMs: 10000, + factor: 2, + randomize: true, + }, + }, + + // Build configuration + build: { + autoDetectExternal: true, + keepNames: true, + minify: false, + extensions: [], // Build extensions go here + }, + + // Global lifecycle hooks + onStartAttempt: async ({ payload, ctx }) => { + console.log("Global task start"); + }, + onSuccess: async ({ payload, output, ctx }) => { + console.log("Global task success"); + }, + onFailure: async ({ payload, error, ctx }) => { + console.log("Global task failure"); + }, +}); +``` + +## Build Extensions + +### Database & ORM + +#### Prisma + +```ts +import { prismaExtension } from "@trigger.dev/build/extensions/prisma"; + +extensions: [ + prismaExtension({ + schema: "prisma/schema.prisma", + version: "5.19.0", // Optional: specify version + migrate: true, // Run migrations during build + directUrlEnvVarName: "DIRECT_DATABASE_URL", + typedSql: true, // Enable TypedSQL support + }), +]; +``` + +#### TypeScript Decorators (for TypeORM) + +```ts +import { emitDecoratorMetadata } from "@trigger.dev/build/extensions/typescript"; + +extensions: [ + 
emitDecoratorMetadata(), // Enables decorator metadata +]; +``` + +### Scripting Languages + +#### Python + +```ts +import { pythonExtension } from "@trigger.dev/build/extensions/python"; + +extensions: [ + pythonExtension({ + scripts: ["./python/**/*.py"], // Copy Python files + requirementsFile: "./requirements.txt", // Install packages + devPythonBinaryPath: ".venv/bin/python", // Dev mode binary + }), +]; + +// Usage in tasks +const result = await python.runInline(`print("Hello, world!")`); +const output = await python.runScript("./python/script.py", ["arg1"]); +``` + +### Browser Automation + +#### Playwright + +```ts +import { playwright } from "@trigger.dev/build/extensions/playwright"; + +extensions: [ + playwright({ + browsers: ["chromium", "firefox", "webkit"], // Default: ["chromium"] + headless: true, // Default: true + }), +]; +``` + +#### Puppeteer + +```ts +import { puppeteer } from "@trigger.dev/build/extensions/puppeteer"; + +extensions: [puppeteer()]; + +// Environment variable needed: +// PUPPETEER_EXECUTABLE_PATH: "/usr/bin/google-chrome-stable" +``` + +#### Lightpanda + +```ts +import { lightpanda } from "@trigger.dev/build/extensions/lightpanda"; + +extensions: [ + lightpanda({ + version: "latest", // or "nightly" + disableTelemetry: false, + }), +]; +``` + +### Media Processing + +#### FFmpeg + +```ts +import { ffmpeg } from "@trigger.dev/build/extensions/core"; + +extensions: [ + ffmpeg({ version: "7" }), // Static build, or omit for Debian version +]; + +// Automatically sets FFMPEG_PATH and FFPROBE_PATH +// Add fluent-ffmpeg to external packages if using +``` + +#### Audio Waveform + +```ts +import { audioWaveform } from "@trigger.dev/build/extensions/audioWaveform"; + +extensions: [ + audioWaveform(), // Installs Audio Waveform 1.1.0 +]; +``` + +### System & Package Management + +#### System Packages (apt-get) + +```ts +import { aptGet } from "@trigger.dev/build/extensions/core"; + +extensions: [ + aptGet({ + packages: ["ffmpeg", 
"imagemagick", "curl=7.68.0-1"], // Can specify versions + }), +]; +``` + +#### Additional NPM Packages + +Only use this for installing CLI tools, NOT packages you import in your code. + +```ts +import { additionalPackages } from "@trigger.dev/build/extensions/core"; + +extensions: [ + additionalPackages({ + packages: ["wrangler"], // CLI tools and specific versions + }), +]; +``` + +#### Additional Files + +```ts +import { additionalFiles } from "@trigger.dev/build/extensions/core"; + +extensions: [ + additionalFiles({ + files: ["wrangler.toml", "./assets/**", "./fonts/**"], // Glob patterns supported + }), +]; +``` + +### Environment & Build Tools + +#### Environment Variable Sync + +```ts +import { syncEnvVars } from "@trigger.dev/build/extensions/core"; + +extensions: [ + syncEnvVars(async (ctx) => { + // ctx contains: environment, projectRef, env + return [ + { name: "SECRET_KEY", value: await getSecret(ctx.environment) }, + { name: "API_URL", value: ctx.environment === "prod" ? "api.prod.com" : "api.dev.com" }, + ]; + }), +]; +``` + +#### ESBuild Plugins + +```ts +import { esbuildPlugin } from "@trigger.dev/build/extensions"; +import { sentryEsbuildPlugin } from "@sentry/esbuild-plugin"; + +extensions: [ + esbuildPlugin( + sentryEsbuildPlugin({ + org: process.env.SENTRY_ORG, + project: process.env.SENTRY_PROJECT, + authToken: process.env.SENTRY_AUTH_TOKEN, + }), + { placement: "last", target: "deploy" } // Optional config + ), +]; +``` + +## Custom Build Extensions + +```ts +import { defineConfig } from "@trigger.dev/sdk"; + +const customExtension = { + name: "my-custom-extension", + + externalsForTarget: (target) => { + return ["some-native-module"]; // Add external dependencies + }, + + onBuildStart: async (context) => { + console.log(`Build starting for ${context.target}`); + // Register esbuild plugins, modify build context + }, + + onBuildComplete: async (context, manifest) => { + console.log("Build complete, adding layers"); + // Add build layers, 
modify deployment + context.addLayer({ + id: "my-layer", + files: [{ source: "./custom-file", destination: "/app/custom" }], + commands: ["chmod +x /app/custom"], + }); + }, +}; + +export default defineConfig({ + project: "my-project", + build: { + extensions: [customExtension], + }, +}); +``` + +## Advanced Configuration + +### Telemetry + +```ts +import { PrismaInstrumentation } from "@prisma/instrumentation"; +import { OpenAIInstrumentation } from "@langfuse/openai"; + +export default defineConfig({ + // ... other config + telemetry: { + instrumentations: [new PrismaInstrumentation(), new OpenAIInstrumentation()], + exporters: [customExporter], // Optional custom exporters + }, +}); +``` + +### Machine & Performance + +```ts +export default defineConfig({ + // ... other config + defaultMachine: "large-1x", // Default machine for all tasks + maxDuration: 300, // Default max duration (seconds) + enableConsoleLogging: true, // Console logging in development +}); +``` + +## Common Extension Combinations + +### Full-Stack Web App + +```ts +extensions: [ + prismaExtension({ schema: "prisma/schema.prisma", migrate: true }), + additionalFiles({ files: ["./public/**", "./assets/**"] }), + syncEnvVars(async (ctx) => [...envVars]), +]; +``` + +### AI/ML Processing + +```ts +extensions: [ + pythonExtension({ + scripts: ["./ai/**/*.py"], + requirementsFile: "./requirements.txt", + }), + ffmpeg({ version: "7" }), + additionalPackages({ packages: ["wrangler"] }), +]; +``` + +### Web Scraping + +```ts +extensions: [ + playwright({ browsers: ["chromium"] }), + puppeteer(), + additionalFiles({ files: ["./selectors.json", "./proxies.txt"] }), +]; +``` + +## Best Practices + +- **Use specific versions**: Pin extension versions for reproducible builds +- **External packages**: Add modules with native addons to the `build.external` array +- **Environment sync**: Use `syncEnvVars` for dynamic secrets +- **File paths**: Use glob patterns for flexible file inclusion +- **Debug builds**: 
Use `--log-level debug --dry-run` for troubleshooting + +Extensions only affect deployment, not local development. Use `external` array for packages that shouldn't be bundled. diff --git a/.claude/skills/trigger-dev-tasks/realtime.md b/.claude/skills/trigger-dev-tasks/realtime.md new file mode 100644 index 00000000000..c1c4c5821a9 --- /dev/null +++ b/.claude/skills/trigger-dev-tasks/realtime.md @@ -0,0 +1,244 @@ +# Trigger.dev Realtime + +**Real-time monitoring and updates for runs** + +## Core Concepts + +Realtime allows you to: + +- Subscribe to run status changes, metadata updates, and streams +- Build real-time dashboards and UI updates +- Monitor task progress from frontend and backend + +## Authentication + +### Public Access Tokens + +```ts +import { auth } from "@trigger.dev/sdk"; + +// Read-only token for specific runs +const publicToken = await auth.createPublicToken({ + scopes: { + read: { + runs: ["run_123", "run_456"], + tasks: ["my-task-1", "my-task-2"], + }, + }, + expirationTime: "1h", // Default: 15 minutes +}); +``` + +### Trigger Tokens (Frontend only) + +```ts +// Single-use token for triggering tasks +const triggerToken = await auth.createTriggerPublicToken("my-task", { + expirationTime: "30m", +}); +``` + +## Backend Usage + +### Subscribe to Runs + +```ts +import { runs, tasks } from "@trigger.dev/sdk"; + +// Trigger and subscribe +const handle = await tasks.trigger("my-task", { data: "value" }); + +// Subscribe to specific run +for await (const run of runs.subscribeToRun(handle.id)) { + console.log(`Status: ${run.status}, Progress: ${run.metadata?.progress}`); + if (run.status === "COMPLETED") break; +} + +// Subscribe to runs with tag +for await (const run of runs.subscribeToRunsWithTag("user-123")) { + console.log(`Tagged run ${run.id}: ${run.status}`); +} + +// Subscribe to batch +for await (const run of runs.subscribeToBatch(batchId)) { + console.log(`Batch run ${run.id}: ${run.status}`); +} +``` + +### Realtime Streams v2 + +```ts 
+import { streams, InferStreamType } from "@trigger.dev/sdk"; + +// 1. Define streams (shared location) +export const aiStream = streams.define({ + id: "ai-output", +}); + +export type AIStreamPart = InferStreamType; + +// 2. Pipe from task +export const streamingTask = task({ + id: "streaming-task", + run: async (payload) => { + const completion = await openai.chat.completions.create({ + model: "gpt-4", + messages: [{ role: "user", content: payload.prompt }], + stream: true, + }); + + const { waitUntilComplete } = aiStream.pipe(completion); + await waitUntilComplete(); + }, +}); + +// 3. Read from backend +const stream = await aiStream.read(runId, { + timeoutInSeconds: 300, + startIndex: 0, // Resume from specific chunk +}); + +for await (const chunk of stream) { + console.log("Chunk:", chunk); // Fully typed +} +``` + +## React Frontend Usage + +### Installation + +```bash +npm add @trigger.dev/react-hooks +``` + +### Triggering Tasks + +```tsx +"use client"; +import { useTaskTrigger, useRealtimeTaskTrigger } from "@trigger.dev/react-hooks"; +import type { myTask } from "../trigger/tasks"; + +function TriggerComponent({ accessToken }: { accessToken: string }) { + // Basic trigger + const { submit, handle, isLoading } = useTaskTrigger("my-task", { + accessToken, + }); + + // Trigger with realtime updates + const { + submit: realtimeSubmit, + run, + isLoading: isRealtimeLoading, + } = useRealtimeTaskTrigger("my-task", { accessToken }); + + return ( +
+ + + + + {run &&
Status: {run.status}
} +
+ ); +} +``` + +### Subscribing to Runs + +```tsx +"use client"; +import { useRealtimeRun, useRealtimeRunsWithTag } from "@trigger.dev/react-hooks"; +import type { myTask } from "../trigger/tasks"; + +function SubscribeComponent({ runId, accessToken }: { runId: string; accessToken: string }) { + // Subscribe to specific run + const { run, error } = useRealtimeRun(runId, { + accessToken, + onComplete: (run) => { + console.log("Task completed:", run.output); + }, + }); + + // Subscribe to tagged runs + const { runs } = useRealtimeRunsWithTag("user-123", { accessToken }); + + if (error) return
Error: {error.message}
; + if (!run) return
Loading...
; + + return ( +
+
Status: {run.status}
+
Progress: {run.metadata?.progress || 0}%
+ {run.output &&
Result: {JSON.stringify(run.output)}
} + +

Tagged Runs:

+ {runs.map((r) => ( +
+ {r.id}: {r.status} +
+ ))} +
+ ); +} +``` + +### Realtime Streams with React + +```tsx +"use client"; +import { useRealtimeStream } from "@trigger.dev/react-hooks"; +import { aiStream } from "../trigger/streams"; + +function StreamComponent({ runId, accessToken }: { runId: string; accessToken: string }) { + // Pass defined stream directly for type safety + const { parts, error } = useRealtimeStream(aiStream, runId, { + accessToken, + timeoutInSeconds: 300, + throttleInMs: 50, // Control re-render frequency + }); + + if (error) return
Error: {error.message}
; + if (!parts) return
Loading...
; + + const text = parts.join(""); // parts is typed as AIStreamPart[] + + return
Streamed Text: {text}
; +} +``` + +### Wait Tokens + +```tsx +"use client"; +import { useWaitToken } from "@trigger.dev/react-hooks"; + +function WaitTokenComponent({ tokenId, accessToken }: { tokenId: string; accessToken: string }) { + const { complete } = useWaitToken(tokenId, { accessToken }); + + return ; +} +``` + +## Run Object Properties + +Key properties available in run subscriptions: + +- `id`: Unique run identifier +- `status`: `QUEUED`, `EXECUTING`, `COMPLETED`, `FAILED`, `CANCELED`, etc. +- `payload`: Task input data (typed) +- `output`: Task result (typed, when completed) +- `metadata`: Real-time updatable data +- `createdAt`, `updatedAt`: Timestamps +- `costInCents`: Execution cost + +## Best Practices + +- **Use Realtime over SWR**: Recommended for most use cases due to rate limits +- **Scope tokens properly**: Only grant necessary read/trigger permissions +- **Handle errors**: Always check for errors in hooks and subscriptions +- **Type safety**: Use task types for proper payload/output typing +- **Cleanup subscriptions**: Backend subscriptions auto-complete, frontend hooks auto-cleanup diff --git a/.claude/skills/trigger-dev-tasks/scheduled-tasks.md b/.claude/skills/trigger-dev-tasks/scheduled-tasks.md new file mode 100644 index 00000000000..b314753497f --- /dev/null +++ b/.claude/skills/trigger-dev-tasks/scheduled-tasks.md @@ -0,0 +1,113 @@ +# Scheduled Tasks (Cron) + +Recurring tasks using cron. For one-off future runs, use the **delay** option. + +## Define a Scheduled Task + +```ts +import { schedules } from "@trigger.dev/sdk"; + +export const task = schedules.task({ + id: "first-scheduled-task", + run: async (payload) => { + payload.timestamp; // Date (scheduled time, UTC) + payload.lastTimestamp; // Date | undefined + payload.timezone; // IANA, e.g. 
"America/New_York" (default "UTC") + payload.scheduleId; // string + payload.externalId; // string | undefined + payload.upcoming; // Date[] + + payload.timestamp.toLocaleString("en-US", { timeZone: payload.timezone }); + }, +}); +``` + +> Scheduled tasks need at least one schedule attached to run. + +## Attach Schedules + +**Declarative (sync on dev/deploy):** + +```ts +schedules.task({ + id: "every-2h", + cron: "0 */2 * * *", // UTC + run: async () => {}, +}); + +schedules.task({ + id: "tokyo-5am", + cron: { pattern: "0 5 * * *", timezone: "Asia/Tokyo", environments: ["PRODUCTION", "STAGING"] }, + run: async () => {}, +}); +``` + +**Imperative (SDK or dashboard):** + +```ts +await schedules.create({ + task: task.id, + cron: "0 0 * * *", + timezone: "America/New_York", // DST-aware + externalId: "user_123", + deduplicationKey: "user_123-daily", // updates if reused +}); +``` + +### Dynamic / Multi-tenant Example + +```ts +// /trigger/reminder.ts +export const reminderTask = schedules.task({ + id: "todo-reminder", + run: async (p) => { + if (!p.externalId) throw new Error("externalId is required"); + const user = await db.getUser(p.externalId); + await sendReminderEmail(user); + }, +}); +``` + +```ts +// app/reminders/route.ts +export async function POST(req: Request) { + const data = await req.json(); + return Response.json( + await schedules.create({ + task: reminderTask.id, + cron: "0 8 * * *", + timezone: data.timezone, + externalId: data.userId, + deduplicationKey: `${data.userId}-reminder`, + }) + ); +} +``` + +## Cron Syntax (no seconds) + +``` +* * * * * +| | | | └ day of week (0–7 or 1L–7L; 0/7=Sun; L=last) +| | | └── month (1–12) +| | └──── day of month (1–31 or L) +| └────── hour (0–23) +└──────── minute (0–59) +``` + +## When Schedules Won't Trigger + +- **Dev:** only when the dev CLI is running. +- **Staging/Production:** only for tasks in the **latest deployment**. 
+ +## SDK Management + +```ts +await schedules.retrieve(id); +await schedules.list(); +await schedules.update(id, { cron: "0 0 1 * *", externalId: "ext", deduplicationKey: "key" }); +await schedules.deactivate(id); +await schedules.activate(id); +await schedules.del(id); +await schedules.timezones(); // list of IANA timezones +``` diff --git a/.configs/tsconfig.base.json b/.configs/tsconfig.base.json new file mode 100644 index 00000000000..2d560d22d0f --- /dev/null +++ b/.configs/tsconfig.base.json @@ -0,0 +1,36 @@ +{ + "compilerOptions": { + "target": "es2022", + "lib": ["ES2022", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "module": "NodeNext", + "moduleResolution": "NodeNext", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "jsx": "react", + + "strict": true, + "alwaysStrict": true, + "strictPropertyInitialization": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "noImplicitAny": true, + "noImplicitReturns": true, + "noImplicitThis": true, + + "noFallthroughCasesInSwitch": true, + "resolveJsonModule": true, + + "removeComments": false, + "esModuleInterop": true, + "emitDecoratorMetadata": false, + "experimentalDecorators": false, + "downlevelIteration": true, + "isolatedModules": true, + "noUncheckedIndexedAccess": true, + + "pretty": true, + "customConditions": ["@triggerdotdev/source"] + } +} diff --git a/.cursor/commands/deslop.md b/.cursor/commands/deslop.md new file mode 100644 index 00000000000..d82835663f7 --- /dev/null +++ b/.cursor/commands/deslop.md @@ -0,0 +1,11 @@ +# Remove AI code slop + +Check the diff against main, and remove all AI generated slop introduced in this branch. 
+ +This includes: +- Extra comments that a human wouldn't add or is inconsistent with the rest of the file +- Extra defensive checks or try/catch blocks that are abnormal for that area of the codebase (especially if called by trusted / validated codepaths) +- Casts to any to get around type issues +- Any other style that is inconsistent with the file + +Report at the end with only a 1-3 sentence summary of what you changed \ No newline at end of file diff --git a/.cursor/mcp.json b/.cursor/mcp.json new file mode 100644 index 00000000000..da39e4ffafe --- /dev/null +++ b/.cursor/mcp.json @@ -0,0 +1,3 @@ +{ + "mcpServers": {} +} diff --git a/.cursor/rules/executing-commands.mdc b/.cursor/rules/executing-commands.mdc new file mode 100644 index 00000000000..0d36b449491 --- /dev/null +++ b/.cursor/rules/executing-commands.mdc @@ -0,0 +1,24 @@ +--- +description: how to run commands in the monorepo +globs: +alwaysApply: true +--- +Almost all commands in the monorepo should be executed when `pnpm run ...` from the root of the monorepo. For example, running tests for the `@internal/run-engine` internal package: + +``` +pnpm run dev --filter webapp +``` + +But often, when running tests, it's better to `cd` into the directory and then run tests: + +``` +cd apps/webapp +pnpm run test --run +``` + +This way you can run for a single file easily: + +``` +cd internal-packages/run-engine +pnpm run test ./src/engine/tests/ttl.test.ts --run +``` diff --git a/.cursor/rules/migrations.mdc b/.cursor/rules/migrations.mdc new file mode 100644 index 00000000000..370c87c051d --- /dev/null +++ b/.cursor/rules/migrations.mdc @@ -0,0 +1,6 @@ +--- +description: how to create and apply database migrations +alwaysApply: false +--- + +Follow our [migrations.md](mdc:ai/references/migrations.md) guide for how to create and apply database migrations. 
diff --git a/.cursor/rules/otel-metrics.mdc b/.cursor/rules/otel-metrics.mdc new file mode 100644 index 00000000000..218f07c41e2 --- /dev/null +++ b/.cursor/rules/otel-metrics.mdc @@ -0,0 +1,66 @@ +--- +description: Guidelines for creating OpenTelemetry metrics to avoid cardinality issues +globs: + - "**/*.ts" +--- + +# OpenTelemetry Metrics Guidelines + +When creating or editing OTEL metrics (counters, histograms, gauges), always ensure metric attributes have **low cardinality**. + +## What is Cardinality? + +Cardinality refers to the number of unique values an attribute can have. Each unique combination of attribute values creates a new time series, which consumes memory and storage in your metrics backend. + +## Rules + +### DO use low-cardinality attributes: +- **Enums**: `environment_type` (PRODUCTION, STAGING, DEVELOPMENT, PREVIEW) +- **Booleans**: `hasFailures`, `streaming`, `success` +- **Bounded error codes**: A finite, controlled set of error types +- **Shard IDs**: When sharding is bounded (e.g., 0-15) + +### DO NOT use high-cardinality attributes: +- **UUIDs/IDs**: `envId`, `userId`, `runId`, `projectId`, `organizationId` +- **Unbounded integers**: `itemCount`, `batchSize`, `retryCount` +- **Timestamps**: `createdAt`, `startTime` +- **Free-form strings**: `errorMessage`, `taskName`, `queueName` + +## Example + +```typescript +// BAD - High cardinality +this.counter.add(1, { + envId: options.environmentId, // UUID - unbounded + itemCount: options.runCount, // Integer - unbounded +}); + +// GOOD - Low cardinality +this.counter.add(1, { + environment_type: options.environmentType, // Enum - 4 values + streaming: true, // Boolean - 2 values +}); +``` + +## Prometheus Metric Naming + +When metrics are exported via OTLP to Prometheus, the exporter automatically adds unit suffixes to metric names: + +| OTel Metric Name | Unit | Prometheus Name | +|------------------|------|-----------------| +| `my_duration_ms` | `ms` | `my_duration_ms_milliseconds` | +| 
`my_counter` | counter | `my_counter_total` | +| `items_inserted` | counter | `items_inserted_inserts_total` | +| `batch_size` | histogram | `batch_size_items_bucket` | + +Keep this in mind when writing Grafana dashboards or Prometheus queries—the metric names in Prometheus will differ from the names defined in code. + +## Reference + +See the schedule engine (`internal-packages/schedule-engine/src/engine/index.ts`) for a good example of low-cardinality metric attributes. + +High cardinality metrics can cause: +- Memory bloat in metrics backends (Axiom, Prometheus, etc.) +- Slow queries and dashboard timeouts +- Increased costs (many backends charge per time series) +- Potential data loss or crashes at scale diff --git a/.cursor/rules/repo.mdc b/.cursor/rules/repo.mdc new file mode 100644 index 00000000000..460c3893656 --- /dev/null +++ b/.cursor/rules/repo.mdc @@ -0,0 +1,6 @@ +--- +description: understanding the structure of the monorepo +globs: +alwaysApply: true +--- +We've documented the structure of our monorepo here: [repo.md](mdc:ai/references/repo.md) \ No newline at end of file diff --git a/.cursor/rules/webapp.mdc b/.cursor/rules/webapp.mdc new file mode 100644 index 00000000000..a362f14fe12 --- /dev/null +++ b/.cursor/rules/webapp.mdc @@ -0,0 +1,40 @@ +--- +description: Making updates to the main trigger.dev remix webapp +globs: apps/webapp/**/*.tsx,apps/webapp/**/*.ts +alwaysApply: false +--- + +The main trigger.dev webapp, which powers its API and dashboard and makes up the docker image that is produced as an OSS image, is a Remix 2.1.0 app that uses an express server, written in TypeScript. The following subsystems are either included in the webapp or are used by the webapp in another part of the monorepo: + +- `@trigger.dev/database` exports a Prisma 6.14.0 client that is used extensively in the webapp to access a PostgreSQL instance. 
The schema file is [schema.prisma](mdc:internal-packages/database/prisma/schema.prisma) +- `@trigger.dev/core` is a published package and is used to share code between the `@trigger.dev/sdk` and the webapp. It includes functionality but also a load of Zod schemas for data validation. When importing from `@trigger.dev/core` in the webapp, we never import the root `@trigger.dev/core` path, instead we favor one of the subpath exports that you can find in [package.json](mdc:packages/core/package.json) +- `@internal/run-engine` has all the code needed to trigger a run and take it through its lifecycle to completion. +- `@trigger.dev/redis-worker` is a custom redis based background job/worker system that's used in the webapp and also used inside the run engine. + +## Environment variables and testing + +In the webapp, all environment variables are accessed through the `env` export of [env.server.ts](mdc:apps/webapp/app/env.server.ts), instead of directly accessing `process.env`. + +Ideally, the `env.server.ts` file would never be imported into a test file, either directly or indirectly. Tests should only import classes and functions from a file matching `app/**/*.ts` of the webapp, and that file should not use environment variables, everything should be passed through as options instead. This "service/configuration" separation is important, and can be seen in a few places in the code for examples: + +- [realtimeClient.server.ts](mdc:apps/webapp/app/services/realtimeClient.server.ts) is the testable service, and [realtimeClientGlobal.server.ts](mdc:apps/webapp/app/services/realtimeClientGlobal.server.ts) is the configuration + +Also for writing tests in the webapp, check out our [tests.md](mdc:ai/references/tests.md) guide + +## Legacy run engine vs Run Engine 2.0 + +We originally built the Trigger.dev "Run Engine" not as a single system, but just spread out all over the codebase, with no real separation or encapsulation. And we didn't even call it a "Run Engine". 
With Run Engine 2.0, we've completely rewritten big parts of the way the system works, and moved it over to an internal package called `@internal/run-engine`. So we've retroactively named the previous run engine "Legacy run engine". We're focused almost exclusively now on moving to Run Engine 2.0 and will be deprecating and removing the legacy run engine code eventually. + +## Where to look for code + +- The trigger API endpoint is [api.v1.tasks.$taskId.trigger.ts](mdc:apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts) +- The batch trigger API endpoint is [api.v1.tasks.batch.ts](mdc:apps/webapp/app/routes/api.v1.tasks.batch.ts) +- Setup code for the prisma client is in [db.server.ts](mdc:apps/webapp/app/db.server.ts) +- The run engine is configured in [runEngine.server.ts](mdc:apps/webapp/app/v3/runEngine.server.ts) +- All the "services" that are found in app/v3/services/\*_/_.server.ts +- The code for the TaskEvent data, which is the otel data sent from tasks to our servers, is in both the [eventRepository.server.ts](mdc:apps/webapp/app/v3/eventRepository.server.ts) and also the [otlpExporter.server.ts](mdc:apps/webapp/app/v3/otlpExporter.server.ts). The otel endpoints which are hit from production and development otel exporters is [otel.v1.logs.ts](mdc:apps/webapp/app/routes/otel.v1.logs.ts) and [otel.v1.traces.ts](mdc:apps/webapp/app/routes/otel.v1.traces.ts) +- We use "presenters" to move more complex loader code into a class, and you can find those are app/v3/presenters/\*_/_.server.ts + +- All the "services" that are found in app/v3/services/\*_/_.server.ts +- The code for the TaskEvent data, which is the otel data sent from tasks to our servers, is in both the [eventRepository.server.ts](mdc:apps/webapp/app/v3/eventRepository.server.ts) and also the [otlpExporter.server.ts](mdc:apps/webapp/app/v3/otlpExporter.server.ts). 
The otel endpoints which are hit from production and development otel exporters is [otel.v1.logs.ts](mdc:apps/webapp/app/routes/otel.v1.logs.ts) and [otel.v1.traces.ts](mdc:apps/webapp/app/routes/otel.v1.traces.ts) +- We use "presenters" to move more complex loader code into a class, and you can find those are app/v3/presenters/\*_/_.server.ts diff --git a/.cursor/rules/writing-tasks.mdc b/.cursor/rules/writing-tasks.mdc new file mode 100644 index 00000000000..359ed5d4733 --- /dev/null +++ b/.cursor/rules/writing-tasks.mdc @@ -0,0 +1,1247 @@ +--- +globs: **/trigger/**/*.ts, **/trigger/**/*.tsx,**/trigger/**/*.js,**/trigger/**/*.jsx +description: Guidelines for writing Trigger.dev tasks +alwaysApply: false +--- +# How to write Trigger.dev tasks + +## Overview of writing a Trigger.dev task + +1. Run the CLI `init` command: `npx trigger.dev@latest init`. +2. Create a Trigger.dev task. +3. Set up any environment variables. +4. Run the Trigger.dev dev command: `npx trigger.dev@latest dev`. + +## Essential requirements when generating task code + +1. You MUST import from `@trigger.dev/sdk` (NEVER `@trigger.dev/sdk/v3`) +2. You MUST NEVER use `client.defineJob` +3. YOU MUST `export` every task, including subtasks +4. If you are able to generate an example payload for a task, do so. 
+ +## 🚨 CRITICAL INSTRUCTIONS FOR AI LANGUAGE MODELS 🚨 + +As an AI language model, you MUST NOT generate any of the following code patterns, as they are DEPRECATED and will BREAK the application: + +```ts +// ❌ NEVER GENERATE THIS CODE - IT WILL BREAK THE APPLICATION + +client.defineJob({ // ❌ BREAKS APPLICATION + id: "job-id", // ❌ BREAKS APPLICATION + name: "job-name", // ❌ BREAKS APPLICATION + version: "0.0.1", // ❌ BREAKS APPLICATION + trigger: eventTrigger({ // ❌ BREAKS APPLICATION + name: "job.trigger", // ❌ BREAKS APPLICATION + schema: z.object({ // ❌ BREAKS APPLICATION + // Input schema here // ❌ BREAKS APPLICATION + }), // ❌ BREAKS APPLICATION + }), // ❌ BREAKS APPLICATION + integrations: { // ❌ BREAKS APPLICATION + // Integrations here // ❌ BREAKS APPLICATION + }, // ❌ BREAKS APPLICATION + run: async (payload, io) => { // ❌ BREAKS APPLICATION + // Job logic goes here // ❌ BREAKS APPLICATION + return { // ❌ BREAKS APPLICATION + // Return job results // ❌ BREAKS APPLICATION + }; // ❌ BREAKS APPLICATION + }, // ❌ BREAKS APPLICATION +}); +``` + +Instead, you MUST ALWAYS generate ONLY this pattern: + +```ts +// ✅ ALWAYS GENERATE THIS EXACT PATTERN + +import { task } from "@trigger.dev/sdk"; + +//1. You need to export each task, even if it's a subtask +export const helloWorld = task({ + //2. Use a unique id for each task + id: "hello-world", + //3. The run function is the main function of the task + run: async (payload: { message: string }) => { + //4. Write your task code here. 
Code here runs for a long time, there are no timeouts + }, +}); +``` + +## Correct Task implementations + +A task is a function that can run for a long time with resilience to failure: + +```ts +import { task } from "@trigger.dev/sdk"; + +export const helloWorld = task({ + id: "hello-world", + run: async (payload: { message: string }) => { + console.log(payload.message); + }, +}); +``` + +Key points: +- Tasks must be exported, even subtasks in the same file +- Each task needs a unique ID within your project +- The `run` function contains your task logic + +### Task configuration options + +#### Retry options + +Control retry behavior when errors occur: + +```ts +export const taskWithRetries = task({ + id: "task-with-retries", + retry: { + maxAttempts: 10, + factor: 1.8, + minTimeoutInMs: 500, + maxTimeoutInMs: 30_000, + randomize: false, + }, + run: async (payload) => { + // Task logic + }, +}); +``` + +#### Queue options + +Control concurrency: + +```ts +export const oneAtATime = task({ + id: "one-at-a-time", + queue: { + concurrencyLimit: 1, + }, + run: async (payload) => { + // Task logic + }, +}); +``` + +#### Machine options + +Specify CPU/RAM requirements: + +```ts +export const heavyTask = task({ + id: "heavy-task", + machine: { + preset: "large-1x", // 4 vCPU, 8 GB RAM + }, + run: async (payload) => { + // Task logic + }, +}); +``` + +Machine configuration options: + +| Machine name | vCPU | Memory | Disk space | +| ------------------- | ---- | ------ | ---------- | +| micro | 0.25 | 0.25 | 10GB | +| small-1x (default) | 0.5 | 0.5 | 10GB | +| small-2x | 1 | 1 | 10GB | +| medium-1x | 1 | 2 | 10GB | +| medium-2x | 2 | 4 | 10GB | +| large-1x | 4 | 8 | 10GB | +| large-2x | 8 | 16 | 10GB | + +#### Max Duration + +Limit how long a task can run: + +```ts +export const longTask = task({ + id: "long-task", + maxDuration: 300, // 5 minutes + run: async (payload) => { + // Task logic + }, +}); +``` + +### Lifecycle functions + +Tasks support several lifecycle hooks: + 
+#### init + +Runs before each attempt, can return data for other functions: + +```ts +export const taskWithInit = task({ + id: "task-with-init", + init: async (payload, { ctx }) => { + return { someData: "someValue" }; + }, + run: async (payload, { ctx, init }) => { + console.log(init.someData); // "someValue" + }, +}); +``` + +#### cleanup + +Runs after each attempt, regardless of success/failure: + +```ts +export const taskWithCleanup = task({ + id: "task-with-cleanup", + cleanup: async (payload, { ctx }) => { + // Cleanup resources + }, + run: async (payload, { ctx }) => { + // Task logic + }, +}); +``` + +#### onStart + +Runs once when a task starts (not on retries): + +```ts +export const taskWithOnStart = task({ + id: "task-with-on-start", + onStart: async (payload, { ctx }) => { + // Send notification, log, etc. + }, + run: async (payload, { ctx }) => { + // Task logic + }, +}); +``` + +#### onSuccess + +Runs when a task succeeds: + +```ts +export const taskWithOnSuccess = task({ + id: "task-with-on-success", + onSuccess: async (payload, output, { ctx }) => { + // Handle success + }, + run: async (payload, { ctx }) => { + // Task logic + }, +}); +``` + +#### onFailure + +Runs when a task fails after all retries: + +```ts +export const taskWithOnFailure = task({ + id: "task-with-on-failure", + onFailure: async (payload, error, { ctx }) => { + // Handle failure + }, + run: async (payload, { ctx }) => { + // Task logic + }, +}); +``` + +#### handleError + +Controls error handling and retry behavior: + +```ts +export const taskWithErrorHandling = task({ + id: "task-with-error-handling", + handleError: async (error, { ctx }) => { + // Custom error handling + }, + run: async (payload, { ctx }) => { + // Task logic + }, +}); +``` + +Global lifecycle hooks can also be defined in `trigger.config.ts` to apply to all tasks. 
+ +## Correct Schedules task (cron) implementations + +```ts +import { schedules } from "@trigger.dev/sdk"; + +export const firstScheduledTask = schedules.task({ + id: "first-scheduled-task", + run: async (payload) => { + //when the task was scheduled to run + //note this will be slightly different from new Date() because it takes a few ms to run the task + console.log(payload.timestamp); //is a Date object + + //when the task was last run + //this can be undefined if it's never been run + console.log(payload.lastTimestamp); //is a Date object or undefined + + //the timezone the schedule was registered with, defaults to "UTC" + //this is in IANA format, e.g. "America/New_York" + //See the full list here: https://cloud.trigger.dev/timezones + console.log(payload.timezone); //is a string + + //If you want to output the time in the user's timezone do this: + const formatted = payload.timestamp.toLocaleString("en-US", { + timeZone: payload.timezone, + }); + + //the schedule id (you can have many schedules for the same task) + //using this you can remove the schedule, update it, etc + console.log(payload.scheduleId); //is a string + + //you can optionally provide an external id when creating the schedule + //usually you would set this to a userId or some other unique identifier + //this can be undefined if you didn't provide one + console.log(payload.externalId); //is a string or undefined + + //the next 5 dates this task is scheduled to run + console.log(payload.upcoming); //is an array of Date objects + }, +}); +``` + +### Attach a Declarative schedule + +```ts +import { schedules } from "@trigger.dev/sdk"; + +// Specify a cron pattern (UTC) +export const firstScheduledTask = schedules.task({ + id: "first-scheduled-task", + //every two hours (UTC timezone) + cron: "0 */2 * * *", + run: async (payload, { ctx }) => { + //do something + }, +}); +``` + +```ts +import { schedules } from "@trigger.dev/sdk"; + +// Specify a specific timezone like this: +export const
secondScheduledTask = schedules.task({ + id: "second-scheduled-task", + cron: { + //5am every day Tokyo time + pattern: "0 5 * * *", + timezone: "Asia/Tokyo", + }, + run: async (payload) => {}, +}); +``` + +### Attach an Imperative schedule + +Create schedules explicitly for tasks using the dashboard's "New schedule" button or the SDK. + +#### Benefits +- Dynamic creation (e.g., one schedule per user) +- Manage without code deployment: + - Activate/disable + - Edit + - Delete + +#### Implementation +1. Define a task using `⁠schedules.task()` +2. Attach one or more schedules via: + - Dashboard + - SDK + +#### Attach schedules with the SDK like this + +```ts +const createdSchedule = await schedules.create({ + //The id of the scheduled task you want to attach to. + task: firstScheduledTask.id, + //The schedule in cron format. + cron: "0 0 * * *", + //this is required, it prevents you from creating duplicate schedules. It will update the schedule if it already exists. + deduplicationKey: "my-deduplication-key", +}); +``` + +## Correct Schema task implementations + +Schema tasks validate payloads against a schema before execution: + +```ts +import { schemaTask } from "@trigger.dev/sdk"; +import { z } from "zod"; + +const myTask = schemaTask({ + id: "my-task", + schema: z.object({ + name: z.string(), + age: z.number(), + }), + run: async (payload) => { + // Payload is typed and validated + console.log(payload.name, payload.age); + }, +}); +``` + +## Correct implementations for triggering a task from your backend + +When you trigger a task from your backend code, you need to set the `TRIGGER_SECRET_KEY` environment variable. You can find the value on the API keys page in the Trigger.dev dashboard. + +### tasks.trigger() + +Triggers a single run of a task with specified payload and options without importing the task. Use type-only imports for full type checking. 
+ +```ts +import { tasks } from "@trigger.dev/sdk"; +import type { emailSequence } from "~/trigger/emails"; + +export async function POST(request: Request) { + const data = await request.json(); + const handle = await tasks.trigger("email-sequence", { + to: data.email, + name: data.name, + }); + return Response.json(handle); +} +``` + +### tasks.batchTrigger() + +Triggers multiple runs of a single task with different payloads without importing the task. + +```ts +import { tasks } from "@trigger.dev/sdk"; +import type { emailSequence } from "~/trigger/emails"; + +export async function POST(request: Request) { + const data = await request.json(); + const batchHandle = await tasks.batchTrigger( + "email-sequence", + data.users.map((u) => ({ payload: { to: u.email, name: u.name } })) + ); + return Response.json(batchHandle); +} +``` + +### batch.trigger() + +Triggers multiple runs of different tasks at once, useful when you need to execute multiple tasks simultaneously. + +```ts +import { batch } from "@trigger.dev/sdk"; +import type { myTask1, myTask2 } from "~/trigger/myTasks"; + +export async function POST(request: Request) { + const data = await request.json(); + const result = await batch.trigger([ + { id: "my-task-1", payload: { some: data.some } }, + { id: "my-task-2", payload: { other: data.other } }, + ]); + return Response.json(result); +} +``` + +## Correct implementations for triggering a task from inside another task + +### yourTask.trigger() + +Triggers a single run of a task with specified payload and options. + +```ts +import { myOtherTask, runs } from "~/trigger/my-other-task"; + +export const myTask = task({ + id: "my-task", + run: async (payload: string) => { + const handle = await myOtherTask.trigger({ foo: "some data" }); + + const run = await runs.retrieve(handle); + // Do something with the run + }, +}); +``` + +If you need to call `trigger()` on a task in a loop, use `batchTrigger()` instead which can trigger up to 500 runs in a single call. 
+ +### yourTask.batchTrigger() + +Triggers multiple runs of a single task with different payloads. + +```ts +import { myOtherTask, batch } from "~/trigger/my-other-task"; + +export const myTask = task({ + id: "my-task", + run: async (payload: string) => { + const batchHandle = await myOtherTask.batchTrigger([{ payload: "some data" }]); + + //...do other stuff + const batch = await batch.retrieve(batchHandle.id); + }, +}); +``` + +### yourTask.triggerAndWait() + +Triggers a task and waits for the result, useful when you need to call a different task and use its result. + +```ts +export const parentTask = task({ + id: "parent-task", + run: async (payload: string) => { + const result = await childTask.triggerAndWait("some-data"); + console.log("Result", result); + + //...do stuff with the result + }, +}); +``` + +The result object needs to be checked to see if the child task run was successful. You can also use the `unwrap` method to get the output directly or handle errors with `SubtaskUnwrapError`. This method should only be used inside a task. + +### yourTask.batchTriggerAndWait() + +Batch triggers a task and waits for all results, useful for fan-out patterns. + +```ts +export const batchParentTask = task({ + id: "parent-task", + run: async (payload: string) => { + const results = await childTask.batchTriggerAndWait([ + { payload: "item4" }, + { payload: "item5" }, + { payload: "item6" }, + ]); + console.log("Results", results); + + //...do stuff with the result + }, +}); +``` + +You can handle run failures by inspecting individual run results and implementing custom error handling strategies. This method should only be used inside a task. + +### batch.triggerAndWait() + +Batch triggers multiple different tasks and waits for all results. 
+ +```ts +export const parentTask = task({ + id: "parent-task", + run: async (payload: string) => { + const results = await batch.triggerAndWait([ + { id: "child-task-1", payload: { foo: "World" } }, + { id: "child-task-2", payload: { bar: 42 } }, + ]); + + for (const result of results) { + if (result.ok) { + switch (result.taskIdentifier) { + case "child-task-1": + console.log("Child task 1 output", result.output); + break; + case "child-task-2": + console.log("Child task 2 output", result.output); + break; + } + } + } + }, +}); +``` + +### batch.triggerByTask() + +Batch triggers multiple tasks by passing task instances, useful for static task sets. + +```ts +export const parentTask = task({ + id: "parent-task", + run: async (payload: string) => { + const results = await batch.triggerByTask([ + { task: childTask1, payload: { foo: "World" } }, + { task: childTask2, payload: { bar: 42 } }, + ]); + + const run1 = await runs.retrieve(results.runs[0]); + const run2 = await runs.retrieve(results.runs[1]); + }, +}); +``` + +### batch.triggerByTaskAndWait() + +Batch triggers multiple tasks by passing task instances and waits for all results. + +```ts +export const parentTask = task({ + id: "parent-task", + run: async (payload: string) => { + const { runs } = await batch.triggerByTaskAndWait([ + { task: childTask1, payload: { foo: "World" } }, + { task: childTask2, payload: { bar: 42 } }, + ]); + + if (runs[0].ok) { + console.log("Child task 1 output", runs[0].output); + } + + if (runs[1].ok) { + console.log("Child task 2 output", runs[1].output); + } + }, +}); +``` + +## Correct Metadata implementation + +### Overview + +Metadata allows attaching up to 256KB of structured data to a run, which can be accessed during execution, via API, Realtime, and in the dashboard. Useful for storing user information, tracking progress, or saving intermediate results. 
+ +### Basic Usage + +Add metadata when triggering a task: + +```ts +const handle = await myTask.trigger( + { message: "hello world" }, + { metadata: { user: { name: "Eric", id: "user_1234" } } } +); +``` + +Access metadata inside a run: + +```ts +import { task, metadata } from "@trigger.dev/sdk"; + +export const myTask = task({ + id: "my-task", + run: async (payload: { message: string }) => { + // Get the whole metadata object + const currentMetadata = metadata.current(); + + // Get a specific key + const user = metadata.get("user"); + console.log(user.name); // "Eric" + }, +}); +``` + +### Update methods + +Metadata can be updated as the run progresses: + +- **set**: `metadata.set("progress", 0.5)` +- **del**: `metadata.del("progress")` +- **replace**: `metadata.replace({ user: { name: "Eric" } })` +- **append**: `metadata.append("logs", "Step 1 complete")` +- **remove**: `metadata.remove("logs", "Step 1 complete")` +- **increment**: `metadata.increment("progress", 0.4)` +- **decrement**: `metadata.decrement("progress", 0.4)` +- **stream**: `await metadata.stream("logs", readableStream)` +- **flush**: `await metadata.flush()` + +Updates can be chained with a fluent API: + +```ts +metadata.set("progress", 0.1) + .append("logs", "Step 1 complete") + .increment("progress", 0.4); +``` + +### Parent & root updates + +Child tasks can update parent task metadata: + +```ts +export const childTask = task({ + id: "child-task", + run: async (payload: { message: string }) => { + // Update parent task's metadata + metadata.parent.set("progress", 0.5); + + // Update root task's metadata + metadata.root.set("status", "processing"); + }, +}); +``` + +### Type safety + +Metadata accepts any JSON-serializable object. 
For type safety, consider wrapping with Zod: + +```ts +import { z } from "zod"; + +const Metadata = z.object({ + user: z.object({ + name: z.string(), + id: z.string(), + }), + date: z.coerce.date(), +}); + +function getMetadata() { + return Metadata.parse(metadata.current()); +} +``` + +### Important notes + +- Metadata methods only work inside run functions or task lifecycle hooks +- Metadata is NOT automatically propagated to child tasks +- Maximum size is 256KB (configurable if self-hosting) +- Objects like Dates are serialized to strings and must be deserialized when retrieved + +## Correct Realtime implementation + +### Overview + +Trigger.dev Realtime enables subscribing to runs for real-time updates on run status, useful for monitoring tasks, updating UIs, and building realtime dashboards. It's built on Electric SQL, a PostgreSQL syncing engine. + +### Basic usage + +Subscribe to a run after triggering a task: + +```ts +import { runs, tasks } from "@trigger.dev/sdk"; + +async function myBackend() { + const handle = await tasks.trigger("my-task", { some: "data" }); + + for await (const run of runs.subscribeToRun(handle.id)) { + console.log(run); // Logs the run every time it changes + } +} +``` + +### Subscription methods + +- **subscribeToRun**: Subscribe to changes for a specific run +- **subscribeToRunsWithTag**: Subscribe to changes for all runs with a specific tag +- **subscribeToBatch**: Subscribe to changes for all runs in a batch + +### Type safety + +You can infer types of run's payload and output by passing the task type: + +```ts +import { runs } from "@trigger.dev/sdk"; +import type { myTask } from "./trigger/my-task"; + +for await (const run of runs.subscribeToRun(handle.id)) { + console.log(run.payload.some); // Type-safe access to payload + + if (run.output) { + console.log(run.output.result); // Type-safe access to output + } +} +``` + +### Realtime Streams + +Stream data in realtime from inside your tasks using the metadata system: + +```ts 
+import { task, metadata } from "@trigger.dev/sdk"; +import OpenAI from "openai"; + +export type STREAMS = { + openai: OpenAI.ChatCompletionChunk; +}; + +export const myTask = task({ + id: "my-task", + run: async (payload: { prompt: string }) => { + const completion = await openai.chat.completions.create({ + messages: [{ role: "user", content: payload.prompt }], + model: "gpt-3.5-turbo", + stream: true, + }); + + // Register the stream with the key "openai" + const stream = await metadata.stream("openai", completion); + + let text = ""; + for await (const chunk of stream) { + text += chunk.choices.map((choice) => choice.delta?.content).join(""); + } + + return { text }; + }, +}); +``` + +Subscribe to streams using `withStreams`: + +```ts +for await (const part of runs.subscribeToRun(runId).withStreams()) { + switch (part.type) { + case "run": { + console.log("Received run", part.run); + break; + } + case "openai": { + console.log("Received OpenAI chunk", part.chunk); + break; + } + } +} +``` + +## Realtime hooks + +### Installation + +```bash +npm add @trigger.dev/react-hooks +``` + +### Authentication + +All hooks require a Public Access Token. 
You can provide it directly to each hook: + +```ts +import { useRealtimeRun } from "@trigger.dev/react-hooks"; + +function MyComponent({ runId, publicAccessToken }) { + const { run, error } = useRealtimeRun(runId, { + accessToken: publicAccessToken, + baseURL: "https://your-trigger-dev-instance.com", // Optional for self-hosting + }); +} +``` + +Or use the `TriggerAuthContext` provider: + +```ts +import { TriggerAuthContext } from "@trigger.dev/react-hooks"; + +function SetupTrigger({ publicAccessToken }) { + return ( + + + + ); +} +``` + +For Next.js App Router, wrap the provider in a client component: + +```ts +// components/TriggerProvider.tsx +"use client"; + +import { TriggerAuthContext } from "@trigger.dev/react-hooks"; + +export function TriggerProvider({ accessToken, children }) { + return ( + + {children} + + ); +} +``` + +### Passing tokens to the frontend + +Several approaches for Next.js App Router: + +1. **Using cookies**: +```ts +// Server action +export async function startRun() { + const handle = await tasks.trigger("example", { foo: "bar" }); + cookies().set("publicAccessToken", handle.publicAccessToken); + redirect(`/runs/${handle.id}`); +} + +// Page component +export default function RunPage({ params }) { + const publicAccessToken = cookies().get("publicAccessToken"); + return ( + + + + ); +} +``` + +2. **Using query parameters**: +```ts +// Server action +export async function startRun() { + const handle = await tasks.trigger("example", { foo: "bar" }); + redirect(`/runs/${handle.id}?publicAccessToken=${handle.publicAccessToken}`); +} +``` + +3. 
**Server-side token generation**: +```ts +// Page component +export default async function RunPage({ params }) { + const publicAccessToken = await generatePublicAccessToken(params.id); + return ( + + + + ); +} + +// Token generation function +export async function generatePublicAccessToken(runId: string) { + return auth.createPublicToken({ + scopes: { + read: { + runs: [runId], + }, + }, + expirationTime: "1h", + }); +} +``` + +### Hook types + +#### SWR hooks + +Data fetching hooks that use SWR for caching: + +```ts +"use client"; +import { useRun } from "@trigger.dev/react-hooks"; +import type { myTask } from "@/trigger/myTask"; + +function MyComponent({ runId }) { + const { run, error, isLoading } = useRun(runId); + + if (isLoading) return
<div>Loading...</div>
; + if (error) return <div>Error: {error.message}</div>
; + + return <div>Run: {run.id}</div>
; +} +``` + +Common options: +- `revalidateOnFocus`: Revalidate when window regains focus +- `revalidateOnReconnect`: Revalidate when network reconnects +- `refreshInterval`: Polling interval in milliseconds + +#### Realtime hooks + +Hooks that use Trigger.dev's realtime API for live updates (recommended over polling). + +For most use cases, Realtime hooks are preferred over SWR hooks with polling due to better performance and lower API usage. + +### Authentication + +For client-side usage, generate a public access token with appropriate scopes: + +```ts +import { auth } from "@trigger.dev/sdk"; + +const publicToken = await auth.createPublicToken({ + scopes: { + read: { + runs: ["run_1234"], + }, + }, +}); +``` + +## Correct Idempotency implementation + +Idempotency ensures that an operation produces the same result when called multiple times. Trigger.dev supports idempotency at the task level through the `idempotencyKey` option. + +### Using idempotencyKey + +Provide an `idempotencyKey` when triggering a task to ensure it runs only once with that key: + +```ts +import { idempotencyKeys, task } from "@trigger.dev/sdk"; + +export const myTask = task({ + id: "my-task", + retry: { + maxAttempts: 4, + }, + run: async (payload: any) => { + // Create a key unique to this task run + const idempotencyKey = await idempotencyKeys.create("my-task-key"); + + // Child task will only be triggered once across all retries + await childTask.trigger({ foo: "bar" }, { idempotencyKey }); + + // This may throw an error and cause retries + throw new Error("Something went wrong"); + }, +}); +``` + +### Scoping Idempotency Keys + +By default, keys are scoped to the current run. 
You can create globally unique keys: + +```ts +const idempotencyKey = await idempotencyKeys.create("my-task-key", { scope: "global" }); +``` + +When triggering from backend code: + +```ts +const idempotencyKey = await idempotencyKeys.create([myUser.id, "my-task"]); +await tasks.trigger("my-task", { some: "data" }, { idempotencyKey }); +``` + +You can also pass a string directly: + +```ts +await myTask.trigger({ some: "data" }, { idempotencyKey: myUser.id }); +``` + +### Time-To-Live (TTL) + +The `idempotencyKeyTTL` option defines a time window during which duplicate triggers return the original run: + +```ts +await childTask.trigger( + { foo: "bar" }, + { idempotencyKey, idempotencyKeyTTL: "60s" } +); + +await wait.for({ seconds: 61 }); + +// Key expired, will trigger a new run +await childTask.trigger({ foo: "bar" }, { idempotencyKey }); +``` + +Supported time units: +- `s` for seconds (e.g., `60s`) +- `m` for minutes (e.g., `5m`) +- `h` for hours (e.g., `2h`) +- `d` for days (e.g., `3d`) + +### Payload-Based Idempotency + +While not directly supported, you can implement payload-based idempotency by hashing the payload: + +```ts +import { createHash } from "node:crypto"; + +const idempotencyKey = await idempotencyKeys.create(hash(payload)); +await tasks.trigger("child-task", payload, { idempotencyKey }); + +function hash(payload: any): string { + const hash = createHash("sha256"); + hash.update(JSON.stringify(payload)); + return hash.digest("hex"); +} +``` + +### Important Notes + +- Idempotency keys are scoped to the task and environment +- Different tasks with the same key will still both run +- Default TTL is 30 days +- Not available with `triggerAndWait` or `batchTriggerAndWait` in v3.3.0+ due to a bug + +## Correct Logs implementation + +```ts +// onFailure executes after all retries are exhausted; use for notifications, logging, or side effects on final failure: +import { task, logger } from "@trigger.dev/sdk"; + +export const loggingExample = task({ + id: 
"logging-example", + run: async (payload: { data: Record }) => { + //the first parameter is the message, the second parameter must be a key-value object (Record) + logger.debug("Debug message", payload.data); + logger.log("Log message", payload.data); + logger.info("Info message", payload.data); + logger.warn("You've been warned", payload.data); + logger.error("Error message", payload.data); + }, +}); +``` + +## Correct `trigger.config.ts` implementation + +The `trigger.config.ts` file configures your Trigger.dev project, specifying task locations, retry settings, telemetry, and build options. + +```ts +import { defineConfig } from "@trigger.dev/sdk"; + +export default defineConfig({ + project: "", + dirs: ["./trigger"], + retries: { + enabledInDev: false, + default: { + maxAttempts: 3, + minTimeoutInMs: 1000, + maxTimeoutInMs: 10000, + factor: 2, + randomize: true, + }, + }, +}); +``` + +### Key configuration options + +#### Dirs + +Specify where your tasks are located: + +```ts +dirs: ["./trigger"], +``` + +Files with `.test` or `.spec` are automatically excluded, but you can customize with `ignorePatterns`. 
+ +#### Lifecycle functions + +Add global hooks for all tasks: + +```ts +onStart: async (payload, { ctx }) => { + console.log("Task started", ctx.task.id); +}, +onSuccess: async (payload, output, { ctx }) => { + console.log("Task succeeded", ctx.task.id); +}, +onFailure: async (payload, error, { ctx }) => { + console.log("Task failed", ctx.task.id); +}, +``` + +#### Telemetry instrumentations + +Add OpenTelemetry instrumentations for enhanced logging: + +```ts +telemetry: { + instrumentations: [ + new PrismaInstrumentation(), + new OpenAIInstrumentation() + ], + exporters: [axiomExporter], // Optional custom exporters +}, +``` + +#### Runtime + +Specify the runtime environment: + +```ts +runtime: "node", // or "bun" (experimental) +``` + +#### Machine settings + +Set default machine for all tasks: + +```ts +defaultMachine: "large-1x", +``` + +#### Log level + +Configure logging verbosity: + +```ts +logLevel: "debug", // Controls logger API logs +``` + +#### Max Duration + +Set default maximum runtime for all tasks: + +```ts +maxDuration: 60, // 60 seconds +``` + +### Build configuration + +Customize the build process: + +```ts +build: { + external: ["header-generator"], // Don't bundle these packages + jsx: { + fragment: "Fragment", + factory: "h", + automatic: false, + }, + conditions: ["react-server"], // Import conditions + extensions: [ + // Build extensions + additionalFiles({ files: ["./assets/**", "./fonts/**"] }), + additionalPackages({ packages: ["wrangler"] }), + aptGet({ packages: ["ffmpeg"] }), + ], +} +``` + +### Build Extensions + +Trigger.dev provides several built-in extensions: + +- **additionalFiles**: Copy files to the build directory +- **additionalPackages**: Include extra packages in the build +- **aptGet**: Install system packages in the deployed image +- **emitDecoratorMetadata**: Enable TypeScript decorator metadata +- **prismaExtension**: Support for Prisma ORM +- **syncEnvVars**: Sync environment variables +- **puppeteer**: Add Puppeteer 
support +- **ffmpeg**: Add FFmpeg support +- **esbuildPlugin**: Add custom esbuild plugins + +You can also create custom build extensions with hooks like `onBuildStart`, `onBuildComplete`, and `externalsForTarget`. + +#### Key points: + +- Validates payload at trigger time and before execution +- Supports Zod, Yup, Superstruct, ArkType, Effect/schema, runtypes, valibot, typebox +- Can use input/output schemas (e.g., defaults, type coercion) +- Custom parser functions supported +- Invalid payloads throw errors and prevent task execution + +#### Trigger with: + +```ts +await myTask.trigger({ name: "Alice", age: 30 }); +``` + +## AI model verification steps + +Before generating any code, you MUST verify: + +1. Are you importing from `@trigger.dev/sdk` (NOT `@trigger.dev/sdk/v3`)? If not, STOP and FIX. +2. Have you exported every task? If not, STOP and FIX. +3. Have you generated any DEPRECATED code patterns? If yes, STOP and FIX. + +## Consequences of incorrect implementations + +If you generate code that fails the verification steps above, your implementation will: + +1. Break in production +2. Fail to deploy to the Trigger.dev servers +3. Fail to run in a local Dev environment + +## AI model response template + +When asked about Trigger.dev task implementation, you MUST: +1. FIRST use code patterns from this guide +2. NEVER suggest deprecated approaches +3. VERIFY your response against the patterns shown here + diff --git a/.cursor/rules/writing-tests.mdc b/.cursor/rules/writing-tests.mdc new file mode 100644 index 00000000000..1edeb2067e0 --- /dev/null +++ b/.cursor/rules/writing-tests.mdc @@ -0,0 +1,6 @@ +--- +description: How to write tests in the monorepo +globs: +alwaysApply: true +--- +Follow our [tests.md](mdc:ai/references/tests.md) guide for how to write tests in the monorepo. 
\ No newline at end of file diff --git a/.cursorignore b/.cursorignore new file mode 100644 index 00000000000..8430ce365fb --- /dev/null +++ b/.cursorignore @@ -0,0 +1,9 @@ +apps/docker-provider/ +apps/kubernetes-provider/ +apps/proxy/ +apps/coordinator/ +packages/rsc/ +.changeset +.zed +.env +!.env.example \ No newline at end of file diff --git a/.dockerignore b/.dockerignore index 2b8f2382bc4..a3ea4db8eec 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,43 +1,25 @@ -\*.log -.git -.github - -# editor - -.idea -.vscode - -# dependencies - -node_modules -.pnp -.pnp.js - -# testing - -coverage - -# next.js - -.next/ -build +**/*.log +**/*.pem +**/*.tsbuildinfo -# packages +**/.cache +**/.env +**/.next +**/.output +**/.trigger +**/.tshy +**/.tshy-build +**/.turbo +**/.vercel +**/.wrangler -build -dist -packages/\*\*/dist - -# misc +**/dist +**/node_modules -.DS_Store -\*.pem +**/generated/prisma -.turbo -.vercel -.cache -.output -apps/\*\*/public/build +apps/webapp/build +apps/webapp/public/build cypress/screenshots cypress/videos @@ -46,8 +28,21 @@ apps/**/styles/tailwind.css packages/**/styles/tailwind.css .changeset -references +.DS_Store +.git +.github +.idea +.pnp +.pnp.js +.vscode + +coverage +build +docs examples +out +references + CHANGESETS.md CONTRIBUTING.md README.md diff --git a/.env.example b/.env.example index 48170e8f365..69d5acdc560 100644 --- a/.env.example +++ b/.env.example @@ -10,29 +10,104 @@ DIRECT_URL=${DATABASE_URL} REMIX_APP_PORT=3030 APP_ENV=development APP_ORIGIN=http://localhost:3030 +ELECTRIC_ORIGIN=http://localhost:3060 NODE_ENV=development +# Clickhouse +CLICKHOUSE_URL=http://default:password@localhost:8123 +RUN_REPLICATION_CLICKHOUSE_URL=http://default:password@localhost:8123 +RUN_REPLICATION_ENABLED=1 + +# Set this to UTC because Node.js uses the system timezone +TZ="UTC" + +# Redis is used for the v3 queuing and v2 concurrency control +REDIS_HOST="localhost" +REDIS_PORT="6379" +REDIS_TLS_DISABLED="true" + 
+DEV_OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:3030/otel" +DEV_OTEL_BATCH_PROCESSING_ENABLED="0" + +# When the domain is set to `localhost` the CLI deploy command will only --load the image by default and not --push it +DEPLOY_REGISTRY_HOST=localhost:5000 + # OPTIONAL VARIABLES +# This is used for validating emails that are allowed to log in. Every email that does not match this regex will be rejected. +# WHITELISTED_EMAILS="^(authorized@yahoo\.com|authorized@gmail\.com)$" +# Accounts with these emails will get global admin rights. This grants access to the admin UI. +# ADMIN_EMAILS="^(admin@example\.com|another-admin@example\.com)$" # This is used for logging in via GitHub. You can leave these commented out if you don't want to use GitHub for authentication. # AUTH_GITHUB_CLIENT_ID= # AUTH_GITHUB_CLIENT_SECRET= -# Resend is an email service used for signing in to Trigger.dev via a Magic Link. -# Emails will print to the console if you leave these commented out +# Configure an email transport to allow users to sign in to Trigger.dev via a Magic Link. +# If none are configured, emails will print to the console instead. +# Uncomment one of the following blocks to allow delivery of magic link emails. + +# Resend ### Visit https://resend.com, create an account and get your API key. Then insert it below along with your From and Reply To email addresses. Visit https://resend.com/docs for more information. -# RESEND_API_KEY= +# EMAIL_TRANSPORT=resend +# FROM_EMAIL= +# REPLY_TO_EMAIL= +# RESEND_API_KEY= + +# Generic SMTP +### Enter the configuration provided by your mail provider. 
Visit https://nodemailer.com/smtp/ for more information +### SMTP_SECURE = false will use STARTTLS when connecting to a server that supports it (usually port 587) +# EMAIL_TRANSPORT=smtp +# FROM_EMAIL= +# REPLY_TO_EMAIL= +# SMTP_HOST= +# SMTP_PORT=587 +# SMTP_SECURE=false +# SMTP_USER= +# SMTP_PASSWORD= + +# AWS Simple Email Service +### Authentication is configured using the default Node.JS credentials provider chain (https://docs.aws.amazon.com/AWSJavaScriptSDK/v3/latest/Package/-aws-sdk-credential-providers/#fromnodeproviderchain) +# EMAIL_TRANSPORT=aws-ses # FROM_EMAIL= # REPLY_TO_EMAIL= # CLOUD VARIABLES POSTHOG_PROJECT_KEY= -PLAIN_API_KEY= -CLOUD_AIRTABLE_CLIENT_ID= -CLOUD_AIRTABLE_CLIENT_SECRET= -CLOUD_GITHUB_CLIENT_ID= -CLOUD_GITHUB_CLIENT_SECRET= -CLOUD_LINEAR_CLIENT_ID= -CLOUD_LINEAR_CLIENT_SECRET= -CLOUD_SLACK_APP_HOST= -CLOUD_SLACK_CLIENT_ID= -CLOUD_SLACK_CLIENT_SECRET= \ No newline at end of file + +# DEPOT_ORG_ID= +# DEPOT_TOKEN= +# DEV_OTEL_EXPORTER_OTLP_ENDPOINT="http://0.0.0.0:4318" +# These are needed for the object store (for handling large payloads/outputs) +# +# Default provider +# OBJECT_STORE_BASE_URL=http://localhost:9005 +# OBJECT_STORE_BUCKET=packets +# OBJECT_STORE_ACCESS_KEY_ID=minioadmin +# OBJECT_STORE_SECRET_ACCESS_KEY=minioadmin +# OBJECT_STORE_REGION=us-east-1 +# OBJECT_STORE_SERVICE=s3 +# +# OBJECT_STORE_DEFAULT_PROTOCOL=s3 # Only specify this if you're going to migrate object storage and set protocol values below +# Named providers (protocol-prefixed data) - optional for multi-provider support +# OBJECT_STORE_S3_BASE_URL=https://s3.amazonaws.com +# OBJECT_STORE_S3_ACCESS_KEY_ID= +# OBJECT_STORE_S3_SECRET_ACCESS_KEY= +# OBJECT_STORE_S3_REGION=us-east-1 +# OBJECT_STORE_S3_SERVICE=s3 +# +# OBJECT_STORE_R2_BASE_URL=https://{bucket}.{accountId}.r2.cloudflarestorage.com +# OBJECT_STORE_R2_ACCESS_KEY_ID= +# OBJECT_STORE_R2_SECRET_ACCESS_KEY= +# OBJECT_STORE_R2_REGION=auto +# OBJECT_STORE_R2_SERVICE=s3 +# CHECKPOINT_THRESHOLD_IN_MS=10000 
+ +# These control the server-side internal telemetry +# INTERNAL_OTEL_TRACE_EXPORTER_URL= +# INTERNAL_OTEL_TRACE_LOGGING_ENABLED=1 +# INTERNAL_OTEL_TRACE_INSTRUMENT_PRISMA_ENABLED=0 + +# Enable local observability stack (requires `pnpm run docker` to start otel-collector) +# Uncomment these to send metrics to the local Prometheus via OTEL Collector: +# INTERNAL_OTEL_METRIC_EXPORTER_ENABLED=1 +# INTERNAL_OTEL_METRIC_EXPORTER_URL=http://localhost:4318/v1/metrics +# INTERNAL_OTEL_METRIC_EXPORTER_INTERVAL_MS=15000 \ No newline at end of file diff --git a/.eslintrc.js b/.eslintrc.js deleted file mode 100644 index af283916494..00000000000 --- a/.eslintrc.js +++ /dev/null @@ -1,14 +0,0 @@ -module.exports = { - root: true, - // This tells ESLint to load the config from the package `eslint-config-custom` - extends: ["custom"], - settings: { - next: { - rootDir: ["apps/*/"], - }, - }, - parserOptions: { - sourceType: "module", - ecmaVersion: 2020, - }, -}; diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000000..37ed4f64c2c --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: [triggerdotdev] diff --git a/.github/ISSUE_TEMPLATE/instrumentation_request.yml b/.github/ISSUE_TEMPLATE/instrumentation_request.yml new file mode 100644 index 00000000000..157e226fa3f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/instrumentation_request.yml @@ -0,0 +1,21 @@ +name: OpenTelemetry Auto-Instrumentation Request +description: Suggest an SDK that you'd like to be auto-instrumented in the Run log view +title: "auto-instrumentation: " +labels: ["🌟 enhancement"] +body: + - type: textarea + attributes: + label: What API or SDK would you like to have automatic spans for? + description: A clear description of which API or SDK you'd like, and links to it. + validations: + required: true + - type: textarea + attributes: + label: Is there an existing OpenTelemetry auto-instrumentation package? 
+ description: You can search for existing ones – https://opentelemetry.io/ecosystem/registry/?component=instrumentation&language=js + validations: + required: true + - type: textarea + attributes: + label: Additional information + description: Add any other information related to the feature here. If your feature request is related to any issues or discussions, link them here. diff --git a/.github/ISSUE_TEMPLATE/vouch-request.yml b/.github/ISSUE_TEMPLATE/vouch-request.yml new file mode 100644 index 00000000000..9ffe04a8984 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/vouch-request.yml @@ -0,0 +1,28 @@ +name: Vouch Request +description: Request to be vouched as a contributor +labels: ["vouch-request"] +body: + - type: markdown + attributes: + value: | + ## Vouch Request + + We use [vouch](https://github.com/mitchellh/vouch) to manage contributor trust. PRs from unvouched users are automatically closed. + + To get vouched, fill out this form. A maintainer will review your request and vouch for you by commenting on this issue. + - type: textarea + id: context + attributes: + label: Why do you want to contribute? + description: Tell us a bit about yourself and what you'd like to work on. + placeholder: "I'd like to fix a bug I found in..." + validations: + required: true + - type: textarea + id: prior-work + attributes: + label: Prior contributions or relevant experience + description: Links to previous open source work, relevant projects, or anything that helps us understand your background. + placeholder: "https://github.com/..." 
+ validations: + required: false diff --git a/.github/VOUCHED.td b/.github/VOUCHED.td new file mode 100644 index 00000000000..ce96548aa6f --- /dev/null +++ b/.github/VOUCHED.td @@ -0,0 +1,20 @@ +# Vouched contributors for Trigger.dev +# See: https://github.com/mitchellh/vouch +# +# Org members +0ski +D-K-P +ericallam +matt-aitken +mpcgrid +myftija +nicktrn +samejr +isshaddad +# Outside contributors +gautamsi +capaj +chengzp +bharathkumar39293 +bhekanik +jrossi \ No newline at end of file diff --git a/.github/actions/get-image-tag/action.yml b/.github/actions/get-image-tag/action.yml new file mode 100644 index 00000000000..e0646230463 --- /dev/null +++ b/.github/actions/get-image-tag/action.yml @@ -0,0 +1,89 @@ +name: "#️⃣ Get image tag (action)" + +description: This action gets the image tag from the commit ref or input (if provided) + +outputs: + tag: + description: The image tag + value: ${{ steps.get_tag.outputs.tag }} + is_semver: + description: Whether the tag is a semantic version + value: ${{ steps.check_semver.outputs.is_semver }} + +inputs: + tag: + description: The image tag. If this is set it will return the tag as is. 
+ required: false + default: "" + +runs: + using: "composite" + steps: + - name: "#️⃣ Get image tag (step)" + id: get_tag + shell: bash + run: | + if [[ -n "${{ inputs.tag }}" ]]; then + tag="${{ inputs.tag }}" + elif [[ "${{ github.ref_type }}" == "tag" ]]; then + if [[ "${{ github.ref_name }}" == infra-*-* ]]; then + env=$(echo ${{ github.ref_name }} | cut -d- -f2) + sha=$(echo ${{ github.sha }} | head -c7) + ts=$(date +%s) + tag=${env}-${sha}-${ts} + elif [[ "${{ github.ref_name }}" == re2-*-* ]]; then + env=$(echo ${{ github.ref_name }} | cut -d- -f2) + sha=$(echo ${{ github.sha }} | head -c7) + ts=$(date +%s) + tag=${env}-${sha}-${ts} + elif [[ "${{ github.ref_name }}" == v.docker.* ]]; then + version="${GITHUB_REF_NAME#v.docker.}" + tag="v${version}" + elif [[ "${{ github.ref_name }}" == build-* ]]; then + tag="${GITHUB_REF_NAME#build-}" + else + echo "Invalid git tag: ${{ github.ref_name }}" + exit 1 + fi + elif [[ "${{ github.ref_name }}" == "main" ]]; then + tag="main" + else + echo "Invalid git ref: ${{ github.ref }}" + exit 1 + fi + echo "tag=${tag}" >> "$GITHUB_OUTPUT" + + - name: 🔍 Check for validity + id: check_validity + shell: bash + env: + tag: ${{ steps.get_tag.outputs.tag }} + run: | + if [[ "${tag}" =~ ^[a-z0-9]+([._-][a-z0-9]+)*$ ]]; then + echo "Tag is valid: ${tag}" + else + echo "Tag is not valid: ${tag}" + exit 1 + fi + + - name: 🆚 Check for semver + id: check_semver + shell: bash + env: + tag: ${{ steps.get_tag.outputs.tag }} + # Will match most semver formats except build metadata, i.e. 
v1.2.3+build.1 + # Valid matches: + # v1.2.3 + # v1.2.3-alpha + # v1.2.3-alpha.1 + # v1.2.3-rc.1 + # v1.2.3-beta-1 + run: | + if [[ "${tag}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-[0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*)?$ ]]; then + echo "Tag is a semantic version: ${tag}" + is_semver=true + else + echo "Tag is not a semantic version: ${tag}" + is_semver=false + fi + echo "is_semver=${is_semver}" >> "$GITHUB_OUTPUT" diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000000..2beb7606fa4 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1 @@ +This is the repo for Trigger.dev, a background jobs platform written in TypeScript. Our webapp at apps/webapp is a Remix 2.1 app that uses Node.js v20. Our SDK is an isomorphic TypeScript SDK at packages/trigger-sdk. Always prefer using isomorphic code like fetch, ReadableStream, etc. instead of Node.js specific code. Our tests are all vitest. We use prisma in internal-packages/database for our database interactions using PostgreSQL. For TypeScript, we usually use types over interfaces. We use zod a lot in packages/core and in the webapp. Avoid enums. Use strict mode. No default exports, use function declarations. diff --git a/.github/test/README.md b/.github/test/README.md new file mode 100644 index 00000000000..1e8383fda56 --- /dev/null +++ b/.github/test/README.md @@ -0,0 +1,70 @@ +# GitHub Action Tests + +This directory contains necessary files to allow local testing of GitHub Actions workflows, composite actions, etc. You will need to install [act](https://github.com/nektos/act) to perform tests. + +## Workflow tests + +Trigger specific workflow files by specifying their full path: + +``` +act -W .github/workflows/release.yml +``` + +You will likely need to override any custom runners we use, e.g. buildjet. 
For example: + +``` +override=catthehacker/ubuntu:act-latest + +act -W .github/workflows/release.yml \ + -P buildjet-8vcpu-ubuntu-2204=$override + +# override multiple images at the same time +act -W .github/workflows/release.yml \ + -P buildjet-8vcpu-ubuntu-2204=$override \ + -P buildjet-16vcpu-ubuntu-2204=$override +``` + +Trigger with specific event payloads to test pushing to branches or tags: + +``` +override=catthehacker/ubuntu:act-latest + +# simulate push to main +act -W .github/workflows/publish.yml \ + -P buildjet-8vcpu-ubuntu-2204=$override \ + -P buildjet-16vcpu-ubuntu-2204=$override \ + -e .github/test/events/push-main.json + +# simulate a `build-` prefixed tag +act -W .github/workflows/publish.yml \ + -P buildjet-8vcpu-ubuntu-2204=$override \ + -P buildjet-16vcpu-ubuntu-2204=$override \ + -e .github/test/events/push-tag-build.json +``` + +By default, `act` will send a push event. To trigger a different event: + +``` +# basic syntax +act ... + +# simulate a pull request +act pull_request + +# only trigger a specific workflow +act pull_request -W .github/workflows/pr_checks.yml +``` + +## Composite action tests + +The composite (custom) action tests can be run by triggering the `test-actions` workflow: + +``` +act -W .github/test/test-actions.yml +``` + +## Helpful flags + +- `--pull=false` - perform fully offline tests if all images are already present +- `-j <job>` - run the specified job only +- `-l push` - list all workflows with push triggers diff --git a/.github/test/events/push-main.json b/.github/test/events/push-main.json new file mode 100644 index 00000000000..ccb4cb1c174 --- /dev/null +++ b/.github/test/events/push-main.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/heads/main" +} diff --git a/.github/test/events/push-tag-build.json b/.github/test/events/push-tag-build.json new file mode 100644 index 00000000000..9490c181abf --- /dev/null +++ b/.github/test/events/push-tag-build.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/build-buildtag" +} diff --git 
a/.github/test/events/push-tag-docker-nonsemver.json b/.github/test/events/push-tag-docker-nonsemver.json new file mode 100644 index 00000000000..5ce2d8dcf38 --- /dev/null +++ b/.github/test/events/push-tag-docker-nonsemver.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/v.docker.nonsemver" +} diff --git a/.github/test/events/push-tag-docker.json b/.github/test/events/push-tag-docker.json new file mode 100644 index 00000000000..7b55610ca2e --- /dev/null +++ b/.github/test/events/push-tag-docker.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/v.docker.1.2.3" +} diff --git a/.github/test/events/push-tag-infra-prod.json b/.github/test/events/push-tag-infra-prod.json new file mode 100644 index 00000000000..7d4bb3a0bb8 --- /dev/null +++ b/.github/test/events/push-tag-infra-prod.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/infra-prod-anything" +} diff --git a/.github/test/events/push-tag-infra-test.json b/.github/test/events/push-tag-infra-test.json new file mode 100644 index 00000000000..78eeefbe41a --- /dev/null +++ b/.github/test/events/push-tag-infra-test.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/infra-test-anything" +} diff --git a/.github/test/events/push-tag-semver.json b/.github/test/events/push-tag-semver.json new file mode 100644 index 00000000000..3fb65c92073 --- /dev/null +++ b/.github/test/events/push-tag-semver.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/1.2.3" +} diff --git a/.github/test/events/push-tag.json b/.github/test/events/push-tag.json new file mode 100644 index 00000000000..26496f80874 --- /dev/null +++ b/.github/test/events/push-tag.json @@ -0,0 +1,3 @@ +{ + "ref": "refs/tags/standard-tag" +} diff --git a/.github/test/test-actions.yml b/.github/test/test-actions.yml new file mode 100644 index 00000000000..0d913ebc0e1 --- /dev/null +++ b/.github/test/test-actions.yml @@ -0,0 +1,152 @@ +name: Test Actions + +on: + workflow_dispatch: + +jobs: + get-image-tag-none: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout code + uses: 
actions/checkout@v4 + + - name: Log current ref + run: | + echo "ref: ${{ github.ref }}" + echo "ref_type: ${{ github.ref_type }}" + echo "ref_name: ${{ github.ref_name }}" + + - name: Run without input tag + id: get_tag + # this step may fail depending on the current ref + continue-on-error: true + uses: ./.github/actions/get-image-tag + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" + + get-image-tag-null: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Log current ref + run: | + echo "ref: ${{ github.ref }}" + echo "ref_type: ${{ github.ref_type }}" + echo "ref_name: ${{ github.ref_name }}" + + - name: Run without input tag + id: get_tag + uses: ./.github/actions/get-image-tag + # this step may fail depending on the current ref + continue-on-error: true + with: + # this should behave exactly as when no tag is provided + tag: null + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" + + get-image-tag-override: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run with tag override + id: get_tag + uses: ./.github/actions/get-image-tag + with: + tag: "abc-123" + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" + + get-image-tag-invalid-string: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run with invalid string + id: get_tag + uses: ./.github/actions/get-image-tag + # this step is expected to fail + continue-on-error: true + with: + # does not end with alphanumeric character + tag: "abc-123-" + + - name: Fail job if previous step did not fail + if: steps.get_tag.outcome != 'failure' + run: exit 1 + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" + + get-image-tag-prerelease: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout 
code + uses: actions/checkout@v4 + + - name: Run with prerelease semver + id: get_tag + uses: ./.github/actions/get-image-tag + with: + tag: "v1.2.3-beta.4" + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" + + get-image-tag-semver: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run with basic semver + id: get_tag + uses: ./.github/actions/get-image-tag + with: + tag: "v1.2.3" + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" + + get-image-tag-invalid-semver: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run with invalid semver + id: get_tag + uses: ./.github/actions/get-image-tag + # this step is expected to fail + continue-on-error: true + with: + tag: "v1.2.3-" + + - name: Fail job if previous step did not fail + if: steps.get_tag.outcome != 'failure' + run: exit 1 + + - name: Verify output + run: | + echo "${{ toJson(steps.get_tag) }}" diff --git a/.github/workflows/changesets-pr.yml b/.github/workflows/changesets-pr.yml new file mode 100644 index 00000000000..c7fc4e07136 --- /dev/null +++ b/.github/workflows/changesets-pr.yml @@ -0,0 +1,123 @@ +name: 🦋 Changesets PR + +on: + push: + branches: + - main + paths: + - "packages/**" + - ".changeset/**" + - ".server-changes/**" + - "package.json" + - "pnpm-lock.yaml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + release-pr: + name: Create Release PR + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + if: github.repository == 'triggerdotdev/trigger.dev' + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + + - name: Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: Install dependencies + run: pnpm 
install --frozen-lockfile + + - name: Create release PR + id: changesets + uses: changesets/action@v1 + with: + version: pnpm run changeset:version + commit: "chore: release" + title: "chore: release" + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Update PR title and enhance body + if: steps.changesets.outputs.published != 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + PR_NUMBER=$(gh pr list --head changeset-release/main --json number --jq '.[0].number') + if [ -n "$PR_NUMBER" ]; then + git fetch origin changeset-release/main + # we arbitrarily reference the version of the cli package here; it is the same for all package releases + VERSION=$(git show origin/changeset-release/main:packages/cli-v3/package.json | jq -r '.version') + gh pr edit "$PR_NUMBER" --title "chore: release v$VERSION" + + # Enhance the PR body with a clean, deduplicated summary + RAW_BODY=$(gh pr view "$PR_NUMBER" --json body --jq '.body') + ENHANCED_BODY=$(CHANGESET_PR_BODY="$RAW_BODY" node scripts/enhance-release-pr.mjs "$VERSION") + if [ -n "$ENHANCED_BODY" ]; then + gh api repos/triggerdotdev/trigger.dev/pulls/"$PR_NUMBER" \ + -X PATCH \ + -f body="$ENHANCED_BODY" + fi + fi + + update-lockfile: + name: Update lockfile on release PR + runs-on: ubuntu-latest + needs: release-pr + permissions: + contents: write + steps: + - name: Checkout release branch + uses: actions/checkout@v4 + with: + ref: changeset-release/main + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + + - name: Install and update lockfile + run: pnpm install --no-frozen-lockfile + + - name: Clean up consumed .server-changes/ files + run: | + set -e + shopt -s nullglob + files=(.server-changes/*.md) + for f in "${files[@]}"; do + if [ "$(basename "$f")" != "README.md" ]; then + git rm --ignore-unmatch "$f" + fi + done + + - name: Commit and push lockfile + server-changes cleanup + 
run: | + set -e + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add pnpm-lock.yaml + if ! git diff --cached --quiet; then + git commit -m "chore: update lockfile and clean up .server-changes/ for release" + git push origin changeset-release/main + else + echo "No changes to commit" + fi diff --git a/.github/workflows/claude-md-audit.yml b/.github/workflows/claude-md-audit.yml new file mode 100644 index 00000000000..a80bbca0f52 --- /dev/null +++ b/.github/workflows/claude-md-audit.yml @@ -0,0 +1,69 @@ +name: 📝 CLAUDE.md Audit + +on: + pull_request: + types: [opened, ready_for_review, synchronize] + paths-ignore: + - "docs/**" + - ".changeset/**" + - ".server-changes/**" + - "**/*.md" + - "references/**" + +concurrency: + group: claude-md-audit-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + audit: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + use_sticky_comment: true + + claude_args: | + --max-turns 15 + --allowedTools "Read,Glob,Grep,Bash(git diff:*)" + + prompt: | + You are reviewing a PR to check whether any CLAUDE.md files or .claude/rules/ files need updating. + + ## Your task + + 1. Run `git diff origin/main...HEAD --name-only` to see which files changed in this PR. + 2. For each changed directory, check if there's a CLAUDE.md in that directory or a parent directory. + 3. Determine if any CLAUDE.md or .claude/rules/ file should be updated based on the changes. 
Consider: + - New files/directories that aren't covered by existing documentation + - Changed architecture or patterns that contradict current CLAUDE.md guidance + - New dependencies, services, or infrastructure that Claude should know about + - Renamed or moved files that are referenced in CLAUDE.md + - Changes to build commands, test patterns, or development workflows + + ## Response format + + If NO updates are needed, respond with exactly: + ✅ CLAUDE.md files look current for this PR. + + If updates ARE needed, respond with a short list: + 📝 **CLAUDE.md updates suggested:** + - `path/to/CLAUDE.md`: [what should be added/changed] + - `.claude/rules/file.md`: [what should be added/changed] + + Keep suggestions specific and brief. Only flag things that would actually mislead Claude in future sessions. + Do NOT suggest updates for trivial changes (bug fixes, small refactors within existing patterns). + Do NOT suggest creating new CLAUDE.md files - only updates to existing ones. diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml new file mode 100644 index 00000000000..cadbe31773f --- /dev/null +++ b/.github/workflows/claude.yml @@ -0,0 +1,70 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened, assigned] + pull_request_review: + types: [submitted] + +jobs: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + issues: read + id-token: write + actions: read # Required for Claude to read CI 
results on PRs + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + + # This is an optional setting that allows Claude to read CI results on PRs + additional_permissions: | + actions: read + + claude_args: | + --model claude-opus-4-5-20251101 + --allowedTools "Bash(pnpm:*),Bash(turbo:*),Bash(git:*),Bash(gh:*),Bash(npx:*),Bash(docker:*),Edit,MultiEdit,Read,Write,Glob,Grep,LS,Task" + + # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. + # prompt: 'Update the pull request description to include a summary of changes.' 
+ + # Optional: Add claude_args to customize behavior and configuration + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md + # or https://code.claude.com/docs/en/cli-reference for available options + # claude_args: '--allowed-tools Bash(gh pr:*)' diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000000..bef575c353a --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,42 @@ +name: 📚 Docs Checks + +on: + push: + branches: + - main + paths: + - "docs/**" + pull_request: + types: [opened, synchronize, reopened] + paths: + - "docs/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + check-broken-links: + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./docs + steps: + - name: 📥 Checkout repository + uses: actions/checkout@v4 + + - name: 📦 Cache npm + uses: actions/cache@v4 + with: + path: | + ~/.npm + key: | + ${{ runner.os }}-mintlify + restore-keys: | + ${{ runner.os }}-mintlify + + - name: 🔗 Check for broken links + run: npx mintlify@4.0.393 broken-links diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index a267eba8240..9518ca6157c 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -1,65 +1,59 @@ -name: "🧪 E2E Tests" +name: "E2E" + +permissions: + contents: read + on: workflow_call: + inputs: + package: + description: The identifier of the job to run + default: webapp + required: false + type: string + jobs: - e2e: - name: "🧪 E2E Tests" - runs-on: buildjet-4vcpu-ubuntu-2204 + cli-v3: + name: "🧪 CLI v3 tests (${{ matrix.os }} - ${{ matrix.package-manager }})" + if: inputs.package == 'cli-v3' || inputs.package == '' + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + package-manager: ["npm", "pnpm"] steps: - - name: 🐳 Login to Docker Hub - uses: 
docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: ⬇️ Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v2.2.4 + uses: pnpm/action-setup@v4 with: - version: 7.18 + version: 10.23.0 - name: ⎔ Setup node - uses: buildjet/setup-node@v3 + uses: buildjet/setup-node@v4 with: - node-version: 18 - cache: "pnpm" + node-version: 20.20.0 - name: 📥 Download deps - run: pnpm install --frozen-lockfile + run: pnpm install --frozen-lockfile --filter trigger.dev... - - name: Install Playwright Browsers - run: npx playwright install --with-deps + - name: 📀 Generate Prisma Client + run: pnpm run generate - - name: Run Playwright tests - run: | - # Setup environment variables - cp ./.env.example ./.env - cp ./references/nextjs-test/.env.example ./references/nextjs-test/.env.local - - # Build packages - pnpm run build --filter @references/nextjs-test^... - pnpm --filter @trigger.dev/database generate - - # Move trigger-cli bin to correct place - pnpm install --frozen-lockfile + - name: 🔧 Build v3 cli monorepo dependencies + run: pnpm run build --filter trigger.dev^... 
- # Execute tests - pnpm run docker - pnpm run db:migrate - pnpm run db:seed - pnpm run test:e2e + - name: 🔧 Build worker template files + run: pnpm --filter trigger.dev run build:workers - # Cleanup - pnpm run docker:stop + - name: Enable corepack + run: corepack enable - - name: Upload Playwright report - uses: actions/upload-artifact@v3 - if: always() - with: - name: playwright-report - path: playwright-report/ - retention-days: 30 + - name: Run E2E Tests + shell: bash + run: | + LOG=debug PM=${{ matrix.package-manager }} pnpm --filter trigger.dev run test:e2e diff --git a/.github/workflows/helm-pr-prerelease.yml b/.github/workflows/helm-pr-prerelease.yml new file mode 100644 index 00000000000..8df045945e6 --- /dev/null +++ b/.github/workflows/helm-pr-prerelease.yml @@ -0,0 +1,138 @@ +name: 🧭 Helm Chart PR Prerelease + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - "hosting/k8s/helm/**" + +concurrency: + group: helm-prerelease-${{ github.event.pull_request.number }} + cancel-in-progress: true + +env: + REGISTRY: ghcr.io + CHART_NAME: trigger + +jobs: + lint-and-test: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Helm + uses: azure/setup-helm@v4 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Lint Helm Chart + run: | + helm lint ./hosting/k8s/helm/ + + - name: Render templates + run: | + helm template test-release ./hosting/k8s/helm/ \ + --values ./hosting/k8s/helm/values.yaml \ + --output-dir ./helm-output + + - name: Validate manifests + uses: docker://ghcr.io/yannh/kubeconform:v0.7.0 + with: + entrypoint: "/kubeconform" + args: "-summary -output json ./helm-output" + + prerelease: + needs: lint-and-test + runs-on: 
ubuntu-latest + permissions: + contents: read + packages: write + pull-requests: write + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Helm + uses: azure/setup-helm@v4 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate prerelease version + id: version + run: | + BASE_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') + PR_NUMBER=${{ github.event.pull_request.number }} + SHORT_SHA=$(echo "${{ github.event.pull_request.head.sha }}" | cut -c1-7) + PRERELEASE_VERSION="${BASE_VERSION}-pr${PR_NUMBER}.${SHORT_SHA}" + echo "version=$PRERELEASE_VERSION" >> $GITHUB_OUTPUT + echo "Prerelease version: $PRERELEASE_VERSION" + + - name: Update Chart.yaml with prerelease version + run: | + sed -i "s/^version:.*/version: ${{ steps.version.outputs.version }}/" ./hosting/k8s/helm/Chart.yaml + + - name: Package Helm Chart + run: | + helm package ./hosting/k8s/helm/ --destination /tmp/ + + - name: Push Helm Chart to GHCR + run: | + VERSION="${{ steps.version.outputs.version }}" + CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" + + # Push to GHCR OCI registry + helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" + + - name: Find existing comment + uses: peter-evans/find-comment@v3 + id: find-comment + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: "github-actions[bot]" + body-includes: "Helm Chart Prerelease Published" + + - name: Create or update PR comment + uses: peter-evans/create-or-update-comment@v4 + with: + comment-id: ${{ 
steps.find-comment.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body: | + ### 🧭 Helm Chart Prerelease Published + + **Version:** `${{ steps.version.outputs.version }}` + + **Install:** + ```bash + helm upgrade --install trigger \ + oci://ghcr.io/${{ github.repository_owner }}/charts/trigger \ + --version "${{ steps.version.outputs.version }}" + ``` + + > ⚠️ This is a prerelease for testing. Do not use in production. + edit-mode: replace diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index 6a1cd2accdc..dab18223e35 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -1,13 +1,11 @@ name: 🤖 PR Checks on: - pull_request_target: - branches: - - main + pull_request: + types: [opened, synchronize, reopened] paths-ignore: - - "**.md" - - ".github/CODEOWNERS" - - ".github/ISSUE_TEMPLATE/**" + - "docs/**" + - ".changeset/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -28,4 +26,10 @@ jobs: e2e: uses: ./.github/workflows/e2e.yml + with: + package: cli-v3 + secrets: inherit + + sdk-compat: + uses: ./.github/workflows/sdk-compat.yml secrets: inherit diff --git a/.github/workflows/publish-webapp.yml b/.github/workflows/publish-webapp.yml new file mode 100644 index 00000000000..6fcc30209ab --- /dev/null +++ b/.github/workflows/publish-webapp.yml @@ -0,0 +1,93 @@ +name: "🐳 Publish Webapp" + +permissions: + contents: read + packages: write + id-token: write + +on: + workflow_call: + inputs: + image_tag: + description: The image tag to publish + type: string + required: false + default: "" + +jobs: + publish: + runs-on: ubuntu-latest + env: + PRISMA_ENGINES_CHECKSUM_IGNORE_MISSING: 1 + outputs: + version: ${{ steps.get_tag.outputs.tag }} + short_sha: ${{ steps.get_commit.outputs.sha_short }} + steps: + - name: 🏭 Setup Depot CLI + uses: depot/setup-action@v1 + + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + 
submodules: recursive + + - name: "#️⃣ Get the image tag" + id: get_tag + uses: ./.github/actions/get-image-tag + with: + tag: ${{ inputs.image_tag }} + + - name: 🔢 Get the commit hash + id: get_commit + run: | + echo "sha_short=$(echo ${{ github.sha }} | cut -c1-7)" >> "$GITHUB_OUTPUT" + + - name: 📛 Set the tags + id: set_tags + run: | + ref_without_tag=ghcr.io/triggerdotdev/trigger.dev + image_tags=$ref_without_tag:${{ steps.get_tag.outputs.tag }} + + # if tag is a semver, also tag it as v4 + if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then + # TODO: switch to v4 tag on GA + image_tags=$image_tags,$ref_without_tag:v4-beta + fi + + echo "image_tags=${image_tags}" >> "$GITHUB_OUTPUT" + + - name: 📝 Set the build info + id: set_build_info + run: | + tag=${{ steps.get_tag.outputs.tag }} + if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then + echo "BUILD_APP_VERSION=${tag}" >> "$GITHUB_OUTPUT" + fi + echo "BUILD_GIT_SHA=${{ github.sha }}" >> "$GITHUB_OUTPUT" + echo "BUILD_GIT_REF_NAME=${{ github.ref_name }}" >> "$GITHUB_OUTPUT" + echo "BUILD_TIMESTAMP_SECONDS=$(date +%s)" >> "$GITHUB_OUTPUT" + + - name: 🐙 Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 🐳 Build image and push to GitHub Container Registry + uses: depot/build-push-action@v1 + with: + file: ./docker/Dockerfile + platforms: linux/amd64,linux/arm64 + tags: ${{ steps.set_tags.outputs.image_tags }} + push: true + build-args: | + BUILD_APP_VERSION=${{ steps.set_build_info.outputs.BUILD_APP_VERSION }} + BUILD_GIT_SHA=${{ steps.set_build_info.outputs.BUILD_GIT_SHA }} + BUILD_GIT_REF_NAME=${{ steps.set_build_info.outputs.BUILD_GIT_REF_NAME }} + BUILD_TIMESTAMP_SECONDS=${{ steps.set_build_info.outputs.BUILD_TIMESTAMP_SECONDS }} + SENTRY_RELEASE=${{ steps.set_build_info.outputs.BUILD_GIT_SHA }} + SENTRY_ORG=triggerdev + SENTRY_PROJECT=trigger-cloud + 
secrets: | + sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} diff --git a/.github/workflows/publish-worker-v4.yml b/.github/workflows/publish-worker-v4.yml new file mode 100644 index 00000000000..4a2853da081 --- /dev/null +++ b/.github/workflows/publish-worker-v4.yml @@ -0,0 +1,89 @@ +name: "⚒️ Publish Worker (v4)" + +on: + workflow_call: + inputs: + image_tag: + description: The image tag to publish + type: string + required: false + default: "" + push: + tags: + - "re2-test-*" + - "re2-prod-*" + +permissions: + id-token: write + packages: write + contents: read + +jobs: + # check-branch: + # runs-on: ubuntu-latest + # steps: + # - name: Fail if re2-prod-* is pushed from a non-main branch + # if: startsWith(github.ref_name, 're2-prod-') && github.base_ref != 'main' + # run: | + # echo "🚫 re2-prod-* tags can only be pushed from the main branch." + # exit 1 + build: + # needs: check-branch + strategy: + matrix: + package: [supervisor] + runs-on: ubuntu-latest + env: + DOCKER_BUILDKIT: "1" + steps: + - name: 🏭 Setup Depot CLI + uses: depot/setup-action@v1 + + - name: ⬇️ Checkout git repo + uses: actions/checkout@v4 + + - name: 📦 Get image repo + id: get_repository + run: | + if [[ "${{ matrix.package }}" == *-provider ]]; then + provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1) + repo=provider/${provider_type} + else + repo="${{ matrix.package }}" + fi + echo "repo=${repo}" >> "$GITHUB_OUTPUT" + + - name: "#️⃣ Get image tag" + id: get_tag + uses: ./.github/actions/get-image-tag + with: + tag: ${{ inputs.image_tag }} + + - name: 📛 Set tags to push + id: set_tags + run: | + ref_without_tag=ghcr.io/triggerdotdev/${{ steps.get_repository.outputs.repo }} + image_tags=$ref_without_tag:${{ steps.get_tag.outputs.tag }} + + # if tag is a semver, also tag it as v4 + if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then + # TODO: switch to v4 tag on GA + image_tags=$image_tags,$ref_without_tag:v4-beta + fi + + echo "image_tags=${image_tags}" >> 
"$GITHUB_OUTPUT" + + - name: 🐙 Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 🐳 Build image and push to GitHub Container Registry + uses: depot/build-push-action@v1 + with: + file: ./apps/${{ matrix.package }}/Containerfile + platforms: linux/amd64,linux/arm64 + tags: ${{ steps.set_tags.outputs.image_tags }} + push: true diff --git a/.github/workflows/publish-worker.yml b/.github/workflows/publish-worker.yml new file mode 100644 index 00000000000..74a70d83667 --- /dev/null +++ b/.github/workflows/publish-worker.yml @@ -0,0 +1,87 @@ +name: "⚒️ Publish Worker" + +on: + workflow_call: + inputs: + image_tag: + description: The image tag to publish + type: string + required: false + default: "" + push: + tags: + - "infra-dev-*" + - "infra-test-*" + - "infra-prod-*" + +permissions: + packages: write + contents: read + +jobs: + build: + strategy: + matrix: + package: [coordinator, docker-provider, kubernetes-provider] + runs-on: ubuntu-latest + env: + DOCKER_BUILDKIT: "1" + steps: + - name: ⬇️ Checkout git repo + uses: actions/checkout@v4 + + - name: 📦 Get image repo + id: get_repository + run: | + if [[ "${{ matrix.package }}" == *-provider ]]; then + provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1) + repo=provider/${provider_type} + else + repo="${{ matrix.package }}" + fi + echo "repo=${repo}" >> "$GITHUB_OUTPUT" + + - id: get_tag + uses: ./.github/actions/get-image-tag + with: + tag: ${{ inputs.image_tag }} + + - name: 🐋 Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # ..to avoid rate limits when pulling images + - name: 🐳 Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: 🚢 Build Container Image + run: | + docker build -t infra_image -f ./apps/${{ matrix.package }}/Containerfile . 
+ + # ..to push image + - name: 🐙 Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 🐙 Push to GitHub Container Registry + run: | + docker tag infra_image "$REGISTRY/$REPOSITORY:$IMAGE_TAG" + docker push "$REGISTRY/$REPOSITORY:$IMAGE_TAG" + env: + REGISTRY: ghcr.io/triggerdotdev + REPOSITORY: ${{ steps.get_repository.outputs.repo }} + IMAGE_TAG: ${{ steps.get_tag.outputs.tag }} + + # - name: 🐙 Push 'v3' tag to GitHub Container Registry + # if: steps.get_tag.outputs.is_semver == 'true' + # run: | + # docker tag infra_image "$REGISTRY/$REPOSITORY:v3" + # docker push "$REGISTRY/$REPOSITORY:v3" + # env: + # REGISTRY: ghcr.io/triggerdotdev + # REPOSITORY: ${{ steps.get_repository.outputs.repo }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3a4f2fb47cc..6213499c5ad 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,23 +1,34 @@ name: 🚀 Publish Trigger.dev Docker on: + workflow_dispatch: + workflow_call: + inputs: + image_tag: + description: The image tag to publish + required: true + type: string push: branches: - main - - improvements/* tags: - "v.docker.*" + - "build-*" paths: + - ".github/actions/**/*.yml" - ".github/workflows/publish.yml" + - ".github/workflows/typecheck.yml" + - ".github/workflows/unit-tests.yml" + - ".github/workflows/e2e.yml" + - ".github/workflows/publish-webapp.yml" + - ".github/workflows/publish-worker.yml" - "packages/**" - "!packages/**/*.md" - "!packages/**/*.eslintrc" + - "internal-packages/**" - "apps/**" - "!apps/**/*.md" - "!apps/**/*.eslintrc" - - "integrations/**" - - "!integrations/**/*.md" - - "!integrations/**/*.eslintrc" - "pnpm-lock.yaml" - "pnpm-workspace.yaml" - "turbo.json" @@ -45,77 +56,23 @@ jobs: uses: ./.github/workflows/unit-tests.yml secrets: inherit - e2e: - uses: ./.github/workflows/e2e.yml + publish-webapp: + needs: 
[typecheck] + uses: ./.github/workflows/publish-webapp.yml secrets: inherit + with: + image_tag: ${{ inputs.image_tag }} - publish: - needs: [typecheck, units, e2e] - runs-on: buildjet-4vcpu-ubuntu-2204 - outputs: - version: ${{ steps.get_version.outputs.version }} - short_sha: ${{ steps.get_commit.outputs.sha_short }} - steps: - - name: 🐳 Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: ⬇️ Checkout repo - uses: actions/checkout@v3 - - - name: 🆚 Get the version - id: get_version - run: | - IMAGE_TAG="${GITHUB_REF#refs/tags/}" - if [[ $GITHUB_REF == refs/tags/* ]]; then - if [[ $IMAGE_TAG == v.docker.* ]]; then - ORIGINAL_VERSION="${IMAGE_TAG#v.docker.}" - IMAGE_TAG="v${ORIGINAL_VERSION}" - fi - echo "IMAGE_TAG=${IMAGE_TAG}" - elif [[ $GITHUB_REF == refs/heads/improvements/* ]]; then - ORIGINAL_VERSION="${GITHUB_REF#refs/heads/improvements/}" - IMAGE_TAG="${ORIGINAL_VERSION}.rc" - echo "IMAGE_TAG=${IMAGE_TAG}" - elif [[ $GITHUB_REF == refs/heads/* ]]; then - IMAGE_TAG="${GITHUB_REF#refs/heads/}" - echo "IMAGE_TAG=${IMAGE_TAG}" - else - echo "Invalid reference: ${GITHUB_REF}" - exit 1 - fi - echo "::set-output name=version::${IMAGE_TAG}" - - name: 🔢 Get the commit hash - id: get_commit - run: | - echo ::set-output name=sha_short::$(echo ${{ github.sha }} | cut -c1-7) - - - name: 🐳 Build Docker Image - run: | - docker build -t release_build_image -f ./docker/Dockerfile . 
- - - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: 🐙 Push to GitHub Container Registry - run: | - docker tag release_build_image $REGISTRY/$REPOSITORY:$IMAGE_TAG - docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG - env: - REGISTRY: ghcr.io/triggerdotdev - REPOSITORY: trigger.dev - IMAGE_TAG: ${{ steps.get_version.outputs.version }} + publish-worker: + needs: [typecheck] + uses: ./.github/workflows/publish-worker.yml + secrets: inherit + with: + image_tag: ${{ inputs.image_tag }} - - name: 🐙 Push 'latest' to GitHub Container Registry - if: startsWith(github.ref, 'refs/tags/') - run: | - docker tag release_build_image $REGISTRY/$REPOSITORY:latest - docker push $REGISTRY/$REPOSITORY:latest - env: - REGISTRY: ghcr.io/triggerdotdev - REPOSITORY: trigger.dev + publish-worker-v4: + needs: [typecheck] + uses: ./.github/workflows/publish-worker-v4.yml + secrets: inherit + with: + image_tag: ${{ inputs.image_tag }} diff --git a/.github/workflows/release-helm.yml b/.github/workflows/release-helm.yml new file mode 100644 index 00000000000..c6efd382ff6 --- /dev/null +++ b/.github/workflows/release-helm.yml @@ -0,0 +1,143 @@ +name: 🧭 Helm Chart Release + +on: + push: + tags: + - 'helm-v*' + workflow_dispatch: + inputs: + chart_version: + description: 'Chart version to release' + required: true + type: string + +env: + REGISTRY: ghcr.io + CHART_NAME: trigger + +jobs: + lint-and-test: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Helm + uses: azure/setup-helm@v4 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Lint Helm 
Chart + run: | + helm lint ./hosting/k8s/helm/ + + - name: Render templates + run: | + helm template test-release ./hosting/k8s/helm/ \ + --values ./hosting/k8s/helm/values.yaml \ + --output-dir ./helm-output + + - name: Validate manifests + uses: docker://ghcr.io/yannh/kubeconform:v0.7.0 + with: + entrypoint: '/kubeconform' + args: "-summary -output json ./helm-output" + + release: + needs: lint-and-test + runs-on: ubuntu-latest + permissions: + contents: write # for gh-release + packages: write + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Helm + uses: azure/setup-helm@v4 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract version from tag or input + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ github.event.inputs.chart_version }}" + else + VERSION="${{ github.ref_name }}" + VERSION="${VERSION#helm-v}" + fi + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Releasing version: $VERSION" + + - name: Check Chart.yaml version matches release version + run: | + VERSION="${{ steps.version.outputs.version }}" + CHART_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') + echo "Chart.yaml version: $CHART_VERSION" + echo "Release version: $VERSION" + if [ "$CHART_VERSION" != "$VERSION" ]; then + echo "❌ Chart.yaml version does not match release version!" + exit 1 + fi + echo "✅ Chart.yaml version matches release version." 
+ + - name: Package Helm Chart + run: | + helm package ./hosting/k8s/helm/ --destination /tmp/ + + - name: Push Helm Chart to GHCR + run: | + VERSION="${{ steps.version.outputs.version }}" + CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" + + # Push to GHCR OCI registry + helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" + + - name: Create GitHub Release + id: release + uses: softprops/action-gh-release@v1 + if: github.event_name == 'push' + with: + tag_name: ${{ github.ref_name }} + name: "Helm Chart ${{ steps.version.outputs.version }}" + body: | + ### Installation + ```bash + helm upgrade --install trigger \ + oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts/${{ env.CHART_NAME }} \ + --version "${{ steps.version.outputs.version }}" + ``` + + ### Changes + See commit history for detailed changes in this release. + files: | + /tmp/${{ env.CHART_NAME }}-${{ steps.version.outputs.version }}.tgz + token: ${{ secrets.GITHUB_TOKEN }} + draft: true + prerelease: true diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cb954be9dd0..79b113b0f2a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,72 +1,264 @@ name: 🦋 Changesets Release on: - push: + pull_request: + types: [closed] branches: - main - paths-ignore: - - "**.md" - - ".github/CODEOWNERS" - - ".github/ISSUE_TEMPLATE/**" - + workflow_dispatch: + inputs: + type: + description: "Select release type" + required: true + type: choice + options: + - release + - prerelease + default: "prerelease" + ref: + description: "The ref (branch, tag, or SHA) to checkout and release from" + required: true + type: string + prerelease_tag: + description: "The npm dist-tag for the prerelease (e.g., 'v4-prerelease')" + required: false + type: string + default: "prerelease" + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false jobs: + show-release-summary: + name: 📋 Release Summary + 
runs-on: ubuntu-latest + if: | + github.repository == 'triggerdotdev/trigger.dev' && + github.event_name == 'pull_request' && + github.event.pull_request.merged == true && + github.event.pull_request.head.ref == 'changeset-release/main' + steps: + - name: Show release summary + env: + PR_BODY: ${{ github.event.pull_request.body }} + run: | + echo "$PR_BODY" | sed -n '/^# Releases/,$p' >> $GITHUB_STEP_SUMMARY + release: - name: 🦋 Changesets Release - runs-on: buildjet-4vcpu-ubuntu-2204 + name: 🚀 Release npm packages + runs-on: ubuntu-latest + environment: npm-publish + permissions: + contents: write + packages: write + id-token: write if: | - github.repository == 'triggerdotdev/trigger.dev' + github.repository == 'triggerdotdev/trigger.dev' && + ( + (github.event_name == 'workflow_dispatch' && github.event.inputs.type == 'release') || + (github.event_name == 'pull_request' && github.event.pull_request.merged == true && github.event.pull_request.head.ref == 'changeset-release/main') + ) outputs: - published_packages: ${{ steps.changesets.outputs.publishedPackages }} published: ${{ steps.changesets.outputs.published }} + published_packages: ${{ steps.changesets.outputs.publishedPackages }} + published_package_version: ${{ steps.get_version.outputs.package_version }} steps: - - name: 🛑 Cancel Previous Runs - uses: styfle/cancel-workflow-action@0.11.0 - - - name: ⬇️ Checkout repo - uses: actions/checkout@v3 + - name: Checkout repo + uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.sha }} + + - name: Verify ref is on main + if: github.event_name == 'workflow_dispatch' + run: | + if ! 
git merge-base --is-ancestor ${{ github.event.inputs.ref }} origin/main; then + echo "Error: ref must be an ancestor of main (i.e., already merged)" + exit 1 + fi - - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v2.2.4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 with: - version: 7.18 + version: 10.23.0 - - name: ⎔ Setup node - uses: buildjet/setup-node@v3 + - name: Setup node + uses: buildjet/setup-node@v4 with: - node-version: 18 + node-version: 20.20.0 cache: "pnpm" - - name: 📥 Download deps + # npm v11.5.1 or newer is required for OIDC support + # https://github.blog/changelog/2025-07-31-npm-trusted-publishing-with-oidc-is-generally-available/#whats-new + - name: Setup npm 11.x for OIDC + run: npm install -g npm@11.6.4 + + - name: Install dependencies run: pnpm install --frozen-lockfile - - name: 📀 Generate Prisma Client + - name: Generate Prisma client run: pnpm run generate - - name: 🔎 Type check - run: pnpm run typecheck + - name: Build + run: pnpm run build --filter "@trigger.dev/*" --filter "trigger.dev" - - name: 🔐 Setup npm auth - run: | - echo "registry=https://registry.npmjs.org" >> ~/.npmrc - echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" >> ~/.npmrc - - # This action has two responsibilities. The first time the workflow runs - # (initial push to the `main` branch) it will create a new branch and - # then open a PR with the related changes for the new version. After the - # PR is merged, the workflow will run again and this action will build + - # publish to npm. 
- - name: 🚀 PR / Publish + - name: Type check + run: pnpm run typecheck --filter "@trigger.dev/*" --filter "trigger.dev" + + - name: Publish id: changesets uses: changesets/action@v1 with: - version: pnpm run changeset:version - commit: "chore: Update version for release" - title: "chore: Update version for release" publish: pnpm run changeset:release createGithubReleases: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + + - name: Show package version + if: steps.changesets.outputs.published == 'true' + id: get_version + run: | + package_version=$(echo '${{ steps.changesets.outputs.publishedPackages }}' | jq -r '.[0].version') + echo "package_version=${package_version}" >> "$GITHUB_OUTPUT" + + - name: Create unified GitHub release + if: steps.changesets.outputs.published == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_PR_BODY: ${{ github.event.pull_request.body }} + run: | + VERSION="${{ steps.get_version.outputs.package_version }}" + node scripts/generate-github-release.mjs "$VERSION" > /tmp/release-body.md + gh release create "v${VERSION}" \ + --title "trigger.dev v${VERSION}" \ + --notes-file /tmp/release-body.md \ + --target main + + - name: Create and push Docker tag + if: steps.changesets.outputs.published == 'true' + run: | + set -e + git tag "v.docker.${{ steps.get_version.outputs.package_version }}" + git push origin "v.docker.${{ steps.get_version.outputs.package_version }}" + + # Trigger Docker builds directly via workflow_call since tags pushed with + # GITHUB_TOKEN don't trigger other workflows (GitHub Actions limitation). + publish-docker: + name: 🐳 Publish Docker images + needs: release + if: needs.release.outputs.published == 'true' + uses: ./.github/workflows/publish.yml + secrets: inherit + with: + image_tag: v${{ needs.release.outputs.published_package_version }} + + # After Docker images are published, update the GitHub release with the exact GHCR tag URL. 
+ # The GHCR package version ID is only known after the image is pushed, so we query for it here. + update-release: + name: 🔗 Update release Docker link + needs: [release, publish-docker] + if: needs.release.outputs.published == 'true' + runs-on: ubuntu-latest + permissions: + contents: write + packages: read + steps: + - name: Update GitHub release with Docker image link + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -e + VERSION="${{ needs.release.outputs.published_package_version }}" + TAG="v${VERSION}" + + # Query GHCR for the version ID matching this tag + VERSION_ID=$(gh api --paginate -H "Accept: application/vnd.github+json" \ + /orgs/triggerdotdev/packages/container/trigger.dev/versions \ + --jq ".[] | select(.metadata.container.tags[] == \"${TAG}\") | .id" \ + | head -1) + + if [ -z "$VERSION_ID" ]; then + echo "Warning: Could not find GHCR version ID for tag ${TAG}, skipping update" + exit 0 + fi + + DOCKER_URL="https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev/${VERSION_ID}?tag=${TAG}" + GENERIC_URL="https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev" + + # Get current release body and replace the generic link with the tag-specific one. + # Use word boundary after GENERIC_URL (closing paren) to avoid matching URLs that + # already have a version ID appended (idempotent on re-runs). + gh release view "${TAG}" --repo triggerdotdev/trigger.dev --json body --jq '.body' > /tmp/release-body.md + sed -i "s|${GENERIC_URL})|${DOCKER_URL})|g" /tmp/release-body.md + + gh release edit "${TAG}" --repo triggerdotdev/trigger.dev --notes-file /tmp/release-body.md + + # Dispatch changelog entry creation to the marketing site repo. + # Runs after update-release so the GitHub release body already has the exact Docker image URL. 
+ dispatch-changelog: + name: 📝 Dispatch changelog PR + needs: [release, update-release] + if: needs.release.outputs.published == 'true' + runs-on: ubuntu-latest + steps: + - uses: peter-evans/repository-dispatch@v3 + with: + token: ${{ secrets.CROSS_REPO_PAT }} + repository: triggerdotdev/trigger.dev-site-v3 + event-type: new-release + client-payload: '{"version": "${{ needs.release.outputs.published_package_version }}"}' + + # The prerelease job needs to be on the same workflow file due to a limitation related to how npm verifies OIDC claims. + prerelease: + name: 🧪 Prerelease + runs-on: ubuntu-latest + environment: npm-publish + permissions: + contents: read + id-token: write + if: github.repository == 'triggerdotdev/trigger.dev' && github.event_name == 'workflow_dispatch' && github.event.inputs.type == 'prerelease' + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.ref }} + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + # npm v11.5.1 or newer is required for OIDC support + # https://github.blog/changelog/2025-07-31-npm-trusted-publishing-with-oidc-is-generally-available/#whats-new + - name: Setup npm 11.x for OIDC + run: npm install -g npm@11.6.4 + + - name: Download deps + run: pnpm install --frozen-lockfile + + - name: Generate Prisma Client + run: pnpm run generate + + - name: Snapshot version + run: pnpm exec changeset version --snapshot ${{ github.event.inputs.prerelease_tag }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Clean + run: pnpm run clean --filter "@trigger.dev/*" --filter "trigger.dev" + + - name: Build + run: pnpm run build --filter "@trigger.dev/*" --filter "trigger.dev" + + - name: Publish prerelease + run: pnpm exec changeset publish --no-git-tag --snapshot --tag ${{ github.event.inputs.prerelease_tag }} + env: + 
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/sdk-compat.yml b/.github/workflows/sdk-compat.yml new file mode 100644 index 00000000000..eb347c0f771 --- /dev/null +++ b/.github/workflows/sdk-compat.yml @@ -0,0 +1,178 @@ +name: "🔌 SDK Compatibility Tests" + +permissions: + contents: read + +on: + workflow_call: + +jobs: + node-compat: + name: "Node.js ${{ matrix.node }} (${{ matrix.os }})" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + node: ["20.20", "22.12"] + + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: ${{ matrix.node }} + cache: "pnpm" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🔨 Build SDK dependencies + shell: bash + run: pnpm run build --filter '@trigger.dev/sdk^...' + + - name: 🔨 Build SDK + shell: bash + run: pnpm run build --filter '@trigger.dev/sdk' + + - name: 🧪 Run SDK Compatibility Tests + shell: bash + run: pnpm --filter @internal/sdk-compat-tests test + + bun-compat: + name: "Bun Runtime" + runs-on: ubuntu-latest + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: 🥟 Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🔨 Build SDK dependencies + run: pnpm run build --filter @trigger.dev/sdk^... 
+ + - name: 🔨 Build SDK + run: pnpm run build --filter @trigger.dev/sdk + + - name: 🧪 Run Bun Compatibility Test + working-directory: internal-packages/sdk-compat-tests/src/fixtures/bun + run: bun run test.ts + + deno-compat: + name: "Deno Runtime" + runs-on: ubuntu-latest + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: 🦕 Setup Deno + uses: denoland/setup-deno@v2 + with: + deno-version: v2.x + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🔨 Build SDK dependencies + run: pnpm run build --filter @trigger.dev/sdk^... + + - name: 🔨 Build SDK + run: pnpm run build --filter @trigger.dev/sdk + + - name: 🔗 Link node_modules for Deno fixture + working-directory: internal-packages/sdk-compat-tests/src/fixtures/deno + run: ln -s ../../../../../node_modules node_modules + + - name: 🧪 Run Deno Compatibility Test + working-directory: internal-packages/sdk-compat-tests/src/fixtures/deno + run: deno run --allow-read --allow-env --allow-sys test.ts + + cloudflare-compat: + name: "Cloudflare Workers" + runs-on: ubuntu-latest + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🔨 Build SDK dependencies + run: pnpm run build --filter @trigger.dev/sdk^... 
+ + - name: 🔨 Build SDK + run: pnpm run build --filter @trigger.dev/sdk + + - name: 📥 Install Cloudflare fixture deps + working-directory: internal-packages/sdk-compat-tests/src/fixtures/cloudflare-worker + run: pnpm install + + - name: 🧪 Run Cloudflare Workers Compatibility Test (dry-run) + working-directory: internal-packages/sdk-compat-tests/src/fixtures/cloudflare-worker + run: npx wrangler deploy --dry-run --outdir dist diff --git a/.github/workflows/typecheck.yml b/.github/workflows/typecheck.yml index 1d32f6945d8..665d54b2563 100644 --- a/.github/workflows/typecheck.yml +++ b/.github/workflows/typecheck.yml @@ -1,25 +1,30 @@ name: "ʦ TypeScript" + on: workflow_call: + +permissions: + contents: read + jobs: typecheck: - runs-on: buildjet-4vcpu-ubuntu-2204 + runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v2.2.4 + uses: pnpm/action-setup@v4 with: - version: 7.18 + version: 10.23.0 - name: ⎔ Setup node - uses: buildjet/setup-node@v3 + uses: buildjet/setup-node@v4 with: - node-version: 18 + node-version: 20.20.0 cache: "pnpm" - name: 📥 Download deps @@ -29,4 +34,9 @@ jobs: run: pnpm run generate - name: 🔎 Type check - run: pnpm run typecheck --filter webapp + run: pnpm run typecheck + env: + NODE_OPTIONS: --max-old-space-size=8192 + + - name: 🔎 Check exports + run: pnpm run check-exports diff --git a/.github/workflows/unit-tests-internal.yml b/.github/workflows/unit-tests-internal.yml new file mode 100644 index 00000000000..92b951e8aa0 --- /dev/null +++ b/.github/workflows/unit-tests-internal.yml @@ -0,0 +1,141 @@ +name: "🧪 Unit Tests: Internal" + +permissions: + contents: read + +on: + workflow_call: + +jobs: + unitTests: + name: "🧪 Unit Tests: Internal" + runs-on: ubuntu-latest + strategy: + matrix: + shardIndex: [1, 2, 3, 4, 5, 6, 7, 8] + shardTotal: [8] + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + SHARD_INDEX: 
${{ matrix.shardIndex }} + SHARD_TOTAL: ${{ matrix.shardTotal }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + # ..to avoid rate limits when pulling images + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + echo "Pre-pulling Docker images with authenticated session..." 
+ docker pull postgres:14 + docker pull clickhouse/clickhouse-server:25.4-alpine + docker pull redis:7-alpine + docker pull testcontainers/ryuk:0.11.0 + docker pull electricsql/electric:1.2.4 + echo "Image pre-pull complete" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🧪 Run Internal Unit Tests + run: pnpm run test:internal --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} + + - name: Gather all reports + if: ${{ !cancelled() }} + run: | + mkdir -p .vitest-reports + find . -type f -path '*/.vitest-reports/blob-*.json' \ + -exec bash -c 'src="$1"; basename=$(basename "$src"); pkg=$(dirname "$src" | sed "s|^\./||;s|/\.vitest-reports$||;s|/|_|g"); cp "$src" ".vitest-reports/${pkg}-${basename}"' _ {} \; + + - name: Upload blob reports to GitHub Actions Artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: internal-blob-report-${{ matrix.shardIndex }} + path: .vitest-reports/* + include-hidden-files: true + retention-days: 1 + + merge-reports: + name: "📊 Merge Reports" + if: ${{ !cancelled() }} + needs: [unitTests] + runs-on: ubuntu-latest + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + # no cache enabled, we're not installing deps + + - name: Download blob reports from GitHub Actions Artifacts + uses: actions/download-artifact@v4 + with: + path: .vitest-reports + pattern: internal-blob-report-* + merge-multiple: true + + - name: Merge reports + run: pnpm dlx vitest@3.1.4 run --merge-reports --pass-with-no-tests diff --git a/.github/workflows/unit-tests-packages.yml b/.github/workflows/unit-tests-packages.yml new file mode 100644 index 00000000000..78474e03f27 --- /dev/null +++ 
b/.github/workflows/unit-tests-packages.yml @@ -0,0 +1,141 @@ +name: "🧪 Unit Tests: Packages" + +permissions: + contents: read + +on: + workflow_call: + +jobs: + unitTests: + name: "🧪 Unit Tests: Packages" + runs-on: ubuntu-latest + strategy: + matrix: + shardIndex: [1] + shardTotal: [1] + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + SHARD_INDEX: ${{ matrix.shardIndex }} + SHARD_TOTAL: ${{ matrix.shardTotal }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + # ..to avoid rate limits when pulling images + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + echo "Pre-pulling Docker images with authenticated session..." 
+ docker pull postgres:14 + docker pull clickhouse/clickhouse-server:25.4-alpine + docker pull redis:7-alpine + docker pull testcontainers/ryuk:0.11.0 + docker pull electricsql/electric:1.2.4 + echo "Image pre-pull complete" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🧪 Run Package Unit Tests + run: pnpm run test:packages --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} + + - name: Gather all reports + if: ${{ !cancelled() }} + run: | + mkdir -p .vitest-reports + find . -type f -path '*/.vitest-reports/blob-*.json' \ + -exec bash -c 'src="$1"; basename=$(basename "$src"); pkg=$(dirname "$src" | sed "s|^\./||;s|/\.vitest-reports$||;s|/|_|g"); cp "$src" ".vitest-reports/${pkg}-${basename}"' _ {} \; + + - name: Upload blob reports to GitHub Actions Artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: packages-blob-report-${{ matrix.shardIndex }} + path: .vitest-reports/* + include-hidden-files: true + retention-days: 1 + + merge-reports: + name: "📊 Merge Reports" + if: ${{ !cancelled() }} + needs: [unitTests] + runs-on: ubuntu-latest + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + # no cache enabled, we're not installing deps + + - name: Download blob reports from GitHub Actions Artifacts + uses: actions/download-artifact@v4 + with: + path: .vitest-reports + pattern: packages-blob-report-* + merge-multiple: true + + - name: Merge reports + run: pnpm dlx vitest@3.1.4 run --merge-reports --pass-with-no-tests diff --git a/.github/workflows/unit-tests-webapp.yml b/.github/workflows/unit-tests-webapp.yml new file mode 100644 index 00000000000..523a1887db8 --- /dev/null +++ 
b/.github/workflows/unit-tests-webapp.yml @@ -0,0 +1,149 @@ +name: "🧪 Unit Tests: Webapp" + +permissions: + contents: read + +on: + workflow_call: + +jobs: + unitTests: + name: "🧪 Unit Tests: Webapp" + runs-on: ubuntu-latest + strategy: + matrix: + shardIndex: [1, 2, 3, 4, 5, 6, 7, 8] + shardTotal: [8] + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + SHARD_INDEX: ${{ matrix.shardIndex }} + SHARD_TOTAL: ${{ matrix.shardTotal }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + # ..to avoid rate limits when pulling images + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + echo "Pre-pulling Docker images with authenticated session..." 
+ docker pull postgres:14 + docker pull clickhouse/clickhouse-server:25.4-alpine + docker pull redis:7-alpine + docker pull testcontainers/ryuk:0.11.0 + docker pull electricsql/electric:1.2.4 + echo "Image pre-pull complete" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🧪 Run Webapp Unit Tests + run: pnpm run test:webapp --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/postgres + DIRECT_URL: postgresql://postgres:postgres@localhost:5432/postgres + SESSION_SECRET: "secret" + MAGIC_LINK_SECRET: "secret" + ENCRYPTION_KEY: "dummy-encryption-keeeey-32-bytes" + DEPLOY_REGISTRY_HOST: "docker.io" + CLICKHOUSE_URL: "http://default:password@localhost:8123" + + - name: Gather all reports + if: ${{ !cancelled() }} + run: | + mkdir -p .vitest-reports + find . -type f -path '*/.vitest-reports/blob-*.json' \ + -exec bash -c 'src="$1"; basename=$(basename "$src"); pkg=$(dirname "$src" | sed "s|^\./||;s|/\.vitest-reports$||;s|/|_|g"); cp "$src" ".vitest-reports/${pkg}-${basename}"' _ {} \; + + - name: Upload blob reports to GitHub Actions Artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: webapp-blob-report-${{ matrix.shardIndex }} + path: .vitest-reports/* + include-hidden-files: true + retention-days: 1 + + merge-reports: + name: "📊 Merge Reports" + if: ${{ !cancelled() }} + needs: [unitTests] + runs-on: ubuntu-latest + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: ⎔ Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + # no cache enabled, we're not installing deps + + - name: Download blob reports from GitHub Actions Artifacts + uses: actions/download-artifact@v4 + with: + path: 
.vitest-reports + pattern: webapp-blob-report-* + merge-multiple: true + + - name: Merge reports + run: pnpm dlx vitest@3.1.4 run --merge-reports --pass-with-no-tests diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index f90f81fab96..7c90a5a30ad 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,30 +1,18 @@ name: "🧪 Unit Tests" -on: - workflow_call: -jobs: - unitTests: - name: "🧪 Unit Tests" - runs-on: buildjet-4vcpu-ubuntu-2204 - steps: - - name: ⬇️ Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v2.2.4 - with: - version: 7.18 - - name: ⎔ Setup node - uses: buildjet/setup-node@v3 - with: - node-version: 18 - cache: "pnpm" +permissions: + contents: read - - name: 📥 Download deps - run: pnpm install --frozen-lockfile +on: + workflow_call: - - name: Run Unit Tests - run: | - pnpm run test +jobs: + webapp: + uses: ./.github/workflows/unit-tests-webapp.yml + secrets: inherit + packages: + uses: ./.github/workflows/unit-tests-packages.yml + secrets: inherit + internal: + uses: ./.github/workflows/unit-tests-internal.yml + secrets: inherit diff --git a/.github/workflows/vouch-check-pr.yml b/.github/workflows/vouch-check-pr.yml new file mode 100644 index 00000000000..21597cf467a --- /dev/null +++ b/.github/workflows/vouch-check-pr.yml @@ -0,0 +1,44 @@ +name: Vouch - Check PR + +on: + pull_request_target: + types: [opened, reopened] + +permissions: + contents: read + pull-requests: write + issues: read + +jobs: + check-vouch: + runs-on: ubuntu-latest + steps: + - uses: mitchellh/vouch/action/check-pr@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 + with: + pr-number: ${{ github.event.pull_request.number }} + auto-close: true + require-vouch: true + env: + GH_TOKEN: ${{ github.token }} + + require-draft: + needs: check-vouch + if: > + github.event.pull_request.draft == false && + github.event.pull_request.author_association != 
'MEMBER' && + github.event.pull_request.author_association != 'OWNER' && + github.event.pull_request.author_association != 'COLLABORATOR' + runs-on: ubuntu-latest + steps: + - name: Close non-draft PR + env: + GH_TOKEN: ${{ github.token }} + run: | + STATE=$(gh pr view ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --json state -q '.state') + if [ "$STATE" != "OPEN" ]; then + echo "PR is already closed, skipping." + exit 0 + fi + gh pr close ${{ github.event.pull_request.number }} \ + --repo ${{ github.repository }} \ + --comment "Thanks for your contribution! We require all external PRs to be opened in **draft** status first so you can address CodeRabbit review comments and ensure CI passes before requesting a review. Please re-open this PR as a draft. See [CONTRIBUTING.md](https://github.com/${{ github.repository }}/blob/main/CONTRIBUTING.md#pr-workflow) for details." diff --git a/.github/workflows/vouch-manage-by-issue.yml b/.github/workflows/vouch-manage-by-issue.yml new file mode 100644 index 00000000000..51bce367b3e --- /dev/null +++ b/.github/workflows/vouch-manage-by-issue.yml @@ -0,0 +1,24 @@ +name: Vouch - Manage by Issue + +on: + issue_comment: + types: [created] + +permissions: + contents: write + issues: write + +jobs: + manage: + runs-on: ubuntu-latest + if: >- + contains(github.event.comment.body, 'vouch') || + contains(github.event.comment.body, 'denounce') || + contains(github.event.comment.body, 'unvouch') + steps: + - uses: mitchellh/vouch/action/manage-by-issue@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 + with: + comment-id: ${{ github.event.comment.id }} + issue-id: ${{ github.event.issue.number }} + env: + GH_TOKEN: ${{ github.token }} diff --git a/.gitignore b/.gitignore index c83ebf2446e..5f6adddba0a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,10 +12,12 @@ coverage # next.js .next/ out/ -build dist packages/**/dist +# vendored bundles (generated during build) +packages/**/src/**/vendor + # Tailwind 
apps/**/styles/tailwind.css packages/**/styles/tailwind.css @@ -30,12 +32,10 @@ yarn-debug.log* yarn-error.log* # local env files -.env.docker +.env +.env.* .docker/*.env -.env.local -.env.development.local -.env.test.local -.env.production.local +!.env.example # turbo .turbo @@ -53,4 +53,19 @@ apps/**/public/build /playwright-report/ /playwright/.cache/ -.cosine \ No newline at end of file +.cosine +.trigger +.tshy* +.yarn +*.tsbuildinfo +/packages/cli-v3/src/package.json +.husky +/packages/react-hooks/src/package.json +/packages/core/src/package.json +/packages/trigger-sdk/src/package.json +/packages/python/src/package.json +**/.claude/settings.local.json +.mcp.log +.mcp.json +.cursor/debug.log +ailogger-output.log \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000..ecf08cb1a4d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "internal-packages/otlp-importer/protos"] + path = internal-packages/otlp-importer/protos + url = https://github.com/open-telemetry/opentelemetry-proto.git diff --git a/.npmrc b/.npmrc deleted file mode 100644 index 6da70cea9bb..00000000000 --- a/.npmrc +++ /dev/null @@ -1,2 +0,0 @@ -link-workspace-packages=false -public-hoist-pattern[]=*prisma* \ No newline at end of file diff --git a/.nvmrc b/.nvmrc index b714151ef9b..7c663e0a0bd 100644 --- a/.nvmrc +++ b/.nvmrc @@ -1 +1 @@ -v18.18.0 \ No newline at end of file +v20.20.0 \ No newline at end of file diff --git a/.prettierignore b/.prettierignore index 7c73a4c74a4..a34447dd45d 100644 --- a/.prettierignore +++ b/.prettierignore @@ -7,5 +7,4 @@ tailwind.css **/.react-email/ **/storybook-static/ **/.changeset/ -**/build/ **/dist/ \ No newline at end of file diff --git a/.server-changes/.gitkeep b/.server-changes/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.server-changes/README.md b/.server-changes/README.md new file mode 100644 index 00000000000..82716de981c --- /dev/null +++ 
b/.server-changes/README.md @@ -0,0 +1,81 @@ +# Server Changes + +This directory tracks changes to server-only components (webapp, supervisor, coordinator, etc.) that are not captured by changesets. Changesets only track published npm packages — server changes would otherwise go undocumented. + +## When to add a file + +**Server-only PRs**: If your PR only changes `apps/webapp/`, `apps/supervisor/`, `apps/coordinator/`, or other server components (and does NOT change anything in `packages/`), add a `.server-changes/` file. + +**Mixed PRs** (both packages and server): Just add a changeset as usual. No `.server-changes/` file needed — the changeset covers it. + +**Package-only PRs**: Just add a changeset as usual. + +## File format + +Create a markdown file with a descriptive name: + +``` +.server-changes/fix-batch-queue-stalls.md +``` + +With this format: + +```markdown +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +``` + +### Fields + +- **area** (required): `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- **type** (required): `feature` | `fix` | `improvement` | `breaking` + +### Description + +The body text (below the frontmatter) is a one-line description of the change. Keep it concise — it will appear in release notes. + +## Lifecycle + +1. Engineer adds a `.server-changes/` file in their PR +2. Files accumulate on `main` as PRs merge +3. The changeset release PR includes these in its summary +4. 
After the release merges, CI cleans up the consumed files + +## Examples + +**New feature:** + +```markdown +--- +area: webapp +type: feature +--- + +TRQL query language and the Query page +``` + +**Bug fix:** + +```markdown +--- +area: webapp +type: fix +--- + +Fix schedule limit counting for orgs with custom limits +``` + +**Improvement:** + +```markdown +--- +area: webapp +type: improvement +--- + +Use the replica for API auth queries to reduce primary load +``` diff --git a/.server-changes/admin-feature-flags-dialog.md b/.server-changes/admin-feature-flags-dialog.md new file mode 100644 index 00000000000..2517e21a3b8 --- /dev/null +++ b/.server-changes/admin-feature-flags-dialog.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add admin UI for viewing and editing feature flags (org-level overrides and global defaults). diff --git a/.server-changes/ai-prompt-management.md b/.server-changes/ai-prompt-management.md new file mode 100644 index 00000000000..624ec391047 --- /dev/null +++ b/.server-changes/ai-prompt-management.md @@ -0,0 +1,30 @@ +--- +area: webapp +type: feature +--- + +AI prompt management dashboard and enhanced span inspectors. 
+ +**Prompt management:** +- Prompts list page with version status, model, override indicators, and 24h usage sparklines +- Prompt detail page with template viewer, variable preview, version history timeline, and override editor +- Create, edit, and remove overrides to change prompt content or model without redeploying +- Promote any code-deployed version to current +- Generations tab with infinite scroll, live polling, and inline span inspector +- Per-prompt metrics: total generations, avg tokens, avg cost, latency, with version-level breakdowns + +**AI span inspectors:** +- Custom inspectors for `ai.generateText`, `ai.streamText`, `ai.generateObject`, `ai.streamObject` parent spans +- `ai.toolCall` inspector showing tool name, call ID, and input arguments +- `ai.embed` inspector showing model, provider, and input text +- Prompt tab on AI spans linking to prompt version with template and input variables +- Compact timestamp and duration header on all AI span inspectors + +**AI metrics dashboard:** +- Operations, Providers, and Prompts filters on the AI Metrics dashboard +- Cost by prompt widget +- "AI" section in the sidebar with Prompts and AI Metrics links + +**Other improvements:** +- Resizable panel sizes now persist across page refreshes +- Fixed `
<div>` inside `<p>
` DOM nesting warnings in span titles and chat messages diff --git a/.server-changes/allow-rollbacks-promote-api.md b/.server-changes/allow-rollbacks-promote-api.md new file mode 100644 index 00000000000..fc03fa114ff --- /dev/null +++ b/.server-changes/allow-rollbacks-promote-api.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add allowRollbacks query param to the promote deployment API to enable version downgrades diff --git a/.server-changes/batch-r2-upload-retry.md b/.server-changes/batch-r2-upload-retry.md new file mode 100644 index 00000000000..a2c6415635b --- /dev/null +++ b/.server-changes/batch-r2-upload-retry.md @@ -0,0 +1,9 @@ +--- +area: webapp +type: fix +--- + +Fix transient R2/object store upload failures during batchTrigger() item streaming. + +- Added p-retry (3 attempts, 500ms–2s exponential backoff) around `uploadPacketToObjectStore` in `BatchPayloadProcessor.process()` so transient network errors self-heal server-side rather than aborting the entire batch stream. +- Removed `x-should-retry: false` from the 500 response on the batch items route so the SDK's existing 5xx retry path can recover if server-side retries are exhausted. Item deduplication by index makes full-stream retries safe. diff --git a/.server-changes/ck-index-master-queue-dedup.md b/.server-changes/ck-index-master-queue-dedup.md new file mode 100644 index 00000000000..a2ff6495e61 --- /dev/null +++ b/.server-changes/ck-index-master-queue-dedup.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Concurrency-keyed queues now use a single master queue entry per base queue instead of one entry per key. Prevents high-CK-count tenants from consuming the entire parentQueueLimit window and starving other tenants on the same shard. 
diff --git a/.server-changes/compute-template-shadow-mode.md b/.server-changes/compute-template-shadow-mode.md new file mode 100644 index 00000000000..e8ae0af9b66 --- /dev/null +++ b/.server-changes/compute-template-shadow-mode.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Pre-warm compute templates on deploy for orgs with compute access. Required for projects using a compute region, background-only for others. diff --git a/.server-changes/dev-cli-disconnect.md b/.server-changes/dev-cli-disconnect.md new file mode 100644 index 00000000000..a0790d70765 --- /dev/null +++ b/.server-changes/dev-cli-disconnect.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Added `/engine/v1/dev/disconnect` endpoint to auto-cancel runs when the CLI disconnects. Maximum of 500 runs can be cancelled. Uses the bulk action system when there are more than 25 runs to cancel. \ No newline at end of file diff --git a/.server-changes/enqueue-fast-path.md b/.server-changes/enqueue-fast-path.md new file mode 100644 index 00000000000..65ff0dbaca8 --- /dev/null +++ b/.server-changes/enqueue-fast-path.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Reduce run start latency by skipping the intermediate queue when concurrency is available. This optimization is rolled out per-region and enabled automatically for development environments. diff --git a/.server-changes/env-variables-search-by-environment.md b/.server-changes/env-variables-search-by-environment.md new file mode 100644 index 00000000000..c3f9ed8bc2a --- /dev/null +++ b/.server-changes/env-variables-search-by-environment.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Extended the search filter on the environment variables page to match on environment type (production, staging, development, preview) and branch name, not just variable name and value.
diff --git a/.server-changes/fix-batch-waitpoint-lock-contention.md b/.server-changes/fix-batch-waitpoint-lock-contention.md new file mode 100644 index 00000000000..6b545eb794b --- /dev/null +++ b/.server-changes/fix-batch-waitpoint-lock-contention.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Reduce lock contention when processing large `batchTriggerAndWait` batches. Previously, each batch item acquired a Redis lock on the parent run to insert a `TaskRunWaitpoint` row, causing `LockAcquisitionTimeoutError` with high concurrency (880 errors/24h in prod). Since `blockRunWithCreatedBatch` already transitions the parent to `EXECUTING_WITH_WAITPOINTS` before items are processed, the per-item lock is unnecessary. The new `blockRunWithWaitpointLockless` method performs only the idempotent CTE insert without acquiring the lock. diff --git a/.server-changes/fix-clickhouse-query-client-secure-param.md b/.server-changes/fix-clickhouse-query-client-secure-param.md new file mode 100644 index 00000000000..4daa021fe40 --- /dev/null +++ b/.server-changes/fix-clickhouse-query-client-secure-param.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Strip `secure` query parameter from QUERY_CLICKHOUSE_URL before passing to ClickHouse client. This was already done for the main and logs ClickHouse clients but was missing for the query client, causing a startup crash with `Error: Unknown URL parameters: secure`. diff --git a/.server-changes/fix-dev-env-scope-wrong-member.md b/.server-changes/fix-dev-env-scope-wrong-member.md new file mode 100644 index 00000000000..2bd3c92825c --- /dev/null +++ b/.server-changes/fix-dev-env-scope-wrong-member.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix `OrganizationsPresenter.#getEnvironment` matching the wrong development environment on teams with multiple members. All dev environments share the slug `"dev"`, so the previous `find` by slug alone could return another member's environment. 
Now filters DEVELOPMENT environments by `orgMember.userId` to ensure the logged-in user's dev environment is selected. diff --git a/.server-changes/llm-cost-tracking.md b/.server-changes/llm-cost-tracking.md new file mode 100644 index 00000000000..7567aae7d1b --- /dev/null +++ b/.server-changes/llm-cost-tracking.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add automatic LLM cost calculation for spans with GenAI semantic conventions. When a span arrives with `gen_ai.response.model` and token usage data, costs are calculated from an in-memory pricing registry backed by Postgres and dual-written to both span attributes (`trigger.llm.*`) and a new `llm_metrics_v1` ClickHouse table that captures usage, cost, performance (TTFC, tokens/sec), and behavioral (finish reason, operation type) metrics. diff --git a/.server-changes/mcp-get-span-details.md b/.server-changes/mcp-get-span-details.md new file mode 100644 index 00000000000..336595d2203 --- /dev/null +++ b/.server-changes/mcp-get-span-details.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add API endpoint `GET /api/v1/runs/:runId/spans/:spanId` that returns detailed span information including properties, events, AI enrichment (model, tokens, cost), and triggered child runs. diff --git a/.server-changes/multi-provider-object-storage.md b/.server-changes/multi-provider-object-storage.md new file mode 100644 index 00000000000..6749b5dcdbb --- /dev/null +++ b/.server-changes/multi-provider-object-storage.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Multi-provider object storage with protocol-based routing for zero-downtime migration diff --git a/.server-changes/object-store-iam-auth.md b/.server-changes/object-store-iam-auth.md new file mode 100644 index 00000000000..4a400eb29fe --- /dev/null +++ b/.server-changes/object-store-iam-auth.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add IAM role-based auth support for object stores (no access keys required). 
diff --git a/.server-changes/platform-notifications.md b/.server-changes/platform-notifications.md new file mode 100644 index 00000000000..54d52d77673 --- /dev/null +++ b/.server-changes/platform-notifications.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add platform notifications to inform users about new features, changelogs, and platform events directly in the dashboard. diff --git a/.server-changes/prisma-application-name.md b/.server-changes/prisma-application-name.md new file mode 100644 index 00000000000..825058f3b34 --- /dev/null +++ b/.server-changes/prisma-application-name.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Set `application_name` on Prisma connections from SERVICE_NAME so DB load can be attributed by service diff --git a/.server-changes/private-networking.md b/.server-changes/private-networking.md new file mode 100644 index 00000000000..b9e0006af0f --- /dev/null +++ b/.server-changes/private-networking.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add private networking support via AWS PrivateLink. Includes BillingClient methods for managing private connections, org settings UI pages for connection management, and supervisor changes to apply `privatelink` pod labels for CiliumNetworkPolicy matching. 
diff --git a/.vouch.yml b/.vouch.yml new file mode 100644 index 00000000000..8a9668392d3 --- /dev/null +++ b/.vouch.yml @@ -0,0 +1,2 @@ +vouch: + - github: edosrecki diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 312d6bbd49e..ec85d436e9a 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -1,9 +1,4 @@ { - "recommendations": [ - "astro-build.astro-vscode", - "denoland.vscode-deno" - ], - "unwantedRecommendations": [ - - ] -} \ No newline at end of file + "recommendations": ["bierner.comment-tagged-templates"], + "unwantedRecommendations": [] +} diff --git a/.vscode/launch.json b/.vscode/launch.json index 37375ca33d2..71a76904a2b 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -9,7 +9,34 @@ "request": "launch", "name": "Debug WebApp", "command": "pnpm run dev --filter webapp", - "envFile": "${workspaceFolder}/apps/webapp/.env", + "envFile": "${workspaceFolder}/.env", + "cwd": "${workspaceFolder}", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug realtimeStreams.test.ts", + "command": "pnpm run test -t RealtimeStreams", + "envFile": "${workspaceFolder}/.env", + "cwd": "${workspaceFolder}/apps/webapp", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug triggerTask.test.ts", + "command": "pnpm run test --run ./test/engine/triggerTask.test.ts", + "envFile": "${workspaceFolder}/.env", + "cwd": "${workspaceFolder}/apps/webapp", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug opened test file", + "command": "pnpm run test -- ./${relativeFile}", + "envFile": "${workspaceFolder}/.env", "cwd": "${workspaceFolder}", "sourceMaps": true }, @@ -23,10 +50,121 @@ { "type": "node-terminal", "request": "launch", - "name": "Debug BYO Auth", - "command": "pnpm run byo-auth", - "envFile": "${workspaceFolder}/references/job-catalog/.env", - "cwd": "${workspaceFolder}/references/job-catalog", + 
"name": "Debug V3 init CLI", + "command": "pnpm exec trigger init", + "cwd": "${workspaceFolder}/references/init-shell", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug V3 init dev CLI", + "command": "pnpm exec trigger dev", + "cwd": "${workspaceFolder}/references/init-shell", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug V3 Dev CLI", + "command": "pnpm exec trigger dev", + "cwd": "${workspaceFolder}/references/hello-world", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug Dev Next.js Realtime", + "command": "pnpm exec trigger dev", + "cwd": "${workspaceFolder}/references/nextjs-realtime", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug prisma-catalog deploy CLI", + "command": "pnpm exec trigger deploy --self-hosted --load-image", + "cwd": "${workspaceFolder}/references/prisma-catalog", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug V3 Deploy CLI", + "command": "pnpm exec trigger deploy --self-hosted --load-image", + "cwd": "${workspaceFolder}/references/hello-world", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug V3 list-profiles CLI", + "command": "pnpm exec trigger list-profiles --log-level debug", + "cwd": "${workspaceFolder}/references/hello-world", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug V3 update CLI", + "command": "pnpm exec trigger update", + "cwd": "${workspaceFolder}/references/hello-world", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug V3 Management", + "command": "pnpm run management", + "cwd": "${workspaceFolder}/references/hello-world", + "sourceMaps": true + }, + { + "type": "node", + "request": "attach", + "name": "Attach to Trigger.dev CLI (v3)", + 
"port": 9229, + "restart": true, + "skipFiles": ["<node_internals>/**"] + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug CLI e2e tests", + "command": "MOD=otel-telemetry-loader pnpm run test:e2e", + "cwd": "${workspaceFolder}/packages/cli-v3", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "debug v3 hello-world dev", + "command": "pnpm exec trigger dev", + "cwd": "${workspaceFolder}/references/hello-world", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug RunEngine tests", + "command": "pnpm run test ./src/engine/tests/releaseConcurrencyTokenBucketQueue.test.ts -t 'Should retrieve metrics for all queues via getQueueMetrics'", + "cwd": "${workspaceFolder}/internal-packages/run-engine", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug RunQueue tests", + "command": "pnpm run test ./src/run-queue/index.test.ts --run", + "cwd": "${workspaceFolder}/internal-packages/run-engine", + "sourceMaps": true + }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug d3-demo", + "command": "pnpm exec trigger dev", + "cwd": "${workspaceFolder}/references/d3-demo", "sourceMaps": true } ] diff --git a/.vscode/settings.json b/.vscode/settings.json index 5f523903200..fd9f3dcde0c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,11 @@ { - "deno.enablePaths": ["references/deno-reference"] + "deno.enablePaths": ["references/deno-reference", "runtime_tests/tests/deno"], + "debug.toolBarLocation": "commandCenter", + "typescript.tsdk": "node_modules/typescript/lib", + "search.exclude": { + "**/node_modules/**": true, + "packages/cli-v3/e2e": true + }, + "vitest.disableWorkspaceWarning": true, + "chat.agent.maxRequests": 10000 } diff --git a/.zed/tasks.json b/.zed/tasks.json new file mode 100644 index 00000000000..8612e16bfb1 --- /dev/null +++ b/.zed/tasks.json @@ -0,0 +1,45 @@ +[ + { + "label": "Build 
packages", + "command": "pnpm run build --filter \"@trigger.dev/*\" --filter trigger.dev", + //"args": [], + // Env overrides for the command, will be appended to the terminal's environment from the settings. + "env": { "foo": "bar" }, + // Current working directory to spawn the command into, defaults to current project root. + //"cwd": "/path/to/working/directory", + // Whether to use a new terminal tab or reuse the existing one to spawn the process, defaults to `false`. + "use_new_terminal": false, + // Whether to allow multiple instances of the same task to be run, or rather wait for the existing ones to finish, defaults to `false`. + "allow_concurrent_runs": false, + // What to do with the terminal pane and tab, after the command was started: + // * `always` — always show the task's pane, and focus the corresponding tab in it (default) + // * `no_focus` — always show the task's pane, add the task's tab in it, but don't focus it + // * `never` — do not alter focus, but still add/reuse the task's tab in its pane + "reveal": "always", + // What to do with the terminal pane and tab, after the command has finished: + // * `never` — Do nothing when the command finishes (default) + // * `always` — always hide the terminal tab, hide the pane also if it was the last tab in it + // * `on_success` — hide the terminal tab on task success only, otherwise behaves similar to `always` + "hide": "never", + // Which shell to use when running a task inside the terminal. + // May take 3 values: + // 1. (default) Use the system's default terminal configuration in /etc/passwd + // "shell": "system" + // 2. A program: + // "shell": { + // "program": "sh" + // } + // 3. A program with arguments: + // "shell": { + // "with_arguments": { + // "program": "/bin/bash", + // "args": ["--login"] + // } + // } + "shell": "system", + // Whether to show the task line in the output of the spawned task, defaults to `true`. 
+ "show_summary": true, + // Whether to show the command line in the output of the spawned task, defaults to `true`. + "show_output": true + } +] diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000000..99496f91bde --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,68 @@ +# Guidance for Coding Agents + +This repository is a pnpm monorepo managed with Turbo. It contains multiple apps and packages that make up the Trigger.dev platform and SDK. + +## Repository layout +- `apps/webapp` – Remix application that serves as the main API and dashboard. +- `apps/supervisor` – Node application for executing built tasks. +- `packages/*` – Published packages such as `@trigger.dev/sdk`, the CLI (`trigger.dev`), and shared libraries. +- `internal-packages/*` – Internal-only packages used by the webapp and other apps. +- `references/*` – Example projects for manual testing and development of new features. +- `ai/references` – Contains additional documentation including an overview (`repo.md`) and testing guidelines (`tests.md`). + +See `ai/references/repo.md` for a more complete explanation of the workspaces. + +## Development setup +1. Install dependencies with `pnpm i` (pnpm `10.23.0` and Node.js `20.20.0` are required). +2. Copy `.env.example` to `.env` and generate a random 16 byte hex string for `ENCRYPTION_KEY` (`openssl rand -hex 16`). Update other secrets if needed. +3. Start the local services with Docker: + ```bash + pnpm run docker + ``` +4. Run database migrations: + ```bash + pnpm run db:migrate + ``` +5. Build the webapp, CLI and SDK packages: + ```bash + pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk + ``` +6. Launch the development server: + ```bash + pnpm run dev --filter webapp + ``` + The webapp runs on http://localhost:3030. + +For full setup instructions see `CONTRIBUTING.md`. + +## Running tests +- Unit tests use **vitest**. 
Run all tests: + ```bash + pnpm run test + ``` +- Run tests for a specific workspace (example for `webapp`): + ```bash + pnpm run test --filter webapp + ``` +- Prefer running a single test file from within its directory: + ```bash + cd apps/webapp + pnpm run test ./src/components/Button.test.ts + ``` + If packages in that workspace need to be built first, run `pnpm run build --filter webapp`. + +Refer to `ai/references/tests.md` for details on writing tests. Tests should avoid mocks or stubs and use the helpers from `@internal/testcontainers` when Redis or Postgres are needed. + +## Coding style +- Formatting is enforced using Prettier. Run `pnpm run format` before committing. +- Follow the existing project conventions. Test files live beside the files under test and use descriptive `describe` and `it` blocks. +- Do not commit directly to the `main` branch. All changes should be made in a separate branch and go through a pull request. + +## Additional docs +- The root `README.md` describes Trigger.dev and links to documentation. +- The `docs` workspace contains our documentation site, which can be run locally with: + ```bash + pnpm run dev --filter docs + ``` +- `references/README.md` explains how to create new reference projects for manual testing. + diff --git a/CHANGESETS.md b/CHANGESETS.md index dab130bc8bc..2e225b9ad34 100644 --- a/CHANGESETS.md +++ b/CHANGESETS.md @@ -1,24 +1,49 @@ -# Changesets +# Changesets and Server Changes -Trigger.dev uses [changesets](https://github.com/changesets/changesets) to manage updated our packages and releasing them to npm. +Trigger.dev uses [changesets](https://github.com/changesets/changesets) to manage package versions and releasing them to npm. For server-only changes, we use a lightweight `.server-changes/` convention. 
-## Adding a changeset +## Adding a changeset (package changes) To add a changeset, use `pnpm run changeset:add` and follow the instructions [here](https://github.com/changesets/changesets/blob/main/docs/adding-a-changeset.md). Please only ever select one of our public packages when adding a changeset. -## Release instructions (local only) +## Adding a server change (server-only changes) -Based on the instructions [here](https://github.com/changesets/changesets/blob/main/docs/intro-to-using-changesets.md) +If your PR only changes server components (`apps/webapp/`, `apps/supervisor/`, etc.) and does NOT change any published packages, add a `.server-changes/` file instead of a changeset: -1. Run `pnpm run changeset:version` -2. Run `pnpm run changeset:release` +```sh +cat > .server-changes/fix-batch-queue-stalls.md << 'EOF' +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +EOF +``` + +- `area`: `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- `type`: `feature` | `fix` | `improvement` | `breaking` + +For **mixed PRs** (both packages and server): just add a changeset. No `.server-changes/` file needed. + +See `.server-changes/README.md` for full documentation. + +## When to add which + +| PR changes | What to add | +|---|---| +| Only packages (`packages/`) | Changeset (`pnpm run changeset:add`) | +| Only server (`apps/`) | `.server-changes/` file | +| Both packages and server | Just the changeset | ## Release instructions (CI) Please follow the best-practice of adding changesets in the same commit as the code making the change with `pnpm run changeset:add`, as it will allow our release.yml CI workflow to function properly: -- Anytime new changesets are added in a commit in the `main` branch, the [release.yml](./.github/workflows/release.yml) workflow will run and will automatically create/update a PR with a fresh run of `pnpm run changeset:version`. 
-When the version PR is merged into `main`, the release.yml workflow will automatically run `pnpm run changeset:release` to build and release packages to npm. +- Anytime new changesets are added in a commit in the `main` branch, the [changesets-pr.yml](./.github/workflows/changesets-pr.yml) workflow will run and will automatically create/update a PR with a fresh run of `pnpm run changeset:version`. +- The release PR body is automatically enhanced with a clean, deduplicated summary that includes both package changes and `.server-changes/` entries. +- Consumed `.server-changes/` files are removed on the `changeset-release/main` branch — the same way changesets deletes `.changeset/*.md` files. When the release PR merges, they're gone from main. +- When the version PR is merged into `main`, the [release.yml](./.github/workflows/release.yml) workflow will automatically build, release packages to npm, and create a single unified GitHub release. ## Pre-release instructions @@ -30,28 +55,16 @@ Please follow the best-practice of adding changesets in the same commit as the c ## Snapshot instructions -!MAKE SURE TO UPDATE THE TAG IN THE INSTRUCTIONS BELOW! +1. Update the `.changeset/config.json` file to set the `"changelog"` field to this: -1. Add changesets as usual - -```sh -pnpm run changeset:add +```json +"changelog": "@changesets/cli/changelog", ``` -2. Create a snapshot version (replace "prerelease" with your tag) +2. Do a temporary commit (do NOT push this, you should undo it after) -```sh -pnpm exec changeset version --snapshot prerelease -``` - -3. Build the packages: - -```sh -pnpm run build --filter "@trigger.dev/*" -``` +3. Run `./scripts/publish-prerelease.sh prerelease` -4. Publish the snapshot (replace "dev" with your tag) +You can choose a different tag if you want, but usually `prerelease` is fine. -```sh -pnpm exec changeset publish --no-git-tag --snapshot --tag prerelease -```` +4. Undo the commit where you updated the config.json file. 
diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000000..0a54cced672 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,247 @@ +# CLAUDE.md + +This file provides guidance to Claude Code when working with this repository. Subdirectory CLAUDE.md files provide deeper context when you navigate into specific areas. + +## Build and Development Commands + +This is a pnpm 10.23.0 monorepo using Turborepo. Run commands from root with `pnpm run`. + +```bash +pnpm run docker # Start Docker services (PostgreSQL, Redis, Electric) +pnpm run db:migrate # Run database migrations +pnpm run db:seed # Seed the database (required for reference projects) + +# Build packages (required before running) +pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk + +pnpm run dev --filter webapp # Run webapp (http://localhost:3030) +pnpm run dev --filter trigger.dev --filter "@trigger.dev/*" # Watch CLI and packages +``` + +### Verifying Changes + +The verification command depends on where the change lives: + +- **Apps and internal packages** (`apps/*`, `internal-packages/*`): Use `typecheck`. **Never use `build`** for these — building proves almost nothing about correctness. +- **Public packages** (`packages/*`): Use `build`. + +```bash +# Apps and internal packages — use typecheck +pnpm run typecheck --filter webapp # ~1-2 minutes +pnpm run typecheck --filter @internal/run-engine + +# Public packages — use build +pnpm run build --filter @trigger.dev/sdk +pnpm run build --filter @trigger.dev/core +``` + +Only run typecheck/build after major changes (new files, significant refactors, schema changes). For small edits, trust the types and let CI catch issues. + +## Testing + +We use vitest exclusively. **Never mock anything** - use testcontainers instead. 
+ +```bash +pnpm run test --filter webapp # All tests for a package +cd internal-packages/run-engine +pnpm run test ./src/engine/tests/ttl.test.ts --run # Single test file +pnpm run build --filter @internal/run-engine # May need to build deps first +``` + +Test files go next to source files (e.g., `MyService.ts` -> `MyService.test.ts`). + +### Testcontainers for Redis/PostgreSQL + +```typescript +import { redisTest, postgresTest, containerTest } from "@internal/testcontainers"; + +redisTest("should use redis", async ({ redisOptions }) => { + /* ... */ +}); +postgresTest("should use postgres", async ({ prisma }) => { + /* ... */ +}); +containerTest("should use both", async ({ prisma, redisOptions }) => { + /* ... */ +}); +``` + +## Changesets and Server Changes + +When modifying any public package (`packages/*` or `integrations/*`), add a changeset: + +```bash +pnpm run changeset:add +``` + +- Default to **patch** for bug fixes and minor changes +- Confirm with maintainers before selecting **minor** (new features) +- **Never** select major without explicit approval + +When modifying only server components (`apps/webapp/`, `apps/supervisor/`, etc.) with no package changes, add a `.server-changes/` file instead. See `.server-changes/README.md` for format and documentation. + +## Dependency Pinning + +Zod is pinned to a single version across the entire monorepo (currently `3.25.76`). When adding zod to a new or existing package, use the **exact same version** as the rest of the repo - never a different version or a range. Mismatched zod versions cause runtime type incompatibilities (e.g., schemas from one package can't be used as body validators in another). 
+ +## Architecture Overview + +### Request Flow + +User API call -> Webapp routes -> Services -> RunEngine -> Redis Queue -> Supervisor -> Container execution -> Results back through RunEngine -> ClickHouse (analytics) + PostgreSQL (state) + +### Apps + +- **apps/webapp**: Remix 2.1.0 app - main API, dashboard, orchestration. Uses Express server. +- **apps/supervisor**: Manages task execution containers (Docker/Kubernetes). + +### Public Packages + +- **packages/trigger-sdk** (`@trigger.dev/sdk`): Main SDK for writing tasks +- **packages/cli-v3** (`trigger.dev`): CLI - also bundles code that goes into customer task images +- **packages/core** (`@trigger.dev/core`): Shared types. **Import subpaths only** (never root). +- **packages/build** (`@trigger.dev/build`): Build extensions and types +- **packages/react-hooks**: React hooks for realtime and triggering +- **packages/redis-worker** (`@trigger.dev/redis-worker`): Redis-based background job system + +### Internal Packages + +- **internal-packages/database**: Prisma 6.14.0 client and schema (PostgreSQL) +- **internal-packages/clickhouse**: ClickHouse client, schema migrations, analytics queries +- **internal-packages/run-engine**: "Run Engine 2.0" - core run lifecycle management +- **internal-packages/redis**: Redis client creation utilities (ioredis) +- **internal-packages/testcontainers**: Test helpers for Redis/PostgreSQL containers +- **internal-packages/schedule-engine**: Durable cron scheduling +- **internal-packages/zodworker**: Graphile-worker wrapper (DEPRECATED - use redis-worker) + +### Legacy V1 Engine Code + +The `apps/webapp/app/v3/` directory name is misleading - most code there is actively used by V2. Only specific files are V1-only legacy (MarQS queue, triggerTaskV1, cancelTaskRunV1, etc.). See `apps/webapp/CLAUDE.md` for the exact list. When you encounter V1/V2 branching in services, only modify V2 code paths. All new work uses Run Engine 2.0 (`@internal/run-engine`) and redis-worker. 
+ +### Documentation + +Docs live in `docs/` as a Mintlify site (MDX format). See `docs/CLAUDE.md` for conventions. + +### Reference Projects + +The `references/` directory contains test workspaces for testing SDK and platform features. Use `references/hello-world` to manually test changes before submitting PRs. + +## Docker Image Guidelines + +When updating Docker image references: + +- **Always use multiplatform/index digests**, not architecture-specific digests +- Architecture-specific digests cause CI failures on different build environments +- Use the digest from the main Docker Hub page, not from a specific OS/ARCH variant + +## Writing Trigger.dev Tasks + +Always import from `@trigger.dev/sdk`. Never use `@trigger.dev/sdk/v3` or deprecated `client.defineJob`. + +```typescript +import { task } from "@trigger.dev/sdk"; + +export const myTask = task({ + id: "my-task", + run: async (payload: { message: string }) => { + // Task logic + }, +}); +``` + +### SDK Documentation Rules + +The `rules/` directory contains versioned SDK documentation distributed via the SDK installer. Current version: `rules/manifest.json`. Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked - these are maintained in separate dedicated passes. + +## Testing with hello-world Reference Project + +First-time setup: + +1. `pnpm run db:seed` to seed the database +2. Build CLI: `pnpm run build --filter trigger.dev && pnpm i` +3. 
Authorize: `cd references/hello-world && pnpm exec trigger login -a http://localhost:3030` + +Running: `cd references/hello-world && pnpm exec trigger dev` + +## Local Task Testing Workflow + +### Step 1: Start Webapp in Background + +```bash +# Run from repo root with run_in_background: true +pnpm run dev --filter webapp +curl -s http://localhost:3030/healthcheck # Verify running +``` + +### Step 2: Start Trigger Dev in Background + +```bash +cd references/hello-world && pnpm exec trigger dev +# Wait for "Local worker ready [node]" +``` + +### Step 3: Trigger and Monitor Tasks via MCP + +``` +mcp__trigger__get_current_worker(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev") +mcp__trigger__trigger_task(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev", taskId: "hello-world", payload: {"message": "Hello"}) +mcp__trigger__list_runs(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev", taskIdentifier: "hello-world", limit: 5) +``` + +Dashboard: http://localhost:3030/orgs/references-9dfd/projects/hello-world-97DT/env/dev/runs + + + +# Skill mappings — when working in these areas, load the linked skill file into context. + +skills: + +- task: "Using agentcrumbs for debug tracing, adding crumbs, trails, markers, querying traces, or stripping debug code before merge" + load: "node_modules/agentcrumbs/skills/agentcrumbs/SKILL.md" +- task: "Setting up agentcrumbs in the project, initializing namespace catalog, running crumbs init" + load: "node_modules/agentcrumbs/skills/agentcrumbs/init/SKILL.md" + + +## agentcrumbs + +Add crumbs as you write code — not just when debugging. Mark lines with +`// @crumbs` or wrap blocks in `// #region @crumbs`. They stay on the +branch throughout development and are stripped by `agentcrumbs strip` +before merge. 
+ +### Namespaces + +| Namespace | Description | Path | +| ----------------- | ---------------------------------------------- | ----------------------------------- | +| `webapp` | Main Remix app — API, dashboard, orchestration | `apps/webapp` | +| `supervisor` | Task execution container manager | `apps/supervisor` | +| `sdk` | Main SDK for writing tasks | `packages/trigger-sdk` | +| `cli` | CLI and task image bundling | `packages/cli-v3` | +| `core` | Shared types and subpath exports | `packages/core` | +| `build` | Build extensions and types | `packages/build` | +| `react-hooks` | React hooks for realtime/triggering | `packages/react-hooks` | +| `redis-worker` | Redis-based background job system | `packages/redis-worker` | +| `run-engine` | Run Engine 2.0 — core run lifecycle | `internal-packages/run-engine` | +| `database` | Prisma client and schema | `internal-packages/database` | +| `clickhouse` | ClickHouse client, migrations, analytics | `internal-packages/clickhouse` | +| `schedule-engine` | Durable cron scheduling | `internal-packages/schedule-engine` | +| `run-queue` | Run queue management | `internal-packages/run-queue` | + +Do not invent new namespaces — pick from this table or ask first. + +### For PR reviewers + +Do not flag `// @crumbs` markers, `#region @crumbs` blocks, or agentcrumbs +imports in reviews. These are temporary debug instrumentation stripped +before merge. Data logged in crumbs (IDs, names, values) never reaches +production. + +### CLI + +```bash +pnpm exec agentcrumbs collect # start collector (multi-service) +pnpm exec agentcrumbs tail --app trigger # live tail +pnpm exec agentcrumbs clear --app trigger # remove crumbs before merge +``` + +The preferred way to query for crumbs is to use `pnpm exec agentcrumbs query --app trigger` with the `--limit` option and cursor pagination, and clear existing crumbs before reproducing a bug via `pnpm exec agentcrumbs clear --app trigger`. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 231cbdf1f9a..88e24cba4f0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,21 +2,37 @@ Thank you for taking the time to contribute to Trigger.dev. Your involvement is not just welcomed, but we encourage it! 🚀 -Please take some time to read this guide to understand contributing best practices for Trigger.dev. +Please take some time to read this guide to understand contributing best practices for Trigger.dev. Note that we use [vouch](https://github.com/mitchellh/vouch) to manage contributor trust, so you'll need to be vouched before opening a PR. Thank you for helping us make Trigger.dev even better! 🤩 +> **Important:** We only accept PRs that address a single issue. Please do not submit PRs containing multiple unrelated fixes or features. If you have multiple contributions, open a separate PR for each one. + +## Getting vouched (required before opening a PR) + +We use [vouch](https://github.com/mitchellh/vouch) to manage contributor trust. **PRs from unvouched users are automatically closed.** + +Before you open your first pull request, you need to be vouched by a maintainer. Here's how: + +1. Open a [Vouch Request](https://github.com/triggerdotdev/trigger.dev/issues/new?template=vouch-request.yml) issue. +2. Tell us what you'd like to work on and share any relevant background. +3. A maintainer will review your request and vouch for you by commenting on the issue. +4. Once vouched, your PRs will be accepted normally. + +If you're unsure whether you're already vouched, go ahead and open a PR — the check will tell you. + ## Developing The development branch is `main`. This is the branch that all pull requests should be made against. The changes on the `main` -branch are tagged into a release monthly. +branch are tagged into a release periodically. 
### Prerequisites -- [Node.js](https://nodejs.org/en) version >=16.x -- [pnpm package manager](https://pnpm.io/installation) version 7 +- [Node.js](https://nodejs.org/en) version 20.20.0 +- [pnpm package manager](https://pnpm.io/installation) version 10.23.0 - [Docker](https://www.docker.com/get-started/) +- [protobuf](https://github.com/protocolbuffers/protobuf) ### Setup @@ -33,15 +49,19 @@ branch are tagged into a release monthly. ``` cd trigger.dev ``` -3. Install the required packages using pnpm. +3. Ensure you are on the correct version of Node.js (20.20.0). If you are using `nvm`, there is an `.nvmrc` file that will automatically select the correct version of Node.js when you navigate to the repository. + +4. Run `corepack enable` to use the correct version of pnpm (`10.23.0`) as specified in the root `package.json` file. + +5. Install the required packages using pnpm. ``` pnpm i ``` -4. Create your `.env` file +6. Create your `.env` file ``` cp .env.example .env ``` -5. Open it and generate a new value for `ENCRYPTION_KEY`: +7. Open it and generate a new value for `ENCRYPTION_KEY`: `ENCRYPTION_KEY` is used to two-way encrypt OAuth access tokens and so you'll probably want to actually generate a unique value, and it must be a random 16 byte hex string. You can generate one with the following command: @@ -51,27 +71,21 @@ branch are tagged into a release monthly. Feel free to update `SESSION_SECRET` and `MAGIC_LINK_SECRET` as well using the same method. -6. Start Docker. This starts the required services like Postgres. If this is your first time using Docker, consider going through this [guide](DOCKER_INSTALLATION.md) +8. Start Docker. This starts the required services like Postgres & Redis. 
If this is your first time using Docker, consider going through this [guide](DOCKER_INSTALLATION.md) ``` pnpm run docker ``` - This will also start and run a local instance of [pgAdmin](https://www.pgadmin.org/) on [localhost:5480](http://localhost:5480), preconfigured with email `admin@example.com` and pwd `admin`. Then use `postgres` as the password to the Trigger.dev server. - -7. Migrate the database +9. Migrate the database ``` pnpm run db:migrate ``` -8. Build the app - ``` - pnpm run build --filter webapp - ``` -9. Run the seed script - ``` - pnpm run db:seed - ``` -10. Run the app. See the section below. +10. Build everything + ``` + pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk + ``` +11. Run the app. See the section below. ## Running @@ -83,145 +97,113 @@ branch are tagged into a release monthly. It should run on port `3030`: [http://localhost:3030](http://localhost:3030/) -2. Once the app is running click the magic link button and enter your email. -3. Check your terminal, the magic link email should have printed out as following: - - ```sh - webapp:dev: Log in to Trigger.dev - webapp:dev: - webapp:dev: Click here to log in with this magic link - webapp:dev: [http://localhost:3030/magic?token=U2FsdGVkX18OvB0JxgaswTLCSbaRz%2FY82TN0EZWhSzFyZYwgG%2BIzKVTkeiaOtWfotPw7F8RwFzCHh53aBpMEu%2B%2B%2FItb%2FcJYh89MSjc3Pz92bevoEjqxSQ%2Ff%2BZbks09JOpqlBbYC3FzGWC8vuSVFBlxqLXxteSDLthZSUaC%2BS2LaA%2BJgp%2BLO7hgjAaC2lXbCHrM7MTgTdXOFt7i0Dvvuwz6%2BWY25RnfomZOPqDsyH0xz8Q2rzPTz0Xu53WSXrZ1hd] - webapp:dev: - webapp:dev: If you didn't try to log in, you can safely ignore this email. - ``` - - Paste the magic link shown in your terminal into your browser to login. - -## Adding and running migrations +2. Once the app is running click the magic link button and enter your email. You will automatically be logged in, since you are running locally. Create an Org and your first project in the dashboard. -1. 
Modify packages/database/prisma/schema.prisma file -2. Change directory to the packages/database folder - ```sh - cd packages/database - ``` -3. Generate the Prisma client +## Manual testing using hello-world - ```sh - pnpm run generate - ``` +We use the `/references/hello-world` subdirectory as a staging ground for testing changes to the SDK (`@trigger.dev/sdk` at `/packages/trigger-sdk`), the Core package (`@trigger.dev/core` at `packages/core`), the CLI (`trigger.dev` at `/packages/cli-v3`) and the platform (The remix app at `/apps/webapp`). The instructions below will get you started on using the `hello-world` for local development of Trigger.dev. - The above updates the prisma client generated into node_modules/.prisma/client folder. This helps with typing of relevant prisma models. It ensures typescript - recognizes fields added or removed from a model and type-checks appropriately. +### First-time setup -4. Create and apply the migrations - - ``` - pnpm run db:migrate:dev - ``` +First, make sure you are running the webapp according to the instructions above. Then: - This creates a migration file and executes the migrations against your database and applies changes to the database schema(s) +1. Visit http://localhost:3030 in your browser and create a new project called "hello-world". -5. Commit generated migrations as well as changes to the schema.prisma file -6. If you're using VSCode you may need to restart the Typescript server in the webapp to get updated type inference. Open a TypeScript file, then open the Command Palette (View > Command Palette) and run `TypeScript: Restart TS server`. +2. In Postgres go to the "Projects" table and for the project you create change the `externalRef` to `proj_rrkpdguyagvsoktglnod`. -## Testing CLI changes +3. Build the CLI -To test CLI changes, follow the steps below: +```sh +# Build the CLI +pnpm run build --filter trigger.dev +# Make it accessible to `pnpm exec` +pnpm i +``` -1. Build the CLI and watch for changes +4. 
Change into the `/references/hello-world` directory and authorize the CLI to the local server: ```sh -cd packages/cli -pnpm run dev +cd references/hello-world +cp .env.example .env +pnpm exec trigger login -a http://localhost:3030 ``` -2. Open a new Terminal window and run the webapp locally and then create a new project in the dashboard. Copy out the dev API key. +This will open a new browser window and authorize the CLI against your local user account. -3. Create a new temporary Next.js app in references directory +You can optionally pass a `--profile` flag to the `login` command, which will allow you to use the CLI with separate accounts/servers. We suggest using a profile called `local` for your local development: ```sh -cd ./references -pnpm create next-app@latest test-cli --ts --no-eslint --tailwind --app --src-dir --import-alias "@/*" +cd references/hello-world +pnpm exec trigger login -a http://localhost:3030 --profile local +# later when you run the dev or deploy command: +pnpm exec trigger dev --profile local +pnpm exec trigger deploy --profile local ``` -4. Then once that's finished, add the `@trigger.dev/cli` to the `devDependencies` of the newly created Next.js app's `package.json` file, like so: +### Running -```json -{ - // other package.json properties - "devDependencies": { "@trigger.dev/cli": "workspace:*" } -} -``` +The following steps should be followed any time you start working on a new feature you want to test: -5. Back in the terminal, navigate into the reference, and initialize the CLI. When prompted, select `self-hosted` and enter `localhost:3030` if you are testing against the local instance of Trigger.dev, or you can just use the Trigger.dev cloud. When asked for an API key, use the key you copied earlier. +1. Make sure the webapp is running on localhost:3030 + +2. 
Open a terminal window and build the CLI and packages and watch for changes

```sh
pnpm run dev --filter trigger.dev --filter "@trigger.dev/*"
```

-6. If you are just testing the `init` command, you can stop here. If you'd like to test the `dev` command, first start the Next.js app on port 3000:

+3. Open another terminal window, and change into the `/references/hello-world` directory.
+
+4. Run the `dev` command, which will register all the local tasks with the platform and allow you to start testing task execution:

```sh
-pnpm run dev
+# in /references/hello-world
+pnpm exec trigger dev
```

-7. Open a new terminal window, and then run the `dev` command like so:

+If you want additional debug logging, you can use the `--log-level debug` flag:

```sh
-pnpm exec trigger-cli dev
+# in /references/hello-world
+pnpm exec trigger dev --log-level debug
```

-8. Please remember to delete the temporary project you created after you've tested the changes, and before you raise a PR.

+5. If you make any changes in the CLI/Core/SDK, you'll need to `CTRL+C` to exit the `dev` command and restart it to pickup changes. Any changes to the files inside of the `hello-world/src/trigger` dir will automatically be rebuilt by the `dev` command.

-## Running end-to-end webapp tests

+6. Navigate to the `hello-world` project in your local dashboard at localhost:3030 and you should see the list of tasks.

-To run the end-to-end tests, follow the steps below:

+7. Go to the "Test" page in the sidebar and select a task. Then enter a payload and click "Run test". You can tell what the payloads should be by looking at the relevant task file inside the `/references/hello-world/src/trigger` folder. Many of them accept an empty payload.

-1. Set up environment variables (copy example envs into the correct place)

+8. Feel free to add additional files in `hello-world/src/trigger` to test out specific aspects of the system, or add in edge cases.
-```sh
-cp ./.env.example ./.env
-cp ./references/nextjs-test/.env.example ./references/nextjs-test/.env.local
-```
-
-2. Set up dependencies
+## Adding and running migrations

-```sh
-# Build packages
-pnpm run build --filter @references/nextjs-test^...
-pnpm --filter @trigger.dev/database generate
+1. Modify internal-packages/database/prisma/schema.prisma file
+2. Change directory to the internal-packages/database folder

-# Move trigger-cli bin to correct place
-pnpm install --frozen-lockfile
+   ```sh
+   cd internal-packages/database
+   ```

-# Install playwrite browsers (ONE TIME ONLY)
-npx playwright install
-```
+3. Create a migration

-3. Set up the database
+   ```
+   pnpm run db:migrate:dev:create
+   ```

-```sh
-pnpm run docker
-pnpm run db:migrate
-pnpm run db:seed
-```
+   This creates a migration file. Check the migration file does only what you want. If you're adding any database indexes they must use `CONCURRENTLY`, otherwise they'll lock the table when executed.

-4. Run the end-to-end tests
+4. Run the migration.

-```sh
-pnpm run test:e2e
+```
+pnpm run db:migrate:deploy
+pnpm run generate
```

-### Cleanup
-
-The end-to-end tests use a `setup` and `teardown` script to seed the database with test data. If the test runner doesn't exit cleanly, then the database can be left in a state where the tests can't run because the `setup` script will try to create data that already exists. If this happens, you can manually delete the `users` and `organizations` from the database using prisma studio:
+This executes the migrations against your database and applies changes to the database schema(s), and then regenerates the Prisma client.

-```sh
-# With the database running (i.e. pnpm run docker)
-pnpm run db:studio
-```
+5. Commit generated migrations as well as changes to the schema.prisma file
+6. If you're using VSCode you may need to restart the Typescript server in the webapp to get updated type inference. 
Open a TypeScript file, then open the Command Palette (View > Command Palette) and run `TypeScript: Restart TS server`. ## Add sample jobs @@ -260,9 +242,23 @@ See the [Job Catalog](./references/job-catalog/README.md) file for more. **If you get errors, be sure to fix them before committing.** -- Be sure to [check the "Allow edits from maintainers" option](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork) while creating you PR. -- If your PR refers to or fixes an issue, be sure to add `refs #XXX` or `fixes #XXX` to the PR description. Replacing `XXX` with the respective issue number. See more about [Linking a pull request to an issue - ](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). +> **Note:** We may close PRs if we decide that the cost of integrating the change outweighs the benefits. To improve the chances of your PR getting accepted, follow the guidelines below. + +### PR workflow + +1. **Always open your PR in draft status first.** Do not mark it as "Ready for Review" until the steps below are complete. +2. **Address all CodeRabbit code review comments.** Our CI runs an automated code review via CodeRabbit. Go through each comment and either fix the issue or resolve it with a comment explaining why no change is needed. +3. **Wait for all CI checks to pass.** Do not mark the PR as "Ready for Review" until every check is green. +4. **Then mark the PR as "Ready for Review"** so a maintainer can take a look. + +### Cost/benefit analysis for risky changes + +If your change touches core infrastructure, modifies widely-used code paths, or could introduce regressions, consider doing a brief cost/benefit analysis and including it in the PR description. Explain what the benefit is to users and why the risk is worth it. This goes a long way toward helping maintainers evaluate your contribution. 
+ +### General guidelines + +- Be sure to [check the "Allow edits from maintainers" option](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork) while creating your PR. +- If your PR refers to or fixes an issue, be sure to add `refs #XXX` or `fixes #XXX` to the PR description. Replacing `XXX` with the respective issue number. See more about [Linking a pull request to an issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). - Be sure to fill the PR Template accordingly. ## Adding changesets @@ -285,6 +281,39 @@ You will be prompted to select which packages to include in the changeset. Only Most of the time the changes you'll make are likely to be categorized as patch releases. If you feel like there is the need for a minor or major release of the package based on the changes being made, add the changeset as such and it will be discussed during PR review. +## Adding server changes + +Changesets only track published npm packages. If your PR only changes server components (`apps/webapp/`, `apps/supervisor/`, `apps/coordinator/`, etc.) with no package changes, add a `.server-changes/` file so the change appears in release notes. + +Create a markdown file with a descriptive name: + +```sh +cat > .server-changes/fix-batch-queue-stalls.md << 'EOF' +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +EOF +``` + +**Fields:** +- `area` (required): `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- `type` (required): `feature` | `fix` | `improvement` | `breaking` + +The body text (below the frontmatter) is a one-line description of the change. Keep it concise — it will appear in release notes. 
+ +**When to add which:** + +| PR changes | What to add | +|---|---| +| Only packages (`packages/`) | Changeset | +| Only server (`apps/`) | `.server-changes/` file | +| Both packages and server | Just the changeset | + +See `.server-changes/README.md` for more details. + ## Troubleshooting ### EADDRINUSE: address already in use :::3030 diff --git a/DOCKER_INSTALLATION.md b/DOCKER_INSTALLATION.md index a46e904ee2d..7e135bd6f84 100644 --- a/DOCKER_INSTALLATION.md +++ b/DOCKER_INSTALLATION.md @@ -8,47 +8,56 @@ If you don't have Docker installed on your machine, you'll run into some complic Below are the steps on how you can avoid that. -First you need to setup docker-compose as it is an underlying tool that this command: `pnpm run docker` fires behind the scene. +First you need to setup docker compose as it is an underlying tool that this command: `pnpm run docker` fires behind the scene. ## Linux -To install Docker Compose on Linux Ubuntu via the terminal, you can follow these steps: +To install Docker Compose on Linux Ubuntu, you can follow these steps: -1. Update the package index on your system by running the following command: +1. Create the Docker config directory and cli-plugins subdirectory: ```shell - sudo apt update + DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker} + mkdir -p $DOCKER_CONFIG/cli-plugins ``` -2. Install the required dependencies by running the following command: +2. Download the Docker Compose plugin: ```shell - sudo apt install curl + curl -SL "https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m)" -o $DOCKER_CONFIG/cli-plugins/docker-compose ``` -3. Download the Docker Compose binary into the `/usr/local/bin` directory using the `curl` command: + Note: + + - To install for all users, replace `$DOCKER_CONFIG/cli-plugins` with `/usr/local/lib/docker/cli-plugins` + +3. 
Set the appropriate permissions to make the Docker Compose plugin executable: ```shell - sudo curl -L "https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose ``` -4. Set the appropriate permissions to make the `docker-compose` binary executable: + If you installed for all users: ```shell - sudo chmod +x /usr/local/bin/docker-compose + sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose ``` -5. Verify that Docker Compose has been successfully installed by running the following command: +4. Verify that Docker Compose has been successfully installed: ```shell - docker-compose --version + docker compose version ``` - This command should display the version information of Docker Compose without any errors. + You should see output similar to: + + ``` + Docker Compose version vX.Y.Z + ``` -After following these steps, you should have Docker Compose installed on your Ubuntu system, and you can use it by running `docker-compose` commands in the terminal. +After following these steps, you should have Docker Compose installed on your Ubuntu system, and you can use it by running `docker compose` commands in the terminal. -When you've verified that the `docker-compose` package is installed and you proceed to start Docker with `pnpm run docker`. +When you've verified that the `docker compose` package is installed and you proceed to start Docker with `pnpm run docker`. You'll probably get an error similar to the one below: diff --git a/README.md b/README.md index 1919613c6b6..0d7f1ca2930 100644 --- a/README.md +++ b/README.md @@ -1,90 +1,129 @@

- - - - Trigger.dev logo - - -### The open source background jobs framework -[Discord](https://discord.gg/JtBAxBr2m3) | [Website](https://trigger.dev) | [Issues](https://github.com/triggerdotdev/trigger.dev/issues) | [Docs](https://trigger.dev/docs) +![Trigger.dev logo](https://content.trigger.dev/github-header-banner.jpg) -[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/triggerdotdev.svg?style=social&label=Follow%20%40trigger.dev)](https://twitter.com/triggerdotdev) -[![GitHub Repo stars](https://img.shields.io/github/stars/triggerdotdev/trigger.dev?style=social)](https://github.com/triggerdotdev/trigger.dev) +### Build and deploy fully‑managed AI agents and workflows + +[Website](https://trigger.dev) | [Docs](https://trigger.dev/docs) | [Issues](https://github.com/triggerdotdev/trigger.dev/issues) | [Example projects](https://github.com/triggerdotdev/examples) | [Feature requests](https://triggerdev.featurebase.app/) | [Public roadmap](https://triggerdev.featurebase.app/roadmap) | [Self-hosting](https://trigger.dev/docs/self-hosting/overview) + +[![Open Source](https://img.shields.io/badge/Open%20Source-%E2%9D%A4-red.svg)](https://github.com/triggerdotdev/trigger.dev) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/triggerdotdev/trigger.dev/blob/main/LICENSE) +[![npm](https://img.shields.io/npm/v/@trigger.dev/sdk.svg?label=npm)](https://www.npmjs.com/package/@trigger.dev/sdk) +[![SDK downloads](https://img.shields.io/npm/dm/@trigger.dev/sdk.svg?label=SDK%20downloads)](https://www.npmjs.com/package/@trigger.dev/sdk) + +[![Twitter Follow](https://img.shields.io/twitter/follow/triggerdotdev?style=social)](https://twitter.com/triggerdotdev) +[![Discord](https://img.shields.io/discord/1066956501299777596?logo=discord&logoColor=white&color=7289da)](https://discord.gg/nkqV9xBYWy) +[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/triggerdotdev/trigger.dev) +[![GitHub 
stars](https://img.shields.io/github/stars/triggerdotdev/trigger.dev?style=social)](https://github.com/triggerdotdev/trigger.dev)
-# About Trigger.dev +## About Trigger.dev -Create long-running jobs directly in your codebase with features like API integrations, webhooks, scheduling and delays. +Trigger.dev is the open-source platform for building AI workflows in TypeScript. Long-running tasks with retries, queues, observability, and elastic scaling. -## Long running Jobs on serverless +## The platform designed for building AI agents -Reliably run jobs and don’t worry about function timeouts, we handle those for you. +Build [AI agents](https://trigger.dev/product/ai-agents) using all the frameworks, services and LLMs you're used to, deploy them to Trigger.dev and get durable, long-running tasks with retries, queues, observability, and elastic scaling out of the box. -- Auto-resume after a function timeout -- Auto-resume after a server outage -- Add delays of up to a year +- **Long-running without timeouts**: Execute your tasks with absolutely no timeouts, unlike AWS Lambda, Vercel, and other serverless platforms. -## In your codebase +- **Durability, retries & queues**: Build rock solid agents and AI applications using our durable tasks, retries, queues and idempotency. -Create Jobs where they belong: in your codebase. Version control, localhost, test, review, and deploy like you're already used to. +- **True runtime freedom**: Customize your deployed tasks with system packages – run browsers, Python scripts, FFmpeg and more. -## Secure by design +- **Human-in-the-loop**: Programmatically pause your tasks until a human can approve, reject or give feedback. -We only receive Triggers and the data you choose to send to us. You can even completely self-host the entire platform. +- **Realtime apps & streaming**: Move your background jobs to the foreground by subscribing to runs or streaming AI responses to your app. -## Don't worry about deployment +- **Observability & monitoring**: Each run has full tracing and logs. Configure error alerts to catch bugs fast. 
-Just use our SDK to write Jobs in your codebase. There's nothing extra to deploy and no CI to configure, your Jobs just connect to our cloud. Or you can always self-host. +## Key features: -## Full visibility of every job run +- **[JavaScript and TypeScript SDK](https://trigger.dev/docs/tasks/overview)** - Build background tasks using familiar programming models +- **[Long-running tasks](https://trigger.dev/docs/runs/max-duration)** - Handle resource-heavy tasks without timeouts +- **[Durable cron schedules](https://trigger.dev/docs/tasks/scheduled#scheduled-tasks-cron)** - Create and attach recurring schedules of up to a year +- **[Trigger.dev Realtime](https://trigger.dev/docs/realtime/overview)** - Trigger, subscribe to, and get real-time updates for runs, with LLM streaming support +- **[Build extensions](https://trigger.dev/docs/config/extensions/overview#build-extensions)** - Hook directly into the build system and customize the build process. Run Python scripts, FFmpeg, browsers, and more. +- **[React hooks](https://trigger.dev/docs/frontend/react-hooks#react-hooks)** - Interact with the Trigger.dev API on your frontend using our React hooks package +- **[Batch triggering](https://trigger.dev/docs/triggering#tasks-batchtrigger)** - Use batchTrigger() to initiate multiple runs of a task with custom payloads and options +- **[Structured inputs / outputs](https://trigger.dev/docs/tasks/schemaTask#schematask)** - Define precise data schemas for your tasks with runtime payload validation +- **[Waits](https://trigger.dev/docs/wait)** - Add waits to your tasks to pause execution for a specified duration +- **[Preview branches](https://trigger.dev/docs/deployment/preview-branches)** - Create isolated environments for testing and development. 
Integrates with Vercel and git workflows +- **[Waitpoints](https://trigger.dev/docs/wait-for-token#wait-for-token)** - Add human-in-the-loop judgment at critical decision points without disrupting workflow +- **[Concurrency & queues](https://trigger.dev/docs/queue-concurrency#concurrency-and-queues)** - Set concurrency rules to manage how multiple tasks execute +- **[Multiple environments](https://trigger.dev/docs/how-it-works#dev-mode)** - Support for DEV, PREVIEW, STAGING, and PROD environments +- **[No infrastructure to manage](https://trigger.dev/docs/how-it-works#trigger-dev-architecture)** - Auto-scaling infrastructure that eliminates timeouts and server management +- **[Automatic retries](https://trigger.dev/docs/errors-retrying)** - If your task encounters an uncaught error, we automatically attempt to run it again +- **[Checkpointing](https://trigger.dev/docs/how-it-works#the-checkpoint-resume-system)** - Tasks are inherently durable, thanks to our checkpointing feature +- **[Versioning](https://trigger.dev/docs/versioning)** - Atomic versioning allows you to deploy new versions without affecting running tasks +- **[Machines](https://trigger.dev/docs/machines)** - Configure the number of vCPUs and GBs of RAM you want the task to use +- **[Observability & monitoring](https://trigger.dev/product/observability-and-monitoring)** - Monitor every aspect of your tasks' performance with comprehensive logging and visualization tools +- **[Logging & tracing](https://trigger.dev/docs/logging)** - Comprehensive logging and tracing for all your tasks +- **[Tags](https://trigger.dev/docs/tags#tags)** - Attach up to ten tags to each run, allowing you to filter via the dashboard, realtime, and the SDK +- **[Run metadata](https://trigger.dev/docs/runs/metadata#run-metadata)** - Attach metadata to runs which updates as the run progresses and is available to use in your frontend for live updates +- **[Bulk actions](https://trigger.dev/docs/bulk-actions)** - Perform actions 
on multiple runs simultaneously, including replaying and cancelling +- **[Real-time alerts](https://trigger.dev/docs/troubleshooting-alerts#alerts)** - Choose your preferred notification method for run failures and deployments -View every Task in every Run so you can tell exactly what happened. +## Write tasks in your codebase -![image](https://www.trigger.dev/build/_assets/web-app-2QFKXFLW.png) +Create tasks where they belong: in your codebase. Version control, localhost, test and review like you're already used to. -## Built-in integrations +```ts +import { task } from "@trigger.dev/sdk"; -Easily integrate with hundreds of third-party APIs – including your own. Use API keys (which never leave your server) or let us handle OAuth for you. Install our integration packages and easily subscribe to webhooks and perform common tasks, or you can easily use your existing favorite Node.JS SDKs and get resumability and idempotency through our `runTask` function. +//1. You need to export each task +export const helloWorld = task({ + //2. Use a unique id for each task + id: "hello-world", + //3. The run function is the main function of the task + run: async (payload: { message: string }) => { + //4. You can write code that runs for a long time here, there are no timeouts + console.log(payload.message); + }, +}); +``` -## Our progress +## Deployment -We’re building the most comprehensive and easy-to-use background jobs framework for developers. +Use our SDK to write tasks in your codebase. There's no infrastructure to manage, your tasks automatically scale and connect to our cloud. Or you can always self-host. -Click the links to join the discussions about our upcoming features. +## Environments + +We support `Development`, `Staging`, `Preview`, and `Production` environments, allowing you to test your tasks before deploying them to production. 
+ +## Full visibility of every job run -| Feature | What it does | Status | -| ------------------------------------------------------------------------------------ | --------------------------------------------------- | ------ | -| Integration kit | Official Trigger.dev integrations or build your own | ✅ | -| Self-hosting | Host the platform yourself | ✅ | -| Cloud | Just write code, no deployment required | ✅ | -| Dashboard | View every Task in every Run | ✅ | -| Serverless | Long-running Jobs on your serverless backend | ✅ | -| React hooks | Easily update your UI with Job progress | ✅ | -| React frameworks | Support for Remix, Astro, RedwoodJS & more | ✅ | -| [Background tasks](https://github.com/triggerdotdev/trigger.dev/discussions/400) | Offload long or intense Tasks to our infrastructure | 🛠️ | -| [Long-running servers](https://github.com/triggerdotdev/trigger.dev/discussions/430) | Run Jobs on your long-running backend | 🛠️ | -| Polling Triggers | Subscribe to changes without webhooks | 🕝 | -| Vercel integration | Easy deploy and preview environment support | 🕝 | -| Streaming | Receive data from your Jobs in realtime | 🕝 | -| 100+ integrations | Comprehensive support for popular APIs | 🕝 | -| [Trigger.dev Connect](https://github.com/triggerdotdev/trigger.dev/discussions/441) | Use integrations signed in as your users | 🕝 | -| File IO | Create Tasks that have file outputs | 🕝 | +View every task in every run so you can tell exactly what happened. We provide a full trace view of every task run so you can see what happened at every step. + +![Trace view image](https://content.trigger.dev/trace-view.png) # Getting started -Visit our docs [here](https://trigger.dev/docs). +The quickest way to get started is to create an account and project in our [web app](https://cloud.trigger.dev), and follow the instructions in the onboarding. Build and deploy your first task in minutes. 
+ +### Useful links: + +- [Quick start](https://trigger.dev/docs/quick-start) - get up and running in minutes +- [How it works](https://trigger.dev/docs/how-it-works) - understand how Trigger.dev works under the hood +- [Guides and examples](https://trigger.dev/docs/guides/introduction) - walk-through guides and code examples for popular frameworks and use cases + +## Self-hosting + +If you prefer to self-host Trigger.dev, you can follow our [self-hosting guides](https://trigger.dev/docs/self-hosting/overview): + +- [Docker self-hosting guide](https://trigger.dev/docs/self-hosting/docker) - use Docker Compose to spin up a Trigger.dev instance +- [Kubernetes self-hosting guide](https://trigger.dev/docs/self-hosting/kubernetes) - use our official Helm chart to deploy Trigger.dev to your Kubernetes cluster -## Self-host +## Support and community -We provide an official trigger.dev docker image you can use to easily self-host the platform. We're working on more extensive guides but we currently provide a [Fly.io example repository](https://github.com/triggerdotdev/fly.io) with instructions in the README for deploying and using a self-hosted instance of Trigger.dev on Fly.io. +We have a large active community in our official [Discord server](https://trigger.dev/discord) for support, including a dedicated channel for self-hosting. ## Development To setup and develop locally or contribute to the open source project, follow our [development guide](./CONTRIBUTING.md). -## Meet the Amazing People Behind This Project 🚀 +## Meet the Amazing People Behind This Project: diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 00000000000..8ba3ecb5007 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,36 @@ +## Guide on releasing a new version + +### Automated release (v4+) + +Releases are fully automated via CI: + +1. PRs merge to `main` with changesets (for package changes) and/or `.server-changes/` files (for server-only changes). +2. 
The [changesets-pr.yml](./.github/workflows/changesets-pr.yml) workflow automatically creates/updates the `changeset-release/main` PR with version bumps and an enhanced summary of all changes. Consumed `.server-changes/` files are removed on the release branch (same approach changesets uses for `.changeset/` files — they're deleted on the branch, so merging the PR cleans them up). +3. When ready to release, merge the changeset release PR into `main`. +4. The [release.yml](./.github/workflows/release.yml) workflow automatically: + - Publishes all packages to npm + - Creates a single unified GitHub release (e.g., "trigger.dev v4.3.4") + - Tags and triggers Docker image builds + - After Docker images are pushed, updates the GitHub release with the exact GHCR tag link + +### What engineers need to do + +- **Package changes**: Add a changeset with `pnpm run changeset:add` +- **Server-only changes**: Add a `.server-changes/` file (see `.server-changes/README.md`) +- **Mixed PRs**: Just the changeset is enough + +See `CHANGESETS.md` for full details on changesets and server changes. + +### Legacy release (v3) + +1. Merge in the changeset PR into main, making sure to cancel both the release and publish github actions from that merge. +2. Pull the changes locally into main +3. Run `pnpm i` which will update the pnpm lock file with the new versions +4. create a commit with "Release 3.x.x" and push. This will build and release the packages +5. Create a git tag on that release commit with v.docker.3.x.x and push the tag to origin. This will publish the `v3.x.x` docker image to GitHub Container Registry. +6. Once the image is built and pushed, create a new GitHub release and select the tag you just created, along with the previous tag that was released. +7. This will generate some release notes. Edit out the package changes and leave only the server changes. +8. Name the release `@trigger.dev/docker@3.x.x` +9. Include the package link (e.g. 
https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev/278459584?tag=v3.x.x) +10. Once the packages have been published, head over to the [v2-legacy repo](https://github.com/triggerdotdev/v2-legacy.trigger.dev) and follow the instructions in the README for creating a matching release. +11. Before deploying to cloud, compare the differences in the previously created release and double check to see if there are any migrations with indexes created concurrently, and make sure to run those before deploying. diff --git a/ai/references/migrations.md b/ai/references/migrations.md new file mode 100644 index 00000000000..c6fbf79e9d7 --- /dev/null +++ b/ai/references/migrations.md @@ -0,0 +1,121 @@ +## Creating and applying migrations + +We use prisma migrations to manage the database schema. Please follow the following steps when editing the `internal-packages/database/prisma/schema.prisma` file: + +Edit the `schema.prisma` file to add or modify the schema. + +Create a new migration file but don't apply it yet: + +```bash +cd internal-packages/database +pnpm run db:migrate:dev:create --name "add_new_column_to_table" +``` + +The migration file will be created in the `prisma/migrations` directory, but it will have a bunch of edits to the schema that are not needed and will need to be removed before we can apply the migration. 
Here's an example of what the migration file might look like: + +```sql +-- AlterEnum +ALTER TYPE "public"."TaskRunExecutionStatus" ADD VALUE 'DELAYED'; + +-- AlterTable +ALTER TABLE "public"."TaskRun" ADD COLUMN "debounce" JSONB; + +-- AlterTable +ALTER TABLE "public"."_BackgroundWorkerToBackgroundWorkerFile" ADD CONSTRAINT "_BackgroundWorkerToBackgroundWorkerFile_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_BackgroundWorkerToBackgroundWorkerFile_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_BackgroundWorkerToTaskQueue" ADD CONSTRAINT "_BackgroundWorkerToTaskQueue_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_BackgroundWorkerToTaskQueue_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_TaskRunToTaskRunTag" ADD CONSTRAINT "_TaskRunToTaskRunTag_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_TaskRunToTaskRunTag_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_WaitpointRunConnections" ADD CONSTRAINT "_WaitpointRunConnections_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_WaitpointRunConnections_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_completedWaitpoints" ADD CONSTRAINT "_completedWaitpoints_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_completedWaitpoints_AB_unique"; + +-- CreateIndex +CREATE INDEX "SecretStore_key_idx" ON "public"."SecretStore"("key" text_pattern_ops); + +-- CreateIndex +CREATE INDEX "TaskRun_runtimeEnvironmentId_id_idx" ON "public"."TaskRun"("runtimeEnvironmentId", "id" DESC); + +-- CreateIndex +CREATE INDEX "TaskRun_runtimeEnvironmentId_createdAt_idx" ON "public"."TaskRun"("runtimeEnvironmentId", "createdAt" DESC); +``` + +All the following lines should be removed: + +```sql +-- AlterTable +ALTER TABLE "public"."_BackgroundWorkerToBackgroundWorkerFile" ADD CONSTRAINT "_BackgroundWorkerToBackgroundWorkerFile_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX 
"public"."_BackgroundWorkerToBackgroundWorkerFile_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_BackgroundWorkerToTaskQueue" ADD CONSTRAINT "_BackgroundWorkerToTaskQueue_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_BackgroundWorkerToTaskQueue_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_TaskRunToTaskRunTag" ADD CONSTRAINT "_TaskRunToTaskRunTag_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_TaskRunToTaskRunTag_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_WaitpointRunConnections" ADD CONSTRAINT "_WaitpointRunConnections_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_WaitpointRunConnections_AB_unique"; + +-- AlterTable +ALTER TABLE "public"."_completedWaitpoints" ADD CONSTRAINT "_completedWaitpoints_AB_pkey" PRIMARY KEY ("A", "B"); + +-- DropIndex +DROP INDEX "public"."_completedWaitpoints_AB_unique"; + +-- CreateIndex +CREATE INDEX "SecretStore_key_idx" ON "public"."SecretStore"("key" text_pattern_ops); + +-- CreateIndex +CREATE INDEX "TaskRun_runtimeEnvironmentId_id_idx" ON "public"."TaskRun"("runtimeEnvironmentId", "id" DESC); + +-- CreateIndex +CREATE INDEX "TaskRun_runtimeEnvironmentId_createdAt_idx" ON "public"."TaskRun"("runtimeEnvironmentId", "createdAt" DESC); +``` + +Leaving only this: + +```sql +-- AlterEnum +ALTER TYPE "public"."TaskRunExecutionStatus" ADD VALUE 'DELAYED'; + +-- AlterTable +ALTER TABLE "public"."TaskRun" ADD COLUMN "debounce" JSONB; +``` + +After editing the migration file, apply the migration: + +```bash +cd internal-packages/database +pnpm run db:migrate:deploy && pnpm run generate +``` diff --git a/ai/references/repo.md b/ai/references/repo.md new file mode 100644 index 00000000000..4f67bde2b4b --- /dev/null +++ b/ai/references/repo.md @@ -0,0 +1,37 @@ +## Repo Overview + +This is a pnpm 10.23.0 monorepo that uses turborepo @turbo.json. 
The following workspaces are relevant + +## Apps + +- /apps/webapp is a remix app that is the main API and dashboard for trigger.dev +- /apps/supervisor is a node.js app that handles the execution of built tasks, interaction with the webapp through internal "engine" APIs, as well as interfacing with things like docker or kubernetes, to execute the code. + +## Public Packages + +- /packages/trigger-sdk is the `@trigger.dev/sdk` main SDK package. +- /packages/cli-v3 is the `trigger.dev` CLI package. See our [CLI dev command](https://trigger.dev/docs/cli-dev.md) and [Deployment](https://trigger.dev/docs/deployment/overview.md) docs for more information. +- /packages/core is the `@trigger.dev/core` package that is shared across the SDK and other packages +- /packages/build defines the types and prebuilt build extensions for trigger.dev. See our [build extensions docs](https://trigger.dev/docs/config/extensions/overview.md) for more information. +- /packages/react-hooks defines some useful react hooks like our realtime hooks. See our [Realtime hooks](https://trigger.dev/docs/frontend/react-hooks/realtime.md) and our [Trigger hooks](https://trigger.dev/docs/frontend/react-hooks/triggering.md) for more information. +- /packages/redis-worker is the `@trigger.dev/redis-worker` package that implements a custom background job/worker system powered by redis for offloading work to the background, used in the webapp and also in the Run Engine 2.0. + +## Internal Packages + +- /internal-packages/\* are packages that are used internally only, not published, and usually they have a tsc build step and are used in the webapp +- /internal-packages/database is the `@trigger.dev/database` package that exports a prisma client, has the schema file, and exports a few other helpers. 
+- /internal-packages/run-engine is the `@internal/run-engine` package that is "Run Engine 2.0" and handles moving a run all the way through its lifecycle +- /internal-packages/redis is the `@internal/redis` package that exports Redis types and the `createRedisClient` function to unify how we create redis clients in the repo. It's not used everywhere yet, but it's the preferred way to create redis clients from now on. +- /internal-packages/testcontainers is the `@internal/testcontainers` package that exports a few useful functions for spinning up local testcontainers when writing vitest tests. See our [tests.md](./tests.md) file for more information. +- /internal-packages/zodworker is the `@internal/zodworker` package that implements a wrapper around graphile-worker that allows us to use zod to validate our background jobs. We are moving away from using graphile-worker as our background job system, replacing it with our own redis-worker package. + +## References + +- /references/\* are test workspaces that we use to write and test the system. Not quite e2e tests or automated, but just a useful place to help develop new features + +## Other + +- /docs is our trigger.dev/docs mintlify documentation site +- /docker/Dockerfile is the one that creates the main trigger.dev published image +- /docker/docker-compose.yml is the file we run locally to start postgresql, redis, and electric when we are doing local development. You can run it with `pnpm run docker` +- /CONTRIBUTING.md defines the steps it takes for OSS contributors to start contributing. diff --git a/ai/references/tests.md b/ai/references/tests.md new file mode 100644 index 00000000000..2bb236c75bc --- /dev/null +++ b/ai/references/tests.md @@ -0,0 +1,86 @@ +## Running Tests + +We use vitest exclusively for testing. 
To execute tests for a particular workspace, run the following command: + +```bash +pnpm run test --filter webapp +``` + +Prefer running tests on a single file (and first cding into the directory): + +```bash +cd apps/webapp +pnpm run test ./src/components/Button.test.ts +``` + +If you are cd'ing into a directory, you may have to build dependencies first: + +```bash +pnpm run build --filter webapp +cd apps/webapp +pnpm run test ./src/components/Button.test.ts +``` + +## Writing Tests + +We use vitest for testing. We almost NEVER mock anything. Start with a top-level "describe", and have multiple "it" statements inside of it. + +New test files should be placed right next to the file being tested. For example: + +- Source file: `./src/services/MyService.ts` +- Test file: `./src/services/MyService.test.ts` + +When writing anything that needs redis or postgresql, we have some internal "testcontainers" that are used to spin up a local instance, redis, or both. + +redisTest: + +```typescript +import { redisTest } from "@internal/testcontainers"; +import { createRedisClient } from "@internal/redis"; + +describe("redisTest", () => { + redisTest("should use redis", async ({ redisOptions }) => { + const redis = createRedisClient(redisOptions); + + await redis.set("test", "test"); + const result = await redis.get("test"); + expect(result).toEqual("test"); + }); +}); +``` + +postgresTest: + +```typescript +import { postgresTest } from "@internal/testcontainers"; + +describe("postgresTest", () => { + postgresTest("should use postgres", async ({ prisma }) => { + // prisma is an instance of PrismaClient + }); +}); +``` + +containerTest: + +```typescript +import { containerTest } from "@internal/testcontainers"; + +describe("containerTest", () => { + containerTest("should use container", async ({ prisma, redisOptions }) => { + // container has both prisma and redis + }); +}); +``` + +## Dos and Dont's + +- Do not mock anything. +- Do not use mocks in tests. 
+- Do not use spies in tests. +- Do not use stubs in tests. +- Do not use fakes in tests. +- Do not use sinon in tests. +- Structure each test with a setup, action, and assertion style. +- Feel free to write long test names. +- If there is any randomness in the code under test, use `seedrandom` to make it deterministic by allowing the caller to provide a seed. diff --git a/ailogger-output.log b/ailogger-output.log new file mode 100644 index 00000000000..e69de29bb2d diff --git a/apps/coordinator/.env.example b/apps/coordinator/.env.example new file mode 100644 index 00000000000..77377ab3cfd --- /dev/null +++ b/apps/coordinator/.env.example @@ -0,0 +1,4 @@ +HTTP_SERVER_PORT=8020 +PLATFORM_ENABLED=true +PLATFORM_WS_PORT=3030 +SECURE_CONNECTION=false \ No newline at end of file diff --git a/apps/coordinator/.gitignore b/apps/coordinator/.gitignore new file mode 100644 index 00000000000..5c84119d635 --- /dev/null +++ b/apps/coordinator/.gitignore @@ -0,0 +1,3 @@ +dist/ +node_modules/ +.env \ No newline at end of file diff --git a/apps/coordinator/Containerfile b/apps/coordinator/Containerfile new file mode 100644 index 00000000000..9e973675ab9 --- /dev/null +++ b/apps/coordinator/Containerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:labs + +FROM node:20-bookworm-slim@sha256:72f2f046a5f8468db28730b990b37de63ce93fd1a72a40f531d6aa82afdf0d46 AS node-20 + +WORKDIR /app + +FROM node-20 AS pruner + +COPY --chown=node:node . . +RUN npx -q turbo@1.10.9 prune --scope=coordinator --docker +RUN find . -name "node_modules" -type d -prune -exec rm -rf '{}' + + +FROM node-20 AS base + +RUN apt-get update \ + && apt-get install -y buildah ca-certificates dumb-init docker.io busybox \ + && rm -rf /var/lib/apt/lists/* + +COPY --chown=node:node .gitignore .gitignore +COPY --from=pruner --chown=node:node /app/out/json/ . 
+COPY --from=pruner --chown=node:node /app/out/pnpm-lock.yaml ./pnpm-lock.yaml +COPY --from=pruner --chown=node:node /app/out/pnpm-workspace.yaml ./pnpm-workspace.yaml + +FROM base AS dev-deps +RUN corepack enable +ENV NODE_ENV development + +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm fetch --no-frozen-lockfile +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm install --ignore-scripts --no-frozen-lockfile + +FROM base AS builder +RUN corepack enable + +COPY --from=pruner --chown=node:node /app/out/full/ . +COPY --from=dev-deps --chown=node:node /app/ . +COPY --chown=node:node turbo.json turbo.json + +RUN pnpm run -r --filter @trigger.dev/core bundle-vendor && pnpm run -r --filter coordinator build:bundle + +FROM alpine AS cri-tools + +WORKDIR /cri-tools + +ARG CRICTL_VERSION=v1.29.0 +ARG CRICTL_CHECKSUM=sha256:d16a1ffb3938f5a19d5c8f45d363bd091ef89c0bc4d44ad16b933eede32fdcbb +ADD --checksum=${CRICTL_CHECKSUM} \ + https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz . +RUN tar zxvf crictl-${CRICTL_VERSION}-linux-amd64.tar.gz + +FROM base AS runner + +RUN corepack enable +ENV NODE_ENV production + +COPY --from=cri-tools --chown=node:node /cri-tools/crictl /usr/local/bin +COPY --from=builder --chown=node:node /app/apps/coordinator/dist/index.mjs ./index.mjs + +EXPOSE 8000 + +CMD [ "/usr/bin/dumb-init", "--", "/usr/local/bin/node", "./index.mjs" ] diff --git a/apps/coordinator/README.md b/apps/coordinator/README.md new file mode 100644 index 00000000000..fa6da2462bd --- /dev/null +++ b/apps/coordinator/README.md @@ -0,0 +1,3 @@ +# Coordinator + +Sits between the platform and tasks. Facilitates communication and checkpointing, amongst other things. 
diff --git a/apps/coordinator/package.json b/apps/coordinator/package.json new file mode 100644 index 00000000000..3b4240bd37d --- /dev/null +++ b/apps/coordinator/package.json @@ -0,0 +1,30 @@ +{ + "name": "coordinator", + "private": true, + "version": "0.0.1", + "description": "", + "main": "dist/index.cjs", + "scripts": { + "build": "npm run build:bundle", + "build:bundle": "esbuild src/index.ts --bundle --outfile=dist/index.mjs --platform=node --format=esm --target=esnext --banner:js=\"import { createRequire } from 'module';const require = createRequire(import.meta.url);\"", + "build:image": "docker build -f Containerfile . -t coordinator", + "dev": "tsx --no-warnings=ExperimentalWarning --require dotenv/config --watch src/index.ts", + "start": "tsx src/index.ts", + "typecheck": "tsc --noEmit" + }, + "keywords": [], + "author": "", + "license": "MIT", + "dependencies": { + "@trigger.dev/core": "workspace:*", + "nanoid": "^5.0.6", + "prom-client": "^15.1.0", + "socket.io": "4.7.4", + "tinyexec": "^0.3.0" + }, + "devDependencies": { + "dotenv": "^16.4.2", + "esbuild": "^0.19.11", + "tsx": "^4.7.0" + } +} \ No newline at end of file diff --git a/apps/coordinator/src/chaosMonkey.ts b/apps/coordinator/src/chaosMonkey.ts new file mode 100644 index 00000000000..e2bc147674f --- /dev/null +++ b/apps/coordinator/src/chaosMonkey.ts @@ -0,0 +1,88 @@ +import { setTimeout as timeout } from "node:timers/promises"; + +class ChaosMonkeyError extends Error { + constructor(message: string) { + super(message); + this.name = "ChaosMonkeyError"; + } +} + +export class ChaosMonkey { + private chaosEventRate = 0.2; + private delayInSeconds = 45; + + constructor( + private enabled = false, + private disableErrors = false, + private disableDelays = false + ) { + if (this.enabled) { + console.log("🍌 Chaos monkey enabled"); + } + } + + static Error = ChaosMonkeyError; + + enable() { + this.enabled = true; + console.log("🍌 Chaos monkey enabled"); + } + + disable() { + this.enabled = false; 
+ console.log("🍌 Chaos monkey disabled"); + } + + async call({ + throwErrors = !this.disableErrors, + addDelays = !this.disableDelays, + }: { + throwErrors?: boolean; + addDelays?: boolean; + } = {}) { + if (!this.enabled) { + return; + } + + const random = Math.random(); + + if (random > this.chaosEventRate) { + // Don't interfere with normal operation + return; + } + + const chaosEvents: Array<() => Promise> = []; + + if (addDelays) { + chaosEvents.push(async () => { + console.log("🍌 Chaos monkey: Add delay"); + + await timeout(this.delayInSeconds * 1000); + }); + } + + if (throwErrors) { + chaosEvents.push(async () => { + console.log("🍌 Chaos monkey: Throw error"); + + throw new ChaosMonkey.Error("🍌 Chaos monkey: Throw error"); + }); + } + + if (chaosEvents.length === 0) { + console.error("🍌 Chaos monkey: No events selected"); + return; + } + + const randomIndex = Math.floor(Math.random() * chaosEvents.length); + + const chaosEvent = chaosEvents[randomIndex]; + + if (!chaosEvent) { + console.error("🍌 Chaos monkey: No event found"); + return; + } + + await chaosEvent(); + } +} diff --git a/apps/coordinator/src/checkpointer.ts b/apps/coordinator/src/checkpointer.ts new file mode 100644 index 00000000000..b5d4b52a252 --- /dev/null +++ b/apps/coordinator/src/checkpointer.ts @@ -0,0 +1,708 @@ +import { ExponentialBackoff } from "@trigger.dev/core/v3/apps"; +import { testDockerCheckpoint } from "@trigger.dev/core/v3/serverOnly"; +import { nanoid } from "nanoid"; +import fs from "node:fs/promises"; +import { ChaosMonkey } from "./chaosMonkey"; +import { Buildah, Crictl, Exec } from "./exec"; +import { setTimeout } from "node:timers/promises"; +import { TempFileCleaner } from "./cleaner"; +import { numFromEnv, boolFromEnv } from "./util"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +type CheckpointerInitializeReturn = { + canCheckpoint: boolean; + willSimulate: boolean; +}; + +type CheckpointAndPushOptions = { + runId: 
string; + leaveRunning?: boolean; + projectRef: string; + deploymentVersion: string; + shouldHeartbeat?: boolean; + attemptNumber?: number; +}; + +type CheckpointAndPushResult = + | { success: true; checkpoint: CheckpointData } + | { + success: false; + reason?: "CANCELED" | "ERROR" | "SKIP_RETRYING"; + }; + +type CheckpointData = { + location: string; + docker: boolean; +}; + +type CheckpointerOptions = { + dockerMode: boolean; + forceSimulate: boolean; + heartbeat: (runId: string) => void; + registryHost?: string; + registryNamespace?: string; + registryTlsVerify?: boolean; + disableCheckpointSupport?: boolean; + checkpointPath?: string; + simulateCheckpointFailure?: boolean; + simulateCheckpointFailureSeconds?: number; + simulatePushFailure?: boolean; + simulatePushFailureSeconds?: number; + chaosMonkey?: ChaosMonkey; +}; + +async function getFileSize(filePath: string): Promise { + try { + const stats = await fs.stat(filePath); + return stats.size; + } catch (error) { + console.error("Error getting file size:", error); + return -1; + } +} + +async function getParsedFileSize(filePath: string) { + const sizeInBytes = await getFileSize(filePath); + + let message = `Size in bytes: ${sizeInBytes}`; + + if (sizeInBytes > 1024 * 1024) { + const sizeInMB = (sizeInBytes / 1024 / 1024).toFixed(2); + message = `Size in MB (rounded): ${sizeInMB}`; + } else if (sizeInBytes > 1024) { + const sizeInKB = (sizeInBytes / 1024).toFixed(2); + message = `Size in KB (rounded): ${sizeInKB}`; + } + + return { + path: filePath, + sizeInBytes, + message, + }; +} + +export class Checkpointer { + #initialized = false; + #canCheckpoint = false; + #dockerMode: boolean; + + #logger = new SimpleStructuredLogger("checkpointer"); + + #failedCheckpoints = new Map(); + + // Indexed by run ID + #runAbortControllers = new Map< + string, + { signal: AbortSignal; abort: AbortController["abort"] } + >(); + + private registryHost: string; + private registryNamespace: string; + private registryTlsVerify: 
boolean; + + private disableCheckpointSupport: boolean; + + private simulateCheckpointFailure: boolean; + private simulateCheckpointFailureSeconds: number; + private simulatePushFailure: boolean; + private simulatePushFailureSeconds: number; + + private chaosMonkey: ChaosMonkey; + private tmpCleaner?: TempFileCleaner; + + constructor(private opts: CheckpointerOptions) { + this.#dockerMode = opts.dockerMode; + + this.registryHost = opts.registryHost ?? "localhost:5000"; + this.registryNamespace = opts.registryNamespace ?? "trigger"; + this.registryTlsVerify = opts.registryTlsVerify ?? true; + + this.disableCheckpointSupport = opts.disableCheckpointSupport ?? false; + + this.simulateCheckpointFailure = opts.simulateCheckpointFailure ?? false; + this.simulateCheckpointFailureSeconds = opts.simulateCheckpointFailureSeconds ?? 300; + this.simulatePushFailure = opts.simulatePushFailure ?? false; + this.simulatePushFailureSeconds = opts.simulatePushFailureSeconds ?? 300; + + this.chaosMonkey = opts.chaosMonkey ?? new ChaosMonkey(!!process.env.CHAOS_MONKEY_ENABLED); + this.tmpCleaner = this.#createTmpCleaner(); + } + + async init(): Promise { + if (this.#initialized) { + return this.#getInitReturn(this.#canCheckpoint); + } + + this.#logger.log(`${this.#dockerMode ? 
"Docker" : "Kubernetes"} mode`); + + if (this.#dockerMode) { + const testCheckpoint = await testDockerCheckpoint(); + + if (testCheckpoint.ok) { + return this.#getInitReturn(true); + } + + this.#logger.error(testCheckpoint.message, { error: testCheckpoint.error }); + return this.#getInitReturn(false); + } + + const canLogin = await Buildah.canLogin(this.registryHost); + + if (!canLogin) { + this.#logger.error(`No checkpoint support: Not logged in to registry ${this.registryHost}`); + } + + return this.#getInitReturn(canLogin); + } + + #getInitReturn(canCheckpoint: boolean): CheckpointerInitializeReturn { + this.#canCheckpoint = canCheckpoint; + + if (canCheckpoint) { + if (!this.#initialized) { + this.#logger.log("Full checkpoint support!"); + } + } + + this.#initialized = true; + + const willSimulate = this.#dockerMode && (!this.#canCheckpoint || this.opts.forceSimulate); + + if (willSimulate) { + this.#logger.log("Simulation mode enabled. Containers will be paused, not checkpointed.", { + forceSimulate: this.opts.forceSimulate, + }); + } + + return { + canCheckpoint, + willSimulate, + }; + } + + #getImageRef(projectRef: string, deploymentVersion: string, shortCode: string) { + return `${this.registryHost}/${this.registryNamespace}/${projectRef}:${deploymentVersion}.prod-${shortCode}`; + } + + #getExportLocation(projectRef: string, deploymentVersion: string, shortCode: string) { + const basename = `${projectRef}-${deploymentVersion}-${shortCode}`; + + if (this.#dockerMode) { + return basename; + } else { + return Crictl.getExportLocation(basename); + } + } + + async checkpointAndPush( + opts: CheckpointAndPushOptions, + delayMs?: number + ): Promise { + const start = performance.now(); + this.#logger.log(`checkpointAndPush() start`, { start, opts }); + + const { runId } = opts; + + let interval: NodeJS.Timer | undefined; + if (opts.shouldHeartbeat) { + interval = setInterval(() => { + this.#logger.log("Sending heartbeat", { runId }); + this.opts.heartbeat(runId); 
+ }, 20_000); + } + + const controller = new AbortController(); + const signal = controller.signal; + const abort = controller.abort.bind(controller); + + const onAbort = () => { + this.#logger.error("Checkpoint aborted", { runId, options: opts }); + }; + + signal.addEventListener("abort", onAbort, { once: true }); + + const removeCurrentAbortController = () => { + const controller = this.#runAbortControllers.get(runId); + + // Ensure only the current controller is removed + if (controller && controller.signal === signal) { + this.#runAbortControllers.delete(runId); + } + + // Remove the abort listener in case it hasn't fired + signal.removeEventListener("abort", onAbort); + }; + + if (!this.#dockerMode && !this.#canCheckpoint) { + this.#logger.error("No checkpoint support. Simulation requires docker."); + this.#failCheckpoint(runId, "NO_SUPPORT"); + return; + } + + if (this.#isRunCheckpointing(runId)) { + this.#logger.error("Checkpoint procedure already in progress", { options: opts }); + this.#failCheckpoint(runId, "IN_PROGRESS"); + return; + } + + // This is a new checkpoint, clear any last failure for this run + this.#clearFailedCheckpoint(runId); + + if (this.disableCheckpointSupport) { + this.#logger.error("Checkpoint support disabled", { options: opts }); + this.#failCheckpoint(runId, "DISABLED"); + return; + } + + this.#runAbortControllers.set(runId, { signal, abort }); + + try { + const result = await this.#checkpointAndPushWithBackoff(opts, { delayMs, signal }); + + const end = performance.now(); + this.#logger.log(`checkpointAndPush() end`, { + start, + end, + diff: end - start, + diffWithoutDelay: end - start - (delayMs ?? 
0), + opts, + success: result.success, + delayMs, + }); + + if (!result.success) { + return; + } + + return result.checkpoint; + } finally { + if (opts.shouldHeartbeat) { + // @ts-ignore - Some kind of node incompatible type issue + clearInterval(interval); + } + removeCurrentAbortController(); + } + } + + #isRunCheckpointing(runId: string) { + return this.#runAbortControllers.has(runId); + } + + cancelAllCheckpointsForRun(runId: string): boolean { + this.#logger.log("cancelAllCheckpointsForRun: call", { runId }); + + // If the last checkpoint failed, pretend we canceled it + // This ensures tasks don't wait for external resume messages to continue + if (this.#hasFailedCheckpoint(runId)) { + this.#logger.log("cancelAllCheckpointsForRun: hasFailedCheckpoint", { runId }); + this.#clearFailedCheckpoint(runId); + return true; + } + + const controller = this.#runAbortControllers.get(runId); + + if (!controller) { + this.#logger.debug("cancelAllCheckpointsForRun: no abort controller", { runId }); + return false; + } + + const { abort, signal } = controller; + + if (signal.aborted) { + this.#logger.debug("cancelAllCheckpointsForRun: signal already aborted", { runId }); + return false; + } + + abort("cancelCheckpoint()"); + this.#runAbortControllers.delete(runId); + + return true; + } + + async #checkpointAndPushWithBackoff( + { + runId, + leaveRunning = true, // This mirrors kubernetes behaviour more accurately + projectRef, + deploymentVersion, + attemptNumber, + }: CheckpointAndPushOptions, + { delayMs, signal }: { delayMs?: number; signal: AbortSignal } + ): Promise { + if (delayMs && delayMs > 0) { + this.#logger.log("Delaying checkpoint", { runId, delayMs }); + + try { + await setTimeout(delayMs, undefined, { signal }); + } catch (error) { + this.#logger.log("Checkpoint canceled during initial delay", { runId }); + return { success: false, reason: "CANCELED" }; + } + } + + this.#logger.log("Checkpointing with backoff", { + runId, + leaveRunning, + projectRef, + 
deploymentVersion, + }); + + const backoff = new ExponentialBackoff() + .type("EqualJitter") + .base(3) + .max(3 * 3600) + .maxElapsed(48 * 3600); + + for await (const { delay, retry } of backoff) { + try { + if (retry > 0) { + this.#logger.error("Retrying checkpoint", { + runId, + retry, + delay, + }); + + try { + await setTimeout(delay.milliseconds, undefined, { signal }); + } catch (error) { + this.#logger.log("Checkpoint canceled during retry delay", { runId }); + return { success: false, reason: "CANCELED" }; + } + } + + const result = await this.#checkpointAndPush( + { + runId, + leaveRunning, + projectRef, + deploymentVersion, + attemptNumber, + }, + { signal } + ); + + if (result.success) { + return result; + } + + if (result.reason === "CANCELED") { + this.#logger.log("Checkpoint canceled, won't retry", { runId }); + // Don't fail the checkpoint, as it was canceled + return result; + } + + if (result.reason === "SKIP_RETRYING") { + this.#logger.log("Skipping retrying", { runId }); + return result; + } + + continue; + } catch (error) { + this.#logger.error("Checkpoint error", { + retry, + runId, + delay, + error: error instanceof Error ? 
error.message : error, + }); + } + } + + this.#logger.error(`Checkpoint failed after exponential backoff`, { + runId, + leaveRunning, + projectRef, + deploymentVersion, + }); + this.#failCheckpoint(runId, "ERROR"); + + return { success: false, reason: "ERROR" }; + } + + async #checkpointAndPush( + { + runId, + leaveRunning = true, // This mirrors kubernetes behaviour more accurately + projectRef, + deploymentVersion, + attemptNumber, + }: CheckpointAndPushOptions, + { signal }: { signal: AbortSignal } + ): Promise { + await this.init(); + + const options = { + runId, + leaveRunning, + projectRef, + deploymentVersion, + attemptNumber, + }; + + const shortCode = nanoid(8); + const imageRef = this.#getImageRef(projectRef, deploymentVersion, shortCode); + const exportLocation = this.#getExportLocation(projectRef, deploymentVersion, shortCode); + + const buildah = new Buildah({ id: `${runId}-${shortCode}`, abortSignal: signal }); + const crictl = new Crictl({ id: `${runId}-${shortCode}`, abortSignal: signal }); + + const cleanup = async () => { + const metadata = { + runId, + exportLocation, + imageRef, + }; + + if (this.#dockerMode) { + this.#logger.debug("Skipping cleanup in docker mode", metadata); + return; + } + + this.#logger.log("Cleaning up", metadata); + + try { + await buildah.cleanup(); + await crictl.cleanup(); + } catch (error) { + this.#logger.error("Error during cleanup", { ...metadata, error }); + } + }; + + try { + await this.chaosMonkey.call(); + + this.#logger.log("checkpointAndPush: checkpointing", { options }); + + const containterName = this.#getRunContainerName(runId); + + // Create checkpoint (docker) + if (this.#dockerMode) { + await this.#createDockerCheckpoint( + signal, + runId, + exportLocation, + leaveRunning, + attemptNumber + ); + + this.#logger.log("checkpointAndPush: checkpoint created", { + runId, + location: exportLocation, + }); + + return { + success: true, + checkpoint: { + location: exportLocation, + docker: true, + }, + }; + } + 
+ // Create checkpoint (CRI) + if (!this.#canCheckpoint) { + this.#logger.error("No checkpoint support in kubernetes mode."); + return { success: false, reason: "SKIP_RETRYING" }; + } + + const containerId = await crictl.ps(containterName, true); + + if (!containerId.stdout) { + this.#logger.error("could not find container id", { options, containterName }); + return { success: false, reason: "SKIP_RETRYING" }; + } + + const start = performance.now(); + + if (this.simulateCheckpointFailure) { + if (performance.now() < this.simulateCheckpointFailureSeconds * 1000) { + this.#logger.error("Simulating checkpoint failure", { options }); + throw new Error("SIMULATE_CHECKPOINT_FAILURE"); + } + } + + // Create checkpoint + await crictl.checkpoint(containerId.stdout, exportLocation); + const postCheckpoint = performance.now(); + + // Print checkpoint size + const size = await getParsedFileSize(exportLocation); + this.#logger.log("checkpoint archive created", { size, options }); + + // Create image from checkpoint + const workingContainer = await buildah.from("scratch"); + const postFrom = performance.now(); + + await buildah.add(workingContainer.stdout, exportLocation, "/"); + const postAdd = performance.now(); + + await buildah.config(workingContainer.stdout, [ + `io.kubernetes.cri-o.annotations.checkpoint.name=${shortCode}`, + ]); + const postConfig = performance.now(); + + await buildah.commit(workingContainer.stdout, imageRef); + const postCommit = performance.now(); + + if (this.simulatePushFailure) { + if (performance.now() < this.simulatePushFailureSeconds * 1000) { + this.#logger.error("Simulating push failure", { options }); + throw new Error("SIMULATE_PUSH_FAILURE"); + } + } + + // Push checkpoint image + await buildah.push(imageRef, this.registryTlsVerify); + const postPush = performance.now(); + + const perf = { + "crictl checkpoint": postCheckpoint - start, + "buildah from": postFrom - postCheckpoint, + "buildah add": postAdd - postFrom, + "buildah config": 
postConfig - postAdd, + "buildah commit": postCommit - postConfig, + "buildah push": postPush - postCommit, + }; + + this.#logger.log("Checkpointed and pushed image to:", { location: imageRef, perf }); + + return { + success: true, + checkpoint: { + location: imageRef, + docker: false, + }, + }; + } catch (error) { + if (error instanceof Exec.Result) { + if (error.aborted) { + this.#logger.error("Checkpoint canceled: Exec", { options }); + + return { success: false, reason: "CANCELED" }; + } else { + this.#logger.error("Checkpoint command error", { options, error }); + + return { success: false, reason: "ERROR" }; + } + } + + this.#logger.error("Unhandled checkpoint error", { + options, + error: error instanceof Error ? error.message : error, + }); + + return { success: false, reason: "ERROR" }; + } finally { + await cleanup(); + + if (signal.aborted) { + this.#logger.error("Checkpoint canceled: Cleanup", { options }); + + // Overrides any prior return value + return { success: false, reason: "CANCELED" }; + } + } + } + + async unpause(runId: string, attemptNumber?: number): Promise { + try { + const containterNameWithAttempt = this.#getRunContainerName(runId, attemptNumber); + const exec = new Exec({ logger: this.#logger }); + await exec.x("docker", ["unpause", containterNameWithAttempt]); + } catch (error) { + this.#logger.error("[Docker] Error during unpause", { runId, attemptNumber, error }); + } + } + + async #createDockerCheckpoint( + abortSignal: AbortSignal, + runId: string, + exportLocation: string, + leaveRunning: boolean, + attemptNumber?: number + ) { + const containterNameWithAttempt = this.#getRunContainerName(runId, attemptNumber); + const exec = new Exec({ logger: this.#logger, abortSignal }); + + try { + if (this.opts.forceSimulate || !this.#canCheckpoint) { + this.#logger.log("Simulating checkpoint"); + + await exec.x("docker", ["pause", containterNameWithAttempt]); + + return; + } + + if (this.simulateCheckpointFailure) { + if (performance.now() 
< this.simulateCheckpointFailureSeconds * 1000) { + this.#logger.error("Simulating checkpoint failure", { + runId, + exportLocation, + leaveRunning, + attemptNumber, + }); + + throw new Error("SIMULATE_CHECKPOINT_FAILURE"); + } + } + + const args = ["checkpoint", "create"]; + + if (leaveRunning) { + args.push("--leave-running"); + } + + args.push(containterNameWithAttempt, exportLocation); + + await exec.x("docker", args); + } catch (error) { + this.#logger.error("Failed while creating docker checkpoint", { exportLocation }); + throw error; + } + } + + #failCheckpoint(runId: string, error: unknown) { + this.#failedCheckpoints.set(runId, error); + } + + #clearFailedCheckpoint(runId: string) { + this.#failedCheckpoints.delete(runId); + } + + #hasFailedCheckpoint(runId: string) { + return this.#failedCheckpoints.has(runId); + } + + #getRunContainerName(suffix: string, attemptNumber?: number) { + return `task-run-${suffix}${attemptNumber && attemptNumber > 1 ? `-att${attemptNumber}` : ""}`; + } + + #createTmpCleaner() { + if (!boolFromEnv("TMP_CLEANER_ENABLED", false)) { + return; + } + + const defaultPaths = [Buildah.tmpDir, Crictl.checkpointDir].filter(Boolean); + const pathsOverride = process.env.TMP_CLEANER_PATHS_OVERRIDE?.split(",").filter(Boolean) ?? []; + const paths = pathsOverride.length ? 
pathsOverride : defaultPaths; + + if (paths.length === 0) { + this.#logger.error("TempFileCleaner enabled but no paths to clean", { + defaultPaths, + pathsOverride, + TMP_CLEANER_PATHS_OVERRIDE: process.env.TMP_CLEANER_PATHS_OVERRIDE, + }); + + return; + } + const cleaner = new TempFileCleaner({ + paths, + maxAgeMinutes: numFromEnv("TMP_CLEANER_MAX_AGE_MINUTES", 60), + intervalSeconds: numFromEnv("TMP_CLEANER_INTERVAL_SECONDS", 300), + leadingEdge: boolFromEnv("TMP_CLEANER_LEADING_EDGE", false), + }); + + cleaner.start(); + + return cleaner; + } +} diff --git a/apps/coordinator/src/cleaner.ts b/apps/coordinator/src/cleaner.ts new file mode 100644 index 00000000000..58cfd24bb70 --- /dev/null +++ b/apps/coordinator/src/cleaner.ts @@ -0,0 +1,106 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { Exec } from "./exec"; +import { setTimeout } from "timers/promises"; + +interface TempFileCleanerOptions { + paths: string[]; + maxAgeMinutes: number; + intervalSeconds: number; + leadingEdge?: boolean; +} + +export class TempFileCleaner { + private enabled = false; + + private logger: SimpleStructuredLogger; + private exec: Exec; + + constructor(private opts: TempFileCleanerOptions) { + this.logger = new SimpleStructuredLogger("tmp-cleaner", undefined, { ...this.opts }); + this.exec = new Exec({ logger: this.logger }); + } + + async start() { + this.logger.log("TempFileCleaner.start"); + this.enabled = true; + + if (!this.opts.leadingEdge) { + await this.wait(); + } + + while (this.enabled) { + try { + await this.clean(); + } catch (error) { + this.logger.error("error during tick", { error }); + } + + await this.wait(); + } + } + + stop() { + this.logger.log("TempFileCleaner.stop"); + this.enabled = false; + } + + private wait() { + return setTimeout(this.opts.intervalSeconds * 1000); + } + + private async clean() { + for (const path of this.opts.paths) { + try { + await this.cleanSingle(path); + } catch (error) { + 
this.logger.error("error while cleaning", { path, error }); + } + } + } + + private async cleanSingle(startingPoint: string) { + const maxAgeMinutes = this.opts.maxAgeMinutes; + + const ignoreStartingPoint = ["!", "-path", startingPoint]; + const onlyDirectDescendants = ["-maxdepth", "1"]; + const onlyOldFiles = ["-mmin", `+${maxAgeMinutes}`]; + + const baseArgs = [ + startingPoint, + ...ignoreStartingPoint, + ...onlyDirectDescendants, + ...onlyOldFiles, + ]; + + const duArgs = ["-exec", "du", "-ch", "{}", "+"]; + const rmArgs = ["-exec", "rm", "-rf", "{}", "+"]; + + const du = this.x("find", [...baseArgs, ...duArgs]); + const duOutput = await du; + + const duLines = duOutput.stdout.trim().split("\n"); + const fileCount = duLines.length - 1; // last line is the total + const fileSize = duLines.at(-1)?.trim().split(/\s+/)[0]; + + if (fileCount === 0) { + this.logger.log("nothing to delete", { startingPoint, maxAgeMinutes }); + return; + } + + this.logger.log("deleting old files", { fileCount, fileSize, startingPoint, maxAgeMinutes }); + + const rm = this.x("find", [...baseArgs, ...rmArgs]); + const rmOutput = await rm; + + if (rmOutput.stderr.length > 0) { + this.logger.error("delete unsuccessful", { rmOutput }); + return; + } + + this.logger.log("deleted old files", { fileCount, fileSize, startingPoint, maxAgeMinutes }); + } + + private get x() { + return this.exec.x.bind(this.exec); + } +} diff --git a/apps/coordinator/src/exec.ts b/apps/coordinator/src/exec.ts new file mode 100644 index 00000000000..b905723c0f8 --- /dev/null +++ b/apps/coordinator/src/exec.ts @@ -0,0 +1,293 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { randomUUID } from "crypto"; +import { homedir } from "os"; +import { type Result, x } from "tinyexec"; + +class TinyResult { + pid?: number; + exitCode?: number; + aborted: boolean; + killed: boolean; + + constructor(result: Result) { + this.pid = result.pid; + this.exitCode = result.exitCode; + 
this.aborted = result.aborted; + this.killed = result.killed; + } +} + +interface ExecOptions { + logger?: SimpleStructuredLogger; + abortSignal?: AbortSignal; + logOutput?: boolean; + trimArgs?: boolean; + neverThrow?: boolean; +} + +export class Exec { + private logger: SimpleStructuredLogger; + private abortSignal: AbortSignal | undefined; + + private logOutput: boolean; + private trimArgs: boolean; + private neverThrow: boolean; + + constructor(opts: ExecOptions) { + this.logger = opts.logger ?? new SimpleStructuredLogger("exec"); + this.abortSignal = opts.abortSignal; + + this.logOutput = opts.logOutput ?? true; + this.trimArgs = opts.trimArgs ?? true; + this.neverThrow = opts.neverThrow ?? false; + } + + async x( + command: string, + args?: string[], + opts?: { neverThrow?: boolean; ignoreAbort?: boolean } + ) { + const argsTrimmed = this.trimArgs ? args?.map((arg) => arg.trim()) : args; + + const commandWithFirstArg = `${command}${argsTrimmed?.length ? ` ${argsTrimmed[0]}` : ""}`; + this.logger.debug(`exec: ${commandWithFirstArg}`, { command, args, argsTrimmed }); + + const result = x(command, argsTrimmed, { + signal: opts?.ignoreAbort ? 
undefined : this.abortSignal, + // We don't use this as it doesn't cover killed and aborted processes + // throwOnError: true, + }); + + const output = await result; + + const metadata = { + command, + argsRaw: args, + argsTrimmed, + globalOpts: { + trimArgs: this.trimArgs, + neverThrow: this.neverThrow, + hasAbortSignal: !!this.abortSignal, + }, + localOpts: opts, + stdout: output.stdout, + stderr: output.stderr, + pid: result.pid, + exitCode: result.exitCode, + aborted: result.aborted, + killed: result.killed, + }; + + if (this.logOutput) { + this.logger.debug(`output: ${commandWithFirstArg}`, metadata); + } + + if (this.neverThrow || opts?.neverThrow) { + return output; + } + + if (result.aborted) { + this.logger.error(`aborted: ${commandWithFirstArg}`, metadata); + throw new TinyResult(result); + } + + if (result.killed) { + this.logger.error(`killed: ${commandWithFirstArg}`, metadata); + throw new TinyResult(result); + } + + if (result.exitCode !== 0) { + this.logger.error(`non-zero exit: ${commandWithFirstArg}`, metadata); + throw new TinyResult(result); + } + + return output; + } + + static Result = TinyResult; +} + +interface BuildahOptions { + id?: string; + abortSignal?: AbortSignal; +} + +export class Buildah { + private id: string; + private logger: SimpleStructuredLogger; + private exec: Exec; + + private containers = new Set(); + private images = new Set(); + + constructor(opts: BuildahOptions) { + this.id = opts.id ?? 
randomUUID(); + this.logger = new SimpleStructuredLogger("buildah", undefined, { id: this.id }); + + this.exec = new Exec({ + logger: this.logger, + abortSignal: opts.abortSignal, + }); + + this.logger.log("initiaized", { opts }); + } + + private get x() { + return this.exec.x.bind(this.exec); + } + + async from(baseImage: string) { + const output = await this.x("buildah", ["from", baseImage]); + this.containers.add(output.stdout); + return output; + } + + async add(container: string, src: string, dest: string) { + return await this.x("buildah", ["add", container, src, dest]); + } + + async config(container: string, annotations: string[]) { + const args = ["config"]; + + for (const annotation of annotations) { + args.push(`--annotation=${annotation}`); + } + + args.push(container); + + return await this.x("buildah", args); + } + + async commit(container: string, imageRef: string) { + const output = await this.x("buildah", ["commit", container, imageRef]); + this.images.add(output.stdout); + return output; + } + + async push(imageRef: string, registryTlsVerify?: boolean) { + return await this.x("buildah", [ + "push", + `--tls-verify=${String(!!registryTlsVerify)}`, + imageRef, + ]); + } + + async cleanup() { + if (this.containers.size > 0) { + try { + const output = await this.x("buildah", ["rm", ...this.containers], { ignoreAbort: true }); + this.containers.clear(); + + if (output.stderr.length > 0) { + this.logger.error("failed to remove some containers", { output }); + } + } catch (error) { + this.logger.error("failed to clean up containers", { error, containers: this.containers }); + } + } else { + this.logger.debug("no containers to clean up"); + } + + if (this.images.size > 0) { + try { + const output = await this.x("buildah", ["rmi", ...this.images], { ignoreAbort: true }); + this.images.clear(); + + if (output.stderr.length > 0) { + this.logger.error("failed to remove some images", { output }); + } + } catch (error) { + this.logger.error("failed to clean up 
images", { error, images: this.images }); + } + } else { + this.logger.debug("no images to clean up"); + } + } + + static async canLogin(registryHost: string) { + try { + await x("buildah", ["login", "--get-login", registryHost], { throwOnError: true }); + return true; + } catch (error) { + return false; + } + } + + static get tmpDir() { + return process.env.TMPDIR ?? "/var/tmp"; + } + + static get storageRootDir() { + return process.getuid?.() === 0 + ? "/var/lib/containers/storage" + : `${homedir()}/.local/share/containers/storage`; + } +} + +interface CrictlOptions { + id?: string; + abortSignal?: AbortSignal; +} + +export class Crictl { + private id: string; + private logger: SimpleStructuredLogger; + private exec: Exec; + + private archives = new Set(); + + constructor(opts: CrictlOptions) { + this.id = opts.id ?? randomUUID(); + this.logger = new SimpleStructuredLogger("crictl", undefined, { id: this.id }); + + this.exec = new Exec({ + logger: this.logger, + abortSignal: opts.abortSignal, + }); + + this.logger.log("initiaized", { opts }); + } + + private get x() { + return this.exec.x.bind(this.exec); + } + + async ps(containerName: string, quiet?: boolean) { + return await this.x("crictl", ["ps", "--name", containerName, quiet ? 
"--quiet" : ""]); + } + + async checkpoint(containerId: string, exportLocation: string) { + const output = await this.x("crictl", [ + "checkpoint", + `--export=${exportLocation}`, + containerId, + ]); + this.archives.add(exportLocation); + return output; + } + + async cleanup() { + if (this.archives.size > 0) { + try { + const output = await this.x("rm", ["-v", ...this.archives], { ignoreAbort: true }); + this.archives.clear(); + + if (output.stderr.length > 0) { + this.logger.error("failed to remove some archives", { output }); + } + } catch (error) { + this.logger.error("failed to clean up archives", { error, archives: this.archives }); + } + } else { + this.logger.debug("no archives to clean up"); + } + } + + static getExportLocation(identifier: string) { + return `${this.checkpointDir}/${identifier}.tar`; + } + + static get checkpointDir() { + return process.env.CRI_CHECKPOINT_DIR ?? "/checkpoints"; + } +} diff --git a/apps/coordinator/src/index.ts b/apps/coordinator/src/index.ts new file mode 100644 index 00000000000..815012fe048 --- /dev/null +++ b/apps/coordinator/src/index.ts @@ -0,0 +1,1781 @@ +import { createServer } from "node:http"; +import { Server } from "socket.io"; +import { + CoordinatorToPlatformMessages, + CoordinatorToProdWorkerMessages, + omit, + PlatformToCoordinatorMessages, + ProdWorkerSocketData, + ProdWorkerToCoordinatorMessages, + WaitReason, +} from "@trigger.dev/core/v3"; +import { ZodNamespace } from "@trigger.dev/core/v3/zodNamespace"; +import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket"; +import { ExponentialBackoff, HttpReply, getTextBody } from "@trigger.dev/core/v3/apps"; +import { ChaosMonkey } from "./chaosMonkey"; +import { Checkpointer } from "./checkpointer"; +import { boolFromEnv, numFromEnv, safeJsonParse } from "./util"; + +import { collectDefaultMetrics, register, Gauge } from "prom-client"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +collectDefaultMetrics(); + 
+const HTTP_SERVER_PORT = Number(process.env.HTTP_SERVER_PORT || 8020); +const NODE_NAME = process.env.NODE_NAME || "coordinator"; +const DEFAULT_RETRY_DELAY_THRESHOLD_IN_MS = 30_000; + +const PLATFORM_ENABLED = ["1", "true"].includes(process.env.PLATFORM_ENABLED ?? "true"); +const PLATFORM_HOST = process.env.PLATFORM_HOST || "127.0.0.1"; +const PLATFORM_WS_PORT = process.env.PLATFORM_WS_PORT || 3030; +const PLATFORM_SECRET = process.env.PLATFORM_SECRET || "coordinator-secret"; +const SECURE_CONNECTION = ["1", "true"].includes(process.env.SECURE_CONNECTION ?? "false"); + +const TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS = + parseInt(process.env.TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS || "") || 30_000; +const TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES = + parseInt(process.env.TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES || "") || 7; + +const WAIT_FOR_TASK_CHECKPOINT_DELAY_MS = + parseInt(process.env.WAIT_FOR_TASK_CHECKPOINT_DELAY_MS || "") || 0; +const WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS = + parseInt(process.env.WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS || "") || 0; + +const logger = new SimpleStructuredLogger("coordinator", undefined, { nodeName: NODE_NAME }); +const chaosMonkey = new ChaosMonkey( + !!process.env.CHAOS_MONKEY_ENABLED, + !!process.env.CHAOS_MONKEY_DISABLE_ERRORS, + !!process.env.CHAOS_MONKEY_DISABLE_DELAYS +); + +class CheckpointReadinessTimeoutError extends Error {} +class CheckpointCancelError extends Error {} + +class TaskCoordinator { + #httpServer: ReturnType; + #internalHttpServer: ReturnType; + + #checkpointer = new Checkpointer({ + dockerMode: !process.env.KUBERNETES_PORT, + forceSimulate: boolFromEnv("FORCE_CHECKPOINT_SIMULATION", false), + heartbeat: this.#sendRunHeartbeat.bind(this), + registryHost: process.env.REGISTRY_HOST, + registryNamespace: process.env.REGISTRY_NAMESPACE, + registryTlsVerify: boolFromEnv("REGISTRY_TLS_VERIFY", true), + disableCheckpointSupport: boolFromEnv("DISABLE_CHECKPOINT_SUPPORT", false), + simulatePushFailure: 
boolFromEnv("SIMULATE_PUSH_FAILURE", false), + simulatePushFailureSeconds: numFromEnv("SIMULATE_PUSH_FAILURE_SECONDS", 300), + simulateCheckpointFailure: boolFromEnv("SIMULATE_CHECKPOINT_FAILURE", false), + simulateCheckpointFailureSeconds: numFromEnv("SIMULATE_CHECKPOINT_FAILURE_SECONDS", 300), + chaosMonkey, + }); + + #prodWorkerNamespace?: ZodNamespace< + typeof ProdWorkerToCoordinatorMessages, + typeof CoordinatorToProdWorkerMessages, + typeof ProdWorkerSocketData + >; + #platformSocket?: ZodSocketConnection< + typeof CoordinatorToPlatformMessages, + typeof PlatformToCoordinatorMessages + >; + + #checkpointableTasks = new Map< + string, + { resolve: (value: void) => void; reject: (err?: any) => void } + >(); + + #delayThresholdInMs: number = DEFAULT_RETRY_DELAY_THRESHOLD_IN_MS; + + constructor( + private port: number, + private host = "0.0.0.0" + ) { + this.#httpServer = this.#createHttpServer(); + this.#internalHttpServer = this.#createInternalHttpServer(); + + this.#checkpointer.init(); + this.#platformSocket = this.#createPlatformSocket(); + + const connectedTasksTotal = new Gauge({ + name: "daemon_connected_tasks_total", // don't change this without updating dashboard config + help: "The number of tasks currently connected.", + collect: () => { + connectedTasksTotal.set(this.#prodWorkerNamespace?.namespace.sockets.size ?? 
0); + }, + }); + register.registerMetric(connectedTasksTotal); + } + + #returnValidatedExtraHeaders(headers: Record) { + for (const [key, value] of Object.entries(headers)) { + if (value === undefined) { + throw new Error(`Extra header is undefined: ${key}`); + } + } + + return headers; + } + + // MARK: SOCKET: PLATFORM + #createPlatformSocket() { + if (!PLATFORM_ENABLED) { + logger.log("INFO: platform connection disabled"); + return; + } + + const extraHeaders = this.#returnValidatedExtraHeaders({ + "x-supports-dynamic-config": "yes", + }); + + const host = PLATFORM_HOST; + const port = Number(PLATFORM_WS_PORT); + + const platformLogger = new SimpleStructuredLogger("socket-platform", undefined, { + namespace: "coordinator", + }); + + platformLogger.log("connecting", { host, port }); + platformLogger.debug("connecting with extra headers", { extraHeaders }); + + const platformConnection = new ZodSocketConnection({ + namespace: "coordinator", + host, + port, + secure: SECURE_CONNECTION, + extraHeaders, + clientMessages: CoordinatorToPlatformMessages, + serverMessages: PlatformToCoordinatorMessages, + authToken: PLATFORM_SECRET, + logHandlerPayloads: false, + handlers: { + // This is used by resumeAttempt + RESUME_AFTER_DEPENDENCY: async (message) => { + const log = platformLogger.child({ + eventName: "RESUME_AFTER_DEPENDENCY", + ...omit(message, "completions", "executions"), + completions: message.completions.map((c) => ({ + id: c.id, + ok: c.ok, + })), + executions: message.executions.length, + }); + + log.log("Handling RESUME_AFTER_DEPENDENCY"); + + const taskSocket = await this.#getAttemptSocket(message.attemptFriendlyId); + + if (!taskSocket) { + log.debug("Socket for attempt not found"); + return; + } + + log.addFields({ socketId: taskSocket.id, socketData: taskSocket.data }); + log.log("Found task socket for RESUME_AFTER_DEPENDENCY"); + + await chaosMonkey.call(); + + // In case the task resumes before the checkpoint is created + 
this.#cancelCheckpoint(message.runId, { + event: "RESUME_AFTER_DEPENDENCY", + completions: message.completions.length, + }); + + taskSocket.emit("RESUME_AFTER_DEPENDENCY", message); + }, + // This is used by sharedQueueConsumer + RESUME_AFTER_DEPENDENCY_WITH_ACK: async (message) => { + const log = platformLogger.child({ + eventName: "RESUME_AFTER_DEPENDENCY_WITH_ACK", + ...omit(message, "completions", "executions"), + completions: message.completions.map((c) => ({ + id: c.id, + ok: c.ok, + })), + executions: message.executions.length, + }); + + log.log("Handling RESUME_AFTER_DEPENDENCY_WITH_ACK"); + + const taskSocket = await this.#getAttemptSocket(message.attemptFriendlyId); + + if (!taskSocket) { + log.debug("Socket for attempt not found"); + return { + success: false, + error: { + name: "SocketNotFoundError", + message: "Socket for attempt not found", + }, + }; + } + + log.addFields({ socketId: taskSocket.id, socketData: taskSocket.data }); + log.log("Found task socket for RESUME_AFTER_DEPENDENCY_WITH_ACK"); + + //if this is set, we want to kill the process because it will be resumed with the checkpoint from the queue + if (taskSocket.data.requiresCheckpointResumeWithMessage) { + log.log("RESUME_AFTER_DEPENDENCY_WITH_ACK: Checkpoint is set so going to nack"); + + return { + success: false, + error: { + name: "CheckpointMessagePresentError", + message: + "Checkpoint message is present, so we need to kill the process and resume from the queue.", + }, + }; + } + + await chaosMonkey.call(); + + // In case the task resumes before the checkpoint is created + this.#cancelCheckpoint(message.runId, { + event: "RESUME_AFTER_DEPENDENCY_WITH_ACK", + completions: message.completions.length, + }); + + taskSocket.emit("RESUME_AFTER_DEPENDENCY", message); + + return { + success: true, + }; + }, + RESUME_AFTER_DURATION: async (message) => { + const log = platformLogger.child({ + eventName: "RESUME_AFTER_DURATION", + ...message, + }); + + log.log("Handling 
RESUME_AFTER_DURATION"); + + const taskSocket = await this.#getAttemptSocket(message.attemptFriendlyId); + + if (!taskSocket) { + log.debug("Socket for attempt not found"); + return; + } + + log.addFields({ socketId: taskSocket.id, socketData: taskSocket.data }); + log.log("Found task socket for RESUME_AFTER_DURATION"); + + await chaosMonkey.call(); + + taskSocket.emit("RESUME_AFTER_DURATION", message); + }, + REQUEST_ATTEMPT_CANCELLATION: async (message) => { + const log = platformLogger.child({ + eventName: "REQUEST_ATTEMPT_CANCELLATION", + ...message, + }); + + log.log("Handling REQUEST_ATTEMPT_CANCELLATION"); + + const taskSocket = await this.#getAttemptSocket(message.attemptFriendlyId); + + if (!taskSocket) { + logger.debug("Socket for attempt not found"); + return; + } + + log.addFields({ socketId: taskSocket.id, socketData: taskSocket.data }); + log.log("Found task socket for REQUEST_ATTEMPT_CANCELLATION"); + + taskSocket.emit("REQUEST_ATTEMPT_CANCELLATION", message); + }, + REQUEST_RUN_CANCELLATION: async (message) => { + const log = platformLogger.child({ + eventName: "REQUEST_RUN_CANCELLATION", + ...message, + }); + + log.log("Handling REQUEST_RUN_CANCELLATION"); + + const taskSocket = await this.#getRunSocket(message.runId); + + if (!taskSocket) { + logger.debug("Socket for run not found"); + return; + } + + log.addFields({ socketId: taskSocket.id, socketData: taskSocket.data }); + log.log("Found task socket for REQUEST_RUN_CANCELLATION"); + + this.#cancelCheckpoint(message.runId, { event: "REQUEST_RUN_CANCELLATION", ...message }); + + if (message.delayInMs) { + taskSocket.emit("REQUEST_EXIT", { + version: "v2", + delayInMs: message.delayInMs, + }); + } else { + // If there's no delay, assume the worker doesn't support non-v1 messages + taskSocket.emit("REQUEST_EXIT", { + version: "v1", + }); + } + }, + READY_FOR_RETRY: async (message) => { + const log = platformLogger.child({ + eventName: "READY_FOR_RETRY", + ...message, + }); + + const taskSocket = 
await this.#getRunSocket(message.runId); + + if (!taskSocket) { + logger.debug("Socket for attempt not found"); + return; + } + + log.addFields({ socketId: taskSocket.id, socketData: taskSocket.data }); + log.log("Found task socket for READY_FOR_RETRY"); + + await chaosMonkey.call(); + + taskSocket.emit("READY_FOR_RETRY", message); + }, + DYNAMIC_CONFIG: async (message) => { + const log = platformLogger.child({ + eventName: "DYNAMIC_CONFIG", + ...message, + }); + + log.log("Handling DYNAMIC_CONFIG"); + + this.#delayThresholdInMs = message.checkpointThresholdInMs; + + // The first time we receive a dynamic config, the worker namespace will be created + if (!this.#prodWorkerNamespace) { + const io = new Server(this.#httpServer); + this.#prodWorkerNamespace = this.#createProdWorkerNamespace(io); + } + }, + }, + }); + + return platformConnection; + } + + async #getRunSocket(runId: string) { + const sockets = (await this.#prodWorkerNamespace?.fetchSockets()) ?? []; + + for (const socket of sockets) { + if (socket.data.runId === runId) { + return socket; + } + } + } + + async #getAttemptSocket(attemptFriendlyId: string) { + const sockets = (await this.#prodWorkerNamespace?.fetchSockets()) ?? []; + + for (const socket of sockets) { + if (socket.data.attemptFriendlyId === attemptFriendlyId) { + return socket; + } + } + } + + // MARK: SOCKET: WORKERS + #createProdWorkerNamespace(io: Server) { + const provider = new ZodNamespace({ + io, + name: "prod-worker", + clientMessages: ProdWorkerToCoordinatorMessages, + serverMessages: CoordinatorToProdWorkerMessages, + socketData: ProdWorkerSocketData, + postAuth: async (socket, next, logger) => { + function setSocketDataFromHeader( + dataKey: keyof typeof socket.data, + headerName: string, + required: boolean = true + ) { + const value = socket.handshake.headers[headerName]; + + if (value) { + socket.data[dataKey] = Array.isArray(value) ? 
value[0] : value; + return; + } + + if (required) { + logger.error("missing required header", { headerName }); + throw new Error("missing header"); + } + } + + try { + setSocketDataFromHeader("podName", "x-pod-name"); + setSocketDataFromHeader("contentHash", "x-trigger-content-hash"); + setSocketDataFromHeader("projectRef", "x-trigger-project-ref"); + setSocketDataFromHeader("runId", "x-trigger-run-id"); + setSocketDataFromHeader("attemptFriendlyId", "x-trigger-attempt-friendly-id", false); + setSocketDataFromHeader("attemptNumber", "x-trigger-attempt-number", false); + setSocketDataFromHeader("envId", "x-trigger-env-id"); + setSocketDataFromHeader("deploymentId", "x-trigger-deployment-id"); + setSocketDataFromHeader("deploymentVersion", "x-trigger-deployment-version"); + } catch (error) { + logger.error("setSocketDataFromHeader error", { error }); + socket.disconnect(true); + return; + } + + logger.debug("success", socket.data); + + next(); + }, + onConnection: async (socket, handler, sender) => { + const logger = new SimpleStructuredLogger("ns-prod-worker", undefined, { + namespace: "prod-worker", + socketId: socket.id, + socketData: socket.data, + }); + + const getSocketMetadata = () => { + return { + attemptFriendlyId: socket.data.attemptFriendlyId, + attemptNumber: socket.data.attemptNumber, + requiresCheckpointResumeWithMessage: socket.data.requiresCheckpointResumeWithMessage, + }; + }; + + const getAttemptNumber = () => { + return socket.data.attemptNumber ? 
parseInt(socket.data.attemptNumber) : undefined; + }; + + const exitRun = () => { + logger.log("exitRun", getSocketMetadata()); + + socket.emit("REQUEST_EXIT", { + version: "v1", + }); + }; + + const crashRun = async (error: { name: string; message: string; stack?: string }) => { + logger.error("crashRun", { ...getSocketMetadata(), error }); + + try { + this.#platformSocket?.send("RUN_CRASHED", { + version: "v1", + runId: socket.data.runId, + error, + }); + } finally { + exitRun(); + } + }; + + const checkpointInProgress = () => { + return this.#checkpointableTasks.has(socket.data.runId); + }; + + const readyToCheckpoint = async ( + reason: WaitReason | "RETRY" + ): Promise< + | { + success: true; + } + | { + success: false; + reason?: string; + } + > => { + const log = logger.child(getSocketMetadata()); + + log.log("readyToCheckpoint", { runId: socket.data.runId, reason }); + + if (checkpointInProgress()) { + return { + success: false, + reason: "checkpoint in progress", + }; + } + + let timeout: NodeJS.Timeout | undefined = undefined; + + const CHECKPOINTABLE_TIMEOUT_SECONDS = 20; + + const isCheckpointable = new Promise((resolve, reject) => { + // We set a reasonable timeout to prevent waiting forever + timeout = setTimeout( + () => reject(new CheckpointReadinessTimeoutError()), + CHECKPOINTABLE_TIMEOUT_SECONDS * 1000 + ); + + this.#checkpointableTasks.set(socket.data.runId, { resolve, reject }); + }); + + try { + await isCheckpointable; + this.#checkpointableTasks.delete(socket.data.runId); + + return { + success: true, + }; + } catch (error) { + log.error("Error while waiting for checkpointable state", { error }); + + if (error instanceof CheckpointReadinessTimeoutError) { + logger.error( + `Failed to become checkpointable in ${CHECKPOINTABLE_TIMEOUT_SECONDS}s for ${reason}`, + { runId: socket.data.runId } + ); + + return { + success: false, + reason: "timeout", + }; + } + + if (error instanceof CheckpointCancelError) { + return { + success: false, + reason: 
"canceled", + }; + } + + return { + success: false, + reason: typeof error === "string" ? error : "unknown", + }; + } finally { + clearTimeout(timeout); + } + }; + + const updateAttemptFriendlyId = (attemptFriendlyId: string) => { + socket.data.attemptFriendlyId = attemptFriendlyId; + }; + + const updateAttemptNumber = (attemptNumber: string | number) => { + socket.data.attemptNumber = String(attemptNumber); + }; + + this.#platformSocket?.send("LOG", { + metadata: socket.data, + text: "connected", + }); + + socket.on("TEST", (message, callback) => { + logger.log("Handling TEST", { eventName: "TEST", ...getSocketMetadata(), ...message }); + + try { + callback(); + } catch (error) { + logger.error("TEST error", { error }); + } + }); + + // Deprecated: Only workers without support for lazy attempts use this + socket.on("READY_FOR_EXECUTION", async (message) => { + const log = logger.child({ + eventName: "READY_FOR_EXECUTION", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling READY_FOR_EXECUTION"); + + try { + const executionAck = await this.#platformSocket?.sendWithAck( + "READY_FOR_EXECUTION", + message + ); + + if (!executionAck) { + log.error("no execution ack"); + + await crashRun({ + name: "ReadyForExecutionError", + message: "No execution ack", + }); + + return; + } + + if (!executionAck.success) { + log.error("failed to get execution payload"); + + await crashRun({ + name: "ReadyForExecutionError", + message: "Failed to get execution payload", + }); + + return; + } + + socket.emit("EXECUTE_TASK_RUN", { + version: "v1", + executionPayload: executionAck.payload, + }); + + updateAttemptFriendlyId(executionAck.payload.execution.attempt.id); + updateAttemptNumber(executionAck.payload.execution.attempt.number); + } catch (error) { + log.error("READY_FOR_EXECUTION error", { error }); + + await crashRun({ + name: "ReadyForExecutionError", + message: + error instanceof Error ? 
`Unexpected error: ${error.message}` : "Unexpected error", + }); + + return; + } + }); + + // MARK: LAZY ATTEMPT + socket.on("READY_FOR_LAZY_ATTEMPT", async (message) => { + const log = logger.child({ + eventName: "READY_FOR_LAZY_ATTEMPT", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling READY_FOR_LAZY_ATTEMPT"); + + try { + const lazyAttempt = await this.#platformSocket?.sendWithAck("READY_FOR_LAZY_ATTEMPT", { + ...message, + envId: socket.data.envId, + }); + + if (!lazyAttempt) { + log.error("no lazy attempt ack"); + + await crashRun({ + name: "ReadyForLazyAttemptError", + message: "No lazy attempt ack", + }); + + return; + } + + if (!lazyAttempt.success) { + log.error("failed to get lazy attempt payload", { reason: lazyAttempt.reason }); + + await crashRun({ + name: "ReadyForLazyAttemptError", + message: "Failed to get lazy attempt payload", + }); + + return; + } + + await chaosMonkey.call(); + + const lazyPayload = { + ...lazyAttempt.lazyPayload, + metrics: [ + ...(message.startTime + ? [ + { + name: "start", + event: "lazy_payload", + timestamp: message.startTime, + duration: Date.now() - message.startTime, + }, + ] + : []), + ], + }; + + socket.emit("EXECUTE_TASK_RUN_LAZY_ATTEMPT", { + version: "v1", + lazyPayload, + }); + } catch (error) { + if (error instanceof ChaosMonkey.Error) { + log.error("ChaosMonkey error, won't crash run"); + return; + } + + log.error("READY_FOR_LAZY_ATTEMPT error", { error }); + + // await crashRun({ + // name: "ReadyForLazyAttemptError", + // message: + // error instanceof Error ? 
`Unexpected error: ${error.message}` : "Unexpected error", + // }); + + return; + } + }); + + // MARK: RESUME READY + socket.on("READY_FOR_RESUME", async (message) => { + const log = logger.child({ + eventName: "READY_FOR_RESUME", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling READY_FOR_RESUME"); + + try { + updateAttemptFriendlyId(message.attemptFriendlyId); + + if (message.version === "v2") { + updateAttemptNumber(message.attemptNumber); + } + + this.#platformSocket?.send("READY_FOR_RESUME", { ...message, version: "v1" }); + } catch (error) { + log.error("READY_FOR_RESUME error", { error }); + + await crashRun({ + name: "ReadyForResumeError", + message: + error instanceof Error ? `Unexpected error: ${error.message}` : "Unexpected error", + }); + + return; + } + }); + + // MARK: RUN COMPLETED + socket.on("TASK_RUN_COMPLETED", async (message, callback) => { + const log = logger.child({ + eventName: "TASK_RUN_COMPLETED", + ...getSocketMetadata(), + ...omit(message, "completion", "execution"), + completion: { + id: message.completion.id, + ok: message.completion.ok, + }, + }); + + log.log("Handling TASK_RUN_COMPLETED"); + + try { + const { completion, execution } = message; + + // Cancel all in-progress checkpoints (if any) + this.#cancelCheckpoint(socket.data.runId, { + event: "TASK_RUN_COMPLETED", + attemptNumber: execution.attempt.number, + }); + + await chaosMonkey.call({ throwErrors: false }); + + const sendCompletionWithAck = async (): Promise => { + try { + const response = await this.#platformSocket?.sendWithAck( + "TASK_RUN_COMPLETED_WITH_ACK", + { + version: "v2", + execution, + completion, + }, + TASK_RUN_COMPLETED_WITH_ACK_TIMEOUT_MS + ); + + if (!response) { + log.error("TASK_RUN_COMPLETED_WITH_ACK: no response"); + return false; + } + + if (!response.success) { + log.error("TASK_RUN_COMPLETED_WITH_ACK: error response", { + error: response.error, + }); + return false; + } + + log.log("TASK_RUN_COMPLETED_WITH_ACK: successful 
response"); + return true; + } catch (error) { + log.error("TASK_RUN_COMPLETED_WITH_ACK: threw error", { error }); + return false; + } + }; + + const completeWithoutCheckpoint = async (shouldExit: boolean) => { + const supportsRetryCheckpoints = message.version === "v1"; + + callback({ willCheckpointAndRestore: false, shouldExit }); + + if (supportsRetryCheckpoints) { + // This is only here for backwards compat + this.#platformSocket?.send("TASK_RUN_COMPLETED", { + version: "v1", + execution, + completion, + }); + } else { + // 99.99% of runs should end up here + + const completedWithAckBackoff = new ExponentialBackoff("FullJitter").maxRetries( + TASK_RUN_COMPLETED_WITH_ACK_MAX_RETRIES + ); + + const result = await completedWithAckBackoff.execute( + async ({ retry, delay, elapsedMs }) => { + logger.log("TASK_RUN_COMPLETED_WITH_ACK: sending with backoff", { + retry, + delay, + elapsedMs, + }); + + const success = await sendCompletionWithAck(); + + if (!success) { + throw new Error("Failed to send completion with ack"); + } + } + ); + + if (!result.success) { + logger.error("TASK_RUN_COMPLETED_WITH_ACK: failed to send with backoff", result); + return; + } + + logger.log("TASK_RUN_COMPLETED_WITH_ACK: sent with backoff", result); + } + }; + + if (completion.ok) { + await completeWithoutCheckpoint(true); + return; + } + + if ( + completion.error.type === "INTERNAL_ERROR" && + completion.error.code === "TASK_RUN_CANCELLED" + ) { + await completeWithoutCheckpoint(true); + return; + } + + if (completion.retry === undefined) { + await completeWithoutCheckpoint(true); + return; + } + + if (completion.retry.delay < this.#delayThresholdInMs) { + await completeWithoutCheckpoint(false); + + // Prevents runs that fail fast from never sending a heartbeat + this.#sendRunHeartbeat(socket.data.runId); + + return; + } + + if (message.version === "v2") { + await completeWithoutCheckpoint(true); + return; + } + + const { canCheckpoint, willSimulate } = await this.#checkpointer.init(); + 
+ const willCheckpointAndRestore = canCheckpoint || willSimulate; + + if (!willCheckpointAndRestore) { + await completeWithoutCheckpoint(false); + return; + } + + // The worker will then put itself in a checkpointable state + callback({ willCheckpointAndRestore: true, shouldExit: false }); + + const ready = await readyToCheckpoint("RETRY"); + + if (!ready.success) { + log.error("Failed to become checkpointable", { reason: ready.reason }); + + return; + } + + const checkpoint = await this.#checkpointer.checkpointAndPush({ + runId: socket.data.runId, + projectRef: socket.data.projectRef, + deploymentVersion: socket.data.deploymentVersion, + shouldHeartbeat: true, + }); + + if (!checkpoint) { + log.error("Failed to checkpoint"); + await completeWithoutCheckpoint(false); + return; + } + + log.addFields({ checkpoint }); + + this.#platformSocket?.send("TASK_RUN_COMPLETED", { + version: "v1", + execution, + completion, + checkpoint, + }); + + if (!checkpoint.docker || !willSimulate) { + exitRun(); + } + } catch (error) { + log.error("TASK_RUN_COMPLETED error", { error }); + + await crashRun({ + name: "TaskRunCompletedError", + message: + error instanceof Error ? 
`Unexpected error: ${error.message}` : "Unexpected error", + }); + + return; + } + }); + + // MARK: TASK FAILED + socket.on("TASK_RUN_FAILED_TO_RUN", async ({ completion }) => { + const log = logger.child({ + eventName: "TASK_RUN_FAILED_TO_RUN", + ...getSocketMetadata(), + completion: { + id: completion.id, + ok: completion.ok, + }, + }); + + log.log("Handling TASK_RUN_FAILED_TO_RUN"); + + try { + // Cancel all in-progress checkpoints (if any) + this.#cancelCheckpoint(socket.data.runId, { + event: "TASK_RUN_FAILED_TO_RUN", + errorType: completion.error.type, + }); + + this.#platformSocket?.send("TASK_RUN_FAILED_TO_RUN", { + version: "v1", + completion, + }); + + exitRun(); + } catch (error) { + log.error("TASK_RUN_FAILED_TO_RUN error", { error }); + + return; + } + }); + + // MARK: CHECKPOINT + socket.on("READY_FOR_CHECKPOINT", async (message) => { + const log = logger.child({ + eventName: "READY_FOR_CHECKPOINT", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling READY_FOR_CHECKPOINT"); + + try { + const checkpointable = this.#checkpointableTasks.get(socket.data.runId); + + if (!checkpointable) { + log.error("No checkpoint scheduled"); + return; + } + + checkpointable.resolve(); + } catch (error) { + log.error("READY_FOR_CHECKPOINT error", { error }); + + return; + } + }); + + // MARK: CXX CHECKPOINT + socket.on("CANCEL_CHECKPOINT", async (message, callback) => { + const log = logger.child({ + eventName: "CANCEL_CHECKPOINT", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling CANCEL_CHECKPOINT"); + + try { + if (message.version === "v1") { + this.#cancelCheckpoint(socket.data.runId, { event: "CANCEL_CHECKPOINT", ...message }); + // v1 has no callback + return; + } + + const checkpointCanceled = this.#cancelCheckpoint(socket.data.runId, { + event: "CANCEL_CHECKPOINT", + ...message, + }); + + callback({ version: "v2", checkpointCanceled }); + } catch (error) { + log.error("CANCEL_CHECKPOINT error", { error }); + } + }); + + // MARK: 
DURATION WAIT + socket.on("WAIT_FOR_DURATION", async (message, callback) => { + const log = logger.child({ + eventName: "WAIT_FOR_DURATION", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling WAIT_FOR_DURATION"); + + try { + await chaosMonkey.call({ throwErrors: false }); + + if (checkpointInProgress()) { + log.error("Checkpoint already in progress"); + callback({ willCheckpointAndRestore: false }); + return; + } + + const { canCheckpoint, willSimulate } = await this.#checkpointer.init(); + + const willCheckpointAndRestore = canCheckpoint || willSimulate; + + callback({ willCheckpointAndRestore }); + + if (!willCheckpointAndRestore) { + return; + } + + const ready = await readyToCheckpoint("WAIT_FOR_DURATION"); + + if (!ready.success) { + log.error("Failed to become checkpointable", { reason: ready.reason }); + return; + } + + const runId = socket.data.runId; + const attemptNumber = getAttemptNumber(); + + const checkpoint = await this.#checkpointer.checkpointAndPush({ + runId, + projectRef: socket.data.projectRef, + deploymentVersion: socket.data.deploymentVersion, + attemptNumber, + }); + + if (!checkpoint) { + // The task container will keep running until the wait duration has elapsed + log.error("Failed to checkpoint"); + return; + } + + log.addFields({ checkpoint }); + + const ack = await this.#platformSocket?.sendWithAck("CHECKPOINT_CREATED", { + version: "v1", + runId: socket.data.runId, + attemptFriendlyId: message.attemptFriendlyId, + docker: checkpoint.docker, + location: checkpoint.location, + reason: { + type: "WAIT_FOR_DURATION", + ms: message.ms, + now: message.now, + }, + }); + + if (ack?.keepRunAlive) { + log.log("keeping run alive after duration checkpoint"); + + if (checkpoint.docker && willSimulate) { + // The container is still paused so we need to unpause it + log.log("unpausing container after duration checkpoint"); + this.#checkpointer.unpause(runId, attemptNumber); + } + + return; + } + + if (!checkpoint.docker || 
!willSimulate) { + exitRun(); + } + } catch (error) { + log.error("WAIT_FOR_DURATION error", { error }); + + await crashRun({ + name: "WaitForDurationError", + message: + error instanceof Error ? `Unexpected error: ${error.message}` : "Unexpected error", + }); + + return; + } + }); + + // MARK: TASK WAIT + socket.on("WAIT_FOR_TASK", async (message, callback) => { + const log = logger.child({ + eventName: "WAIT_FOR_TASK", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling WAIT_FOR_TASK"); + + try { + await chaosMonkey.call({ throwErrors: false }); + + if (checkpointInProgress()) { + log.error("Checkpoint already in progress"); + callback({ willCheckpointAndRestore: false }); + return; + } + + const { canCheckpoint, willSimulate } = await this.#checkpointer.init(); + + const willCheckpointAndRestore = canCheckpoint || willSimulate; + + callback({ willCheckpointAndRestore }); + + if (!willCheckpointAndRestore) { + return; + } + + // Workers with v1 schemas don't signal when they're ready to checkpoint for dependency waits + if (message.version === "v2") { + const ready = await readyToCheckpoint("WAIT_FOR_TASK"); + + if (!ready.success) { + log.error("Failed to become checkpointable", { reason: ready.reason }); + return; + } + } + + const runId = socket.data.runId; + const attemptNumber = getAttemptNumber(); + + const checkpoint = await this.#checkpointer.checkpointAndPush( + { + runId, + projectRef: socket.data.projectRef, + deploymentVersion: socket.data.deploymentVersion, + attemptNumber, + }, + WAIT_FOR_TASK_CHECKPOINT_DELAY_MS + ); + + if (!checkpoint) { + log.error("Failed to checkpoint"); + return; + } + + log.addFields({ checkpoint }); + + log.log("WAIT_FOR_TASK checkpoint created"); + + //setting this means we can only resume from a checkpoint + socket.data.requiresCheckpointResumeWithMessage = `location:${checkpoint.location}-docker:${checkpoint.docker}`; + log.log("WAIT_FOR_TASK set requiresCheckpointResumeWithMessage"); + + const ack = 
await this.#platformSocket?.sendWithAck("CHECKPOINT_CREATED", { + version: "v1", + runId: socket.data.runId, + attemptFriendlyId: message.attemptFriendlyId, + docker: checkpoint.docker, + location: checkpoint.location, + reason: { + type: "WAIT_FOR_TASK", + friendlyId: message.friendlyId, + }, + }); + + if (ack?.keepRunAlive) { + socket.data.requiresCheckpointResumeWithMessage = undefined; + log.log("keeping run alive after task checkpoint"); + + if (checkpoint.docker && willSimulate) { + // The container is still paused so we need to unpause it + log.log("unpausing container after duration checkpoint"); + this.#checkpointer.unpause(runId, attemptNumber); + } + + return; + } + + if (!checkpoint.docker || !willSimulate) { + exitRun(); + } + } catch (error) { + log.error("WAIT_FOR_TASK error", { error }); + + await crashRun({ + name: "WaitForTaskError", + message: + error instanceof Error ? `Unexpected error: ${error.message}` : "Unexpected error", + }); + + return; + } + }); + + // MARK: BATCH WAIT + socket.on("WAIT_FOR_BATCH", async (message, callback) => { + const log = logger.child({ + eventName: "WAIT_FOR_BATCH", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling WAIT_FOR_BATCH", message); + + try { + await chaosMonkey.call({ throwErrors: false }); + + if (checkpointInProgress()) { + log.error("Checkpoint already in progress"); + callback({ willCheckpointAndRestore: false }); + return; + } + + const { canCheckpoint, willSimulate } = await this.#checkpointer.init(); + + const willCheckpointAndRestore = canCheckpoint || willSimulate; + + callback({ willCheckpointAndRestore }); + + if (!willCheckpointAndRestore) { + return; + } + + // Workers with v1 schemas don't signal when they're ready to checkpoint for dependency waits + if (message.version === "v2") { + const ready = await readyToCheckpoint("WAIT_FOR_BATCH"); + + if (!ready.success) { + log.error("Failed to become checkpointable", { reason: ready.reason }); + return; + } + } + + const runId = 
socket.data.runId; + const attemptNumber = getAttemptNumber(); + + const checkpoint = await this.#checkpointer.checkpointAndPush( + { + runId, + projectRef: socket.data.projectRef, + deploymentVersion: socket.data.deploymentVersion, + attemptNumber, + }, + WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS + ); + + if (!checkpoint) { + log.error("Failed to checkpoint"); + return; + } + + log.addFields({ checkpoint }); + + log.log("WAIT_FOR_BATCH checkpoint created"); + + //setting this means we can only resume from a checkpoint + socket.data.requiresCheckpointResumeWithMessage = `location:${checkpoint.location}-docker:${checkpoint.docker}`; + log.log("WAIT_FOR_BATCH set checkpoint"); + + const ack = await this.#platformSocket?.sendWithAck("CHECKPOINT_CREATED", { + version: "v1", + runId: socket.data.runId, + attemptFriendlyId: message.attemptFriendlyId, + docker: checkpoint.docker, + location: checkpoint.location, + reason: { + type: "WAIT_FOR_BATCH", + batchFriendlyId: message.batchFriendlyId, + runFriendlyIds: message.runFriendlyIds, + }, + }); + + if (ack?.keepRunAlive) { + socket.data.requiresCheckpointResumeWithMessage = undefined; + log.log("keeping run alive after batch checkpoint"); + + if (checkpoint.docker && willSimulate) { + // The container is still paused so we need to unpause it + log.log("unpausing container after batch checkpoint"); + this.#checkpointer.unpause(runId, attemptNumber); + } + + return; + } + + if (!checkpoint.docker || !willSimulate) { + exitRun(); + } + } catch (error) { + log.error("WAIT_FOR_BATCH error", { error }); + + await crashRun({ + name: "WaitForBatchError", + message: + error instanceof Error ? 
`Unexpected error: ${error.message}` : "Unexpected error", + }); + + return; + } + }); + + // MARK: INDEX + socket.on("INDEX_TASKS", async (message, callback) => { + const log = logger.child({ + eventName: "INDEX_TASKS", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling INDEX_TASKS"); + + try { + const workerAck = await this.#platformSocket?.sendWithAck("CREATE_WORKER", { + version: "v2", + projectRef: socket.data.projectRef, + envId: socket.data.envId, + deploymentId: message.deploymentId, + metadata: { + contentHash: socket.data.contentHash, + packageVersion: message.packageVersion, + tasks: message.tasks, + }, + supportsLazyAttempts: message.version !== "v1" && message.supportsLazyAttempts, + }); + + if (!workerAck) { + log.debug("no worker ack while indexing"); + } + + callback({ success: !!workerAck?.success }); + } catch (error) { + log.error("INDEX_TASKS error", { error }); + callback({ success: false }); + } + }); + + // MARK: INDEX FAILED + socket.on("INDEXING_FAILED", async (message) => { + const log = logger.child({ + eventName: "INDEXING_FAILED", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling INDEXING_FAILED"); + + try { + this.#platformSocket?.send("INDEXING_FAILED", { + version: "v1", + deploymentId: message.deploymentId, + error: message.error, + }); + } catch (error) { + log.error("INDEXING_FAILED error", { error }); + } + }); + + // MARK: CREATE ATTEMPT + socket.on("CREATE_TASK_RUN_ATTEMPT", async (message, callback) => { + const log = logger.child({ + eventName: "CREATE_TASK_RUN_ATTEMPT", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling CREATE_TASK_RUN_ATTEMPT"); + + try { + await chaosMonkey.call({ throwErrors: false }); + + const createAttempt = await this.#platformSocket?.sendWithAck( + "CREATE_TASK_RUN_ATTEMPT", + { + runId: message.runId, + envId: socket.data.envId, + } + ); + + if (!createAttempt?.success) { + log.debug("no ack while creating attempt", { reason: createAttempt?.reason 
}); + callback({ success: false, reason: createAttempt?.reason }); + return; + } + + updateAttemptFriendlyId(createAttempt.executionPayload.execution.attempt.id); + updateAttemptNumber(createAttempt.executionPayload.execution.attempt.number); + + callback({ + success: true, + executionPayload: createAttempt.executionPayload, + }); + } catch (error) { + log.error("CREATE_TASK_RUN_ATTEMPT error", { error }); + callback({ + success: false, + reason: + error instanceof Error ? `Unexpected error: ${error.message}` : "Unexpected error", + }); + } + }); + + socket.on("UNRECOVERABLE_ERROR", async (message) => { + const log = logger.child({ + eventName: "UNRECOVERABLE_ERROR", + ...getSocketMetadata(), + error: message.error, + }); + + log.log("Handling UNRECOVERABLE_ERROR"); + + try { + await crashRun(message.error); + } catch (error) { + log.error("UNRECOVERABLE_ERROR error", { error }); + } + }); + + socket.on("SET_STATE", async (message) => { + const log = logger.child({ + eventName: "SET_STATE", + ...getSocketMetadata(), + ...message, + }); + + log.log("Handling SET_STATE"); + + try { + if (message.attemptFriendlyId) { + updateAttemptFriendlyId(message.attemptFriendlyId); + } + + if (message.attemptNumber) { + updateAttemptNumber(message.attemptNumber); + } + } catch (error) { + log.error("SET_STATE error", { error }); + } + }); + }, + onDisconnect: async (socket, handler, sender, logger) => { + try { + this.#platformSocket?.send("LOG", { + metadata: socket.data, + text: "disconnect", + }); + } catch (error) { + logger.error("onDisconnect error", { error }); + } + }, + handlers: { + TASK_HEARTBEAT: async (message) => { + this.#platformSocket?.send("TASK_HEARTBEAT", message); + }, + TASK_RUN_HEARTBEAT: async (message) => { + this.#sendRunHeartbeat(message.runId); + }, + }, + }); + + return provider; + } + + #sendRunHeartbeat(runId: string) { + this.#platformSocket?.send("TASK_RUN_HEARTBEAT", { + version: "v1", + runId, + }); + } + + #cancelCheckpoint(runId: string, 
reason?: any): boolean { + logger.log("cancelCheckpoint: call", { runId, reason }); + + const checkpointWait = this.#checkpointableTasks.get(runId); + + if (checkpointWait) { + // Stop waiting for task to reach checkpointable state + checkpointWait.reject(new CheckpointCancelError()); + } + + // Cancel checkpointing procedure + const checkpointCanceled = this.#checkpointer.cancelAllCheckpointsForRun(runId); + + logger.log("cancelCheckpoint: result", { + runId, + reason, + checkpointCanceled, + hadCheckpointWait: !!checkpointWait, + }); + + return checkpointCanceled; + } + + // MARK: HTTP SERVER + #createHttpServer() { + const httpServer = createServer(async (req, res) => { + logger.log(`[${req.method}]`, { url: req.url }); + + const reply = new HttpReply(res); + + switch (req.url) { + case "/health": { + return reply.text("ok"); + } + case "/metrics": { + return reply.text(await register.metrics(), 200, register.contentType); + } + default: { + return reply.empty(404); + } + } + }); + + httpServer.on("clientError", (err, socket) => { + socket.end("HTTP/1.1 400 Bad Request\r\n\r\n"); + }); + + httpServer.on("listening", () => { + logger.log("server listening on port", { port: HTTP_SERVER_PORT }); + }); + + return httpServer; + } + + #createInternalHttpServer() { + const httpServer = createServer(async (req, res) => { + logger.log(`[${req.method}]`, { url: req.url }); + + const reply = new HttpReply(res); + + switch (req.url) { + case "/whoami": { + return reply.text(NODE_NAME); + } + case "/checkpoint/duration": { + try { + const body = await getTextBody(req); + const json = safeJsonParse(body); + + if (typeof json !== "object" || !json) { + return reply.text("Invalid body", 400); + } + + if (!("runId" in json) || typeof json.runId !== "string") { + return reply.text("Missing or invalid: runId", 400); + } + + if (!("now" in json) || typeof json.now !== "number") { + return reply.text("Missing or invalid: now", 400); + } + + if (!("ms" in json) || typeof json.ms !== 
"number") { + return reply.text("Missing or invalid: ms", 400); + } + + let keepRunAlive = false; + if ("keepRunAlive" in json && typeof json.keepRunAlive === "boolean") { + keepRunAlive = json.keepRunAlive; + } + + let async = false; + if ("async" in json && typeof json.async === "boolean") { + async = json.async; + } + + const { runId, now, ms } = json; + + if (!runId) { + return reply.text("Missing runId", 400); + } + + const runSocket = await this.#getRunSocket(runId); + if (!runSocket) { + return reply.text("Run socket not found", 404); + } + + const { data } = runSocket; + + console.log("Manual duration checkpoint", data); + + if (async) { + reply.text("Creating checkpoint in the background", 202); + } + + const checkpoint = await this.#checkpointer.checkpointAndPush({ + runId: data.runId, + projectRef: data.projectRef, + deploymentVersion: data.deploymentVersion, + attemptNumber: data.attemptNumber ? parseInt(data.attemptNumber) : undefined, + }); + + if (!checkpoint) { + return reply.text("Failed to checkpoint", 500); + } + + if (!data.attemptFriendlyId) { + return reply.text("Socket data missing attemptFriendlyId", 500); + } + + const ack = await this.#platformSocket?.sendWithAck("CHECKPOINT_CREATED", { + version: "v1", + runId, + attemptFriendlyId: data.attemptFriendlyId, + docker: checkpoint.docker, + location: checkpoint.location, + reason: { + type: "WAIT_FOR_DURATION", + ms, + now, + }, + }); + + if (ack?.keepRunAlive || keepRunAlive) { + return reply.json({ + message: `keeping run ${runId} alive after checkpoint`, + checkpoint, + requestJson: json, + platformAck: ack, + }); + } + + runSocket.emit("REQUEST_EXIT", { + version: "v1", + }); + + return reply.json({ + message: `checkpoint created for run ${runId}`, + checkpoint, + requestJson: json, + platformAck: ack, + }); + } catch (error) { + return reply.json({ + message: `error`, + error, + }); + } + } + case "/checkpoint/manual": { + try { + const body = await getTextBody(req); + const json = 
safeJsonParse(body); + + if (typeof json !== "object" || !json) { + return reply.text("Invalid body", 400); + } + + if (!("runId" in json) || typeof json.runId !== "string") { + return reply.text("Missing or invalid: runId", 400); + } + + let restoreAtUnixTimeMs: number | undefined; + if ("restoreAtUnixTimeMs" in json && typeof json.restoreAtUnixTimeMs === "number") { + restoreAtUnixTimeMs = json.restoreAtUnixTimeMs; + } + + let keepRunAlive = false; + if ("keepRunAlive" in json && typeof json.keepRunAlive === "boolean") { + keepRunAlive = json.keepRunAlive; + } + + let async = false; + if ("async" in json && typeof json.async === "boolean") { + async = json.async; + } + + const { runId } = json; + + if (!runId) { + return reply.text("Missing runId", 400); + } + + const runSocket = await this.#getRunSocket(runId); + if (!runSocket) { + return reply.text("Run socket not found", 404); + } + + const { data } = runSocket; + + console.log("Manual checkpoint", data); + + if (async) { + reply.text("Creating checkpoint in the background", 202); + } + + const checkpoint = await this.#checkpointer.checkpointAndPush({ + runId: data.runId, + projectRef: data.projectRef, + deploymentVersion: data.deploymentVersion, + attemptNumber: data.attemptNumber ? 
parseInt(data.attemptNumber) : undefined, + }); + + if (!checkpoint) { + return reply.text("Failed to checkpoint", 500); + } + + if (!data.attemptFriendlyId) { + return reply.text("Socket data missing attemptFriendlyId", 500); + } + + const ack = await this.#platformSocket?.sendWithAck("CHECKPOINT_CREATED", { + version: "v1", + runId, + attemptFriendlyId: data.attemptFriendlyId, + docker: checkpoint.docker, + location: checkpoint.location, + reason: { + type: "MANUAL", + restoreAtUnixTimeMs, + }, + }); + + if (ack?.keepRunAlive || keepRunAlive) { + return reply.json({ + message: `keeping run ${runId} alive after checkpoint`, + checkpoint, + requestJson: json, + platformAck: ack, + }); + } + + runSocket.emit("REQUEST_EXIT", { + version: "v1", + }); + + return reply.json({ + message: `checkpoint created for run ${runId}`, + checkpoint, + requestJson: json, + platformAck: ack, + }); + } catch (error) { + return reply.json({ + message: `error`, + error, + }); + } + } + default: { + return reply.empty(404); + } + } + }); + + httpServer.on("clientError", (err, socket) => { + socket.end("HTTP/1.1 400 Bad Request\r\n\r\n"); + }); + + httpServer.on("listening", () => { + logger.log("internal server listening on port", { port: HTTP_SERVER_PORT + 100 }); + }); + + return httpServer; + } + + listen() { + this.#httpServer.listen(this.port, this.host); + this.#internalHttpServer.listen(this.port + 100, "127.0.0.1"); + } +} + +const coordinator = new TaskCoordinator(HTTP_SERVER_PORT); +coordinator.listen(); diff --git a/apps/coordinator/src/util.ts b/apps/coordinator/src/util.ts new file mode 100644 index 00000000000..18464f230b6 --- /dev/null +++ b/apps/coordinator/src/util.ts @@ -0,0 +1,31 @@ +export const boolFromEnv = (env: string, defaultValue: boolean): boolean => { + const value = process.env[env]; + + if (!value) { + return defaultValue; + } + + return ["1", "true"].includes(value); +}; + +export const numFromEnv = (env: string, defaultValue: number): number => { + const 
value = process.env[env]; + + if (!value) { + return defaultValue; + } + + return parseInt(value, 10); +}; + +export function safeJsonParse(json?: string): unknown { + if (!json) { + return; + } + + try { + return JSON.parse(json); + } catch (e) { + return null; + } +} diff --git a/apps/coordinator/tsconfig.json b/apps/coordinator/tsconfig.json new file mode 100644 index 00000000000..e03fd024126 --- /dev/null +++ b/apps/coordinator/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "es2020", + "module": "commonjs", + "esModuleInterop": true, + "resolveJsonModule": true, + "forceConsistentCasingInFileNames": true, + "strict": true, + "skipLibCheck": true, + "paths": { + "@trigger.dev/core/v3": ["../../packages/core/src/v3"], + "@trigger.dev/core/v3/*": ["../../packages/core/src/v3/*"] + } + } +} diff --git a/apps/docker-provider/.env.example b/apps/docker-provider/.env.example new file mode 100644 index 00000000000..75c54083d1a --- /dev/null +++ b/apps/docker-provider/.env.example @@ -0,0 +1,11 @@ +HTTP_SERVER_PORT=8050 + +PLATFORM_WS_PORT=3030 +PLATFORM_SECRET=provider-secret +SECURE_CONNECTION=false + +OTEL_EXPORTER_OTLP_ENDPOINT=http://0.0.0.0:3030/otel + +# Use this if you are on macOS +# COORDINATOR_HOST="host.docker.internal" +# OTEL_EXPORTER_OTLP_ENDPOINT="http://host.docker.internal:4318" \ No newline at end of file diff --git a/apps/docker-provider/.gitignore b/apps/docker-provider/.gitignore new file mode 100644 index 00000000000..5c84119d635 --- /dev/null +++ b/apps/docker-provider/.gitignore @@ -0,0 +1,3 @@ +dist/ +node_modules/ +.env \ No newline at end of file diff --git a/apps/docker-provider/Containerfile b/apps/docker-provider/Containerfile new file mode 100644 index 00000000000..42a7ac23092 --- /dev/null +++ b/apps/docker-provider/Containerfile @@ -0,0 +1,47 @@ +FROM node:20-alpine@sha256:7a91aa397f2e2dfbfcdad2e2d72599f374e0b0172be1d86eeb73f1d33f36a4b2 AS node-20-alpine + +WORKDIR /app + +FROM node-20-alpine AS pruner + +COPY 
--chown=node:node . . +RUN npx -q turbo@1.10.9 prune --scope=docker-provider --docker +RUN find . -name "node_modules" -type d -prune -exec rm -rf '{}' + + +FROM node-20-alpine AS base + +RUN apk add --no-cache dumb-init docker + +COPY --chown=node:node .gitignore .gitignore +COPY --from=pruner --chown=node:node /app/out/json/ . +COPY --from=pruner --chown=node:node /app/out/pnpm-lock.yaml ./pnpm-lock.yaml +COPY --from=pruner --chown=node:node /app/out/pnpm-workspace.yaml ./pnpm-workspace.yaml + +FROM base AS dev-deps +RUN corepack enable +ENV NODE_ENV development + +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm fetch --no-frozen-lockfile +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm install --ignore-scripts --no-frozen-lockfile + +FROM base AS builder +RUN corepack enable + +COPY --from=pruner --chown=node:node /app/out/full/ . +COPY --from=dev-deps --chown=node:node /app/ . +COPY --chown=node:node turbo.json turbo.json + +RUN pnpm run -r --filter @trigger.dev/core bundle-vendor && pnpm run -r --filter docker-provider build:bundle + +FROM base AS runner + +RUN corepack enable +ENV NODE_ENV production + +COPY --from=builder --chown=node:node /app/apps/docker-provider/dist/index.mjs ./index.mjs + +EXPOSE 8000 + +USER node + +CMD [ "/usr/bin/dumb-init", "--", "/usr/local/bin/node", "./index.mjs" ] diff --git a/apps/docker-provider/README.md b/apps/docker-provider/README.md new file mode 100644 index 00000000000..647db280a5b --- /dev/null +++ b/apps/docker-provider/README.md @@ -0,0 +1,3 @@ +# Docker provider + +The `docker-provider` allows the platform to be orchestrator-agnostic. The platform can perform actions such as `INDEX_TASKS` or `INVOKE_TASK` which the provider translates into Docker actions. 
diff --git a/apps/docker-provider/package.json b/apps/docker-provider/package.json new file mode 100644 index 00000000000..f3e4015ef08 --- /dev/null +++ b/apps/docker-provider/package.json @@ -0,0 +1,27 @@ +{ + "name": "docker-provider", + "private": true, + "version": "0.0.1", + "description": "", + "main": "dist/index.cjs", + "scripts": { + "build": "npm run build:bundle", + "build:bundle": "esbuild src/index.ts --bundle --outfile=dist/index.mjs --platform=node --format=esm --target=esnext --banner:js=\"import { createRequire } from 'module';const require = createRequire(import.meta.url);\"", + "build:image": "docker build -f Containerfile . -t docker-provider", + "dev": "tsx --no-warnings=ExperimentalWarning --require dotenv/config --watch src/index.ts", + "start": "tsx src/index.ts", + "typecheck": "tsc --noEmit" + }, + "keywords": [], + "author": "", + "license": "MIT", + "dependencies": { + "@trigger.dev/core": "workspace:*", + "execa": "^8.0.1" + }, + "devDependencies": { + "dotenv": "^16.4.2", + "esbuild": "^0.19.11", + "tsx": "^4.7.0" + } +} \ No newline at end of file diff --git a/apps/docker-provider/src/index.ts b/apps/docker-provider/src/index.ts new file mode 100644 index 00000000000..a0b0554fb23 --- /dev/null +++ b/apps/docker-provider/src/index.ts @@ -0,0 +1,297 @@ +import { $, type ExecaChildProcess, execa } from "execa"; +import { + ProviderShell, + TaskOperations, + TaskOperationsCreateOptions, + TaskOperationsIndexOptions, + TaskOperationsRestoreOptions, +} from "@trigger.dev/core/v3/apps"; +import { SimpleLogger } from "@trigger.dev/core/v3/apps"; +import { isExecaChildProcess } from "@trigger.dev/core/v3/apps"; +import { testDockerCheckpoint } from "@trigger.dev/core/v3/serverOnly"; +import { setTimeout } from "node:timers/promises"; +import { PostStartCauses, PreStopCauses } from "@trigger.dev/core/v3"; + +const MACHINE_NAME = process.env.MACHINE_NAME || "local"; +const COORDINATOR_PORT = process.env.COORDINATOR_PORT || 8020; +const 
COORDINATOR_HOST = process.env.COORDINATOR_HOST || "127.0.0.1"; +const DOCKER_NETWORK = process.env.DOCKER_NETWORK || "host"; + +const OTEL_EXPORTER_OTLP_ENDPOINT = + process.env.OTEL_EXPORTER_OTLP_ENDPOINT || "http://0.0.0.0:4318"; + +const FORCE_CHECKPOINT_SIMULATION = ["1", "true"].includes( + process.env.FORCE_CHECKPOINT_SIMULATION ?? "false" +); + +const logger = new SimpleLogger(`[${MACHINE_NAME}]`); + +type TaskOperationsInitReturn = { + canCheckpoint: boolean; + willSimulate: boolean; +}; + +class DockerTaskOperations implements TaskOperations { + #initialized = false; + #canCheckpoint = false; + + constructor(private opts = { forceSimulate: false }) {} + + async init(): Promise { + if (this.#initialized) { + return this.#getInitReturn(this.#canCheckpoint); + } + + logger.log("Initializing task operations"); + + const testCheckpoint = await testDockerCheckpoint(); + + if (testCheckpoint.ok) { + return this.#getInitReturn(true); + } + + logger.error(testCheckpoint.message, testCheckpoint.error); + return this.#getInitReturn(false); + } + + #getInitReturn(canCheckpoint: boolean): TaskOperationsInitReturn { + this.#canCheckpoint = canCheckpoint; + + if (canCheckpoint) { + if (!this.#initialized) { + logger.log("Full checkpoint support!"); + } + } + + this.#initialized = true; + + const willSimulate = !canCheckpoint || this.opts.forceSimulate; + + if (willSimulate) { + logger.log("Simulation mode enabled. 
Containers will be paused, not checkpointed.", { + forceSimulate: this.opts.forceSimulate, + }); + } + + return { + canCheckpoint, + willSimulate, + }; + } + + async index(opts: TaskOperationsIndexOptions) { + await this.init(); + + const containerName = this.#getIndexContainerName(opts.shortCode); + + logger.log(`Indexing task ${opts.imageRef}`, { + host: COORDINATOR_HOST, + port: COORDINATOR_PORT, + }); + + logger.debug( + await execa("docker", [ + "run", + `--network=${DOCKER_NETWORK}`, + "--rm", + `--env=INDEX_TASKS=true`, + `--env=TRIGGER_SECRET_KEY=${opts.apiKey}`, + `--env=TRIGGER_API_URL=${opts.apiUrl}`, + `--env=TRIGGER_ENV_ID=${opts.envId}`, + `--env=OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT}`, + `--env=POD_NAME=${containerName}`, + `--env=COORDINATOR_HOST=${COORDINATOR_HOST}`, + `--env=COORDINATOR_PORT=${COORDINATOR_PORT}`, + `--name=${containerName}`, + `${opts.imageRef}`, + ]) + ); + } + + async create(opts: TaskOperationsCreateOptions) { + await this.init(); + + const containerName = this.#getRunContainerName(opts.runId, opts.nextAttemptNumber); + + const runArgs = [ + "run", + `--network=${DOCKER_NETWORK}`, + "--detach", + `--env=TRIGGER_ENV_ID=${opts.envId}`, + `--env=TRIGGER_RUN_ID=${opts.runId}`, + `--env=OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT}`, + `--env=POD_NAME=${containerName}`, + `--env=COORDINATOR_HOST=${COORDINATOR_HOST}`, + `--env=COORDINATOR_PORT=${COORDINATOR_PORT}`, + `--env=TRIGGER_POD_SCHEDULED_AT_MS=${Date.now()}`, + `--name=${containerName}`, + ]; + + if (process.env.ENFORCE_MACHINE_PRESETS) { + runArgs.push(`--cpus=${opts.machine.cpu}`, `--memory=${opts.machine.memory}G`); + } + + if (opts.dequeuedAt) { + runArgs.push(`--env=TRIGGER_RUN_DEQUEUED_AT_MS=${opts.dequeuedAt}`); + } + + runArgs.push(`${opts.image}`); + + try { + logger.debug(await execa("docker", runArgs)); + } catch (error) { + if (!isExecaChildProcess(error)) { + throw error; + } + + logger.error("Create failed:", { + opts, + 
exitCode: error.exitCode, + escapedCommand: error.escapedCommand, + stdout: error.stdout, + stderr: error.stderr, + }); + } + } + + async restore(opts: TaskOperationsRestoreOptions) { + await this.init(); + + const containerName = this.#getRunContainerName(opts.runId, opts.attemptNumber); + + if (!this.#canCheckpoint || this.opts.forceSimulate) { + logger.log("Simulating restore"); + + const unpause = logger.debug(await $`docker unpause ${containerName}`); + + if (unpause.exitCode !== 0) { + throw new Error("docker unpause command failed"); + } + + await this.#sendPostStart(containerName); + return; + } + + const { exitCode } = logger.debug( + await $`docker start --checkpoint=${opts.checkpointRef} ${containerName}` + ); + + if (exitCode !== 0) { + throw new Error("docker start command failed"); + } + + await this.#sendPostStart(containerName); + } + + async delete(opts: { runId: string }) { + await this.init(); + + const containerName = this.#getRunContainerName(opts.runId); + await this.#sendPreStop(containerName); + + logger.log("noop: delete"); + } + + async get(opts: { runId: string }) { + await this.init(); + + logger.log("noop: get"); + } + + #getIndexContainerName(suffix: string) { + return `task-index-${suffix}`; + } + + #getRunContainerName(suffix: string, attemptNumber?: number) { + return `task-run-${suffix}${attemptNumber && attemptNumber > 1 ? 
`-att${attemptNumber}` : ""}`; + } + + async #sendPostStart(containerName: string): Promise { + try { + const port = await this.#getHttpServerPort(containerName); + logger.debug(await this.#runLifecycleCommand(containerName, port, "postStart", "restore")); + } catch (error) { + logger.error("postStart error", { error }); + throw new Error("postStart command failed"); + } + } + + async #sendPreStop(containerName: string): Promise { + try { + const port = await this.#getHttpServerPort(containerName); + logger.debug(await this.#runLifecycleCommand(containerName, port, "preStop", "terminate")); + } catch (error) { + logger.error("preStop error", { error }); + throw new Error("preStop command failed"); + } + } + + async #getHttpServerPort(containerName: string): Promise { + // We first get the correct port, which is random during dev as we run with host networking and need to avoid clashes + // FIXME: Skip this in prod + const logs = logger.debug(await $`docker logs ${containerName}`); + const matches = logs.stdout.match(/http server listening on port (?[0-9]+)/); + + const port = Number(matches?.groups?.port); + + if (!port) { + throw new Error("failed to extract port from logs"); + } + + return port; + } + + async #runLifecycleCommand( + containerName: string, + port: number, + type: THookType, + cause: THookType extends "postStart" ? 
PostStartCauses : PreStopCauses, + retryCount = 0 + ): Promise { + try { + return await execa("docker", [ + "exec", + containerName, + "busybox", + "wget", + "-q", + "-O-", + `127.0.0.1:${port}/${type}?cause=${cause}`, + ]); + } catch (error: any) { + if (type === "postStart" && retryCount < 6) { + logger.debug(`retriable ${type} error`, { retryCount, message: error?.message }); + await setTimeout(exponentialBackoff(retryCount + 1, 2, 50, 1150, 50)); + + return this.#runLifecycleCommand(containerName, port, type, cause, retryCount + 1); + } + + logger.error(`final ${type} error`, { message: error?.message }); + throw new Error(`${type} command failed after ${retryCount - 1} retries`); + } + } +} + +const provider = new ProviderShell({ + tasks: new DockerTaskOperations({ forceSimulate: FORCE_CHECKPOINT_SIMULATION }), + type: "docker", +}); + +provider.listen(); + +function exponentialBackoff( + retryCount: number, + exponential: number, + minDelay: number, + maxDelay: number, + jitter: number +): number { + // Calculate the delay using the exponential backoff formula + const delay = Math.min(Math.pow(exponential, retryCount) * minDelay, maxDelay); + + // Calculate the jitter + const jitterValue = Math.random() * jitter; + + // Return the calculated delay with jitter + return delay + jitterValue; +} diff --git a/apps/docker-provider/tsconfig.json b/apps/docker-provider/tsconfig.json new file mode 100644 index 00000000000..f87adfc2d7f --- /dev/null +++ b/apps/docker-provider/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "es2020", + "module": "commonjs", + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "strict": true, + "skipLibCheck": true, + "paths": { + "@trigger.dev/core/v3": ["../../packages/core/src/v3"], + "@trigger.dev/core/v3/*": ["../../packages/core/src/v3/*"] + } + } +} diff --git a/apps/kubernetes-provider/.env.example b/apps/kubernetes-provider/.env.example new file mode 100644 
index 00000000000..f21ee29bac9 --- /dev/null +++ b/apps/kubernetes-provider/.env.example @@ -0,0 +1,9 @@ +HTTP_SERVER_PORT=8060 + +PLATFORM_WS_PORT=3030 +PLATFORM_SECRET=provider-secret +SECURE_CONNECTION=false + +# Use this if you are on macOS +# COORDINATOR_HOST="host.docker.internal" +# OTEL_EXPORTER_OTLP_ENDPOINT="http://host.docker.internal:4318" \ No newline at end of file diff --git a/apps/kubernetes-provider/.gitignore b/apps/kubernetes-provider/.gitignore new file mode 100644 index 00000000000..5c84119d635 --- /dev/null +++ b/apps/kubernetes-provider/.gitignore @@ -0,0 +1,3 @@ +dist/ +node_modules/ +.env \ No newline at end of file diff --git a/apps/kubernetes-provider/Containerfile b/apps/kubernetes-provider/Containerfile new file mode 100644 index 00000000000..b46b9943275 --- /dev/null +++ b/apps/kubernetes-provider/Containerfile @@ -0,0 +1,47 @@ +FROM node:20-alpine@sha256:7a91aa397f2e2dfbfcdad2e2d72599f374e0b0172be1d86eeb73f1d33f36a4b2 AS node-20-alpine + +WORKDIR /app + +FROM node-20-alpine AS pruner + +COPY --chown=node:node . . +RUN npx -q turbo@1.10.9 prune --scope=kubernetes-provider --docker +RUN find . -name "node_modules" -type d -prune -exec rm -rf '{}' + + +FROM node-20-alpine AS base + +RUN apk add --no-cache dumb-init + +COPY --chown=node:node .gitignore .gitignore +COPY --from=pruner --chown=node:node /app/out/json/ . +COPY --from=pruner --chown=node:node /app/out/pnpm-lock.yaml ./pnpm-lock.yaml +COPY --from=pruner --chown=node:node /app/out/pnpm-workspace.yaml ./pnpm-workspace.yaml + +FROM base AS dev-deps +RUN corepack enable +ENV NODE_ENV development + +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm fetch --no-frozen-lockfile +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm install --ignore-scripts --no-frozen-lockfile + +FROM base AS builder +RUN corepack enable + +COPY --from=pruner --chown=node:node /app/out/full/ . +COPY --from=dev-deps --chown=node:node /app/ . 
+COPY --chown=node:node turbo.json turbo.json + +RUN pnpm run -r --filter @trigger.dev/core bundle-vendor && pnpm run -r --filter kubernetes-provider build:bundle + +FROM base AS runner + +RUN corepack enable +ENV NODE_ENV production + +COPY --from=builder --chown=node:node /app/apps/kubernetes-provider/dist/index.mjs ./index.mjs + +EXPOSE 8000 + +USER node + +CMD [ "/usr/bin/dumb-init", "--", "/usr/local/bin/node", "./index.mjs" ] diff --git a/apps/kubernetes-provider/README.md b/apps/kubernetes-provider/README.md new file mode 100644 index 00000000000..829c8f2154a --- /dev/null +++ b/apps/kubernetes-provider/README.md @@ -0,0 +1,3 @@ +# Kubernetes provider + +The `kubernetes-provider` allows the platform to be orchestrator-agnostic. The platform can perform actions such as `INDEX_TASKS` or `INVOKE_TASK` which the provider translates into Kubernetes actions. diff --git a/apps/kubernetes-provider/package.json b/apps/kubernetes-provider/package.json new file mode 100644 index 00000000000..6cb26e2c70f --- /dev/null +++ b/apps/kubernetes-provider/package.json @@ -0,0 +1,28 @@ +{ + "name": "kubernetes-provider", + "private": true, + "version": "0.0.1", + "description": "", + "main": "dist/index.cjs", + "scripts": { + "build": "npm run build:bundle", + "build:bundle": "esbuild src/index.ts --bundle --outfile=dist/index.mjs --platform=node --format=esm --target=esnext --banner:js=\"import { createRequire } from 'module';const require = createRequire(import.meta.url);\"", + "build:image": "docker build -f Containerfile . 
-t kubernetes-provider", + "dev": "tsx --no-warnings=ExperimentalWarning --require dotenv/config --watch src/index.ts", + "start": "tsx src/index.ts", + "typecheck": "tsc --noEmit" + }, + "keywords": [], + "author": "", + "license": "MIT", + "dependencies": { + "@kubernetes/client-node": "^0.20.0", + "@trigger.dev/core": "workspace:*", + "p-queue": "^8.0.1" + }, + "devDependencies": { + "dotenv": "^16.4.2", + "esbuild": "^0.19.11", + "tsx": "^4.7.0" + } +} \ No newline at end of file diff --git a/apps/kubernetes-provider/src/index.ts b/apps/kubernetes-provider/src/index.ts new file mode 100644 index 00000000000..23a6ad56ce3 --- /dev/null +++ b/apps/kubernetes-provider/src/index.ts @@ -0,0 +1,783 @@ +import * as k8s from "@kubernetes/client-node"; +import { + EnvironmentType, + MachinePreset, + PostStartCauses, + PreStopCauses, +} from "@trigger.dev/core/v3"; +import { + ProviderShell, + SimpleLogger, + TaskOperations, + TaskOperationsCreateOptions, + TaskOperationsIndexOptions, + TaskOperationsPrePullDeploymentOptions, + TaskOperationsRestoreOptions, +} from "@trigger.dev/core/v3/apps"; +import { PodCleaner } from "./podCleaner"; +import { TaskMonitor } from "./taskMonitor"; +import { UptimeHeartbeat } from "./uptimeHeartbeat"; +import { assertExhaustive } from "@trigger.dev/core"; +import { CustomLabelHelper } from "./labelHelper"; + +const RUNTIME_ENV = process.env.KUBERNETES_PORT ? "kubernetes" : "local"; +const NODE_NAME = process.env.NODE_NAME || "local"; +const OTEL_EXPORTER_OTLP_ENDPOINT = + process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://0.0.0.0:4318"; +const COORDINATOR_HOST = process.env.COORDINATOR_HOST ?? undefined; +const COORDINATOR_PORT = process.env.COORDINATOR_PORT ?? undefined; +const KUBERNETES_NAMESPACE = process.env.KUBERNETES_NAMESPACE ?? 
"default"; + +const POD_CLEANER_INTERVAL_SECONDS = Number(process.env.POD_CLEANER_INTERVAL_SECONDS || "300"); + +const UPTIME_HEARTBEAT_URL = process.env.UPTIME_HEARTBEAT_URL; +const UPTIME_INTERVAL_SECONDS = Number(process.env.UPTIME_INTERVAL_SECONDS || "60"); +const UPTIME_MAX_PENDING_RUNS = Number(process.env.UPTIME_MAX_PENDING_RUNS || "25"); +const UPTIME_MAX_PENDING_INDECES = Number(process.env.UPTIME_MAX_PENDING_INDECES || "10"); +const UPTIME_MAX_PENDING_ERRORS = Number(process.env.UPTIME_MAX_PENDING_ERRORS || "10"); + +const POD_EPHEMERAL_STORAGE_SIZE_LIMIT = process.env.POD_EPHEMERAL_STORAGE_SIZE_LIMIT || "10Gi"; +const POD_EPHEMERAL_STORAGE_SIZE_REQUEST = process.env.POD_EPHEMERAL_STORAGE_SIZE_REQUEST || "2Gi"; + +// Image config +const PRE_PULL_DISABLED = process.env.PRE_PULL_DISABLED === "true"; +const ADDITIONAL_PULL_SECRETS = process.env.ADDITIONAL_PULL_SECRETS; +const PAUSE_IMAGE = process.env.PAUSE_IMAGE || "registry.k8s.io/pause:3.9"; +const BUSYBOX_IMAGE = process.env.BUSYBOX_IMAGE || "registry.digitalocean.com/trigger/busybox"; +const DEPLOYMENT_IMAGE_PREFIX = process.env.DEPLOYMENT_IMAGE_PREFIX; +const RESTORE_IMAGE_PREFIX = process.env.RESTORE_IMAGE_PREFIX; +const UTILITY_IMAGE_PREFIX = process.env.UTILITY_IMAGE_PREFIX; + +const logger = new SimpleLogger(`[${NODE_NAME}]`); +logger.log(`running in ${RUNTIME_ENV} mode`); + +type Namespace = { + metadata: { + name: string; + }; +}; + +type ResourceQuantities = { + [K in "cpu" | "memory" | "ephemeral-storage"]?: string; +}; + +class KubernetesTaskOperations implements TaskOperations { + #namespace: Namespace = { + metadata: { + name: "default", + }, + }; + + #k8sApi: { + core: k8s.CoreV1Api; + batch: k8s.BatchV1Api; + apps: k8s.AppsV1Api; + }; + + #labelHelper = new CustomLabelHelper(); + + constructor(opts: { namespace?: string } = {}) { + if (opts.namespace) { + this.#namespace.metadata.name = opts.namespace; + } + + this.#k8sApi = this.#createK8sApi(); + } + + async init() { + // noop + } + + 
async index(opts: TaskOperationsIndexOptions) { + await this.#createJob( + { + metadata: { + name: this.#getIndexContainerName(opts.shortCode), + namespace: this.#namespace.metadata.name, + }, + spec: { + completions: 1, + backoffLimit: 0, + ttlSecondsAfterFinished: 300, + template: { + metadata: { + labels: { + ...this.#getSharedLabels(opts), + app: "task-index", + "app.kubernetes.io/part-of": "trigger-worker", + "app.kubernetes.io/component": "index", + deployment: opts.deploymentId, + }, + }, + spec: { + ...this.#defaultPodSpec, + containers: [ + { + name: this.#getIndexContainerName(opts.shortCode), + image: getImageRef("deployment", opts.imageRef), + ports: [ + { + containerPort: 8000, + }, + ], + resources: { + limits: { + cpu: "1", + memory: "2G", + "ephemeral-storage": "2Gi", + }, + }, + lifecycle: { + preStop: { + exec: { + command: this.#getLifecycleCommand("preStop", "terminate"), + }, + }, + }, + env: [ + ...this.#getSharedEnv(opts.envId), + { + name: "INDEX_TASKS", + value: "true", + }, + { + name: "TRIGGER_SECRET_KEY", + value: opts.apiKey, + }, + { + name: "TRIGGER_API_URL", + value: opts.apiUrl, + }, + ], + }, + ], + }, + }, + }, + }, + this.#namespace + ); + } + + async create(opts: TaskOperationsCreateOptions) { + const containerName = this.#getRunContainerName(opts.runId, opts.nextAttemptNumber); + + await this.#createPod( + { + metadata: { + name: containerName, + namespace: this.#namespace.metadata.name, + labels: { + ...this.#labelHelper.getAdditionalLabels("create"), + ...this.#getSharedLabels(opts), + app: "task-run", + "app.kubernetes.io/part-of": "trigger-worker", + "app.kubernetes.io/component": "create", + run: opts.runId, + }, + }, + spec: { + ...this.#defaultPodSpec, + terminationGracePeriodSeconds: 60 * 60, + containers: [ + { + name: containerName, + image: getImageRef("deployment", opts.image), + ports: [ + { + containerPort: 8000, + }, + ], + resources: this.#getResourcesForMachine(opts.machine), + lifecycle: { + preStop: { + exec: 
{ + command: this.#getLifecycleCommand("preStop", "terminate"), + }, + }, + }, + env: [ + ...this.#getSharedEnv(opts.envId), + { + name: "TRIGGER_RUN_ID", + value: opts.runId, + }, + ...(opts.dequeuedAt + ? [{ name: "TRIGGER_RUN_DEQUEUED_AT_MS", value: String(opts.dequeuedAt) }] + : []), + ], + volumeMounts: [ + { + name: "taskinfo", + mountPath: "/etc/taskinfo", + }, + ], + }, + ], + volumes: [ + { + name: "taskinfo", + emptyDir: {}, + }, + ], + }, + }, + this.#namespace + ); + } + + async restore(opts: TaskOperationsRestoreOptions) { + await this.#createPod( + { + metadata: { + name: `${this.#getRunContainerName(opts.runId)}-${opts.checkpointId.slice(-8)}`, + namespace: this.#namespace.metadata.name, + labels: { + ...this.#labelHelper.getAdditionalLabels("restore"), + ...this.#getSharedLabels(opts), + app: "task-run", + "app.kubernetes.io/part-of": "trigger-worker", + "app.kubernetes.io/component": "restore", + run: opts.runId, + checkpoint: opts.checkpointId, + }, + }, + spec: { + ...this.#defaultPodSpec, + initContainers: [ + { + name: "pull-base-image", + image: getImageRef("deployment", opts.imageRef), + command: ["sleep", "0"], + }, + { + name: "populate-taskinfo", + image: getImageRef("utility", BUSYBOX_IMAGE), + imagePullPolicy: "IfNotPresent", + command: ["/bin/sh", "-c"], + args: ["printenv COORDINATOR_HOST | tee /etc/taskinfo/coordinator-host"], + env: this.#coordinatorEnvVars, + volumeMounts: [ + { + name: "taskinfo", + mountPath: "/etc/taskinfo", + }, + ], + }, + ], + containers: [ + { + name: this.#getRunContainerName(opts.runId), + image: getImageRef("restore", opts.checkpointRef), + ports: [ + { + containerPort: 8000, + }, + ], + resources: this.#getResourcesForMachine(opts.machine), + lifecycle: { + postStart: { + exec: { + command: this.#getLifecycleCommand("postStart", "restore"), + }, + }, + preStop: { + exec: { + command: this.#getLifecycleCommand("preStop", "terminate"), + }, + }, + }, + volumeMounts: [ + { + name: "taskinfo", + mountPath: 
"/etc/taskinfo", + }, + ], + }, + ], + volumes: [ + { + name: "taskinfo", + emptyDir: {}, + }, + ], + }, + }, + this.#namespace + ); + } + + async delete(opts: { runId: string }) { + await this.#deletePod({ + runId: opts.runId, + namespace: this.#namespace, + }); + } + + async get(opts: { runId: string }) { + await this.#getPod(opts.runId, this.#namespace); + } + + async prePullDeployment(opts: TaskOperationsPrePullDeploymentOptions) { + if (PRE_PULL_DISABLED) { + logger.debug("Pre-pull is disabled, skipping.", { opts }); + return; + } + + const metaName = this.#getPrePullContainerName(opts.shortCode); + + const metaLabels = { + ...this.#getSharedLabels(opts), + app: "task-prepull", + "app.kubernetes.io/part-of": "trigger-worker", + "app.kubernetes.io/component": "prepull", + deployment: opts.deploymentId, + name: metaName, + } satisfies k8s.V1ObjectMeta["labels"]; + + await this.#createDaemonSet( + { + metadata: { + name: metaName, + namespace: this.#namespace.metadata.name, + labels: metaLabels, + }, + spec: { + selector: { + matchLabels: { + name: metaName, + }, + }, + template: { + metadata: { + labels: metaLabels, + }, + spec: { + ...this.#defaultPodSpec, + restartPolicy: "Always", + affinity: { + nodeAffinity: { + requiredDuringSchedulingIgnoredDuringExecution: { + nodeSelectorTerms: [ + { + matchExpressions: [ + { + key: "trigger.dev/pre-pull-disabled", + operator: "DoesNotExist", + }, + ], + }, + ], + }, + }, + }, + initContainers: [ + { + name: "prepull", + image: getImageRef("deployment", opts.imageRef), + command: ["/usr/bin/true"], + resources: { + limits: { + cpu: "0.25", + memory: "100Mi", + "ephemeral-storage": "1Gi", + }, + }, + }, + ], + containers: [ + { + name: "pause", + image: getImageRef("utility", PAUSE_IMAGE), + resources: { + limits: { + cpu: "1m", + memory: "12Mi", + }, + }, + }, + ], + }, + }, + }, + }, + this.#namespace + ); + } + + #envTypeToLabelValue(type: EnvironmentType) { + switch (type) { + case "PRODUCTION": + return "prod"; + 
case "STAGING": + return "stg"; + case "DEVELOPMENT": + return "dev"; + case "PREVIEW": + return "preview"; + } + } + + get #defaultPodSpec(): Omit { + const pullSecrets = ["registry-trigger", "registry-trigger-failover"]; + + if (ADDITIONAL_PULL_SECRETS) { + pullSecrets.push(...ADDITIONAL_PULL_SECRETS.split(",")); + } + + const imagePullSecrets = pullSecrets.map( + (name) => ({ name }) satisfies k8s.V1LocalObjectReference + ); + + return { + restartPolicy: "Never", + automountServiceAccountToken: false, + imagePullSecrets, + nodeSelector: { + nodetype: "worker", + }, + }; + } + + get #defaultResourceRequests(): ResourceQuantities { + return { + "ephemeral-storage": POD_EPHEMERAL_STORAGE_SIZE_REQUEST, + }; + } + + get #defaultResourceLimits(): ResourceQuantities { + return { + "ephemeral-storage": POD_EPHEMERAL_STORAGE_SIZE_LIMIT, + }; + } + + get #coordinatorHostEnvVar(): k8s.V1EnvVar { + return COORDINATOR_HOST + ? { + name: "COORDINATOR_HOST", + value: COORDINATOR_HOST, + } + : { + name: "COORDINATOR_HOST", + valueFrom: { + fieldRef: { + fieldPath: "status.hostIP", + }, + }, + }; + } + + get #coordinatorPortEnvVar(): k8s.V1EnvVar | undefined { + if (COORDINATOR_PORT) { + return { + name: "COORDINATOR_PORT", + value: COORDINATOR_PORT, + }; + } + } + + get #coordinatorEnvVars(): k8s.V1EnvVar[] { + const envVars = [this.#coordinatorHostEnvVar]; + + if (this.#coordinatorPortEnvVar) { + envVars.push(this.#coordinatorPortEnvVar); + } + + return envVars; + } + + #getSharedEnv(envId: string): k8s.V1EnvVar[] { + return [ + { + name: "TRIGGER_ENV_ID", + value: envId, + }, + { + name: "DEBUG", + value: process.env.DEBUG ? 
"1" : "0", + }, + { + name: "HTTP_SERVER_PORT", + value: "8000", + }, + { + name: "OTEL_EXPORTER_OTLP_ENDPOINT", + value: OTEL_EXPORTER_OTLP_ENDPOINT, + }, + { + name: "POD_NAME", + valueFrom: { + fieldRef: { + fieldPath: "metadata.name", + }, + }, + }, + { + name: "MACHINE_NAME", + valueFrom: { + fieldRef: { + fieldPath: "spec.nodeName", + }, + }, + }, + { + name: "TRIGGER_POD_SCHEDULED_AT_MS", + value: Date.now().toString(), + }, + ...this.#coordinatorEnvVars, + ]; + } + + #getSharedLabels( + opts: + | TaskOperationsIndexOptions + | TaskOperationsCreateOptions + | TaskOperationsRestoreOptions + | TaskOperationsPrePullDeploymentOptions + ): Record { + return { + env: opts.envId, + envtype: this.#envTypeToLabelValue(opts.envType), + org: opts.orgId, + project: opts.projectId, + }; + } + + #getResourceRequestsForMachine(preset: MachinePreset): ResourceQuantities { + return { + cpu: `${preset.cpu * 0.75}`, + memory: `${preset.memory}G`, + }; + } + + #getResourceLimitsForMachine(preset: MachinePreset): ResourceQuantities { + return { + cpu: `${preset.cpu}`, + memory: `${preset.memory}G`, + }; + } + + #getResourcesForMachine(preset: MachinePreset): k8s.V1ResourceRequirements { + return { + requests: { + ...this.#defaultResourceRequests, + ...this.#getResourceRequestsForMachine(preset), + }, + limits: { + ...this.#defaultResourceLimits, + ...this.#getResourceLimitsForMachine(preset), + }, + }; + } + + #getLifecycleCommand( + type: THookType, + cause: THookType extends "postStart" ? 
PostStartCauses : PreStopCauses + ) { + const retries = 5; + + // This will retry sending the lifecycle hook up to `retries` times + // The sleep is required as this may start running before the HTTP server is up + const exec = [ + "/bin/sh", + "-c", + `for i in $(seq ${retries}); do sleep 1; busybox wget -q -O- 127.0.0.1:8000/${type}?cause=${cause} && break; done`, + ]; + + logger.debug("getLifecycleCommand()", { exec }); + + return exec; + } + + #getIndexContainerName(suffix: string) { + return `task-index-${suffix}`; + } + + #getRunContainerName(suffix: string, attemptNumber?: number) { + return `task-run-${suffix}${attemptNumber && attemptNumber > 1 ? `-att${attemptNumber}` : ""}`; + } + + #getPrePullContainerName(suffix: string) { + return `task-prepull-${suffix}`; + } + + #createK8sApi() { + const kubeConfig = new k8s.KubeConfig(); + + if (RUNTIME_ENV === "local") { + kubeConfig.loadFromDefault(); + } else if (RUNTIME_ENV === "kubernetes") { + kubeConfig.loadFromCluster(); + } else { + throw new Error(`Unsupported runtime environment: ${RUNTIME_ENV}`); + } + + return { + core: kubeConfig.makeApiClient(k8s.CoreV1Api), + batch: kubeConfig.makeApiClient(k8s.BatchV1Api), + apps: kubeConfig.makeApiClient(k8s.AppsV1Api), + }; + } + + async #createPod(pod: k8s.V1Pod, namespace: Namespace) { + try { + const res = await this.#k8sApi.core.createNamespacedPod(namespace.metadata.name, pod); + logger.debug(res.body); + } catch (err: unknown) { + this.#handleK8sError(err); + } + } + + async #deletePod(opts: { runId: string; namespace: Namespace }) { + try { + const res = await this.#k8sApi.core.deleteNamespacedPod( + opts.runId, + opts.namespace.metadata.name + ); + logger.debug(res.body); + } catch (err: unknown) { + this.#handleK8sError(err); + } + } + + async #getPod(runId: string, namespace: Namespace) { + try { + const res = await this.#k8sApi.core.readNamespacedPod(runId, namespace.metadata.name); + logger.debug(res.body); + return res.body; + } catch (err: unknown) 
{ + this.#handleK8sError(err); + } + } + + async #createJob(job: k8s.V1Job, namespace: Namespace) { + try { + const res = await this.#k8sApi.batch.createNamespacedJob(namespace.metadata.name, job); + logger.debug(res.body); + } catch (err: unknown) { + this.#handleK8sError(err); + } + } + + async #createDaemonSet(daemonSet: k8s.V1DaemonSet, namespace: Namespace) { + try { + const res = await this.#k8sApi.apps.createNamespacedDaemonSet( + namespace.metadata.name, + daemonSet + ); + logger.debug(res.body); + } catch (err: unknown) { + this.#handleK8sError(err); + } + } + + #throwUnlessRecord(candidate: unknown): asserts candidate is Record { + if (typeof candidate !== "object" || candidate === null) { + throw candidate; + } + } + + #handleK8sError(err: unknown) { + this.#throwUnlessRecord(err); + + if ("body" in err && err.body) { + logger.error(err.body); + this.#throwUnlessRecord(err.body); + + if (typeof err.body.message === "string") { + throw new Error(err.body?.message); + } else { + throw err.body; + } + } else { + logger.error(err); + throw err; + } + } +} + +type ImageType = "deployment" | "restore" | "utility"; + +function getImagePrefix(type: ImageType) { + switch (type) { + case "deployment": + return DEPLOYMENT_IMAGE_PREFIX; + case "restore": + return RESTORE_IMAGE_PREFIX; + case "utility": + return UTILITY_IMAGE_PREFIX; + default: + assertExhaustive(type); + } +} + +function getImageRef(type: ImageType, ref: string) { + const prefix = getImagePrefix(type); + return prefix ? 
`${prefix}/${ref}` : ref; +} + +const provider = new ProviderShell({ + tasks: new KubernetesTaskOperations({ + namespace: KUBERNETES_NAMESPACE, + }), + type: "kubernetes", +}); + +provider.listen(); + +const taskMonitor = new TaskMonitor({ + runtimeEnv: RUNTIME_ENV, + namespace: KUBERNETES_NAMESPACE, + onIndexFailure: async (deploymentId, details) => { + logger.log("Indexing failed", { deploymentId, details }); + + try { + provider.platformSocket.send("INDEXING_FAILED", { + deploymentId, + error: { + name: `Crashed with exit code ${details.exitCode}`, + message: details.reason, + stack: details.logs, + }, + overrideCompletion: details.overrideCompletion, + }); + } catch (error) { + logger.error(error); + } + }, + onRunFailure: async (runId, details) => { + logger.log("Run failed:", { runId, details }); + + try { + provider.platformSocket.send("WORKER_CRASHED", { runId, ...details }); + } catch (error) { + logger.error(error); + } + }, +}); + +taskMonitor.start(); + +const podCleaner = new PodCleaner({ + runtimeEnv: RUNTIME_ENV, + namespace: KUBERNETES_NAMESPACE, + intervalInSeconds: POD_CLEANER_INTERVAL_SECONDS, +}); + +podCleaner.start(); + +if (UPTIME_HEARTBEAT_URL) { + const uptimeHeartbeat = new UptimeHeartbeat({ + runtimeEnv: RUNTIME_ENV, + namespace: KUBERNETES_NAMESPACE, + intervalInSeconds: UPTIME_INTERVAL_SECONDS, + pingUrl: UPTIME_HEARTBEAT_URL, + maxPendingRuns: UPTIME_MAX_PENDING_RUNS, + maxPendingIndeces: UPTIME_MAX_PENDING_INDECES, + maxPendingErrors: UPTIME_MAX_PENDING_ERRORS, + }); + + uptimeHeartbeat.start(); +} else { + logger.log("Uptime heartbeat is disabled, set UPTIME_HEARTBEAT_URL to enable."); +} diff --git a/apps/kubernetes-provider/src/labelHelper.ts b/apps/kubernetes-provider/src/labelHelper.ts new file mode 100644 index 00000000000..98cd3d68be4 --- /dev/null +++ b/apps/kubernetes-provider/src/labelHelper.ts @@ -0,0 +1,153 @@ +import { assertExhaustive } from "@trigger.dev/core"; + +const CREATE_LABEL_ENV_VAR_PREFIX = "DEPLOYMENT_LABEL_"; 
+const RESTORE_LABEL_ENV_VAR_PREFIX = "RESTORE_LABEL_"; +const LABEL_SAMPLE_RATE_POSTFIX = "_SAMPLE_RATE"; + +type OperationType = "create" | "restore"; + +type CustomLabel = { + key: string; + value: string; + sampleRate: number; +}; + +export class CustomLabelHelper { + // Labels and sample rates are defined in environment variables so only need to be computed once + private createLabels?: CustomLabel[]; + private restoreLabels?: CustomLabel[]; + + private getLabelPrefix(type: OperationType) { + const prefix = type === "create" ? CREATE_LABEL_ENV_VAR_PREFIX : RESTORE_LABEL_ENV_VAR_PREFIX; + return prefix.toLowerCase(); + } + + private getLabelSampleRatePostfix() { + return LABEL_SAMPLE_RATE_POSTFIX.toLowerCase(); + } + + // Can only range from 0 to 1 + private fractionFromPercent(percent: number) { + return Math.min(1, Math.max(0, percent / 100)); + } + + private isLabelSampleRateEnvVar(key: string) { + return key.toLowerCase().endsWith(this.getLabelSampleRatePostfix()); + } + + private isLabelEnvVar(type: OperationType, key: string) { + const prefix = this.getLabelPrefix(type); + return key.toLowerCase().startsWith(prefix) && !this.isLabelSampleRateEnvVar(key); + } + + private getSampleRateEnvVarKey(type: OperationType, envKey: string) { + return `${envKey.toLowerCase()}${this.getLabelSampleRatePostfix()}`; + } + + private getLabelNameFromEnvVarKey(type: OperationType, key: string) { + return key + .slice(this.getLabelPrefix(type).length) + .toLowerCase() + .replace(/___/g, ".") + .replace(/__/g, "/") + .replace(/_/g, "-"); + } + + private getCaseInsensitiveEnvValue(key: string) { + for (const [envKey, value] of Object.entries(process.env)) { + if (envKey.toLowerCase() === key.toLowerCase()) { + return value; + } + } + } + + /** Returns the sample rate for a given label as fraction of 100 */ + private getSampleRateFromEnvVarKey(type: OperationType, envKey: string) { + // Apply default: always sample + const DEFAULT_SAMPLE_RATE_PERCENT = 100; + const 
defaultSampleRateFraction = this.fractionFromPercent(DEFAULT_SAMPLE_RATE_PERCENT); + + const value = this.getCaseInsensitiveEnvValue(this.getSampleRateEnvVarKey(type, envKey)); + + if (!value) { + return defaultSampleRateFraction; + } + + const sampleRatePercent = parseFloat(value || String(DEFAULT_SAMPLE_RATE_PERCENT)); + + if (isNaN(sampleRatePercent)) { + return defaultSampleRateFraction; + } + + const fractionalSampleRate = this.fractionFromPercent(sampleRatePercent); + + return fractionalSampleRate; + } + + private getCustomLabels(type: OperationType): CustomLabel[] { + switch (type) { + case "create": + if (this.createLabels) { + return this.createLabels; + } + break; + case "restore": + if (this.restoreLabels) { + return this.restoreLabels; + } + break; + default: + assertExhaustive(type); + } + + const customLabels: CustomLabel[] = []; + + for (const [envKey, value] of Object.entries(process.env)) { + const key = envKey.toLowerCase(); + + // Only process env vars that start with the expected prefix + if (!this.isLabelEnvVar(type, key)) { + continue; + } + + // Skip sample rates - deal with them separately + if (this.isLabelSampleRateEnvVar(key)) { + continue; + } + + const labelName = this.getLabelNameFromEnvVarKey(type, key); + const sampleRate = this.getSampleRateFromEnvVarKey(type, key); + + const label = { + key: labelName, + value: value || "", + sampleRate, + } satisfies CustomLabel; + + customLabels.push(label); + } + + return customLabels; + } + + getAdditionalLabels(type: OperationType): Record { + const labels = this.getCustomLabels(type); + + const additionalLabels: Record = {}; + + for (const { key, value, sampleRate } of labels) { + // Always apply label if sample rate is 1 + if (sampleRate === 1) { + additionalLabels[key] = value; + continue; + } + + if (Math.random() <= sampleRate) { + additionalLabels[key] = value; + continue; + } + } + + return additionalLabels; + } +} diff --git a/apps/kubernetes-provider/src/podCleaner.ts 
b/apps/kubernetes-provider/src/podCleaner.ts new file mode 100644 index 00000000000..2cd23d1b6c2 --- /dev/null +++ b/apps/kubernetes-provider/src/podCleaner.ts @@ -0,0 +1,323 @@ +import * as k8s from "@kubernetes/client-node"; +import { SimpleLogger } from "@trigger.dev/core/v3/apps"; + +type PodCleanerOptions = { + runtimeEnv: "local" | "kubernetes"; + namespace?: string; + intervalInSeconds?: number; +}; + +export class PodCleaner { + private enabled = false; + private namespace = "default"; + private intervalInSeconds = 300; + + private logger = new SimpleLogger("[PodCleaner]"); + private k8sClient: { + core: k8s.CoreV1Api; + apps: k8s.AppsV1Api; + kubeConfig: k8s.KubeConfig; + }; + + constructor(private opts: PodCleanerOptions) { + if (opts.namespace) { + this.namespace = opts.namespace; + } + + if (opts.intervalInSeconds) { + this.intervalInSeconds = opts.intervalInSeconds; + } + + this.k8sClient = this.#createK8sClient(); + } + + #createK8sClient() { + const kubeConfig = new k8s.KubeConfig(); + + if (this.opts.runtimeEnv === "local") { + kubeConfig.loadFromDefault(); + } else if (this.opts.runtimeEnv === "kubernetes") { + kubeConfig.loadFromCluster(); + } else { + throw new Error(`Unsupported runtime environment: ${this.opts.runtimeEnv}`); + } + + return { + core: kubeConfig.makeApiClient(k8s.CoreV1Api), + apps: kubeConfig.makeApiClient(k8s.AppsV1Api), + kubeConfig: kubeConfig, + }; + } + + #isRecord(candidate: unknown): candidate is Record { + if (typeof candidate !== "object" || candidate === null) { + return false; + } else { + return true; + } + } + + #logK8sError(err: unknown, debugOnly = false) { + if (debugOnly) { + this.logger.debug("K8s API Error", err); + } else { + this.logger.error("K8s API Error", err); + } + } + + #handleK8sError(err: unknown) { + if (!this.#isRecord(err) || !this.#isRecord(err.body)) { + this.#logK8sError(err); + return; + } + + this.#logK8sError(err, true); + + if (typeof err.body.message === "string") { + this.#logK8sError({ 
message: err.body.message }); + return; + } + + this.#logK8sError({ body: err.body }); + } + + async #deletePods(opts: { + namespace: string; + dryRun?: boolean; + fieldSelector?: string; + labelSelector?: string; + }) { + return await this.k8sClient.core + .deleteCollectionNamespacedPod( + opts.namespace, + undefined, // pretty + undefined, // continue + opts.dryRun ? "All" : undefined, + opts.fieldSelector, + undefined, // gracePeriodSeconds + opts.labelSelector + ) + .catch(this.#handleK8sError.bind(this)); + } + + async #deleteDaemonSets(opts: { + namespace: string; + dryRun?: boolean; + fieldSelector?: string; + labelSelector?: string; + }) { + return await this.k8sClient.apps + .deleteCollectionNamespacedDaemonSet( + opts.namespace, + undefined, // pretty + undefined, // continue + opts.dryRun ? "All" : undefined, + opts.fieldSelector, + undefined, // gracePeriodSeconds + opts.labelSelector + ) + .catch(this.#handleK8sError.bind(this)); + } + + async #deleteCompletedRuns() { + this.logger.log("Deleting completed runs"); + + const start = Date.now(); + + const result = await this.#deletePods({ + namespace: this.namespace, + fieldSelector: "status.phase=Succeeded", + labelSelector: "app=task-run", + }); + + const elapsedMs = Date.now() - start; + + if (!result) { + this.logger.log("Deleting completed runs: No delete result", { elapsedMs }); + return; + } + + const total = (result.response as any)?.body?.items?.length ?? 
0; + + this.logger.log("Deleting completed runs: Done", { total, elapsedMs }); + } + + async #deleteFailedRuns() { + this.logger.log("Deleting failed runs"); + + const start = Date.now(); + + const result = await this.#deletePods({ + namespace: this.namespace, + fieldSelector: "status.phase=Failed", + labelSelector: "app=task-run", + }); + + const elapsedMs = Date.now() - start; + + if (!result) { + this.logger.log("Deleting failed runs: No delete result", { elapsedMs }); + return; + } + + const total = (result.response as any)?.body?.items?.length ?? 0; + + this.logger.log("Deleting failed runs: Done", { total, elapsedMs }); + } + + async #deleteUnrecoverableRuns() { + await this.#deletePods({ + namespace: this.namespace, + fieldSelector: "status.phase=?", + labelSelector: "app=task-run", + }); + } + + async #deleteCompletedPrePulls() { + this.logger.log("Deleting completed pre-pulls"); + + const start = Date.now(); + + const result = await this.#deleteDaemonSets({ + namespace: this.namespace, + labelSelector: "app=task-prepull", + }); + + const elapsedMs = Date.now() - start; + + if (!result) { + this.logger.log("Deleting completed pre-pulls: No delete result", { elapsedMs }); + return; + } + + const total = (result.response as any)?.body?.items?.length ?? 
0; + + this.logger.log("Deleting completed pre-pulls: Done", { total, elapsedMs }); + } + + async start() { + this.enabled = true; + this.logger.log("Starting"); + + const completedInterval = setInterval(async () => { + if (!this.enabled) { + clearInterval(completedInterval); + return; + } + + try { + await this.#deleteCompletedRuns(); + } catch (error) { + this.logger.error("Error deleting completed runs", error); + } + }, this.intervalInSeconds * 1000); + + const failedInterval = setInterval( + async () => { + if (!this.enabled) { + clearInterval(failedInterval); + return; + } + + try { + await this.#deleteFailedRuns(); + } catch (error) { + this.logger.error("Error deleting completed runs", error); + } + }, + // Use a longer interval for failed runs. This is only a backup in case the task monitor fails. + 2 * this.intervalInSeconds * 1000 + ); + + const completedPrePullInterval = setInterval( + async () => { + if (!this.enabled) { + clearInterval(completedPrePullInterval); + return; + } + + try { + await this.#deleteCompletedPrePulls(); + } catch (error) { + this.logger.error("Error deleting completed pre-pulls", error); + } + }, + 2 * this.intervalInSeconds * 1000 + ); + + // this.#launchTests(); + } + + async stop() { + if (!this.enabled) { + return; + } + + this.enabled = false; + this.logger.log("Shutting down.."); + } + + async #launchTests() { + const createPod = async ( + container: k8s.V1Container, + name: string, + labels?: Record + ) => { + this.logger.log("Creating pod:", name); + + const pod = { + metadata: { + name, + labels, + }, + spec: { + restartPolicy: "Never", + automountServiceAccountToken: false, + terminationGracePeriodSeconds: 1, + containers: [container], + }, + } satisfies k8s.V1Pod; + + await this.k8sClient.core + .createNamespacedPod(this.namespace, pod) + .catch(this.#handleK8sError.bind(this)); + }; + + const createIdlePod = async (name: string, labels?: Record) => { + const container = { + name, + image: "docker.io/library/busybox", 
+ command: ["sh"], + args: ["-c", "sleep infinity"], + } satisfies k8s.V1Container; + + await createPod(container, name, labels); + }; + + const createCompletedPod = async (name: string, labels?: Record) => { + const container = { + name, + image: "docker.io/library/busybox", + command: ["sh"], + args: ["-c", "true"], + } satisfies k8s.V1Container; + + await createPod(container, name, labels); + }; + + const createFailedPod = async (name: string, labels?: Record) => { + const container = { + name, + image: "docker.io/library/busybox", + command: ["sh"], + args: ["-c", "false"], + } satisfies k8s.V1Container; + + await createPod(container, name, labels); + }; + + await createIdlePod("test-idle-1", { app: "task-run" }); + await createFailedPod("test-failed-1", { app: "task-run" }); + await createCompletedPod("test-completed-1", { app: "task-run" }); + } +} diff --git a/apps/kubernetes-provider/src/taskMonitor.ts b/apps/kubernetes-provider/src/taskMonitor.ts new file mode 100644 index 00000000000..aadcef18d8e --- /dev/null +++ b/apps/kubernetes-provider/src/taskMonitor.ts @@ -0,0 +1,459 @@ +import * as k8s from "@kubernetes/client-node"; +import { SimpleLogger } from "@trigger.dev/core/v3/apps"; +import { EXIT_CODE_ALREADY_HANDLED, EXIT_CODE_CHILD_NONZERO } from "@trigger.dev/core/v3/apps"; +import { setTimeout } from "timers/promises"; +import PQueue from "p-queue"; +import { TaskRunErrorCodes, type Prettify, type TaskRunInternalError } from "@trigger.dev/core/v3"; + +type FailureDetails = Prettify<{ + exitCode: number; + reason: string; + logs: string; + overrideCompletion: boolean; + errorCode: TaskRunInternalError["code"]; +}>; + +type IndexFailureHandler = (deploymentId: string, details: FailureDetails) => Promise; + +type RunFailureHandler = (runId: string, details: FailureDetails) => Promise; + +type TaskMonitorOptions = { + runtimeEnv: "local" | "kubernetes"; + onIndexFailure?: IndexFailureHandler; + onRunFailure?: RunFailureHandler; + namespace?: string; +}; 
+ +export class TaskMonitor { + #enabled = false; + + #logger = new SimpleLogger("[TaskMonitor]"); + #taskInformer: ReturnType>; + #processedPods = new Map(); + #queue = new PQueue({ concurrency: 10 }); + + #k8sClient: { + core: k8s.CoreV1Api; + kubeConfig: k8s.KubeConfig; + }; + + private namespace = "default"; + private fieldSelector = "status.phase=Failed"; + private labelSelector = "app in (task-index, task-run)"; + + constructor(private opts: TaskMonitorOptions) { + if (opts.namespace) { + this.namespace = opts.namespace; + } + + this.#k8sClient = this.#createK8sClient(); + + this.#taskInformer = this.#createTaskInformer(); + this.#taskInformer.on("connect", this.#onInformerConnected.bind(this)); + this.#taskInformer.on("error", this.#onInformerError.bind(this)); + this.#taskInformer.on("update", this.#enqueueOnPodUpdated.bind(this)); + } + + #createTaskInformer() { + const listTasks = () => + this.#k8sClient.core.listNamespacedPod( + this.namespace, + undefined, + undefined, + undefined, + this.fieldSelector, + this.labelSelector + ); + + // Uses watch with local caching + // https://kubernetes.io/docs/reference/using-api/api-concepts/#efficient-detection-of-changes + const informer = k8s.makeInformer( + this.#k8sClient.kubeConfig, + `/api/v1/namespaces/${this.namespace}/pods`, + listTasks, + this.labelSelector, + this.fieldSelector + ); + + return informer; + } + + async #onInformerConnected() { + this.#logger.log("Connected"); + } + + async #onInformerError(error: any) { + this.#logger.error("Error:", error); + + // Automatic reconnect + await setTimeout(2_000); + this.#taskInformer.start(); + } + + #enqueueOnPodUpdated(pod: k8s.V1Pod) { + this.#queue.add(async () => { + try { + // It would be better to only pass the cache key, but the pod may already be removed from the cache by the time we process it + await this.#onPodUpdated(pod); + } catch (error) { + this.#logger.error("Caught onPodUpdated() error:", error); + } + }); + } + + async #onPodUpdated(pod: 
k8s.V1Pod) { + this.#logger.debug(`Updated: ${pod.metadata?.name}`); + this.#logger.debug("Updated", JSON.stringify(pod, null, 2)); + + // We only care about failures + if (pod.status?.phase !== "Failed") { + return; + } + + const podName = pod.metadata?.name; + + if (!podName) { + this.#logger.error("Pod is nameless", { pod }); + return; + } + + const containerStatus = pod.status.containerStatuses?.[0]; + + if (!containerStatus?.state) { + this.#logger.error("Pod failed, but container status doesn't have state", { + status: pod.status, + }); + return; + } + + if (this.#processedPods.has(podName)) { + this.#logger.debug("Pod update already processed", { + podName, + timestamp: this.#processedPods.get(podName), + }); + return; + } + + this.#processedPods.set(podName, Date.now()); + + const podStatus = this.#getPodStatusSummary(pod.status); + const containerState = this.#getContainerStateSummary(containerStatus.state); + const exitCode = containerState.exitCode ?? -1; + + if (exitCode === EXIT_CODE_ALREADY_HANDLED) { + this.#logger.debug("Ignoring pod failure, already handled by worker", { + podName, + }); + return; + } + + const rawLogs = await this.#getLogTail(podName); + + this.#logger.log(`${podName} failed with:`, { + podStatus, + containerState, + rawLogs, + }); + + const rawReason = podStatus.reason ?? containerState.reason ?? ""; + const message = podStatus.message ?? containerState.message ?? ""; + + let reason = rawReason || "Unknown error"; + let logs = rawLogs || ""; + + /** This will only override existing task errors. It will not crash the run. 
*/ + let onlyOverrideExistingError = exitCode === EXIT_CODE_CHILD_NONZERO; + + let errorCode: TaskRunInternalError["code"] = TaskRunErrorCodes.POD_UNKNOWN_ERROR; + + switch (rawReason) { + case "Error": + reason = "Unknown error."; + errorCode = TaskRunErrorCodes.POD_UNKNOWN_ERROR; + break; + case "Evicted": + if (message.startsWith("Pod ephemeral local storage usage")) { + reason = "Storage limit exceeded."; + errorCode = TaskRunErrorCodes.DISK_SPACE_EXCEEDED; + } else if (message) { + reason = `Evicted: ${message}`; + errorCode = TaskRunErrorCodes.POD_EVICTED; + } else { + reason = "Evicted for unknown reason."; + errorCode = TaskRunErrorCodes.POD_EVICTED; + } + + if (logs.startsWith("failed to try resolving symlinks")) { + logs = ""; + } + break; + case "OOMKilled": + reason = + "[TaskMonitor] Your task ran out of memory. Try increasing the machine specs. If this doesn't fix it there might be a memory leak."; + errorCode = TaskRunErrorCodes.TASK_PROCESS_OOM_KILLED; + break; + default: + break; + } + + const failureInfo = { + exitCode, + reason, + logs, + overrideCompletion: onlyOverrideExistingError, + errorCode, + } satisfies FailureDetails; + + const app = pod.metadata?.labels?.app; + + switch (app) { + case "task-index": + const deploymentId = pod.metadata?.labels?.deployment; + + if (!deploymentId) { + this.#logger.error("Index is missing ID", { pod }); + return; + } + + if (this.opts.onIndexFailure) { + await this.opts.onIndexFailure(deploymentId, failureInfo); + } + break; + case "task-run": + const runId = pod.metadata?.labels?.run; + + if (!runId) { + this.#logger.error("Run is missing ID", { pod }); + return; + } + + if (this.opts.onRunFailure) { + await this.opts.onRunFailure(runId, failureInfo); + } + break; + default: + this.#logger.error("Pod has invalid app label", { pod }); + return; + } + + await this.#deletePod(podName); + } + + async #getLogTail(podName: string) { + try { + const logs = await this.#k8sClient.core.readNamespacedPodLog( + 
podName, + this.namespace, + undefined, + undefined, + undefined, + 1024, // limitBytes + undefined, + undefined, + undefined, + 20 // tailLines + ); + + const responseBody = logs.body ?? ""; + + if (responseBody.startsWith("unable to retrieve container logs")) { + return ""; + } + + // Type is wrong, body may be undefined + return responseBody; + } catch (error) { + this.#logger.error("Log tail error:", error instanceof Error ? error.message : "unknown"); + return ""; + } + } + + #getPodStatusSummary(status: k8s.V1PodStatus) { + return { + reason: status.reason, + message: status.message, + }; + } + + #getContainerStateSummary(state: k8s.V1ContainerState) { + return { + reason: state.terminated?.reason, + exitCode: state.terminated?.exitCode, + message: state.terminated?.message, + }; + } + + #createK8sClient() { + const kubeConfig = new k8s.KubeConfig(); + + if (this.opts.runtimeEnv === "local") { + kubeConfig.loadFromDefault(); + } else if (this.opts.runtimeEnv === "kubernetes") { + kubeConfig.loadFromCluster(); + } else { + throw new Error(`Unsupported runtime environment: ${this.opts.runtimeEnv}`); + } + + return { + core: kubeConfig.makeApiClient(k8s.CoreV1Api), + kubeConfig: kubeConfig, + }; + } + + #isRecord(candidate: unknown): candidate is Record { + if (typeof candidate !== "object" || candidate === null) { + return false; + } else { + return true; + } + } + + #logK8sError(err: unknown, debugOnly = false) { + if (debugOnly) { + this.#logger.debug("K8s API Error", err); + } else { + this.#logger.error("K8s API Error", err); + } + } + + #handleK8sError(err: unknown) { + if (!this.#isRecord(err) || !this.#isRecord(err.body)) { + this.#logK8sError(err); + return; + } + + this.#logK8sError(err, true); + + if (typeof err.body.message === "string") { + this.#logK8sError({ message: err.body.message }); + return; + } + + this.#logK8sError({ body: err.body }); + } + + #printStats(includeMoreDetails = false) { + this.#logger.log("Stats:", { + cacheSize: 
this.#taskInformer.list().length, + totalProcessed: this.#processedPods.size, + ...(includeMoreDetails && { + processedPods: this.#processedPods, + }), + }); + } + + async #deletePod(name: string) { + this.#logger.debug("Deleting pod:", name); + + await this.#k8sClient.core + .deleteNamespacedPod(name, this.namespace) + .catch(this.#handleK8sError.bind(this)); + } + + async start() { + this.#enabled = true; + + const interval = setInterval(() => { + if (!this.#enabled) { + clearInterval(interval); + return; + } + + this.#printStats(); + }, 300_000); + + await this.#taskInformer.start(); + + // this.#launchTests(); + } + + async stop() { + if (!this.#enabled) { + return; + } + + this.#enabled = false; + this.#logger.log("Shutting down.."); + + await this.#taskInformer.stop(); + + this.#printStats(true); + } + + async #launchTests() { + const createPod = async ( + container: k8s.V1Container, + name: string, + labels?: Record + ) => { + this.#logger.log("Creating pod:", name); + + const pod = { + metadata: { + name, + labels, + }, + spec: { + restartPolicy: "Never", + automountServiceAccountToken: false, + terminationGracePeriodSeconds: 1, + containers: [container], + }, + } satisfies k8s.V1Pod; + + await this.#k8sClient.core + .createNamespacedPod(this.namespace, pod) + .catch(this.#handleK8sError.bind(this)); + }; + + const createOomPod = async (name: string, labels?: Record) => { + const container = { + name, + image: "polinux/stress", + resources: { + limits: { + memory: "100Mi", + }, + }, + command: ["stress"], + args: ["--vm", "1", "--vm-bytes", "150M", "--vm-hang", "1"], + } satisfies k8s.V1Container; + + await createPod(container, name, labels); + }; + + const createNonZeroExitPod = async (name: string, labels?: Record) => { + const container = { + name, + image: "docker.io/library/busybox", + command: ["sh"], + args: ["-c", "exit 1"], + } satisfies k8s.V1Container; + + await createPod(container, name, labels); + }; + + const createOoDiskPod = async (name: 
string, labels?: Record) => { + const container = { + name, + image: "docker.io/library/busybox", + command: ["sh"], + args: [ + "-c", + "echo creating huge-file..; head -c 1000m /dev/zero > huge-file; ls -lh huge-file; sleep infinity", + ], + resources: { + limits: { + "ephemeral-storage": "500Mi", + }, + }, + } satisfies k8s.V1Container; + + await createPod(container, name, labels); + }; + + await createNonZeroExitPod("non-zero-exit-task", { app: "task-run", run: "123" }); + await createOomPod("oom-task", { app: "task-index", deployment: "456" }); + await createOoDiskPod("ood-task", { app: "task-run", run: "abc" }); + } +} diff --git a/apps/kubernetes-provider/src/uptimeHeartbeat.ts b/apps/kubernetes-provider/src/uptimeHeartbeat.ts new file mode 100644 index 00000000000..9ff63032f0b --- /dev/null +++ b/apps/kubernetes-provider/src/uptimeHeartbeat.ts @@ -0,0 +1,272 @@ +import * as k8s from "@kubernetes/client-node"; +import { SimpleLogger } from "@trigger.dev/core/v3/apps"; + +type UptimeHeartbeatOptions = { + runtimeEnv: "local" | "kubernetes"; + pingUrl: string; + namespace?: string; + intervalInSeconds?: number; + maxPendingRuns?: number; + maxPendingIndeces?: number; + maxPendingErrors?: number; + leadingEdge?: boolean; +}; + +export class UptimeHeartbeat { + private enabled = false; + private namespace: string; + + private intervalInSeconds: number; + private maxPendingRuns: number; + private maxPendingIndeces: number; + private maxPendingErrors: number; + + private leadingEdge = true; + + private logger = new SimpleLogger("[UptimeHeartbeat]"); + private k8sClient: { + core: k8s.CoreV1Api; + kubeConfig: k8s.KubeConfig; + }; + + constructor(private opts: UptimeHeartbeatOptions) { + this.namespace = opts.namespace ?? "default"; + + this.intervalInSeconds = opts.intervalInSeconds ?? 60; + this.maxPendingRuns = opts.maxPendingRuns ?? 25; + this.maxPendingIndeces = opts.maxPendingIndeces ?? 10; + this.maxPendingErrors = opts.maxPendingErrors ?? 
10; + + this.k8sClient = this.#createK8sClient(); + } + + #createK8sClient() { + const kubeConfig = new k8s.KubeConfig(); + + if (this.opts.runtimeEnv === "local") { + kubeConfig.loadFromDefault(); + } else if (this.opts.runtimeEnv === "kubernetes") { + kubeConfig.loadFromCluster(); + } else { + throw new Error(`Unsupported runtime environment: ${this.opts.runtimeEnv}`); + } + + return { + core: kubeConfig.makeApiClient(k8s.CoreV1Api), + kubeConfig: kubeConfig, + }; + } + + #isRecord(candidate: unknown): candidate is Record { + if (typeof candidate !== "object" || candidate === null) { + return false; + } else { + return true; + } + } + + #logK8sError(err: unknown, debugOnly = false) { + if (debugOnly) { + this.logger.debug("K8s API Error", err); + } else { + this.logger.error("K8s API Error", err); + } + } + + #handleK8sError(err: unknown) { + if (!this.#isRecord(err) || !this.#isRecord(err.body)) { + this.#logK8sError(err); + return; + } + + this.#logK8sError(err, true); + + if (typeof err.body.message === "string") { + this.#logK8sError({ message: err.body.message }); + return; + } + + this.#logK8sError({ body: err.body }); + } + + async #getPods(opts: { + namespace: string; + fieldSelector?: string; + labelSelector?: string; + }): Promise | undefined> { + const listReturn = await this.k8sClient.core + .listNamespacedPod( + opts.namespace, + undefined, // pretty + undefined, // allowWatchBookmarks + undefined, // _continue + opts.fieldSelector, + opts.labelSelector, + this.maxPendingRuns * 2, // limit + undefined, // resourceVersion + undefined, // resourceVersionMatch + undefined, // sendInitialEvents + this.intervalInSeconds, // timeoutSeconds, + undefined // watch + ) + .catch(this.#handleK8sError.bind(this)); + + return listReturn?.body.items; + } + + async #getPendingIndeces(): Promise | undefined> { + return await this.#getPods({ + namespace: this.namespace, + fieldSelector: "status.phase=Pending", + labelSelector: "app=task-index", + }); + } + + async 
#getPendingTasks(): Promise | undefined> { + return await this.#getPods({ + namespace: this.namespace, + fieldSelector: "status.phase=Pending", + labelSelector: "app=task-run", + }); + } + + #countPods(pods: Array): number { + return pods.length; + } + + #filterPendingPods( + pods: Array, + waitingReason: "CreateContainerError" | "RunContainerError" + ): Array { + return pods.filter((pod) => { + const containerStatus = pod.status?.containerStatuses?.[0]; + return containerStatus?.state?.waiting?.reason === waitingReason; + }); + } + + async #sendPing() { + this.logger.log("Sending ping"); + + const start = Date.now(); + const controller = new AbortController(); + + const timeoutMs = (this.intervalInSeconds * 1000) / 2; + + const fetchTimeout = setTimeout(() => { + controller.abort(); + }, timeoutMs); + + try { + const response = await fetch(this.opts.pingUrl, { + signal: controller.signal, + }); + + if (!response.ok) { + this.logger.error("Failed to send ping, response not OK", { + status: response.status, + }); + return; + } + + const elapsedMs = Date.now() - start; + this.logger.log("Ping sent", { elapsedMs }); + } catch (error) { + if (error instanceof DOMException && error.name === "AbortError") { + this.logger.log("Ping timeout", { timeoutSeconds: timeoutMs }); + return; + } + + this.logger.error("Failed to send ping", error); + } finally { + clearTimeout(fetchTimeout); + } + } + + async #heartbeat() { + this.logger.log("Performing heartbeat"); + + const start = Date.now(); + + const pendingTasks = await this.#getPendingTasks(); + + if (!pendingTasks) { + this.logger.error("Failed to get pending tasks"); + return; + } + + const totalPendingTasks = this.#countPods(pendingTasks); + + const pendingIndeces = await this.#getPendingIndeces(); + + if (!pendingIndeces) { + this.logger.error("Failed to get pending indeces"); + return; + } + + const totalPendingIndeces = this.#countPods(pendingIndeces); + + const elapsedMs = Date.now() - start; + + 
this.logger.log("Finished heartbeat checks", { elapsedMs }); + + if (totalPendingTasks > this.maxPendingRuns) { + this.logger.log("Too many pending tasks, skipping heartbeat", { totalPendingTasks }); + return; + } + + if (totalPendingIndeces > this.maxPendingIndeces) { + this.logger.log("Too many pending indeces, skipping heartbeat", { totalPendingIndeces }); + return; + } + + const totalCreateContainerErrors = this.#countPods( + this.#filterPendingPods(pendingTasks, "CreateContainerError") + ); + const totalRunContainerErrors = this.#countPods( + this.#filterPendingPods(pendingTasks, "RunContainerError") + ); + + if (totalCreateContainerErrors + totalRunContainerErrors > this.maxPendingErrors) { + this.logger.log("Too many pending tasks with errors, skipping heartbeat", { + totalRunContainerErrors, + totalCreateContainerErrors, + }); + return; + } + + await this.#sendPing(); + + this.logger.log("Heartbeat done", { totalPendingTasks, elapsedMs }); + } + + async start() { + this.enabled = true; + this.logger.log("Starting"); + + if (this.leadingEdge) { + await this.#heartbeat(); + } + + const heartbeat = setInterval(async () => { + if (!this.enabled) { + clearInterval(heartbeat); + return; + } + + try { + await this.#heartbeat(); + } catch (error) { + this.logger.error("Error while heartbeating", error); + } + }, this.intervalInSeconds * 1000); + } + + async stop() { + if (!this.enabled) { + return; + } + + this.enabled = false; + this.logger.log("Shutting down.."); + } +} diff --git a/apps/kubernetes-provider/tsconfig.json b/apps/kubernetes-provider/tsconfig.json new file mode 100644 index 00000000000..6ec7865b64e --- /dev/null +++ b/apps/kubernetes-provider/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "target": "es2020", + "module": "commonjs", + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "strict": true, + "skipLibCheck": true, + "paths": { + "@trigger.dev/core": ["../../packages/core/src"], + 
"@trigger.dev/core/*": ["../../packages/core/src/*"], + "@trigger.dev/core/v3": ["../../packages/core/src/v3"], + "@trigger.dev/core/v3/*": ["../../packages/core/src/v3/*"] + } + } +} diff --git a/apps/supervisor/.env.example b/apps/supervisor/.env.example new file mode 100644 index 00000000000..5cb86d5a331 --- /dev/null +++ b/apps/supervisor/.env.example @@ -0,0 +1,17 @@ +# This needs to match the token of the worker group you want to connect to +TRIGGER_WORKER_TOKEN= + +# This needs to match the MANAGED_WORKER_SECRET env var on the webapp +MANAGED_WORKER_SECRET=managed-secret + +# Point this at the webapp in prod +TRIGGER_API_URL=http://localhost:3030 + +# Point this at the webapp or an OTel collector in prod +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:3030/otel +# Use this on macOS +# OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:3030/otel + +# Optional settings +DEBUG=1 +TRIGGER_DEQUEUE_INTERVAL_MS=1000 \ No newline at end of file diff --git a/apps/supervisor/.nvmrc b/apps/supervisor/.nvmrc new file mode 100644 index 00000000000..dc0bb0f4398 --- /dev/null +++ b/apps/supervisor/.nvmrc @@ -0,0 +1 @@ +v22.12.0 diff --git a/apps/supervisor/CLAUDE.md b/apps/supervisor/CLAUDE.md new file mode 100644 index 00000000000..ded836c6069 --- /dev/null +++ b/apps/supervisor/CLAUDE.md @@ -0,0 +1,20 @@ +# Supervisor + +Node.js app that manages task execution containers. Receives work from the platform, starts Docker/Kubernetes containers, monitors execution, and reports results. 
+ +## Key Directories + +- `src/services/` - Core service logic +- `src/workloadManager/` - Container orchestration abstraction (Docker or Kubernetes) +- `src/workloadServer/` - HTTP server for workload communication (heartbeats, snapshots) +- `src/clients/` - Platform communication (webapp/coordinator) +- `src/env.ts` - Environment configuration + +## Architecture + +- **WorkloadManager**: Abstracts Docker vs Kubernetes execution +- **SupervisorSession**: Manages the dequeue loop with EWMA-based dynamic scaling +- **ResourceMonitor**: Tracks CPU/memory during execution +- **PodCleaner/FailedPodHandler**: Kubernetes-specific cleanup + +Communicates with the platform via Socket.io and HTTP. Receives task assignments through the dequeue protocol from the webapp. diff --git a/apps/supervisor/Containerfile b/apps/supervisor/Containerfile new file mode 100644 index 00000000000..d5bb5862e96 --- /dev/null +++ b/apps/supervisor/Containerfile @@ -0,0 +1,54 @@ +FROM node:22-alpine@sha256:9bef0ef1e268f60627da9ba7d7605e8831d5b56ad07487d24d1aa386336d1944 AS node-22-alpine + +WORKDIR /app + +FROM node-22-alpine AS pruner + +COPY --chown=node:node . . +RUN npx -q turbo@2.5.4 prune --scope=supervisor --docker + +FROM node-22-alpine AS base + +RUN apk add --no-cache dumb-init + +COPY --chown=node:node .gitignore .gitignore +COPY --from=pruner --chown=node:node /app/out/json/ . 
+COPY --from=pruner --chown=node:node /app/out/pnpm-lock.yaml ./pnpm-lock.yaml +COPY --from=pruner --chown=node:node /app/out/pnpm-workspace.yaml ./pnpm-workspace.yaml + +RUN corepack enable && corepack prepare pnpm@10.23.0 --activate + +FROM base AS deps-fetcher +RUN apk add --no-cache python3-dev py3-setuptools make g++ gcc linux-headers +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm fetch --frozen-lockfile + +FROM deps-fetcher AS dev-deps +ENV NODE_ENV development + +RUN --mount=type=cache,id=pnpm,target=/root/.local/share/pnpm/store pnpm install --frozen-lockfile --offline --ignore-scripts + +FROM base AS builder + +COPY --from=pruner --chown=node:node /app/out/full/ . +COPY --from=dev-deps --chown=node:node /app/ . +COPY --chown=node:node turbo.json turbo.json +COPY --chown=node:node .configs/tsconfig.base.json .configs/tsconfig.base.json +COPY --chown=node:node scripts/updateVersion.ts scripts/updateVersion.ts + +RUN pnpm run generate && \ + pnpm run --filter supervisor... build&& \ + pnpm deploy --legacy --filter=supervisor --prod /prod/supervisor + +FROM base AS runner + +ENV NODE_ENV production + +COPY --from=builder /prod/supervisor /app/apps/supervisor + +EXPOSE 8000 +USER node + +# ensure pnpm is installed during build and not silently downloaded at runtime +RUN pnpm -v + +CMD [ "/usr/bin/dumb-init", "--", "pnpm", "run", "--filter", "supervisor", "start"] diff --git a/apps/supervisor/README.md b/apps/supervisor/README.md new file mode 100644 index 00000000000..86b447269d2 --- /dev/null +++ b/apps/supervisor/README.md @@ -0,0 +1,105 @@ +# Supervisor + +## Dev setup + +1. Create a worker group + +```sh +api_url=http://localhost:3030 +wg_name=my-worker + +# edit this +admin_pat=tr_pat_... 
+ +curl -sS \ + -X POST \ + "$api_url/admin/api/v1/workers" \ + -H "Authorization: Bearer $admin_pat" \ + -H "Content-Type: application/json" \ + -d "{\"name\": \"$wg_name\"}" +``` + +If the worker group is newly created, the response will include a `token` field. If the group already exists, no token is returned. + +2. Create `.env` and set the worker token + +```sh +cp .env.example .env + +# Then edit your .env and set this to the token.plaintext value +TRIGGER_WORKER_TOKEN=tr_wgt_... +``` + +3. Start the supervisor + +```sh +pnpm dev +``` + +4. Build CLI, then deploy a test project + +```sh +pnpm exec trigger deploy --self-hosted + +# The additional network flag is required on linux +pnpm exec trigger deploy --self-hosted --network host +``` + +## Worker group management + +### Shared variables + +```sh +api_url=http://localhost:3030 +admin_pat=tr_pat_... # edit this +``` + +- These are used by all commands + +### Create a worker group + +```sh +wg_name=my-worker + +curl -sS \ + -X POST \ + "$api_url/admin/api/v1/workers" \ + -H "Authorization: Bearer $admin_pat" \ + -H "Content-Type: application/json" \ + -d "{\"name\": \"$wg_name\"}" +``` + +- If the worker group already exists, no token will be returned + +### Set a worker group as default for a project + +```sh +wg_name=my-worker +project_id=clsw6q8wz... + +curl -sS \ + -X POST \ + "$api_url/admin/api/v1/workers" \ + -H "Authorization: Bearer $admin_pat" \ + -H "Content-Type: application/json" \ + -d "{\"name\": \"$wg_name\", \"projectId\": \"$project_id\", \"makeDefaultForProject\": true}" +``` + +- If the worker group doesn't exist, yet it will be created +- If the worker group already exists, it will be attached to the project as default. No token will be returned. + +### Remove the default worker group from a project + +```sh +project_id=clsw6q8wz... 
+ +curl -sS \ + -X POST \ + "$api_url/admin/api/v1/workers" \ + -H "Authorization: Bearer $admin_pat" \ + -H "Content-Type: application/json" \ + -d "{\"projectId\": \"$project_id\", \"removeDefaultFromProject\": true}" +``` + +- The project will then use the global default again +- When `removeDefaultFromProject: true` no other actions will be performed diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json new file mode 100644 index 00000000000..7456d421850 --- /dev/null +++ b/apps/supervisor/package.json @@ -0,0 +1,30 @@ +{ + "name": "supervisor", + "private": true, + "version": "0.0.1", + "main": "dist/index.js", + "type": "module", + "scripts": { + "build": "tsc", + "dev": "tsx --require dotenv/config --watch src/index.ts || (echo '!! Remember to run: nvm use'; exit 1)", + "start": "node dist/index.js", + "test:run": "vitest --no-file-parallelism --run", + "test:watch": "vitest --no-file-parallelism", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "@aws-sdk/client-ecr": "^3.839.0", + "@internal/compute": "workspace:*", + "@kubernetes/client-node": "^1.0.0", + "@trigger.dev/core": "workspace:*", + "dockerode": "^4.0.6", + "p-limit": "^6.2.0", + "prom-client": "^15.1.0", + "socket.io": "4.7.4", + "std-env": "^3.8.0", + "zod": "3.25.76" + }, + "devDependencies": { + "@types/dockerode": "^3.3.33" + } +} diff --git a/apps/supervisor/src/clients/kubernetes.ts b/apps/supervisor/src/clients/kubernetes.ts new file mode 100644 index 00000000000..f66e57e4353 --- /dev/null +++ b/apps/supervisor/src/clients/kubernetes.ts @@ -0,0 +1,55 @@ +import * as k8s from "@kubernetes/client-node"; +import { Informer } from "@kubernetes/client-node"; +import { ListPromise } from "@kubernetes/client-node"; +import { KubernetesObject } from "@kubernetes/client-node"; +import { assertExhaustive } from "@trigger.dev/core/utils"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +export const RUNTIME_ENV = 
process.env.KUBERNETES_PORT ? "kubernetes" : "local"; + +const logger = new SimpleStructuredLogger("kubernetes-client"); + +export function createK8sApi() { + const kubeConfig = getKubeConfig(); + + function makeInformer( + path: string, + listPromiseFn: ListPromise, + labelSelector?: string, + fieldSelector?: string + ): Informer { + return k8s.makeInformer(kubeConfig, path, listPromiseFn, labelSelector, fieldSelector); + } + + const api = { + core: kubeConfig.makeApiClient(k8s.CoreV1Api), + batch: kubeConfig.makeApiClient(k8s.BatchV1Api), + apps: kubeConfig.makeApiClient(k8s.AppsV1Api), + makeInformer, + }; + + return api; +} + +export type K8sApi = ReturnType; + +function getKubeConfig() { + logger.debug("getKubeConfig()", { RUNTIME_ENV }); + + const kubeConfig = new k8s.KubeConfig(); + + switch (RUNTIME_ENV) { + case "local": + kubeConfig.loadFromDefault(); + break; + case "kubernetes": + kubeConfig.loadFromCluster(); + break; + default: + assertExhaustive(RUNTIME_ENV); + } + + return kubeConfig; +} + +export { k8s }; diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts new file mode 100644 index 00000000000..b69fb24d73f --- /dev/null +++ b/apps/supervisor/src/env.ts @@ -0,0 +1,269 @@ +import { randomUUID } from "crypto"; +import { env as stdEnv } from "std-env"; +import { z } from "zod"; +import { AdditionalEnvVars, BoolEnv } from "./envUtil.js"; + +const Env = z + .object({ + // This will come from `spec.nodeName` in k8s + TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), + TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), + + // Required settings + TRIGGER_API_URL: z.string().url(), + TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file + MANAGED_WORKER_SECRET: z.string(), + OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners + + // Workload API settings (coordinator mode) - the workload API is what the run controller connects to + TRIGGER_WORKLOAD_API_ENABLED: 
BoolEnv.default(true), + TRIGGER_WORKLOAD_API_PROTOCOL: z + .string() + .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) + .default("http"), + TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default + TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), + TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on + TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller + + // Runner settings + RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) + RUNNER_PRETTY_LOGS: BoolEnv.default(false), + + // Dequeue settings (provider mode) + TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), + TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), + TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), + TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), + TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), + TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds + TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds + TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) + TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) + TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for 
metrics processing (ms) + TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) + + // Optional services + TRIGGER_WARM_START_URL: z.string().optional(), + TRIGGER_CHECKPOINT_URL: z.string().optional(), + TRIGGER_METADATA_URL: z.string().optional(), + + // Used by the resource monitor + RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), + RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), + RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), + + // Docker settings + DOCKER_API_VERSION: z.string().optional(), + DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 + DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), + DOCKER_REGISTRY_USERNAME: z.string().optional(), + DOCKER_REGISTRY_PASSWORD: z.string().optional(), + DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 + DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), + DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), + /** + * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. + * Any other value is taken as a custom network's name to which all runners should connect to. + * + * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. + * + * **WARNING**: Specifying multiple networks will slightly increase startup times. 
+ * + * @default "host" + */ + DOCKER_RUNNER_NETWORKS: z.string().default("host"), + + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + COMPUTE_TRACE_SPANS_ENABLED: BoolEnv.default(true), + COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset) + COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), + COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10), + + // Kubernetes settings + KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), + KUBERNETES_NAMESPACE: z.string().default("default"), + KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), + KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv + KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), + KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), + KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), + KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), + KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit + KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), + KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit + + // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO + KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + 
KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO + KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB + KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods + // Large machine affinity settings - large-* presets prefer a dedicated pool + KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("large-machines"), + KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(100), + + // Project affinity settings - pods from the same project prefer the same node + KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), + KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z + .string() + .trim() + .min(1) + 
.default("kubernetes.io/hostname"), + + // Schedule affinity settings - runs from schedule trees prefer a dedicated pool + KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("scheduled-runs"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), + KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT: z.coerce + .number() + .int() + .min(1) + .max(100) + .default(20), + + // Schedule toleration settings - scheduled runs tolerate taints on the dedicated pool + // Comma-separated list of tolerations in the format: key=value:effect + // For Exists operator (no value): key:effect + KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z + .string() + .transform((val, ctx) => { + const tolerations = val + .split(",") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0) + .map((entry) => { + const colonIdx = entry.lastIndexOf(":"); + if (colonIdx === -1) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (missing effect): "${entry}"`, + }); + return z.NEVER; + } + + const effect = entry.slice(colonIdx + 1); + const validEffects = ["NoSchedule", "NoExecute", "PreferNoSchedule"]; + if (!validEffects.includes(effect)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join(", ")}`, + }); + return z.NEVER; + } + + const keyValue = entry.slice(0, colonIdx); + const eqIdx = keyValue.indexOf("="); + const key = eqIdx === -1 ? 
keyValue : keyValue.slice(0, eqIdx); + + if (!key) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (empty key): "${entry}"`, + }); + return z.NEVER; + } + + if (eqIdx === -1) { + return { key, operator: "Exists" as const, effect }; + } + + return { + key, + operator: "Equal" as const, + value: keyValue.slice(eqIdx + 1), + effect, + }; + }); + + return tolerations; + }) + .optional(), + + // Placement tags settings + PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), + PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), + + // Metrics + METRICS_ENABLED: BoolEnv.default(true), + METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), + METRICS_HOST: z.string().default("127.0.0.1"), + METRICS_PORT: z.coerce.number().int().default(9090), + + // Pod cleaner + POD_CLEANER_ENABLED: BoolEnv.default(true), + POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), + POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), + + // Failed pod handler + FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), + FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), + + // Debug + DEBUG: BoolEnv.default(false), + SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), + }) + .superRefine((data, ctx) => { + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_METADATA_URL is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_METADATA_URL"], + }); + } + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_WORKLOAD_API_DOMAIN) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_WORKLOAD_API_DOMAIN is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_WORKLOAD_API_DOMAIN"], + }); + } + }) + .transform((data) => ({ + ...data, + COMPUTE_TRACE_OTLP_ENDPOINT: data.COMPUTE_TRACE_OTLP_ENDPOINT ?? 
`${data.TRIGGER_API_URL}/otel`, + })); + +export const env = Env.parse(stdEnv); diff --git a/apps/supervisor/src/envUtil.test.ts b/apps/supervisor/src/envUtil.test.ts new file mode 100644 index 00000000000..c3d35758f16 --- /dev/null +++ b/apps/supervisor/src/envUtil.test.ts @@ -0,0 +1,80 @@ +import { describe, it, expect } from "vitest"; +import { BoolEnv, AdditionalEnvVars } from "./envUtil.js"; + +describe("BoolEnv", () => { + it("should parse string 'true' as true", () => { + expect(BoolEnv.parse("true")).toBe(true); + expect(BoolEnv.parse("TRUE")).toBe(true); + expect(BoolEnv.parse("True")).toBe(true); + }); + + it("should parse string '1' as true", () => { + expect(BoolEnv.parse("1")).toBe(true); + }); + + it("should parse string 'false' as false", () => { + expect(BoolEnv.parse("false")).toBe(false); + expect(BoolEnv.parse("FALSE")).toBe(false); + expect(BoolEnv.parse("False")).toBe(false); + }); + + it("should handle whitespace", () => { + expect(BoolEnv.parse(" true ")).toBe(true); + expect(BoolEnv.parse(" 1 ")).toBe(true); + }); + + it("should pass through boolean values", () => { + expect(BoolEnv.parse(true)).toBe(true); + expect(BoolEnv.parse(false)).toBe(false); + }); + + it("should return false for invalid inputs", () => { + expect(BoolEnv.parse("invalid")).toBe(false); + expect(BoolEnv.parse("")).toBe(false); + }); +}); + +describe("AdditionalEnvVars", () => { + it("should parse single key-value pair", () => { + expect(AdditionalEnvVars.parse("FOO=bar")).toEqual({ FOO: "bar" }); + }); + + it("should parse multiple key-value pairs", () => { + expect(AdditionalEnvVars.parse("FOO=bar,BAZ=qux")).toEqual({ + FOO: "bar", + BAZ: "qux", + }); + }); + + it("should handle whitespace", () => { + expect(AdditionalEnvVars.parse(" FOO = bar , BAZ = qux ")).toEqual({ + FOO: "bar", + BAZ: "qux", + }); + }); + + it("should return undefined for empty string", () => { + expect(AdditionalEnvVars.parse("")).toBeUndefined(); + }); + + it("should return undefined for 
invalid format", () => { + expect(AdditionalEnvVars.parse("invalid")).toBeUndefined(); + }); + + it("should skip invalid pairs but include valid ones", () => { + expect(AdditionalEnvVars.parse("FOO=bar,INVALID,BAZ=qux")).toEqual({ + FOO: "bar", + BAZ: "qux", + }); + }); + + it("should pass through undefined", () => { + expect(AdditionalEnvVars.parse(undefined)).toBeUndefined(); + }); + + it("should handle empty values", () => { + expect(AdditionalEnvVars.parse("FOO=,BAR=value")).toEqual({ + BAR: "value", + }); + }); +}); diff --git a/apps/supervisor/src/envUtil.ts b/apps/supervisor/src/envUtil.ts new file mode 100644 index 00000000000..917f984cc37 --- /dev/null +++ b/apps/supervisor/src/envUtil.ts @@ -0,0 +1,47 @@ +import { z } from "zod"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +const logger = new SimpleStructuredLogger("env-util"); + +const baseBoolEnv = z.preprocess((val) => { + if (typeof val !== "string") { + return val; + } + + return ["true", "1"].includes(val.toLowerCase().trim()); +}, z.boolean()); + +// Create a type-safe version that only accepts boolean defaults +export const BoolEnv = baseBoolEnv as Omit & { + default: (value: boolean) => z.ZodDefault; +}; + +export const AdditionalEnvVars = z.preprocess((val) => { + if (typeof val !== "string") { + return val; + } + + if (!val) { + return undefined; + } + + try { + const result = val.split(",").reduce( + (acc, pair) => { + const [key, value] = pair.split("="); + if (!key || !value) { + return acc; + } + acc[key.trim()] = value.trim(); + return acc; + }, + {} as Record + ); + + // Return undefined if no valid key-value pairs were found + return Object.keys(result).length === 0 ? 
undefined : result; + } catch (error) { + logger.warn("Failed to parse additional env vars", { error, val }); + return undefined; + } +}, z.record(z.string(), z.string()).optional()); diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts new file mode 100644 index 00000000000..6f5913c47ca --- /dev/null +++ b/apps/supervisor/src/index.ts @@ -0,0 +1,502 @@ +import { SupervisorSession } from "@trigger.dev/core/v3/workers"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { env } from "./env.js"; +import { WorkloadServer } from "./workloadServer/index.js"; +import type { WorkloadManagerOptions, WorkloadManager } from "./workloadManager/types.js"; +import Docker from "dockerode"; +import { z } from "zod"; +import { type DequeuedMessage } from "@trigger.dev/core/v3"; +import { + DockerResourceMonitor, + KubernetesResourceMonitor, + NoopResourceMonitor, + type ResourceMonitor, +} from "./resourceMonitor.js"; +import { KubernetesWorkloadManager } from "./workloadManager/kubernetes.js"; +import { DockerWorkloadManager } from "./workloadManager/docker.js"; +import { ComputeWorkloadManager } from "./workloadManager/compute.js"; +import { + HttpServer, + CheckpointClient, + isKubernetesEnvironment, +} from "@trigger.dev/core/v3/serverOnly"; +import { createK8sApi } from "./clients/kubernetes.js"; +import { collectDefaultMetrics } from "prom-client"; +import { register } from "./metrics.js"; +import { PodCleaner } from "./services/podCleaner.js"; +import { FailedPodHandler } from "./services/failedPodHandler.js"; +import { getWorkerToken } from "./workerToken.js"; +import { OtlpTraceService } from "./services/otlpTraceService.js"; +import { extractTraceparent, getRestoreRunnerId } from "./util.js"; + +if (env.METRICS_COLLECT_DEFAULTS) { + collectDefaultMetrics({ register }); +} + +class ManagedSupervisor { + private readonly workerSession: SupervisorSession; + private readonly metricsServer?: HttpServer; + private 
readonly workloadServer: WorkloadServer; + private readonly workloadManager: WorkloadManager; + private readonly computeManager?: ComputeWorkloadManager; + private readonly logger = new SimpleStructuredLogger("managed-supervisor"); + private readonly resourceMonitor: ResourceMonitor; + private readonly checkpointClient?: CheckpointClient; + + private readonly podCleaner?: PodCleaner; + private readonly failedPodHandler?: FailedPodHandler; + private readonly tracing?: OtlpTraceService; + + private readonly isKubernetes = isKubernetesEnvironment(env.KUBERNETES_FORCE_ENABLED); + private readonly warmStartUrl = env.TRIGGER_WARM_START_URL; + + constructor() { + const { + TRIGGER_WORKER_TOKEN, + MANAGED_WORKER_SECRET, + COMPUTE_GATEWAY_AUTH_TOKEN, + ...envWithoutSecrets + } = env; + + if (env.DEBUG) { + this.logger.debug("Starting up", { envWithoutSecrets }); + } + + if (this.warmStartUrl) { + this.logger.log("🔥 Warm starts enabled", { + warmStartUrl: this.warmStartUrl, + }); + } + + const workloadManagerOptions = { + workloadApiProtocol: env.TRIGGER_WORKLOAD_API_PROTOCOL, + workloadApiDomain: env.TRIGGER_WORKLOAD_API_DOMAIN, + workloadApiPort: env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL, + warmStartUrl: this.warmStartUrl, + metadataUrl: env.TRIGGER_METADATA_URL, + imagePullSecrets: env.KUBERNETES_IMAGE_PULL_SECRETS?.split(","), + heartbeatIntervalSeconds: env.RUNNER_HEARTBEAT_INTERVAL_SECONDS, + snapshotPollIntervalSeconds: env.RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS, + additionalEnvVars: env.RUNNER_ADDITIONAL_ENV_VARS, + dockerAutoremove: env.DOCKER_AUTOREMOVE_EXITED_CONTAINERS, + } satisfies WorkloadManagerOptions; + + this.resourceMonitor = env.RESOURCE_MONITOR_ENABLED + ? this.isKubernetes + ? 
new KubernetesResourceMonitor(createK8sApi(), env.TRIGGER_WORKER_INSTANCE_NAME) + : new DockerResourceMonitor(new Docker()) + : new NoopResourceMonitor(); + + if (env.COMPUTE_GATEWAY_URL) { + if (!env.TRIGGER_WORKLOAD_API_DOMAIN) { + throw new Error("TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create compute manager"); + } + + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + + if (env.COMPUTE_TRACE_SPANS_ENABLED) { + this.tracing = new OtlpTraceService({ + endpointUrl: env.COMPUTE_TRACE_OTLP_ENDPOINT, + }); + } + + const computeManager = new ComputeWorkloadManager({ + ...workloadManagerOptions, + gateway: { + url: env.COMPUTE_GATEWAY_URL, + authToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + timeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, + }, + snapshots: { + enabled: env.COMPUTE_SNAPSHOTS_ENABLED, + delayMs: env.COMPUTE_SNAPSHOT_DELAY_MS, + dispatchLimit: env.COMPUTE_SNAPSHOT_DISPATCH_LIMIT, + callbackUrl, + }, + tracing: this.tracing, + runner: { + instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT, + prettyLogs: env.RUNNER_PRETTY_LOGS, + }, + }); + this.computeManager = computeManager; + this.workloadManager = computeManager; + } else { + this.workloadManager = this.isKubernetes + ? 
new KubernetesWorkloadManager(workloadManagerOptions) + : new DockerWorkloadManager(workloadManagerOptions); + } + + if (this.isKubernetes) { + if (env.POD_CLEANER_ENABLED) { + this.logger.log("🧹 Pod cleaner enabled", { + namespace: env.KUBERNETES_NAMESPACE, + batchSize: env.POD_CLEANER_BATCH_SIZE, + intervalMs: env.POD_CLEANER_INTERVAL_MS, + }); + this.podCleaner = new PodCleaner({ + register, + namespace: env.KUBERNETES_NAMESPACE, + batchSize: env.POD_CLEANER_BATCH_SIZE, + intervalMs: env.POD_CLEANER_INTERVAL_MS, + }); + } else { + this.logger.warn("Pod cleaner disabled"); + } + + if (env.FAILED_POD_HANDLER_ENABLED) { + this.logger.log("🔁 Failed pod handler enabled", { + namespace: env.KUBERNETES_NAMESPACE, + reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS, + }); + this.failedPodHandler = new FailedPodHandler({ + register, + namespace: env.KUBERNETES_NAMESPACE, + reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS, + }); + } else { + this.logger.warn("Failed pod handler disabled"); + } + } + + if (env.TRIGGER_DEQUEUE_INTERVAL_MS > env.TRIGGER_DEQUEUE_IDLE_INTERVAL_MS) { + this.logger.warn( + `⚠️ TRIGGER_DEQUEUE_INTERVAL_MS (${env.TRIGGER_DEQUEUE_INTERVAL_MS}) is greater than TRIGGER_DEQUEUE_IDLE_INTERVAL_MS (${env.TRIGGER_DEQUEUE_IDLE_INTERVAL_MS}) - did you mix them up?` + ); + } + + this.workerSession = new SupervisorSession({ + workerToken: getWorkerToken(), + apiUrl: env.TRIGGER_API_URL, + instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + managedWorkerSecret: env.MANAGED_WORKER_SECRET, + dequeueIntervalMs: env.TRIGGER_DEQUEUE_INTERVAL_MS, + dequeueIdleIntervalMs: env.TRIGGER_DEQUEUE_IDLE_INTERVAL_MS, + queueConsumerEnabled: env.TRIGGER_DEQUEUE_ENABLED, + maxRunCount: env.TRIGGER_DEQUEUE_MAX_RUN_COUNT, + metricsRegistry: register, + scaling: { + strategy: env.TRIGGER_DEQUEUE_SCALING_STRATEGY, + minConsumerCount: env.TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT, + maxConsumerCount: env.TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT, + 
scaleUpCooldownMs: env.TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS, + scaleDownCooldownMs: env.TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS, + targetRatio: env.TRIGGER_DEQUEUE_SCALING_TARGET_RATIO, + ewmaAlpha: env.TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA, + batchWindowMs: env.TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS, + dampingFactor: env.TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR, + }, + runNotificationsEnabled: env.TRIGGER_WORKLOAD_API_ENABLED, + heartbeatIntervalSeconds: env.TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS, + sendRunDebugLogs: env.SEND_RUN_DEBUG_LOGS, + preDequeue: async () => { + if (!env.RESOURCE_MONITOR_ENABLED) { + return {}; + } + + if (this.isKubernetes) { + // Not used in k8s for now + return {}; + } + + const resources = await this.resourceMonitor.getNodeResources(); + + return { + maxResources: { + cpu: resources.cpuAvailable, + memory: resources.memoryAvailable, + }, + skipDequeue: resources.cpuAvailable < 0.25 || resources.memoryAvailable < 0.25, + }; + }, + preSkip: async () => { + // When the node is full, it should still try to warm start runs + // await this.tryWarmStartAllThisNode(); + }, + }); + + if (env.TRIGGER_CHECKPOINT_URL) { + this.logger.log("🥶 Checkpoints enabled", { + checkpointUrl: env.TRIGGER_CHECKPOINT_URL, + }); + + this.checkpointClient = new CheckpointClient({ + apiUrl: new URL(env.TRIGGER_CHECKPOINT_URL), + workerClient: this.workerSession.httpClient, + orchestrator: this.isKubernetes ? 
"KUBERNETES" : "DOCKER", + }); + } + + this.workerSession.on("runNotification", async ({ time, run }) => { + this.logger.verbose("runNotification", { time, run }); + + this.workloadServer.notifyRun({ run }); + }); + + this.workerSession.on( + "runQueueMessage", + async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { + this.logger.verbose(`Received message with timestamp ${time.toLocaleString()}`, message); + + if (message.completedWaitpoints.length > 0) { + this.logger.debug("Run has completed waitpoints", { + runId: message.run.id, + completedWaitpoints: message.completedWaitpoints.length, + }); + } + + if (!message.image) { + this.logger.error("Run has no image", { runId: message.run.id }); + return; + } + + const { checkpoint, ...rest } = message; + + // Register trace context early so snapshot spans work for all paths + // (cold create, restore, warm start). Re-registration on restore is safe + // since dequeue always provides fresh context. + if (this.computeManager?.traceSpansEnabled) { + const traceparent = extractTraceparent(message.run.traceContext); + + if (traceparent) { + this.workloadServer.registerRunTraceContext(message.run.friendlyId, { + traceparent, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + }); + } + } + + if (checkpoint) { + this.logger.debug("Restoring run", { runId: message.run.id }); + + if (this.computeManager) { + try { + const runnerId = getRestoreRunnerId(message.run.friendlyId, checkpoint.id); + + const didRestore = await this.computeManager.restore({ + snapshotId: checkpoint.location, + runnerId, + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + machine: message.run.machine, + traceContext: message.run.traceContext, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + dequeuedAt: message.dequeuedAt, + }); + + if (didRestore) { + this.logger.debug("Compute restore 
successful", { + runId: message.run.id, + runnerId, + }); + } else { + this.logger.error("Compute restore failed", { runId: message.run.id, runnerId }); + } + } catch (error) { + this.logger.error("Failed to restore run (compute)", { error }); + } + + return; + } + + if (!this.checkpointClient) { + this.logger.error("No checkpoint client", { runId: message.run.id }); + return; + } + + try { + const didRestore = await this.checkpointClient.restoreRun({ + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + body: { + ...rest, + checkpoint, + }, + }); + + if (didRestore) { + this.logger.debug("Restore successful", { runId: message.run.id }); + } else { + this.logger.error("Restore failed", { runId: message.run.id }); + } + } catch (error) { + this.logger.error("Failed to restore run", { error }); + } + + return; + } + + this.logger.debug("Scheduling run", { runId: message.run.id }); + + const warmStartStart = performance.now(); + const didWarmStart = await this.tryWarmStart(message); + const warmStartCheckMs = Math.round(performance.now() - warmStartStart); + + if (didWarmStart) { + this.logger.debug("Warm start successful", { runId: message.run.id }); + return; + } + + try { + if (!message.deployment.friendlyId) { + // mostly a type guard, deployments always exists for deployed environments + // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. 
+ throw new Error("Deployment is missing"); + } + + await this.workloadManager.create({ + dequeuedAt: message.dequeuedAt, + dequeueResponseMs, + pollingIntervalMs, + warmStartCheckMs, + envId: message.environment.id, + envType: message.environment.type, + image: message.image, + machine: message.run.machine, + orgId: message.organization.id, + projectId: message.project.id, + deploymentFriendlyId: message.deployment.friendlyId, + deploymentVersion: message.backgroundWorker.version, + runId: message.run.id, + runFriendlyId: message.run.friendlyId, + version: message.version, + nextAttemptNumber: message.run.attemptNumber, + snapshotId: message.snapshot.id, + snapshotFriendlyId: message.snapshot.friendlyId, + placementTags: message.placementTags, + traceContext: message.run.traceContext, + annotations: message.run.annotations, + hasPrivateLink: message.organization.hasPrivateLink, + }); + + // Disabled for now + // this.resourceMonitor.blockResources({ + // cpu: message.run.machine.cpu, + // memory: message.run.machine.memory, + // }); + } catch (error) { + this.logger.error("Failed to create workload", { error }); + } + } + ); + + if (env.METRICS_ENABLED) { + this.metricsServer = new HttpServer({ + port: env.METRICS_PORT, + host: env.METRICS_HOST, + metrics: { + register, + expose: true, + }, + }); + } + + // Responds to workload requests only + this.workloadServer = new WorkloadServer({ + port: env.TRIGGER_WORKLOAD_API_PORT_INTERNAL, + host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL, + workerClient: this.workerSession.httpClient, + checkpointClient: this.checkpointClient, + computeManager: this.computeManager, + tracing: this.tracing, + }); + + this.workloadServer.on("runConnected", this.onRunConnected.bind(this)); + this.workloadServer.on("runDisconnected", this.onRunDisconnected.bind(this)); + } + + async onRunConnected({ run }: { run: { friendlyId: string } }) { + this.logger.debug("Run connected", { run }); + 
this.workerSession.subscribeToRunNotifications([run.friendlyId]); + } + + async onRunDisconnected({ run }: { run: { friendlyId: string } }) { + this.logger.debug("Run disconnected", { run }); + this.workerSession.unsubscribeFromRunNotifications([run.friendlyId]); + } + + private async tryWarmStart(dequeuedMessage: DequeuedMessage): Promise { + if (!this.warmStartUrl) { + return false; + } + + const warmStartUrlWithPath = new URL("/warm-start", this.warmStartUrl); + + try { + const res = await fetch(warmStartUrlWithPath.href, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ dequeuedMessage }), + }); + + if (!res.ok) { + this.logger.error("Warm start failed", { + runId: dequeuedMessage.run.id, + }); + return false; + } + + const data = await res.json(); + const parsedData = z.object({ didWarmStart: z.boolean() }).safeParse(data); + + if (!parsedData.success) { + this.logger.error("Warm start response invalid", { + runId: dequeuedMessage.run.id, + data, + }); + return false; + } + + return parsedData.data.didWarmStart; + } catch (error) { + this.logger.error("Warm start error", { + runId: dequeuedMessage.run.id, + error, + }); + return false; + } + } + + async start() { + this.logger.log("Starting up"); + + // Optional services + await this.podCleaner?.start(); + await this.failedPodHandler?.start(); + await this.metricsServer?.start(); + + if (env.TRIGGER_WORKLOAD_API_ENABLED) { + this.logger.log("Workload API enabled", { + protocol: env.TRIGGER_WORKLOAD_API_PROTOCOL, + domain: env.TRIGGER_WORKLOAD_API_DOMAIN, + port: env.TRIGGER_WORKLOAD_API_PORT_INTERNAL, + }); + await this.workloadServer.start(); + } else { + this.logger.warn("Workload API disabled"); + } + + await this.workerSession.start(); + } + + async stop() { + this.logger.log("Shutting down"); + await this.workloadServer.stop(); + await this.workerSession.stop(); + + // Optional services + await this.podCleaner?.stop(); + await 
this.failedPodHandler?.stop(); + await this.metricsServer?.stop(); + } +} + +const worker = new ManagedSupervisor(); +worker.start(); diff --git a/apps/supervisor/src/metrics.ts b/apps/supervisor/src/metrics.ts new file mode 100644 index 00000000000..caec4861533 --- /dev/null +++ b/apps/supervisor/src/metrics.ts @@ -0,0 +1,3 @@ +import { Registry } from "prom-client"; + +export const register = new Registry(); diff --git a/apps/supervisor/src/resourceMonitor.ts b/apps/supervisor/src/resourceMonitor.ts new file mode 100644 index 00000000000..507a52bbf60 --- /dev/null +++ b/apps/supervisor/src/resourceMonitor.ts @@ -0,0 +1,278 @@ +import type Docker from "dockerode"; +import type { MachineResources } from "@trigger.dev/core/v3"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { env } from "./env.js"; +import type { K8sApi } from "./clients/kubernetes.js"; + +const logger = new SimpleStructuredLogger("resource-monitor"); + +interface NodeResources { + cpuTotal: number; // in cores + cpuAvailable: number; + memoryTotal: number; // in bytes + memoryAvailable: number; +} + +interface ResourceRequest { + cpu: number; // in cores + memory: number; // in bytes +} + +export abstract class ResourceMonitor { + protected cacheTimeoutMs = 5_000; + protected lastUpdateMs = 0; + + protected cachedResources: NodeResources = { + cpuTotal: 0, + cpuAvailable: 0, + memoryTotal: 0, + memoryAvailable: 0, + }; + + protected resourceParser: ResourceParser; + + constructor(Parser: new () => ResourceParser) { + this.resourceParser = new Parser(); + } + + abstract getNodeResources(fromCache?: boolean): Promise; + + blockResources(resources: MachineResources): void { + const { cpu, memory } = this.toResourceRequest(resources); + + logger.debug("[ResourceMonitor] Blocking resources", { + raw: resources, + converted: { cpu, memory }, + }); + + this.cachedResources.cpuAvailable -= cpu; + this.cachedResources.memoryAvailable -= memory; + } + + async 
wouldFit(request: ResourceRequest): Promise { + const resources = await this.getNodeResources(); + return resources.cpuAvailable >= request.cpu && resources.memoryAvailable >= request.memory; + } + + private toResourceRequest(resources: MachineResources): ResourceRequest { + return { + cpu: resources.cpu ?? 0, + memory: this.gbToBytes(resources.memory ?? 0), + }; + } + + private gbToBytes(gb: number): number { + return gb * 1024 * 1024 * 1024; + } + + protected isCacheValid(): boolean { + return this.cachedResources !== null && Date.now() - this.lastUpdateMs < this.cacheTimeoutMs; + } + + protected applyOverrides(resources: NodeResources): NodeResources { + if ( + !env.RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL && + !env.RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB + ) { + return resources; + } + + logger.debug("[ResourceMonitor] 🛡️ Applying resource overrides", { + cpuTotal: env.RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL, + memoryTotalGb: env.RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB, + }); + + const cpuTotal = env.RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL ?? resources.cpuTotal; + const memoryTotal = env.RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB + ? 
this.gbToBytes(env.RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB) + : resources.memoryTotal; + + const cpuDiff = cpuTotal - resources.cpuTotal; + const memoryDiff = memoryTotal - resources.memoryTotal; + + const cpuAvailable = Math.max(0, resources.cpuAvailable + cpuDiff); + const memoryAvailable = Math.max(0, resources.memoryAvailable + memoryDiff); + + return { + cpuTotal, + cpuAvailable, + memoryTotal, + memoryAvailable, + }; + } +} + +type SystemInfo = { + NCPU: number | undefined; + MemTotal: number | undefined; +}; + +export class DockerResourceMonitor extends ResourceMonitor { + private docker: Docker; + + constructor(docker: Docker) { + super(DockerResourceParser); + this.docker = docker; + } + + async getNodeResources(fromCache?: boolean): Promise { + if (this.isCacheValid() || fromCache) { + // logger.debug("[ResourceMonitor] Using cached resources"); + return this.cachedResources; + } + + const info: SystemInfo = await this.docker.info(); + const stats = await this.docker.listContainers({ all: true }); + + // Get system-wide resources + const cpuTotal = info.NCPU ?? 0; + const memoryTotal = info.MemTotal ?? 0; + + // Calculate used resources from running containers + let cpuUsed = 0; + let memoryUsed = 0; + + for (const container of stats) { + if (container.State === "running") { + const c = this.docker.getContainer(container.Id); + const { HostConfig } = await c.inspect(); + + const cpu = this.resourceParser.cpu(HostConfig.NanoCpus ?? 0); + const memory = this.resourceParser.memory(HostConfig.Memory ?? 
0); + + cpuUsed += cpu; + memoryUsed += memory; + } + } + + this.cachedResources = this.applyOverrides({ + cpuTotal, + cpuAvailable: cpuTotal - cpuUsed, + memoryTotal, + memoryAvailable: memoryTotal - memoryUsed, + }); + + this.lastUpdateMs = Date.now(); + + return this.cachedResources; + } +} + +export class KubernetesResourceMonitor extends ResourceMonitor { + private k8s: K8sApi; + private nodeName: string; + + constructor(k8s: K8sApi, nodeName: string) { + super(KubernetesResourceParser); + this.k8s = k8s; + this.nodeName = nodeName; + } + + async getNodeResources(fromCache?: boolean): Promise { + if (this.isCacheValid() || fromCache) { + logger.debug("[ResourceMonitor] Using cached resources"); + return this.cachedResources; + } + + const node = await this.k8s.core.readNode({ name: this.nodeName }); + const pods = await this.k8s.core.listPodForAllNamespaces({ + // TODO: ensure this includes all pods that consume resources + fieldSelector: `spec.nodeName=${this.nodeName},status.phase=Running`, + }); + + const allocatable = node.status?.allocatable; + const cpuTotal = this.resourceParser.cpu(allocatable?.cpu ?? "0"); + const memoryTotal = this.resourceParser.memory(allocatable?.memory ?? "0"); + + // Sum up resources requested by all pods on this node + let cpuRequested = 0; + let memoryRequested = 0; + + for (const pod of pods.items) { + if (pod.status?.phase === "Running") { + if (!pod.spec) { + continue; + } + + for (const container of pod.spec.containers) { + const resources = container.resources?.requests ?? {}; + cpuRequested += this.resourceParser.cpu(resources.cpu ?? "0"); + memoryRequested += this.resourceParser.memory(resources.memory ?? 
"0"); + } + } + } + + this.cachedResources = this.applyOverrides({ + cpuTotal, + cpuAvailable: cpuTotal - cpuRequested, + memoryTotal, + memoryAvailable: memoryTotal - memoryRequested, + }); + + this.lastUpdateMs = Date.now(); + + return this.cachedResources; + } +} + +export class NoopResourceMonitor extends ResourceMonitor { + constructor() { + super(NoopResourceParser); + } + + async getNodeResources(): Promise { + return { + cpuTotal: 0, + cpuAvailable: Infinity, + memoryTotal: 0, + memoryAvailable: Infinity, + }; + } +} + +abstract class ResourceParser { + abstract cpu(cpu: number | string): number; + abstract memory(memory: number | string): number; +} + +class DockerResourceParser extends ResourceParser { + cpu(cpu: number): number { + return cpu / 1e9; + } + + memory(memory: number): number { + return memory; + } +} + +class KubernetesResourceParser extends ResourceParser { + cpu(cpu: string): number { + if (cpu.endsWith("m")) { + return parseInt(cpu.slice(0, -1)) / 1000; + } + return parseInt(cpu); + } + + memory(memory: string): number { + if (memory.endsWith("Ki")) { + return parseInt(memory.slice(0, -2)) * 1024; + } + if (memory.endsWith("Mi")) { + return parseInt(memory.slice(0, -2)) * 1024 * 1024; + } + if (memory.endsWith("Gi")) { + return parseInt(memory.slice(0, -2)) * 1024 * 1024 * 1024; + } + return parseInt(memory); + } +} + +class NoopResourceParser extends ResourceParser { + cpu(cpu: number): number { + return cpu; + } + + memory(memory: number): number { + return memory; + } +} diff --git a/apps/supervisor/src/services/computeSnapshotService.ts b/apps/supervisor/src/services/computeSnapshotService.ts new file mode 100644 index 00000000000..7206f57fb73 --- /dev/null +++ b/apps/supervisor/src/services/computeSnapshotService.ts @@ -0,0 +1,240 @@ +import pLimit from "p-limit"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import type { 
SupervisorHttpClient } from "@trigger.dev/core/v3/workers"; +import { type SnapshotCallbackPayload } from "@internal/compute"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import { TimerWheel } from "./timerWheel.js"; +import type { OtlpTraceService } from "./otlpTraceService.js"; + +type DelayedSnapshot = { + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; +}; + +export type RunTraceContext = { + traceparent: string; + envId: string; + orgId: string; + projectId: string; +}; + +export type ComputeSnapshotServiceOptions = { + computeManager: ComputeWorkloadManager; + workerClient: SupervisorHttpClient; + tracing?: OtlpTraceService; +}; + +export class ComputeSnapshotService { + private readonly logger = new SimpleStructuredLogger("compute-snapshot-service"); + + private static readonly MAX_TRACE_CONTEXTS = 10_000; + private readonly runTraceContexts = new Map(); + private readonly timerWheel: TimerWheel; + private readonly dispatchLimit: ReturnType; + + private readonly computeManager: ComputeWorkloadManager; + private readonly workerClient: SupervisorHttpClient; + private readonly tracing?: OtlpTraceService; + + constructor(opts: ComputeSnapshotServiceOptions) { + this.computeManager = opts.computeManager; + this.workerClient = opts.workerClient; + this.tracing = opts.tracing; + + this.dispatchLimit = pLimit(this.computeManager.snapshotDispatchLimit); + this.timerWheel = new TimerWheel({ + delayMs: this.computeManager.snapshotDelayMs, + onExpire: (item) => { + this.dispatchLimit(() => this.dispatch(item.data)).catch((error) => { + this.logger.error("Snapshot dispatch failed", { + runId: item.data.runFriendlyId, + runnerId: item.data.runnerId, + error, + }); + }); + }, + }); + this.timerWheel.start(); + } + + /** Schedule a delayed snapshot for a run. Replaces any pending snapshot for the same run. 
*/ + schedule(runFriendlyId: string, data: DelayedSnapshot) { + this.timerWheel.submit(runFriendlyId, data); + this.logger.debug("Snapshot scheduled", { + runFriendlyId, + snapshotFriendlyId: data.snapshotFriendlyId, + delayMs: this.computeManager.snapshotDelayMs, + }); + } + + /** Cancel a pending delayed snapshot. Returns true if one was cancelled. */ + cancel(runFriendlyId: string): boolean { + const cancelled = this.timerWheel.cancel(runFriendlyId); + if (cancelled) { + this.logger.debug("Snapshot cancelled", { runFriendlyId }); + } + return cancelled; + } + + /** Handle the callback from the gateway after a snapshot completes or fails. */ + async handleCallback(body: SnapshotCallbackPayload) { + this.logger.debug("Snapshot callback", { + snapshotId: body.snapshot_id, + instanceId: body.instance_id, + status: body.status, + error: body.error, + metadata: body.metadata, + durationMs: body.duration_ms, + }); + + const runId = body.metadata?.runId; + const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; + + if (!runId || !snapshotFriendlyId) { + this.logger.error("Snapshot callback missing metadata", { body }); + return { ok: false as const, status: 400 }; + } + + this.#emitSnapshotSpan(runId, body.duration_ms, body.snapshot_id); + + if (body.status === "completed") { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: true, + checkpoint: { + type: "COMPUTE", + location: body.snapshot_id, + }, + }, + }); + + if (result.success) { + this.logger.debug("Suspend completion submitted", { + runId, + instanceId: body.instance_id, + snapshotId: body.snapshot_id, + }); + } else { + this.logger.error("Failed to submit suspend completion", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } else { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: false, + error: body.error ?? 
"Snapshot failed", + }, + }); + + if (!result.success) { + this.logger.error("Failed to submit suspend failure", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } + + return { ok: true as const, status: 200 }; + } + + registerTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + // Evict oldest entries if we've hit the cap. This is best-effort: on a busy + // supervisor, entries for long-lived runs may be evicted before their snapshot + // callback arrives, causing those snapshot spans to be silently dropped. + // That's acceptable - trace spans are observability sugar, not correctness. + if (this.runTraceContexts.size >= ComputeSnapshotService.MAX_TRACE_CONTEXTS) { + const firstKey = this.runTraceContexts.keys().next().value; + if (firstKey) { + this.runTraceContexts.delete(firstKey); + } + } + + this.runTraceContexts.set(runFriendlyId, ctx); + } + + /** Stop the timer wheel, dropping pending snapshots. */ + stop(): string[] { + // Intentionally drop pending snapshots rather than dispatching them. The supervisor + // is shutting down, so our callback URL will be dead by the time the gateway responds. + // Runners detect the supervisor is gone and reconnect to a new instance, which + // re-triggers the snapshot workflow. Snapshots are an optimization, not a correctness + // requirement - runs continue fine without them. + const remaining = this.timerWheel.stop(); + const droppedRuns = remaining.map((item) => item.key); + + if (droppedRuns.length > 0) { + this.logger.info("Stopped, dropped pending snapshots", { count: droppedRuns.length }); + this.logger.debug("Dropped snapshot details", { runs: droppedRuns }); + } + + return droppedRuns; + } + + /** Dispatch a snapshot request to the gateway. 
*/ + private async dispatch(snapshot: DelayedSnapshot): Promise { + const result = await this.computeManager.snapshot({ + runnerId: snapshot.runnerId, + metadata: { + runId: snapshot.runFriendlyId, + snapshotFriendlyId: snapshot.snapshotFriendlyId, + }, + }); + + if (!result) { + this.logger.error("Failed to request snapshot", { + runId: snapshot.runFriendlyId, + runnerId: snapshot.runnerId, + }); + } + } + + #emitSnapshotSpan(runFriendlyId: string, durationMs?: number, snapshotId?: string) { + if (!this.tracing) return; + + const ctx = this.runTraceContexts.get(runFriendlyId); + if (!ctx) return; + + const parsed = parseTraceparent(ctx.traceparent); + if (!parsed) return; + + const endEpochMs = Date.now(); + const startEpochMs = durationMs ? endEpochMs - durationMs : endEpochMs; + + const spanAttributes: Record = { + "compute.type": "snapshot", + }; + + if (durationMs !== undefined) { + spanAttributes["compute.total_ms"] = durationMs; + } + + if (snapshotId) { + spanAttributes["compute.snapshot_id"] = snapshotId; + } + + this.tracing.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.snapshot", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": ctx.envId, + "ctx.organization.id": ctx.orgId, + "ctx.project.id": ctx.projectId, + "ctx.run.id": runFriendlyId, + }, + spanAttributes, + }); + } +} diff --git a/apps/supervisor/src/services/failedPodHandler.test.ts b/apps/supervisor/src/services/failedPodHandler.test.ts new file mode 100644 index 00000000000..4dbfda16f43 --- /dev/null +++ b/apps/supervisor/src/services/failedPodHandler.test.ts @@ -0,0 +1,581 @@ +import { describe, it, expect, beforeAll, afterEach } from "vitest"; +import { FailedPodHandler } from "./failedPodHandler.js"; +import { type K8sApi, createK8sApi } from "../clients/kubernetes.js"; +import { Registry } from "prom-client"; +import { setTimeout } from "timers/promises"; + +// These tests require live K8s cluster 
credentials - skip by default +describe.skipIf(!process.env.K8S_INTEGRATION_TESTS)("FailedPodHandler Integration Tests", () => { + const k8s = createK8sApi(); + const namespace = "integration-test"; + const register = new Registry(); + + beforeAll(async () => { + // Create the test namespace if it doesn't exist + try { + await k8s.core.readNamespace({ name: namespace }); + } catch (error) { + await k8s.core.createNamespace({ + body: { + metadata: { + name: namespace, + }, + }, + }); + } + + // Clear any existing pods in the namespace + await deleteAllPodsInNamespace({ k8sApi: k8s, namespace }); + }); + + afterEach(async () => { + // Clear metrics to avoid conflicts + register.clear(); + + // Delete any remaining pods in the namespace + await deleteAllPodsInNamespace({ k8sApi: k8s, namespace }); + }); + + it("should process and delete failed pods with app=task-run label", async () => { + const handler = new FailedPodHandler({ namespace, k8s, register }); + + try { + // Create failed pods with the correct label + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 2, + shouldFail: true, + }); + + // Wait for pods to reach Failed state + await waitForPodsPhase({ + k8sApi: k8s, + namespace, + podNames, + phase: "Failed", + }); + + // Start the handler + await handler.start(); + + // Wait for pods to be deleted + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames, + }); + + // Verify metrics + const metrics = handler.getMetrics(); + + // Check informer events were recorded + const informerEvents = await metrics.informerEventsTotal.get(); + expect(informerEvents.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "add", + }), + value: 2, + }) + ); + expect(informerEvents.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "connect", + }), + value: 1, + }) + ); + expect(informerEvents.values).not.toContainEqual( + 
expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "error", + }), + }) + ); + + // Check pods were processed + const processedPods = await metrics.processedPodsTotal.get(); + expect(processedPods.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + status: "Failed", + }), + value: 2, + }) + ); + + // Check pods were deleted + const deletedPods = await metrics.deletedPodsTotal.get(); + expect(deletedPods.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + status: "Failed", + }), + value: 2, + }) + ); + + // Check no deletion errors were recorded + const deletionErrors = await metrics.deletionErrorsTotal.get(); + expect(deletionErrors.values).toHaveLength(0); + + // Check processing durations were recorded + const durations = await metrics.processingDurationSeconds.get(); + const failedDurations = durations.values.filter( + (v) => v.labels.namespace === namespace && v.labels.status === "Failed" + ); + expect(failedDurations.length).toBeGreaterThan(0); + } finally { + await handler.stop(); + } + }, 30000); + + it("should ignore pods without app=task-run label", async () => { + const handler = new FailedPodHandler({ namespace, k8s, register }); + + try { + // Create failed pods without the task-run label + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 1, + shouldFail: true, + labels: { app: "not-task-run" }, + }); + + // Wait for pod to reach Failed state + await waitForPodsPhase({ + k8sApi: k8s, + namespace, + podNames, + phase: "Failed", + }); + + await handler.start(); + + // Wait a reasonable time to ensure pod isn't deleted + await setTimeout(5000); + + // Verify pod still exists + const exists = await podExists({ k8sApi: k8s, namespace, podName: podNames[0]! 
}); + expect(exists).toBe(true); + + // Verify no metrics were recorded + const metrics = handler.getMetrics(); + const processedPods = await metrics.processedPodsTotal.get(); + expect(processedPods.values).toHaveLength(0); + } finally { + await handler.stop(); + } + }, 30000); + + it("should not process pods that are being deleted", async () => { + const handler = new FailedPodHandler({ namespace, k8s, register }); + + try { + // Create a failed pod that we'll mark for deletion + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 1, + shouldFail: true, + command: ["/bin/sh", "-c", "sleep 30"], + }); + + // Wait for pod to reach Failed state + await waitForPodsPhase({ + k8sApi: k8s, + namespace, + podNames, + phase: "Running", + }); + + // Delete the pod but don't wait for deletion + await k8s.core.deleteNamespacedPod({ + namespace, + name: podNames[0]!, + gracePeriodSeconds: 5, + }); + + // Start the handler + await handler.start(); + + // Wait for pod to be fully deleted + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames, + }); + + // Verify metrics show we skipped processing + const metrics = handler.getMetrics(); + const processedPods = await metrics.processedPodsTotal.get(); + expect(processedPods.values).toHaveLength(0); + } finally { + await handler.stop(); + } + }, 30000); + + it("should detect and process pods that fail after handler starts", async () => { + const handler = new FailedPodHandler({ namespace, k8s, register }); + + try { + // Start the handler + await handler.start(); + + // Create failed pods with the correct label + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 3, + shouldFail: true, + }); + + // Wait for pods to be deleted + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames, + }); + + // Verify metrics + const metrics = handler.getMetrics(); + + // Check informer events were recorded + const informerEvents = await metrics.informerEventsTotal.get(); 
+ expect(informerEvents.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "add", + }), + value: 3, + }) + ); + expect(informerEvents.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "connect", + }), + value: 1, + }) + ); + expect(informerEvents.values).not.toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "error", + }), + }) + ); + + // Check pods were processed + const processedPods = await metrics.processedPodsTotal.get(); + expect(processedPods.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + status: "Failed", + }), + value: 3, + }) + ); + + // Check pods were deleted + const deletedPods = await metrics.deletedPodsTotal.get(); + expect(deletedPods.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + status: "Failed", + }), + value: 3, + }) + ); + + // Check no deletion errors were recorded + const deletionErrors = await metrics.deletionErrorsTotal.get(); + expect(deletionErrors.values).toHaveLength(0); + + // Check processing durations were recorded + const durations = await metrics.processingDurationSeconds.get(); + const failedDurations = durations.values.filter( + (v) => v.labels.namespace === namespace && v.labels.status === "Failed" + ); + expect(failedDurations.length).toBeGreaterThan(0); + } finally { + await handler.stop(); + } + }, 60000); + + it("should handle graceful shutdown pods differently", async () => { + const handler = new FailedPodHandler({ namespace, k8s, register }); + + try { + // Create first batch of pods before starting handler + const firstBatchPodNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 2, + exitCode: FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE, + }); + + // Wait for pods to reach Failed state + await waitForPodsPhase({ + k8sApi: k8s, + 
namespace, + podNames: firstBatchPodNames, + phase: "Failed", + }); + + // Start the handler + await handler.start(); + + // Wait for first batch to be deleted + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames: firstBatchPodNames, + }); + + // Create second batch of pods after handler is running + const secondBatchPodNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 3, + exitCode: FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE, + }); + + // Wait for second batch to be deleted + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames: secondBatchPodNames, + }); + + // Verify metrics + const metrics = handler.getMetrics(); + + // Check informer events were recorded for both batches + const informerEvents = await metrics.informerEventsTotal.get(); + expect(informerEvents.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + verb: "add", + }), + value: 5, // 2 from first batch + 3 from second batch + }) + ); + + // Check pods were processed as graceful shutdowns + const processedPods = await metrics.processedPodsTotal.get(); + + // Should not be marked as Failed + const failedPods = processedPods.values.find( + (v) => v.labels.namespace === namespace && v.labels.status === "Failed" + ); + expect(failedPods).toBeUndefined(); + + // Should be marked as GracefulShutdown + const gracefulShutdowns = processedPods.values.find( + (v) => v.labels.namespace === namespace && v.labels.status === "GracefulShutdown" + ); + expect(gracefulShutdowns).toBeDefined(); + expect(gracefulShutdowns?.value).toBe(5); // Total from both batches + + // Check pods were still deleted + const deletedPods = await metrics.deletedPodsTotal.get(); + expect(deletedPods.values).toContainEqual( + expect.objectContaining({ + labels: expect.objectContaining({ + namespace, + status: "Failed", + }), + value: 5, // Total from both batches + }) + ); + + // Check no deletion errors were recorded + const deletionErrors 
= await metrics.deletionErrorsTotal.get(); + expect(deletionErrors.values).toHaveLength(0); + } finally { + await handler.stop(); + } + }, 30000); +}); + +async function createTestPods({ + k8sApi, + namespace, + count, + labels = { app: "task-run" }, + shouldFail = false, + namePrefix = "test-pod", + command = ["/bin/sh", "-c", shouldFail ? "exit 1" : "exit 0"], + randomizeName = true, + exitCode, +}: { + k8sApi: K8sApi; + namespace: string; + count: number; + labels?: Record; + shouldFail?: boolean; + namePrefix?: string; + command?: string[]; + randomizeName?: boolean; + exitCode?: number; +}) { + const createdPods: string[] = []; + + // If exitCode is specified, override the command + if (exitCode !== undefined) { + command = ["/bin/sh", "-c", `exit ${exitCode}`]; + } + + for (let i = 0; i < count; i++) { + const podName = randomizeName + ? `${namePrefix}-${i}-${Math.random().toString(36).substring(2, 15)}` + : `${namePrefix}-${i}`; + await k8sApi.core.createNamespacedPod({ + namespace, + body: { + metadata: { + name: podName, + labels, + }, + spec: { + restartPolicy: "Never", + containers: [ + { + name: "run-controller", // Changed to match the name we check in failedPodHandler + image: "busybox:1.37.0", + command, + }, + ], + }, + }, + }); + createdPods.push(podName); + } + + return createdPods; +} + +async function waitForPodsDeletion({ + k8sApi, + namespace, + podNames, + timeoutMs = 10000, + waitMs = 1000, +}: { + k8sApi: K8sApi; + namespace: string; + podNames: string[]; + timeoutMs?: number; + waitMs?: number; +}) { + const startTime = Date.now(); + const pendingPods = new Set(podNames); + + while (pendingPods.size > 0 && Date.now() - startTime < timeoutMs) { + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + const existingPods = new Set(pods.items.map((pod) => pod.metadata?.name ?? 
"")); + + for (const podName of pendingPods) { + if (!existingPods.has(podName)) { + pendingPods.delete(podName); + } + } + + if (pendingPods.size > 0) { + await setTimeout(waitMs); + } + } + + if (pendingPods.size > 0) { + throw new Error( + `Pods [${Array.from(pendingPods).join(", ")}] were not deleted within ${timeoutMs}ms` + ); + } +} + +async function podExists({ + k8sApi, + namespace, + podName, +}: { + k8sApi: K8sApi; + namespace: string; + podName: string; +}) { + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + return pods.items.some((p) => p.metadata?.name === podName); +} + +async function waitForPodsPhase({ + k8sApi, + namespace, + podNames, + phase, + timeoutMs = 10000, + waitMs = 1000, +}: { + k8sApi: K8sApi; + namespace: string; + podNames: string[]; + phase: "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown"; + timeoutMs?: number; + waitMs?: number; +}) { + const startTime = Date.now(); + const pendingPods = new Set(podNames); + + while (pendingPods.size > 0 && Date.now() - startTime < timeoutMs) { + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + + for (const pod of pods.items) { + if (pendingPods.has(pod.metadata?.name ?? "") && pod.status?.phase === phase) { + pendingPods.delete(pod.metadata?.name ?? ""); + } + } + + if (pendingPods.size > 0) { + await setTimeout(waitMs); + } + } + + if (pendingPods.size > 0) { + throw new Error( + `Pods [${Array.from(pendingPods).join( + ", " + )}] did not reach phase ${phase} within ${timeoutMs}ms` + ); + } +} + +async function deleteAllPodsInNamespace({ + k8sApi, + namespace, +}: { + k8sApi: K8sApi; + namespace: string; +}) { + // Get all pods + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + const podNames = pods.items.map((p) => p.metadata?.name ?? 
""); + + // Delete all pods + await k8sApi.core.deleteCollectionNamespacedPod({ namespace, gracePeriodSeconds: 0 }); + + // Wait for all pods to be deleted + await waitForPodsDeletion({ k8sApi, namespace, podNames }); +} diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts new file mode 100644 index 00000000000..3d56c92b213 --- /dev/null +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -0,0 +1,326 @@ +import { LogLevel, SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { K8sApi } from "../clients/kubernetes.js"; +import { createK8sApi } from "../clients/kubernetes.js"; +import { Informer, V1Pod } from "@kubernetes/client-node"; +import { Counter, Registry, Histogram } from "prom-client"; +import { register } from "../metrics.js"; +import { setTimeout } from "timers/promises"; + +type PodStatus = "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown" | "GracefulShutdown"; + +export type FailedPodHandlerOptions = { + namespace: string; + reconnectIntervalMs?: number; + k8s?: K8sApi; + register?: Registry; +}; + +export class FailedPodHandler { + private readonly id: string; + private readonly logger: SimpleStructuredLogger; + private readonly k8s: K8sApi; + private readonly namespace: string; + + private isRunning = false; + + private readonly informer: Informer; + private readonly reconnectIntervalMs: number; + private reconnecting = false; + + // Metrics + private readonly register: Registry; + private readonly processedPodsTotal: Counter; + private readonly deletedPodsTotal: Counter; + private readonly deletionErrorsTotal: Counter; + private readonly processingDurationSeconds: Histogram; + private readonly informerEventsTotal: Counter; + + static readonly GRACEFUL_SHUTDOWN_EXIT_CODE = 200; + + constructor(opts: FailedPodHandlerOptions) { + this.id = Math.random().toString(36).substring(2, 15); + this.logger = new SimpleStructuredLogger("failed-pod-handler", 
LogLevel.debug, { + id: this.id, + }); + + this.k8s = opts.k8s ?? createK8sApi(); + + this.namespace = opts.namespace; + this.reconnectIntervalMs = opts.reconnectIntervalMs ?? 1000; + + this.informer = this.k8s.makeInformer( + `/api/v1/namespaces/${this.namespace}/pods`, + () => + this.k8s.core.listNamespacedPod({ + namespace: this.namespace, + labelSelector: "app=task-run", + fieldSelector: "status.phase=Failed", + }), + "app=task-run", + "status.phase=Failed" + ); + + // Whenever a matching pod is added to the informer cache + this.informer.on("add", this.onPodCompleted.bind(this)); + + // Informer events + this.informer.on("connect", this.makeOnConnect("failed-pod-informer").bind(this)); + this.informer.on("error", this.makeOnError("failed-pod-informer").bind(this)); + + // Initialize metrics + this.register = opts.register ?? register; + + this.processedPodsTotal = new Counter({ + name: "failed_pod_handler_processed_pods_total", + help: "Total number of failed pods processed", + labelNames: ["namespace", "status"], + registers: [this.register], + }); + + this.deletedPodsTotal = new Counter({ + name: "failed_pod_handler_deleted_pods_total", + help: "Total number of pods deleted", + labelNames: ["namespace", "status"], + registers: [this.register], + }); + + this.deletionErrorsTotal = new Counter({ + name: "failed_pod_handler_deletion_errors_total", + help: "Total number of errors encountered while deleting pods", + labelNames: ["namespace", "error_type"], + registers: [this.register], + }); + + this.processingDurationSeconds = new Histogram({ + name: "failed_pod_handler_processing_duration_seconds", + help: "The duration of pod processing", + labelNames: ["namespace", "status"], + registers: [this.register], + }); + + this.informerEventsTotal = new Counter({ + name: "failed_pod_handler_informer_events_total", + help: "Total number of informer events", + labelNames: ["namespace", "verb"], + registers: [this.register], + }); + } + + async start() { + if 
(this.isRunning) { + this.logger.warn("failed pod handler already running"); + return; + } + + this.isRunning = true; + + this.logger.info("starting failed pod handler"); + await this.informer.start(); + } + + async stop() { + if (!this.isRunning) { + this.logger.warn("failed pod handler not running"); + return; + } + + this.isRunning = false; + + this.logger.info("stopping failed pod handler"); + await this.informer.stop(); + } + + private async withHistogram( + histogram: Histogram, + promise: Promise, + labels?: Record + ): Promise { + const end = histogram.startTimer({ namespace: this.namespace, ...labels }); + try { + return await promise; + } finally { + end(); + } + } + + /** + * Returns the non-nullable status of a pod + */ + private podStatus(pod: V1Pod): PodStatus { + return (pod.status?.phase ?? "Unknown") as PodStatus; + } + + private async onPodCompleted(pod: V1Pod) { + this.logger.debug("pod-completed", this.podSummary(pod)); + this.informerEventsTotal.inc({ namespace: this.namespace, verb: "add" }); + + if (!pod.metadata?.name) { + this.logger.error("pod-completed: no name", this.podSummary(pod)); + return; + } + + if (!pod.status) { + this.logger.error("pod-completed: no status", this.podSummary(pod)); + return; + } + + if (pod.metadata?.deletionTimestamp) { + this.logger.verbose("pod-completed: pod is being deleted", this.podSummary(pod)); + return; + } + + const podStatus = this.podStatus(pod); + + switch (podStatus) { + case "Succeeded": + await this.withHistogram(this.processingDurationSeconds, this.onPodSucceeded(pod), { + status: podStatus, + }); + break; + case "Failed": + await this.withHistogram(this.processingDurationSeconds, this.onPodFailed(pod), { + status: podStatus, + }); + break; + default: + this.logger.error("pod-completed: unknown phase", this.podSummary(pod)); + } + } + + private async onPodSucceeded(pod: V1Pod) { + this.logger.debug("pod-succeeded", this.podSummary(pod)); + this.processedPodsTotal.inc({ + namespace: 
this.namespace, + status: this.podStatus(pod), + }); + } + + private async onPodFailed(pod: V1Pod) { + this.logger.debug("pod-failed", this.podSummary(pod)); + + try { + await this.processFailedPod(pod); + } catch (error) { + this.logger.error("pod-failed: error processing pod", this.podSummary(pod), { error }); + } finally { + await this.deletePod(pod); + } + } + + private async processFailedPod(pod: V1Pod) { + this.logger.verbose("pod-failed: processing pod", this.podSummary(pod)); + + const mainContainer = pod.status?.containerStatuses?.find((c) => c.name === "run-controller"); + + // If it's our special "graceful shutdown" exit code, don't process it further, just delete it + if ( + mainContainer?.state?.terminated?.exitCode === FailedPodHandler.GRACEFUL_SHUTDOWN_EXIT_CODE + ) { + this.logger.debug("pod-failed: graceful shutdown detected", this.podSummary(pod)); + this.processedPodsTotal.inc({ + namespace: this.namespace, + status: "GracefulShutdown", + }); + return; + } + + this.processedPodsTotal.inc({ + namespace: this.namespace, + status: this.podStatus(pod), + }); + } + + private async deletePod(pod: V1Pod) { + this.logger.verbose("pod-failed: deleting pod", this.podSummary(pod)); + try { + await this.k8s.core.deleteNamespacedPod({ + name: pod.metadata!.name!, + namespace: this.namespace, + }); + this.deletedPodsTotal.inc({ + namespace: this.namespace, + status: this.podStatus(pod), + }); + } catch (error) { + this.logger.error("pod-failed: error deleting pod", this.podSummary(pod), { error }); + this.deletionErrorsTotal.inc({ + namespace: this.namespace, + error_type: error instanceof Error ? 
error.name : "unknown", + }); + } + } + + private makeOnError(informerName: string) { + return (err?: unknown) => this.onError(informerName, err); + } + + private async onError(informerName: string, err?: unknown) { + if (!this.isRunning) { + this.logger.warn("onError: informer not running"); + return; + } + + // Guard against multiple simultaneous reconnections + if (this.reconnecting) { + this.logger.debug("onError: reconnection already in progress, skipping", { + informerName, + }); + return; + } + + this.reconnecting = true; + + try { + const error = err instanceof Error ? err : undefined; + this.logger.error("error event fired", { + informerName, + error: error?.message, + errorType: error?.name, + }); + this.informerEventsTotal.inc({ namespace: this.namespace, verb: "error" }); + + // Reconnect on errors + await setTimeout(this.reconnectIntervalMs); + await this.informer.start(); + } catch (handlerError) { + const error = handlerError instanceof Error ? handlerError : undefined; + this.logger.error("onError: reconnection attempt failed", { + informerName, + error: error?.message, + errorType: error?.name, + errorStack: error?.stack, + }); + } finally { + this.reconnecting = false; + } + } + + private makeOnConnect(informerName: string) { + return () => this.onConnect(informerName); + } + + private async onConnect(informerName: string) { + this.logger.info(`informer connected: ${informerName}`); + this.informerEventsTotal.inc({ namespace: this.namespace, verb: "connect" }); + } + + private podSummary(pod: V1Pod) { + return { + name: pod.metadata?.name, + namespace: pod.metadata?.namespace, + status: pod.status?.phase, + deletionTimestamp: pod.metadata?.deletionTimestamp, + }; + } + + // Method to expose metrics for testing + public getMetrics() { + return { + processedPodsTotal: this.processedPodsTotal, + deletedPodsTotal: this.deletedPodsTotal, + deletionErrorsTotal: this.deletionErrorsTotal, + informerEventsTotal: this.informerEventsTotal, + 
processingDurationSeconds: this.processingDurationSeconds, + }; + } +} diff --git a/apps/supervisor/src/services/otlpTraceService.test.ts b/apps/supervisor/src/services/otlpTraceService.test.ts new file mode 100644 index 00000000000..baf3bd90306 --- /dev/null +++ b/apps/supervisor/src/services/otlpTraceService.test.ts @@ -0,0 +1,179 @@ +import { describe, it, expect } from "vitest"; +import { buildPayload } from "./otlpTraceService.js"; + +describe("buildPayload", () => { + it("builds valid OTLP JSON with timing attributes", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + parentSpanId: "1234567890abcdef", + spanName: "compute.provision", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: { + "ctx.environment.id": "env_123", + "ctx.organization.id": "org_456", + "ctx.project.id": "proj_789", + "ctx.run.id": "run_abc", + }, + spanAttributes: { + "compute.total_ms": 250, + "compute.gateway.schedule_ms": 1, + "compute.cache.image_cached": true, + }, + }); + + expect(payload.resourceSpans).toHaveLength(1); + + const resourceSpan = payload.resourceSpans[0]!; + + // $trigger=true so the webapp accepts it + const triggerAttr = resourceSpan.resource.attributes.find((a) => a.key === "$trigger"); + expect(triggerAttr).toEqual({ key: "$trigger", value: { boolValue: true } }); + + // Resource attributes + const envAttr = resourceSpan.resource.attributes.find( + (a) => a.key === "ctx.environment.id" + ); + expect(envAttr).toEqual({ + key: "ctx.environment.id", + value: { stringValue: "env_123" }, + }); + + // Span basics + const span = resourceSpan.scopeSpans[0]!.spans[0]!; + expect(span.name).toBe("compute.provision"); + expect(span.traceId).toBe("abcd1234abcd1234abcd1234abcd1234"); + expect(span.parentSpanId).toBe("1234567890abcdef"); + + // Integer attribute + const totalMs = span.attributes.find((a) => a.key === "compute.total_ms"); + expect(totalMs).toEqual({ key: "compute.total_ms", value: { intValue: 250 } }); + + // 
Boolean attribute + const cached = span.attributes.find((a) => a.key === "compute.cache.image_cached"); + expect(cached).toEqual({ key: "compute.cache.image_cached", value: { boolValue: true } }); + }); + + it("generates a valid 16-char hex span ID", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.spanId).toMatch(/^[0-9a-f]{16}$/); + }); + + it("converts timestamps to nanoseconds", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000000000"); + expect(span.endTimeUnixNano).toBe("1250000000"); + }); + + it("converts real epoch timestamps without precision loss", () => { + // Date.now() values exceed Number.MAX_SAFE_INTEGER when multiplied by 1e6 + const startMs = 1711929600000; // 2024-04-01T00:00:00Z + const endMs = 1711929600250; + + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: startMs, + endTimeMs: endMs, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1711929600000000000"); + expect(span.endTimeUnixNano).toBe("1711929600250000000"); + }); + + it("preserves sub-millisecond precision from performance.now() arithmetic", () => { + // provisionStartEpochMs = Date.now() - (performance.now() - startMs) produces fractional ms. + // Use small epoch + fraction to avoid IEEE 754 noise in the fractional part. 
+ const startMs = 1000.322; + const endMs = 1045.789; + + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: startMs, + endTimeMs: endMs, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000322000"); + expect(span.endTimeUnixNano).toBe("1045789000"); + }); + + it("sub-ms precision affects ordering for real epoch values", () => { + // Two spans within the same millisecond should have different nanosecond timestamps + const spanA = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "a", + startTimeMs: 1711929600000.3, + endTimeMs: 1711929600001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const spanB = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "b", + startTimeMs: 1711929600000.7, + endTimeMs: 1711929600001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const startA = BigInt(spanA.resourceSpans[0]!.scopeSpans[0]!.spans[0]!.startTimeUnixNano); + const startB = BigInt(spanB.resourceSpans[0]!.scopeSpans[0]!.spans[0]!.startTimeUnixNano); + // A should sort before B (both in the same ms but different sub-ms positions) + expect(startA).toBeLessThan(startB); + }); + + it("omits parentSpanId when not provided", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.parentSpanId).toBeUndefined(); + }); + + it("handles double values for non-integer numbers", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: { "compute.cpu": 0.25 }, + }); + + const span = 
payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + const cpu = span.attributes.find((a) => a.key === "compute.cpu"); + expect(cpu).toEqual({ key: "compute.cpu", value: { doubleValue: 0.25 } }); + }); +}); diff --git a/apps/supervisor/src/services/otlpTraceService.ts b/apps/supervisor/src/services/otlpTraceService.ts new file mode 100644 index 00000000000..da3310711d0 --- /dev/null +++ b/apps/supervisor/src/services/otlpTraceService.ts @@ -0,0 +1,104 @@ +import { randomBytes } from "crypto"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +export type OtlpTraceServiceOptions = { + endpointUrl: string; + timeoutMs?: number; +}; + +export type OtlpTraceSpan = { + traceId: string; + parentSpanId?: string; + spanName: string; + startTimeMs: number; + endTimeMs: number; + resourceAttributes: Record; + spanAttributes: Record; +}; + +export class OtlpTraceService { + private readonly logger = new SimpleStructuredLogger("otlp-trace"); + + constructor(private opts: OtlpTraceServiceOptions) {} + + /** Fire-and-forget: build payload and send to the configured OTLP endpoint */ + emit(span: OtlpTraceSpan): void { + const payload = buildPayload(span); + + fetch(`${this.opts.endpointUrl}/v1/traces`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + signal: AbortSignal.timeout(this.opts.timeoutMs ?? 5_000), + }).catch((err) => { + this.logger.warn("failed to send compute trace span", { + error: err instanceof Error ? 
err.message : String(err), + }); + }); + } +} + +// ── Payload builder (internal) ─────────────────────────────────────────────── + +/** @internal Exported for tests only */ +export function buildPayload(span: OtlpTraceSpan) { + const spanId = randomBytes(8).toString("hex"); + + return { + resourceSpans: [ + { + resource: { + attributes: [ + { key: "$trigger", value: { boolValue: true } }, + ...toOtlpAttributes(span.resourceAttributes), + ], + }, + scopeSpans: [ + { + scope: { name: "supervisor.compute" }, + spans: [ + { + traceId: span.traceId, + spanId, + parentSpanId: span.parentSpanId, + name: span.spanName, + kind: 3, // SPAN_KIND_CLIENT + startTimeUnixNano: msToNano(span.startTimeMs), + endTimeUnixNano: msToNano(span.endTimeMs), + attributes: toOtlpAttributes(span.spanAttributes), + status: { code: 1 }, // STATUS_CODE_OK + }, + ], + }, + ], + }, + ], + }; +} + +function toOtlpAttributes( + attrs: Record +): Array<{ key: string; value: Record }> { + return Object.entries(attrs).map(([key, value]) => ({ + key, + value: toOtlpValue(value), + })); +} + +function toOtlpValue(value: string | number | boolean): Record { + if (typeof value === "string") return { stringValue: value }; + if (typeof value === "boolean") return { boolValue: value }; + if (Number.isInteger(value)) return { intValue: value }; + return { doubleValue: value }; +} + +/** + * Convert epoch milliseconds to nanosecond string, preserving sub-ms precision. + * Fractional ms from performance.now() arithmetic carry meaningful microsecond + * data that affects span sort ordering when events happen within the same ms. 
+ */ +function msToNano(ms: number): string { + const wholeMs = Math.trunc(ms); + const fracNs = Math.round((ms - wholeMs) * 1_000_000); + return String(BigInt(wholeMs) * 1_000_000n + BigInt(fracNs)); +} diff --git a/apps/supervisor/src/services/podCleaner.test.ts b/apps/supervisor/src/services/podCleaner.test.ts new file mode 100644 index 00000000000..d6ed2bb737f --- /dev/null +++ b/apps/supervisor/src/services/podCleaner.test.ts @@ -0,0 +1,473 @@ +import { PodCleaner } from "./podCleaner.js"; +import { type K8sApi, createK8sApi } from "../clients/kubernetes.js"; +import { setTimeout } from "timers/promises"; +import { describe, it, expect, beforeAll, afterEach } from "vitest"; +import { Registry } from "prom-client"; + +// These tests require live K8s cluster credentials - skip by default +describe.skipIf(!process.env.K8S_INTEGRATION_TESTS)("PodCleaner Integration Tests", () => { + const k8s = createK8sApi(); + const namespace = "integration-test"; + const register = new Registry(); + + beforeAll(async () => { + // Create the test namespace, only if it doesn't exist + try { + await k8s.core.readNamespace({ name: namespace }); + } catch (error) { + await k8s.core.createNamespace({ + body: { + metadata: { + name: namespace, + }, + }, + }); + } + }); + + afterEach(async () => { + // Clear metrics to avoid conflicts + register.clear(); + + // Delete all pods in the namespace + await k8s.core.deleteCollectionNamespacedPod({ namespace, gracePeriodSeconds: 0 }); + }); + + it("should clean up succeeded pods", async () => { + const podCleaner = new PodCleaner({ namespace, k8s, register }); + + try { + // Create a test pod that's in succeeded state + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 1, + namePrefix: "test-succeeded-pod", + }); + + if (!podNames[0]) { + throw new Error("Failed to create test pod"); + } + const podName = podNames[0]; + + // Wait for pod to complete + await waitForPodPhase({ + k8sApi: k8s, + namespace, + podName, + 
phase: "Succeeded", + }); + + // Start the pod cleaner + await podCleaner.start(); + + // Wait for pod to be deleted + await waitForPodDeletion({ + k8sApi: k8s, + namespace, + podName, + }); + + // Verify pod was deleted + expect(await podExists({ k8sApi: k8s, namespace, podName })).toBe(false); + } finally { + await podCleaner.stop(); + } + }, 30000); + + it("should accurately track deletion metrics", async () => { + const podCleaner = new PodCleaner({ namespace, k8s, register }); + try { + // Create a test pod that's in succeeded state + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 1, + namePrefix: "test-succeeded-pod", + }); + + // Wait for pod to be in succeeded state + await waitForPodsPhase({ + k8sApi: k8s, + namespace, + podNames, + phase: "Succeeded", + }); + + await podCleaner.start(); + + // Wait for pod to be deleted + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames, + }); + + const metrics = podCleaner.getMetrics(); + const deletionCycles = await metrics.deletionCyclesTotal.get(); + const deletionTimestamp = await metrics.lastDeletionTimestamp.get(); + + expect(deletionCycles?.values[0]?.value).toBeGreaterThan(0); + expect(deletionTimestamp?.values[0]?.value).toBeGreaterThan(0); + } finally { + await podCleaner.stop(); + } + }, 30000); + + it("should handle different batch sizes - small", async () => { + const podCleaner = new PodCleaner({ + namespace, + k8s, + register, + batchSize: 1, + }); + + try { + // Create some pods that will succeed + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 2, + }); + + await waitForPodsPhase({ + k8sApi: k8s, + namespace, + podNames, + phase: "Succeeded", + }); + + await podCleaner.start(); + + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames, + }); + + const metrics = podCleaner.getMetrics(); + const cycles = await metrics.deletionCyclesTotal.get(); + + expect(cycles?.values[0]?.value).toBe(2); + } finally { + await 
podCleaner.stop(); + } + }, 30000); + + it("should handle different batch sizes - large", async () => { + const podCleaner = new PodCleaner({ + namespace, + k8s, + register, + batchSize: 5000, + }); + + try { + // Create some pods that will succeed + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 10, + }); + + await waitForPodsPhase({ + k8sApi: k8s, + namespace, + podNames, + phase: "Succeeded", + }); + + await podCleaner.start(); + + await waitForPodsDeletion({ + k8sApi: k8s, + namespace, + podNames, + }); + + const metrics = podCleaner.getMetrics(); + const cycles = await metrics.deletionCyclesTotal.get(); + + expect(cycles?.values[0]?.value).toBe(1); + } finally { + await podCleaner.stop(); + } + }, 30000); + + it("should not delete pods without app=task-run label", async () => { + const podCleaner = new PodCleaner({ namespace, k8s, register }); + + try { + // Create a test pod without the task-run label + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 1, + labels: { app: "different-label" }, + namePrefix: "non-task-run-pod", + }); + + if (!podNames[0]) { + throw new Error("Failed to create test pod"); + } + const podName = podNames[0]; + + // Wait for pod to complete + await waitForPodPhase({ + k8sApi: k8s, + namespace, + podName, + phase: "Succeeded", + }); + + await podCleaner.start(); + + // Wait a reasonable time to ensure pod isn't deleted + await setTimeout(5000); + + // Verify pod still exists + expect(await podExists({ k8sApi: k8s, namespace, podName })).toBe(true); + } finally { + await podCleaner.stop(); + } + }, 30000); + + it("should not delete pods that are still running", async () => { + const podCleaner = new PodCleaner({ namespace, k8s, register }); + + try { + // Create a test pod with a long-running command + const podNames = await createTestPods({ + k8sApi: k8s, + namespace, + count: 1, + namePrefix: "running-pod", + command: ["sleep", "30"], // Will keep pod running + }); + + if 
(!podNames[0]) { + throw new Error("Failed to create test pod"); + } + const podName = podNames[0]; + + // Wait for pod to be running + await waitForPodPhase({ + k8sApi: k8s, + namespace, + podName, + phase: "Running", + }); + + await podCleaner.start(); + + // Wait a reasonable time to ensure pod isn't deleted + await setTimeout(5000); + + // Verify pod still exists + expect(await podExists({ k8sApi: k8s, namespace, podName })).toBe(true); + } finally { + await podCleaner.stop(); + } + }, 30000); +}); + +// Helper functions +async function waitForPodPhase({ + k8sApi, + namespace, + podName, + phase, + timeoutMs = 10000, + waitMs = 1000, +}: { + k8sApi: K8sApi; + namespace: string; + podName: string; + phase: string; + timeoutMs?: number; + waitMs?: number; +}) { + const startTime = Date.now(); + + while (Date.now() - startTime < timeoutMs) { + const pod = await k8sApi.core.readNamespacedPod({ + namespace, + name: podName, + }); + if (pod.status?.phase === phase) { + return; + } + await setTimeout(waitMs); + } + + throw new Error(`Pod ${podName} did not reach phase ${phase} within ${timeoutMs}ms`); +} + +async function waitForPodDeletion({ + k8sApi, + namespace, + podName, + timeoutMs = 10000, + waitMs = 1000, +}: { + k8sApi: K8sApi; + namespace: string; + podName: string; + timeoutMs?: number; + waitMs?: number; +}) { + const startTime = Date.now(); + + while (Date.now() - startTime < timeoutMs) { + try { + await k8sApi.core.readNamespacedPod({ + namespace, + name: podName, + }); + await setTimeout(waitMs); + } catch (error) { + // Pod was deleted + return; + } + } + + throw new Error(`Pod ${podName} was not deleted within ${timeoutMs}ms`); +} + +async function createTestPods({ + k8sApi, + namespace, + count, + labels = { app: "task-run" }, + shouldFail = false, + namePrefix = "test-pod", + command = ["/bin/sh", "-c", shouldFail ? 
"exit 1" : "exit 0"], +}: { + k8sApi: K8sApi; + namespace: string; + count: number; + labels?: Record; + shouldFail?: boolean; + namePrefix?: string; + command?: string[]; +}) { + const createdPods: string[] = []; + + for (let i = 0; i < count; i++) { + const podName = `${namePrefix}-${i}`; + await k8sApi.core.createNamespacedPod({ + namespace, + body: { + metadata: { + name: podName, + labels, + }, + spec: { + restartPolicy: "Never", + containers: [ + { + name: "test", + image: "busybox:1.37.0", + command, + }, + ], + }, + }, + }); + createdPods.push(podName); + } + + return createdPods; +} + +async function waitForPodsPhase({ + k8sApi, + namespace, + podNames, + phase, + timeoutMs = 10000, + waitMs = 1000, +}: { + k8sApi: K8sApi; + namespace: string; + podNames: string[]; + phase: "Pending" | "Running" | "Succeeded" | "Failed" | "Unknown"; + timeoutMs?: number; + waitMs?: number; +}) { + const startTime = Date.now(); + const pendingPods = new Set(podNames); + + while (pendingPods.size > 0 && Date.now() - startTime < timeoutMs) { + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + + for (const pod of pods.items) { + if (pendingPods.has(pod.metadata?.name ?? "") && pod.status?.phase === phase) { + pendingPods.delete(pod.metadata?.name ?? 
""); + } + } + + if (pendingPods.size > 0) { + await setTimeout(waitMs); + } + } + + if (pendingPods.size > 0) { + throw new Error( + `Pods [${Array.from(pendingPods).join( + ", " + )}] did not reach phase ${phase} within ${timeoutMs}ms` + ); + } +} + +async function waitForPodsDeletion({ + k8sApi, + namespace, + podNames, + timeoutMs = 10000, + waitMs = 1000, +}: { + k8sApi: K8sApi; + namespace: string; + podNames: string[]; + timeoutMs?: number; + waitMs?: number; +}) { + const startTime = Date.now(); + const pendingPods = new Set(podNames); + + while (pendingPods.size > 0 && Date.now() - startTime < timeoutMs) { + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + const existingPods = new Set(pods.items.map((pod) => pod.metadata?.name ?? "")); + + for (const podName of pendingPods) { + if (!existingPods.has(podName)) { + pendingPods.delete(podName); + } + } + + if (pendingPods.size > 0) { + await setTimeout(waitMs); + } + } + + if (pendingPods.size > 0) { + throw new Error( + `Pods [${Array.from(pendingPods).join(", ")}] were not deleted within ${timeoutMs}ms` + ); + } +} + +async function podExists({ + k8sApi, + namespace, + podName, +}: { + k8sApi: K8sApi; + namespace: string; + podName: string; +}) { + const pods = await k8sApi.core.listNamespacedPod({ namespace }); + return pods.items.some((p) => p.metadata?.name === podName); +} diff --git a/apps/supervisor/src/services/podCleaner.ts b/apps/supervisor/src/services/podCleaner.ts new file mode 100644 index 00000000000..3ac5da293df --- /dev/null +++ b/apps/supervisor/src/services/podCleaner.ts @@ -0,0 +1,118 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { K8sApi } from "../clients/kubernetes.js"; +import { createK8sApi } from "../clients/kubernetes.js"; +import { IntervalService } from "@trigger.dev/core/v3"; +import { Counter, Gauge, Registry } from "prom-client"; +import { register } from "../metrics.js"; + +export type PodCleanerOptions = { 
+ namespace: string; + k8s?: K8sApi; + register?: Registry; + batchSize?: number; + intervalMs?: number; +}; + +export class PodCleaner { + private readonly logger = new SimpleStructuredLogger("pod-cleaner"); + private readonly k8s: K8sApi; + private readonly namespace: string; + + private readonly batchSize: number; + private readonly deletionInterval: IntervalService; + + // Metrics + private readonly register: Registry; + private readonly deletionCyclesTotal: Counter; + private readonly lastDeletionTimestamp: Gauge; + + constructor(opts: PodCleanerOptions) { + this.k8s = opts.k8s ?? createK8sApi(); + + this.namespace = opts.namespace; + this.batchSize = opts.batchSize ?? 500; + + this.deletionInterval = new IntervalService({ + intervalMs: opts.intervalMs ?? 10000, + leadingEdge: true, + onInterval: this.deleteCompletedPods.bind(this), + }); + + // Initialize metrics + this.register = opts.register ?? register; + + this.deletionCyclesTotal = new Counter({ + name: "pod_cleaner_deletion_cycles_total", + help: "Total number of pod deletion cycles run", + labelNames: ["namespace", "status", "batch_size"], + registers: [this.register], + }); + + this.lastDeletionTimestamp = new Gauge({ + name: "pod_cleaner_last_deletion_timestamp", + help: "Timestamp of the last deletion cycle", + labelNames: ["namespace"], + registers: [this.register], + }); + } + + async start() { + this.deletionInterval.start(); + } + + async stop() { + this.deletionInterval.stop(); + } + + private async deleteCompletedPods() { + let continuationToken: string | undefined; + + do { + try { + const result = await this.k8s.core.deleteCollectionNamespacedPod({ + namespace: this.namespace, + labelSelector: "app=task-run", + fieldSelector: "status.phase=Succeeded", + limit: this.batchSize, + _continue: continuationToken, + gracePeriodSeconds: 0, + propagationPolicy: "Background", + timeoutSeconds: 30, + }); + + // Update continuation token for next batch + continuationToken = result.metadata?._continue; 
+ + // Increment the deletion cycles counter + this.deletionCyclesTotal.inc({ + namespace: this.namespace, + batch_size: this.batchSize, + status: "succeeded", + }); + + this.logger.debug("Deleted batch of pods", { continuationToken }); + } catch (err) { + this.logger.error("Failed to delete batch of pods", { + err: err instanceof Error ? err.message : String(err), + }); + + this.deletionCyclesTotal.inc({ + namespace: this.namespace, + batch_size: this.batchSize, + status: "failed", + }); + break; + } + } while (continuationToken); + + this.lastDeletionTimestamp.set({ namespace: this.namespace }, Date.now()); + } + + // Method to expose metrics for testing + public getMetrics() { + return { + deletionCyclesTotal: this.deletionCyclesTotal, + lastDeletionTimestamp: this.lastDeletionTimestamp, + }; + } +} diff --git a/apps/supervisor/src/services/timerWheel.test.ts b/apps/supervisor/src/services/timerWheel.test.ts new file mode 100644 index 00000000000..3f6bb9aa19b --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.test.ts @@ -0,0 +1,254 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { TimerWheel } from "./timerWheel.js"; + +describe("TimerWheel", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("dispatches item after delay", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "snapshot-data"); + + // Not yet + vi.advanceTimersByTime(2900); + expect(dispatched).toEqual([]); + + // After delay + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("cancels item before it fires", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); 
+ + vi.advanceTimersByTime(1000); + expect(wheel.cancel("run-1")).toBe(true); + + vi.advanceTimersByTime(5000); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("cancel returns false for unknown key", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + expect(wheel.cancel("nonexistent")).toBe(false); + }); + + it("deduplicates: resubmitting same key replaces the entry", () => { + const dispatched: { key: string; data: string }[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push({ key: item.key, data: item.data }), + }); + + wheel.start(); + wheel.submit("run-1", "old-data"); + + vi.advanceTimersByTime(1000); + wheel.submit("run-1", "new-data"); + + // Original would have fired at t=3000, but was replaced + // New one fires at t=1000+3000=4000 + vi.advanceTimersByTime(2100); + expect(dispatched).toEqual([]); + + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual([{ key: "run-1", data: "new-data" }]); + + wheel.stop(); + }); + + it("handles many concurrent items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + for (let i = 0; i < 1000; i++) { + wheel.submit(`run-${i}`, `data-${i}`); + } + expect(wheel.size).toBe(1000); + + vi.advanceTimersByTime(3100); + expect(dispatched.length).toBe(1000); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("handles items submitted at different times", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-2", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-3", "data"); + + // t=2000: nothing yet + expect(dispatched).toEqual([]); + + // t=3100: run-1 fires + 
vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-1"]); + + // t=4100: run-2 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2"]); + + // t=5100: run-3 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2", "run-3"]); + + wheel.stop(); + }); + + it("setDelay changes delay for new items only", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); // 3s delay + + vi.advanceTimersByTime(500); + wheel.setDelay(1000); + wheel.submit("run-2", "data"); // 1s delay + + // t=1500: run-2 should have fired (submitted at t=500 with 1s delay) + vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-2"]); + + // t=3100: run-1 fires at its original 3s delay + vi.advanceTimersByTime(1500); + expect(dispatched).toEqual(["run-2", "run-1"]); + + wheel.stop(); + }); + + it("stop returns unprocessed items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data-1"); + wheel.submit("run-2", "data-2"); + wheel.submit("run-3", "data-3"); + + const remaining = wheel.stop(); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + expect(remaining.length).toBe(3); + expect(remaining.map((r) => r.key).sort()).toEqual(["run-1", "run-2", "run-3"]); + expect(remaining.find((r) => r.key === "run-1")?.data).toBe("data-1"); + }); + + it("after stop, new submissions are silently dropped", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.stop(); + + wheel.submit("run-late", "data"); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + }); + + it("tracks size correctly through 
submit/cancel/dispatch", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + + wheel.submit("a", "data"); + wheel.submit("b", "data"); + expect(wheel.size).toBe(2); + + wheel.cancel("a"); + expect(wheel.size).toBe(1); + + vi.advanceTimersByTime(3100); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("clamps delay to valid range", () => { + const dispatched: string[] = []; + + // Very small delay (should be at least 1 tick = 100ms) + const wheel = new TimerWheel({ + delayMs: 0, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("multiple cancel calls are safe", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + expect(wheel.cancel("run-1")).toBe(true); + expect(wheel.cancel("run-1")).toBe(false); + + wheel.stop(); + }); +}); diff --git a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts new file mode 100644 index 00000000000..9584423824d --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.ts @@ -0,0 +1,160 @@ +/** + * TimerWheel implements a hashed timer wheel for efficiently managing large numbers + * of delayed operations with O(1) submit, cancel, and per-item dispatch. + * + * Used by the supervisor to delay snapshot requests so that short-lived waitpoints + * (e.g. triggerAndWait that resolves in <3s) skip the snapshot entirely. + * + * The wheel is a ring buffer of slots. A single setInterval advances a cursor. + * When the cursor reaches a slot, all items in that slot are dispatched. + * + * Fixed capacity: 600 slots at 100ms tick = 60s max delay. 
+ */ + +const TICK_MS = 100; +const NUM_SLOTS = 600; // 60s max delay at 100ms tick + +export type TimerWheelItem = { + key: string; + data: T; +}; + +export type TimerWheelOptions = { + /** Called when an item's delay expires. */ + onExpire: (item: TimerWheelItem) => void; + /** Delay in milliseconds before items fire. Clamped to [100, 60000]. */ + delayMs: number; +}; + +type Entry = { + key: string; + data: T; + slotIndex: number; +}; + +export class TimerWheel { + private slots: Set[]; + private entries: Map>; + private cursor: number; + private intervalId: ReturnType | null; + private onExpire: (item: TimerWheelItem) => void; + private delaySlots: number; + + constructor(opts: TimerWheelOptions) { + this.slots = Array.from({ length: NUM_SLOTS }, () => new Set()); + this.entries = new Map(); + this.cursor = 0; + this.intervalId = null; + this.onExpire = opts.onExpire; + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(opts.delayMs / TICK_MS))); + } + + /** Start the timer wheel. Must be called before submitting items. */ + start(): void { + if (this.intervalId) return; + this.intervalId = setInterval(() => this.tick(), TICK_MS); + // Don't hold the process open just for the timer wheel + if (this.intervalId && typeof this.intervalId === "object" && "unref" in this.intervalId) { + this.intervalId.unref(); + } + } + + /** + * Stop the timer wheel and return all unprocessed items. + * The wheel keeps running normally during graceful shutdown - call stop() + * only when you're ready to tear down. Caller decides what to do with leftovers. + */ + stop(): TimerWheelItem[] { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = null; + } + + const remaining: TimerWheelItem[] = []; + for (const [key, entry] of this.entries) { + remaining.push({ key, data: entry.data }); + } + + for (const slot of this.slots) { + slot.clear(); + } + this.entries.clear(); + + return remaining; + } + + /** + * Update the delay for future submissions. 
Already-queued items keep their original timing. + * Clamped to [TICK_MS, 60000ms]. + */ + setDelay(delayMs: number): void { + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(delayMs / TICK_MS))); + } + + /** + * Submit an item to be dispatched after the configured delay. + * If an item with the same key already exists, it is replaced (dedup). + * No-op if the wheel is stopped. + */ + submit(key: string, data: T): void { + if (!this.intervalId) return; + + // Dedup: remove existing entry for this key + this.cancel(key); + + const slotIndex = (this.cursor + this.delaySlots) % NUM_SLOTS; + const entry: Entry = { key, data, slotIndex }; + + this.entries.set(key, entry); + this.slot(slotIndex).add(key); + } + + /** + * Cancel a pending item. Returns true if the item was found and removed. + */ + cancel(key: string): boolean { + const entry = this.entries.get(key); + if (!entry) return false; + + this.slot(entry.slotIndex).delete(key); + this.entries.delete(key); + return true; + } + + /** Number of pending items in the wheel. */ + get size(): number { + return this.entries.size; + } + + /** Whether the wheel is running. */ + get running(): boolean { + return this.intervalId !== null; + } + + /** Get a slot by index. The array is fully initialized so this always returns a Set. */ + private slot(index: number): Set { + const s = this.slots[index]; + if (!s) throw new Error(`TimerWheel: invalid slot index ${index}`); + return s; + } + + /** Advance the cursor and dispatch all items in the current slot. 
*/ + private tick(): void { + this.cursor = (this.cursor + 1) % NUM_SLOTS; + const slot = this.slot(this.cursor); + + if (slot.size === 0) return; + + // Collect items to dispatch (copy keys since we mutate during iteration) + const keys = [...slot]; + slot.clear(); + + for (const key of keys) { + const entry = this.entries.get(key); + if (!entry) continue; + + this.entries.delete(key); + this.onExpire({ key, data: entry.data }); + } + } +} diff --git a/apps/supervisor/src/util.ts b/apps/supervisor/src/util.ts new file mode 100644 index 00000000000..d14dd99bfe1 --- /dev/null +++ b/apps/supervisor/src/util.ts @@ -0,0 +1,44 @@ +import { isMacOS, isWindows } from "std-env"; + +export function normalizeDockerHostUrl(url: string) { + const $url = new URL(url); + + if ($url.hostname === "localhost") { + $url.hostname = getDockerHostDomain(); + } + + return $url.toString(); +} + +export function getDockerHostDomain() { + return isMacOS || isWindows ? "host.docker.internal" : "localhost"; +} + +/** Extract the W3C traceparent string from an untyped trace context record */ +export function extractTraceparent(traceContext?: Record): string | undefined { + if ( + traceContext && + "traceparent" in traceContext && + typeof traceContext.traceparent === "string" + ) { + return traceContext.traceparent; + } + return undefined; +} + +export function getRunnerId(runId: string, attemptNumber?: number) { + const parts = ["runner", runId.replace("run_", "")]; + + if (attemptNumber && attemptNumber > 1) { + parts.push(`attempt-${attemptNumber}`); + } + + return parts.join("-"); +} + +/** Derive a unique runnerId for a restore cycle using the checkpoint suffix */ +export function getRestoreRunnerId(runFriendlyId: string, checkpointId: string) { + const runIdShort = runFriendlyId.replace("run_", ""); + const checkpointSuffix = checkpointId.slice(-8); + return `runner-${runIdShort}-${checkpointSuffix}`; +} diff --git a/apps/supervisor/src/workerToken.ts 
b/apps/supervisor/src/workerToken.ts new file mode 100644 index 00000000000..1142796a7a3 --- /dev/null +++ b/apps/supervisor/src/workerToken.ts @@ -0,0 +1,29 @@ +import { readFileSync } from "fs"; +import { env } from "./env.js"; + +export function getWorkerToken() { + if (!env.TRIGGER_WORKER_TOKEN.startsWith("file://")) { + return env.TRIGGER_WORKER_TOKEN; + } + + const tokenPath = env.TRIGGER_WORKER_TOKEN.replace("file://", ""); + + console.debug( + JSON.stringify({ + message: "🔑 Reading worker token from file", + tokenPath, + }) + ); + + try { + const token = readFileSync(tokenPath, "utf8").trim(); + return token; + } catch (error) { + console.error(`Failed to read worker token from file: ${tokenPath}`, error); + throw new Error( + `Unable to read worker token from file: ${ + error instanceof Error ? error.message : "Unknown error" + }` + ); + } +} diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts new file mode 100644 index 00000000000..1c00f33aad3 --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -0,0 +1,374 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import { flattenAttributes } from "@trigger.dev/core/v3/utils/flattenAttributes"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import { ComputeClient, stripImageDigest } from "@internal/compute"; +import { extractTraceparent, getRunnerId } from "../util.js"; +import type { OtlpTraceService } from "../services/otlpTraceService.js"; +import { tryCatch } from "@trigger.dev/core"; + +type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { + gateway: { + url: string; + authToken?: string; + timeoutMs: number; + }; + snapshots: { + enabled: boolean; + delayMs: number; + dispatchLimit: number; + callbackUrl: string; + }; + tracing?: 
OtlpTraceService; + runner: { + instanceName: string; + otelEndpoint: string; + prettyLogs: boolean; + }; +}; + +export class ComputeWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); + private readonly compute: ComputeClient; + + constructor(private opts: ComputeWorkloadManagerOptions) { + if (opts.workloadApiDomain) { + this.logger.warn("⚠️ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); + } + + this.compute = new ComputeClient({ + gatewayUrl: opts.gateway.url, + authToken: opts.gateway.authToken, + timeoutMs: opts.gateway.timeoutMs, + }); + } + + get snapshotsEnabled(): boolean { + return this.opts.snapshots.enabled; + } + + get snapshotDelayMs(): number { + return this.opts.snapshots.delayMs; + } + + get snapshotDispatchLimit(): number { + return this.opts.snapshots.dispatchLimit; + } + + get traceSpansEnabled(): boolean { + return !!this.opts.tracing; + } + + async create(opts: WorkloadManagerCreateOptions) { + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + const envVars: Record = { + OTEL_EXPORTER_OTLP_ENDPOINT: this.opts.runner.otelEndpoint, + TRIGGER_DEQUEUED_AT_MS: String(opts.dequeuedAt.getTime()), + TRIGGER_POD_SCHEDULED_AT_MS: String(Date.now()), + TRIGGER_ENV_ID: opts.envId, + TRIGGER_DEPLOYMENT_ID: opts.deploymentFriendlyId, + TRIGGER_DEPLOYMENT_VERSION: opts.deploymentVersion, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? 
"", + TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, + TRIGGER_RUNNER_ID: runnerId, + TRIGGER_MACHINE_CPU: String(opts.machine.cpu), + TRIGGER_MACHINE_MEMORY: String(opts.machine.memory), + PRETTY_LOGS: String(this.opts.runner.prettyLogs), + }; + + if (this.opts.warmStartUrl) { + envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; + } + + if (this.snapshotsEnabled && this.opts.metadataUrl) { + envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; + } + + if (this.opts.heartbeatIntervalSeconds) { + envVars.TRIGGER_HEARTBEAT_INTERVAL_SECONDS = String(this.opts.heartbeatIntervalSeconds); + } + + if (this.opts.snapshotPollIntervalSeconds) { + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String( + this.opts.snapshotPollIntervalSeconds + ); + } + + if (this.opts.additionalEnvVars) { + Object.assign(envVars, this.opts.additionalEnvVars); + } + + // Strip image digest - resolve by tag, not digest + const imageRef = stripImageDigest(opts.image); + + // Wide event: single canonical log line emitted in finally + const event: Record = { + // High-cardinality identifiers + runId: opts.runFriendlyId, + runnerId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + // Environment + instanceName: this.opts.runner.instanceName, + // Supervisor timing + dequeueResponseMs: opts.dequeueResponseMs, + pollingIntervalMs: opts.pollingIntervalMs, + warmStartCheckMs: opts.warmStartCheckMs, + // Request + image: imageRef, + }; + + const startMs = performance.now(); + + try { + const [error, data] = await tryCatch( + this.compute.instances.create({ + name: runnerId, + image: imageRef, + env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: 
opts.machine.name, + }, + }) + ); + + if (error) { + event.error = error instanceof Error ? error.message : String(error); + event.errorType = + error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch"; + // Intentional: errors are captured in the wide event, not thrown. This matches + // the Docker/K8s managers. The run will eventually time out if scheduling fails. + return; + } + + event.instanceId = data.id; + event.ok = true; + + // Parse timing data from compute response (optional - requires gateway timing flag) + if (data._timing) { + event.timing = data._timing; + } + + this.#emitProvisionSpan(opts, startMs, data._timing); + } finally { + event.durationMs = Math.round(performance.now() - startMs); + event.ok ??= false; + this.logger.debug("create instance", event); + } + } + + async snapshot(opts: { runnerId: string; metadata: Record }): Promise { + const [error] = await tryCatch( + this.compute.instances.snapshot(opts.runnerId, { + callback: { + url: this.opts.snapshots.callbackUrl, + metadata: opts.metadata, + }, + }) + ); + + if (error) { + this.logger.error("snapshot request failed", { + runnerId: opts.runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + this.logger.debug("snapshot request accepted", { runnerId: opts.runnerId }); + return true; + } + + async deleteInstance(runnerId: string): Promise { + const [error] = await tryCatch(this.compute.instances.delete(runnerId)); + + if (error) { + this.logger.error("delete instance failed", { + runnerId, + error: error instanceof Error ? 
error.message : String(error), + }); + return false; + } + + this.logger.debug("delete instance success", { runnerId }); + return true; + } + + #emitProvisionSpan(opts: WorkloadManagerCreateOptions, startMs: number, timing?: unknown) { + if (!this.traceSpansEnabled) return; + + const parsed = parseTraceparent(extractTraceparent(opts.traceContext)); + if (!parsed) return; + + const endMs = performance.now(); + const now = Date.now(); + const provisionStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Span starts at dequeue time so events (dequeue) render in the thin-line section + // before "Started". The actual provision call time is in provisionStartEpochMs. + // Subtract 1ms so compute span always sorts before the attempt span (same dequeue time) + const startEpochMs = opts.dequeuedAt.getTime() - 1; + + const spanAttributes: Record = { + "compute.type": "create", + "compute.provision_start_ms": provisionStartEpochMs, + ...(timing + ? (flattenAttributes(timing, "compute") as Record) + : {}), + }; + + if (opts.dequeueResponseMs !== undefined) { + spanAttributes["supervisor.dequeue_response_ms"] = opts.dequeueResponseMs; + } + if (opts.warmStartCheckMs !== undefined) { + spanAttributes["supervisor.warm_start_check_ms"] = opts.warmStartCheckMs; + } + + // Use the platform API URL, not the runner OTLP endpoint (which may be a VM gateway IP) + this.opts.tracing?.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.provision", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes, + }); + } + + async restore(opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; + machine: { cpu: number; memory: number }; + // Trace context for OTel span emission + traceContext?: Record; + 
envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }): Promise { + const metadata: Record = { + TRIGGER_RUNNER_ID: opts.runnerId, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, + }; + + this.logger.verbose("restore request body", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + }); + + const startMs = performance.now(); + + const [error] = await tryCatch( + this.compute.snapshots.restore(opts.snapshotId, { + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + }) + ); + + const durationMs = Math.round(performance.now() - startMs); + + if (error) { + this.logger.error("restore request failed", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + error: error instanceof Error ? 
error.message : String(error), + durationMs, + }); + return false; + } + + this.logger.debug("restore request success", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + durationMs, + }); + + this.#emitRestoreSpan(opts, startMs); + + return true; + } + + #emitRestoreSpan( + opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + traceContext?: Record; + envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }, + startMs: number + ) { + if (!this.traceSpansEnabled) return; + + const parsed = parseTraceparent(extractTraceparent(opts.traceContext)); + if (!parsed || !opts.envId || !opts.orgId || !opts.projectId) return; + + const endMs = performance.now(); + const now = Date.now(); + const restoreStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Subtract 1ms so restore span always sorts before the attempt span + const startEpochMs = (opts.dequeuedAt?.getTime() ?? restoreStartEpochMs) - 1; + + this.opts.tracing?.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.restore", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes: { + "compute.type": "restore", + "compute.snapshot_id": opts.snapshotId, + }, + }); + } +} diff --git a/apps/supervisor/src/workloadManager/docker.ts b/apps/supervisor/src/workloadManager/docker.ts new file mode 100644 index 00000000000..66405df9ba5 --- /dev/null +++ b/apps/supervisor/src/workloadManager/docker.ts @@ -0,0 +1,304 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import { env } from "../env.js"; +import { getDockerHostDomain, getRunnerId, normalizeDockerHostUrl } from 
"../util.js"; +import Docker from "dockerode"; +import { tryCatch } from "@trigger.dev/core"; +import { ECRAuthService } from "./ecrAuth.js"; + +export class DockerWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("docker-workload-manager"); + private readonly docker: Docker; + + private readonly runnerNetworks: string[]; + private readonly staticAuth?: Docker.AuthConfig; + private readonly platformOverride?: string; + private readonly ecrAuthService?: ECRAuthService; + + constructor(private opts: WorkloadManagerOptions) { + this.docker = new Docker({ + version: env.DOCKER_API_VERSION, + }); + + if (opts.workloadApiDomain) { + this.logger.warn("⚠️ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); + } + + this.runnerNetworks = env.DOCKER_RUNNER_NETWORKS.split(","); + + this.platformOverride = env.DOCKER_PLATFORM; + if (this.platformOverride) { + this.logger.info("🖥️ Platform override", { + targetPlatform: this.platformOverride, + hostPlatform: process.arch, + }); + } + + if (env.DOCKER_REGISTRY_USERNAME && env.DOCKER_REGISTRY_PASSWORD && env.DOCKER_REGISTRY_URL) { + this.logger.info("🐋 Using Docker registry credentials", { + username: env.DOCKER_REGISTRY_USERNAME, + url: env.DOCKER_REGISTRY_URL, + }); + + this.staticAuth = { + username: env.DOCKER_REGISTRY_USERNAME, + password: env.DOCKER_REGISTRY_PASSWORD, + serveraddress: env.DOCKER_REGISTRY_URL, + }; + } else if (ECRAuthService.hasAWSCredentials()) { + this.logger.info("🐋 AWS credentials found, initializing ECR auth service"); + this.ecrAuthService = new ECRAuthService(); + } else { + this.logger.warn( + "🐋 No Docker registry credentials or AWS credentials provided, skipping auth" + ); + } + } + + async create(opts: WorkloadManagerCreateOptions) { + this.logger.verbose("create()", { opts }); + + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + // Build environment variables + const envVars: string[] = [ + 
`OTEL_EXPORTER_OTLP_ENDPOINT=${env.OTEL_EXPORTER_OTLP_ENDPOINT}`, + `TRIGGER_DEQUEUED_AT_MS=${opts.dequeuedAt.getTime()}`, + `TRIGGER_POD_SCHEDULED_AT_MS=${Date.now()}`, + `TRIGGER_ENV_ID=${opts.envId}`, + `TRIGGER_DEPLOYMENT_ID=${opts.deploymentFriendlyId}`, + `TRIGGER_DEPLOYMENT_VERSION=${opts.deploymentVersion}`, + `TRIGGER_RUN_ID=${opts.runFriendlyId}`, + `TRIGGER_SNAPSHOT_ID=${opts.snapshotFriendlyId}`, + `TRIGGER_SUPERVISOR_API_PROTOCOL=${this.opts.workloadApiProtocol}`, + `TRIGGER_SUPERVISOR_API_PORT=${this.opts.workloadApiPort}`, + `TRIGGER_SUPERVISOR_API_DOMAIN=${this.opts.workloadApiDomain ?? getDockerHostDomain()}`, + `TRIGGER_WORKER_INSTANCE_NAME=${env.TRIGGER_WORKER_INSTANCE_NAME}`, + `TRIGGER_RUNNER_ID=${runnerId}`, + `TRIGGER_MACHINE_CPU=${opts.machine.cpu}`, + `TRIGGER_MACHINE_MEMORY=${opts.machine.memory}`, + `PRETTY_LOGS=${env.RUNNER_PRETTY_LOGS}`, + ]; + + if (this.opts.warmStartUrl) { + envVars.push(`TRIGGER_WARM_START_URL=${normalizeDockerHostUrl(this.opts.warmStartUrl)}`); + } + + if (this.opts.metadataUrl) { + envVars.push(`TRIGGER_METADATA_URL=${this.opts.metadataUrl}`); + } + + if (this.opts.heartbeatIntervalSeconds) { + envVars.push(`TRIGGER_HEARTBEAT_INTERVAL_SECONDS=${this.opts.heartbeatIntervalSeconds}`); + } + + if (this.opts.snapshotPollIntervalSeconds) { + envVars.push( + `TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS=${this.opts.snapshotPollIntervalSeconds}` + ); + } + + if (this.opts.additionalEnvVars) { + Object.entries(this.opts.additionalEnvVars).forEach(([key, value]) => { + envVars.push(`${key}=${value}`); + }); + } + + const hostConfig: Docker.HostConfig = { + AutoRemove: !!this.opts.dockerAutoremove, + }; + + const [firstNetwork, ...remainingNetworks] = this.runnerNetworks; + + // Always attach the first network at container creation time. This has the following benefits: + // - If there is only a single network to attach, this will prevent having to make a separate request. 
+ // - If there are multiple networks to attach, this will ensure the runner won't also be connected to the bridge network + hostConfig.NetworkMode = firstNetwork; + + if (env.DOCKER_ENFORCE_MACHINE_PRESETS) { + hostConfig.NanoCpus = opts.machine.cpu * 1e9; + hostConfig.Memory = opts.machine.memory * 1024 * 1024 * 1024; + } + + let imageRef = opts.image; + + if (env.DOCKER_STRIP_IMAGE_DIGEST) { + imageRef = opts.image.split("@")[0]!; + } + + const containerCreateOpts: Docker.ContainerCreateOptions = { + name: runnerId, + Hostname: runnerId, + HostConfig: hostConfig, + Image: imageRef, + AttachStdout: false, + AttachStderr: false, + AttachStdin: false, + }; + + if (this.platformOverride) { + containerCreateOpts.platform = this.platformOverride; + } + + const logger = this.logger.child({ opts, containerCreateOpts }); + + const [inspectError, inspectResult] = await tryCatch(this.docker.getImage(imageRef).inspect()); + + let shouldPull = !!inspectError; + if (this.platformOverride) { + const imageArchitecture = inspectResult?.Architecture; + + // When the image architecture doesn't match the platform, we need to pull the image + if (imageArchitecture && !this.platformOverride.includes(imageArchitecture)) { + shouldPull = true; + } + } + + // If the image is not present, try to pull it + if (shouldPull) { + logger.info("Pulling image", { + error: inspectError, + image: opts.image, + targetPlatform: this.platformOverride, + imageArchitecture: inspectResult?.Architecture, + }); + + // Get auth config (static or ECR) + const authConfig = await this.getAuthConfig(); + + // Ensure the image is present + const [createImageError, imageResponseReader] = await tryCatch( + this.docker.createImage(authConfig, { + fromImage: imageRef, + ...(this.platformOverride ? 
{ platform: this.platformOverride } : {}), + }) + ); + if (createImageError) { + logger.error("Failed to pull image", { error: createImageError }); + return; + } + + const [imageReadError, imageResponse] = await tryCatch(readAllChunks(imageResponseReader)); + if (imageReadError) { + logger.error("failed to read image response", { error: imageReadError }); + return; + } + + logger.debug("pulled image", { image: opts.image, imageResponse }); + } else { + // Image is present, so we can use it to create the container + } + + // Create container + const [createContainerError, container] = await tryCatch( + this.docker.createContainer({ + ...containerCreateOpts, + // Add env vars here so they're not logged + Env: envVars, + }) + ); + + if (createContainerError) { + logger.error("Failed to create container", { error: createContainerError }); + return; + } + + // If there are multiple networks to attach to we need to attach the remaining ones after creation + if (remainingNetworks.length > 0) { + await this.attachContainerToNetworks({ + containerId: container.id, + networkNames: remainingNetworks, + }); + } + + // Start container + const [startError, startResult] = await tryCatch(container.start()); + + if (startError) { + logger.error("Failed to start container", { error: startError, containerId: container.id }); + return; + } + + logger.debug("create succeeded", { startResult, containerId: container.id }); + } + + /** + * Get authentication config for Docker operations + * Uses static credentials if available, otherwise attempts ECR auth + */ + private async getAuthConfig(): Promise { + // Use static credentials if available + if (this.staticAuth) { + return this.staticAuth; + } + + // Use ECR auth if service is available + if (this.ecrAuthService) { + const ecrAuth = await this.ecrAuthService.getAuthConfig(); + return ecrAuth || undefined; + } + + // No auth available + return undefined; + } + + private async attachContainerToNetworks({ + containerId, + networkNames, + 
}: { + containerId: string; + networkNames: string[]; + }) { + this.logger.debug("Attaching container to networks", { containerId, networkNames }); + + const [error, networkResults] = await tryCatch( + this.docker.listNetworks({ + filters: { + // Full name matches only to prevent unexpected results + name: networkNames.map((name) => `^${name}$`), + }, + }) + ); + + if (error) { + this.logger.error("Failed to list networks", { networkNames }); + return; + } + + const results = await Promise.allSettled( + networkResults.map((networkInfo) => { + const network = this.docker.getNetwork(networkInfo.Id); + return network.connect({ Container: containerId }); + }) + ); + + if (results.some((r) => r.status === "rejected")) { + this.logger.error("Failed to attach container to some networks", { + containerId, + networkNames, + results, + }); + return; + } + + this.logger.debug("Attached container to networks", { + containerId, + networkNames, + results, + }); + } +} + +async function readAllChunks(reader: NodeJS.ReadableStream) { + const chunks = []; + for await (const chunk of reader) { + chunks.push(chunk.toString()); + } + return chunks; +} diff --git a/apps/supervisor/src/workloadManager/ecrAuth.ts b/apps/supervisor/src/workloadManager/ecrAuth.ts new file mode 100644 index 00000000000..33e98f63195 --- /dev/null +++ b/apps/supervisor/src/workloadManager/ecrAuth.ts @@ -0,0 +1,144 @@ +import { ECRClient, GetAuthorizationTokenCommand } from "@aws-sdk/client-ecr"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { tryCatch } from "@trigger.dev/core"; +import Docker from "dockerode"; + +interface ECRTokenCache { + token: string; + username: string; + serverAddress: string; + expiresAt: Date; +} + +export class ECRAuthService { + private readonly logger = new SimpleStructuredLogger("ecr-auth-service"); + private readonly ecrClient: ECRClient; + private tokenCache: ECRTokenCache | null = null; + + constructor() { + this.ecrClient = new 
ECRClient(); + + this.logger.info("🔐 ECR Auth Service initialized", { + region: this.ecrClient.config.region, + }); + } + + /** + * Check if we have AWS credentials configured + */ + static hasAWSCredentials(): boolean { + if (process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY) { + return true; + } + + if ( + process.env.AWS_PROFILE || + process.env.AWS_ROLE_ARN || + process.env.AWS_WEB_IDENTITY_TOKEN_FILE + ) { + return true; + } + + return false; + } + + /** + * Check if the current token is still valid with a 10-minute buffer + */ + private isTokenValid(): boolean { + if (!this.tokenCache) { + return false; + } + + const now = new Date(); + const bufferMs = 10 * 60 * 1000; // 10 minute buffer before expiration + return now < new Date(this.tokenCache.expiresAt.getTime() - bufferMs); + } + + /** + * Get a fresh ECR authorization token from AWS + */ + private async fetchNewToken(): Promise { + const [error, response] = await tryCatch( + this.ecrClient.send(new GetAuthorizationTokenCommand({})) + ); + + if (error) { + this.logger.error("Failed to get ECR authorization token", { error }); + return null; + } + + const authData = response.authorizationData?.[0]; + if (!authData?.authorizationToken || !authData.proxyEndpoint) { + this.logger.error("Invalid ECR authorization response", { authData }); + return null; + } + + // Decode the base64 token to get username:password + const decoded = Buffer.from(authData.authorizationToken, "base64").toString("utf-8"); + const [username, password] = decoded.split(":", 2); + + if (!username || !password) { + this.logger.error("Failed to parse ECR authorization token"); + return null; + } + + const expiresAt = authData.expiresAt || new Date(Date.now() + 12 * 60 * 60 * 1000); // Default 12 hours + + const tokenCache: ECRTokenCache = { + token: password, + username, + serverAddress: authData.proxyEndpoint, + expiresAt, + }; + + this.logger.info("🔐 Successfully fetched ECR token", { + username, + serverAddress: 
authData.proxyEndpoint, + expiresAt: expiresAt.toISOString(), + }); + + return tokenCache; + } + + /** + * Get ECR auth config for Docker operations + * Returns cached token if valid, otherwise fetches a new one + */ + async getAuthConfig(): Promise { + // Check if cached token is still valid + if (this.isTokenValid()) { + this.logger.debug("Using cached ECR token"); + return { + username: this.tokenCache!.username, + password: this.tokenCache!.token, + serveraddress: this.tokenCache!.serverAddress, + }; + } + + // Fetch new token + this.logger.info("Fetching new ECR authorization token"); + const newToken = await this.fetchNewToken(); + + if (!newToken) { + return null; + } + + // Cache the new token + this.tokenCache = newToken; + + return { + username: newToken.username, + password: newToken.token, + serveraddress: newToken.serverAddress, + }; + } + + /** + * Clear the cached token (useful for testing or forcing refresh) + */ + clearCache(): void { + this.tokenCache = null; + this.logger.debug("ECR token cache cleared"); + } +} diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts new file mode 100644 index 00000000000..ec089267219 --- /dev/null +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -0,0 +1,565 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import type { + EnvironmentType, + MachinePreset, + MachinePresetName, + PlacementTag, +} from "@trigger.dev/core/v3"; +import { PlacementTagProcessor } from "@trigger.dev/core/v3/serverOnly"; +import { env } from "../env.js"; +import { type K8sApi, createK8sApi, type k8s } from "../clients/kubernetes.js"; +import { getRunnerId } from "../util.js"; + +type ResourceQuantities = { + [K in "cpu" | "memory" | "ephemeral-storage"]?: string; +}; + +const cpuRequestRatioByMachinePreset: 
Record = { + micro: env.KUBERNETES_CPU_REQUEST_RATIO_MICRO, + "small-1x": env.KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X, + "small-2x": env.KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X, + "medium-1x": env.KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X, + "medium-2x": env.KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X, + "large-1x": env.KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X, + "large-2x": env.KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X, +}; + +const memoryRequestRatioByMachinePreset: Record = { + micro: env.KUBERNETES_MEMORY_REQUEST_RATIO_MICRO, + "small-1x": env.KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X, + "small-2x": env.KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X, + "medium-1x": env.KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X, + "medium-2x": env.KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X, + "large-1x": env.KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X, + "large-2x": env.KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X, +}; + +export class KubernetesWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("kubernetes-workload-provider"); + private k8s: K8sApi; + private namespace = env.KUBERNETES_NAMESPACE; + private placementTagProcessor: PlacementTagProcessor; + + // Resource settings + private readonly cpuRequestMinCores = env.KUBERNETES_CPU_REQUEST_MIN_CORES; + private readonly cpuRequestRatio = env.KUBERNETES_CPU_REQUEST_RATIO; + private readonly memoryRequestMinGb = env.KUBERNETES_MEMORY_REQUEST_MIN_GB; + private readonly memoryRequestRatio = env.KUBERNETES_MEMORY_REQUEST_RATIO; + private readonly memoryOverheadGb = env.KUBERNETES_MEMORY_OVERHEAD_GB; + + constructor(private opts: WorkloadManagerOptions) { + this.k8s = createK8sApi(); + this.placementTagProcessor = new PlacementTagProcessor({ + enabled: env.PLACEMENT_TAGS_ENABLED, + prefix: env.PLACEMENT_TAGS_PREFIX, + }); + + if (opts.workloadApiDomain) { + this.logger.warn("[KubernetesWorkloadManager] ⚠️ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); + } + } + + private addPlacementTags( + 
podSpec: Omit, + placementTags?: PlacementTag[] + ): Omit { + const nodeSelector = this.placementTagProcessor.convertToNodeSelector( + placementTags, + podSpec.nodeSelector + ); + + return { + ...podSpec, + nodeSelector, + }; + } + + private stripImageDigest(imageRef: string): string { + if (!env.KUBERNETES_STRIP_IMAGE_DIGEST) { + return imageRef; + } + + const atIndex = imageRef.lastIndexOf("@"); + + if (atIndex === -1) { + return imageRef; + } + + return imageRef.substring(0, atIndex); + } + + private clamp(value: number, min: number, max: number): number { + return Math.min(Math.max(value, min), max); + } + + async create(opts: WorkloadManagerCreateOptions) { + this.logger.verbose("[KubernetesWorkloadManager] Creating container", { opts }); + + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + try { + await this.k8s.core.createNamespacedPod({ + namespace: this.namespace, + body: { + metadata: { + name: runnerId, + namespace: this.namespace, + labels: { + ...this.#getSharedLabels(opts), + app: "task-run", + "app.kubernetes.io/part-of": "trigger-worker", + "app.kubernetes.io/component": "create", + }, + }, + spec: { + ...this.addPlacementTags(this.#defaultPodSpec, opts.placementTags), + affinity: this.#getAffinity(opts), + tolerations: this.#getScheduleTolerations(this.#isScheduledRun(opts)), + terminationGracePeriodSeconds: 60 * 60, + containers: [ + { + name: "run-controller", + image: this.stripImageDigest(opts.image), + ports: [ + { + containerPort: 8000, + }, + ], + resources: this.#getResourcesForMachine(opts.machine), + env: [ + { + name: "TRIGGER_DEQUEUED_AT_MS", + value: opts.dequeuedAt.getTime().toString(), + }, + { + name: "TRIGGER_POD_SCHEDULED_AT_MS", + value: Date.now().toString(), + }, + { + name: "TRIGGER_RUN_ID", + value: opts.runFriendlyId, + }, + { + name: "TRIGGER_ENV_ID", + value: opts.envId, + }, + { + name: "TRIGGER_DEPLOYMENT_ID", + value: opts.deploymentFriendlyId, + }, + { + name: "TRIGGER_DEPLOYMENT_VERSION", 
+ value: opts.deploymentVersion, + }, + { + name: "TRIGGER_SNAPSHOT_ID", + value: opts.snapshotFriendlyId, + }, + { + name: "TRIGGER_SUPERVISOR_API_PROTOCOL", + value: this.opts.workloadApiProtocol, + }, + { + name: "TRIGGER_SUPERVISOR_API_PORT", + value: `${this.opts.workloadApiPort}`, + }, + { + name: "TRIGGER_SUPERVISOR_API_DOMAIN", + ...(this.opts.workloadApiDomain + ? { + value: this.opts.workloadApiDomain, + } + : { + valueFrom: { + fieldRef: { + fieldPath: "status.hostIP", + }, + }, + }), + }, + { + name: "TRIGGER_WORKER_INSTANCE_NAME", + valueFrom: { + fieldRef: { + fieldPath: "spec.nodeName", + }, + }, + }, + { + name: "OTEL_EXPORTER_OTLP_ENDPOINT", + value: env.OTEL_EXPORTER_OTLP_ENDPOINT, + }, + { + name: "TRIGGER_RUNNER_ID", + value: runnerId, + }, + { + name: "TRIGGER_MACHINE_CPU", + value: `${opts.machine.cpu}`, + }, + { + name: "TRIGGER_MACHINE_MEMORY", + value: `${opts.machine.memory}`, + }, + { + name: "LIMITS_CPU", + valueFrom: { + resourceFieldRef: { + resource: "limits.cpu", + }, + }, + }, + { + name: "LIMITS_MEMORY", + valueFrom: { + resourceFieldRef: { + resource: "limits.memory", + }, + }, + }, + ...(this.opts.warmStartUrl + ? [{ name: "TRIGGER_WARM_START_URL", value: this.opts.warmStartUrl }] + : []), + ...(this.opts.metadataUrl + ? [{ name: "TRIGGER_METADATA_URL", value: this.opts.metadataUrl }] + : []), + ...(this.opts.heartbeatIntervalSeconds + ? [ + { + name: "TRIGGER_HEARTBEAT_INTERVAL_SECONDS", + value: `${this.opts.heartbeatIntervalSeconds}`, + }, + ] + : []), + ...(this.opts.snapshotPollIntervalSeconds + ? [ + { + name: "TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS", + value: `${this.opts.snapshotPollIntervalSeconds}`, + }, + ] + : []), + ...(this.opts.additionalEnvVars + ? 
Object.entries(this.opts.additionalEnvVars).map(([key, value]) => ({ + name: key, + value: value, + })) + : []), + ], + }, + ], + }, + }, + }); + } catch (err: unknown) { + this.#handleK8sError(err); + } + } + + #throwUnlessRecord(candidate: unknown): asserts candidate is Record { + if (typeof candidate !== "object" || candidate === null) { + throw candidate; + } + } + + #handleK8sError(err: unknown) { + this.#throwUnlessRecord(err); + + if ("body" in err && err.body) { + this.logger.error("[KubernetesWorkloadManager] Create failed", { rawError: err.body }); + this.#throwUnlessRecord(err.body); + + if (typeof err.body.message === "string") { + throw new Error(err.body?.message); + } else { + throw err.body; + } + } else { + this.logger.error("[KubernetesWorkloadManager] Create failed", { rawError: err }); + throw err; + } + } + + #envTypeToLabelValue(type: EnvironmentType) { + switch (type) { + case "PRODUCTION": + return "prod"; + case "STAGING": + return "stg"; + case "DEVELOPMENT": + return "dev"; + case "PREVIEW": + return "preview"; + } + } + + private getImagePullSecrets(): k8s.V1LocalObjectReference[] | undefined { + return this.opts.imagePullSecrets?.map((name) => ({ name })); + } + + get #defaultPodSpec(): Omit { + return { + restartPolicy: "Never", + automountServiceAccountToken: false, + imagePullSecrets: this.getImagePullSecrets(), + ...(env.KUBERNETES_SCHEDULER_NAME + ? { + schedulerName: env.KUBERNETES_SCHEDULER_NAME, + } + : {}), + ...(env.KUBERNETES_WORKER_NODETYPE_LABEL + ? 
{ + nodeSelector: { + nodetype: env.KUBERNETES_WORKER_NODETYPE_LABEL, + }, + } + : {}), + }; + } + + get #defaultResourceRequests(): ResourceQuantities { + return { + "ephemeral-storage": env.KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST, + }; + } + + get #defaultResourceLimits(): ResourceQuantities { + return { + "ephemeral-storage": env.KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT, + }; + } + + #isScheduledRun(opts: WorkloadManagerCreateOptions): boolean { + return opts.annotations?.rootTriggerSource === "schedule"; + } + + #getSharedLabels(opts: WorkloadManagerCreateOptions): Record { + const labels: Record = { + env: opts.envId, + envtype: this.#envTypeToLabelValue(opts.envType), + org: opts.orgId, + project: opts.projectId, + machine: opts.machine.name, + // We intentionally use a boolean label rather than exposing the full trigger source + // (e.g. sdk, api, cli, mcp, schedule) to keep label cardinality low in metrics. + // The schedule vs non-schedule distinction is all we need for the current metrics + // and pool-level scheduling decisions; finer-grained source breakdowns live in run annotations. + scheduled: String(this.#isScheduledRun(opts)), + }; + + // Add privatelink label for CiliumNetworkPolicy matching + if (opts.hasPrivateLink) { + labels.privatelink = opts.orgId; + } + + return labels; + } + + #getResourceRequestsForMachine(preset: MachinePreset): ResourceQuantities { + const cpuRatio = cpuRequestRatioByMachinePreset[preset.name] ?? this.cpuRequestRatio; + const memoryRatio = memoryRequestRatioByMachinePreset[preset.name] ?? 
this.memoryRequestRatio; + + const cpuRequest = preset.cpu * cpuRatio; + const memoryRequest = preset.memory * memoryRatio; + + // Clamp between min and max + const clampedCpu = this.clamp(cpuRequest, this.cpuRequestMinCores, preset.cpu); + const clampedMemory = this.clamp(memoryRequest, this.memoryRequestMinGb, preset.memory); + + return { + cpu: `${clampedCpu}`, + memory: `${clampedMemory}G`, + }; + } + + #getResourceLimitsForMachine(preset: MachinePreset): ResourceQuantities { + const memoryLimit = this.memoryOverheadGb + ? preset.memory + this.memoryOverheadGb + : preset.memory; + + return { + cpu: `${preset.cpu}`, + memory: `${memoryLimit}G`, + }; + } + + #getResourcesForMachine(preset: MachinePreset): k8s.V1ResourceRequirements { + return { + requests: { + ...this.#defaultResourceRequests, + ...this.#getResourceRequestsForMachine(preset), + }, + limits: { + ...this.#defaultResourceLimits, + ...this.#getResourceLimitsForMachine(preset), + }, + }; + } + + #isLargeMachine(preset: MachinePreset): boolean { + return preset.name.startsWith("large-"); + } + + #getAffinity(opts: WorkloadManagerCreateOptions): k8s.V1Affinity | undefined { + const largeNodeAffinity = this.#getNodeAffinityRules(opts.machine); + const scheduleNodeAffinity = this.#getScheduleNodeAffinityRules(this.#isScheduledRun(opts)); + const podAffinity = this.#getProjectPodAffinity(opts.projectId); + + // Merge node affinity rules from multiple sources + const preferred = [ + ...(largeNodeAffinity?.preferredDuringSchedulingIgnoredDuringExecution ?? []), + ...(scheduleNodeAffinity?.preferredDuringSchedulingIgnoredDuringExecution ?? []), + ]; + // Only large machine affinity produces hard requirements (non-large runs must stay off the large pool). + // Schedule affinity is soft both ways. + const required = [ + ...(largeNodeAffinity?.requiredDuringSchedulingIgnoredDuringExecution?.nodeSelectorTerms ?? 
[]), + ]; + + const hasNodeAffinity = preferred.length > 0 || required.length > 0; + + if (!hasNodeAffinity && !podAffinity) { + return undefined; + } + + return { + ...(hasNodeAffinity && { + nodeAffinity: { + ...(preferred.length > 0 && { preferredDuringSchedulingIgnoredDuringExecution: preferred }), + ...(required.length > 0 && { + requiredDuringSchedulingIgnoredDuringExecution: { nodeSelectorTerms: required }, + }), + }, + }), + ...(podAffinity && { podAffinity }), + }; + } + + #getNodeAffinityRules(preset: MachinePreset): k8s.V1NodeAffinity | undefined { + if (!env.KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED) { + return undefined; + } + + if (this.#isLargeMachine(preset)) { + // soft preference for the large-machine pool, falls back to standard if unavailable + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY, + operator: "In", + values: [env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + }, + ], + }; + } + + // not schedulable in the large-machine pool + return { + requiredDuringSchedulingIgnoredDuringExecution: { + nodeSelectorTerms: [ + { + matchExpressions: [ + { + key: env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY, + operator: "NotIn", + values: [env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + ], + }, + }; + } + + #getScheduleNodeAffinityRules(isScheduledRun: boolean): k8s.V1NodeAffinity | undefined { + if (!env.KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE) { + return undefined; + } + + if (isScheduledRun) { + // soft preference for the schedule pool + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, + 
operator: "In", + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + }, + ], + }; + } + + // soft anti-affinity: non-schedule runs prefer to avoid the schedule pool + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, + operator: "NotIn", + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + }, + ], + }; + } + + #getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined { + if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS?.length) { + return undefined; + } + + return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS; + } + + #getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined { + if (!env.KUBERNETES_PROJECT_AFFINITY_ENABLED) { + return undefined; + } + + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_PROJECT_AFFINITY_WEIGHT, + podAffinityTerm: { + labelSelector: { + matchExpressions: [ + { + key: "project", + operator: "In", + values: [projectId], + }, + ], + }, + topologyKey: env.KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY, + }, + }, + ], + }; + } +} diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts new file mode 100644 index 00000000000..86199afe469 --- /dev/null +++ b/apps/supervisor/src/workloadManager/types.ts @@ -0,0 +1,47 @@ +import type { EnvironmentType, MachinePreset, PlacementTag, RunAnnotations } from "@trigger.dev/core/v3"; + +export interface WorkloadManagerOptions { + workloadApiProtocol: "http" | "https"; + workloadApiDomain?: string; // If unset, will use orchestrator-specific default + workloadApiPort: number; + warmStartUrl?: string; + metadataUrl?: string; + imagePullSecrets?: string[]; + heartbeatIntervalSeconds?: number; + snapshotPollIntervalSeconds?: number; + 
additionalEnvVars?: Record; + dockerAutoremove?: boolean; +} + +export interface WorkloadManager { + create: (opts: WorkloadManagerCreateOptions) => Promise; +} + +export interface WorkloadManagerCreateOptions { + image: string; + machine: MachinePreset; + version: string; + nextAttemptNumber?: number; + dequeuedAt: Date; + placementTags?: PlacementTag[]; + // Timing context (populated by supervisor handler, included in wide event) + dequeueResponseMs?: number; + pollingIntervalMs?: number; + warmStartCheckMs?: number; + // identifiers + envId: string; + envType: EnvironmentType; + orgId: string; + projectId: string; + deploymentFriendlyId: string; + deploymentVersion: string; + runId: string; + runFriendlyId: string; + snapshotId: string; + snapshotFriendlyId: string; + // Trace context for OTel span emission (W3C format: { traceparent: "00-...", tracestate?: "..." }) + traceContext?: Record; + annotations?: RunAnnotations; + // private networking + hasPrivateLink?: boolean; +} diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts new file mode 100644 index 00000000000..bd38cc8700f --- /dev/null +++ b/apps/supervisor/src/workloadServer/index.ts @@ -0,0 +1,666 @@ +import { type Namespace, Server, type Socket } from "socket.io"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import EventEmitter from "node:events"; +import { z } from "zod"; +import { + type SupervisorHttpClient, + WORKLOAD_HEADERS, + type WorkloadClientSocketData, + type WorkloadClientToServerEvents, + type WorkloadContinueRunExecutionResponseBody, + WorkloadDebugLogRequestBody, + type WorkloadDequeueFromVersionResponseBody, + WorkloadHeartbeatRequestBody, + type WorkloadHeartbeatResponseBody, + WorkloadRunAttemptCompleteRequestBody, + type WorkloadRunAttemptCompleteResponseBody, + WorkloadRunAttemptStartRequestBody, + type WorkloadRunAttemptStartResponseBody, + WorkloadRunSnapshotsSinceResponseBody, + type 
WorkloadServerToClientEvents, + type WorkloadSuspendRunResponseBody, +} from "@trigger.dev/core/v3/workers"; +import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOnly"; +import { type IncomingMessage } from "node:http"; +import { register } from "../metrics.js"; +import { env } from "../env.js"; +import { SnapshotCallbackPayloadSchema } from "@internal/compute"; +import { + ComputeSnapshotService, + type RunTraceContext, +} from "../services/computeSnapshotService.js"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import type { OtlpTraceService } from "../services/otlpTraceService.js"; + +// Use the official export when upgrading to socket.io@4.8.0 +interface DefaultEventsMap { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + [event: string]: (...args: any[]) => void; +} + +const WorkloadActionParams = z.object({ + runFriendlyId: z.string(), + snapshotFriendlyId: z.string(), +}); + +type WorkloadServerEvents = { + runConnected: [ + { + run: { + friendlyId: string; + }; + }, + ]; + runDisconnected: [ + { + run: { + friendlyId: string; + }; + }, + ]; +}; + +type WorkloadServerOptions = { + port: number; + host?: string; + workerClient: SupervisorHttpClient; + checkpointClient?: CheckpointClient; + computeManager?: ComputeWorkloadManager; + tracing?: OtlpTraceService; +}; + +export class WorkloadServer extends EventEmitter { + private checkpointClient?: CheckpointClient; + private readonly snapshotService?: ComputeSnapshotService; + + private readonly logger = new SimpleStructuredLogger("workload-server"); + + private readonly httpServer: HttpServer; + private readonly websocketServer: Namespace< + WorkloadClientToServerEvents, + WorkloadServerToClientEvents, + DefaultEventsMap, + WorkloadClientSocketData + >; + + private readonly runSockets = new Map< + string, + Socket< + WorkloadClientToServerEvents, + WorkloadServerToClientEvents, + DefaultEventsMap, + WorkloadClientSocketData + > + >(); 
+ + private readonly workerClient: SupervisorHttpClient; + + constructor(opts: WorkloadServerOptions) { + super(); + + const host = opts.host ?? "0.0.0.0"; + const port = opts.port; + + this.workerClient = opts.workerClient; + this.checkpointClient = opts.checkpointClient; + + if (opts.computeManager?.snapshotsEnabled) { + this.snapshotService = new ComputeSnapshotService({ + computeManager: opts.computeManager, + workerClient: opts.workerClient, + tracing: opts.tracing, + }); + } + + this.httpServer = this.createHttpServer({ host, port }); + this.websocketServer = this.createWebsocketServer(); + } + + private headerValueFromRequest(req: IncomingMessage, headerName: string): string | undefined { + const value = req.headers[headerName]; + + if (Array.isArray(value)) { + return value[0]; + } + + return value; + } + + private runnerIdFromRequest(req: IncomingMessage): string | undefined { + return this.headerValueFromRequest(req, WORKLOAD_HEADERS.RUNNER_ID); + } + + private deploymentIdFromRequest(req: IncomingMessage): string | undefined { + return this.headerValueFromRequest(req, WORKLOAD_HEADERS.DEPLOYMENT_ID); + } + + private deploymentVersionFromRequest(req: IncomingMessage): string | undefined { + return this.headerValueFromRequest(req, WORKLOAD_HEADERS.DEPLOYMENT_VERSION); + } + + private projectRefFromRequest(req: IncomingMessage): string | undefined { + return this.headerValueFromRequest(req, WORKLOAD_HEADERS.PROJECT_REF); + } + + private createHttpServer({ host, port }: { host: string; port: number }) { + const httpServer = new HttpServer({ + port, + host, + metrics: { + register, + expose: false, + }, + }) + .route("/health", "GET", { + handler: async ({ reply }) => { + reply.text("OK"); + }, + }) + .route( + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/start", + "POST", + { + paramsSchema: WorkloadActionParams, + bodySchema: WorkloadRunAttemptStartRequestBody, + handler: async ({ req, reply, params, body }) => { + 
const startResponse = await this.workerClient.startRunAttempt( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!startResponse.success) { + this.logger.error("Failed to start run", { + params, + error: startResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(startResponse.data satisfies WorkloadRunAttemptStartResponseBody); + return; + }, + } + ) + .route( + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/complete", + "POST", + { + paramsSchema: WorkloadActionParams, + bodySchema: WorkloadRunAttemptCompleteRequestBody, + handler: async ({ req, reply, params, body }) => { + const completeResponse = await this.workerClient.completeRunAttempt( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!completeResponse.success) { + this.logger.error("Failed to complete run", { + params, + error: completeResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(completeResponse.data satisfies WorkloadRunAttemptCompleteResponseBody); + return; + }, + } + ) + .route( + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/heartbeat", + "POST", + { + paramsSchema: WorkloadActionParams, + bodySchema: WorkloadHeartbeatRequestBody, + handler: async ({ req, reply, params, body }) => { + const heartbeatResponse = await this.workerClient.heartbeatRun( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!heartbeatResponse.success) { + this.logger.error("Failed to heartbeat run", { + params, + error: heartbeatResponse.error, + }); + reply.empty(500); + return; + } + + reply.json({ + ok: true, + } satisfies WorkloadHeartbeatResponseBody); + }, + } + ) + .route( + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/suspend", + "GET", + { + paramsSchema: WorkloadActionParams, + handler: async ({ reply, params, req 
}) => { + const runnerId = this.runnerIdFromRequest(req); + const deploymentVersion = this.deploymentVersionFromRequest(req); + const projectRef = this.projectRefFromRequest(req); + + this.logger.debug("Suspend request", { + params, + runnerId, + deploymentVersion, + projectRef, + }); + + if (!runnerId || !deploymentVersion || !projectRef) { + this.logger.error("Invalid headers for suspend request", { + ...params, + runnerId, + deploymentVersion, + projectRef, + }); + reply.json( + { + ok: false, + error: "Invalid headers", + } satisfies WorkloadSuspendRunResponseBody, + false, + 400 + ); + return; + } + + if (this.snapshotService) { + // Compute mode: delay snapshot to avoid wasted work on short-lived waitpoints. + // If the run continues before the delay expires, the snapshot is cancelled. + reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); + + this.snapshotService.schedule(params.runFriendlyId, { + runnerId, + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + }); + + return; + } + + if (!this.checkpointClient) { + reply.json( + { + ok: false, + error: "Checkpoints disabled", + } satisfies WorkloadSuspendRunResponseBody, + false, + 400 + ); + return; + } + + reply.json( + { + ok: true, + } satisfies WorkloadSuspendRunResponseBody, + false, + 202 + ); + + const suspendResult = await this.checkpointClient.suspendRun({ + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + body: { + runnerId, + runId: params.runFriendlyId, + snapshotId: params.snapshotFriendlyId, + projectRef, + deploymentVersion, + }, + }); + + if (!suspendResult) { + this.logger.error("Failed to suspend run", { params }); + return; + } + }, + } + ) + .route( + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/continue", + "GET", + { + paramsSchema: WorkloadActionParams, + handler: async ({ req, reply, params }) => { + this.logger.debug("Run continuation request", { params 
}); + + // Cancel any pending delayed snapshot for this run + this.snapshotService?.cancel(params.runFriendlyId); + + const continuationResult = await this.workerClient.continueRunExecution( + params.runFriendlyId, + params.snapshotFriendlyId, + this.runnerIdFromRequest(req) + ); + + if (!continuationResult.success) { + this.logger.error("Failed to continue run execution", { params }); + reply.json( + { + ok: false, + error: "Failed to continue run execution", + }, + false, + 400 + ); + return; + } + + reply.json(continuationResult.data as WorkloadContinueRunExecutionResponseBody); + }, + } + ) + .route( + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/since/:snapshotFriendlyId", + "GET", + { + paramsSchema: WorkloadActionParams, + handler: async ({ req, reply, params }) => { + const sinceSnapshotResponse = await this.workerClient.getSnapshotsSince( + params.runFriendlyId, + params.snapshotFriendlyId, + this.runnerIdFromRequest(req) + ); + + if (!sinceSnapshotResponse.success) { + this.logger.error("Failed to get snapshots since", { + runId: params.runFriendlyId, + error: sinceSnapshotResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(sinceSnapshotResponse.data satisfies WorkloadRunSnapshotsSinceResponseBody); + }, + } + ) + .route("/api/v1/workload-actions/deployments/:deploymentId/dequeue", "GET", { + paramsSchema: z.object({ + deploymentId: z.string(), + }), + + handler: async ({ req, reply, params }) => { + const dequeueResponse = await this.workerClient.dequeueFromVersion( + params.deploymentId, + 1, + this.runnerIdFromRequest(req) + ); + + if (!dequeueResponse.success) { + this.logger.error("Failed to get latest snapshot", { + deploymentId: params.deploymentId, + error: dequeueResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(dequeueResponse.data satisfies WorkloadDequeueFromVersionResponseBody); + }, + }); + + if (env.SEND_RUN_DEBUG_LOGS) { + 
httpServer.route("/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", "POST", { + paramsSchema: WorkloadActionParams.pick({ runFriendlyId: true }), + bodySchema: WorkloadDebugLogRequestBody, + handler: async ({ req, reply, params, body }) => { + reply.empty(204); + + await this.workerClient.sendDebugLog( + params.runFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + }, + }); + } else { + // Lightweight mock route without schemas + httpServer.route("/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", "POST", { + handler: async ({ reply }) => { + reply.empty(204); + }, + }); + } + + // Compute snapshot callback endpoint + httpServer.route("/api/v1/compute/snapshot-complete", "POST", { + bodySchema: SnapshotCallbackPayloadSchema, + handler: async ({ reply, body }) => { + if (!this.snapshotService) { + reply.empty(404); + return; + } + + const result = await this.snapshotService.handleCallback(body); + reply.empty(result.status); + }, + }); + + return httpServer; + } + + private createWebsocketServer() { + const io = new Server(this.httpServer.server); + + const websocketServer: Namespace< + WorkloadClientToServerEvents, + WorkloadServerToClientEvents, + DefaultEventsMap, + WorkloadClientSocketData + > = io.of("/workload"); + + websocketServer.on("disconnect", (socket) => { + this.logger.verbose("[WS] disconnect", socket.id); + }); + websocketServer.use(async (socket, next) => { + const setSocketDataFromHeader = ( + dataKey: keyof typeof socket.data, + headerName: string, + required: boolean = true + ) => { + const value = socket.handshake.headers[headerName]; + + if (value) { + if (Array.isArray(value)) { + if (value[0]) { + socket.data[dataKey] = value[0]; + return; + } + } else { + socket.data[dataKey] = value; + return; + } + } + + if (required) { + this.logger.error("[WS] missing required header", { headerName }); + throw new Error("missing header"); + } + }; + + try { + setSocketDataFromHeader("deploymentId", WORKLOAD_HEADERS.DEPLOYMENT_ID); + 
setSocketDataFromHeader("runnerId", WORKLOAD_HEADERS.RUNNER_ID); + } catch (error) { + this.logger.error("[WS] setSocketDataFromHeader error", { error }); + socket.disconnect(true); + return; + } + + this.logger.debug("[WS] auth success", socket.data); + + next(); + }); + websocketServer.on("connection", (socket) => { + const socketLogger = this.logger.child({ + socketId: socket.id, + socketData: socket.data, + }); + + const getSocketMetadata = () => { + return { + deploymentId: socket.data.deploymentId, + runId: socket.data.runFriendlyId, + snapshotId: socket.data.snapshotId, + runnerId: socket.data.runnerId, + }; + }; + + const runConnected = (friendlyId: string) => { + socketLogger.debug("runConnected", { ...getSocketMetadata() }); + + // If there's already a run ID set, we should "disconnect" it from this socket + if (socket.data.runFriendlyId && socket.data.runFriendlyId !== friendlyId) { + socketLogger.debug("runConnected: disconnecting existing run", { + ...getSocketMetadata(), + newRunId: friendlyId, + oldRunId: socket.data.runFriendlyId, + }); + runDisconnected(socket.data.runFriendlyId); + } + + this.runSockets.set(friendlyId, socket); + this.emit("runConnected", { run: { friendlyId } }); + socket.data.runFriendlyId = friendlyId; + }; + + const runDisconnected = (friendlyId: string) => { + socketLogger.debug("runDisconnected", { ...getSocketMetadata() }); + + this.runSockets.delete(friendlyId); + this.emit("runDisconnected", { run: { friendlyId } }); + socket.data.runFriendlyId = undefined; + }; + + socketLogger.debug("wsServer socket connected", { ...getSocketMetadata() }); + + // FIXME: where does this get set? 
+ if (socket.data.runFriendlyId) { + runConnected(socket.data.runFriendlyId); + } + + socket.on("disconnecting", (reason, description) => { + socketLogger.verbose("Socket disconnecting", { + ...getSocketMetadata(), + reason, + description, + }); + + if (socket.data.runFriendlyId) { + runDisconnected(socket.data.runFriendlyId); + } + }); + + socket.on("disconnect", (reason, description) => { + socketLogger.debug("Socket disconnected", { ...getSocketMetadata(), reason, description }); + }); + + socket.on("error", (error) => { + socketLogger.error("Socket error", { + ...getSocketMetadata(), + error: { + name: error.name, + message: error.message, + stack: error.stack, + }, + }); + }); + + socket.on("run:start", async (message) => { + const log = socketLogger.child({ + eventName: "run:start", + ...getSocketMetadata(), + ...message, + }); + + log.debug("Handling run:start"); + + try { + runConnected(message.run.friendlyId); + } catch (error) { + log.error("run:start error", { error }); + } + }); + + socket.on("run:stop", async (message) => { + const log = socketLogger.child({ + eventName: "run:stop", + ...getSocketMetadata(), + ...message, + }); + + log.debug("Handling run:stop"); + + try { + runDisconnected(message.run.friendlyId); + // Don't delete trace context here - run:stop fires after each snapshot/shutdown + // but the run may be restored on a new VM and snapshot again. Trace context is + // re-populated on dequeue, and entries are small (4 strings per run). 
+ } catch (error) { + log.error("run:stop error", { error }); + } + }); + }); + + return websocketServer; + } + + notifyRun({ run }: { run: { friendlyId: string } }) { + try { + const runSocket = this.runSockets.get(run.friendlyId); + + if (!runSocket) { + this.logger.debug("notifyRun: Run socket not found", { run }); + + this.workerClient.sendDebugLog(run.friendlyId, { + time: new Date(), + message: "run:notify socket not found on supervisor", + }); + + return; + } + + runSocket.emit("run:notify", { version: "1", run }); + this.logger.debug("run:notify sent", { run }); + + this.workerClient.sendDebugLog(run.friendlyId, { + time: new Date(), + message: "run:notify supervisor -> runner", + }); + } catch (error) { + this.logger.error("Error in notifyRun", { run, error }); + + this.workerClient.sendDebugLog(run.friendlyId, { + time: new Date(), + message: "run:notify error on supervisor", + }); + } + } + + registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + this.snapshotService?.registerTraceContext(runFriendlyId, ctx); + } + + async start() { + await this.httpServer.start(); + } + + async stop() { + this.snapshotService?.stop(); + await this.httpServer.stop(); + } +} diff --git a/apps/supervisor/tsconfig.json b/apps/supervisor/tsconfig.json new file mode 100644 index 00000000000..bd9b391e1b6 --- /dev/null +++ b/apps/supervisor/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "../../.configs/tsconfig.base.json", + "include": ["src/**/*.ts"], + "compilerOptions": { + "rootDir": "src", + "outDir": "dist" + } +} diff --git a/apps/webapp/.babelrc.json b/apps/webapp/.babelrc.json deleted file mode 100644 index b5cf683b7e0..00000000000 --- a/apps/webapp/.babelrc.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "sourceType": "unambiguous", - "presets": [ - [ - "@babel/preset-env", - { - "targets": { - "chrome": 100 - } - } - ], - "@babel/preset-typescript", - "@babel/preset-react" - ], - "plugins": [] -} \ No newline at end of file diff --git 
a/packages/database/.env b/apps/webapp/.env similarity index 100% rename from packages/database/.env rename to apps/webapp/.env diff --git a/apps/webapp/.eslintrc b/apps/webapp/.eslintrc index dc7ad7e2a3a..f292eef3cce 100644 --- a/apps/webapp/.eslintrc +++ b/apps/webapp/.eslintrc @@ -1,13 +1,31 @@ { - "extends": ["@remix-run/eslint-config", "@remix-run/eslint-config/node", "prettier"], - "rules": { - "@typescript-eslint/strict-boolean-expressions": [ - "error", - { - "allowNullableBoolean": true, - "allowNullableString": true, - "allowNullableNumber": true + "plugins": ["react-hooks", "@typescript-eslint/eslint-plugin", "import"], + "parser": "@typescript-eslint/parser", + "overrides": [ + { + "files": ["*.ts", "*.tsx"], + "rules": { + // Autofixes imports from "@trigger.dev/core" to fine grained modules + // "@trigger.dev/no-trigger-core-import": "error", + // Normalize `import type {}` and `import { type }` + "@typescript-eslint/consistent-type-imports": [ + "warn", + { + // the "type" annotation can get tangled and cause syntax errors + // during some autofixes, so easier to just turn it off + "prefer": "type-imports", + "disallowTypeAnnotations": true, + "fixStyle": "inline-type-imports" + } + ], + // no-trigger-core-import splits imports into multiple lines + // this one merges them back into a single line + // if they still import from the same module + "import/no-duplicates": ["warn", { "prefer-inline": true }], + // lots of undeclared vars, enable this rule if you want to clean them up + "turbo/no-undeclared-env-vars": "off" } - ] - } + } + ], + "ignorePatterns": ["seed.js", "seedCloud.ts", "populate.js"] } diff --git a/apps/webapp/.gitignore b/apps/webapp/.gitignore index 074c30a482d..595ab180e15 100644 --- a/apps/webapp/.gitignore +++ b/apps/webapp/.gitignore @@ -9,7 +9,8 @@ node_modules /app/styles/tailwind.css - +# Ensure the .env symlink is not removed by accident +!.env # Storybook build outputs build-storybook.log @@ -17,4 +18,6 @@ 
build-storybook.log .storybook-out storybook-static -/prisma/seed.js \ No newline at end of file +/prisma/seed.js +/prisma/populate.js +.memory-snapshots \ No newline at end of file diff --git a/apps/webapp/.storybook/main.ts b/apps/webapp/.storybook/main.ts deleted file mode 100644 index 1fea1efde0f..00000000000 --- a/apps/webapp/.storybook/main.ts +++ /dev/null @@ -1,49 +0,0 @@ -import type { StorybookConfig } from "@storybook/react-webpack5"; -import path from "path"; - -const root = path.resolve(__dirname, "../app"); - -const config: StorybookConfig = { - webpackFinal: async (config) => { - return { - ...config, - resolve: { - ...config.resolve, - alias: { - ...(config.resolve?.alias ?? {}), - "~": root, - }, - extensions: [ - ...(config.resolve?.extensions ?? []), - ...[".ts", ".tsx", ".js", ".jsx", ".mdx"], - ], - }, - }; - }, - stories: ["../app/**/stories/*.mdx", "../app/**/stories/*.stories.@(js|jsx|ts|tsx)"], - addons: [ - "@storybook/addon-links", - "@storybook/addon-essentials", - "@storybook/addon-interactions", - "storybook-addon-variants", - "storybook-addon-designs", - "@storybook/addon-docs", - { - name: "@storybook/addon-styling", - options: { - // Check out https://github.com/storybookjs/addon-styling/blob/main/docs/api.md - // For more details on this addon's options. 
- postCss: true, - }, - }, - ], - framework: { - name: "@storybook/react-webpack5", - options: {}, - }, - docs: { - autodocs: "tag", - }, - staticDirs: [path.resolve("public")], -}; -export default config; diff --git a/apps/webapp/.storybook/preview.tsx b/apps/webapp/.storybook/preview.tsx deleted file mode 100644 index a995aea7322..00000000000 --- a/apps/webapp/.storybook/preview.tsx +++ /dev/null @@ -1,47 +0,0 @@ -import type { Preview } from "@storybook/react"; -import "../app/tailwind.css"; -import { createRemixStub } from "@remix-run/testing"; -import React from "react"; -import { LocaleContextProvider } from "../app/components/primitives/LocaleProvider"; -import { OperatingSystemContextProvider } from "../app/components/primitives/OperatingSystemProvider"; - -const preview: Preview = { - parameters: { - actions: { argTypesRegex: "^on[A-Z].*" }, - controls: { - matchers: { - color: /(background|color)$/i, - date: /Date$/, - }, - }, - backgrounds: { - default: "App background", - values: [ - { - name: "App background", - value: "#0B1018", - }, - ], - }, - }, - decorators: [ - (Story) => { - const RemixStub = createRemixStub([ - { - path: "/*", - Component: Story, - }, - ]); - - return ( - - - - - - ); - }, - ], -}; - -export default preview; diff --git a/apps/webapp/CLAUDE.md b/apps/webapp/CLAUDE.md new file mode 100644 index 00000000000..b0f5e09b829 --- /dev/null +++ b/apps/webapp/CLAUDE.md @@ -0,0 +1,120 @@ +# Webapp + +Remix 2.1.0 app serving as the main API, dashboard, and orchestration engine. Uses an Express server (`server.ts`). + +## Verifying Changes + +**Never run `pnpm run build --filter webapp` to verify changes.** Building proves almost nothing about correctness. The webapp is an app, not a public package — use typecheck from the repo root: + +```bash +pnpm run typecheck --filter webapp # ~1-2 minutes +``` + +Only run typecheck after major changes (new files, significant refactors, schema changes). 
For small edits, trust the types and let CI catch issues. + +Note: Public packages (`packages/*`) use `build` instead. See the root CLAUDE.md for details. + +## Testing Dashboard Changes with Chrome DevTools MCP + +Use the `chrome-devtools` MCP server to visually verify local dashboard changes. The webapp must be running (`pnpm run dev --filter webapp` from repo root). + +### Login + +``` +1. mcp__chrome-devtools__new_page(url: "http://localhost:3030") + → Redirects to /login +2. mcp__chrome-devtools__click the "Continue with Email" link +3. mcp__chrome-devtools__fill the email field with "local@trigger.dev" +4. mcp__chrome-devtools__click "Send a magic link" + → Auto-logs in and redirects to the dashboard (no email verification needed locally) +``` + +### Navigating and Verifying + +- **take_snapshot**: Get an a11y tree of the page (text content, element UIDs for interaction). Prefer this over screenshots for understanding page structure. +- **take_screenshot**: Capture what the page looks like visually. Use to verify styling, layout, and visual changes. +- **navigate_page**: Go to specific URLs, e.g. `http://localhost:3030/orgs/references-bc08/projects/hello-world-SiWs/env/dev/runs` +- **click / fill**: Interact with elements using UIDs from `take_snapshot`. +- **evaluate_script**: Run JS in the browser console for debugging. +- **list_console_messages**: Check for console errors after navigating. + +### Tips + +- Snapshots can be very large on complex pages (200K+ chars). Use `take_screenshot` first to orient, then `take_snapshot` only when you need element UIDs to interact. +- The local seeded user email is `local@trigger.dev`. 
+- Dashboard URL pattern: `http://localhost:3030/orgs/{orgSlug}/projects/{projectSlug}/env/{envSlug}/{section}` + +## Key File Locations + +- **Trigger API**: `app/routes/api.v1.tasks.$taskId.trigger.ts` +- **Batch trigger**: `app/routes/api.v1.tasks.batch.ts` +- **OTEL endpoints**: `app/routes/otel.v1.logs.ts`, `app/routes/otel.v1.traces.ts` +- **Prisma setup**: `app/db.server.ts` +- **Run engine config**: `app/v3/runEngine.server.ts` +- **Services**: `app/v3/services/**/*.server.ts` +- **Presenters**: `app/v3/presenters/**/*.server.ts` + +## Route Convention + +Routes use Remix flat-file convention with dot-separated segments: +`api.v1.tasks.$taskId.trigger.ts` -> `/api/v1/tasks/:taskId/trigger` + +## Environment Variables + +Access via `env` export from `app/env.server.ts`. **Never use `process.env` directly.** + +For testable code, **never import env.server.ts** in test files. Pass configuration as options instead: +- `realtimeClient.server.ts` (testable service, takes config as constructor arg) +- `realtimeClientGlobal.server.ts` (creates singleton with env config) + +## Run Engine 2.0 + +The webapp integrates `@internal/run-engine` via `app/v3/runEngine.server.ts`. This is the singleton engine instance. Services in `app/v3/services/` call engine methods for all run lifecycle operations (triggering, completing, cancelling, etc.). + +The `engineVersion.server.ts` file determines V1 vs V2 for a given environment. New code should always target V2. + +## Background Workers + +Background job workers use `@trigger.dev/redis-worker`: +- `app/v3/commonWorker.server.ts` +- `app/v3/alertsWorker.server.ts` +- `app/v3/batchTriggerWorker.server.ts` + +Do NOT add new jobs using zodworker/graphile-worker (legacy). 
+ +## Real-time + +- Socket.io: `app/v3/handleSocketIo.server.ts`, `app/v3/handleWebsockets.server.ts` +- Electric SQL: Powers real-time data sync for the dashboard + +## Legacy V1 Code + +The `app/v3/` directory name is misleading - most code is actively used by V2. Only these specific files are V1-only legacy: +- `app/v3/marqs/` (old MarQS queue system) +- `app/v3/legacyRunEngineWorker.server.ts` +- `app/v3/services/triggerTaskV1.server.ts` +- `app/v3/services/cancelTaskRunV1.server.ts` +- `app/v3/authenticatedSocketConnection.server.ts` +- `app/v3/sharedSocketConnection.ts` + +Some services (e.g., `cancelTaskRun.server.ts`, `batchTriggerV3.server.ts`) branch on `RunEngineVersion` to support both V1 and V2. When editing these, only modify V2 code paths. + +## Performance: Trigger Hot Path + +The `triggerTask.server.ts` service is the **highest-throughput code path** in the system. Every API trigger call goes through it. Keep it fast: + +- **Do NOT add database queries** to `triggerTask.server.ts` or `batchTriggerV3.server.ts`. Task defaults (TTL, etc.) are resolved via `backgroundWorkerTask.findFirst()` in the queue concern (`queues.server.ts`) - one query per request, in mutually exclusive branches depending on locked/non-locked path. Piggyback on the existing query instead of adding new ones. +- **Two-stage resolution pattern**: Task metadata is resolved in two stages by design: + 1. **Trigger time** (`triggerTask.server.ts`): Only TTL is resolved from task defaults. Everything else uses whatever the caller provides. + 2. **Dequeue time** (`dequeueSystem.ts`): Full `BackgroundWorkerTask` is loaded and retry config, machine config, maxDuration, etc. are resolved against task defaults. +- If you need to add a new task-level default, **add it to the existing `select` clause** in the `backgroundWorkerTask.findFirst()` query — do NOT add a second query. If the default doesn't need to be known at trigger time, resolve it at dequeue time instead. 
+- Batch triggers (`batchTriggerV3.server.ts`) follow the same pattern — keep batch paths equally fast. + +## Prisma Query Patterns + +- **Always use `findFirst` instead of `findUnique`.** Prisma's `findUnique` has an implicit DataLoader that batches concurrent calls into a single `IN` query. This batching cannot be disabled and has active bugs even in Prisma 6.x: uppercase UUIDs returning null (#25484, confirmed 6.4.1), composite key SQL correctness issues (#22202), and 5-10x worse performance than manual DataLoader (#6573, open since 2021). `findFirst` is never batched and avoids this entire class of issues. + +## React Patterns + +- Only use `useCallback`/`useMemo` for context provider values, expensive derived data that is a dependency elsewhere, or stable refs required by a dependency array. Don't wrap ordinary event handlers or trivial computations. +- Use named constants for sentinel/placeholder values (e.g. `const UNSET_VALUE = "__unset__"`) instead of raw string literals scattered across comparisons. 
diff --git a/apps/webapp/app/api.server.ts b/apps/webapp/app/api.server.ts deleted file mode 100644 index b808913615f..00000000000 --- a/apps/webapp/app/api.server.ts +++ /dev/null @@ -1,15 +0,0 @@ -import { ApiEventLog } from "@trigger.dev/core"; -import { EventRecord } from "@trigger.dev/database"; - -export function eventRecordToApiJson(eventRecord: EventRecord): ApiEventLog { - return { - id: eventRecord.eventId, - name: eventRecord.name, - payload: eventRecord.payload as any, - context: eventRecord.context as any, - timestamp: eventRecord.timestamp, - deliverAt: eventRecord.deliverAt, - deliveredAt: eventRecord.deliveredAt, - cancelledAt: eventRecord.cancelledAt, - }; -} diff --git a/apps/webapp/app/api/versions.ts b/apps/webapp/app/api/versions.ts new file mode 100644 index 00000000000..250d214b07e --- /dev/null +++ b/apps/webapp/app/api/versions.ts @@ -0,0 +1,57 @@ +import { + API_VERSION_HEADER_NAME, + API_VERSION as CORE_API_VERSION, +} from "@trigger.dev/core/v3/serverOnly"; +import { z } from "zod"; + +export const CURRENT_API_VERSION = CORE_API_VERSION; + +export const NON_SPECIFIC_API_VERSION = "none"; + +export type API_VERSIONS = typeof CURRENT_API_VERSION | typeof NON_SPECIFIC_API_VERSION; + +export function getApiVersion(request: Request): API_VERSIONS { + const apiVersion = request.headers.get(API_VERSION_HEADER_NAME); + + if (apiVersion === CURRENT_API_VERSION) { + return apiVersion; + } + + return NON_SPECIFIC_API_VERSION; +} + +// This has been copied from the core package to allow us to use these types in the webapp +export const RunStatusUnspecifiedApiVersion = z.enum([ + /// Task is waiting for a version update because it cannot execute without additional information (task, queue, etc.). 
Replaces WAITING_FOR_DEPLOY + "PENDING_VERSION", + /// Task hasn't been deployed yet but is waiting to be executed + "WAITING_FOR_DEPLOY", + /// Task is waiting to be executed by a worker + "QUEUED", + /// Task is currently being executed by a worker + "EXECUTING", + /// Task has failed and is waiting to be retried + "REATTEMPTING", + /// Task has been paused by the system, and will be resumed by the system + "FROZEN", + /// Task has been completed successfully + "COMPLETED", + /// Task has been canceled by the user + "CANCELED", + /// Task has been completed with errors + "FAILED", + /// Task has crashed and won't be retried, most likely the worker ran out of resources, e.g. memory or storage + "CRASHED", + /// Task was interrupted during execution, mostly this happens in development environments + "INTERRUPTED", + /// Task has failed to complete, due to an error in the system + "SYSTEM_FAILURE", + /// Task has been scheduled to run at a specific time + "DELAYED", + /// Task has expired and won't be executed + "EXPIRED", + /// Task has reached its maxDuration and has been stopped + "TIMED_OUT", +]); + +export type RunStatusUnspecifiedApiVersion = z.infer; diff --git a/apps/webapp/app/assets/icons/AIMetricsIcon.tsx b/apps/webapp/app/assets/icons/AIMetricsIcon.tsx new file mode 100644 index 00000000000..038eea70b49 --- /dev/null +++ b/apps/webapp/app/assets/icons/AIMetricsIcon.tsx @@ -0,0 +1,16 @@ +export function AIMetricsIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AIPromptsIcon.tsx b/apps/webapp/app/assets/icons/AIPromptsIcon.tsx new file mode 100644 index 00000000000..dd434df9931 --- /dev/null +++ b/apps/webapp/app/assets/icons/AIPromptsIcon.tsx @@ -0,0 +1,10 @@ +export function AIPromptsIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AISparkleIcon.tsx b/apps/webapp/app/assets/icons/AISparkleIcon.tsx new file mode 100644 index 
00000000000..46f7429e77a --- /dev/null +++ b/apps/webapp/app/assets/icons/AISparkleIcon.tsx @@ -0,0 +1,31 @@ +export function AISparkleIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AbacusIcon.tsx b/apps/webapp/app/assets/icons/AbacusIcon.tsx new file mode 100644 index 00000000000..f0b7bfdf7be --- /dev/null +++ b/apps/webapp/app/assets/icons/AbacusIcon.tsx @@ -0,0 +1,71 @@ +export function AbacusIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AiProviderIcons.tsx b/apps/webapp/app/assets/icons/AiProviderIcons.tsx new file mode 100644 index 00000000000..85a01b98d63 --- /dev/null +++ b/apps/webapp/app/assets/icons/AiProviderIcons.tsx @@ -0,0 +1,177 @@ +type IconProps = { className?: string }; + +export function OpenAIIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function AnthropicIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function GeminiIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function LlamaIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function DeepseekIcon({ className }: IconProps) { + return ( + + + + + + + + + + + ); +} + +export function XAIIcon({ className }: IconProps) { + return ( + + + + + + + ); +} + +export function PerplexityIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function CerebrasIcon({ className }: IconProps) { + return ( + + + + + + + + ); +} + +export function MistralIcon({ className }: IconProps) { + return ( + + + + + + + + + + + + + ); +} + +export function AzureIcon({ className }: IconProps) { + return ( + + + + ); +} + diff --git a/apps/webapp/app/assets/icons/AnimatedHourglassIcon.tsx b/apps/webapp/app/assets/icons/AnimatedHourglassIcon.tsx new file mode 100644 index 00000000000..3c94426fa03 --- /dev/null +++ b/apps/webapp/app/assets/icons/AnimatedHourglassIcon.tsx @@ -0,0 
+1,27 @@ +import { useAnimate } from "framer-motion"; +import { HourglassIcon } from "lucide-react"; +import { useEffect } from "react"; + +export function AnimatedHourglassIcon({ + className, + delay, +}: { + className?: string; + delay?: number; +}) { + const [scope, animate] = useAnimate(); + + useEffect(() => { + animate( + [ + [scope.current, { rotate: 0 }, { duration: 0.7 }], + [scope.current, { rotate: 180 }, { duration: 0.3 }], + [scope.current, { rotate: 180 }, { duration: 0.7 }], + [scope.current, { rotate: 360 }, { duration: 0.3 }], + ], + { repeat: Infinity, delay } + ); + }, []); + + return ; +} diff --git a/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx b/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx new file mode 100644 index 00000000000..3e647284cce --- /dev/null +++ b/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx @@ -0,0 +1,12 @@ +export function AnthropicLogoIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ArchiveIcon.tsx b/apps/webapp/app/assets/icons/ArchiveIcon.tsx new file mode 100644 index 00000000000..1d910ba750e --- /dev/null +++ b/apps/webapp/app/assets/icons/ArchiveIcon.tsx @@ -0,0 +1,44 @@ +export function ArchiveIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + ); +} + +export function UnarchiveIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ArrowTopRightBottomLeftIcon.tsx b/apps/webapp/app/assets/icons/ArrowTopRightBottomLeftIcon.tsx new file mode 100644 index 00000000000..c49aa8cb0c2 --- /dev/null +++ b/apps/webapp/app/assets/icons/ArrowTopRightBottomLeftIcon.tsx @@ -0,0 +1,22 @@ +export function ArrowTopRightBottomLeftIcon({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AttemptIcon.tsx b/apps/webapp/app/assets/icons/AttemptIcon.tsx new file mode 100644 index 
00000000000..fc176ea201c --- /dev/null +++ b/apps/webapp/app/assets/icons/AttemptIcon.tsx @@ -0,0 +1,19 @@ +export function AttemptIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/BunLogoIcon.tsx b/apps/webapp/app/assets/icons/BunLogoIcon.tsx new file mode 100644 index 00000000000..b7357189f7c --- /dev/null +++ b/apps/webapp/app/assets/icons/BunLogoIcon.tsx @@ -0,0 +1,94 @@ +export function BunLogoIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ChevronExtraSmallDown.tsx b/apps/webapp/app/assets/icons/ChevronExtraSmallDown.tsx new file mode 100644 index 00000000000..134cbe4dfda --- /dev/null +++ b/apps/webapp/app/assets/icons/ChevronExtraSmallDown.tsx @@ -0,0 +1,13 @@ +export function ChevronExtraSmallDown({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ChevronExtraSmallUp.tsx b/apps/webapp/app/assets/icons/ChevronExtraSmallUp.tsx new file mode 100644 index 00000000000..710eeccdf20 --- /dev/null +++ b/apps/webapp/app/assets/icons/ChevronExtraSmallUp.tsx @@ -0,0 +1,13 @@ +export function ChevronExtraSmallUp({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ClockRotateLeftIcon.tsx b/apps/webapp/app/assets/icons/ClockRotateLeftIcon.tsx new file mode 100644 index 00000000000..edef4f87b75 --- /dev/null +++ b/apps/webapp/app/assets/icons/ClockRotateLeftIcon.tsx @@ -0,0 +1,15 @@ +export function ClockRotateLeftIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/CloudProviderIcon.tsx b/apps/webapp/app/assets/icons/CloudProviderIcon.tsx new file mode 100644 index 00000000000..6c162528247 --- /dev/null +++ b/apps/webapp/app/assets/icons/CloudProviderIcon.tsx @@ -0,0 +1,76 @@ +export function CloudProviderIcon({ + provider, + 
className, +}: { + provider: "aws" | "digitalocean" | (string & {}); + className?: string; +}) { + switch (provider) { + case "aws": + return ; + case "digitalocean": + return ; + default: + return null; + } +} + +export function AWS({ className }: { className?: string }) { + return ( + + + + + + ); +} + +export function DigitalOcean({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ConcurrencyIcon.tsx b/apps/webapp/app/assets/icons/ConcurrencyIcon.tsx new file mode 100644 index 00000000000..710ba4e6fa9 --- /dev/null +++ b/apps/webapp/app/assets/icons/ConcurrencyIcon.tsx @@ -0,0 +1,13 @@ +export function ConcurrencyIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ConnectionIcons.tsx b/apps/webapp/app/assets/icons/ConnectionIcons.tsx new file mode 100644 index 00000000000..beb0e9bab63 --- /dev/null +++ b/apps/webapp/app/assets/icons/ConnectionIcons.tsx @@ -0,0 +1,73 @@ +export function ConnectedIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} + +export function DisconnectedIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} + +export function CheckingConnectionIcon({ className }: { className?: string }) { + return ( + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/DropdownIcon.tsx b/apps/webapp/app/assets/icons/DropdownIcon.tsx new file mode 100644 index 00000000000..4a869ec8f62 --- /dev/null +++ b/apps/webapp/app/assets/icons/DropdownIcon.tsx @@ -0,0 +1,20 @@ +export function DropdownIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/EndpointIcon.tsx b/apps/webapp/app/assets/icons/EndpointIcon.tsx new file mode 100644 index 00000000000..d491e25a8a4 --- /dev/null +++ b/apps/webapp/app/assets/icons/EndpointIcon.tsx @@ -0,0 +1,36 @@ +export function EndpointIcon({ className }: { className?: 
string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/EnvironmentIcons.tsx b/apps/webapp/app/assets/icons/EnvironmentIcons.tsx new file mode 100644 index 00000000000..bc74ab10bcf --- /dev/null +++ b/apps/webapp/app/assets/icons/EnvironmentIcons.tsx @@ -0,0 +1,178 @@ +export function DevEnvironmentIcon({ className }: { className?: string }) { + return ( + + + + + + + + ); +} + +export function DevEnvironmentIconSmall({ className }: { className?: string }) { + return ( + + + + + + + + ); +} + +export function ProdEnvironmentIcon({ className }: { className?: string }) { + return ( + + + + + ); +} + +export function ProdEnvironmentIconSmall({ className }: { className?: string }) { + return ( + + + + + ); +} + +export function DeployedEnvironmentIcon({ className }: { className?: string }) { + return ( + + + + + ); +} + +export function DeployedEnvironmentIconSmall({ className }: { className?: string }) { + return ( + + + + + ); +} + +export function PreviewEnvironmentIconSmall({ className }: { className?: string }) { + return ; +} + +export function BranchEnvironmentIconSmall({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ExitIcon.tsx b/apps/webapp/app/assets/icons/ExitIcon.tsx new file mode 100644 index 00000000000..29d52609cdd --- /dev/null +++ b/apps/webapp/app/assets/icons/ExitIcon.tsx @@ -0,0 +1,14 @@ +export function ExitIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/FunctionIcon.tsx b/apps/webapp/app/assets/icons/FunctionIcon.tsx new file mode 100644 index 00000000000..6016322428e --- /dev/null +++ b/apps/webapp/app/assets/icons/FunctionIcon.tsx @@ -0,0 +1,21 @@ +export function FunctionIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/KeyboardDownIcon.tsx b/apps/webapp/app/assets/icons/KeyboardDownIcon.tsx new file mode 100644 index 
00000000000..1ef015d900a --- /dev/null +++ b/apps/webapp/app/assets/icons/KeyboardDownIcon.tsx @@ -0,0 +1,17 @@ +export function KeyboardDownIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/KeyboardEnterIcon.tsx b/apps/webapp/app/assets/icons/KeyboardEnterIcon.tsx new file mode 100644 index 00000000000..b6341912724 --- /dev/null +++ b/apps/webapp/app/assets/icons/KeyboardEnterIcon.tsx @@ -0,0 +1,12 @@ +export function KeyboardEnterIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/KeyboardLeftIcon.tsx b/apps/webapp/app/assets/icons/KeyboardLeftIcon.tsx new file mode 100644 index 00000000000..6b6999e6830 --- /dev/null +++ b/apps/webapp/app/assets/icons/KeyboardLeftIcon.tsx @@ -0,0 +1,17 @@ +export function KeyboardLeftIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/KeyboardRightIcon.tsx b/apps/webapp/app/assets/icons/KeyboardRightIcon.tsx new file mode 100644 index 00000000000..879e7e183ca --- /dev/null +++ b/apps/webapp/app/assets/icons/KeyboardRightIcon.tsx @@ -0,0 +1,17 @@ +export function KeyboardRightIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/KeyboardUpIcon.tsx b/apps/webapp/app/assets/icons/KeyboardUpIcon.tsx new file mode 100644 index 00000000000..d87f26488d8 --- /dev/null +++ b/apps/webapp/app/assets/icons/KeyboardUpIcon.tsx @@ -0,0 +1,17 @@ +export function KeyboardUpIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/KeyboardWindowsIcon.tsx b/apps/webapp/app/assets/icons/KeyboardWindowsIcon.tsx new file mode 100644 index 00000000000..859a633a305 --- /dev/null +++ b/apps/webapp/app/assets/icons/KeyboardWindowsIcon.tsx @@ -0,0 +1,17 @@ +export function KeyboardWindowsIcon({ className }: { className?: string }) { + return ( + + + + + + + ); 
+} diff --git a/apps/webapp/app/assets/icons/ListBulletIcon.tsx b/apps/webapp/app/assets/icons/ListBulletIcon.tsx new file mode 100644 index 00000000000..3ca7636a900 --- /dev/null +++ b/apps/webapp/app/assets/icons/ListBulletIcon.tsx @@ -0,0 +1,30 @@ +export function ListBulletIcon({ className }: { className?: string }) { + return ( + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ListCheckedIcon.tsx b/apps/webapp/app/assets/icons/ListCheckedIcon.tsx new file mode 100644 index 00000000000..29cb828f5dd --- /dev/null +++ b/apps/webapp/app/assets/icons/ListCheckedIcon.tsx @@ -0,0 +1,48 @@ +export function ListCheckedIcon({ className }: { className?: string }) { + return ( + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/LogsIcon.tsx b/apps/webapp/app/assets/icons/LogsIcon.tsx new file mode 100644 index 00000000000..3178da237e7 --- /dev/null +++ b/apps/webapp/app/assets/icons/LogsIcon.tsx @@ -0,0 +1,66 @@ +export function LogsIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/MachineIcon.tsx b/apps/webapp/app/assets/icons/MachineIcon.tsx new file mode 100644 index 00000000000..f07e7467b0d --- /dev/null +++ b/apps/webapp/app/assets/icons/MachineIcon.tsx @@ -0,0 +1,221 @@ +import { cn } from "~/utils/cn"; + +export function MachineIcon({ preset, className }: { preset?: string; className?: string }) { + if (!preset) { + return ; + } + + switch (preset) { + case "no-machine": + return ; + case "micro": + return ; + case "small-1x": + return ; + case "small-2x": + return ; + case "medium-1x": + return ; + case "medium-2x": + return ; + case "large-1x": + return ; + case "large-2x": + return ; + default: + return ; + } +} + +export function MachineDefaultIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconNoMachine({ className }: { className?: string }) { + return ( + + + + + + + ); +} + +function 
MachineIconMicro({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconSmall1x({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconSmall2x({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconMedium1x({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconMedium2x({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconLarge1x({ className }: { className?: string }) { + return ( + + + + + + ); +} + +function MachineIconLarge2x({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/MiddlewareIcon.tsx b/apps/webapp/app/assets/icons/MiddlewareIcon.tsx new file mode 100644 index 00000000000..c9802f68c05 --- /dev/null +++ b/apps/webapp/app/assets/icons/MiddlewareIcon.tsx @@ -0,0 +1,21 @@ +export function MiddlewareIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/MoveToBottomIcon.tsx b/apps/webapp/app/assets/icons/MoveToBottomIcon.tsx new file mode 100644 index 00000000000..997550e9265 --- /dev/null +++ b/apps/webapp/app/assets/icons/MoveToBottomIcon.tsx @@ -0,0 +1,27 @@ +export function MoveToBottomIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/MoveToTopIcon.tsx b/apps/webapp/app/assets/icons/MoveToTopIcon.tsx new file mode 100644 index 00000000000..46938fd391a --- /dev/null +++ b/apps/webapp/app/assets/icons/MoveToTopIcon.tsx @@ -0,0 +1,34 @@ +export function MoveToTopIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/MoveUpIcon.tsx b/apps/webapp/app/assets/icons/MoveUpIcon.tsx new file mode 100644 index 00000000000..6e5d8a84ba9 --- /dev/null +++ b/apps/webapp/app/assets/icons/MoveUpIcon.tsx @@ -0,0 +1,41 @@ 
+export function MoveUpIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/NodejsLogoIcon.tsx b/apps/webapp/app/assets/icons/NodejsLogoIcon.tsx new file mode 100644 index 00000000000..234dd079e1c --- /dev/null +++ b/apps/webapp/app/assets/icons/NodejsLogoIcon.tsx @@ -0,0 +1,15 @@ +export function NodejsLogoIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/PauseIcon.tsx b/apps/webapp/app/assets/icons/PauseIcon.tsx new file mode 100644 index 00000000000..9da4b7f885b --- /dev/null +++ b/apps/webapp/app/assets/icons/PauseIcon.tsx @@ -0,0 +1,19 @@ +export function PauseIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/PromoteIcon.tsx b/apps/webapp/app/assets/icons/PromoteIcon.tsx new file mode 100644 index 00000000000..be703888772 --- /dev/null +++ b/apps/webapp/app/assets/icons/PromoteIcon.tsx @@ -0,0 +1,24 @@ +export function PromoteIcon({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/PythonLogoIcon.tsx b/apps/webapp/app/assets/icons/PythonLogoIcon.tsx new file mode 100644 index 00000000000..e0fbc6fc0ec --- /dev/null +++ b/apps/webapp/app/assets/icons/PythonLogoIcon.tsx @@ -0,0 +1,21 @@ +export function PythonLogoIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/RegionIcons.tsx b/apps/webapp/app/assets/icons/RegionIcons.tsx new file mode 100644 index 00000000000..098d5bc98ce --- /dev/null +++ b/apps/webapp/app/assets/icons/RegionIcons.tsx @@ -0,0 +1,106 @@ +export function FlagIcon({ + region, + className, +}: { + region: "usa" | "europe" | (string & {}); + className?: string; +}) { + switch (region) { + case "usa": + return ; + case "europe": + return ; + default: + return null; + } +} + +export function 
FlagUSA({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + + + + + ); +} + +export function FlagEurope({ className }: { className?: string }) { + return ( + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/RunFunctionIcon.tsx b/apps/webapp/app/assets/icons/RunFunctionIcon.tsx new file mode 100644 index 00000000000..d2866c234db --- /dev/null +++ b/apps/webapp/app/assets/icons/RunFunctionIcon.tsx @@ -0,0 +1,21 @@ +export function RunFunctionIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/RunsIcon.tsx b/apps/webapp/app/assets/icons/RunsIcon.tsx new file mode 100644 index 00000000000..a481a041ab6 --- /dev/null +++ b/apps/webapp/app/assets/icons/RunsIcon.tsx @@ -0,0 +1,56 @@ +export function RunsIcon({ className }: { className?: string }) { + return ( + + + + + ); +} + +export function RunsIconSmall({ className }: { className?: string }) { + return ( + + + + + ); +} + +export function RunsIconExtraSmall({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ShowParentIcon.tsx b/apps/webapp/app/assets/icons/ShowParentIcon.tsx new file mode 100644 index 00000000000..fe9a5540568 --- /dev/null +++ b/apps/webapp/app/assets/icons/ShowParentIcon.tsx @@ -0,0 +1,25 @@ +export function ShowParentIcon({ className }: { className?: string }) { + return ( + + + + ); +} + +export function ShowParentIconSelected({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/SideMenuRightClosed.tsx b/apps/webapp/app/assets/icons/SideMenuRightClosed.tsx new file mode 100644 index 00000000000..b120300c0c0 --- /dev/null +++ b/apps/webapp/app/assets/icons/SideMenuRightClosed.tsx @@ -0,0 +1,15 @@ +export function SideMenuRightClosedIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/SlackMonoIcon.tsx 
b/apps/webapp/app/assets/icons/SlackMonoIcon.tsx new file mode 100644 index 00000000000..666393a229d --- /dev/null +++ b/apps/webapp/app/assets/icons/SlackMonoIcon.tsx @@ -0,0 +1,10 @@ +export function SlackMonoIcon({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/SnakedArrowIcon.tsx b/apps/webapp/app/assets/icons/SnakedArrowIcon.tsx new file mode 100644 index 00000000000..0766cce1b46 --- /dev/null +++ b/apps/webapp/app/assets/icons/SnakedArrowIcon.tsx @@ -0,0 +1,20 @@ +export function SnakedArrowIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/SparkleListIcon.tsx b/apps/webapp/app/assets/icons/SparkleListIcon.tsx new file mode 100644 index 00000000000..264fc227c84 --- /dev/null +++ b/apps/webapp/app/assets/icons/SparkleListIcon.tsx @@ -0,0 +1,14 @@ +export function SparkleListIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/StatusIcon.tsx b/apps/webapp/app/assets/icons/StatusIcon.tsx new file mode 100644 index 00000000000..9499b50d575 --- /dev/null +++ b/apps/webapp/app/assets/icons/StatusIcon.tsx @@ -0,0 +1,9 @@ +import { cn } from "~/utils/cn"; + +export function StatusIcon({ className }: { className?: string }) { + return ( +
+
+
+ ); +} diff --git a/apps/webapp/app/assets/icons/StreamsIcon.tsx b/apps/webapp/app/assets/icons/StreamsIcon.tsx new file mode 100644 index 00000000000..73cc480f4d4 --- /dev/null +++ b/apps/webapp/app/assets/icons/StreamsIcon.tsx @@ -0,0 +1,10 @@ +export function StreamsIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} + diff --git a/apps/webapp/app/assets/icons/TaskCachedIcon.tsx b/apps/webapp/app/assets/icons/TaskCachedIcon.tsx new file mode 100644 index 00000000000..650f9be396a --- /dev/null +++ b/apps/webapp/app/assets/icons/TaskCachedIcon.tsx @@ -0,0 +1,49 @@ +export function TaskCachedIcon({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/TaskIcon.tsx b/apps/webapp/app/assets/icons/TaskIcon.tsx new file mode 100644 index 00000000000..9c31a0957f7 --- /dev/null +++ b/apps/webapp/app/assets/icons/TaskIcon.tsx @@ -0,0 +1,25 @@ +export function TaskIcon({ className }: { className?: string }) { + return ( + + + + ); +} + +export function TaskIconSmall({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/TextInlineIcon.tsx b/apps/webapp/app/assets/icons/TextInlineIcon.tsx new file mode 100644 index 00000000000..538d9768d03 --- /dev/null +++ b/apps/webapp/app/assets/icons/TextInlineIcon.tsx @@ -0,0 +1,41 @@ +export function TextInlineIcon({ className }: { className?: string }) { + return ( + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/TextWrapIcon.tsx b/apps/webapp/app/assets/icons/TextWrapIcon.tsx new file mode 100644 index 00000000000..ac37867e829 --- /dev/null +++ b/apps/webapp/app/assets/icons/TextWrapIcon.tsx @@ -0,0 +1,34 @@ +export function TextWrapIcon({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/TimedOutIcon.tsx b/apps/webapp/app/assets/icons/TimedOutIcon.tsx new file mode 100644 index 
00000000000..3ad34e698c8 --- /dev/null +++ b/apps/webapp/app/assets/icons/TimedOutIcon.tsx @@ -0,0 +1,19 @@ +export function TimedOutIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/ToggleArrowIcon.tsx b/apps/webapp/app/assets/icons/ToggleArrowIcon.tsx new file mode 100644 index 00000000000..7bcb261c4dd --- /dev/null +++ b/apps/webapp/app/assets/icons/ToggleArrowIcon.tsx @@ -0,0 +1,10 @@ +export function ToggleArrowIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/TraceIcon.tsx b/apps/webapp/app/assets/icons/TraceIcon.tsx new file mode 100644 index 00000000000..20eb1078483 --- /dev/null +++ b/apps/webapp/app/assets/icons/TraceIcon.tsx @@ -0,0 +1,9 @@ +export function TraceIcon({ className }: { className?: string }) { + return ( + + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/TriggerIcon.tsx b/apps/webapp/app/assets/icons/TriggerIcon.tsx new file mode 100644 index 00000000000..da73b842911 --- /dev/null +++ b/apps/webapp/app/assets/icons/TriggerIcon.tsx @@ -0,0 +1,5 @@ +import { BoltIcon } from "@heroicons/react/20/solid"; + +export function TriggerIcon({ className }: { className?: string }) { + return ; +} diff --git a/apps/webapp/app/assets/icons/WaitpointTokenIcon.tsx b/apps/webapp/app/assets/icons/WaitpointTokenIcon.tsx new file mode 100644 index 00000000000..23269fb8f02 --- /dev/null +++ b/apps/webapp/app/assets/icons/WaitpointTokenIcon.tsx @@ -0,0 +1,12 @@ +export function WaitpointTokenIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/WarmStartIcon.tsx b/apps/webapp/app/assets/icons/WarmStartIcon.tsx new file mode 100644 index 00000000000..211b27a98f2 --- /dev/null +++ b/apps/webapp/app/assets/icons/WarmStartIcon.tsx @@ -0,0 +1,26 @@ +import { FireIcon } from "@heroicons/react/20/solid"; +import { cn } from "~/utils/cn"; + +function ColdStartIcon({ className }: 
{ className?: string }) { + return ( + + + + ); +} + +export function WarmStartIcon({ + isWarmStart, + className, +}: { + isWarmStart: boolean; + className?: string; +}) { + if (isWarmStart) { + return ; + } + return ; +} diff --git a/apps/webapp/app/assets/icons/v3.svg b/apps/webapp/app/assets/icons/v3.svg new file mode 100644 index 00000000000..d9aa6523ce8 --- /dev/null +++ b/apps/webapp/app/assets/icons/v3.svg @@ -0,0 +1,4 @@ + + + + diff --git a/apps/webapp/app/assets/images/blurred-dashboard-background-menu-bottom.jpg b/apps/webapp/app/assets/images/blurred-dashboard-background-menu-bottom.jpg new file mode 100644 index 00000000000..2a993f82127 Binary files /dev/null and b/apps/webapp/app/assets/images/blurred-dashboard-background-menu-bottom.jpg differ diff --git a/apps/webapp/app/assets/images/blurred-dashboard-background-menu-top.jpg b/apps/webapp/app/assets/images/blurred-dashboard-background-menu-top.jpg new file mode 100644 index 00000000000..8aca8563cdc Binary files /dev/null and b/apps/webapp/app/assets/images/blurred-dashboard-background-menu-top.jpg differ diff --git a/apps/webapp/app/assets/images/blurred-dashboard-background-table.jpg b/apps/webapp/app/assets/images/blurred-dashboard-background-table.jpg new file mode 100644 index 00000000000..a2ae4029fe2 Binary files /dev/null and b/apps/webapp/app/assets/images/blurred-dashboard-background-table.jpg differ diff --git a/apps/webapp/app/assets/images/cli-connected.png b/apps/webapp/app/assets/images/cli-connected.png new file mode 100644 index 00000000000..cd6b4e37fe1 Binary files /dev/null and b/apps/webapp/app/assets/images/cli-connected.png differ diff --git a/apps/webapp/app/assets/images/cli-disconnected.png b/apps/webapp/app/assets/images/cli-disconnected.png new file mode 100644 index 00000000000..dff3ecc106a Binary files /dev/null and b/apps/webapp/app/assets/images/cli-disconnected.png differ diff --git a/apps/webapp/app/assets/images/color-wheel.png 
b/apps/webapp/app/assets/images/color-wheel.png new file mode 100644 index 00000000000..af76136e82d Binary files /dev/null and b/apps/webapp/app/assets/images/color-wheel.png differ diff --git a/apps/webapp/app/assets/images/error-banner-tile@2x.png b/apps/webapp/app/assets/images/error-banner-tile@2x.png new file mode 100644 index 00000000000..2d601bfced7 Binary files /dev/null and b/apps/webapp/app/assets/images/error-banner-tile@2x.png differ diff --git a/apps/webapp/app/assets/images/gradient-background.png b/apps/webapp/app/assets/images/gradient-background.png deleted file mode 100644 index 9f40d248366..00000000000 Binary files a/apps/webapp/app/assets/images/gradient-background.png and /dev/null differ diff --git a/apps/webapp/app/assets/images/open-bulk-actions-panel.png b/apps/webapp/app/assets/images/open-bulk-actions-panel.png new file mode 100644 index 00000000000..a1b48f38646 Binary files /dev/null and b/apps/webapp/app/assets/images/open-bulk-actions-panel.png differ diff --git a/apps/webapp/app/assets/images/queues-dashboard.png b/apps/webapp/app/assets/images/queues-dashboard.png new file mode 100644 index 00000000000..321c79e6290 Binary files /dev/null and b/apps/webapp/app/assets/images/queues-dashboard.png differ diff --git a/apps/webapp/app/assets/images/select-runs-individually.png b/apps/webapp/app/assets/images/select-runs-individually.png new file mode 100644 index 00000000000..31a5d048a8a Binary files /dev/null and b/apps/webapp/app/assets/images/select-runs-individually.png differ diff --git a/apps/webapp/app/assets/images/select-runs-using-filters.png b/apps/webapp/app/assets/images/select-runs-using-filters.png new file mode 100644 index 00000000000..78ce487d0fc Binary files /dev/null and b/apps/webapp/app/assets/images/select-runs-using-filters.png differ diff --git a/apps/webapp/app/assets/images/tile-in-progress@2x.png b/apps/webapp/app/assets/images/tile-in-progress@2x.png new file mode 100644 index 00000000000..5724c3a0a2d Binary 
files /dev/null and b/apps/webapp/app/assets/images/tile-in-progress@2x.png differ diff --git a/apps/webapp/app/assets/logos/ATAndTLogo.tsx b/apps/webapp/app/assets/logos/ATAndTLogo.tsx new file mode 100644 index 00000000000..505294d3440 --- /dev/null +++ b/apps/webapp/app/assets/logos/ATAndTLogo.tsx @@ -0,0 +1,21 @@ +export function ATAndTLogo({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/AppsmithLogo.tsx b/apps/webapp/app/assets/logos/AppsmithLogo.tsx new file mode 100644 index 00000000000..4d4af738fe4 --- /dev/null +++ b/apps/webapp/app/assets/logos/AppsmithLogo.tsx @@ -0,0 +1,34 @@ +export function AppsmithLogo({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/CalComLogo.tsx b/apps/webapp/app/assets/logos/CalComLogo.tsx new file mode 100644 index 00000000000..097ddbe8149 --- /dev/null +++ b/apps/webapp/app/assets/logos/CalComLogo.tsx @@ -0,0 +1,50 @@ +export function CalComLogo({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/GoogleLogo.tsx b/apps/webapp/app/assets/logos/GoogleLogo.tsx new file mode 100644 index 00000000000..e0ff9597f07 --- /dev/null +++ b/apps/webapp/app/assets/logos/GoogleLogo.tsx @@ -0,0 +1,22 @@ +export function GoogleLogo({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/LyftLogo.tsx b/apps/webapp/app/assets/logos/LyftLogo.tsx new file mode 100644 index 00000000000..270781927af --- /dev/null +++ b/apps/webapp/app/assets/logos/LyftLogo.tsx @@ -0,0 +1,19 @@ +export function LyftLogo({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/logos/MiddayLogo.tsx b/apps/webapp/app/assets/logos/MiddayLogo.tsx new file mode 100644 index 00000000000..1f4740ecb14 --- /dev/null +++ 
b/apps/webapp/app/assets/logos/MiddayLogo.tsx @@ -0,0 +1,23 @@ +export function MiddayLogo({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/ShopifyLogo.tsx b/apps/webapp/app/assets/logos/ShopifyLogo.tsx new file mode 100644 index 00000000000..86c71de7cfa --- /dev/null +++ b/apps/webapp/app/assets/logos/ShopifyLogo.tsx @@ -0,0 +1,39 @@ +export function ShopifyLogo({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/TldrawLogo.tsx b/apps/webapp/app/assets/logos/TldrawLogo.tsx new file mode 100644 index 00000000000..a15ca8c64d0 --- /dev/null +++ b/apps/webapp/app/assets/logos/TldrawLogo.tsx @@ -0,0 +1,41 @@ +export function TldrawLogo({ className }: { className?: string }) { + return ( + + + + + + + + + + ); +} diff --git a/apps/webapp/app/assets/logos/UnkeyLogo.tsx b/apps/webapp/app/assets/logos/UnkeyLogo.tsx new file mode 100644 index 00000000000..9ef4f416675 --- /dev/null +++ b/apps/webapp/app/assets/logos/UnkeyLogo.tsx @@ -0,0 +1,17 @@ +export function UnkeyLogo({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/logos/VerizonLogo.tsx b/apps/webapp/app/assets/logos/VerizonLogo.tsx new file mode 100644 index 00000000000..908dcb4968c --- /dev/null +++ b/apps/webapp/app/assets/logos/VerizonLogo.tsx @@ -0,0 +1,33 @@ +export function VerizonLogo({ className }: { className?: string }) { + return ( + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/bootstrap.ts b/apps/webapp/app/bootstrap.ts new file mode 100644 index 00000000000..84c13c061f8 --- /dev/null +++ b/apps/webapp/app/bootstrap.ts @@ -0,0 +1,75 @@ +import { mkdir, writeFile } from "fs/promises"; +import { prisma } from "./db.server"; +import { env } from "./env.server"; +import { WorkerGroupService } from "./v3/services/worker/workerGroupService.server"; +import { dirname } from "path"; +import { tryCatch } from 
"@trigger.dev/core"; + +export async function bootstrap() { + if (env.TRIGGER_BOOTSTRAP_ENABLED !== "1") { + return; + } + + if (env.TRIGGER_BOOTSTRAP_WORKER_GROUP_NAME) { + const [error] = await tryCatch(createWorkerGroup()); + if (error) { + console.error("Failed to create worker group", { error }); + } + } +} + +async function createWorkerGroup() { + const workerGroupName = env.TRIGGER_BOOTSTRAP_WORKER_GROUP_NAME; + const tokenPath = env.TRIGGER_BOOTSTRAP_WORKER_TOKEN_PATH; + + const existingWorkerGroup = await prisma.workerInstanceGroup.findFirst({ + where: { + name: workerGroupName, + }, + }); + + if (existingWorkerGroup) { + console.warn(`[bootstrap] Worker group ${workerGroupName} already exists`); + return; + } + + const service = new WorkerGroupService(); + const { token, workerGroup } = await service.createWorkerGroup({ + name: workerGroupName, + }); + + console.log(` +========================== +Trigger.dev Bootstrap - Worker Token + +WARNING: This will only be shown once. Save it now! + +Worker group: +${workerGroup.name} + +Token: +${token.plaintext} + +If using docker compose, set: +TRIGGER_WORKER_TOKEN=${token.plaintext} + +${ + tokenPath + ? 
`Or, if using a file: +TRIGGER_WORKER_TOKEN=file://${tokenPath}` + : "" +} + +========================== + `); + + if (tokenPath) { + const dir = dirname(tokenPath); + await mkdir(dir, { recursive: true }); + await writeFile(tokenPath, token.plaintext, { + mode: 0o600, + }); + + console.log(`[bootstrap] Worker token saved to ${tokenPath}`); + } +} diff --git a/apps/webapp/app/components/ActiveBadge.tsx b/apps/webapp/app/components/ActiveBadge.tsx deleted file mode 100644 index cb7aef6a0a3..00000000000 --- a/apps/webapp/app/components/ActiveBadge.tsx +++ /dev/null @@ -1,57 +0,0 @@ -import { cn } from "~/utils/cn"; - -const variant = { - small: - "py-[0.25rem] px-1.5 text-xxs font-normal inline-flex items-center justify-center whitespace-nowrap rounded-[0.125rem]", - normal: - "py-1 px-1.5 text-xs font-normal inline-flex items-center justify-center whitespace-nowrap rounded-sm", -}; - -type ActiveBadgeProps = { - active: boolean; - className?: string; - badgeSize?: keyof typeof variant; -}; - -export function ActiveBadge({ active, className, badgeSize = "normal" }: ActiveBadgeProps) { - switch (active) { - case true: - return ( - - Active - - ); - case false: - return ( - - Disabled - - ); - } -} - -export function MissingIntegrationBadge({ - className, - badgeSize = "normal", -}: { - className?: string; - badgeSize?: keyof typeof variant; -}) { - return ( - - Missing Integration - - ); -} - -export function NewBadge({ - className, - badgeSize = "normal", -}: { - className?: string; - badgeSize?: keyof typeof variant; -}) { - return ( - New! 
- ); -} diff --git a/apps/webapp/app/components/AlphaBadge.tsx b/apps/webapp/app/components/AlphaBadge.tsx new file mode 100644 index 00000000000..0a1c4a7fc9a --- /dev/null +++ b/apps/webapp/app/components/AlphaBadge.tsx @@ -0,0 +1,61 @@ +import { cn } from "~/utils/cn"; +import { Badge } from "./primitives/Badge"; +import { SimpleTooltip } from "./primitives/Tooltip"; + +export function AlphaBadge({ + inline = false, + className, +}: { + inline?: boolean; + className?: string; +}) { + return ( + + Alpha + + } + content="This feature is in Alpha." + disableHoverableContent + /> + ); +} + +export function AlphaTitle({ children }: { children: React.ReactNode }) { + return ( + <> + {children} + + + ); +} + +export function BetaBadge({ + inline = false, + className, +}: { + inline?: boolean; + className?: string; +}) { + return ( + + Beta + + } + content="This feature is in Beta." + disableHoverableContent + /> + ); +} + +export function BetaTitle({ children }: { children: React.ReactNode }) { + return ( + <> + {children} + + + ); +} diff --git a/apps/webapp/app/components/AskAI.tsx b/apps/webapp/app/components/AskAI.tsx new file mode 100644 index 00000000000..814d4649c8f --- /dev/null +++ b/apps/webapp/app/components/AskAI.tsx @@ -0,0 +1,549 @@ +import { + ArrowPathIcon, + ArrowUpIcon, + HandThumbDownIcon, + HandThumbUpIcon, + StopIcon, +} from "@heroicons/react/20/solid"; +import { cn } from "~/utils/cn"; +import { type FeedbackComment, KapaProvider, type QA, useChat } from "@kapaai/react-sdk"; +import { useSearchParams } from "@remix-run/react"; +import DOMPurify from "dompurify"; +import { motion } from "framer-motion"; +import { marked } from "marked"; +import { useCallback, useEffect, useRef, useState } from "react"; +import { useTypedRouteLoaderData } from "remix-typedjson"; +import { AISparkleIcon } from "~/assets/icons/AISparkleIcon"; +import { SparkleListIcon } from "~/assets/icons/SparkleListIcon"; +import { useFeatures } from "~/hooks/useFeatures"; +import 
{ type loader } from "~/root"; +import { Button } from "./primitives/Buttons"; +import { Callout } from "./primitives/Callout"; +import { Dialog, DialogContent, DialogHeader, DialogTitle } from "./primitives/Dialog"; +import { Header2 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { ShortcutKey } from "./primitives/ShortcutKey"; +import { Spinner } from "./primitives/Spinner"; +import { + SimpleTooltip, + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from "./primitives/Tooltip"; +import { ClientOnly } from "remix-utils/client-only"; + +function useKapaWebsiteId() { + const routeMatch = useTypedRouteLoaderData("root"); + return routeMatch?.kapa.websiteId; +} + +export function AskAI({ isCollapsed = false }: { isCollapsed?: boolean }) { + const { isManagedCloud } = useFeatures(); + const websiteId = useKapaWebsiteId(); + + if (!isManagedCloud || !websiteId) { + return null; + } + + return ( + + + + } + > + {() => } + + ); +} + +type AskAIProviderProps = { + websiteId: string; + isCollapsed?: boolean; +}; + +function AskAIProvider({ websiteId, isCollapsed = false }: AskAIProviderProps) { + const [isOpen, setIsOpen] = useState(false); + const [initialQuery, setInitialQuery] = useState(); + const [searchParams, setSearchParams] = useSearchParams(); + + const openAskAI = useCallback((question?: string) => { + if (question) { + setInitialQuery(question); + } else { + setInitialQuery(undefined); + } + setIsOpen(true); + }, []); + + const closeAskAI = useCallback(() => { + setIsOpen(false); + setInitialQuery(undefined); + }, []); + + // Handle URL param functionality + useEffect(() => { + const aiHelp = searchParams.get("aiHelp"); + if (aiHelp) { + // Delay to avoid hCaptcha bot detection + window.setTimeout(() => openAskAI(aiHelp), 1000); + + // Clone instead of mutating in place + const next = new URLSearchParams(searchParams); + next.delete("aiHelp"); + setSearchParams(next); + } + }, [searchParams, 
openAskAI]); + + return ( + openAskAI(), + onAnswerGenerationCompleted: () => openAskAI(), + }, + }} + botProtectionMechanism="hcaptcha" + > + + + + + + + + + + Ask AI + + + + + + + + + + + ); +} + +type AskAIDialogProps = { + initialQuery?: string; + isOpen: boolean; + onOpenChange: (open: boolean) => void; + closeAskAI: () => void; +}; + +function AskAIDialog({ initialQuery, isOpen, onOpenChange, closeAskAI }: AskAIDialogProps) { + const handleOpenChange = (open: boolean) => { + if (!open) { + closeAskAI(); + } else { + onOpenChange(open); + } + }; + + return ( + + + +
+ + Ask AI +
+
+ +
+
+ ); +} + +function ChatMessages({ + conversation, + isPreparingAnswer, + isGeneratingAnswer, + onReset, + onExampleClick, + error, + addFeedback, +}: { + conversation: QA[]; + isPreparingAnswer: boolean; + isGeneratingAnswer: boolean; + onReset: () => void; + onExampleClick: (question: string) => void; + error: string | null; + addFeedback: ( + questionAnswerId: string, + reaction: "upvote" | "downvote", + comment?: FeedbackComment + ) => void; +}) { + const [feedbackGivenForQAs, setFeedbackGivenForQAs] = useState>(new Set()); + + // Reset feedback state when conversation is reset + useEffect(() => { + if (conversation.length === 0) { + setFeedbackGivenForQAs(new Set()); + } + }, [conversation.length]); + + // Check if feedback has been given for the latest QA + const latestQA = conversation[conversation.length - 1]; + const hasFeedbackForLatestQA = latestQA?.id ? feedbackGivenForQAs.has(latestQA.id) : false; + + const exampleQuestions = [ + "How do I increase my concurrency limit?", + "How do I debug errors in my task?", + "How do I deploy my task?", + ]; + + return ( +
+ {conversation.length === 0 ? ( + + + I'm trained on docs, examples, and other content. Ask me anything about Trigger.dev. + + {exampleQuestions.map((question, index) => ( + onExampleClick(question)} + variants={{ + hidden: { + opacity: 0, + x: 20, + }, + visible: { + opacity: 1, + x: 0, + transition: { + opacity: { + duration: 0.5, + ease: "linear", + }, + x: { + type: "spring", + stiffness: 300, + damping: 25, + }, + }, + }, + }} + > + + + {question} + + + ))} + + ) : ( + conversation.map((qa) => ( +
+ {qa.question} +
+
+ )) + )} + {conversation.length > 0 && + !isPreparingAnswer && + !isGeneratingAnswer && + !error && + !latestQA?.id && ( +
+ + Answer generation was stopped + + +
+ )} + {conversation.length > 0 && + !isPreparingAnswer && + !isGeneratingAnswer && + !error && + latestQA?.id && ( +
+ {hasFeedbackForLatestQA ? ( + + + Thanks for your feedback! + + + ) : ( +
+ + Was this helpful? + +
+ + +
+
+ )} + +
+ )} + {isPreparingAnswer && ( +
+ + Preparing answer… +
+ )} + {error && ( +
+ + Error generating answer: + + {error} If the problem persists after retrying, please contact support. + + +
+ +
+
+ )} +
+ ); +} + +function ChatInterface({ initialQuery }: { initialQuery?: string }) { + const [message, setMessage] = useState(""); + const [isExpanded, setIsExpanded] = useState(false); + const hasSubmittedInitialQuery = useRef(false); + const { + conversation, + submitQuery, + isGeneratingAnswer, + isPreparingAnswer, + resetConversation, + stopGeneration, + error, + addFeedback, + } = useChat(); + + useEffect(() => { + if (initialQuery && !hasSubmittedInitialQuery.current) { + hasSubmittedInitialQuery.current = true; + setIsExpanded(true); + submitQuery(initialQuery); + } + }, [initialQuery, submitQuery]); + + const handleSubmit = (e: React.FormEvent) => { + e.preventDefault(); + if (message.trim()) { + setIsExpanded(true); + submitQuery(message); + setMessage(""); + } + }; + + const handleExampleClick = (question: string) => { + setIsExpanded(true); + submitQuery(question); + }; + + const handleReset = () => { + resetConversation(); + setIsExpanded(false); + }; + + return ( + + +
+
+ setMessage(e.target.value)} + placeholder="Ask a question..." + disabled={isGeneratingAnswer} + autoFocus + className="flex-1 rounded-md border border-grid-bright bg-background-dimmed px-3 py-2 text-text-bright placeholder:text-text-dimmed focus-visible:focus-custom" + /> + {isGeneratingAnswer ? ( + stopGeneration()} + className="group relative z-10 flex size-10 min-w-10 cursor-pointer items-center justify-center" + > + + + + } + content="Stop generating" + /> + ) : isPreparingAnswer ? ( + + + + ) : ( +
+
+
+ ); +} + +function GradientSpinnerBackground({ + children, + className, + hoverEffect = false, +}: { + children?: React.ReactNode; + className?: string; + hoverEffect?: boolean; +}) { + return ( +
+
+ {children} +
+
+ ); +} diff --git a/apps/webapp/app/components/BackgroundWrapper.tsx b/apps/webapp/app/components/BackgroundWrapper.tsx new file mode 100644 index 00000000000..aaf06d56aaf --- /dev/null +++ b/apps/webapp/app/components/BackgroundWrapper.tsx @@ -0,0 +1,44 @@ +import { type ReactNode } from "react"; +import blurredDashboardBackgroundMenuTop from "~/assets/images/blurred-dashboard-background-menu-top.jpg"; +import blurredDashboardBackgroundMenuBottom from "~/assets/images/blurred-dashboard-background-menu-bottom.jpg"; +import blurredDashboardBackgroundTable from "~/assets/images/blurred-dashboard-background-table.jpg"; + +export function BackgroundWrapper({ children }: { children: ReactNode }) { + return ( +
+
+ +
+ +
+ +
{children}
+
+ ); +} diff --git a/apps/webapp/app/components/BlankStatePanels.tsx b/apps/webapp/app/components/BlankStatePanels.tsx new file mode 100644 index 00000000000..fe39f6785c5 --- /dev/null +++ b/apps/webapp/app/components/BlankStatePanels.tsx @@ -0,0 +1,737 @@ +import { + BeakerIcon, + BellAlertIcon, + BookOpenIcon, + ChatBubbleLeftRightIcon, + ClockIcon, + PlusIcon, + QuestionMarkCircleIcon, + RectangleGroupIcon, + RectangleStackIcon, + Squares2X2Icon, +} from "@heroicons/react/20/solid"; +import { useLocation } from "react-use"; +import { AIPromptsIcon } from "~/assets/icons/AIPromptsIcon"; +import { BranchEnvironmentIconSmall } from "~/assets/icons/EnvironmentIcons"; +import { WaitpointTokenIcon } from "~/assets/icons/WaitpointTokenIcon"; +import openBulkActionsPanel from "~/assets/images/open-bulk-actions-panel.png"; +import selectRunsIndividually from "~/assets/images/select-runs-individually.png"; +import selectRunsUsingFilters from "~/assets/images/select-runs-using-filters.png"; +import { useEnvironment } from "~/hooks/useEnvironment"; +import { useFeatures } from "~/hooks/useFeatures"; +import { useOrganization } from "~/hooks/useOrganizations"; +import { useProject } from "~/hooks/useProject"; +import { type MinimumEnvironment } from "~/presenters/SelectBestEnvironmentPresenter.server"; +import { NewBranchPanel } from "~/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.branches/route"; +import { GitHubSettingsPanel } from "~/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.github"; +import { + docsPath, + v3BillingPath, + v3CreateBulkActionPath, + v3EnvironmentPath, + v3NewProjectAlertPath, + v3NewSchedulePath, +} from "~/utils/pathBuilder"; +import { AskAI } from "./AskAI"; +import { CodeBlock } from "./code/CodeBlock"; +import { InlineCode } from "./code/InlineCode"; +import { environmentFullTitle, EnvironmentIcon } from "./environments/EnvironmentLabel"; +import { Feedback } from "./Feedback"; +import { 
EnvironmentSelector } from "./navigation/EnvironmentSelector"; +import { Button, LinkButton } from "./primitives/Buttons"; +import { + ClientTabs, + ClientTabsContent, + ClientTabsList, + ClientTabsTrigger, +} from "./primitives/ClientTabs"; +import { Header1 } from "./primitives/Headers"; +import { InfoPanel } from "./primitives/InfoPanel"; +import { Paragraph } from "./primitives/Paragraph"; +import { StepNumber } from "./primitives/StepNumber"; +import { TextLink } from "./primitives/TextLink"; +import { SimpleTooltip } from "./primitives/Tooltip"; +import { + InitCommandV3, + PackageManagerProvider, + TriggerDeployStep, + TriggerDevStepV3, +} from "./SetupCommands"; +import { StepContentContainer } from "./StepContentContainer"; +import { V4Badge } from "./V4Badge"; + +export function HasNoTasksDev() { + return ( + +
+
+ Get setup in 3 minutes +
+ + I'm stuck! + + } + defaultValue="help" + /> +
+
+ + + + + You'll notice a new folder in your project called{" "} + trigger. We've added a few simple example tasks + in there to help you get started. + + + + + + + + + This page will automatically refresh. + +
+
+ ); +} + +export function HasNoTasksDeployed({ environment }: { environment: MinimumEnvironment }) { + return ; +} + +export function SchedulesNoPossibleTaskPanel() { + return ( + + How to schedule tasks + + } + > + + You have no scheduled tasks in your project. Before you can schedule a task you need to + create a schedules.task. + + + ); +} + +export function SchedulesNoneAttached() { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + const location = useLocation(); + + return ( + + + Scheduled tasks will only run automatically if you connect a schedule to them, you can do + this in the dashboard or using the SDK. + +
+ + Use the dashboard + + + Use the SDK + +
+
+ ); +} + +export function BatchesNone() { + return ( + + How to trigger batches + + } + > + + You have no batches in this environment. You can trigger batches from your backend or from + inside other tasks. + + + ); +} + +export function TestHasNoTasks() { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + return ( + + Create a task + + } + > + + Before testing a task, you must first create one. Follow the instructions on the{" "} + Tasks page{" "} + to create a task, then return here to test it. + + + ); +} + +export function DeploymentsNone() { + return ; +} + +export function DeploymentsNoneDev() { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + return ( + <> +
+
+ + Deploy your tasks +
+
+ + } + content="Deploy docs" + /> + + } + content="Troubleshooting docs" + /> + +
+
+ + + + This is the Development environment. When you're ready to deploy your tasks, switch to a + different environment. + + + + + ); +} + +export function AlertsNoneDev() { + return ( +
+ + + You can get alerted when deployed runs fail. + + + We don't support alerts in the Development environment. Switch to a deployed environment + to setup alerts. + +
+ + How to setup alerts + +
+
+ +
+ ); +} + +export function AlertsNoneDeployed() { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + return ( +
+ + + You can get alerted when deployed runs fail. We currently support sending Slack, Email, + and webhooks. + + +
+ + Alerts docs + + + New alert + +
+
+
+ ); +} + +export function QueuesHasNoTasks() { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + return ( + + Create a task + + } + > + + Queues will appear here when you have created a task in this environment. Follow the + instructions on the{" "} + Tasks page{" "} + to create a task, then return here to see its queue. + + + ); +} + +export function NoWaitpointTokens() { + return ( + + Waitpoint docs + + } + > + + Waitpoint tokens pause task runs until you complete the token. They're commonly used for + approval workflows and other scenarios where you need to wait for external confirmation, + such as human-in-the-loop processes. + + + ); +} + +export function BranchesNoBranchableEnvironment() { + const { isManagedCloud } = useFeatures(); + const organization = useOrganization(); + + if (!isManagedCloud) { + return ( + + + To add branches you need to have a RuntimeEnvironment where{" "} + isBranchableEnvironment is true. We recommend creating a + dedicated one using the "PREVIEW" type. + + + ); + } + + return ( + + Upgrade + + } + > + + Preview branches in Trigger.dev create isolated environments for testing new features before + production. + + + You must be on to access preview branches. Read our{" "} + upgrade to v4 guide to learn more. + + + ); +} + +export function BranchesNoBranches({ + parentEnvironment, + limits, + canUpgrade, +}: { + parentEnvironment: { id: string }; + limits: { used: number; limit: number }; + canUpgrade: boolean; +}) { + const organization = useOrganization(); + + if (limits.used >= limits.limit) { + return ( + + Upgrade + + ) : ( + Request more} + defaultValue="help" + /> + ) + } + > + + You've reached the limit ({limits.used}/{limits.limit}) of branches for your plan. Upgrade + to get branches. 
+ + + ); + } + + return ( + + New branch + + } + parentEnvironment={parentEnvironment} + /> + } + > + + Branches are a way to test new features in isolation before merging them into the main + environment. + + + Branches are only available when using or above. Read our{" "} + v4 upgrade guide to learn more. + + + ); +} + +export function SwitcherPanel({ title = "Switch to a deployed environment" }: { title?: string }) { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + return ( +
+ + {title} + + +
+ ); +} + +export function BulkActionsNone() { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + return ( +
+
+ Create a bulk action +
+ + New bulk action + +
+
+ + + Select runs from the runs page individually. +
+ Select runs individually +
+
+
+
+ + OR + +
+
+ + + + Use the filter menu on the runs page to select just the runs you want to bulk action. + +
+ Select runs using filters +
+
+ + + Click the “Bulk actions” button in the top right of the runs page. +
+ Open the bulk action panel +
+
+
+ ); +} + +function DeploymentOnboardingSteps() { + const environment = useEnvironment(); + const organization = useOrganization(); + const project = useProject(); + + return ( + +
+
+ + + Deploy your tasks to {environmentFullTitle(environment)} + +
+
+ + } + content="Deploy docs" + /> + + } + content="Troubleshooting docs" + /> + +
+
+ + + + GitHub + + + Manual + + + GitHub Actions + + + + + + + Deploy automatically with every push. Read the{" "} + full guide. + +
+ +
+
+
+ + + + + This will deploy your tasks to the {environmentFullTitle(environment)} environment. + Read the full guide. + + + + + + + + + Read the GitHub Actions guide to + get started. + + + +
+ + + + This page will automatically refresh when your tasks are deployed. + +
+ ); +} + +export function PromptsNone() { + return ( + + Prompts docs + + } + > + + Managed prompts let you define AI prompts in code with typesafe variables, then edit and + version them from the dashboard without redeploying. + + + Add a prompt to your project using prompts.define() + : + + + + Deploy your project and your prompts will appear here with version history and a live + editor. + + + ); +} diff --git a/apps/webapp/app/components/BulkActionFilterSummary.tsx b/apps/webapp/app/components/BulkActionFilterSummary.tsx new file mode 100644 index 00000000000..a230e70b346 --- /dev/null +++ b/apps/webapp/app/components/BulkActionFilterSummary.tsx @@ -0,0 +1,280 @@ +import { z } from "zod"; +import { + filterIcon, + filterTitle, + type TaskRunListSearchFilterKey, + type TaskRunListSearchFilters, +} from "./runs/v3/RunFilters"; +import { Paragraph } from "./primitives/Paragraph"; +import simplur from "simplur"; +import { appliedSummary, dateFromString, timeFilterRenderValues } from "./runs/v3/SharedFilters"; +import { formatNumber } from "~/utils/numberFormatter"; +import { SpinnerWhite } from "./primitives/Spinner"; +import { ArrowPathIcon, CheckIcon, XCircleIcon } from "@heroicons/react/20/solid"; +import { XCircleIcon as XCircleIconOutline } from "@heroicons/react/24/outline"; +import assertNever from "assert-never"; +import { AppliedFilter } from "./primitives/AppliedFilter"; +import { runStatusTitle } from "./runs/v3/TaskRunStatus"; +import type { TaskRunStatus } from "@trigger.dev/database"; + +export const BulkActionMode = z.union([z.literal("selected"), z.literal("filter")]); +export type BulkActionMode = z.infer; +export const BulkActionAction = z.union([z.literal("cancel"), z.literal("replay")]); +export type BulkActionAction = z.infer; + +export function BulkActionFilterSummary({ + selected, + final = false, + mode, + action, + filters, +}: { + selected?: number; + final?: boolean; + mode: BulkActionMode; + action: BulkActionAction; + filters: 
TaskRunListSearchFilters; +}) { + switch (mode) { + case "selected": + return ( + + You {!final ? "have " : " "}individually selected {simplur`${selected} run[|s]`} to be{" "} + . + + ); + case "filter": { + const { label, valueLabel, rangeType } = timeFilterRenderValues({ + from: filters.from ? dateFromString(`${filters.from}`) : undefined, + to: filters.to ? dateFromString(`${filters.to}`) : undefined, + period: filters.period, + }); + + return ( +
+ + You {!final ? "have " : " "}selected{" "} + + {final ? selected : } + {" "} + runs to be using these filters: + +
+ + {Object.entries(filters).map(([key, value]) => { + if (!value && key !== "period") { + return null; + } + + const typedKey = key as TaskRunListSearchFilterKey; + + switch (typedKey) { + case "cursor": + case "direction": + case "environments": + //We need to handle time differently because we have a default + case "period": + case "from": + case "to": { + return null; + } + case "tasks": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } + case "versions": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } + case "statuses": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + runStatusTitle(v as TaskRunStatus)))} + removable={false} + /> + ); + } + case "tags": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } + case "bulkId": { + return ( + + ); + } + case "rootOnly": { + return ( + + ) : ( + + ) + } + removable={false} + /> + ); + } + case "runId": { + return ( + + ); + } + case "batchId": { + return ( + + ); + } + case "scheduleId": { + return ( + + ); + } + case "queues": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + v.replace("task/", "")))} + removable={false} + /> + ); + } + case "machines": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } + case "errorId": { + return ( + + ); + } + default: { + assertNever(typedKey); + } + } + })} +
+
+ ); + } + } +} + +function Action({ action }: { action: BulkActionAction }) { + switch (action) { + case "cancel": + return ( + + + Canceled + + ); + case "replay": + return ( + + + Replayed + + ); + } +} + +export function EstimatedCount({ count }: { count?: number }) { + if (typeof count === "number") { + return <>~{formatNumber(count)}; + } + + return ; +} diff --git a/apps/webapp/app/components/CloudProvider.tsx b/apps/webapp/app/components/CloudProvider.tsx new file mode 100644 index 00000000000..acf8cff5506 --- /dev/null +++ b/apps/webapp/app/components/CloudProvider.tsx @@ -0,0 +1,10 @@ +export function cloudProviderTitle(provider: "aws" | "digitalocean" | (string & {})) { + switch (provider) { + case "aws": + return "Amazon Web Services"; + case "digitalocean": + return "Digital Ocean"; + default: + return provider; + } +} diff --git a/apps/webapp/app/components/ComingSoon.tsx b/apps/webapp/app/components/ComingSoon.tsx deleted file mode 100644 index 283f1f4a500..00000000000 --- a/apps/webapp/app/components/ComingSoon.tsx +++ /dev/null @@ -1,34 +0,0 @@ -import { ReactNode } from "react"; -import { MainCenteredContainer } from "./layout/AppLayout"; -import { Header2 } from "./primitives/Headers"; -import { NamedIconInBox } from "./primitives/NamedIcon"; -import { Paragraph } from "./primitives/Paragraph"; - -type ComingSoonProps = { - title: string; - description: string; - icon: ReactNode; -}; - -export function ComingSoon({ title, description, icon }: ComingSoonProps) { - return ( - -
-
- {typeof icon === "string" ? ( - - ) : ( - icon - )} -
- Coming soon - {title} -
-
- - {description} - -
-
- ); -} diff --git a/apps/webapp/app/components/DefinitionTooltip.tsx b/apps/webapp/app/components/DefinitionTooltip.tsx new file mode 100644 index 00000000000..5bb3a713997 --- /dev/null +++ b/apps/webapp/app/components/DefinitionTooltip.tsx @@ -0,0 +1,33 @@ +import { Header3 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "./primitives/Tooltip"; + +export function DefinitionTip({ + content, + children, + title, +}: { + content: React.ReactNode; + children: React.ReactNode; + title: React.ReactNode; +}) { + return ( + + + + + {children} + + + + {title} + {typeof content === "string" ? ( + {content} + ) : ( +
{content}
+ )} +
+
+
+ ); +} diff --git a/apps/webapp/app/components/DevPresence.tsx b/apps/webapp/app/components/DevPresence.tsx new file mode 100644 index 00000000000..7a99dab37a5 --- /dev/null +++ b/apps/webapp/app/components/DevPresence.tsx @@ -0,0 +1,222 @@ +import { AnimatePresence, motion } from "framer-motion"; +import { createContext, type ReactNode, useContext, useEffect, useMemo, useState } from "react"; +import { + CheckingConnectionIcon, + ConnectedIcon, + DisconnectedIcon, +} from "~/assets/icons/ConnectionIcons"; +import { useEnvironment } from "~/hooks/useEnvironment"; +import { useEventSource } from "~/hooks/useEventSource"; +import { useOrganization } from "~/hooks/useOrganizations"; +import { useProject } from "~/hooks/useProject"; +import { docsPath } from "~/utils/pathBuilder"; +import connectedImage from "../assets/images/cli-connected.png"; +import disconnectedImage from "../assets/images/cli-disconnected.png"; +import { InlineCode } from "./code/InlineCode"; +import { Button } from "./primitives/Buttons"; +import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "./primitives/Dialog"; +import { Paragraph } from "./primitives/Paragraph"; +import { TextLink } from "./primitives/TextLink"; +import { PackageManagerProvider, TriggerDevStepV3 } from "./SetupCommands"; + +// Define Context types +type DevPresenceContextType = { + isConnected: boolean | undefined; +}; + +// Create Context with default values +const DevPresenceContext = createContext({ + isConnected: undefined, +}); + +// Provider component with enabled prop +interface DevPresenceProviderProps { + children: ReactNode; + enabled?: boolean; +} + +export function DevPresenceProvider({ children, enabled = true }: DevPresenceProviderProps) { + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + // Only subscribe to event source if enabled is true + const streamedEvents = useEventSource( + 
`/resources/orgs/${organization.slug}/projects/${project.slug}/dev/presence`, + { + event: "presence", + disabled: !enabled, + } + ); + + const [isConnected, setIsConnected] = useState(undefined); + + useEffect(() => { + // If disabled or no events + if (!enabled || streamedEvents === null) { + setIsConnected(undefined); + return; + } + + try { + const data = JSON.parse(streamedEvents) as any; + if ("isConnected" in data && data.isConnected) { + try { + setIsConnected(true); + } catch (error) { + console.log("DevPresence: Failed to parse lastSeen timestamp", { error }); + setIsConnected(false); + } + } else { + setIsConnected(false); + } + } catch (error) { + console.log("DevPresence: Failed to parse presence message", { error }); + setIsConnected(false); + } + }, [streamedEvents, enabled]); + + // Calculate isConnected and memoize the context value + const contextValue = useMemo(() => { + return { isConnected }; + }, [isConnected, enabled]); + + return {children}; +} + +// Custom hook to use the context +export function useDevPresence() { + const context = useContext(DevPresenceContext); + if (context === undefined) { + throw new Error("useDevPresence must be used within a DevPresenceProvider"); + } + return context; +} + +/** + * We need this for the legacy v1 engine, where we show the banner after a delay if there are no events. 
+ */ +export function useCrossEngineIsConnected({ + isCompleted, + logCount, +}: { + isCompleted: boolean; + logCount: number; +}) { + const project = useProject(); + const environment = useEnvironment(); + const { isConnected } = useDevPresence(); + const [crossEngineIsConnected, setCrossEngineIsConnected] = useState( + undefined + ); + + useEffect(() => { + if (project.engine === "V2") { + setCrossEngineIsConnected(isConnected); + return; + } + + if (project.engine === "V1") { + if (isCompleted) { + setCrossEngineIsConnected(true); + return; + } + + if (logCount <= 1) { + const timer = setTimeout(() => { + setCrossEngineIsConnected(false); + }, 5000); + return () => clearTimeout(timer); + } else { + setCrossEngineIsConnected(true); + } + } + }, [environment.type, project.engine, logCount, isConnected, isCompleted]); + + return crossEngineIsConnected; +} + +export function ConnectionIcon({ isConnected }: { isConnected: boolean | undefined }) { + if (isConnected === undefined) { + return ; + } + return isConnected ? ( + + ) : ( + + ); +} + +export function DevPresencePanel({ isConnected }: { isConnected: boolean | undefined }) { + return ( + + + {isConnected === undefined + ? "Checking connection..." + : isConnected + ? "Your dev server is connected" + : "Your dev server is not connected"} + +
+
+ {isConnected + + {isConnected === undefined + ? "Checking connection..." + : isConnected + ? "Your local dev server is connected to Trigger.dev" + : "Your local dev server is not connected to Trigger.dev"} + +
+ {isConnected ? null : ( +
+ + + + + Run this CLI dev command to connect to + the Trigger.dev servers to start developing locally. Keep it running while you develop + to stay connected. Learn more in the{" "} + CLI docs. + +
+ )} +
+
+ ); +} + +export function DevDisconnectedBanner({ isConnected }: { isConnected: boolean | undefined }) { + return ( + + + {isConnected === false && ( + + + + + + )} + + + + ); +} diff --git a/apps/webapp/app/components/ErrorDisplay.tsx b/apps/webapp/app/components/ErrorDisplay.tsx index c40360a2740..5787a2edbac 100644 --- a/apps/webapp/app/components/ErrorDisplay.tsx +++ b/apps/webapp/app/components/ErrorDisplay.tsx @@ -1,7 +1,11 @@ +import { HomeIcon } from "@heroicons/react/20/solid"; import { isRouteErrorResponse, useRouteError } from "@remix-run/react"; -import { LinkButton } from "./primitives/Buttons"; -import { Header1, Header3 } from "./primitives/Headers"; import { friendlyErrorDisplay } from "~/utils/httpErrors"; +import { LinkButton } from "./primitives/Buttons"; +import { Header1 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { TriggerRotatingLogo } from "./TriggerRotatingLogo"; +import { type ReactNode } from "react"; type ErrorDisplayOptions = { button?: { @@ -34,17 +38,25 @@ export function RouteErrorDisplay(options?: ErrorDisplayOptions) { type DisplayOptionsProps = { title: string; - message?: string; + message?: ReactNode; } & ErrorDisplayOptions; export function ErrorDisplay({ title, message, button }: DisplayOptionsProps) { return ( -
- {title} - {message && {message}} - - {button ? button.title : "Home"} - +
+
+ {title} + {message && {message}} + + {button ? button.title : "Go to homepage"} + +
+
); } diff --git a/apps/webapp/app/components/Feedback.tsx b/apps/webapp/app/components/Feedback.tsx index 8ebf2af9d7f..ecfd4e88c9a 100644 --- a/apps/webapp/app/components/Feedback.tsx +++ b/apps/webapp/app/components/Feedback.tsx @@ -1,87 +1,153 @@ import { conform, useForm } from "@conform-to/react"; import { parse } from "@conform-to/zod"; -import { Form, useActionData, useLocation, useNavigation } from "@remix-run/react"; -import { ReactNode, useState } from "react"; -import { FeedbackType, feedbackTypeLabel, schema } from "~/routes/resources.feedback"; +import { InformationCircleIcon, ArrowUpCircleIcon } from "@heroicons/react/20/solid"; +import { EnvelopeIcon } from "@heroicons/react/24/solid"; +import { Form, useActionData, useLocation, useNavigation, useSearchParams } from "@remix-run/react"; +import { type ReactNode, useEffect, useState } from "react"; +import { type FeedbackType, feedbackTypeLabel, schema } from "~/routes/resources.feedback"; import { Button } from "./primitives/Buttons"; +import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "./primitives/Dialog"; import { Fieldset } from "./primitives/Fieldset"; import { FormButtons } from "./primitives/FormButtons"; import { FormError } from "./primitives/FormError"; +import { Icon } from "./primitives/Icon"; +import { InfoPanel } from "./primitives/InfoPanel"; import { InputGroup } from "./primitives/InputGroup"; import { Label } from "./primitives/Label"; import { Paragraph } from "./primitives/Paragraph"; -import { - Select, - SelectContent, - SelectGroup, - SelectItem, - SelectTrigger, - SelectValue, -} from "./primitives/Select"; -import { Sheet, SheetBody, SheetContent, SheetHeader, SheetTrigger } from "./primitives/Sheet"; +import { Select, SelectItem } from "./primitives/Select"; import { TextArea } from "./primitives/TextArea"; -import { DiscordIcon } from "@trigger.dev/companyicons"; -import { ChevronRightIcon } from "@heroicons/react/24/solid"; +import { TextLink } from 
"./primitives/TextLink"; +import { DialogClose } from "@radix-ui/react-dialog"; type FeedbackProps = { button: ReactNode; defaultValue?: FeedbackType; + onOpenChange?: (open: boolean) => void; }; -export function Feedback({ button, defaultValue = "bug" }: FeedbackProps) { +export function Feedback({ button, defaultValue = "bug", onOpenChange }: FeedbackProps) { const [open, setOpen] = useState(false); + const [searchParams, setSearchParams] = useSearchParams(); const location = useLocation(); const lastSubmission = useActionData(); const navigation = useNavigation(); + const [type, setType] = useState(defaultValue); const [form, { path, feedbackType, message }] = useForm({ id: "accept-invite", - // TODO: type this lastSubmission: lastSubmission as any, onValidate({ formData }) { return parse(formData, { schema }); }, + shouldRevalidate: "onInput", }); - if ( - open && - navigation.formAction === "/resources/feedback" && - form.error === undefined && - form.errors.length === 0 - ) { - setOpen(false); - } + useEffect(() => { + if ( + navigation.formAction === "/resources/feedback" && + navigation.state === "loading" && + form.error === undefined && + form.errors.length === 0 + ) { + setOpen(false); + } + }, [navigation, form]); + + // Handle URL param functionality + useEffect(() => { + const open = searchParams.get("feedbackPanel"); + if (open) { + setType(open as FeedbackType); + setOpen(true); + // Clone instead of mutating in place + const next = new URLSearchParams(searchParams); + next.delete("feedbackPanel"); + setSearchParams(next); + } + }, [searchParams]); + + const handleOpenChange = (value: boolean) => { + setOpen(value); + onOpenChange?.(value); + }; return ( - - {button} - - Help & feedback - - - - Or use this form to ask for help or give us feedback. We read every message and will get - back to you as soon as we can. - -
-
+ + {button} + + Contact us +
+
+ + + How can we help? We read every message and will respond as quickly as we can. + +
+ {!(type === "feature" || type === "help" || type === "concurrency") && ( +
+ )} + +
- - - - + {type === "feature" && ( + + + All our feature requests are public and voted on by the community. The best + way to submit your feature request is to{" "} + + post it to our feedback forum + + . + + + )} + {type === "help" && ( + + + The quickest way to get answers from the Trigger.dev team and community is to{" "} + ask in our Discord. + + + )} + {type === "concurrency" && ( + + + How much extra concurrency do you need? You can add bundles of 50 for + $50/month each. To help us advise you, please let us know what your tasks do, + your typical run volume, and if your workload is spiky (many runs at once). + + + )} + {feedbackType.error} @@ -91,42 +157,21 @@ export function Feedback({ button, defaultValue = "bug" }: FeedbackProps) { {form.error} - Send + Send message } + cancelButton={ + + + + } />
- - - - ); -} - -function DiscordBanner() { - return ( - -
- -

- Join the Trigger.dev -
- Discord community -

- - Get help or answer questions from the Trigger.dev community. - -
-
- -
-
+
+
+
); } diff --git a/apps/webapp/app/components/GitHubLoginButton.tsx b/apps/webapp/app/components/GitHubLoginButton.tsx index 87238db087e..76a494927cd 100644 --- a/apps/webapp/app/components/GitHubLoginButton.tsx +++ b/apps/webapp/app/components/GitHubLoginButton.tsx @@ -32,8 +32,6 @@ export function OctoKitty({ className }: { className?: string }) { baseProfile="tiny" id="Layer_1" xmlns="http://www.w3.org/2000/svg" - x="0px" - y="0px" viewBox="0 0 2350 2314.8" xmlSpace="preserve" fill="currentColor" diff --git a/apps/webapp/app/components/GitMetadata.tsx b/apps/webapp/app/components/GitMetadata.tsx new file mode 100644 index 00000000000..fb53ee6bfea --- /dev/null +++ b/apps/webapp/app/components/GitMetadata.tsx @@ -0,0 +1,89 @@ +import { GitPullRequestIcon, GitCommitIcon, GitBranchIcon } from "lucide-react"; +import { type GitMetaLinks } from "~/presenters/v3/BranchesPresenter.server"; +import { LinkButton } from "./primitives/Buttons"; +import { SimpleTooltip } from "./primitives/Tooltip"; + +export function GitMetadata({ git }: { git?: GitMetaLinks | null }) { + if (!git) return null; + return ( + <> + {git.pullRequestUrl && git.pullRequestNumber && } + {git.branchUrl && } + {git.shortSha && } + + ); +} + +export function GitMetadataBranch({ + git, +}: { + git: Pick; +}) { + return ( + } + leadingIconClassName="group-hover/table-row:text-text-bright" + iconSpacing="gap-x-1" + to={git.branchUrl} + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" + > + {git.branchName} + + } + content="Jump to GitHub branch" + /> + ); +} + +export function GitMetadataCommit({ + git, +}: { + git: Pick; +}) { + return ( + } + leadingIconClassName="group-hover/table-row:text-text-bright" + iconSpacing="gap-x-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" + > + {`${git.shortSha} / ${git.commitMessage}`} + + } + content="Jump to GitHub commit" + /> + ); +} + +export function 
GitMetadataPullRequest({ + git, +}: { + git: Pick; +}) { + if (!git.pullRequestUrl || !git.pullRequestNumber) return null; + + return ( + } + leadingIconClassName="group-hover/table-row:text-text-bright" + iconSpacing="gap-x-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" + > + #{git.pullRequestNumber} {git.pullRequestTitle} + + } + content="Jump to GitHub pull request" + /> + ); +} diff --git a/apps/webapp/app/components/HighlightInit.tsx b/apps/webapp/app/components/HighlightInit.tsx deleted file mode 100644 index fea26ef1ec2..00000000000 --- a/apps/webapp/app/components/HighlightInit.tsx +++ /dev/null @@ -1,14 +0,0 @@ -import { H, HighlightOptions } from "highlight.run"; -import { useEffect } from "react"; - -interface Props extends HighlightOptions { - projectId?: string; -} - -export function HighlightInit({ projectId, ...highlightOptions }: Props) { - useEffect(() => { - projectId && H.init(projectId, highlightOptions); - }, []); // eslint-disable-line react-hooks/exhaustive-deps - - return null; -} diff --git a/apps/webapp/app/components/ImpersonationBanner.tsx b/apps/webapp/app/components/ImpersonationBanner.tsx index d665d344bf6..a459f78a05c 100644 --- a/apps/webapp/app/components/ImpersonationBanner.tsx +++ b/apps/webapp/app/components/ImpersonationBanner.tsx @@ -1,27 +1,29 @@ +import { UserMinusIcon } from "@heroicons/react/20/solid"; import { Form } from "@remix-run/react"; -import { Paragraph } from "./primitives/Paragraph"; import { Button } from "./primitives/Buttons"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "./primitives/Tooltip"; -export function ImpersonationBanner({ impersonationId }: { impersonationId: string }) { +export function ImpersonationBanner() { return ( -
- - - You are impersonating {impersonationId} - -
- +
+ + + + +
); diff --git a/apps/webapp/app/components/JobsStatusTable.tsx b/apps/webapp/app/components/JobsStatusTable.tsx deleted file mode 100644 index dbf66e2ab40..00000000000 --- a/apps/webapp/app/components/JobsStatusTable.tsx +++ /dev/null @@ -1,54 +0,0 @@ -import { RuntimeEnvironmentType } from "@trigger.dev/database"; -import { - Table, - TableBody, - TableCell, - TableHeader, - TableHeaderCell, - TableRow, -} from "~/components/primitives/Table"; -import { EnvironmentLabel } from "./environments/EnvironmentLabel"; -import { DateTime } from "./primitives/DateTime"; -import { ActiveBadge } from "./ActiveBadge"; - -export type JobEnvironment = { - type: RuntimeEnvironmentType; - lastRun?: Date; - version: string; - enabled: boolean; -}; - -type JobStatusTableProps = { - environments: JobEnvironment[]; -}; - -export function JobStatusTable({ environments }: JobStatusTableProps) { - return ( - - - - Env - Last Run - Version - Status - - - - {environments.map((environment, index) => ( - - - - - - {environment.lastRun ? : "Never Run"} - - {environment.version} - - - - - ))} - -
- ); -} diff --git a/apps/webapp/app/components/ListPagination.tsx b/apps/webapp/app/components/ListPagination.tsx new file mode 100644 index 00000000000..6e26330677f --- /dev/null +++ b/apps/webapp/app/components/ListPagination.tsx @@ -0,0 +1,89 @@ +import { ChevronLeftIcon, ChevronRightIcon } from "@heroicons/react/20/solid"; +import { useLocation } from "@remix-run/react"; +import { z } from "zod"; +import { LinkButton } from "~/components/primitives/Buttons"; +import { cn } from "~/utils/cn"; + +type List = { + pagination: { + next?: string | undefined; + previous?: string | undefined; + }; +}; + +export const DirectionSchema = z.union([z.literal("forward"), z.literal("backward")]); +export type Direction = z.infer; + +export function ListPagination({ list, className }: { list: List; className?: string }) { + const bothDisabled = !list.pagination.previous && !list.pagination.next; + + return ( +
+ + +
+
+ ); +} + +function PreviousButton({ cursor }: { cursor?: string }) { + const path = useCursorPath(cursor, "backward"); + + return ( +
+ !path && e.preventDefault()} + shortcut={{ key: "j" }} + tooltip="Previous" + disabled={!path} + /> +
+ ); +} + +function NextButton({ cursor }: { cursor?: string }) { + const path = useCursorPath(cursor, "forward"); + + return ( +
+ !path && e.preventDefault()} + shortcut={{ key: "k" }} + tooltip="Next" + disabled={!path} + /> +
+ ); +} + +function useCursorPath(cursor: string | undefined, direction: Direction) { + const location = useLocation(); + + if (!cursor) { + return undefined; + } + + const search = new URLSearchParams(location.search); + search.set("cursor", cursor); + search.set("direction", direction); + return location.pathname + "?" + search.toString(); +} diff --git a/apps/webapp/app/components/LogLevelTooltipInfo.tsx b/apps/webapp/app/components/LogLevelTooltipInfo.tsx new file mode 100644 index 00000000000..2a8093af066 --- /dev/null +++ b/apps/webapp/app/components/LogLevelTooltipInfo.tsx @@ -0,0 +1,56 @@ +import { Header3 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { LogLevel } from "./logs/LogLevel"; + +export function LogLevelTooltipInfo() { + return ( +
+
+ Log Levels + + Structured logging helps you debug and monitor your tasks. + +
+
+
+ +
+ + Traces and spans representing the execution flow of your tasks. + +
+
+
+ +
+ + General informational messages about task execution. + +
+
+
+ +
+ + Warning messages indicating potential issues that don't prevent execution. + +
+
+
+ +
+ + Error messages for failures and exceptions during task execution. + +
+
+
+ +
+ + Detailed diagnostic information for development and debugging. + +
+
+ ); +} diff --git a/apps/webapp/app/components/LoginPageLayout.tsx b/apps/webapp/app/components/LoginPageLayout.tsx new file mode 100644 index 00000000000..3e42cd6894f --- /dev/null +++ b/apps/webapp/app/components/LoginPageLayout.tsx @@ -0,0 +1,92 @@ +import { useEffect, useState } from "react"; +import { AppsmithLogo } from "~/assets/logos/AppsmithLogo"; +import { CalComLogo } from "~/assets/logos/CalComLogo"; +import { LyftLogo } from "~/assets/logos/LyftLogo"; +import { MiddayLogo } from "~/assets/logos/MiddayLogo"; +import { TldrawLogo } from "~/assets/logos/TldrawLogo"; +import { UnkeyLogo } from "~/assets/logos/UnkeyLogo"; +import { LogoType } from "./LogoType"; +import { LinkButton } from "./primitives/Buttons"; +import { Header3 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { TextLink } from "./primitives/TextLink"; +import { BookOpenIcon } from "@heroicons/react/20/solid"; + +interface QuoteType { + quote: string; + person: string; +} + +const quotes: QuoteType[] = [ + { + quote: "Trigger.dev is redefining background jobs for modern developers.", + person: "Paul Copplestone, Supabase", + }, + { + quote: + "Trigger.dev is a great way to automate email campaigns with Resend, and we've heard nothing but good things from our mutual customers.", + person: "Zeno Rocha, Resend", + }, + { + quote: "We love Trigger.dev and it’s had a big impact in dev iteration velocity already.", + person: "André Neves, ZBD", + }, + { + quote: + "We’ve been looking for a product like Trigger.dev for a really long time - automation that's simple and developer-focused.", + person: "Han Wang, Mintlify", + }, +]; + +export function LoginPageLayout({ children }: { children: React.ReactNode }) { + const [randomQuote, setRandomQuote] = useState(null); + useEffect(() => { + const randomIndex = Math.floor(Math.random() * quotes.length); + setRandomQuote(quotes[randomIndex]); + }, []); + + return ( +
+
+
+
+ + + + + Documentation + +
+
{children}
+ + Having login issues? Email us{" "} + or ask us in Discord + +
+
+
+
+ + {randomQuote?.quote} + + {randomQuote?.person} +
+
+ Trusted by developers at +
+ + + + + + +
+
+
+
+ ); +} diff --git a/apps/webapp/app/components/LogoType.tsx b/apps/webapp/app/components/LogoType.tsx new file mode 100644 index 00000000000..76a88fce1a1 --- /dev/null +++ b/apps/webapp/app/components/LogoType.tsx @@ -0,0 +1,190 @@ +export function LogoType({ className }: { className?: string }) { + return ( + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ); +} diff --git a/apps/webapp/app/components/MachineLabelCombo.tsx b/apps/webapp/app/components/MachineLabelCombo.tsx new file mode 100644 index 00000000000..3d22ca527d0 --- /dev/null +++ b/apps/webapp/app/components/MachineLabelCombo.tsx @@ -0,0 +1,61 @@ +import { MachinePresetName } from "@trigger.dev/core/v3"; +import { MachineIcon } from "~/assets/icons/MachineIcon"; +import { cn } from "~/utils/cn"; + +export const machines = Object.values(MachinePresetName.enum); + +export function MachineLabelCombo({ + preset, + className, + iconClassName, + labelClassName, +}: { + preset?: MachinePresetName | null; + className?: string; + iconClassName?: string; + labelClassName?: string; +}) { + return ( + + + + + ); +} + +export function MachineLabel({ + preset, + className, +}: { + preset?: MachinePresetName | null; + className?: string; +}) { + return ( + {formatMachinePresetName(preset)} + ); +} + +export function formatMachinePresetName(preset?: MachinePresetName | null): string { + if (!preset) { + return "No machine yet"; + } + + switch (preset) { + case "micro": + return "Micro"; + case "small-1x": + return "Small 1x"; + case "small-2x": + return "Small 2x"; + case "medium-1x": + return "Medium 1x"; + case "medium-2x": + return "Medium 2x"; + case "large-1x": + return "Large 1x"; + case "large-2x": + return "Large 2x"; + default: + return preset; + } +} diff --git a/apps/webapp/app/components/MachineTooltipInfo.tsx b/apps/webapp/app/components/MachineTooltipInfo.tsx new file mode 100644 index 00000000000..3b3616288bd --- /dev/null 
+++ b/apps/webapp/app/components/MachineTooltipInfo.tsx @@ -0,0 +1,63 @@ +import { MachineIcon } from "~/assets/icons/MachineIcon"; +import { docsPath } from "~/utils/pathBuilder"; +import { LinkButton } from "./primitives/Buttons"; +import { Header3 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { BookOpenIcon } from "@heroicons/react/20/solid"; + +export function MachineTooltipInfo() { + return ( +
+
+
+ + No machine yet +
+ + The machine is set at the moment the run is dequeued. + +
+
+
+ + Micro +
+ + The smallest and cheapest machine available. + +
+
+
+ Small 1x & 2x +
+ + Smaller machines for basic workloads. Small 1x is the default machine. + +
+
+
+ Medium 1x & 2x +
+ + Medium machines for more demanding workloads. + +
+
+
+ Large 1x & 2x +
+ + Larger machines for the most demanding workloads such as video processing. The larger the + machine, the more expensive it is. + +
+ + Read docs + +
+ ); +} diff --git a/apps/webapp/app/components/NoMobileOverlay.tsx b/apps/webapp/app/components/NoMobileOverlay.tsx deleted file mode 100644 index 2e6de051bb5..00000000000 --- a/apps/webapp/app/components/NoMobileOverlay.tsx +++ /dev/null @@ -1,21 +0,0 @@ -import { XMarkIcon, DevicePhoneMobileIcon } from "@heroicons/react/24/outline"; -import { Paragraph } from "./primitives/Paragraph"; -import { LinkButton } from "./primitives/Buttons"; - -export function NoMobileOverlay() { - return ( - <> -
-
-
- - - Trigger.dev is currently only available on desktop. - - Back Home - -
-
- - ); -} diff --git a/apps/webapp/app/components/PageGradient.tsx b/apps/webapp/app/components/PageGradient.tsx deleted file mode 100644 index 4bfed9f24c8..00000000000 --- a/apps/webapp/app/components/PageGradient.tsx +++ /dev/null @@ -1,12 +0,0 @@ -import gradientBackground from "~/assets/images/gradient-background.png"; - -export function PageGradient({ children }: { children: React.ReactNode }) { - return ( -
- {children} -
- ); -} diff --git a/apps/webapp/app/components/ProductHuntBanner.tsx b/apps/webapp/app/components/ProductHuntBanner.tsx index 39642da98c1..b85893836a1 100644 --- a/apps/webapp/app/components/ProductHuntBanner.tsx +++ b/apps/webapp/app/components/ProductHuntBanner.tsx @@ -14,7 +14,7 @@ export function ProductHuntBanner() { Vote for us today only! diff --git a/apps/webapp/app/components/RuntimeIcon.tsx b/apps/webapp/app/components/RuntimeIcon.tsx new file mode 100644 index 00000000000..f0626e97a38 --- /dev/null +++ b/apps/webapp/app/components/RuntimeIcon.tsx @@ -0,0 +1,58 @@ +import { SimpleTooltip } from "~/components/primitives/Tooltip"; +import { BunLogoIcon } from "~/assets/icons/BunLogoIcon"; +import { NodejsLogoIcon } from "~/assets/icons/NodejsLogoIcon"; +import { parseRuntime, formatRuntimeWithVersion, type NormalizedRuntime } from "~/utils/runtime"; + +interface RuntimeIconProps { + runtime?: string | null; + runtimeVersion?: string | null; + className?: string; + withLabel?: boolean; +} + +const getIcon = (runtime: NormalizedRuntime, className: string) => { + switch (runtime) { + case "bun": + return ; + case "node": + return ; + default: + return ; + } +}; + +export function RuntimeIcon({ + runtime, + runtimeVersion, + className = "h-4 w-4", + withLabel = false, +}: RuntimeIconProps) { + const parsedRuntime = parseRuntime(runtime); + + // Default to Node.js if no runtime is specified + const effectiveRuntime = parsedRuntime || { + runtime: "node" as const, + originalRuntime: "node", + displayName: "Node.js", + }; + + const icon = getIcon(effectiveRuntime.runtime, className); + const formattedText = formatRuntimeWithVersion(effectiveRuntime.originalRuntime, runtimeVersion); + + if (withLabel) { + return ( + + {icon} + {formattedText} + + ); + } + + if (typeof icon === "object" && "type" in icon) { + return ( + + ); + } + + return icon; +} diff --git a/apps/webapp/app/components/SetupCommands.tsx b/apps/webapp/app/components/SetupCommands.tsx index 
5c346936935..accb2f65a8f 100644 --- a/apps/webapp/app/components/SetupCommands.tsx +++ b/apps/webapp/app/components/SetupCommands.tsx @@ -1,4 +1,7 @@ -import { InlineCode } from "./code/InlineCode"; +import { createContext, useContext, useState } from "react"; +import { useAppOrigin } from "~/hooks/useAppOrigin"; +import { useProject } from "~/hooks/useProject"; +import { useTriggerCliTag } from "~/hooks/useTriggerCliTag"; import { ClientTabs, ClientTabsContent, @@ -6,125 +9,263 @@ import { ClientTabsTrigger, } from "./primitives/ClientTabs"; import { ClipboardField } from "./primitives/ClipboardField"; -import { Paragraph } from "./primitives/Paragraph"; +import { Header3 } from "./primitives/Headers"; + +type PackageManagerContextType = { + activePackageManager: string; + setActivePackageManager: (value: string) => void; +}; + +const PackageManagerContext = createContext(undefined); + +export function PackageManagerProvider({ children }: { children: React.ReactNode }) { + const [activePackageManager, setActivePackageManager] = useState("npm"); + + return ( + + {children} + + ); +} + +function usePackageManager() { + const context = useContext(PackageManagerContext); + if (context === undefined) { + throw new Error("usePackageManager must be used within a PackageManagerProvider"); + } + return context; +} + +function getApiUrlArg() { + const appOrigin = useAppOrigin(); + + let apiUrl: string | undefined = undefined; + + switch (appOrigin) { + case "https://cloud.trigger.dev": + // don't display the arg, use the CLI default + break; + case "https://test-cloud.trigger.dev": + apiUrl = "https://test-api.trigger.dev"; + break; + case "https://internal.trigger.dev": + apiUrl = "https://internal-api.trigger.dev"; + break; + default: + apiUrl = appOrigin; + break; + } + + return apiUrl ? 
`-a ${apiUrl}` : undefined; +} + +// Add title prop to the component interfaces +type TabsProps = { + title?: string; +}; + +export function InitCommandV3({ title }: TabsProps) { + const project = useProject(); + const projectRef = project.externalRef; + const apiUrlArg = getApiUrlArg(); + const triggerCliTag = useTriggerCliTag(); + + const initCommandParts = [`trigger.dev@${triggerCliTag}`, "init", `-p ${projectRef}`, apiUrlArg]; + const initCommand = initCommandParts.filter(Boolean).join(" "); + + const { activePackageManager, setActivePackageManager } = usePackageManager(); -export function InitCommand({ appOrigin, apiKey }: { appOrigin: string; apiKey: string }) { return ( - - - npm - pnpm - yarn - + +
+ {title && {title}} + + npm + pnpm + yarn + +
); } -export function RunDevCommand({ extra }: { extra?: string }) { +export function TriggerDevStepV3({ title }: TabsProps) { + const triggerCliTag = useTriggerCliTag(); + const { activePackageManager, setActivePackageManager } = usePackageManager(); + return ( - - - npm - pnpm - yarn - + +
+ {title && {title}} + + npm + pnpm + yarn + +
); } -export function TriggerDevCommand({ extra }: { extra?: string }) { +export function TriggerLoginStepV3({ title }: TabsProps) { + const triggerCliTag = useTriggerCliTag(); + const { activePackageManager, setActivePackageManager } = usePackageManager(); + return ( - - - npm - pnpm - yarn - + +
+ {title && {title}} + + npm + pnpm + yarn + +
); } -export function TriggerDevStep({ extra }: { extra?: string }) { +export function TriggerDeployStep({ title, environment }: TabsProps & { environment: { type: string } }) { + const triggerCliTag = useTriggerCliTag(); + const { activePackageManager, setActivePackageManager } = usePackageManager(); + + // Generate the environment flag based on environment type + const getEnvironmentFlag = () => { + switch (environment.type) { + case "STAGING": + return " --env staging"; + case "PREVIEW": + return " --env preview"; + case "PRODUCTION": + default: + return ""; + } + }; + + const environmentFlag = getEnvironmentFlag(); + return ( - <> - - In a separate terminal window or tab run: - - - - If you’re not running on the default you can specify the port by adding{" "} - --port 3001 to the end. - - - You should leave the dev command running when - you're developing. - - + +
+ {title && {title}} + + npm + pnpm + yarn + +
+ + + + + + + + + +
); } diff --git a/apps/webapp/app/components/Shortcuts.tsx b/apps/webapp/app/components/Shortcuts.tsx new file mode 100644 index 00000000000..2decc82c914 --- /dev/null +++ b/apps/webapp/app/components/Shortcuts.tsx @@ -0,0 +1,227 @@ +import { Keyboard } from "lucide-react"; +import { useState } from "react"; +import { useShortcutKeys } from "~/hooks/useShortcutKeys"; +import { Button } from "./primitives/Buttons"; +import { Header3 } from "./primitives/Headers"; +import { Paragraph } from "./primitives/Paragraph"; +import { + Sheet, + SheetContent, + SheetHeader, + SheetTitle, + SheetTrigger +} from "./primitives/SheetV3"; +import { ShortcutKey } from "./primitives/ShortcutKey"; + +export function Shortcuts() { + return ( + + + + + + + ); +} + +export function ShortcutsAutoOpen() { + const [isOpen, setIsOpen] = useState(false); + + useShortcutKeys({ + shortcut: { modifiers: ["shift"], key: "?" }, + action: () => { + setIsOpen(true); + }, + }); + + return ( + + + + ); +} + +function ShortcutContent() { + return ( + + + +
+ + + Keyboard shortcuts + +
+
+
+
+ General + + + + + + + + + + + + + + + + + + + + + + to + + + + + + + + + + + + +
+
+ Runs page + + + + + + + + + +
+
+ Run page + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + to + + + + + + + + + +
+
+ Logs page + + + + + + + + + + + + + to + + + + + + + + + + + + + + + +
+
+ Metrics page + + + +
+
+ Schedules page + + + +
+
+ Alerts page + + + +
+
+
+
+ ); +} + +function Shortcut({ children, name }: { children: React.ReactNode; name: string }) { + return ( +
+ {name} + {children} +
+ ); +} diff --git a/apps/webapp/app/components/TimezoneSetter.tsx b/apps/webapp/app/components/TimezoneSetter.tsx new file mode 100644 index 00000000000..3481af6571d --- /dev/null +++ b/apps/webapp/app/components/TimezoneSetter.tsx @@ -0,0 +1,30 @@ +import { useFetcher } from "@remix-run/react"; +import { useEffect, useRef } from "react"; +import { useTypedLoaderData } from "remix-typedjson"; +import type { loader } from "~/root"; + +export function TimezoneSetter() { + const { timezone: storedTimezone } = useTypedLoaderData(); + const fetcher = useFetcher(); + const hasSetTimezone = useRef(false); + + useEffect(() => { + if (hasSetTimezone.current) return; + + const browserTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone; + + if (browserTimezone && browserTimezone !== storedTimezone) { + hasSetTimezone.current = true; + fetcher.submit( + { timezone: browserTimezone }, + { + method: "POST", + action: "/resources/timezone", + encType: "application/json", + } + ); + } + }, [storedTimezone, fetcher]); + + return null; +} diff --git a/apps/webapp/app/components/TriggerRotatingLogo.tsx b/apps/webapp/app/components/TriggerRotatingLogo.tsx new file mode 100644 index 00000000000..878c203a3ca --- /dev/null +++ b/apps/webapp/app/components/TriggerRotatingLogo.tsx @@ -0,0 +1,75 @@ +import { motion } from "framer-motion"; +import { useEffect, useState } from "react"; + +declare global { + namespace JSX { + interface IntrinsicElements { + "spline-viewer": React.DetailedHTMLProps< + React.HTMLAttributes & { + url?: string; + "loading-anim-type"?: string; + }, + HTMLElement + >; + } + } + + interface Window { + __splineLoader?: Promise; + } +} + +export function TriggerRotatingLogo() { + const [isSplineReady, setIsSplineReady] = useState(false); + + useEffect(() => { + // Already registered from a previous render + if (customElements.get("spline-viewer")) { + setIsSplineReady(true); + return; + } + + // Another mount already started loading - share the same promise + 
if (window.__splineLoader) { + window.__splineLoader.then(() => setIsSplineReady(true)).catch(() => setIsSplineReady(false)); + return; + } + + // First mount: create script and shared loader promise + const script = document.createElement("script"); + script.type = "module"; + // Version pinned; SRI hash omitted as unpkg doesn't guarantee hash stability across deploys + script.src = "https://unpkg.com/@splinetool/viewer@1.12.29/build/spline-viewer.js"; + + window.__splineLoader = new Promise((resolve, reject) => { + script.onload = () => resolve(); + script.onerror = () => reject(); + }); + + window.__splineLoader.then(() => setIsSplineReady(true)).catch(() => setIsSplineReady(false)); + + document.head.appendChild(script); + + // Intentionally no cleanup: once the custom element is registered globally, + // removing the script would break re-mounts while providing no benefit + }, []); + + if (!isSplineReady) { + return null; + } + + return ( + + + + ); +} diff --git a/apps/webapp/app/components/UserProfilePhoto.tsx b/apps/webapp/app/components/UserProfilePhoto.tsx index a59e0a6538a..99febd1c240 100644 --- a/apps/webapp/app/components/UserProfilePhoto.tsx +++ b/apps/webapp/app/components/UserProfilePhoto.tsx @@ -22,9 +22,10 @@ export function UserAvatar({ className={cn("aspect-square rounded-full p-[7%]")} src={avatarUrl} alt={name ?? "User"} + referrerPolicy="no-referrer" />
) : ( - + ); } diff --git a/apps/webapp/app/components/V4Badge.tsx b/apps/webapp/app/components/V4Badge.tsx new file mode 100644 index 00000000000..c92baabac81 --- /dev/null +++ b/apps/webapp/app/components/V4Badge.tsx @@ -0,0 +1,26 @@ +import { cn } from "~/utils/cn"; +import { Badge } from "./primitives/Badge"; +import { SimpleTooltip } from "./primitives/Tooltip"; + +export function V4Badge({ inline = false, className }: { inline?: boolean; className?: string }) { + return ( + + V4 + + } + content="This feature is only available in V4 and above." + disableHoverableContent + /> + ); +} + +export function V4Title({ children }: { children: React.ReactNode }) { + return ( + <> + {children} + + + ); +} diff --git a/apps/webapp/app/components/WarmStarts.tsx b/apps/webapp/app/components/WarmStarts.tsx new file mode 100644 index 00000000000..07a894b9e7b --- /dev/null +++ b/apps/webapp/app/components/WarmStarts.tsx @@ -0,0 +1,59 @@ +import { WarmStartIcon } from "~/assets/icons/WarmStartIcon"; +import { InfoIconTooltip, SimpleTooltip } from "./primitives/Tooltip"; +import { cn } from "~/utils/cn"; +import { Paragraph } from "./primitives/Paragraph"; + +export function WarmStartCombo({ + isWarmStart, + showTooltip = false, + className, +}: { + isWarmStart: boolean; + showTooltip?: boolean; + className?: string; +}) { + return ( +
+ + {isWarmStart ? "Warm Start" : "Cold Start"} + {showTooltip && } />} +
+ ); +} + +export function WarmStartIconWithTooltip({ + isWarmStart, + className, +}: { + isWarmStart: boolean; + className?: string; +}) { + return ( + } + content={} + /> + ); +} + +function WarmStartTooltipContent() { + return ( +
+
+ + + A cold start happens when we need to boot up a new machine for your run to execute. This + takes longer than a warm start. + +
+
+ + + A warm start happens when we can reuse a machine from a run that recently finished. This + takes less time than a cold start. + +
+
+ ); +} diff --git a/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx b/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx new file mode 100644 index 00000000000..df8669d36dd --- /dev/null +++ b/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx @@ -0,0 +1,290 @@ +import { useFetcher } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import stableStringify from "json-stable-stringify"; +import { + Dialog, + DialogContent, + DialogHeader, + DialogDescription, + DialogFooter, +} from "~/components/primitives/Dialog"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { LockClosedIcon } from "@heroicons/react/20/solid"; +import { CheckboxWithLabel } from "~/components/primitives/Checkbox"; +import { cn } from "~/utils/cn"; +import { FEATURE_FLAG, ORG_LOCKED_FLAGS, type FlagControlType } from "~/v3/featureFlags"; +import { + UNSET_VALUE, + BooleanControl, + EnumControl, + StringControl, + WorkerGroupControl, + type WorkerGroup, +} from "./FlagControls"; + +type LoaderData = { + org: { id: string; title: string; slug: string }; + orgFlags: Record; + globalFlags: Record; + controlTypes: Record; + workerGroupName?: string; + workerGroups?: WorkerGroup[]; + isManagedCloud?: boolean; +}; + +type ActionData = { + success?: boolean; + error?: string; +}; + +type FeatureFlagsDialogProps = { + orgId: string | null; + orgTitle: string; + open: boolean; + onOpenChange: (open: boolean) => void; +}; + +export function FeatureFlagsDialog({ + orgId, + orgTitle, + open, + onOpenChange, +}: FeatureFlagsDialogProps) { + const loadFetcher = useFetcher(); + const saveFetcher = useFetcher(); + + const [overrides, setOverrides] = useState>({}); + const [initialOverrides, setInitialOverrides] = useState>({}); + const [saveError, setSaveError] = useState(null); + const [unlocked, setUnlocked] = useState(false); + + const isLocked = (key: string) => !unlocked && 
ORG_LOCKED_FLAGS.includes(key); + + useEffect(() => { + if (open && orgId) { + setSaveError(null); + setOverrides({}); + setInitialOverrides({}); + loadFetcher.load(`/admin/api/v2/orgs/${orgId}/feature-flags`); + } + }, [open, orgId]); + + useEffect(() => { + if (loadFetcher.data) { + const loaded = loadFetcher.data.orgFlags ?? {}; + setOverrides({ ...loaded }); + setInitialOverrides({ ...loaded }); + } + }, [loadFetcher.data]); + + useEffect(() => { + if (saveFetcher.data?.success) { + onOpenChange(false); + } else if (saveFetcher.data?.error) { + setSaveError(saveFetcher.data.error); + } + }, [saveFetcher.data]); + + const isDirty = stableStringify(overrides) !== stableStringify(initialOverrides); + + const setFlagValue = (key: string, value: unknown) => { + setOverrides((prev) => ({ ...prev, [key]: value })); + }; + + const unsetFlag = (key: string) => { + setOverrides((prev) => { + const next = { ...prev }; + delete next[key]; + return next; + }); + }; + + const handleSave = () => { + if (!orgId) return; + const body = Object.keys(overrides).length === 0 ? null : overrides; + saveFetcher.submit(JSON.stringify(body), { + method: "POST", + action: `/admin/api/v2/orgs/${orgId}/feature-flags`, + encType: "application/json", + }); + }; + + const data = loadFetcher.data; + const isLoading = loadFetcher.state === "loading"; + const isSaving = saveFetcher.state === "submitting"; + + const jsonPreview = + Object.keys(overrides).length === 0 ? "null" : JSON.stringify(overrides, null, 2); + + const sortedFlagKeys = data ? Object.keys(data.controlTypes).sort() : []; + + return ( + + + Feature flags - {orgTitle} + + Org-level overrides. Unset flags inherit from global defaults. + + + {data && ( +
+ +
+ )} + +
+ {isLoading ? ( +
Loading flags...
+ ) : data ? ( +
+ {sortedFlagKeys.map((key) => { + const control = data.controlTypes[key]; + const locked = isLocked(key); + const globalValue = data.globalFlags[key as keyof typeof data.globalFlags]; + const isWorkerGroup = key === FEATURE_FLAG.defaultWorkerInstanceGroupId; + const globalDisplay = + isWorkerGroup && data.workerGroupName && globalValue !== undefined + ? `${data.workerGroupName} (${String(globalValue).slice(0, 8)}...)` + : globalValue !== undefined + ? String(globalValue) + : "unset"; + + if (locked) { + return ( +
+
+
{key}
+
global: {globalDisplay}
+
+ +
+ ); + } + + const isOverridden = key in overrides; + + return ( +
+
+
+ {key} +
+
global: {globalDisplay}
+
+ +
+ + + {isWorkerGroup && data.workerGroups ? ( + { + if (val === UNSET_VALUE) { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : control.type === "boolean" ? ( + setFlagValue(key, val)} + dimmed={!isOverridden} + /> + ) : control.type === "enum" ? ( + { + if (val === UNSET_VALUE) { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : control.type === "string" ? ( + { + if (val === "") { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : null} +
+
+ ); + })} +
+ ) : null} +
+ + {data && ( +
+ + Preview JSON + +
+              {jsonPreview}
+            
+
+ )} + + {saveError && {saveError}} + + + + + +
+
+ ); +} diff --git a/apps/webapp/app/components/admin/FlagControls.tsx b/apps/webapp/app/components/admin/FlagControls.tsx new file mode 100644 index 00000000000..b08f925dd90 --- /dev/null +++ b/apps/webapp/app/components/admin/FlagControls.tsx @@ -0,0 +1,120 @@ +import { Switch } from "~/components/primitives/Switch"; +import { Select, SelectItem } from "~/components/primitives/Select"; +import { Input } from "~/components/primitives/Input"; +import { cn } from "~/utils/cn"; + +export const UNSET_VALUE = "__unset__"; + +export function BooleanControl({ + value, + onChange, + dimmed, +}: { + value: boolean | undefined; + onChange: (val: boolean) => void; + dimmed: boolean; +}) { + return ( + + ); +} + +export function EnumControl({ + value, + options, + onChange, + dimmed, +}: { + value: string | undefined; + options: string[]; + onChange: (val: string) => void; + dimmed: boolean; +}) { + const items = [UNSET_VALUE, ...options]; + + return ( + + ); +} + +export type WorkerGroup = { id: string; name: string }; + +export function WorkerGroupControl({ + value, + workerGroups, + onChange, + dimmed, +}: { + value: string | undefined; + workerGroups: WorkerGroup[]; + onChange: (val: string) => void; + dimmed: boolean; +}) { + const items = [UNSET_VALUE, ...workerGroups.map((wg) => wg.id)]; + + return ( + + ); +} + +export function StringControl({ + value, + onChange, + dimmed, +}: { + value: string; + onChange: (val: string) => void; + dimmed: boolean; +}) { + return ( + onChange(e.target.value)} + placeholder="unset" + className={cn("w-40", dimmed && "opacity-50")} + /> + ); +} diff --git a/apps/webapp/app/components/admin/debugRun.tsx b/apps/webapp/app/components/admin/debugRun.tsx new file mode 100644 index 00000000000..5d5386a58aa --- /dev/null +++ b/apps/webapp/app/components/admin/debugRun.tsx @@ -0,0 +1,400 @@ +import { useIsImpersonating } from "~/hooks/useOrganizations"; +import { useHasAdminAccess } from "~/hooks/useUser"; +import { Button } from 
"../primitives/Buttons"; +import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "../primitives/Dialog"; +import { Cog6ToothIcon } from "@heroicons/react/20/solid"; +import { type loader } from "~/routes/resources.taskruns.$runParam.debug"; +import { UseDataFunctionReturn, useTypedFetcher } from "remix-typedjson"; +import { useEffect } from "react"; +import { Spinner } from "../primitives/Spinner"; +import * as Property from "~/components/primitives/PropertyTable"; +import { ClipboardField } from "../primitives/ClipboardField"; +import { MarQSShortKeyProducer } from "~/v3/marqs/marqsKeyProducer"; + +export function AdminDebugRun({ friendlyId }: { friendlyId: string }) { + const hasAdminAccess = useHasAdminAccess(); + const isImpersonating = useIsImpersonating(); + + if (!hasAdminAccess && !isImpersonating) { + return null; + } + + return ( + + + + + + + ); +} + +export function DebugRunDialog({ friendlyId }: { friendlyId: string }) { + return ( + + + + ); +} + +function DebugRunContent({ friendlyId }: { friendlyId: string }) { + const fetcher = useTypedFetcher(); + const isLoading = fetcher.state === "loading"; + + useEffect(() => { + fetcher.load(`/resources/taskruns/${friendlyId}/debug`); + }, [friendlyId]); + + return ( + <> + Debugging run + {isLoading ? ( +
+ +
+ ) : fetcher.data ? ( + + ) : ( + <>Failed to get run debug data + )} + + ); +} + +function DebugRunData(props: UseDataFunctionReturn) { + if (props.engine === "V1") { + return ; + } + + return ; +} + +function DebugRunDataEngineV1({ + run, + queueConcurrencyLimit, + queueCurrentConcurrency, + envConcurrencyLimit, + envCurrentConcurrency, + queueReserveConcurrency, + envReserveConcurrency, +}: UseDataFunctionReturn) { + const keys = new MarQSShortKeyProducer("marqs:"); + + const withPrefix = (key: string) => `marqs:${key}`; + + return ( + + + ID + + + + + + Message key + + + + + + GET message + + + + + + Queue key + + + + + + Get queue set + + + + + + Queue current concurrency key + + + + + + + Get queue current concurrency + + + + + + Queue current concurrency + + {queueCurrentConcurrency ?? "0"} + + + + Queue reserve concurrency key + + + + + + + Get queue reserve concurrency + + + + + + Queue reserve concurrency + + {queueReserveConcurrency ?? "0"} + + + + Queue concurrency limit key + + + + + + GET queue concurrency limit + + + + + + Queue concurrency limit + + {queueConcurrencyLimit ?? "Not set"} + + + + Env current concurrency key + + + + + + Get env current concurrency + + + + + + Env current concurrency + + {envCurrentConcurrency ?? "0"} + + + + Env reserve concurrency key + + + + + + Get env reserve concurrency + + + + + + Env reserve concurrency + + {envReserveConcurrency ?? "0"} + + + + Env concurrency limit key + + + + + + GET env concurrency limit + + + + + + Env concurrency limit + + {envConcurrencyLimit ?? "Not set"} + + + + Shared queue key + + + + + + Get shared queue set + + + + + + ); +} + +function DebugRunDataEngineV2({ + run, + queueConcurrencyLimit, + queueCurrentConcurrency, + envConcurrencyLimit, + envCurrentConcurrency, + keys, +}: UseDataFunctionReturn) { + return ( + + + ID + + + + + + Queue current concurrency + + {queueCurrentConcurrency ?? "0"} + + + + Queue concurrency limit + + {queueConcurrencyLimit ?? 
"Not set"} + + + + Env current concurrency + + {envCurrentConcurrency ?? "0"} + + + + Env concurrency limit + + {envConcurrencyLimit ?? "Not set"} + + + {keys.map((key) => ( + + {key.label} + + + + + ))} + + ); +} diff --git a/apps/webapp/app/components/admin/debugTooltip.tsx b/apps/webapp/app/components/admin/debugTooltip.tsx new file mode 100644 index 00000000000..b4ccb74f88d --- /dev/null +++ b/apps/webapp/app/components/admin/debugTooltip.tsx @@ -0,0 +1,87 @@ +import { ShieldCheckIcon } from "@heroicons/react/20/solid"; +import * as Property from "~/components/primitives/PropertyTable"; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from "~/components/primitives/Tooltip"; +import { useOptionalEnvironment } from "~/hooks/useEnvironment"; +import { useIsImpersonating, useOptionalOrganization } from "~/hooks/useOrganizations"; +import { useOptionalProject } from "~/hooks/useProject"; +import { useHasAdminAccess, useUser } from "~/hooks/useUser"; + +export function AdminDebugTooltip({ children }: { children?: React.ReactNode }) { + const hasAdminAccess = useHasAdminAccess(); + const isImpersonating = useIsImpersonating(); + + if (!hasAdminAccess && !isImpersonating) { + return null; + } + + return ( + + + + + + + {children} + + + + ); +} + +function Content({ children }: { children: React.ReactNode }) { + const organization = useOptionalOrganization(); + const project = useOptionalProject(); + const environment = useOptionalEnvironment(); + const user = useUser(); + + return ( +
+ + + User ID + {user.id} + + {organization && ( + + Org ID + {organization.id} + + )} + {project && ( + <> + + Project ID + {project.id} + + + Project ref + {project.externalRef} + + + )} + {environment && ( + <> + + Environment ID + {environment.id} + + + Environment type + {environment.type} + + + Environment paused + {environment.paused ? "Yes" : "No"} + + + )} + +
{children}
+
+ ); +} diff --git a/apps/webapp/app/components/billing/FreePlanUsage.tsx b/apps/webapp/app/components/billing/FreePlanUsage.tsx new file mode 100644 index 00000000000..3aa3378d0e8 --- /dev/null +++ b/apps/webapp/app/components/billing/FreePlanUsage.tsx @@ -0,0 +1,47 @@ +import { ArrowUpCircleIcon } from "@heroicons/react/24/outline"; +import { motion, useMotionValue, useTransform } from "framer-motion"; +import { Paragraph } from "../primitives/Paragraph"; +import { Link } from "@remix-run/react"; +import { cn } from "~/utils/cn"; + +export function FreePlanUsage({ to, percentage }: { to: string; percentage: number }) { + const cappedPercentage = Math.min(percentage, 1); + const widthProgress = useMotionValue(cappedPercentage * 100); + const color = useTransform( + widthProgress, + [0, 74, 75, 95, 100], + ["#22C55E", "#22C55E", "#F59E0B", "#F43F5E", "#F43F5E"] + ); + + const hasHitLimit = cappedPercentage >= 1; + + return ( +
+
+
+ + Free Plan +
+ + Upgrade + +
+
+ +
+
+ ); +} diff --git a/apps/webapp/app/components/billing/UpgradePrompt.tsx b/apps/webapp/app/components/billing/UpgradePrompt.tsx new file mode 100644 index 00000000000..8a3e098ba42 --- /dev/null +++ b/apps/webapp/app/components/billing/UpgradePrompt.tsx @@ -0,0 +1,53 @@ +import { ExclamationCircleIcon } from "@heroicons/react/20/solid"; +import tileBgPath from "~/assets/images/error-banner-tile@2x.png"; +import { MatchedOrganization, useOrganization } from "~/hooks/useOrganizations"; +import { useCurrentPlan } from "~/routes/_app.orgs.$organizationSlug/route"; +import { v3BillingPath } from "~/utils/pathBuilder"; +import { LinkButton } from "../primitives/Buttons"; +import { Icon } from "../primitives/Icon"; +import { Paragraph } from "../primitives/Paragraph"; +import { DateTime } from "~/components/primitives/DateTime"; + +export function UpgradePrompt() { + const organization = useOrganization(); + const plan = useCurrentPlan(); + + if (!plan || !plan.v3Usage.hasExceededFreeTier) { + return null; + } + + const nextMonth = new Date(); + nextMonth.setUTCMonth(nextMonth.getMonth() + 1); + nextMonth.setUTCDate(1); + nextMonth.setUTCHours(0, 0, 0, 0); + + return ( +
+
+ + + You have exceeded the monthly $ + {(plan.v3Subscription?.plan?.limits.includedUsage ?? 500) / 100} free credits. Existing + runs will be queued and new runs won't be created until{" "} + , or you upgrade. + +
+ + Upgrade + +
+ ); +} + +export function useShowUpgradePrompt(organization?: MatchedOrganization) { + const currentPlan = useCurrentPlan(); + const shouldShow = currentPlan?.v3Usage.hasExceededFreeTier === true; + return { shouldShow }; +} diff --git a/apps/webapp/app/components/billing/UsageBar.tsx b/apps/webapp/app/components/billing/UsageBar.tsx new file mode 100644 index 00000000000..e570a029e27 --- /dev/null +++ b/apps/webapp/app/components/billing/UsageBar.tsx @@ -0,0 +1,138 @@ +import { cn } from "~/utils/cn"; +import { formatCurrency } from "~/utils/numberFormatter"; +import { Paragraph } from "../primitives/Paragraph"; +import { SimpleTooltip } from "../primitives/Tooltip"; +import { motion } from "framer-motion"; + +type UsageBarProps = { + current: number; + billingLimit?: number; + tierLimit?: number; + isPaying: boolean; +}; + +const startFactor = 4; + +export function UsageBar({ current, billingLimit, tierLimit, isPaying }: UsageBarProps) { + const getLargestNumber = Math.max(current, tierLimit ?? -Infinity, billingLimit ?? -Infinity, 5); + //creates a maximum range for the progress bar, add 10% to the largest number so the bar doesn't reach the end + const maxRange = Math.round(getLargestNumber * 1.1); + const tierRunLimitPercentage = tierLimit ? Math.round((tierLimit / maxRange) * 100) : 0; + const billingLimitPercentage = + billingLimit !== undefined ? Math.round((billingLimit / maxRange) * 100) : 0; + const usagePercentage = Math.round((current / maxRange) * 100); + + //cap the usagePercentage to the freeRunLimitPercentage + const usageCappedToLimitPercentage = Math.min(usagePercentage, tierRunLimitPercentage); + + return ( +
+
+ {billingLimit !== undefined && ( + + + + )} + tierLimit ? "bg-green-700" : "bg-green-600" + )} + > + + + {tierLimit !== undefined && ( + + + + )} + +
+
+ ); +} + +const positions = { + topRow1: "bottom-0 h-9", + topRow2: "bottom-0 h-14", + bottomRow1: "top-0 h-9 items-end", + bottomRow2: "top-0 h-14 items-end", +}; + +type LegendProps = { + text: string; + value: number | string; + percentage: number; + position: keyof typeof positions; + tooltipContent?: string; +}; + +function Legend({ text, value, position, percentage, tooltipContent }: LegendProps) { + const flipLegendPositionValue = 80; + const flipLegendPosition = percentage > flipLegendPositionValue ? true : false; + return ( +
+ {tooltipContent ? ( + + {text} + {value} + + } + side="top" + content={tooltipContent} + className="z-50 h-fit" + /> + ) : ( + + {text} + {value} + + )} +
+ ); +} diff --git a/apps/webapp/app/components/code/AIQueryInput.tsx b/apps/webapp/app/components/code/AIQueryInput.tsx new file mode 100644 index 00000000000..0775ec2c2a0 --- /dev/null +++ b/apps/webapp/app/components/code/AIQueryInput.tsx @@ -0,0 +1,415 @@ +import { CheckIcon, PencilSquareIcon, PlusIcon, XMarkIcon } from "@heroicons/react/20/solid"; +import { AnimatePresence, motion } from "framer-motion"; +import { Suspense, lazy, useCallback, useEffect, useRef, useState } from "react"; +import { Button } from "~/components/primitives/Buttons"; +import { Spinner } from "~/components/primitives/Spinner"; +import { useEnvironment } from "~/hooks/useEnvironment"; +import { useOrganization } from "~/hooks/useOrganizations"; +import { useProject } from "~/hooks/useProject"; +import type { AITimeFilter } from "~/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/types"; +import { cn } from "~/utils/cn"; + +// Lazy load streamdown components to avoid SSR issues +const StreamdownRenderer = lazy(() => + import("streamdown").then((mod) => ({ + default: ({ children, isAnimating }: { children: string; isAnimating: boolean }) => ( + + {children} + + ), + })) +); + +type StreamEventType = + | { type: "thinking"; content: string } + | { type: "tool_call"; tool: string; args: unknown } + | { type: "time_filter"; filter: AITimeFilter } + | { type: "result"; success: true; query: string; timeFilter?: AITimeFilter } + | { type: "result"; success: false; error: string }; + +export type AIQueryMode = "new" | "edit"; + +interface AIQueryInputProps { + onQueryGenerated: (query: string) => void; + /** Called when the AI sets a time filter - updates URL search params */ + onTimeFilterChange?: (filter: AITimeFilter) => void; + /** Set this to a prompt to auto-populate and immediately submit */ + autoSubmitPrompt?: string; + /** Change this to force re-submission even if prompt is the same */ + autoSubmitKey?: number; + /** Get the current query in the editor 
(used for edit mode) */ + getCurrentQuery?: () => string; +} + +export function AIQueryInput({ + onQueryGenerated, + onTimeFilterChange, + autoSubmitPrompt, + autoSubmitKey, + getCurrentQuery, +}: AIQueryInputProps) { + const [prompt, setPrompt] = useState(""); + const [mode, setMode] = useState("new"); + const [isLoading, setIsLoading] = useState(false); + const [thinking, setThinking] = useState(""); + const [error, setError] = useState(null); + const [showThinking, setShowThinking] = useState(false); + const [lastResult, setLastResult] = useState<"success" | "error" | null>(null); + const textareaRef = useRef(null); + const abortControllerRef = useRef(null); + const lastAutoSubmitRef = useRef<{ prompt: string; key?: number } | null>(null); + + const organization = useOrganization(); + const project = useProject(); + const environment = useEnvironment(); + + const resourcePath = `/resources/orgs/${organization.slug}/projects/${project.slug}/env/${environment.slug}/query/ai-generate`; + + // Can only use edit mode if there's a current query + const canEdit = Boolean(getCurrentQuery?.()?.trim()); + + // If mode is edit but there's no current query, switch to new + useEffect(() => { + if (mode === "edit" && !canEdit) { + setMode("new"); + } + }, [mode, canEdit]); + + const submitQuery = useCallback( + async (queryPrompt: string, submitMode: AIQueryMode = mode) => { + if (!queryPrompt.trim() || isLoading) return; + const currentQuery = getCurrentQuery?.(); + if (submitMode === "edit" && !currentQuery?.trim()) return; + + setIsLoading(true); + setThinking(""); + setError(null); + setShowThinking(true); + setLastResult(null); + + // Abort any existing request + if (abortControllerRef.current) { + abortControllerRef.current.abort(); + } + abortControllerRef.current = new AbortController(); + + try { + const formData = new FormData(); + formData.append("prompt", queryPrompt); + formData.append("mode", submitMode); + if (submitMode === "edit" && currentQuery) { + 
formData.append("currentQuery", currentQuery); + } + + const response = await fetch(resourcePath, { + method: "POST", + body: formData, + signal: abortControllerRef.current.signal, + }); + + if (!response.ok) { + const errorData = (await response.json()) as { error?: string }; + setError(errorData.error || "Failed to generate query"); + setIsLoading(false); + setLastResult("error"); + return; + } + + const reader = response.body?.getReader(); + if (!reader) { + setError("No response stream"); + setIsLoading(false); + setLastResult("error"); + return; + } + + const decoder = new TextDecoder(); + let buffer = ""; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + + // Process complete events from buffer + const lines = buffer.split("\n\n"); + buffer = lines.pop() || ""; // Keep incomplete line in buffer + + for (const line of lines) { + if (line.startsWith("data: ")) { + try { + const event = JSON.parse(line.slice(6)) as StreamEventType; + processStreamEvent(event); + } catch { + // Ignore parse errors + } + } + } + } + + // Process any remaining data + if (buffer.startsWith("data: ")) { + try { + const event = JSON.parse(buffer.slice(6)) as StreamEventType; + processStreamEvent(event); + } catch { + // Ignore parse errors + } + } + } catch (err) { + if (err instanceof Error && err.name === "AbortError") { + // Request was aborted, ignore + return; + } + setError(err instanceof Error ? 
err.message : "An error occurred"); + setLastResult("error"); + } finally { + setIsLoading(false); + } + }, + [isLoading, resourcePath, mode, getCurrentQuery] + ); + + const processStreamEvent = useCallback( + (event: StreamEventType) => { + switch (event.type) { + case "thinking": + setThinking((prev) => prev + event.content); + break; + case "tool_call": + // Tool calls are handled silently — no UI text needed + break; + case "time_filter": + // Apply time filter immediately when the AI sets it + onTimeFilterChange?.(event.filter); + break; + case "result": + if (event.success) { + // Apply time filter if included in result (backup in case time_filter event was missed) + if (event.timeFilter) { + onTimeFilterChange?.(event.timeFilter); + } + onQueryGenerated(event.query); + setPrompt(""); + setLastResult("success"); + // Keep thinking visible to show what happened + } else { + setError(event.error); + setLastResult("error"); + } + break; + } + }, + [onQueryGenerated, onTimeFilterChange] + ); + + const handleSubmit = useCallback( + (e?: React.FormEvent) => { + e?.preventDefault(); + submitQuery(prompt); + }, + [prompt, submitQuery] + ); + + // Auto-submit when autoSubmitPrompt or autoSubmitKey changes + useEffect(() => { + if (!autoSubmitPrompt || !autoSubmitPrompt.trim() || isLoading) { + return; + } + + const last = lastAutoSubmitRef.current; + const isDifferent = + last === null || autoSubmitPrompt !== last.prompt || autoSubmitKey !== last.key; + + if (isDifferent) { + lastAutoSubmitRef.current = { prompt: autoSubmitPrompt, key: autoSubmitKey }; + setPrompt(autoSubmitPrompt); + submitQuery(autoSubmitPrompt); + } + }, [autoSubmitPrompt, autoSubmitKey, isLoading, submitQuery]); + + // Cleanup on unmount + useEffect(() => { + return () => { + if (abortControllerRef.current) { + abortControllerRef.current.abort(); + } + }; + }, []); + + // Auto-hide error after delay + useEffect(() => { + if (error) { + const timer = setTimeout(() => setError(null), 15000); + 
return () => clearTimeout(timer); + } + }, [error]); + + return ( +
+ {/* Gradient border wrapper like the schedules AI input */} +
+
+
+