diff --git a/.changeset/truncate-error-stacks.md b/.changeset/truncate-error-stacks.md new file mode 100644 index 00000000000..b39eb3ae031 --- /dev/null +++ b/.changeset/truncate-error-stacks.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Truncate large error stacks and messages to prevent OOM crashes. Stack traces are capped at 50 frames (keeping top 5 + bottom 45 with an omission notice), individual stack lines at 1024 chars, and error messages at 1000 chars. Applied in parseError, sanitizeError, and OTel span recording. diff --git a/.claude/rules/database-safety.md b/.claude/rules/database-safety.md new file mode 100644 index 00000000000..14a6523595b --- /dev/null +++ b/.claude/rules/database-safety.md @@ -0,0 +1,13 @@ +--- +paths: + - "internal-packages/database/**" +--- + +# Database Migration Safety + +- When adding indexes to **existing tables**, use `CREATE INDEX CONCURRENTLY IF NOT EXISTS` to avoid table locks. These must be in their own separate migration file (one index per file). +- Indexes on **newly created tables** (same migration as `CREATE TABLE`) do not need CONCURRENTLY. +- When indexing a **new column on an existing table**, split into two migrations: first `ADD COLUMN IF NOT EXISTS`, then `CREATE INDEX CONCURRENTLY IF NOT EXISTS` in a separate file. +- After generating a migration with Prisma, remove extraneous lines for: `_BackgroundWorkerToBackgroundWorkerFile`, `_BackgroundWorkerToTaskQueue`, `_TaskRunToTaskRunTag`, `_WaitpointRunConnections`, `_completedWaitpoints`, `SecretStore_key_idx`, and unrelated TaskRun indexes. +- Never drop columns or tables without explicit approval. +- New code should target `RunEngineVersion.V2` only. diff --git a/.claude/rules/docs-writing.md b/.claude/rules/docs-writing.md new file mode 100644 index 00000000000..bbfb471368e --- /dev/null +++ b/.claude/rules/docs-writing.md @@ -0,0 +1,14 @@ +--- +paths: + - "docs/**" +--- + +# Documentation Writing Rules + +- Use Mintlify MDX format. 
Frontmatter: `title`, `description`, `sidebarTitle` (optional). +- After creating a new page, add it to `docs.json` navigation under the correct group. +- Use Mintlify components: ``, ``, ``, ``, ``, ``, ``/``. +- Code examples should be complete and runnable where possible. +- Always import from `@trigger.dev/sdk`, never `@trigger.dev/sdk/v3`. +- Keep paragraphs short. Use headers to break up content. +- Link to related pages using relative paths (e.g., `[Tasks](/tasks/overview)`). diff --git a/.claude/rules/legacy-v3-code.md b/.claude/rules/legacy-v3-code.md new file mode 100644 index 00000000000..6fd8d9402c2 --- /dev/null +++ b/.claude/rules/legacy-v3-code.md @@ -0,0 +1,33 @@ +--- +paths: + - "apps/webapp/app/v3/**" +--- + +# Legacy V1 Engine Code in `app/v3/` + +The `v3/` directory name is misleading - most code here is actively used by the current V2 engine. Only the specific files below are legacy V1-only code. + +## V1-Only Files - Never Modify + +- `marqs/` directory (entire MarQS queue system: sharedQueueConsumer, devQueueConsumer, fairDequeuingStrategy, devPubSub) +- `legacyRunEngineWorker.server.ts` (V1 background job worker) +- `services/triggerTaskV1.server.ts` (deprecated V1 task triggering) +- `services/cancelTaskRunV1.server.ts` (deprecated V1 cancellation) +- `authenticatedSocketConnection.server.ts` (V1 dev WebSocket using DevQueueConsumer) +- `sharedSocketConnection.ts` (V1 shared queue socket using SharedQueueConsumer) + +## V1/V2 Branching Pattern + +Some services act as routers that branch on `RunEngineVersion`: +- `services/cancelTaskRun.server.ts` - calls V1 service or `engine.cancelRun()` for V2 +- `services/batchTriggerV3.server.ts` - uses marqs for V1 path, run-engine for V2 + +When editing these shared services, only modify V2 code paths. 
+ +## V2 Modern Stack + +- **Run lifecycle**: `@internal/run-engine` (internal-packages/run-engine) +- **Background jobs**: `@trigger.dev/redis-worker` (not graphile-worker/zodworker) +- **Queue operations**: RunQueue inside run-engine (not MarQS) +- **V2 engine singleton**: `runEngine.server.ts`, `runEngineHandlers.server.ts` +- **V2 workers**: `commonWorker.server.ts`, `alertsWorker.server.ts`, `batchTriggerWorker.server.ts` diff --git a/.claude/rules/sdk-packages.md b/.claude/rules/sdk-packages.md new file mode 100644 index 00000000000..549eb341809 --- /dev/null +++ b/.claude/rules/sdk-packages.md @@ -0,0 +1,12 @@ +--- +paths: + - "packages/**" +--- + +# Public Package Rules + +- Changes to `packages/` are **customer-facing**. Always add a changeset: `pnpm run changeset:add` +- Default to **patch**. Get maintainer approval for minor. Never select major without explicit approval. +- `@trigger.dev/core`: **Never import the root**. Always use subpath imports (e.g., `@trigger.dev/core/v3`). +- Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked. These are maintained in separate dedicated passes. +- Test changes using `references/hello-world` reference project. diff --git a/.claude/rules/server-apps.md b/.claude/rules/server-apps.md new file mode 100644 index 00000000000..4d46789701c --- /dev/null +++ b/.claude/rules/server-apps.md @@ -0,0 +1,23 @@ +--- +paths: + - "apps/**" +--- + +# Server App Changes + +When modifying server apps (webapp, supervisor, coordinator, etc.) with **no package changes**, add a `.server-changes/` file instead of a changeset: + +```bash +cat > .server-changes/descriptive-name.md << 'EOF' +--- +area: webapp +type: fix +--- + +Brief description of what changed and why. 
+EOF +``` + +- **area**: `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- **type**: `feature` | `fix` | `improvement` | `breaking` +- If the PR also touches `packages/`, just the changeset is sufficient (no `.server-changes/` needed). diff --git a/.claude/skills/span-timeline-events/SKILL.md b/.claude/skills/span-timeline-events/SKILL.md new file mode 100644 index 00000000000..122f49912d7 --- /dev/null +++ b/.claude/skills/span-timeline-events/SKILL.md @@ -0,0 +1,78 @@ +--- +name: span-timeline-events +description: Use when adding, modifying, or debugging OTel span timeline events in the trace view. Covers event structure, ClickHouse storage constraints, rendering in SpanTimeline component, admin visibility, and the step-by-step process for adding new events. +allowed-tools: Read, Write, Edit, Glob, Grep, Bash +--- + +# Span Timeline Events + +The trace view's right panel shows a timeline of events for the selected span. These are OTel span events rendered by `app/utils/timelineSpanEvents.ts` and the `SpanTimeline` component. + +## How They Work + +1. **Span events** in OTel are attached to a parent span. In ClickHouse, they're stored as separate rows with `kind: "SPAN_EVENT"` sharing the parent span's `span_id`. The `#mergeRecordsIntoSpanDetail` method reassembles them into the span's `events` array at query time. +2. The timeline only renders events whose `name` starts with `trigger.dev/` - all others are silently filtered out. +3. The **display name** comes from `properties.event` (not the span event name), mapped through `getFriendlyNameForEvent()`. +4. Events are shown on the **span they belong to** - events on one span don't appear in another span's timeline. + +## ClickHouse Storage Constraint + +When events are written to ClickHouse, `spanEventsToTaskEventV1Input()` filters out events whose `start_time` is not greater than the parent span's `startTime`. Events at or before the span start are silently dropped. 
This means span events must have timestamps strictly after the span's own `startTimeUnixNano`. + +## Timeline Rendering (SpanTimeline component) + +The `SpanTimeline` component in `app/components/run/RunTimeline.tsx` renders: + +1. **Events** (thin 1px line with hollow dots) - all events from `createTimelineSpanEventsFromSpanEvents()` +2. **"Started"** marker (thick cap) - at the span's `startTime` +3. **Duration bar** (thick 7px line) - from "Started" to "Finished" +4. **"Finished"** marker (thick cap) - at `startTime + duration` + +The thin line before "Started" only appears when there are events with timestamps between the span start and the first child span. For the Attempt span this works well (Dequeued -> Pod scheduled -> Launched -> etc. all happen before execution starts). Events all get `lineVariant: "light"` (thin) while the execution bar gets `variant: "normal"` (thick). + +## Trace View Sort Order + +Sibling spans (same parent) are sorted by `start_time ASC` from the ClickHouse query. The `createTreeFromFlatItems` function preserves this order. Event timestamps don't affect sort order - only the span's own `start_time`. + +## Event Structure + +```typescript +// OTel span event format +{ + name: "trigger.dev/run", // Must start with "trigger.dev/" to render + timeUnixNano: "1711200000000000000", + attributes: [ + { key: "event", value: { stringValue: "dequeue" } }, // The actual event type + { key: "duration", value: { intValue: 150 } }, // Optional: duration in ms + ] +} +``` + +## Admin-Only Events + +`getAdminOnlyForEvent()` controls visibility. Events default to **admin-only** (`true`). 
+ +| Event | Admin-only | Friendly name | +|-------|-----------|---------------| +| `dequeue` | No | Dequeued | +| `fork` | No | Launched | +| `import` | No (if no fork event) | Importing task file | +| `create_attempt` | Yes | Attempt created | +| `lazy_payload` | Yes | Lazy attempt initialized | +| `pod_scheduled` | Yes | Pod scheduled | +| (default) | Yes | (raw event name) | + +## Adding New Timeline Events + +1. Add OTLP span event with `name: "trigger.dev/"` and `properties.event: ""` +2. Event timestamp must be strictly after the parent span's `startTimeUnixNano` (ClickHouse drops earlier events) +3. Add friendly name in `getFriendlyNameForEvent()` in `app/utils/timelineSpanEvents.ts` +4. Set admin visibility in `getAdminOnlyForEvent()` +5. Optionally add help text in `getHelpTextForEvent()` + +## Key Files + +- `app/utils/timelineSpanEvents.ts` - filtering, naming, admin logic +- `app/components/run/RunTimeline.tsx` - `SpanTimeline` component (thin line + thick bar rendering) +- `app/presenters/v3/SpanPresenter.server.ts` - loads span data including events +- `app/v3/eventRepository/clickhouseEventRepository.server.ts` - `spanEventsToTaskEventV1Input()` (storage filter), `#mergeRecordsIntoSpanDetail` (reassembly) diff --git a/.cursor/rules/webapp.mdc b/.cursor/rules/webapp.mdc index a362f14fe12..f1333febdc0 100644 --- a/.cursor/rules/webapp.mdc +++ b/.cursor/rules/webapp.mdc @@ -4,7 +4,7 @@ globs: apps/webapp/**/*.tsx,apps/webapp/**/*.ts alwaysApply: false --- -The main trigger.dev webapp, which powers it's API and dashboard and makes up the docker image that is produced as an OSS image, is a Remix 2.1.0 app that uses an express server, written in TypeScript. 
The following subsystems are either included in the webapp or are used by the webapp in another part of the monorepo: +The main trigger.dev webapp, which powers its API and dashboard and makes up the docker image that is produced as an OSS image, is a Remix 2.17.4 app that uses an express server, written in TypeScript. The following subsystems are either included in the webapp or are used by the webapp in another part of the monorepo: - `@trigger.dev/database` exports a Prisma 6.14.0 client that is used extensively in the webapp to access a PostgreSQL instance. The schema file is [schema.prisma](mdc:internal-packages/database/prisma/schema.prisma) - `@trigger.dev/core` is a published package and is used to share code between the `@trigger.dev/sdk` and the webapp. It includes functionality but also a load of Zod schemas for data validation. When importing from `@trigger.dev/core` in the webapp, we never import the root `@trigger.dev/core` path, instead we favor one of the subpath exports that you can find in [package.json](mdc:packages/core/package.json) diff --git a/.cursor/rules/writing-tasks.mdc b/.cursor/rules/writing-tasks.mdc index 5116d083e23..359ed5d4733 100644 --- a/.cursor/rules/writing-tasks.mdc +++ b/.cursor/rules/writing-tasks.mdc @@ -14,7 +14,7 @@ alwaysApply: false ## Essential requirements when generating task code -1. You MUST use `@trigger.dev/sdk/v3` +1. You MUST import from `@trigger.dev/sdk` (NEVER `@trigger.dev/sdk/v3`) 2. You MUST NEVER use `client.defineJob` 3. YOU MUST `export` every task, including subtasks 4. If you are able to generate an example payload for a task, do so. @@ -53,7 +53,7 @@ Instead, you MUST ALWAYS generate ONLY this pattern: ```ts // โœ… ALWAYS GENERATE THIS EXACT PATTERN -import { task } from "@trigger.dev/sdk/v3"; +import { task } from "@trigger.dev/sdk"; //1. 
You need to export each task, even if it's a subtask export const helloWorld = task({ @@ -71,7 +71,7 @@ export const helloWorld = task({ A task is a function that can run for a long time with resilience to failure: ```ts -import { task } from "@trigger.dev/sdk/v3"; +import { task } from "@trigger.dev/sdk"; export const helloWorld = task({ id: "hello-world", @@ -271,7 +271,7 @@ Global lifecycle hooks can also be defined in `trigger.config.ts` to apply to al ## Correct Schedules task (cron) implementations ```ts -import { schedules } from "@trigger.dev/sdk/v3"; +import { schedules } from "@trigger.dev/sdk"; export const firstScheduledTask = schedules.task({ id: "first-scheduled-task", @@ -312,7 +312,7 @@ export const firstScheduledTask = schedules.task({ ### Attach a Declarative schedule ```ts -import { schedules } from "@trigger.dev/sdk/v3"; +import { schedules } from "@trigger.dev/sdk"; // Sepcify a cron pattern (UTC) export const firstScheduledTask = schedules.task({ @@ -326,7 +326,7 @@ export const firstScheduledTask = schedules.task({ ``` ```ts -import { schedules } from "@trigger.dev/sdk/v3"; +import { schedules } from "@trigger.dev/sdk"; // Specify a specific timezone like this: export const secondScheduledTask = schedules.task({ @@ -375,7 +375,7 @@ const createdSchedule = await schedules.create({ Schema tasks validate payloads against a schema before execution: ```ts -import { schemaTask } from "@trigger.dev/sdk/v3"; +import { schemaTask } from "@trigger.dev/sdk"; import { z } from "zod"; const myTask = schemaTask({ @@ -400,7 +400,7 @@ When you trigger a task from your backend code, you need to set the `TRIGGER_SEC Triggers a single run of a task with specified payload and options without importing the task. Use type-only imports for full type checking. 
```ts -import { tasks } from "@trigger.dev/sdk/v3"; +import { tasks } from "@trigger.dev/sdk"; import type { emailSequence } from "~/trigger/emails"; export async function POST(request: Request) { @@ -418,7 +418,7 @@ export async function POST(request: Request) { Triggers multiple runs of a single task with different payloads without importing the task. ```ts -import { tasks } from "@trigger.dev/sdk/v3"; +import { tasks } from "@trigger.dev/sdk"; import type { emailSequence } from "~/trigger/emails"; export async function POST(request: Request) { @@ -436,7 +436,7 @@ export async function POST(request: Request) { Triggers multiple runs of different tasks at once, useful when you need to execute multiple tasks simultaneously. ```ts -import { batch } from "@trigger.dev/sdk/v3"; +import { batch } from "@trigger.dev/sdk"; import type { myTask1, myTask2 } from "~/trigger/myTasks"; export async function POST(request: Request) { @@ -621,7 +621,7 @@ const handle = await myTask.trigger( Access metadata inside a run: ```ts -import { task, metadata } from "@trigger.dev/sdk/v3"; +import { task, metadata } from "@trigger.dev/sdk"; export const myTask = task({ id: "my-task", @@ -713,7 +713,7 @@ Trigger.dev Realtime enables subscribing to runs for real-time updates on run st Subscribe to a run after triggering a task: ```ts -import { runs, tasks } from "@trigger.dev/sdk/v3"; +import { runs, tasks } from "@trigger.dev/sdk"; async function myBackend() { const handle = await tasks.trigger("my-task", { some: "data" }); @@ -735,7 +735,7 @@ async function myBackend() { You can infer types of run's payload and output by passing the task type: ```ts -import { runs } from "@trigger.dev/sdk/v3"; +import { runs } from "@trigger.dev/sdk"; import type { myTask } from "./trigger/my-task"; for await (const run of runs.subscribeToRun(handle.id)) { @@ -752,7 +752,7 @@ for await (const run of runs.subscribeToRun(handle.id)) { Stream data in realtime from inside your tasks using the metadata system: 
```ts -import { task, metadata } from "@trigger.dev/sdk/v3"; +import { task, metadata } from "@trigger.dev/sdk"; import OpenAI from "openai"; export type STREAMS = { @@ -947,7 +947,7 @@ For most use cases, Realtime hooks are preferred over SWR hooks with polling due For client-side usage, generate a public access token with appropriate scopes: ```ts -import { auth } from "@trigger.dev/sdk/v3"; +import { auth } from "@trigger.dev/sdk"; const publicToken = await auth.createPublicToken({ scopes: { @@ -967,7 +967,7 @@ Idempotency ensures that an operation produces the same result when called multi Provide an `idempotencyKey` when triggering a task to ensure it runs only once with that key: ```ts -import { idempotencyKeys, task } from "@trigger.dev/sdk/v3"; +import { idempotencyKeys, task } from "@trigger.dev/sdk"; export const myTask = task({ id: "my-task", @@ -1058,7 +1058,7 @@ function hash(payload: any): string { ```ts // onFailure executes after all retries are exhausted; use for notifications, logging, or side effects on final failure: -import { task, logger } from "@trigger.dev/sdk/v3"; +import { task, logger } from "@trigger.dev/sdk"; export const loggingExample = task({ id: "logging-example", @@ -1078,7 +1078,7 @@ export const loggingExample = task({ The `trigger.config.ts` file configures your Trigger.dev project, specifying task locations, retry settings, telemetry, and build options. ```ts -import { defineConfig } from "@trigger.dev/sdk/v3"; +import { defineConfig } from "@trigger.dev/sdk"; export default defineConfig({ project: "", @@ -1226,7 +1226,7 @@ await myTask.trigger({ name: "Alice", age: 30 }); Before generating any code, you MUST verify: -1. Are you importing from `@trigger.dev/sdk/v3`? If not, STOP and FIX. +1. Are you importing from `@trigger.dev/sdk` (NOT `@trigger.dev/sdk/v3`)? If not, STOP and FIX. 2. Have you exported every task? If not, STOP and FIX. 3. Have you generated any DEPRECATED code patterns? If yes, STOP and FIX. 
diff --git a/.env.example b/.env.example index 35c8c976ff6..69d5acdc560 100644 --- a/.env.example +++ b/.env.example @@ -77,9 +77,28 @@ POSTHOG_PROJECT_KEY= # DEPOT_TOKEN= # DEV_OTEL_EXPORTER_OTLP_ENDPOINT="http://0.0.0.0:4318" # These are needed for the object store (for handling large payloads/outputs) -# OBJECT_STORE_BASE_URL="https://{bucket}.{accountId}.r2.cloudflarestorage.com" -# OBJECT_STORE_ACCESS_KEY_ID= -# OBJECT_STORE_SECRET_ACCESS_KEY= +# +# Default provider +# OBJECT_STORE_BASE_URL=http://localhost:9005 +# OBJECT_STORE_BUCKET=packets +# OBJECT_STORE_ACCESS_KEY_ID=minioadmin +# OBJECT_STORE_SECRET_ACCESS_KEY=minioadmin +# OBJECT_STORE_REGION=us-east-1 +# OBJECT_STORE_SERVICE=s3 +# +# OBJECT_STORE_DEFAULT_PROTOCOL=s3 # Only specify this if you're going to migrate object storage and set protocol values below +# Named providers (protocol-prefixed data) - optional for multi-provider support +# OBJECT_STORE_S3_BASE_URL=https://s3.amazonaws.com +# OBJECT_STORE_S3_ACCESS_KEY_ID= +# OBJECT_STORE_S3_SECRET_ACCESS_KEY= +# OBJECT_STORE_S3_REGION=us-east-1 +# OBJECT_STORE_S3_SERVICE=s3 +# +# OBJECT_STORE_R2_BASE_URL=https://{bucket}.{accountId}.r2.cloudflarestorage.com +# OBJECT_STORE_R2_ACCESS_KEY_ID= +# OBJECT_STORE_R2_SECRET_ACCESS_KEY= +# OBJECT_STORE_R2_REGION=auto +# OBJECT_STORE_R2_SERVICE=s3 # CHECKPOINT_THRESHOLD_IN_MS=10000 # These control the server-side internal telemetry diff --git a/.github/VOUCHED.td b/.github/VOUCHED.td index 8e06c770ab8..f9f31226363 100644 --- a/.github/VOUCHED.td +++ b/.github/VOUCHED.td @@ -11,6 +11,13 @@ myftija nicktrn samejr isshaddad +# Bots +devin-ai-integration[bot] # Outside contributors gautamsi -capaj \ No newline at end of file +capaj +chengzp +bharathkumar39293 +bhekanik +jrossi +ThullyoCunha \ No newline at end of file diff --git a/.github/workflows/changesets-pr.yml b/.github/workflows/changesets-pr.yml index e2fdc187614..1e18cd65742 100644 --- a/.github/workflows/changesets-pr.yml +++ 
b/.github/workflows/changesets-pr.yml @@ -7,6 +7,7 @@ on: paths: - "packages/**" - ".changeset/**" + - ".server-changes/**" - "package.json" - "pnpm-lock.yaml" @@ -50,7 +51,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Update PR title with version + - name: Update PR title and enhance body if: steps.changesets.outputs.published != 'true' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -61,6 +62,15 @@ jobs: # we arbitrarily reference the version of the cli package here; it is the same for all package releases VERSION=$(git show origin/changeset-release/main:packages/cli-v3/package.json | jq -r '.version') gh pr edit "$PR_NUMBER" --title "chore: release v$VERSION" + + # Enhance the PR body with a clean, deduplicated summary + RAW_BODY=$(gh pr view "$PR_NUMBER" --json body --jq '.body') + ENHANCED_BODY=$(CHANGESET_PR_BODY="$RAW_BODY" node scripts/enhance-release-pr.mjs "$VERSION") + if [ -n "$ENHANCED_BODY" ]; then + gh api repos/triggerdotdev/trigger.dev/pulls/"$PR_NUMBER" \ + -X PATCH \ + -f body="$ENHANCED_BODY" + fi fi update-lockfile: @@ -88,15 +98,58 @@ jobs: - name: Install and update lockfile run: pnpm install --no-frozen-lockfile - - name: Commit and push lockfile + - name: Clean up consumed .server-changes/ files + run: | + set -e + shopt -s nullglob + files=(.server-changes/*.md) + for f in "${files[@]}"; do + if [ "$(basename "$f")" != "README.md" ]; then + git rm --ignore-unmatch "$f" + fi + done + + - name: Commit and push lockfile + server-changes cleanup run: | set -e - if git diff --quiet pnpm-lock.yaml; then - echo "No lockfile changes" + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add pnpm-lock.yaml + if ! 
git diff --cached --quiet; then + git commit -m "chore: update lockfile and clean up .server-changes/ for release" + git push origin changeset-release/main else - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add pnpm-lock.yaml - git commit -m "chore: update lockfile for release" + echo "No changes to commit" + fi + + bump-chart-version: + name: Bump Helm chart version on release PR + runs-on: ubuntu-latest + needs: update-lockfile + permissions: + contents: write + steps: + - name: Checkout release branch + uses: actions/checkout@v4 + with: + ref: changeset-release/main + + - name: Bump Chart.yaml + run: | + set -e + VERSION=$(jq -r '.version' packages/cli-v3/package.json) + sed -i "s/^version:.*/version: ${VERSION}/" ./hosting/k8s/helm/Chart.yaml + sed -i "s/^appVersion:.*/appVersion: v${VERSION}/" ./hosting/k8s/helm/Chart.yaml + + - name: Commit and push Chart.yaml bump + run: | + set -e + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add hosting/k8s/helm/Chart.yaml + if ! 
git diff --cached --quiet; then + git commit -m "chore: bump helm chart version for release" git push origin changeset-release/main + else + echo "Chart.yaml already at target version, no-op" fi diff --git a/.github/workflows/claude-md-audit.yml b/.github/workflows/claude-md-audit.yml new file mode 100644 index 00000000000..ddba0180401 --- /dev/null +++ b/.github/workflows/claude-md-audit.yml @@ -0,0 +1,70 @@ +name: ๐Ÿ“ CLAUDE.md Audit + +on: + pull_request: + types: [opened, ready_for_review, synchronize] + paths-ignore: + - "docs/**" + - ".changeset/**" + - ".server-changes/**" + - "**/*.md" + - "references/**" + +concurrency: + group: claude-md-audit-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + audit: + if: github.event.pull_request.draft == false + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + use_sticky_comment: true + allowed_bots: "devin-ai-integration[bot]" + + claude_args: | + --max-turns 15 + --allowedTools "Read,Glob,Grep,Bash(git diff:*)" + + prompt: | + You are reviewing a PR to check whether any CLAUDE.md files or .claude/rules/ files need updating. + + ## Your task + + 1. Run `git diff origin/main...HEAD --name-only` to see which files changed in this PR. + 2. For each changed directory, check if there's a CLAUDE.md in that directory or a parent directory. + 3. Determine if any CLAUDE.md or .claude/rules/ file should be updated based on the changes. 
Consider: + - New files/directories that aren't covered by existing documentation + - Changed architecture or patterns that contradict current CLAUDE.md guidance + - New dependencies, services, or infrastructure that Claude should know about + - Renamed or moved files that are referenced in CLAUDE.md + - Changes to build commands, test patterns, or development workflows + + ## Response format + + If NO updates are needed, respond with exactly: + โœ… CLAUDE.md files look current for this PR. + + If updates ARE needed, respond with a short list: + ๐Ÿ“ **CLAUDE.md updates suggested:** + - `path/to/CLAUDE.md`: [what should be added/changed] + - `.claude/rules/file.md`: [what should be added/changed] + + Keep suggestions specific and brief. Only flag things that would actually mislead Claude in future sessions. + Do NOT suggest updates for trivial changes (bug fixes, small refactors within existing patterns). + Do NOT suggest creating new CLAUDE.md files - only updates to existing ones. diff --git a/.github/workflows/e2e-webapp.yml b/.github/workflows/e2e-webapp.yml new file mode 100644 index 00000000000..9a58aa58c7b --- /dev/null +++ b/.github/workflows/e2e-webapp.yml @@ -0,0 +1,91 @@ +name: "๐Ÿงช E2E Tests: Webapp" + +permissions: + contents: read + +on: + workflow_call: + +jobs: + e2eTests: + name: "๐Ÿงช E2E Tests: Webapp" + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + steps: + - name: ๐Ÿ”ง Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: ๐Ÿ”ง Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: ๐Ÿ”ง Restart docker daemon + run: sudo systemctl 
restart docker + + - name: โฌ‡๏ธ Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: โŽ” Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - name: โŽ” Setup node + uses: buildjet/setup-node@v4 + with: + node-version: 20.20.0 + cache: "pnpm" + + # ..to avoid rate limits when pulling images + - name: ๐Ÿณ Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: ๐Ÿณ Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: ๐Ÿณ Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + echo "Pre-pulling Docker images with authenticated session..." + docker pull postgres:14 + docker pull redis:7.2 + docker pull testcontainers/ryuk:0.11.0 + echo "Image pre-pull complete" + + - name: ๐Ÿ“ฅ Download deps + run: pnpm install --frozen-lockfile + + - name: ๐Ÿ“€ Generate Prisma Client + run: pnpm run generate + + - name: ๐Ÿ—๏ธ Build Webapp + run: pnpm run build --filter webapp + + - name: ๐Ÿงช Run Webapp E2E Tests + run: cd apps/webapp && pnpm exec vitest run --config vitest.e2e.config.ts --reporter=default + env: + WEBAPP_TEST_VERBOSE: "1" diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index dab18223e35..12da89db3b2 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -6,6 +6,7 @@ on: paths-ignore: - "docs/**" - ".changeset/**" + - "hosting/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/release-helm.yml b/.github/workflows/release-helm.yml index c6efd382ff6..6b5ae8bf0b3 100644 --- a/.github/workflows/release-helm.yml +++ b/.github/workflows/release-helm.yml @@ -4,6 +4,12 @@ on: push: tags: - 'helm-v*' + 
workflow_call: + inputs: + chart_version: + description: 'Chart version to release' + required: true + type: string workflow_dispatch: inputs: chart_version: @@ -86,8 +92,8 @@ jobs: - name: Extract version from tag or input id: version run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ github.event.inputs.chart_version }}" + if [ -n "${{ inputs.chart_version }}" ]; then + VERSION="${{ inputs.chart_version }}" else VERSION="${{ github.ref_name }}" VERSION="${VERSION#helm-v}" @@ -122,9 +128,8 @@ jobs: - name: Create GitHub Release id: release uses: softprops/action-gh-release@v1 - if: github.event_name == 'push' with: - tag_name: ${{ github.ref_name }} + tag_name: helm-v${{ steps.version.outputs.version }} name: "Helm Chart ${{ steps.version.outputs.version }}" body: | ### Installation diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3b4135ec099..bfb57061ace 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -111,7 +111,7 @@ jobs: uses: changesets/action@v1 with: publish: pnpm run changeset:release - createGithubReleases: true + createGithubReleases: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -122,6 +122,19 @@ jobs: package_version=$(echo '${{ steps.changesets.outputs.publishedPackages }}' | jq -r '.[0].version') echo "package_version=${package_version}" >> "$GITHUB_OUTPUT" + - name: Create unified GitHub release + if: steps.changesets.outputs.published == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_PR_BODY: ${{ github.event.pull_request.body }} + run: | + VERSION="${{ steps.get_version.outputs.package_version }}" + node scripts/generate-github-release.mjs "$VERSION" > /tmp/release-body.md + gh release create "v${VERSION}" \ + --title "trigger.dev v${VERSION}" \ + --notes-file /tmp/release-body.md \ + --target main + - name: Create and push Docker tag if: steps.changesets.outputs.published == 'true' run: | @@ -129,6 +142,13 @@ jobs: git 
tag "v.docker.${{ steps.get_version.outputs.package_version }}" git push origin "v.docker.${{ steps.get_version.outputs.package_version }}" + - name: Create and push Helm chart tag + if: steps.changesets.outputs.published == 'true' + run: | + set -e + git tag "helm-v${{ steps.get_version.outputs.package_version }}" + git push origin "helm-v${{ steps.get_version.outputs.package_version }}" + # Trigger Docker builds directly via workflow_call since tags pushed with # GITHUB_TOKEN don't trigger other workflows (GitHub Actions limitation). publish-docker: @@ -140,6 +160,77 @@ jobs: with: image_tag: v${{ needs.release.outputs.published_package_version }} + # Trigger Helm chart release directly via workflow_call (same GITHUB_TOKEN + # limitation as the Docker path). Runs after Docker images are published so + # the chart never references images that don't exist yet. + publish-helm: + name: ๐Ÿงญ Publish Helm chart + needs: [release, publish-docker] + if: needs.release.outputs.published == 'true' + permissions: + contents: write + packages: write + uses: ./.github/workflows/release-helm.yml + secrets: inherit + with: + chart_version: ${{ needs.release.outputs.published_package_version }} + + # After Docker images are published, update the GitHub release with the exact GHCR tag URL. + # The GHCR package version ID is only known after the image is pushed, so we query for it here. 
+ update-release: + name: ๐Ÿ”— Update release Docker link + needs: [release, publish-docker] + if: needs.release.outputs.published == 'true' + runs-on: ubuntu-latest + permissions: + contents: write + packages: read + steps: + - name: Update GitHub release with Docker image link + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -e + VERSION="${{ needs.release.outputs.published_package_version }}" + TAG="v${VERSION}" + + # Query GHCR for the version ID matching this tag + VERSION_ID=$(gh api --paginate -H "Accept: application/vnd.github+json" \ + /orgs/triggerdotdev/packages/container/trigger.dev/versions \ + --jq ".[] | select(.metadata.container.tags[] == \"${TAG}\") | .id" \ + | head -1) + + if [ -z "$VERSION_ID" ]; then + echo "Warning: Could not find GHCR version ID for tag ${TAG}, skipping update" + exit 0 + fi + + DOCKER_URL="https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev/${VERSION_ID}?tag=${TAG}" + GENERIC_URL="https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev" + + # Get current release body and replace the generic link with the tag-specific one. + # Use word boundary after GENERIC_URL (closing paren) to avoid matching URLs that + # already have a version ID appended (idempotent on re-runs). + gh release view "${TAG}" --repo triggerdotdev/trigger.dev --json body --jq '.body' > /tmp/release-body.md + sed -i "s|${GENERIC_URL})|${DOCKER_URL})|g" /tmp/release-body.md + + gh release edit "${TAG}" --repo triggerdotdev/trigger.dev --notes-file /tmp/release-body.md + + # Dispatch changelog entry creation to the marketing site repo. + # Runs after update-release so the GitHub release body already has the exact Docker image URL. 
+ dispatch-changelog: + name: ๐Ÿ“ Dispatch changelog PR + needs: [release, update-release] + if: needs.release.outputs.published == 'true' + runs-on: ubuntu-latest + steps: + - uses: peter-evans/repository-dispatch@v3 + with: + token: ${{ secrets.CROSS_REPO_PAT }} + repository: triggerdotdev/trigger.dev-site-v3 + event-type: new-release + client-payload: '{"version": "${{ needs.release.outputs.published_package_version }}"}' + # The prerelease job needs to be on the same workflow file due to a limitation related to how npm verifies OIDC claims. prerelease: name: ๐Ÿงช Prerelease diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7c90a5a30ad..2c4276a5aa0 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -10,6 +10,9 @@ jobs: webapp: uses: ./.github/workflows/unit-tests-webapp.yml secrets: inherit + e2e-webapp: + uses: ./.github/workflows/e2e-webapp.yml + secrets: inherit packages: uses: ./.github/workflows/unit-tests-packages.yml secrets: inherit diff --git a/.github/workflows/vouch-check-pr.yml b/.github/workflows/vouch-check-pr.yml index a2f4c6d1b6b..2109b7d3289 100644 --- a/.github/workflows/vouch-check-pr.yml +++ b/.github/workflows/vouch-check-pr.yml @@ -10,14 +10,36 @@ permissions: issues: read jobs: - check-pr: + check-vouch: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: mitchellh/vouch/action/check-pr@main + - uses: mitchellh/vouch/action/check-pr@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 with: pr-number: ${{ github.event.pull_request.number }} auto-close: true require-vouch: true env: GH_TOKEN: ${{ github.token }} + + require-draft: + needs: check-vouch + if: > + github.event.pull_request.draft == false && + github.event.pull_request.author_association != 'MEMBER' && + github.event.pull_request.author_association != 'OWNER' && + github.event.pull_request.author_association != 'COLLABORATOR' && + github.event.pull_request.user.login != 
'devin-ai-integration[bot]' + runs-on: ubuntu-latest + steps: + - name: Close non-draft PR + env: + GH_TOKEN: ${{ github.token }} + run: | + STATE=$(gh pr view ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --json state -q '.state') + if [ "$STATE" != "OPEN" ]; then + echo "PR is already closed, skipping." + exit 0 + fi + gh pr close ${{ github.event.pull_request.number }} \ + --repo ${{ github.repository }} \ + --comment "Thanks for your contribution! We require all external PRs to be opened in **draft** status first so you can address CodeRabbit review comments and ensure CI passes before requesting a review. Please re-open this PR as a draft. See [CONTRIBUTING.md](https://github.com/${{ github.repository }}/blob/main/CONTRIBUTING.md#pr-workflow) for details." diff --git a/.github/workflows/vouch-manage-by-issue.yml b/.github/workflows/vouch-manage-by-issue.yml index 36de055752f..51bce367b3e 100644 --- a/.github/workflows/vouch-manage-by-issue.yml +++ b/.github/workflows/vouch-manage-by-issue.yml @@ -16,8 +16,7 @@ jobs: contains(github.event.comment.body, 'denounce') || contains(github.event.comment.body, 'unvouch') steps: - - uses: actions/checkout@v4 - - uses: mitchellh/vouch/action/manage-by-issue@main + - uses: mitchellh/vouch/action/manage-by-issue@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 with: comment-id: ${{ github.event.comment.id }} issue-id: ${{ github.event.issue.number }} diff --git a/.gitignore b/.gitignore index 071b9b59035..5f6adddba0a 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,5 @@ apps/**/public/build **/.claude/settings.local.json .mcp.log .mcp.json -.cursor/debug.log \ No newline at end of file +.cursor/debug.log +ailogger-output.log \ No newline at end of file diff --git a/.server-changes/.gitkeep b/.server-changes/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.server-changes/README.md b/.server-changes/README.md new file mode 100644 index 00000000000..82716de981c --- 
/dev/null +++ b/.server-changes/README.md @@ -0,0 +1,81 @@ +# Server Changes + +This directory tracks changes to server-only components (webapp, supervisor, coordinator, etc.) that are not captured by changesets. Changesets only track published npm packages โ€” server changes would otherwise go undocumented. + +## When to add a file + +**Server-only PRs**: If your PR only changes `apps/webapp/`, `apps/supervisor/`, `apps/coordinator/`, or other server components (and does NOT change anything in `packages/`), add a `.server-changes/` file. + +**Mixed PRs** (both packages and server): Just add a changeset as usual. No `.server-changes/` file needed โ€” the changeset covers it. + +**Package-only PRs**: Just add a changeset as usual. + +## File format + +Create a markdown file with a descriptive name: + +``` +.server-changes/fix-batch-queue-stalls.md +``` + +With this format: + +```markdown +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +``` + +### Fields + +- **area** (required): `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- **type** (required): `feature` | `fix` | `improvement` | `breaking` + +### Description + +The body text (below the frontmatter) is a one-line description of the change. Keep it concise โ€” it will appear in release notes. + +## Lifecycle + +1. Engineer adds a `.server-changes/` file in their PR +2. Files accumulate on `main` as PRs merge +3. The changeset release PR includes these in its summary +4. 
After the release merges, CI cleans up the consumed files + +## Examples + +**New feature:** + +```markdown +--- +area: webapp +type: feature +--- + +TRQL query language and the Query page +``` + +**Bug fix:** + +```markdown +--- +area: webapp +type: fix +--- + +Fix schedule limit counting for orgs with custom limits +``` + +**Improvement:** + +```markdown +--- +area: webapp +type: improvement +--- + +Use the replica for API auth queries to reduce primary load +``` diff --git a/.server-changes/admin-back-office-rate-limit.md b/.server-changes/admin-back-office-rate-limit.md new file mode 100644 index 00000000000..da6835f0c5e --- /dev/null +++ b/.server-changes/admin-back-office-rate-limit.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add a "Back office" tab to `/admin` and a per-organization detail page at `/admin/back-office/orgs/:orgId`. The first action available on that page is editing the org's API rate limit: admins can save a `tokenBucket` override (refill rate, interval, max tokens) and see a plain-English preview of the resulting sustained rate and burst allowance. Writes are audit-logged via the server logger. diff --git a/.server-changes/admin-global-flags-warning.md b/.server-changes/admin-global-flags-warning.md new file mode 100644 index 00000000000..f0af91b143b --- /dev/null +++ b/.server-changes/admin-global-flags-warning.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Make it clear in the admin that feature flags are global and should rarely be changed. diff --git a/.server-changes/admin-workers-endpoint.md b/.server-changes/admin-workers-endpoint.md new file mode 100644 index 00000000000..34cd6ad70e3 --- /dev/null +++ b/.server-changes/admin-workers-endpoint.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Admin worker groups API: add GET loader and expose more fields on POST. 
diff --git a/.server-changes/batch-fast-fail-queue-size-limit.md b/.server-changes/batch-fast-fail-queue-size-limit.md new file mode 100644 index 00000000000..77b926a5a80 --- /dev/null +++ b/.server-changes/batch-fast-fail-queue-size-limit.md @@ -0,0 +1,7 @@ +--- +area: webapp +type: fix +--- + +Batch items that hit the environment queue size limit now fast-fail without +retries and without creating pre-failed TaskRuns. diff --git a/.server-changes/cancel-dequeued-runs.md b/.server-changes/cancel-dequeued-runs.md new file mode 100644 index 00000000000..4e393411010 --- /dev/null +++ b/.server-changes/cancel-dequeued-runs.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Show the cancel button in the runs list for runs in `DEQUEUED` status. `DEQUEUED` was missing from `NON_FINAL_RUN_STATUSES` so the list hid the button even though the single run page allowed it. diff --git a/.server-changes/deprecate-v3-cli-deploys.md b/.server-changes/deprecate-v3-cli-deploys.md new file mode 100644 index 00000000000..72040b4c5ed --- /dev/null +++ b/.server-changes/deprecate-v3-cli-deploys.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: breaking +--- + +Add server-side deprecation gate for deploys from v3 CLI versions (gated by `DEPRECATE_V3_CLI_DEPLOYS_ENABLED`). v4 CLI deploys are unaffected. diff --git a/.server-changes/dev-cli-disconnect.md b/.server-changes/dev-cli-disconnect.md new file mode 100644 index 00000000000..a0790d70765 --- /dev/null +++ b/.server-changes/dev-cli-disconnect.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Added `/engine/v1/dev/disconnect` endpoint to auto-cancel runs when the CLI disconnects. Maximum of 500 runs can be cancelled. Uses the bulk action system when there are more than 25 runs to cancel. 
\ No newline at end of file diff --git a/.server-changes/fix-realtime-fetch-signal-leak.md b/.server-changes/fix-realtime-fetch-signal-leak.md new file mode 100644 index 00000000000..ac681a301f6 --- /dev/null +++ b/.server-changes/fix-realtime-fetch-signal-leak.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix RSS memory leak in the realtime proxy routes. `/realtime/v1/runs`, `/realtime/v1/runs/:id`, and `/realtime/v1/batches/:id` called `fetch()` into Electric with no abort signal, so when a client disconnected mid long-poll, undici kept the upstream socket open and buffered response chunks that would never be consumed โ€” retained only in RSS, invisible to V8 heap tooling. Thread `getRequestAbortSignal()` through `RealtimeClient.streamRun/streamRuns/streamBatch` to `longPollingFetch` and cancel the upstream body in the error path. Isolated reproducer showed ~44 KB retained per leaked request; signal propagation releases it cleanly. diff --git a/.server-changes/fix-sse-memory-leak.md b/.server-changes/fix-sse-memory-leak.md new file mode 100644 index 00000000000..e2b9ddd1810 --- /dev/null +++ b/.server-changes/fix-sse-memory-leak.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix memory leak where every aborted SSE connection pinned the full request/response graph on Node 20, caused by `AbortSignal.any()` in `sse.ts` retaining its source signals indefinitely (see nodejs/node#54614, nodejs/node#55351). Also clear the `setTimeout(abort)` timer in `entry.server.tsx` so successful HTML renders don't pin the React tree for 30s per request. diff --git a/.server-changes/getEntitlement-swr-cache.md b/.server-changes/getEntitlement-swr-cache.md new file mode 100644 index 00000000000..1c9c887a33d --- /dev/null +++ b/.server-changes/getEntitlement-swr-cache.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Add 60s fresh / 60s stale SWR cache to `getEntitlement` in `platform.v3.server.ts`. 
Eliminates a synchronous billing-service HTTP round trip on every trigger. Reuses the existing `platformCache` (LRU memory + Redis) pattern already used for `limits` and `usage`. Cache key is `${orgId}`. Errors return a permissive `{ hasAccess: true }` fallback (existing behavior) and are also cached to prevent thundering-herd on billing outages. diff --git a/.server-changes/highlight-microvm-regions.md b/.server-changes/highlight-microvm-regions.md new file mode 100644 index 00000000000..0d5139f93fb --- /dev/null +++ b/.server-changes/highlight-microvm-regions.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Show a `MicroVM` badge next to the region name on the regions page. diff --git a/.server-changes/increase-default-project-limit.md b/.server-changes/increase-default-project-limit.md new file mode 100644 index 00000000000..f24ba53ca0b --- /dev/null +++ b/.server-changes/increase-default-project-limit.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Increase default maximum project count per organization from 10 to 25 diff --git a/.server-changes/merge-dequeue-snapshot-into-transaction.md b/.server-changes/merge-dequeue-snapshot-into-transaction.md new file mode 100644 index 00000000000..62c9a0ec6ca --- /dev/null +++ b/.server-changes/merge-dequeue-snapshot-into-transaction.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Merge execution snapshot creation into the dequeue taskRun.update transaction, reducing 2 DB commits to 1 per dequeue operation diff --git a/.server-changes/nodejs-heap-metrics.md b/.server-changes/nodejs-heap-metrics.md new file mode 100644 index 00000000000..bb82fcca99a --- /dev/null +++ b/.server-changes/nodejs-heap-metrics.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Add per-worker Node.js heap metrics to the OTel meter โ€” `nodejs.memory.heap.used`, `nodejs.memory.heap.total`, `nodejs.memory.heap.limit`, `nodejs.memory.external`, `nodejs.memory.array_buffers`, 
`nodejs.memory.rss`. Host-metrics only publishes RSS, which overstates V8 heap by the external + native footprint; these give direct heap visibility per cluster worker so `NODE_MAX_OLD_SPACE_SIZE` can be sized against observed heap peaks rather than RSS. diff --git a/.server-changes/prisma-span-datasource-attribute.md b/.server-changes/prisma-span-datasource-attribute.md new file mode 100644 index 00000000000..86507b89790 --- /dev/null +++ b/.server-changes/prisma-span-datasource-attribute.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Tag Prisma spans with `db.datasource: "writer" | "replica"` so monitors and trace queries can distinguish the writer pool from the replica pool. Applies to all `prisma:engine:*` spans (including `prisma:engine:connection` used by the connection-pool monitors) and the outer `prisma:client:operation` span. diff --git a/.server-changes/read-replica-snapshots-since.md b/.server-changes/read-replica-snapshots-since.md new file mode 100644 index 00000000000..24f4f070c7d --- /dev/null +++ b/.server-changes/read-replica-snapshots-since.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Add `RUN_ENGINE_READ_REPLICA_SNAPSHOTS_SINCE_ENABLED` flag (default off) to route the Prisma reads inside `RunEngine.getSnapshotsSince` through the read-only replica client. Offloads the snapshot polling queries (fired by every running task runner) from the primary. When disabled, behavior is unchanged. diff --git a/.server-changes/realtime-redis-connection-leak.md b/.server-changes/realtime-redis-connection-leak.md new file mode 100644 index 00000000000..e27b200174e --- /dev/null +++ b/.server-changes/realtime-redis-connection-leak.md @@ -0,0 +1,10 @@ +--- +area: webapp +type: fix +--- + +Fix Redis connection leak in realtime streams and broken abort signal propagation. 
+ +**Redis connections**: Non-blocking methods (ingestData, appendPart, getLastChunkIndex) now share a single Redis connection instead of creating one per request. streamResponse still uses dedicated connections (required for XREAD BLOCK) but now tears them down immediately via disconnect() instead of graceful quit(), with a 15s inactivity fallback. + +**Abort signal**: request.signal is broken in Remix/Express due to a Node.js undici GC bug (nodejs/node#55428) that severs the signal chain when Remix clones the Request internally. Added getRequestAbortSignal() wired to Express res.on("close") via httpAsyncStorage, which fires reliably on client disconnect. All SSE/streaming routes updated to use it. diff --git a/.server-changes/revoked-api-key-grace-period.md b/.server-changes/revoked-api-key-grace-period.md new file mode 100644 index 00000000000..df8727295ea --- /dev/null +++ b/.server-changes/revoked-api-key-grace-period.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Regenerating a RuntimeEnvironment API key no longer invalidates the previous key immediately. The old key is recorded in a new `RevokedApiKey` table with a 24 hour grace window, and `findEnvironmentByApiKey` falls back to it when the submitted key doesn't match any live environment. The grace window can be ended early (or extended) by updating `expiresAt` on the row. diff --git a/.server-changes/span-accessory-text-guard.md b/.server-changes/span-accessory-text-guard.md new file mode 100644 index 00000000000..ab668efd17a --- /dev/null +++ b/.server-changes/span-accessory-text-guard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Prevent dashboard crash (React error #31) when span accessory item text is not a string. Filters out malformed accessory items in SpanCodePathAccessory instead of passing objects to React as children. 
diff --git a/.server-changes/stop-creating-taskruntag-records.md b/.server-changes/stop-creating-taskruntag-records.md new file mode 100644 index 00000000000..0b068d3c3ac --- /dev/null +++ b/.server-changes/stop-creating-taskruntag-records.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Stop creating TaskRunTag records and _TaskRunToTaskRunTag join table entries during task triggering. The denormalized runTags string array on TaskRun already stores tag names, making the M2M relation redundant write overhead. diff --git a/.server-changes/supervisor-pod-dns-ndots.md b/.server-changes/supervisor-pod-dns-ndots.md new file mode 100644 index 00000000000..48d3563f146 --- /dev/null +++ b/.server-changes/supervisor-pod-dns-ndots.md @@ -0,0 +1,8 @@ +--- +area: supervisor +type: feature +--- + +Add `KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED` flag (off by default) that overrides the cluster default and sets `dnsConfig.options.ndots` on runner pods (defaulting to 2, configurable via `KUBERNETES_POD_DNS_NDOTS`). Kubernetes defaults pods to `ndots: 5`, so any name with fewer than 5 dots โ€” including typical external domains like `api.example.com` โ€” is first walked through every entry in the cluster search list (`.svc.cluster.local`, `svc.cluster.local`, `cluster.local`) before being tried as-is, turning one resolution into 4+ CoreDNS queries (ร—2 with A+AAAA). Using a lower `ndots` value reduces DNS query amplification in the `cluster.local` zone. + +Note: before enabling, make sure no code path relies on search-list expansion for names with dots โ‰ฅ the configured value โ€” those names will hit their as-is form first and could resolve externally before falling back to the cluster search path. 
diff --git a/.server-changes/task-identifier-registry.md b/.server-changes/task-identifier-registry.md new file mode 100644 index 00000000000..327e188de21 --- /dev/null +++ b/.server-changes/task-identifier-registry.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Replace the expensive DISTINCT query for task filter dropdowns with a dedicated TaskIdentifier registry table backed by Redis. Environments migrate automatically on their next deploy, with a transparent fallback to the legacy query for unmigrated environments. Also fixes duplicate dropdown entries when a task changes trigger source, and adds active/archived grouping for removed tasks. Moves BackgroundWorkerTask reads in the trigger hot path to the read replica. diff --git a/.server-changes/upgrade-remix-security.md b/.server-changes/upgrade-remix-security.md new file mode 100644 index 00000000000..cfb19bacf58 --- /dev/null +++ b/.server-changes/upgrade-remix-security.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Upgrade Remix packages from 2.1.0 to 2.17.4 to address security vulnerabilities in React Router diff --git a/.server-changes/vercel-auto-promote-toggle.md b/.server-changes/vercel-auto-promote-toggle.md new file mode 100644 index 00000000000..bb5f25a21c1 --- /dev/null +++ b/.server-changes/vercel-auto-promote-toggle.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Vercel integration option to disable auto promotions diff --git a/.server-changes/vercel-settings-fix-and-onboarding-improvements.md b/.server-changes/vercel-settings-fix-and-onboarding-improvements.md new file mode 100644 index 00000000000..a78a9012432 --- /dev/null +++ b/.server-changes/vercel-settings-fix-and-onboarding-improvements.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix Vercel integration settings page (remove redundant section toggles) and improve the Vercel onboarding flow so the modal closes after connecting a GitHub repo and the marketplace `next` URL is preserved 
across the GitHub app install redirect. diff --git a/.vouch.yml b/.vouch.yml new file mode 100644 index 00000000000..228b51ab2fe --- /dev/null +++ b/.vouch.yml @@ -0,0 +1,3 @@ +vouch: + - github: edosrecki + - github: GautamBytes diff --git a/CHANGESETS.md b/CHANGESETS.md index 722fe64eb4c..2e225b9ad34 100644 --- a/CHANGESETS.md +++ b/CHANGESETS.md @@ -1,24 +1,49 @@ -# Changesets +# Changesets and Server Changes -Trigger.dev uses [changesets](https://github.com/changesets/changesets) to manage updated our packages and releasing them to npm. +Trigger.dev uses [changesets](https://github.com/changesets/changesets) to manage package versions and releasing them to npm. For server-only changes, we use a lightweight `.server-changes/` convention. -## Adding a changeset +## Adding a changeset (package changes) To add a changeset, use `pnpm run changeset:add` and follow the instructions [here](https://github.com/changesets/changesets/blob/main/docs/adding-a-changeset.md). Please only ever select one of our public packages when adding a changeset. -## Release instructions (local only) +## Adding a server change (server-only changes) -Based on the instructions [here](https://github.com/changesets/changesets/blob/main/docs/intro-to-using-changesets.md) +If your PR only changes server components (`apps/webapp/`, `apps/supervisor/`, etc.) and does NOT change any published packages, add a `.server-changes/` file instead of a changeset: -1. Run `pnpm run changeset:version` -2. Run `pnpm run changeset:release` +```sh +cat > .server-changes/fix-batch-queue-stalls.md << 'EOF' +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +EOF +``` + +- `area`: `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- `type`: `feature` | `fix` | `improvement` | `breaking` + +For **mixed PRs** (both packages and server): just add a changeset. No `.server-changes/` file needed. 
+ +See `.server-changes/README.md` for full documentation. + +## When to add which + +| PR changes | What to add | +|---|---| +| Only packages (`packages/`) | Changeset (`pnpm run changeset:add`) | +| Only server (`apps/`) | `.server-changes/` file | +| Both packages and server | Just the changeset | ## Release instructions (CI) Please follow the best-practice of adding changesets in the same commit as the code making the change with `pnpm run changeset:add`, as it will allow our release.yml CI workflow to function properly: -- Anytime new changesets are added in a commit in the `main` branch, the [release.yml](./.github/workflows/release.yml) workflow will run and will automatically create/update a PR with a fresh run of `pnpm run changeset:version`. -- When the version PR is merged into `main`, the release.yml workflow will automatically run `pnpm run changeset:release` to build and release packages to npm. +- Anytime new changesets are added in a commit in the `main` branch, the [changesets-pr.yml](./.github/workflows/changesets-pr.yml) workflow will run and will automatically create/update a PR with a fresh run of `pnpm run changeset:version`. +- The release PR body is automatically enhanced with a clean, deduplicated summary that includes both package changes and `.server-changes/` entries. +- Consumed `.server-changes/` files are removed on the `changeset-release/main` branch โ€” the same way changesets deletes `.changeset/*.md` files. When the release PR merges, they're gone from main. +- When the version PR is merged into `main`, the [release.yml](./.github/workflows/release.yml) workflow will automatically build, release packages to npm, and create a single unified GitHub release. ## Pre-release instructions diff --git a/CLAUDE.md b/CLAUDE.md index acc59359707..79d931a4548 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,73 +1,72 @@ # CLAUDE.md -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
+This file provides guidance to Claude Code when working with this repository. Subdirectory CLAUDE.md files provide deeper context when you navigate into specific areas. ## Build and Development Commands This is a pnpm 10.23.0 monorepo using Turborepo. Run commands from root with `pnpm run`. -### Essential Commands - ```bash -# Start Docker services (PostgreSQL, Redis, Electric) -pnpm run docker - -# Run database migrations -pnpm run db:migrate - -# Seed the database (required for reference projects) -pnpm run db:seed +pnpm run docker # Start Docker services (PostgreSQL, Redis, Electric) +pnpm run db:migrate # Run database migrations +pnpm run db:seed # Seed the database (required for reference projects) # Build packages (required before running) pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk -# Run webapp in development mode (http://localhost:3030) -pnpm run dev --filter webapp +pnpm run dev --filter webapp # Run webapp (http://localhost:3030) +pnpm run dev --filter trigger.dev --filter "@trigger.dev/*" # Watch CLI and packages +``` + +### Verifying Changes + +The verification command depends on where the change lives: + +- **Apps and internal packages** (`apps/*`, `internal-packages/*`): Use `typecheck`. **Never use `build`** for these โ€” building proves almost nothing about correctness. +- **Public packages** (`packages/*`): Use `build`. -# Build and watch for changes (CLI and packages) -pnpm run dev --filter trigger.dev --filter "@trigger.dev/*" +```bash +# Apps and internal packages โ€” use typecheck +pnpm run typecheck --filter webapp # ~1-2 minutes +pnpm run typecheck --filter @internal/run-engine + +# Public packages โ€” use build +pnpm run build --filter @trigger.dev/sdk +pnpm run build --filter @trigger.dev/core ``` -### Testing +Only run typecheck/build after major changes (new files, significant refactors, schema changes). For small edits, trust the types and let CI catch issues. 
+ +## Testing We use vitest exclusively. **Never mock anything** - use testcontainers instead. ```bash -# Run all tests for a package -pnpm run test --filter webapp - -# Run a single test file (preferred - cd into directory first) +pnpm run test --filter webapp # All tests for a package cd internal-packages/run-engine -pnpm run test ./src/engine/tests/ttl.test.ts --run - -# May need to build dependencies first -pnpm run build --filter @internal/run-engine +pnpm run test ./src/engine/tests/ttl.test.ts --run # Single test file +pnpm run build --filter @internal/run-engine # May need to build deps first ``` -Test files go next to source files (e.g., `MyService.ts` โ†’ `MyService.test.ts`). +Test files go next to source files (e.g., `MyService.ts` -> `MyService.test.ts`). -#### Testcontainers for Redis/PostgreSQL +### Testcontainers for Redis/PostgreSQL ```typescript import { redisTest, postgresTest, containerTest } from "@internal/testcontainers"; -// Redis only redisTest("should use redis", async ({ redisOptions }) => { /* ... */ }); - -// PostgreSQL only postgresTest("should use postgres", async ({ prisma }) => { /* ... */ }); - -// Both Redis and PostgreSQL containerTest("should use both", async ({ prisma, redisOptions }) => { /* ... */ }); ``` -### Changesets +## Changesets and Server Changes When modifying any public package (`packages/*` or `integrations/*`), add a changeset: @@ -77,227 +76,172 @@ pnpm run changeset:add - Default to **patch** for bug fixes and minor changes - Confirm with maintainers before selecting **minor** (new features) -- **Never** select major (breaking changes) without explicit approval +- **Never** select major without explicit approval -## Architecture Overview +When modifying only server components (`apps/webapp/`, `apps/supervisor/`, etc.) with no package changes, add a `.server-changes/` file instead. See `.server-changes/README.md` for format and documentation. 
-### Apps +## Dependency Pinning -- **apps/webapp**: Remix 2.1.0 app - main API, dashboard, and Docker image. Uses Express server. -- **apps/supervisor**: Node.js app handling task execution, interfacing with Docker/Kubernetes. +Zod is pinned to a single version across the entire monorepo (currently `3.25.76`). When adding zod to a new or existing package, use the **exact same version** as the rest of the repo - never a different version or a range. Mismatched zod versions cause runtime type incompatibilities (e.g., schemas from one package can't be used as body validators in another). -### Public Packages +## Architecture Overview -- **packages/trigger-sdk** (`@trigger.dev/sdk`): Main SDK -- **packages/cli-v3** (`trigger.dev`): CLI package -- **packages/core** (`@trigger.dev/core`): Shared code between SDK and webapp. Import subpaths only (never root). -- **packages/build**: Build extensions and types -- **packages/react-hooks**: React hooks for realtime and triggering -- **packages/redis-worker** (`@trigger.dev/redis-worker`): Custom Redis-based background job system +### Request Flow -### Internal Packages +User API call -> Webapp routes -> Services -> RunEngine -> Redis Queue -> Supervisor -> Container execution -> Results back through RunEngine -> ClickHouse (analytics) + PostgreSQL (state) -- **internal-packages/database** (`@trigger.dev/database`): Prisma 6.14.0 client and schema -- **internal-packages/clickhouse** (`@internal/clickhouse`): ClickHouse client and schema migrations -- **internal-packages/run-engine** (`@internal/run-engine`): "Run Engine 2.0" - run lifecycle management -- **internal-packages/redis** (`@internal/redis`): Redis client creation utilities -- **internal-packages/testcontainers** (`@internal/testcontainers`): Test helpers for Redis/PostgreSQL containers -- **internal-packages/zodworker** (`@internal/zodworker`): Graphile-worker wrapper (being replaced by redis-worker) +### Apps -### Reference Projects +- **apps/webapp**: Remix 
2.17.4 app - main API, dashboard, orchestration. Uses Express server. +- **apps/supervisor**: Manages task execution containers (Docker/Kubernetes). -The `references/` directory contains test workspaces for developing and testing new SDK and platform features. Use these projects (e.g., `references/hello-world`) to manually test changes to the CLI, SDK, core packages, and webapp before submitting PRs. +### Public Packages -## Webapp Development +- **packages/trigger-sdk** (`@trigger.dev/sdk`): Main SDK for writing tasks +- **packages/cli-v3** (`trigger.dev`): CLI - also bundles code that goes into customer task images +- **packages/core** (`@trigger.dev/core`): Shared types. **Import subpaths only** (never root). +- **packages/build** (`@trigger.dev/build`): Build extensions and types +- **packages/react-hooks**: React hooks for realtime and triggering +- **packages/redis-worker** (`@trigger.dev/redis-worker`): Redis-based background job system -### Key Locations +### Internal Packages -- Trigger API: `apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts` -- Batch trigger: `apps/webapp/app/routes/api.v1.tasks.batch.ts` -- Prisma setup: `apps/webapp/app/db.server.ts` -- Run engine config: `apps/webapp/app/v3/runEngine.server.ts` -- Services: `apps/webapp/app/v3/services/**/*.server.ts` -- Presenters: `apps/webapp/app/v3/presenters/**/*.server.ts` -- OTEL endpoints: `apps/webapp/app/routes/otel.v1.logs.ts`, `otel.v1.traces.ts` +- **internal-packages/database**: Prisma 6.14.0 client and schema (PostgreSQL) +- **internal-packages/clickhouse**: ClickHouse client, schema migrations, analytics queries +- **internal-packages/run-engine**: "Run Engine 2.0" - core run lifecycle management +- **internal-packages/redis**: Redis client creation utilities (ioredis) +- **internal-packages/testcontainers**: Test helpers for Redis/PostgreSQL containers +- **internal-packages/schedule-engine**: Durable cron scheduling +- **internal-packages/zodworker**: Graphile-worker wrapper 
(DEPRECATED - use redis-worker) -### Environment Variables +### Legacy V1 Engine Code -Access via `env` export from `apps/webapp/app/env.server.ts`, never `process.env` directly. +The `apps/webapp/app/v3/` directory name is misleading - most code there is actively used by V2. Only specific files are V1-only legacy (MarQS queue, triggerTaskV1, cancelTaskRunV1, etc.). See `apps/webapp/CLAUDE.md` for the exact list. When you encounter V1/V2 branching in services, only modify V2 code paths. All new work uses Run Engine 2.0 (`@internal/run-engine`) and redis-worker. -For testable code, **never import env.server.ts** in test files. Pass configuration as options instead. Example pattern: +### Documentation -- `realtimeClient.server.ts` (testable service) -- `realtimeClientGlobal.server.ts` (configuration) +Docs live in `docs/` as a Mintlify site (MDX format). See `docs/CLAUDE.md` for conventions. -### Legacy vs Run Engine 2.0 +### Reference Projects -The codebase is transitioning from the "legacy run engine" (spread across codebase) to "Run Engine 2.0" (`@internal/run-engine`). Focus on Run Engine 2.0 for new work. +The `references/` directory contains test workspaces for testing SDK and platform features. Use `references/hello-world` to manually test changes before submitting PRs. ## Docker Image Guidelines -When updating Docker image references in `docker/Dockerfile` or other container files: +When updating Docker image references: - **Always use multiplatform/index digests**, not architecture-specific digests -- Architecture-specific digests (e.g., for `linux/amd64` only) will cause CI failures on different build environments -- On Docker Hub, the multiplatform digest is shown on the main image page, while architecture-specific digests are listed under "OS/ARCH" -- Example: Use `node:20.20-bullseye-slim@sha256:abc123...` where the digest is from the multiplatform index, not from a specific OS/ARCH variant - -## Database Migrations (PostgreSQL) - -1. 
Edit `internal-packages/database/prisma/schema.prisma` -2. Create migration: - ```bash - cd internal-packages/database - pnpm run db:migrate:dev:create --name "add_new_column" - ``` -3. **Important**: Generated migration includes extraneous changes. Remove lines related to: - - `_BackgroundWorkerToBackgroundWorkerFile` - - `_BackgroundWorkerToTaskQueue` - - `_TaskRunToTaskRunTag` - - `_WaitpointRunConnections` - - `_completedWaitpoints` - - `SecretStore_key_idx` - - Various `TaskRun` indexes unless you added them -4. Apply migration: - ```bash - pnpm run db:migrate:deploy && pnpm run generate - ``` - -### Index Migration Rules - -- Indexes **must use CONCURRENTLY** to avoid table locks -- **CONCURRENTLY indexes must be in their own separate migration file** - they cannot be combined with other schema changes - -## ClickHouse Migrations - -ClickHouse migrations use Goose format and live in `internal-packages/clickhouse/schema/`. - -1. Create a new numbered SQL file (e.g., `010_add_new_column.sql`) -2. Use Goose markers: - - ```sql - -- +goose Up - ALTER TABLE trigger_dev.your_table - ADD COLUMN new_column String DEFAULT ''; - - -- +goose Down - ALTER TABLE trigger_dev.your_table - DROP COLUMN new_column; - ``` - -Follow naming conventions in `internal-packages/clickhouse/README.md`: - -- `raw_` prefix for input tables -- `_v1`, `_v2` suffixes for versioning -- `_mv_v1` suffix for materialized views +- Architecture-specific digests cause CI failures on different build environments +- Use the digest from the main Docker Hub page, not from a specific OS/ARCH variant ## Writing Trigger.dev Tasks -Always import from `@trigger.dev/sdk`. Never use `@trigger.dev/sdk/v3` or deprecated `client.defineJob` pattern. +Always import from `@trigger.dev/sdk`. Never use `@trigger.dev/sdk/v3` or deprecated `client.defineJob`. 
```typescript import { task } from "@trigger.dev/sdk"; -// Every task must be exported export const myTask = task({ - id: "my-task", // Unique ID + id: "my-task", run: async (payload: { message: string }) => { - // Task logic - no timeouts + // Task logic }, }); ``` ### SDK Documentation Rules -The `rules/` directory contains versioned documentation for writing Trigger.dev tasks, distributed to users via the SDK installer. Current version is defined in `rules/manifest.json`. - -- `rules/4.3.0/` - Latest: batch trigger v2 (1,000 items, 3MB payloads), debouncing -- `rules/4.1.0/` - Realtime streams v2, updated config -- `rules/4.0.0/` - Base v4 SDK documentation - -When adding new SDK features, create a new version directory with only the files that changed from the previous version. Update `manifest.json` to point unchanged files to previous versions. - -### Claude Code Skill - -The `.claude/skills/trigger-dev-tasks/` skill provides Claude Code with Trigger.dev task expertise. It includes: - -- `SKILL.md` - Core instructions and patterns -- Reference files for basic tasks, advanced tasks, scheduled tasks, realtime, and config - -Keep the skill in sync with the latest rules version when SDK features change. +The `rules/` directory contains versioned SDK documentation distributed via the SDK installer. Current version: `rules/manifest.json`. Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked - these are maintained in separate dedicated passes. ## Testing with hello-world Reference Project First-time setup: -1. Run `pnpm run db:seed` to seed the database (creates the hello-world project) +1. `pnpm run db:seed` to seed the database 2. Build CLI: `pnpm run build --filter trigger.dev && pnpm i` -3. Authorize CLI: `cd references/hello-world && pnpm exec trigger login -a http://localhost:3030` +3. 
Authorize: `cd references/hello-world && pnpm exec trigger login -a http://localhost:3030` -Running: - -```bash -cd references/hello-world -pnpm exec trigger dev # or with --log-level debug -``` +Running: `cd references/hello-world && pnpm exec trigger dev` ## Local Task Testing Workflow -This workflow enables Claude Code to run the webapp and trigger dev simultaneously, trigger tasks, and inspect results for testing code changes. - ### Step 1: Start Webapp in Background ```bash # Run from repo root with run_in_background: true pnpm run dev --filter webapp -``` - -Verify webapp is running: - -```bash -curl -s http://localhost:3030/healthcheck # Should return 200 +curl -s http://localhost:3030/healthcheck # Verify running ``` ### Step 2: Start Trigger Dev in Background ```bash -# Run from hello-world directory with run_in_background: true cd references/hello-world && pnpm exec trigger dev +# Wait for "Local worker ready [node]" ``` -The worker will build and register tasks. Check output for "Local worker ready [node]" message. 
-
 ### Step 3: Trigger and Monitor Tasks via MCP
 
-Use the Trigger.dev MCP tools to interact with tasks:
-
 ```
-# Get current worker and registered tasks
 mcp__trigger__get_current_worker(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev")
-
-# Trigger a task
-mcp__trigger__trigger_task(
-  projectRef: "proj_rrkpdguyagvsoktglnod",
-  environment: "dev",
-  taskId: "hello-world",
-  payload: {"message": "Hello from Claude"}
-)
-
-# List runs to see status
-mcp__trigger__list_runs(
-  projectRef: "proj_rrkpdguyagvsoktglnod",
-  environment: "dev",
-  taskIdentifier: "hello-world",
-  limit: 5
-)
+mcp__trigger__trigger_task(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev", taskId: "hello-world", payload: {"message": "Hello"})
+mcp__trigger__list_runs(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev", taskIdentifier: "hello-world", limit: 5)
 ```
 
-### Step 4: Monitor Execution
+Dashboard: http://localhost:3030/orgs/references-9dfd/projects/hello-world-97DT/env/dev/runs
+
+
+
+# Skill mappings — when working in these areas, load the linked skill file into context.
+
+skills:
+
+- task: "Using agentcrumbs for debug tracing, adding crumbs, trails, markers, querying traces, or stripping debug code before merge"
+  load: "node_modules/agentcrumbs/skills/agentcrumbs/SKILL.md"
+- task: "Setting up agentcrumbs in the project, initializing namespace catalog, running crumbs init"
+  load: "node_modules/agentcrumbs/skills/agentcrumbs/init/SKILL.md"
+
+
+## agentcrumbs
+
+Add crumbs as you write code — not just when debugging. Mark lines with
+`// @crumbs` or wrap blocks in `// #region @crumbs`. They stay on the
+branch throughout development and are stripped by `agentcrumbs strip`
+before merge.
-- Check trigger dev output file for real-time execution logs -- Successful runs show: `Task | Run ID | Success (Xms)` -- Dashboard available at: http://localhost:3030/orgs/references-9dfd/projects/hello-world-97DT/env/dev/runs +### Namespaces -### Key Project Refs +| Namespace | Description | Path | +| ----------------- | ---------------------------------------------- | ----------------------------------- | +| `webapp` | Main Remix app โ€” API, dashboard, orchestration | `apps/webapp` | +| `supervisor` | Task execution container manager | `apps/supervisor` | +| `sdk` | Main SDK for writing tasks | `packages/trigger-sdk` | +| `cli` | CLI and task image bundling | `packages/cli-v3` | +| `core` | Shared types and subpath exports | `packages/core` | +| `build` | Build extensions and types | `packages/build` | +| `react-hooks` | React hooks for realtime/triggering | `packages/react-hooks` | +| `redis-worker` | Redis-based background job system | `packages/redis-worker` | +| `run-engine` | Run Engine 2.0 โ€” core run lifecycle | `internal-packages/run-engine` | +| `database` | Prisma client and schema | `internal-packages/database` | +| `clickhouse` | ClickHouse client, migrations, analytics | `internal-packages/clickhouse` | +| `schedule-engine` | Durable cron scheduling | `internal-packages/schedule-engine` | +| `run-queue` | Run queue management | `internal-packages/run-queue` | + +Do not invent new namespaces โ€” pick from this table or ask first. + +### For PR reviewers + +Do not flag `// @crumbs` markers, `#region @crumbs` blocks, or agentcrumbs +imports in reviews. These are temporary debug instrumentation stripped +before merge. Data logged in crumbs (IDs, names, values) never reaches +production. 
+ +### CLI + +```bash +pnpm exec agentcrumbs collect # start collector (multi-service) +pnpm exec agentcrumbs tail --app trigger # live tail +pnpm exec agentcrumbs clear --app trigger # remove crumbs before merge +``` -- hello-world: `proj_rrkpdguyagvsoktglnod` +The preferred way to query for crumbs is to use `pnpm exec agentcrumbs query --app trigger` with the `--limit` option and cursor pagination, and clear existing crumbs before reproducing a bug via `pnpm exec agentcrumbs clear --app trigger`. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b4b280bda06..88e24cba4f0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -242,9 +242,23 @@ See the [Job Catalog](./references/job-catalog/README.md) file for more. **If you get errors, be sure to fix them before committing.** -- Be sure to [check the "Allow edits from maintainers" option](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork) while creating you PR. -- If your PR refers to or fixes an issue, be sure to add `refs #XXX` or `fixes #XXX` to the PR description. Replacing `XXX` with the respective issue number. See more about [Linking a pull request to an issue - ](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). +> **Note:** We may close PRs if we decide that the cost of integrating the change outweighs the benefits. To improve the chances of your PR getting accepted, follow the guidelines below. + +### PR workflow + +1. **Always open your PR in draft status first.** Do not mark it as "Ready for Review" until the steps below are complete. +2. **Address all CodeRabbit code review comments.** Our CI runs an automated code review via CodeRabbit. Go through each comment and either fix the issue or resolve it with a comment explaining why no change is needed. +3. 
**Wait for all CI checks to pass.** Do not mark the PR as "Ready for Review" until every check is green. +4. **Then mark the PR as "Ready for Review"** so a maintainer can take a look. + +### Cost/benefit analysis for risky changes + +If your change touches core infrastructure, modifies widely-used code paths, or could introduce regressions, consider doing a brief cost/benefit analysis and including it in the PR description. Explain what the benefit is to users and why the risk is worth it. This goes a long way toward helping maintainers evaluate your contribution. + +### General guidelines + +- Be sure to [check the "Allow edits from maintainers" option](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork) while creating your PR. +- If your PR refers to or fixes an issue, be sure to add `refs #XXX` or `fixes #XXX` to the PR description. Replacing `XXX` with the respective issue number. See more about [Linking a pull request to an issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). - Be sure to fill the PR Template accordingly. ## Adding changesets @@ -267,6 +281,39 @@ You will be prompted to select which packages to include in the changeset. Only Most of the time the changes you'll make are likely to be categorized as patch releases. If you feel like there is the need for a minor or major release of the package based on the changes being made, add the changeset as such and it will be discussed during PR review. +## Adding server changes + +Changesets only track published npm packages. If your PR only changes server components (`apps/webapp/`, `apps/supervisor/`, `apps/coordinator/`, etc.) with no package changes, add a `.server-changes/` file so the change appears in release notes. 
+ +Create a markdown file with a descriptive name: + +```sh +cat > .server-changes/fix-batch-queue-stalls.md << 'EOF' +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +EOF +``` + +**Fields:** +- `area` (required): `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- `type` (required): `feature` | `fix` | `improvement` | `breaking` + +The body text (below the frontmatter) is a one-line description of the change. Keep it concise โ€” it will appear in release notes. + +**When to add which:** + +| PR changes | What to add | +|---|---| +| Only packages (`packages/`) | Changeset | +| Only server (`apps/`) | `.server-changes/` file | +| Both packages and server | Just the changeset | + +See `.server-changes/README.md` for more details. + ## Troubleshooting ### EADDRINUSE: address already in use :::3030 diff --git a/RELEASE.md b/RELEASE.md index 1b9273cb142..8ba3ecb5007 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,28 @@ ## Guide on releasing a new version +### Automated release (v4+) + +Releases are fully automated via CI: + +1. PRs merge to `main` with changesets (for package changes) and/or `.server-changes/` files (for server-only changes). +2. The [changesets-pr.yml](./.github/workflows/changesets-pr.yml) workflow automatically creates/updates the `changeset-release/main` PR with version bumps and an enhanced summary of all changes. Consumed `.server-changes/` files are removed on the release branch (same approach changesets uses for `.changeset/` files โ€” they're deleted on the branch, so merging the PR cleans them up). +3. When ready to release, merge the changeset release PR into `main`. +4. 
The [release.yml](./.github/workflows/release.yml) workflow automatically: + - Publishes all packages to npm + - Creates a single unified GitHub release (e.g., "trigger.dev v4.3.4") + - Tags and triggers Docker image builds + - After Docker images are pushed, updates the GitHub release with the exact GHCR tag link + +### What engineers need to do + +- **Package changes**: Add a changeset with `pnpm run changeset:add` +- **Server-only changes**: Add a `.server-changes/` file (see `.server-changes/README.md`) +- **Mixed PRs**: Just the changeset is enough + +See `CHANGESETS.md` for full details on changesets and server changes. + +### Legacy release (v3) + 1. Merge in the changeset PR into main, making sure to cancel both the release and publish github actions from that merge. 2. Pull the changes locally into main 3. Run `pnpm i` which will update the pnpm lock file with the new versions diff --git a/ailogger-output.log b/ailogger-output.log new file mode 100644 index 00000000000..e69de29bb2d diff --git a/apps/supervisor/CLAUDE.md b/apps/supervisor/CLAUDE.md new file mode 100644 index 00000000000..ded836c6069 --- /dev/null +++ b/apps/supervisor/CLAUDE.md @@ -0,0 +1,20 @@ +# Supervisor + +Node.js app that manages task execution containers. Receives work from the platform, starts Docker/Kubernetes containers, monitors execution, and reports results. 
+ +## Key Directories + +- `src/services/` - Core service logic +- `src/workloadManager/` - Container orchestration abstraction (Docker or Kubernetes) +- `src/workloadServer/` - HTTP server for workload communication (heartbeats, snapshots) +- `src/clients/` - Platform communication (webapp/coordinator) +- `src/env.ts` - Environment configuration + +## Architecture + +- **WorkloadManager**: Abstracts Docker vs Kubernetes execution +- **SupervisorSession**: Manages the dequeue loop with EWMA-based dynamic scaling +- **ResourceMonitor**: Tracks CPU/memory during execution +- **PodCleaner/FailedPodHandler**: Kubernetes-specific cleanup + +Communicates with the platform via Socket.io and HTTP. Receives task assignments through the dequeue protocol from the webapp. diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index e9609bf1541..7456d421850 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -14,9 +14,11 @@ }, "dependencies": { "@aws-sdk/client-ecr": "^3.839.0", + "@internal/compute": "workspace:*", "@kubernetes/client-node": "^1.0.0", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", + "p-limit": "^6.2.0", "prom-client": "^15.1.0", "socket.io": "4.7.4", "std-env": "^3.8.0", diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index faf34bcd025..f2d54741eee 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -3,142 +3,279 @@ import { env as stdEnv } from "std-env"; import { z } from "zod"; import { AdditionalEnvVars, BoolEnv } from "./envUtil.js"; -const Env = z.object({ - // This will come from `spec.nodeName` in k8s - TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), - TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), - - // Required settings - TRIGGER_API_URL: z.string().url(), - TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file - MANAGED_WORKER_SECRET: z.string(), - OTEL_EXPORTER_OTLP_ENDPOINT: 
z.string().url(), // set on the runners - - // Workload API settings (coordinator mode) - the workload API is what the run controller connects to - TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), - TRIGGER_WORKLOAD_API_PROTOCOL: z - .string() - .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) - .default("http"), - TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default - TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), - TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on - TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller - - // Runner settings - RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) - RUNNER_PRETTY_LOGS: BoolEnv.default(false), - - // Dequeue settings (provider mode) - TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), - TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), - TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), - TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), - TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), - TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds - TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds - TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) - TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), 
// Smooths queue length measurements (0=historical, 1=current) - TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) - TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) - - // Optional services - TRIGGER_WARM_START_URL: z.string().optional(), - TRIGGER_CHECKPOINT_URL: z.string().optional(), - TRIGGER_METADATA_URL: z.string().optional(), - - // Used by the resource monitor - RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), - RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), - RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), - - // Docker settings - DOCKER_API_VERSION: z.string().optional(), - DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 - DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), - DOCKER_REGISTRY_USERNAME: z.string().optional(), - DOCKER_REGISTRY_PASSWORD: z.string().optional(), - DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 - DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), - DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), - /** - * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. - * Any other value is taken as a custom network's name to which all runners should connect to. - * - * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. - * - * **WARNING**: Specifying multiple networks will slightly increase startup times. 
- * - * @default "host" - */ - DOCKER_RUNNER_NETWORKS: z.string().default("host"), - - // Kubernetes settings - KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), - KUBERNETES_NAMESPACE: z.string().default("default"), - KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), - KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv - KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), - KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), - KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), - KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), - KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit - KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), - KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit - - // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO - KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO - KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - 
KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB - KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods - KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool= - - // Project affinity settings - pods from the same project prefer the same node - KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), - KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z.string().trim().min(1).default("kubernetes.io/hostname"), - - // Placement tags settings - PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), - PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), - - // Metrics - METRICS_ENABLED: BoolEnv.default(true), - METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), - METRICS_HOST: z.string().default("127.0.0.1"), - METRICS_PORT: z.coerce.number().int().default(9090), - - // Pod cleaner - POD_CLEANER_ENABLED: BoolEnv.default(true), - POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), - POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), - - // Failed pod handler - FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), - FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), - - // Debug - DEBUG: BoolEnv.default(false), - SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), -}); +const Env = z + .object({ + // This will come from `spec.nodeName` in k8s + TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), + 
TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), + + // Required settings + TRIGGER_API_URL: z.string().url(), + TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file + MANAGED_WORKER_SECRET: z.string(), + OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners + + // Workload API settings (coordinator mode) - the workload API is what the run controller connects to + TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), + TRIGGER_WORKLOAD_API_PROTOCOL: z + .string() + .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) + .default("http"), + TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default + TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), + TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on + TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller + + // Runner settings + RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) + RUNNER_PRETTY_LOGS: BoolEnv.default(false), + + // Dequeue settings (provider mode) + TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), + TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), + TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), + TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), + TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), + TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds + 
TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds + TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) + TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) + TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) + TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) + + // Optional services + TRIGGER_WARM_START_URL: z.string().optional(), + TRIGGER_CHECKPOINT_URL: z.string().optional(), + TRIGGER_METADATA_URL: z.string().optional(), + + // Used by the resource monitor + RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), + RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), + RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), + + // Docker settings + DOCKER_API_VERSION: z.string().optional(), + DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 + DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), + DOCKER_REGISTRY_USERNAME: z.string().optional(), + DOCKER_REGISTRY_PASSWORD: z.string().optional(), + DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 + DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), + DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), + /** + * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. + * Any other value is taken as a custom network's name to which all runners should connect to. + * + * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. 
+ * + * **WARNING**: Specifying multiple networks will slightly increase startup times. + * + * @default "host" + */ + DOCKER_RUNNER_NETWORKS: z.string().default("host"), + + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + COMPUTE_TRACE_SPANS_ENABLED: BoolEnv.default(true), + COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset) + COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), + COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10), + + // Kubernetes settings + KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), + KUBERNETES_NAMESPACE: z.string().default("default"), + KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), + KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv + KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), + KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), + KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), + KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), + KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit + KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), + KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit + + // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO + KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + 
KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO + KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB + KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods + + // Pod DNS config โ€” override the cluster default ndots to `KUBERNETES_POD_DNS_NDOTS`. + // Default k8s ndots is 5: any name with fewer than 5 dots (e.g. `api.example.com`, 2 dots) is first walked + // through every entry in the cluster search list (`.svc.cluster.local`, `svc.cluster.local`, `cluster.local`) + // before being tried as-is, turning one resolution into 4+ CoreDNS queries (ร—2 with A+AAAA). + // Overriding the default can be useful to cut CoreDNS query amplification for external domains. + // Note: before enabling, make sure no code path relies on search-list expansion for names with dots โ‰ฅ the value + // set here โ€” those names will now hit their as-is form first and could resolve externally before falling back. 
+ KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED: BoolEnv.default(false), + KUBERNETES_POD_DNS_NDOTS: z.coerce.number().int().min(1).max(15).default(2), + // Large machine affinity settings - large-* presets prefer a dedicated pool + KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("large-machines"), + KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(100), + + // Project affinity settings - pods from the same project prefer the same node + KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), + KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z + .string() + .trim() + .min(1) + .default("kubernetes.io/hostname"), + + // Schedule affinity settings - runs from schedule trees prefer a dedicated pool + KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("scheduled-runs"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), + KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT: z.coerce + .number() + .int() + .min(1) + .max(100) + .default(20), + + // Schedule toleration settings - scheduled runs tolerate taints on the dedicated pool + // Comma-separated list of tolerations in the format: key=value:effect + // For Exists operator (no value): key:effect + KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z + .string() + .transform((val, ctx) => { + const tolerations = val + .split(",") + .map((entry) => entry.trim()) + 
.filter((entry) => entry.length > 0) + .map((entry) => { + const colonIdx = entry.lastIndexOf(":"); + if (colonIdx === -1) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (missing effect): "${entry}"`, + }); + return z.NEVER; + } + + const effect = entry.slice(colonIdx + 1); + const validEffects = ["NoSchedule", "NoExecute", "PreferNoSchedule"]; + if (!validEffects.includes(effect)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join( + ", " + )}`, + }); + return z.NEVER; + } + + const keyValue = entry.slice(0, colonIdx); + const eqIdx = keyValue.indexOf("="); + const key = eqIdx === -1 ? keyValue : keyValue.slice(0, eqIdx); + + if (!key) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (empty key): "${entry}"`, + }); + return z.NEVER; + } + + if (eqIdx === -1) { + return { key, operator: "Exists" as const, effect }; + } + + return { + key, + operator: "Equal" as const, + value: keyValue.slice(eqIdx + 1), + effect, + }; + }); + + return tolerations; + }) + .optional(), + + // Placement tags settings + PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), + PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), + + // Metrics + METRICS_ENABLED: BoolEnv.default(true), + METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), + METRICS_HOST: z.string().default("127.0.0.1"), + METRICS_PORT: z.coerce.number().int().default(9090), + + // Pod cleaner + POD_CLEANER_ENABLED: BoolEnv.default(true), + POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), + POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), + + // Failed pod handler + FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), + FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), + + // Debug + DEBUG: BoolEnv.default(false), + SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), + }) + 
.superRefine((data, ctx) => { + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_METADATA_URL is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_METADATA_URL"], + }); + } + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_WORKLOAD_API_DOMAIN) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_WORKLOAD_API_DOMAIN is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_WORKLOAD_API_DOMAIN"], + }); + } + }) + .transform((data) => ({ + ...data, + COMPUTE_TRACE_OTLP_ENDPOINT: data.COMPUTE_TRACE_OTLP_ENDPOINT ?? `${data.TRIGGER_API_URL}/otel`, + })); export const env = Env.parse(stdEnv); diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 0e274b30390..6f5913c47ca 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -14,6 +14,7 @@ import { } from "./resourceMonitor.js"; import { KubernetesWorkloadManager } from "./workloadManager/kubernetes.js"; import { DockerWorkloadManager } from "./workloadManager/docker.js"; +import { ComputeWorkloadManager } from "./workloadManager/compute.js"; import { HttpServer, CheckpointClient, @@ -25,6 +26,8 @@ import { register } from "./metrics.js"; import { PodCleaner } from "./services/podCleaner.js"; import { FailedPodHandler } from "./services/failedPodHandler.js"; import { getWorkerToken } from "./workerToken.js"; +import { OtlpTraceService } from "./services/otlpTraceService.js"; +import { extractTraceparent, getRestoreRunnerId } from "./util.js"; if (env.METRICS_COLLECT_DEFAULTS) { collectDefaultMetrics({ register }); @@ -35,18 +38,25 @@ class ManagedSupervisor { private readonly metricsServer?: HttpServer; private readonly workloadServer: WorkloadServer; private readonly workloadManager: WorkloadManager; + private readonly computeManager?: ComputeWorkloadManager; private readonly logger = new SimpleStructuredLogger("managed-supervisor"); 
private readonly resourceMonitor: ResourceMonitor; private readonly checkpointClient?: CheckpointClient; private readonly podCleaner?: PodCleaner; private readonly failedPodHandler?: FailedPodHandler; + private readonly tracing?: OtlpTraceService; private readonly isKubernetes = isKubernetesEnvironment(env.KUBERNETES_FORCE_ENABLED); private readonly warmStartUrl = env.TRIGGER_WARM_START_URL; constructor() { - const { TRIGGER_WORKER_TOKEN, MANAGED_WORKER_SECRET, ...envWithoutSecrets } = env; + const { + TRIGGER_WORKER_TOKEN, + MANAGED_WORKER_SECRET, + COMPUTE_GATEWAY_AUTH_TOKEN, + ...envWithoutSecrets + } = env; if (env.DEBUG) { this.logger.debug("Starting up", { envWithoutSecrets }); @@ -77,9 +87,46 @@ class ManagedSupervisor { : new DockerResourceMonitor(new Docker()) : new NoopResourceMonitor(); - this.workloadManager = this.isKubernetes - ? new KubernetesWorkloadManager(workloadManagerOptions) - : new DockerWorkloadManager(workloadManagerOptions); + if (env.COMPUTE_GATEWAY_URL) { + if (!env.TRIGGER_WORKLOAD_API_DOMAIN) { + throw new Error("TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create compute manager"); + } + + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + + if (env.COMPUTE_TRACE_SPANS_ENABLED) { + this.tracing = new OtlpTraceService({ + endpointUrl: env.COMPUTE_TRACE_OTLP_ENDPOINT, + }); + } + + const computeManager = new ComputeWorkloadManager({ + ...workloadManagerOptions, + gateway: { + url: env.COMPUTE_GATEWAY_URL, + authToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + timeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, + }, + snapshots: { + enabled: env.COMPUTE_SNAPSHOTS_ENABLED, + delayMs: env.COMPUTE_SNAPSHOT_DELAY_MS, + dispatchLimit: env.COMPUTE_SNAPSHOT_DISPATCH_LIMIT, + callbackUrl, + }, + tracing: this.tracing, + runner: { + instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT, + 
prettyLogs: env.RUNNER_PRETTY_LOGS, + }, + }); + this.computeManager = computeManager; + this.workloadManager = computeManager; + } else { + this.workloadManager = this.isKubernetes + ? new KubernetesWorkloadManager(workloadManagerOptions) + : new DockerWorkloadManager(workloadManagerOptions); + } if (this.isKubernetes) { if (env.POD_CLEANER_ENABLED) { @@ -182,102 +229,161 @@ class ManagedSupervisor { } this.workerSession.on("runNotification", async ({ time, run }) => { - this.logger.log("runNotification", { time, run }); + this.logger.verbose("runNotification", { time, run }); this.workloadServer.notifyRun({ run }); }); - this.workerSession.on("runQueueMessage", async ({ time, message }) => { - this.logger.log(`Received message with timestamp ${time.toLocaleString()}`, message); + this.workerSession.on( + "runQueueMessage", + async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { + this.logger.verbose(`Received message with timestamp ${time.toLocaleString()}`, message); - if (message.completedWaitpoints.length > 0) { - this.logger.debug("Run has completed waitpoints", { - runId: message.run.id, - completedWaitpoints: message.completedWaitpoints.length, - }); - } - - if (!message.image) { - this.logger.error("Run has no image", { runId: message.run.id }); - return; - } - - const { checkpoint, ...rest } = message; - - if (checkpoint) { - this.logger.log("Restoring run", { runId: message.run.id }); + if (message.completedWaitpoints.length > 0) { + this.logger.debug("Run has completed waitpoints", { + runId: message.run.id, + completedWaitpoints: message.completedWaitpoints.length, + }); + } - if (!this.checkpointClient) { - this.logger.error("No checkpoint client", { runId: message.run.id }); + if (!message.image) { + this.logger.error("Run has no image", { runId: message.run.id }); return; } - try { - const didRestore = await this.checkpointClient.restoreRun({ - runFriendlyId: message.run.friendlyId, - snapshotFriendlyId: message.snapshot.friendlyId, - 
body: { - ...rest, - checkpoint, - }, - }); - - if (didRestore) { - this.logger.log("Restore successful", { runId: message.run.id }); - } else { - this.logger.error("Restore failed", { runId: message.run.id }); + const { checkpoint, ...rest } = message; + + // Register trace context early so snapshot spans work for all paths + // (cold create, restore, warm start). Re-registration on restore is safe + // since dequeue always provides fresh context. + if (this.computeManager?.traceSpansEnabled) { + const traceparent = extractTraceparent(message.run.traceContext); + + if (traceparent) { + this.workloadServer.registerRunTraceContext(message.run.friendlyId, { + traceparent, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + }); } - } catch (error) { - this.logger.error("Failed to restore run", { error }); } - return; - } + if (checkpoint) { + this.logger.debug("Restoring run", { runId: message.run.id }); + + if (this.computeManager) { + try { + const runnerId = getRestoreRunnerId(message.run.friendlyId, checkpoint.id); + + const didRestore = await this.computeManager.restore({ + snapshotId: checkpoint.location, + runnerId, + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + machine: message.run.machine, + traceContext: message.run.traceContext, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + dequeuedAt: message.dequeuedAt, + }); + + if (didRestore) { + this.logger.debug("Compute restore successful", { + runId: message.run.id, + runnerId, + }); + } else { + this.logger.error("Compute restore failed", { runId: message.run.id, runnerId }); + } + } catch (error) { + this.logger.error("Failed to restore run (compute)", { error }); + } + + return; + } + + if (!this.checkpointClient) { + this.logger.error("No checkpoint client", { runId: message.run.id }); + return; + } + + try { + const didRestore = await 
this.checkpointClient.restoreRun({ + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + body: { + ...rest, + checkpoint, + }, + }); + + if (didRestore) { + this.logger.debug("Restore successful", { runId: message.run.id }); + } else { + this.logger.error("Restore failed", { runId: message.run.id }); + } + } catch (error) { + this.logger.error("Failed to restore run", { error }); + } - this.logger.log("Scheduling run", { runId: message.run.id }); + return; + } - const didWarmStart = await this.tryWarmStart(message); + this.logger.debug("Scheduling run", { runId: message.run.id }); - if (didWarmStart) { - this.logger.log("Warm start successful", { runId: message.run.id }); - return; - } + const warmStartStart = performance.now(); + const didWarmStart = await this.tryWarmStart(message); + const warmStartCheckMs = Math.round(performance.now() - warmStartStart); - try { - if (!message.deployment.friendlyId) { - // mostly a type guard, deployments always exists for deployed environments - // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. 
- throw new Error("Deployment is missing"); + if (didWarmStart) { + this.logger.debug("Warm start successful", { runId: message.run.id }); + return; } - await this.workloadManager.create({ - dequeuedAt: message.dequeuedAt, - envId: message.environment.id, - envType: message.environment.type, - image: message.image, - machine: message.run.machine, - orgId: message.organization.id, - projectId: message.project.id, - deploymentFriendlyId: message.deployment.friendlyId, - deploymentVersion: message.backgroundWorker.version, - runId: message.run.id, - runFriendlyId: message.run.friendlyId, - version: message.version, - nextAttemptNumber: message.run.attemptNumber, - snapshotId: message.snapshot.id, - snapshotFriendlyId: message.snapshot.friendlyId, - placementTags: message.placementTags, - }); + try { + if (!message.deployment.friendlyId) { + // mostly a type guard, deployments always exists for deployed environments + // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. 
+ throw new Error("Deployment is missing"); + } - // Disabled for now - // this.resourceMonitor.blockResources({ - // cpu: message.run.machine.cpu, - // memory: message.run.machine.memory, - // }); - } catch (error) { - this.logger.error("Failed to create workload", { error }); + await this.workloadManager.create({ + dequeuedAt: message.dequeuedAt, + dequeueResponseMs, + pollingIntervalMs, + warmStartCheckMs, + envId: message.environment.id, + envType: message.environment.type, + image: message.image, + machine: message.run.machine, + orgId: message.organization.id, + projectId: message.project.id, + deploymentFriendlyId: message.deployment.friendlyId, + deploymentVersion: message.backgroundWorker.version, + runId: message.run.id, + runFriendlyId: message.run.friendlyId, + version: message.version, + nextAttemptNumber: message.run.attemptNumber, + snapshotId: message.snapshot.id, + snapshotFriendlyId: message.snapshot.friendlyId, + placementTags: message.placementTags, + traceContext: message.run.traceContext, + annotations: message.run.annotations, + hasPrivateLink: message.organization.hasPrivateLink, + }); + + // Disabled for now + // this.resourceMonitor.blockResources({ + // cpu: message.run.machine.cpu, + // memory: message.run.machine.memory, + // }); + } catch (error) { + this.logger.error("Failed to create workload", { error }); + } } - }); + ); if (env.METRICS_ENABLED) { this.metricsServer = new HttpServer({ @@ -296,6 +402,8 @@ class ManagedSupervisor { host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL, workerClient: this.workerSession.httpClient, checkpointClient: this.checkpointClient, + computeManager: this.computeManager, + tracing: this.tracing, }); this.workloadServer.on("runConnected", this.onRunConnected.bind(this)); @@ -380,6 +488,7 @@ class ManagedSupervisor { async stop() { this.logger.log("Shutting down"); + await this.workloadServer.stop(); await this.workerSession.stop(); // Optional services diff --git 
a/apps/supervisor/src/services/computeSnapshotService.ts b/apps/supervisor/src/services/computeSnapshotService.ts new file mode 100644 index 00000000000..041e2902c75 --- /dev/null +++ b/apps/supervisor/src/services/computeSnapshotService.ts @@ -0,0 +1,242 @@ +import pLimit from "p-limit"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import type { SupervisorHttpClient } from "@trigger.dev/core/v3/workers"; +import { type SnapshotCallbackPayload } from "@internal/compute"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import { TimerWheel } from "./timerWheel.js"; +import type { OtlpTraceService } from "./otlpTraceService.js"; + +type DelayedSnapshot = { + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; +}; + +export type RunTraceContext = { + traceparent: string; + envId: string; + orgId: string; + projectId: string; +}; + +export type ComputeSnapshotServiceOptions = { + computeManager: ComputeWorkloadManager; + workerClient: SupervisorHttpClient; + tracing?: OtlpTraceService; +}; + +export class ComputeSnapshotService { + private readonly logger = new SimpleStructuredLogger("compute-snapshot-service"); + + private static readonly MAX_TRACE_CONTEXTS = 10_000; + private readonly runTraceContexts = new Map(); + private readonly timerWheel: TimerWheel; + private readonly dispatchLimit: ReturnType; + + private readonly computeManager: ComputeWorkloadManager; + private readonly workerClient: SupervisorHttpClient; + private readonly tracing?: OtlpTraceService; + + constructor(opts: ComputeSnapshotServiceOptions) { + this.computeManager = opts.computeManager; + this.workerClient = opts.workerClient; + this.tracing = opts.tracing; + + this.dispatchLimit = pLimit(this.computeManager.snapshotDispatchLimit); + this.timerWheel = new TimerWheel({ + delayMs: this.computeManager.snapshotDelayMs, + onExpire: (item) 
=> { + this.dispatchLimit(() => this.dispatch(item.data)).catch((error) => { + this.logger.error("Snapshot dispatch failed", { + runId: item.data.runFriendlyId, + runnerId: item.data.runnerId, + error, + }); + }); + }, + }); + this.timerWheel.start(); + } + + /** Schedule a delayed snapshot for a run. Replaces any pending snapshot for the same run. */ + schedule(runFriendlyId: string, data: DelayedSnapshot) { + this.timerWheel.submit(runFriendlyId, data); + this.logger.debug("Snapshot scheduled", { + runFriendlyId, + snapshotFriendlyId: data.snapshotFriendlyId, + delayMs: this.computeManager.snapshotDelayMs, + }); + } + + /** Cancel a pending delayed snapshot. Returns true if one was cancelled. */ + cancel(runFriendlyId: string): boolean { + const cancelled = this.timerWheel.cancel(runFriendlyId); + if (cancelled) { + this.logger.debug("Snapshot cancelled", { runFriendlyId }); + } + return cancelled; + } + + /** Handle the callback from the gateway after a snapshot completes or fails. */ + async handleCallback(body: SnapshotCallbackPayload) { + const snapshotId = body.status === "completed" ? body.snapshot_id : undefined; + + this.logger.debug("Snapshot callback", { + snapshotId, + instanceId: body.instance_id, + status: body.status, + error: body.status === "failed" ? 
body.error : undefined, + metadata: body.metadata, + durationMs: body.duration_ms, + }); + + const runId = body.metadata?.runId; + const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; + + if (!runId || !snapshotFriendlyId) { + this.logger.error("Snapshot callback missing metadata", { body }); + return { ok: false as const, status: 400 }; + } + + this.#emitSnapshotSpan(runId, body.duration_ms, snapshotId); + + if (body.status === "completed") { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: true, + checkpoint: { + type: "COMPUTE", + location: body.snapshot_id, + }, + }, + }); + + if (result.success) { + this.logger.debug("Suspend completion submitted", { + runId, + instanceId: body.instance_id, + snapshotId: body.snapshot_id, + }); + } else { + this.logger.error("Failed to submit suspend completion", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } else { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: false, + error: body.error ?? "Snapshot failed", + }, + }); + + if (!result.success) { + this.logger.error("Failed to submit suspend failure", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } + + return { ok: true as const, status: 200 }; + } + + registerTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + // Evict oldest entries if we've hit the cap. This is best-effort: on a busy + // supervisor, entries for long-lived runs may be evicted before their snapshot + // callback arrives, causing those snapshot spans to be silently dropped. + // That's acceptable - trace spans are observability sugar, not correctness. 
+ if (this.runTraceContexts.size >= ComputeSnapshotService.MAX_TRACE_CONTEXTS) { + const firstKey = this.runTraceContexts.keys().next().value; + if (firstKey) { + this.runTraceContexts.delete(firstKey); + } + } + + this.runTraceContexts.set(runFriendlyId, ctx); + } + + /** Stop the timer wheel, dropping pending snapshots. */ + stop(): string[] { + // Intentionally drop pending snapshots rather than dispatching them. The supervisor + // is shutting down, so our callback URL will be dead by the time the gateway responds. + // Runners detect the supervisor is gone and reconnect to a new instance, which + // re-triggers the snapshot workflow. Snapshots are an optimization, not a correctness + // requirement - runs continue fine without them. + const remaining = this.timerWheel.stop(); + const droppedRuns = remaining.map((item) => item.key); + + if (droppedRuns.length > 0) { + this.logger.info("Stopped, dropped pending snapshots", { count: droppedRuns.length }); + this.logger.debug("Dropped snapshot details", { runs: droppedRuns }); + } + + return droppedRuns; + } + + /** Dispatch a snapshot request to the gateway. */ + private async dispatch(snapshot: DelayedSnapshot): Promise { + const result = await this.computeManager.snapshot({ + runnerId: snapshot.runnerId, + metadata: { + runId: snapshot.runFriendlyId, + snapshotFriendlyId: snapshot.snapshotFriendlyId, + }, + }); + + if (!result) { + this.logger.error("Failed to request snapshot", { + runId: snapshot.runFriendlyId, + runnerId: snapshot.runnerId, + }); + } + } + + #emitSnapshotSpan(runFriendlyId: string, durationMs?: number, snapshotId?: string) { + if (!this.tracing) return; + + const ctx = this.runTraceContexts.get(runFriendlyId); + if (!ctx) return; + + const parsed = parseTraceparent(ctx.traceparent); + if (!parsed) return; + + const endEpochMs = Date.now(); + const startEpochMs = durationMs ? 
endEpochMs - durationMs : endEpochMs; + + const spanAttributes: Record = { + "compute.type": "snapshot", + }; + + if (durationMs !== undefined) { + spanAttributes["compute.total_ms"] = durationMs; + } + + if (snapshotId) { + spanAttributes["compute.snapshot_id"] = snapshotId; + } + + this.tracing.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.snapshot", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": ctx.envId, + "ctx.organization.id": ctx.orgId, + "ctx.project.id": ctx.projectId, + "ctx.run.id": runFriendlyId, + }, + spanAttributes, + }); + } +} diff --git a/apps/supervisor/src/services/failedPodHandler.test.ts b/apps/supervisor/src/services/failedPodHandler.test.ts index d05783288ed..4dbfda16f43 100644 --- a/apps/supervisor/src/services/failedPodHandler.test.ts +++ b/apps/supervisor/src/services/failedPodHandler.test.ts @@ -1,10 +1,11 @@ import { describe, it, expect, beforeAll, afterEach } from "vitest"; import { FailedPodHandler } from "./failedPodHandler.js"; -import { K8sApi, createK8sApi } from "../clients/kubernetes.js"; +import { type K8sApi, createK8sApi } from "../clients/kubernetes.js"; import { Registry } from "prom-client"; import { setTimeout } from "timers/promises"; -describe("FailedPodHandler Integration Tests", () => { +// These tests require live K8s cluster credentials - skip by default +describe.skipIf(!process.env.K8S_INTEGRATION_TESTS)("FailedPodHandler Integration Tests", () => { const k8s = createK8sApi(); const namespace = "integration-test"; const register = new Registry(); diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts index 07217243769..3d56c92b213 100644 --- a/apps/supervisor/src/services/failedPodHandler.ts +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -151,7 +151,7 @@ export class FailedPodHandler { } private async onPodCompleted(pod: V1Pod) { - 
this.logger.info("pod-completed", this.podSummary(pod)); + this.logger.debug("pod-completed", this.podSummary(pod)); this.informerEventsTotal.inc({ namespace: this.namespace, verb: "add" }); if (!pod.metadata?.name) { @@ -165,7 +165,7 @@ export class FailedPodHandler { } if (pod.metadata?.deletionTimestamp) { - this.logger.info("pod-completed: pod is being deleted", this.podSummary(pod)); + this.logger.verbose("pod-completed: pod is being deleted", this.podSummary(pod)); return; } @@ -188,7 +188,7 @@ export class FailedPodHandler { } private async onPodSucceeded(pod: V1Pod) { - this.logger.info("pod-succeeded", this.podSummary(pod)); + this.logger.debug("pod-succeeded", this.podSummary(pod)); this.processedPodsTotal.inc({ namespace: this.namespace, status: this.podStatus(pod), @@ -196,7 +196,7 @@ export class FailedPodHandler { } private async onPodFailed(pod: V1Pod) { - this.logger.info("pod-failed", this.podSummary(pod)); + this.logger.debug("pod-failed", this.podSummary(pod)); try { await this.processFailedPod(pod); @@ -208,7 +208,7 @@ export class FailedPodHandler { } private async processFailedPod(pod: V1Pod) { - this.logger.info("pod-failed: processing pod", this.podSummary(pod)); + this.logger.verbose("pod-failed: processing pod", this.podSummary(pod)); const mainContainer = pod.status?.containerStatuses?.find((c) => c.name === "run-controller"); @@ -231,7 +231,7 @@ export class FailedPodHandler { } private async deletePod(pod: V1Pod) { - this.logger.info("pod-failed: deleting pod", this.podSummary(pod)); + this.logger.verbose("pod-failed: deleting pod", this.podSummary(pod)); try { await this.k8s.core.deleteNamespacedPod({ name: pod.metadata!.name!, diff --git a/apps/supervisor/src/services/otlpTraceService.test.ts b/apps/supervisor/src/services/otlpTraceService.test.ts new file mode 100644 index 00000000000..baf3bd90306 --- /dev/null +++ b/apps/supervisor/src/services/otlpTraceService.test.ts @@ -0,0 +1,179 @@ +import { describe, it, expect } from 
"vitest"; +import { buildPayload } from "./otlpTraceService.js"; + +describe("buildPayload", () => { + it("builds valid OTLP JSON with timing attributes", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + parentSpanId: "1234567890abcdef", + spanName: "compute.provision", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: { + "ctx.environment.id": "env_123", + "ctx.organization.id": "org_456", + "ctx.project.id": "proj_789", + "ctx.run.id": "run_abc", + }, + spanAttributes: { + "compute.total_ms": 250, + "compute.gateway.schedule_ms": 1, + "compute.cache.image_cached": true, + }, + }); + + expect(payload.resourceSpans).toHaveLength(1); + + const resourceSpan = payload.resourceSpans[0]!; + + // $trigger=true so the webapp accepts it + const triggerAttr = resourceSpan.resource.attributes.find((a) => a.key === "$trigger"); + expect(triggerAttr).toEqual({ key: "$trigger", value: { boolValue: true } }); + + // Resource attributes + const envAttr = resourceSpan.resource.attributes.find( + (a) => a.key === "ctx.environment.id" + ); + expect(envAttr).toEqual({ + key: "ctx.environment.id", + value: { stringValue: "env_123" }, + }); + + // Span basics + const span = resourceSpan.scopeSpans[0]!.spans[0]!; + expect(span.name).toBe("compute.provision"); + expect(span.traceId).toBe("abcd1234abcd1234abcd1234abcd1234"); + expect(span.parentSpanId).toBe("1234567890abcdef"); + + // Integer attribute + const totalMs = span.attributes.find((a) => a.key === "compute.total_ms"); + expect(totalMs).toEqual({ key: "compute.total_ms", value: { intValue: 250 } }); + + // Boolean attribute + const cached = span.attributes.find((a) => a.key === "compute.cache.image_cached"); + expect(cached).toEqual({ key: "compute.cache.image_cached", value: { boolValue: true } }); + }); + + it("generates a valid 16-char hex span ID", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 
1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.spanId).toMatch(/^[0-9a-f]{16}$/); + }); + + it("converts timestamps to nanoseconds", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000000000"); + expect(span.endTimeUnixNano).toBe("1250000000"); + }); + + it("converts real epoch timestamps without precision loss", () => { + // Date.now() values exceed Number.MAX_SAFE_INTEGER when multiplied by 1e6 + const startMs = 1711929600000; // 2024-04-01T00:00:00Z + const endMs = 1711929600250; + + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: startMs, + endTimeMs: endMs, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1711929600000000000"); + expect(span.endTimeUnixNano).toBe("1711929600250000000"); + }); + + it("preserves sub-millisecond precision from performance.now() arithmetic", () => { + // provisionStartEpochMs = Date.now() - (performance.now() - startMs) produces fractional ms. + // Use small epoch + fraction to avoid IEEE 754 noise in the fractional part. 
+ const startMs = 1000.322; + const endMs = 1045.789; + + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: startMs, + endTimeMs: endMs, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000322000"); + expect(span.endTimeUnixNano).toBe("1045789000"); + }); + + it("sub-ms precision affects ordering for real epoch values", () => { + // Two spans within the same millisecond should have different nanosecond timestamps + const spanA = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "a", + startTimeMs: 1711929600000.3, + endTimeMs: 1711929600001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const spanB = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "b", + startTimeMs: 1711929600000.7, + endTimeMs: 1711929600001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const startA = BigInt(spanA.resourceSpans[0]!.scopeSpans[0]!.spans[0]!.startTimeUnixNano); + const startB = BigInt(spanB.resourceSpans[0]!.scopeSpans[0]!.spans[0]!.startTimeUnixNano); + // A should sort before B (both in the same ms but different sub-ms positions) + expect(startA).toBeLessThan(startB); + }); + + it("omits parentSpanId when not provided", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.parentSpanId).toBeUndefined(); + }); + + it("handles double values for non-integer numbers", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: { "compute.cpu": 0.25 }, + }); + + const span = 
payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + const cpu = span.attributes.find((a) => a.key === "compute.cpu"); + expect(cpu).toEqual({ key: "compute.cpu", value: { doubleValue: 0.25 } }); + }); +}); diff --git a/apps/supervisor/src/services/otlpTraceService.ts b/apps/supervisor/src/services/otlpTraceService.ts new file mode 100644 index 00000000000..da3310711d0 --- /dev/null +++ b/apps/supervisor/src/services/otlpTraceService.ts @@ -0,0 +1,104 @@ +import { randomBytes } from "crypto"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +export type OtlpTraceServiceOptions = { + endpointUrl: string; + timeoutMs?: number; +}; + +export type OtlpTraceSpan = { + traceId: string; + parentSpanId?: string; + spanName: string; + startTimeMs: number; + endTimeMs: number; + resourceAttributes: Record; + spanAttributes: Record; +}; + +export class OtlpTraceService { + private readonly logger = new SimpleStructuredLogger("otlp-trace"); + + constructor(private opts: OtlpTraceServiceOptions) {} + + /** Fire-and-forget: build payload and send to the configured OTLP endpoint */ + emit(span: OtlpTraceSpan): void { + const payload = buildPayload(span); + + fetch(`${this.opts.endpointUrl}/v1/traces`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + signal: AbortSignal.timeout(this.opts.timeoutMs ?? 5_000), + }).catch((err) => { + this.logger.warn("failed to send compute trace span", { + error: err instanceof Error ? 
err.message : String(err), + }); + }); + } +} + +// โ”€โ”€ Payload builder (internal) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +/** @internal Exported for tests only */ +export function buildPayload(span: OtlpTraceSpan) { + const spanId = randomBytes(8).toString("hex"); + + return { + resourceSpans: [ + { + resource: { + attributes: [ + { key: "$trigger", value: { boolValue: true } }, + ...toOtlpAttributes(span.resourceAttributes), + ], + }, + scopeSpans: [ + { + scope: { name: "supervisor.compute" }, + spans: [ + { + traceId: span.traceId, + spanId, + parentSpanId: span.parentSpanId, + name: span.spanName, + kind: 3, // SPAN_KIND_CLIENT + startTimeUnixNano: msToNano(span.startTimeMs), + endTimeUnixNano: msToNano(span.endTimeMs), + attributes: toOtlpAttributes(span.spanAttributes), + status: { code: 1 }, // STATUS_CODE_OK + }, + ], + }, + ], + }, + ], + }; +} + +function toOtlpAttributes( + attrs: Record +): Array<{ key: string; value: Record }> { + return Object.entries(attrs).map(([key, value]) => ({ + key, + value: toOtlpValue(value), + })); +} + +function toOtlpValue(value: string | number | boolean): Record { + if (typeof value === "string") return { stringValue: value }; + if (typeof value === "boolean") return { boolValue: value }; + if (Number.isInteger(value)) return { intValue: value }; + return { doubleValue: value }; +} + +/** + * Convert epoch milliseconds to nanosecond string, preserving sub-ms precision. + * Fractional ms from performance.now() arithmetic carry meaningful microsecond + * data that affects span sort ordering when events happen within the same ms. 
+ */ +function msToNano(ms: number): string { + const wholeMs = Math.trunc(ms); + const fracNs = Math.round((ms - wholeMs) * 1_000_000); + return String(BigInt(wholeMs) * 1_000_000n + BigInt(fracNs)); +} diff --git a/apps/supervisor/src/services/podCleaner.test.ts b/apps/supervisor/src/services/podCleaner.test.ts index 36bb5de6d1f..d6ed2bb737f 100644 --- a/apps/supervisor/src/services/podCleaner.test.ts +++ b/apps/supervisor/src/services/podCleaner.test.ts @@ -1,10 +1,11 @@ import { PodCleaner } from "./podCleaner.js"; -import { K8sApi, createK8sApi } from "../clients/kubernetes.js"; +import { type K8sApi, createK8sApi } from "../clients/kubernetes.js"; import { setTimeout } from "timers/promises"; import { describe, it, expect, beforeAll, afterEach } from "vitest"; import { Registry } from "prom-client"; -describe("PodCleaner Integration Tests", () => { +// These tests require live K8s cluster credentials - skip by default +describe.skipIf(!process.env.K8S_INTEGRATION_TESTS)("PodCleaner Integration Tests", () => { const k8s = createK8sApi(); const namespace = "integration-test"; const register = new Registry(); diff --git a/apps/supervisor/src/services/podCleaner.ts b/apps/supervisor/src/services/podCleaner.ts index 56eaaeb88af..3ac5da293df 100644 --- a/apps/supervisor/src/services/podCleaner.ts +++ b/apps/supervisor/src/services/podCleaner.ts @@ -90,7 +90,7 @@ export class PodCleaner { status: "succeeded", }); - this.logger.info("Deleted batch of pods", { continuationToken }); + this.logger.debug("Deleted batch of pods", { continuationToken }); } catch (err) { this.logger.error("Failed to delete batch of pods", { err: err instanceof Error ? 
err.message : String(err), diff --git a/apps/supervisor/src/services/timerWheel.test.ts b/apps/supervisor/src/services/timerWheel.test.ts new file mode 100644 index 00000000000..3f6bb9aa19b --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.test.ts @@ -0,0 +1,254 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { TimerWheel } from "./timerWheel.js"; + +describe("TimerWheel", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("dispatches item after delay", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "snapshot-data"); + + // Not yet + vi.advanceTimersByTime(2900); + expect(dispatched).toEqual([]); + + // After delay + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("cancels item before it fires", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(1000); + expect(wheel.cancel("run-1")).toBe(true); + + vi.advanceTimersByTime(5000); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("cancel returns false for unknown key", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + expect(wheel.cancel("nonexistent")).toBe(false); + }); + + it("deduplicates: resubmitting same key replaces the entry", () => { + const dispatched: { key: string; data: string }[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push({ key: item.key, data: item.data }), + }); + + wheel.start(); + wheel.submit("run-1", "old-data"); + + vi.advanceTimersByTime(1000); + wheel.submit("run-1", "new-data"); + + // 
Original would have fired at t=3000, but was replaced + // New one fires at t=1000+3000=4000 + vi.advanceTimersByTime(2100); + expect(dispatched).toEqual([]); + + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual([{ key: "run-1", data: "new-data" }]); + + wheel.stop(); + }); + + it("handles many concurrent items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + for (let i = 0; i < 1000; i++) { + wheel.submit(`run-${i}`, `data-${i}`); + } + expect(wheel.size).toBe(1000); + + vi.advanceTimersByTime(3100); + expect(dispatched.length).toBe(1000); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("handles items submitted at different times", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-2", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-3", "data"); + + // t=2000: nothing yet + expect(dispatched).toEqual([]); + + // t=3100: run-1 fires + vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-1"]); + + // t=4100: run-2 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2"]); + + // t=5100: run-3 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2", "run-3"]); + + wheel.stop(); + }); + + it("setDelay changes delay for new items only", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); // 3s delay + + vi.advanceTimersByTime(500); + wheel.setDelay(1000); + wheel.submit("run-2", "data"); // 1s delay + + // t=1500: run-2 should have fired (submitted at t=500 with 1s delay) + 
vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-2"]); + + // t=3100: run-1 fires at its original 3s delay + vi.advanceTimersByTime(1500); + expect(dispatched).toEqual(["run-2", "run-1"]); + + wheel.stop(); + }); + + it("stop returns unprocessed items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data-1"); + wheel.submit("run-2", "data-2"); + wheel.submit("run-3", "data-3"); + + const remaining = wheel.stop(); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + expect(remaining.length).toBe(3); + expect(remaining.map((r) => r.key).sort()).toEqual(["run-1", "run-2", "run-3"]); + expect(remaining.find((r) => r.key === "run-1")?.data).toBe("data-1"); + }); + + it("after stop, new submissions are silently dropped", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.stop(); + + wheel.submit("run-late", "data"); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + }); + + it("tracks size correctly through submit/cancel/dispatch", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + + wheel.submit("a", "data"); + wheel.submit("b", "data"); + expect(wheel.size).toBe(2); + + wheel.cancel("a"); + expect(wheel.size).toBe(1); + + vi.advanceTimersByTime(3100); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("clamps delay to valid range", () => { + const dispatched: string[] = []; + + // Very small delay (should be at least 1 tick = 100ms) + const wheel = new TimerWheel({ + delayMs: 0, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("multiple 
cancel calls are safe", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + expect(wheel.cancel("run-1")).toBe(true); + expect(wheel.cancel("run-1")).toBe(false); + + wheel.stop(); + }); +}); diff --git a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts new file mode 100644 index 00000000000..9584423824d --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.ts @@ -0,0 +1,160 @@ +/** + * TimerWheel implements a hashed timer wheel for efficiently managing large numbers + * of delayed operations with O(1) submit, cancel, and per-item dispatch. + * + * Used by the supervisor to delay snapshot requests so that short-lived waitpoints + * (e.g. triggerAndWait that resolves in <3s) skip the snapshot entirely. + * + * The wheel is a ring buffer of slots. A single setInterval advances a cursor. + * When the cursor reaches a slot, all items in that slot are dispatched. + * + * Fixed capacity: 600 slots at 100ms tick = 60s max delay. + */ + +const TICK_MS = 100; +const NUM_SLOTS = 600; // 60s max delay at 100ms tick + +export type TimerWheelItem = { + key: string; + data: T; +}; + +export type TimerWheelOptions = { + /** Called when an item's delay expires. */ + onExpire: (item: TimerWheelItem) => void; + /** Delay in milliseconds before items fire. Clamped to [100, 60000]. 
*/ + delayMs: number; +}; + +type Entry = { + key: string; + data: T; + slotIndex: number; +}; + +export class TimerWheel { + private slots: Set[]; + private entries: Map>; + private cursor: number; + private intervalId: ReturnType | null; + private onExpire: (item: TimerWheelItem) => void; + private delaySlots: number; + + constructor(opts: TimerWheelOptions) { + this.slots = Array.from({ length: NUM_SLOTS }, () => new Set()); + this.entries = new Map(); + this.cursor = 0; + this.intervalId = null; + this.onExpire = opts.onExpire; + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(opts.delayMs / TICK_MS))); + } + + /** Start the timer wheel. Must be called before submitting items. */ + start(): void { + if (this.intervalId) return; + this.intervalId = setInterval(() => this.tick(), TICK_MS); + // Don't hold the process open just for the timer wheel + if (this.intervalId && typeof this.intervalId === "object" && "unref" in this.intervalId) { + this.intervalId.unref(); + } + } + + /** + * Stop the timer wheel and return all unprocessed items. + * The wheel keeps running normally during graceful shutdown - call stop() + * only when you're ready to tear down. Caller decides what to do with leftovers. + */ + stop(): TimerWheelItem[] { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = null; + } + + const remaining: TimerWheelItem[] = []; + for (const [key, entry] of this.entries) { + remaining.push({ key, data: entry.data }); + } + + for (const slot of this.slots) { + slot.clear(); + } + this.entries.clear(); + + return remaining; + } + + /** + * Update the delay for future submissions. Already-queued items keep their original timing. + * Clamped to [TICK_MS, 60000ms]. + */ + setDelay(delayMs: number): void { + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(delayMs / TICK_MS))); + } + + /** + * Submit an item to be dispatched after the configured delay. 
+ * If an item with the same key already exists, it is replaced (dedup). + * No-op if the wheel is stopped. + */ + submit(key: string, data: T): void { + if (!this.intervalId) return; + + // Dedup: remove existing entry for this key + this.cancel(key); + + const slotIndex = (this.cursor + this.delaySlots) % NUM_SLOTS; + const entry: Entry = { key, data, slotIndex }; + + this.entries.set(key, entry); + this.slot(slotIndex).add(key); + } + + /** + * Cancel a pending item. Returns true if the item was found and removed. + */ + cancel(key: string): boolean { + const entry = this.entries.get(key); + if (!entry) return false; + + this.slot(entry.slotIndex).delete(key); + this.entries.delete(key); + return true; + } + + /** Number of pending items in the wheel. */ + get size(): number { + return this.entries.size; + } + + /** Whether the wheel is running. */ + get running(): boolean { + return this.intervalId !== null; + } + + /** Get a slot by index. The array is fully initialized so this always returns a Set. */ + private slot(index: number): Set { + const s = this.slots[index]; + if (!s) throw new Error(`TimerWheel: invalid slot index ${index}`); + return s; + } + + /** Advance the cursor and dispatch all items in the current slot. */ + private tick(): void { + this.cursor = (this.cursor + 1) % NUM_SLOTS; + const slot = this.slot(this.cursor); + + if (slot.size === 0) return; + + // Collect items to dispatch (copy keys since we mutate during iteration) + const keys = [...slot]; + slot.clear(); + + for (const key of keys) { + const entry = this.entries.get(key); + if (!entry) continue; + + this.entries.delete(key); + this.onExpire({ key, data: entry.data }); + } + } +} diff --git a/apps/supervisor/src/util.ts b/apps/supervisor/src/util.ts index 4fcda27b2aa..d14dd99bfe1 100644 --- a/apps/supervisor/src/util.ts +++ b/apps/supervisor/src/util.ts @@ -14,6 +14,18 @@ export function getDockerHostDomain() { return isMacOS || isWindows ? 
"host.docker.internal" : "localhost"; } +/** Extract the W3C traceparent string from an untyped trace context record */ +export function extractTraceparent(traceContext?: Record): string | undefined { + if ( + traceContext && + "traceparent" in traceContext && + typeof traceContext.traceparent === "string" + ) { + return traceContext.traceparent; + } + return undefined; +} + export function getRunnerId(runId: string, attemptNumber?: number) { const parts = ["runner", runId.replace("run_", "")]; @@ -23,3 +35,10 @@ export function getRunnerId(runId: string, attemptNumber?: number) { return parts.join("-"); } + +/** Derive a unique runnerId for a restore cycle using the checkpoint suffix */ +export function getRestoreRunnerId(runFriendlyId: string, checkpointId: string) { + const runIdShort = runFriendlyId.replace("run_", ""); + const checkpointSuffix = checkpointId.slice(-8); + return `runner-${runIdShort}-${checkpointSuffix}`; +} diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts new file mode 100644 index 00000000000..1c00f33aad3 --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -0,0 +1,374 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import { flattenAttributes } from "@trigger.dev/core/v3/utils/flattenAttributes"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import { ComputeClient, stripImageDigest } from "@internal/compute"; +import { extractTraceparent, getRunnerId } from "../util.js"; +import type { OtlpTraceService } from "../services/otlpTraceService.js"; +import { tryCatch } from "@trigger.dev/core"; + +type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { + gateway: { + url: string; + authToken?: string; + timeoutMs: number; + }; + snapshots: { + enabled: boolean; + delayMs: 
number; + dispatchLimit: number; + callbackUrl: string; + }; + tracing?: OtlpTraceService; + runner: { + instanceName: string; + otelEndpoint: string; + prettyLogs: boolean; + }; +}; + +export class ComputeWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); + private readonly compute: ComputeClient; + + constructor(private opts: ComputeWorkloadManagerOptions) { + if (opts.workloadApiDomain) { + this.logger.warn("โš ๏ธ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); + } + + this.compute = new ComputeClient({ + gatewayUrl: opts.gateway.url, + authToken: opts.gateway.authToken, + timeoutMs: opts.gateway.timeoutMs, + }); + } + + get snapshotsEnabled(): boolean { + return this.opts.snapshots.enabled; + } + + get snapshotDelayMs(): number { + return this.opts.snapshots.delayMs; + } + + get snapshotDispatchLimit(): number { + return this.opts.snapshots.dispatchLimit; + } + + get traceSpansEnabled(): boolean { + return !!this.opts.tracing; + } + + async create(opts: WorkloadManagerCreateOptions) { + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + const envVars: Record = { + OTEL_EXPORTER_OTLP_ENDPOINT: this.opts.runner.otelEndpoint, + TRIGGER_DEQUEUED_AT_MS: String(opts.dequeuedAt.getTime()), + TRIGGER_POD_SCHEDULED_AT_MS: String(Date.now()), + TRIGGER_ENV_ID: opts.envId, + TRIGGER_DEPLOYMENT_ID: opts.deploymentFriendlyId, + TRIGGER_DEPLOYMENT_VERSION: opts.deploymentVersion, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? 
"", + TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, + TRIGGER_RUNNER_ID: runnerId, + TRIGGER_MACHINE_CPU: String(opts.machine.cpu), + TRIGGER_MACHINE_MEMORY: String(opts.machine.memory), + PRETTY_LOGS: String(this.opts.runner.prettyLogs), + }; + + if (this.opts.warmStartUrl) { + envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; + } + + if (this.snapshotsEnabled && this.opts.metadataUrl) { + envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; + } + + if (this.opts.heartbeatIntervalSeconds) { + envVars.TRIGGER_HEARTBEAT_INTERVAL_SECONDS = String(this.opts.heartbeatIntervalSeconds); + } + + if (this.opts.snapshotPollIntervalSeconds) { + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String( + this.opts.snapshotPollIntervalSeconds + ); + } + + if (this.opts.additionalEnvVars) { + Object.assign(envVars, this.opts.additionalEnvVars); + } + + // Strip image digest - resolve by tag, not digest + const imageRef = stripImageDigest(opts.image); + + // Wide event: single canonical log line emitted in finally + const event: Record = { + // High-cardinality identifiers + runId: opts.runFriendlyId, + runnerId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + // Environment + instanceName: this.opts.runner.instanceName, + // Supervisor timing + dequeueResponseMs: opts.dequeueResponseMs, + pollingIntervalMs: opts.pollingIntervalMs, + warmStartCheckMs: opts.warmStartCheckMs, + // Request + image: imageRef, + }; + + const startMs = performance.now(); + + try { + const [error, data] = await tryCatch( + this.compute.instances.create({ + name: runnerId, + image: imageRef, + env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: 
opts.machine.name, + }, + }) + ); + + if (error) { + event.error = error instanceof Error ? error.message : String(error); + event.errorType = + error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch"; + // Intentional: errors are captured in the wide event, not thrown. This matches + // the Docker/K8s managers. The run will eventually time out if scheduling fails. + return; + } + + event.instanceId = data.id; + event.ok = true; + + // Parse timing data from compute response (optional - requires gateway timing flag) + if (data._timing) { + event.timing = data._timing; + } + + this.#emitProvisionSpan(opts, startMs, data._timing); + } finally { + event.durationMs = Math.round(performance.now() - startMs); + event.ok ??= false; + this.logger.debug("create instance", event); + } + } + + async snapshot(opts: { runnerId: string; metadata: Record }): Promise { + const [error] = await tryCatch( + this.compute.instances.snapshot(opts.runnerId, { + callback: { + url: this.opts.snapshots.callbackUrl, + metadata: opts.metadata, + }, + }) + ); + + if (error) { + this.logger.error("snapshot request failed", { + runnerId: opts.runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + this.logger.debug("snapshot request accepted", { runnerId: opts.runnerId }); + return true; + } + + async deleteInstance(runnerId: string): Promise { + const [error] = await tryCatch(this.compute.instances.delete(runnerId)); + + if (error) { + this.logger.error("delete instance failed", { + runnerId, + error: error instanceof Error ? 
error.message : String(error), + }); + return false; + } + + this.logger.debug("delete instance success", { runnerId }); + return true; + } + + #emitProvisionSpan(opts: WorkloadManagerCreateOptions, startMs: number, timing?: unknown) { + if (!this.traceSpansEnabled) return; + + const parsed = parseTraceparent(extractTraceparent(opts.traceContext)); + if (!parsed) return; + + const endMs = performance.now(); + const now = Date.now(); + const provisionStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Span starts at dequeue time so events (dequeue) render in the thin-line section + // before "Started". The actual provision call time is in provisionStartEpochMs. + // Subtract 1ms so compute span always sorts before the attempt span (same dequeue time) + const startEpochMs = opts.dequeuedAt.getTime() - 1; + + const spanAttributes: Record = { + "compute.type": "create", + "compute.provision_start_ms": provisionStartEpochMs, + ...(timing + ? (flattenAttributes(timing, "compute") as Record) + : {}), + }; + + if (opts.dequeueResponseMs !== undefined) { + spanAttributes["supervisor.dequeue_response_ms"] = opts.dequeueResponseMs; + } + if (opts.warmStartCheckMs !== undefined) { + spanAttributes["supervisor.warm_start_check_ms"] = opts.warmStartCheckMs; + } + + // Use the platform API URL, not the runner OTLP endpoint (which may be a VM gateway IP) + this.opts.tracing?.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.provision", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes, + }); + } + + async restore(opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; + machine: { cpu: number; memory: number }; + // Trace context for OTel span emission + traceContext?: Record; + 
envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }): Promise { + const metadata: Record = { + TRIGGER_RUNNER_ID: opts.runnerId, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, + }; + + this.logger.verbose("restore request body", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + }); + + const startMs = performance.now(); + + const [error] = await tryCatch( + this.compute.snapshots.restore(opts.snapshotId, { + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + }) + ); + + const durationMs = Math.round(performance.now() - startMs); + + if (error) { + this.logger.error("restore request failed", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + error: error instanceof Error ? 
error.message : String(error), + durationMs, + }); + return false; + } + + this.logger.debug("restore request success", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + durationMs, + }); + + this.#emitRestoreSpan(opts, startMs); + + return true; + } + + #emitRestoreSpan( + opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + traceContext?: Record; + envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }, + startMs: number + ) { + if (!this.traceSpansEnabled) return; + + const parsed = parseTraceparent(extractTraceparent(opts.traceContext)); + if (!parsed || !opts.envId || !opts.orgId || !opts.projectId) return; + + const endMs = performance.now(); + const now = Date.now(); + const restoreStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Subtract 1ms so restore span always sorts before the attempt span + const startEpochMs = (opts.dequeuedAt?.getTime() ?? restoreStartEpochMs) - 1; + + this.opts.tracing?.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.restore", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes: { + "compute.type": "restore", + "compute.snapshot_id": opts.snapshotId, + }, + }); + } +} diff --git a/apps/supervisor/src/workloadManager/docker.ts b/apps/supervisor/src/workloadManager/docker.ts index d6651d325a2..66405df9ba5 100644 --- a/apps/supervisor/src/workloadManager/docker.ts +++ b/apps/supervisor/src/workloadManager/docker.ts @@ -62,7 +62,7 @@ export class DockerWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("create()", { opts }); + this.logger.verbose("create()", { opts }); const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); diff --git 
a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index 16c5eff9da1..b2ed05c9f11 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -100,7 +100,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("[KubernetesWorkloadManager] Creating container", { opts }); + this.logger.verbose("[KubernetesWorkloadManager] Creating container", { opts }); const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); @@ -120,7 +120,8 @@ export class KubernetesWorkloadManager implements WorkloadManager { }, spec: { ...this.addPlacementTags(this.#defaultPodSpec, opts.placementTags), - affinity: this.#getAffinity(opts.machine, opts.projectId), + affinity: this.#getAffinity(opts), + tolerations: this.#getScheduleTolerations(this.#isScheduledRun(opts)), terminationGracePeriodSeconds: 60 * 60, containers: [ { @@ -320,6 +321,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { }, } : {}), + ...(env.KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED + ? { + dnsConfig: { + options: [{ name: "ndots", value: `${env.KUBERNETES_POD_DNS_NDOTS}` }], + }, + } + : {}), }; } @@ -335,14 +343,30 @@ export class KubernetesWorkloadManager implements WorkloadManager { }; } + #isScheduledRun(opts: WorkloadManagerCreateOptions): boolean { + return opts.annotations?.rootTriggerSource === "schedule"; + } + #getSharedLabels(opts: WorkloadManagerCreateOptions): Record { - return { + const labels: Record = { env: opts.envId, envtype: this.#envTypeToLabelValue(opts.envType), org: opts.orgId, project: opts.projectId, machine: opts.machine.name, + // We intentionally use a boolean label rather than exposing the full trigger source + // (e.g. sdk, api, cli, mcp, schedule) to keep label cardinality low in metrics. 
+ // The schedule vs non-schedule distinction is all we need for the current metrics + // and pool-level scheduling decisions; finer-grained source breakdowns live in run annotations. + scheduled: String(this.#isScheduledRun(opts)), }; + + // Add privatelink label for CiliumNetworkPolicy matching + if (opts.hasPrivateLink) { + labels.privatelink = opts.orgId; + } + + return labels; } #getResourceRequestsForMachine(preset: MachinePreset): ResourceQuantities { @@ -390,22 +414,43 @@ export class KubernetesWorkloadManager implements WorkloadManager { return preset.name.startsWith("large-"); } - #getAffinity(preset: MachinePreset, projectId: string): k8s.V1Affinity | undefined { - const nodeAffinity = this.#getNodeAffinityRules(preset); - const podAffinity = this.#getProjectPodAffinity(projectId); - - if (!nodeAffinity && !podAffinity) { + #getAffinity(opts: WorkloadManagerCreateOptions): k8s.V1Affinity | undefined { + const largeNodeAffinity = this.#getNodeAffinityRules(opts.machine); + const scheduleNodeAffinity = this.#getScheduleNodeAffinityRules(this.#isScheduledRun(opts)); + const podAffinity = this.#getProjectPodAffinity(opts.projectId); + + // Merge node affinity rules from multiple sources + const preferred = [ + ...(largeNodeAffinity?.preferredDuringSchedulingIgnoredDuringExecution ?? []), + ...(scheduleNodeAffinity?.preferredDuringSchedulingIgnoredDuringExecution ?? []), + ]; + // Only large machine affinity produces hard requirements (non-large runs must stay off the large pool). + // Schedule affinity is soft both ways. + const required = [ + ...(largeNodeAffinity?.requiredDuringSchedulingIgnoredDuringExecution?.nodeSelectorTerms ?? 
[]), + ]; + + const hasNodeAffinity = preferred.length > 0 || required.length > 0; + + if (!hasNodeAffinity && !podAffinity) { return undefined; } return { - ...(nodeAffinity && { nodeAffinity }), + ...(hasNodeAffinity && { + nodeAffinity: { + ...(preferred.length > 0 && { preferredDuringSchedulingIgnoredDuringExecution: preferred }), + ...(required.length > 0 && { + requiredDuringSchedulingIgnoredDuringExecution: { nodeSelectorTerms: required }, + }), + }, + }), ...(podAffinity && { podAffinity }), }; } #getNodeAffinityRules(preset: MachinePreset): k8s.V1NodeAffinity | undefined { - if (!env.KUBERNETES_LARGE_MACHINE_POOL_LABEL) { + if (!env.KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED) { return undefined; } @@ -414,13 +459,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { return { preferredDuringSchedulingIgnoredDuringExecution: [ { - weight: 100, + weight: env.KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT, preference: { matchExpressions: [ { - key: "node.cluster.x-k8s.io/machinepool", + key: env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY, operator: "In", - values: [env.KUBERNETES_LARGE_MACHINE_POOL_LABEL], + values: [env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE], }, ], }, @@ -436,9 +481,9 @@ export class KubernetesWorkloadManager implements WorkloadManager { { matchExpressions: [ { - key: "node.cluster.x-k8s.io/machinepool", + key: env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY, operator: "NotIn", - values: [env.KUBERNETES_LARGE_MACHINE_POOL_LABEL], + values: [env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE], }, ], }, @@ -447,6 +492,58 @@ export class KubernetesWorkloadManager implements WorkloadManager { }; } + #getScheduleNodeAffinityRules(isScheduledRun: boolean): k8s.V1NodeAffinity | undefined { + if (!env.KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE) { + return undefined; + } + + if (isScheduledRun) { + // soft preference for the schedule pool + return { + 
preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, + operator: "In", + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + }, + ], + }; + } + + // soft anti-affinity: non-schedule runs prefer to avoid the schedule pool + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, + operator: "NotIn", + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + }, + ], + }; + } + + #getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined { + if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS?.length) { + return undefined; + } + + return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS; + } + #getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined { if (!env.KUBERNETES_PROJECT_AFFINITY_ENABLED) { return undefined; diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts index 90b61957795..86199afe469 100644 --- a/apps/supervisor/src/workloadManager/types.ts +++ b/apps/supervisor/src/workloadManager/types.ts @@ -1,4 +1,4 @@ -import type { EnvironmentType, MachinePreset, PlacementTag } from "@trigger.dev/core/v3"; +import type { EnvironmentType, MachinePreset, PlacementTag, RunAnnotations } from "@trigger.dev/core/v3"; export interface WorkloadManagerOptions { workloadApiProtocol: "http" | "https"; @@ -24,6 +24,10 @@ export interface WorkloadManagerCreateOptions { nextAttemptNumber?: number; dequeuedAt: Date; placementTags?: PlacementTag[]; + // Timing context (populated by supervisor handler, included in wide event) + dequeueResponseMs?: number; + pollingIntervalMs?: number; + 
warmStartCheckMs?: number; // identifiers envId: string; envType: EnvironmentType; @@ -35,4 +39,9 @@ export interface WorkloadManagerCreateOptions { runFriendlyId: string; snapshotId: string; snapshotFriendlyId: string; + // Trace context for OTel span emission (W3C format: { traceparent: "00-...", tracestate?: "..." }) + traceContext?: Record; + annotations?: RunAnnotations; + // private networking + hasPrivateLink?: boolean; } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 35d53d36099..bd38cc8700f 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -24,6 +24,13 @@ import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOn import { type IncomingMessage } from "node:http"; import { register } from "../metrics.js"; import { env } from "../env.js"; +import { SnapshotCallbackPayloadSchema } from "@internal/compute"; +import { + ComputeSnapshotService, + type RunTraceContext, +} from "../services/computeSnapshotService.js"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import type { OtlpTraceService } from "../services/otlpTraceService.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -58,10 +65,13 @@ type WorkloadServerOptions = { host?: string; workerClient: SupervisorHttpClient; checkpointClient?: CheckpointClient; + computeManager?: ComputeWorkloadManager; + tracing?: OtlpTraceService; }; export class WorkloadServer extends EventEmitter { private checkpointClient?: CheckpointClient; + private readonly snapshotService?: ComputeSnapshotService; private readonly logger = new SimpleStructuredLogger("workload-server"); @@ -94,6 +104,14 @@ export class WorkloadServer extends EventEmitter { this.workerClient = opts.workerClient; this.checkpointClient = opts.checkpointClient; + if (opts.computeManager?.snapshotsEnabled) { + this.snapshotService = new 
ComputeSnapshotService({ + computeManager: opts.computeManager, + workerClient: opts.workerClient, + tracing: opts.tracing, + }); + } + this.httpServer = this.createHttpServer({ host, port }); this.websocketServer = this.createWebsocketServer(); } @@ -229,13 +247,28 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, handler: async ({ reply, params, req }) => { - this.logger.debug("Suspend request", { params, headers: req.headers }); + const runnerId = this.runnerIdFromRequest(req); + const deploymentVersion = this.deploymentVersionFromRequest(req); + const projectRef = this.projectRefFromRequest(req); - if (!this.checkpointClient) { + this.logger.debug("Suspend request", { + params, + runnerId, + deploymentVersion, + projectRef, + }); + + if (!runnerId || !deploymentVersion || !projectRef) { + this.logger.error("Invalid headers for suspend request", { + ...params, + runnerId, + deploymentVersion, + projectRef, + }); reply.json( { ok: false, - error: "Checkpoints disabled", + error: "Invalid headers", } satisfies WorkloadSuspendRunResponseBody, false, 400 @@ -243,19 +276,25 @@ export class WorkloadServer extends EventEmitter { return; } - const runnerId = this.runnerIdFromRequest(req); - const deploymentVersion = this.deploymentVersionFromRequest(req); - const projectRef = this.projectRefFromRequest(req); + if (this.snapshotService) { + // Compute mode: delay snapshot to avoid wasted work on short-lived waitpoints. + // If the run continues before the delay expires, the snapshot is cancelled. 
+ reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - if (!runnerId || !deploymentVersion || !projectRef) { - this.logger.error("Invalid headers for suspend request", { - ...params, - headers: req.headers, + this.snapshotService.schedule(params.runFriendlyId, { + runnerId, + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, }); + + return; + } + + if (!this.checkpointClient) { reply.json( { ok: false, - error: "Invalid headers", + error: "Checkpoints disabled", } satisfies WorkloadSuspendRunResponseBody, false, 400 @@ -298,6 +337,9 @@ export class WorkloadServer extends EventEmitter { handler: async ({ req, reply, params }) => { this.logger.debug("Run continuation request", { params }); + // Cancel any pending delayed snapshot for this run + this.snapshotService?.cancel(params.runFriendlyId); + const continuationResult = await this.workerClient.continueRunExecution( params.runFriendlyId, params.snapshotFriendlyId, @@ -394,6 +436,20 @@ export class WorkloadServer extends EventEmitter { }); } + // Compute snapshot callback endpoint + httpServer.route("/api/v1/compute/snapshot-complete", "POST", { + bodySchema: SnapshotCallbackPayloadSchema, + handler: async ({ reply, body }) => { + if (!this.snapshotService) { + reply.empty(404); + return; + } + + const result = await this.snapshotService.handleCallback(body); + reply.empty(result.status); + }, + }); + return httpServer; } @@ -408,7 +464,7 @@ export class WorkloadServer extends EventEmitter { > = io.of("/workload"); websocketServer.on("disconnect", (socket) => { - this.logger.log("[WS] disconnect", socket.id); + this.logger.verbose("[WS] disconnect", socket.id); }); websocketServer.use(async (socket, next) => { const setSocketDataFromHeader = ( @@ -490,7 +546,7 @@ export class WorkloadServer extends EventEmitter { socket.data.runFriendlyId = undefined; }; - socketLogger.log("wsServer socket connected", { ...getSocketMetadata() }); + 
socketLogger.debug("wsServer socket connected", { ...getSocketMetadata() }); // FIXME: where does this get set? if (socket.data.runFriendlyId) { @@ -498,7 +554,11 @@ export class WorkloadServer extends EventEmitter { } socket.on("disconnecting", (reason, description) => { - socketLogger.log("Socket disconnecting", { ...getSocketMetadata(), reason, description }); + socketLogger.verbose("Socket disconnecting", { + ...getSocketMetadata(), + reason, + description, + }); if (socket.data.runFriendlyId) { runDisconnected(socket.data.runFriendlyId); @@ -506,7 +566,7 @@ export class WorkloadServer extends EventEmitter { }); socket.on("disconnect", (reason, description) => { - socketLogger.log("Socket disconnected", { ...getSocketMetadata(), reason, description }); + socketLogger.debug("Socket disconnected", { ...getSocketMetadata(), reason, description }); }); socket.on("error", (error) => { @@ -527,7 +587,7 @@ export class WorkloadServer extends EventEmitter { ...message, }); - log.log("Handling run:start"); + log.debug("Handling run:start"); try { runConnected(message.run.friendlyId); @@ -543,10 +603,13 @@ export class WorkloadServer extends EventEmitter { ...message, }); - log.log("Handling run:stop"); + log.debug("Handling run:stop"); try { runDisconnected(message.run.friendlyId); + // Don't delete trace context here - run:stop fires after each snapshot/shutdown + // but the run may be restored on a new VM and snapshot again. Trace context is + // re-populated on dequeue, and entries are small (4 strings per run). 
} catch (error) { log.error("run:stop error", { error }); } @@ -588,11 +651,16 @@ export class WorkloadServer extends EventEmitter { } } + registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + this.snapshotService?.registerTraceContext(runFriendlyId, ctx); + } + async start() { await this.httpServer.start(); } async stop() { + this.snapshotService?.stop(); await this.httpServer.stop(); } } diff --git a/apps/webapp/CLAUDE.md b/apps/webapp/CLAUDE.md new file mode 100644 index 00000000000..a4de6ab57b7 --- /dev/null +++ b/apps/webapp/CLAUDE.md @@ -0,0 +1,131 @@ +# Webapp + +Remix 2.17.4 app serving as the main API, dashboard, and orchestration engine. Uses an Express server (`server.ts`). + +## Verifying Changes + +**Never run `pnpm run build --filter webapp` to verify changes.** Building proves almost nothing about correctness. The webapp is an app, not a public package โ€” use typecheck from the repo root: + +```bash +pnpm run typecheck --filter webapp # ~1-2 minutes +``` + +Only run typecheck after major changes (new files, significant refactors, schema changes). For small edits, trust the types and let CI catch issues. + +Note: Public packages (`packages/*`) use `build` instead. See the root CLAUDE.md for details. + +## Testing Dashboard Changes with Chrome DevTools MCP + +Use the `chrome-devtools` MCP server to visually verify local dashboard changes. The webapp must be running (`pnpm run dev --filter webapp` from repo root). + +### Login + +``` +1. mcp__chrome-devtools__new_page(url: "http://localhost:3030") + โ†’ Redirects to /login +2. mcp__chrome-devtools__click the "Continue with Email" link +3. mcp__chrome-devtools__fill the email field with "local@trigger.dev" +4. mcp__chrome-devtools__click "Send a magic link" + โ†’ Auto-logs in and redirects to the dashboard (no email verification needed locally) +``` + +### Navigating and Verifying + +- **take_snapshot**: Get an a11y tree of the page (text content, element UIDs for interaction). 
Prefer this over screenshots for understanding page structure. +- **take_screenshot**: Capture what the page looks like visually. Use to verify styling, layout, and visual changes. +- **navigate_page**: Go to specific URLs, e.g. `http://localhost:3030/orgs/references-bc08/projects/hello-world-SiWs/env/dev/runs` +- **click / fill**: Interact with elements using UIDs from `take_snapshot`. +- **evaluate_script**: Run JS in the browser console for debugging. +- **list_console_messages**: Check for console errors after navigating. + +### Tips + +- Snapshots can be very large on complex pages (200K+ chars). Use `take_screenshot` first to orient, then `take_snapshot` only when you need element UIDs to interact. +- The local seeded user email is `local@trigger.dev`. +- Dashboard URL pattern: `http://localhost:3030/orgs/{orgSlug}/projects/{projectSlug}/env/{envSlug}/{section}` + +## Key File Locations + +- **Trigger API**: `app/routes/api.v1.tasks.$taskId.trigger.ts` +- **Batch trigger**: `app/routes/api.v1.tasks.batch.ts` +- **OTEL endpoints**: `app/routes/otel.v1.logs.ts`, `app/routes/otel.v1.traces.ts` +- **Prisma setup**: `app/db.server.ts` +- **Run engine config**: `app/v3/runEngine.server.ts` +- **Services**: `app/v3/services/**/*.server.ts` +- **Presenters**: `app/v3/presenters/**/*.server.ts` + +## Route Convention + +Routes use Remix flat-file convention with dot-separated segments: +`api.v1.tasks.$taskId.trigger.ts` -> `/api/v1/tasks/:taskId/trigger` + +## Abort Signals + +**Never use `request.signal`** for detecting client disconnects. It is broken due to a Node.js bug ([nodejs/node#55428](https://github.com/nodejs/node/issues/55428)) where the AbortSignal chain is severed when Remix internally clones the Request object. Instead, use `getRequestAbortSignal()` from `app/services/httpAsyncStorage.server.ts`, which is wired directly to Express `res.on("close")` and fires reliably. 
+ +```typescript +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; + +// In route handlers, SSE streams, or any server-side code: +const signal = getRequestAbortSignal(); +``` + +## Environment Variables + +Access via `env` export from `app/env.server.ts`. **Never use `process.env` directly.** + +For testable code, **never import env.server.ts** in test files. Pass configuration as options instead: +- `realtimeClient.server.ts` (testable service, takes config as constructor arg) +- `realtimeClientGlobal.server.ts` (creates singleton with env config) + +## Run Engine 2.0 + +The webapp integrates `@internal/run-engine` via `app/v3/runEngine.server.ts`. This is the singleton engine instance. Services in `app/v3/services/` call engine methods for all run lifecycle operations (triggering, completing, cancelling, etc.). + +The `engineVersion.server.ts` file determines V1 vs V2 for a given environment. New code should always target V2. + +## Background Workers + +Background job workers use `@trigger.dev/redis-worker`: +- `app/v3/commonWorker.server.ts` +- `app/v3/alertsWorker.server.ts` +- `app/v3/batchTriggerWorker.server.ts` + +Do NOT add new jobs using zodworker/graphile-worker (legacy). + +## Real-time + +- Socket.io: `app/v3/handleSocketIo.server.ts`, `app/v3/handleWebsockets.server.ts` +- Electric SQL: Powers real-time data sync for the dashboard + +## Legacy V1 Code + +The `app/v3/` directory name is misleading - most code is actively used by V2. Only these specific files are V1-only legacy: +- `app/v3/marqs/` (old MarQS queue system) +- `app/v3/legacyRunEngineWorker.server.ts` +- `app/v3/services/triggerTaskV1.server.ts` +- `app/v3/services/cancelTaskRunV1.server.ts` +- `app/v3/authenticatedSocketConnection.server.ts` +- `app/v3/sharedSocketConnection.ts` + +Some services (e.g., `cancelTaskRun.server.ts`, `batchTriggerV3.server.ts`) branch on `RunEngineVersion` to support both V1 and V2. When editing these, only modify V2 code paths. 
+ +## Performance: Trigger Hot Path + +The `triggerTask.server.ts` service is the **highest-throughput code path** in the system. Every API trigger call goes through it. Keep it fast: + +- **Do NOT add database queries** to `triggerTask.server.ts` or `batchTriggerV3.server.ts`. Task defaults (TTL, etc.) are resolved via `backgroundWorkerTask.findFirst()` in the queue concern (`queues.server.ts`) - one query per request, in mutually exclusive branches depending on locked/non-locked path. Piggyback on the existing query instead of adding new ones. +- **Two-stage resolution pattern**: Task metadata is resolved in two stages by design: + 1. **Trigger time** (`triggerTask.server.ts`): Only TTL is resolved from task defaults. Everything else uses whatever the caller provides. + 2. **Dequeue time** (`dequeueSystem.ts`): Full `BackgroundWorkerTask` is loaded and retry config, machine config, maxDuration, etc. are resolved against task defaults. +- If you need to add a new task-level default, **add it to the existing `select` clause** in the `backgroundWorkerTask.findFirst()` query โ€” do NOT add a second query. If the default doesn't need to be known at trigger time, resolve it at dequeue time instead. +- Batch triggers (`batchTriggerV3.server.ts`) follow the same pattern โ€” keep batch paths equally fast. + +## Prisma Query Patterns + +- **Always use `findFirst` instead of `findUnique`.** Prisma's `findUnique` has an implicit DataLoader that batches concurrent calls into a single `IN` query. This batching cannot be disabled and has active bugs even in Prisma 6.x: uppercase UUIDs returning null (#25484, confirmed 6.4.1), composite key SQL correctness issues (#22202), and 5-10x worse performance than manual DataLoader (#6573, open since 2021). `findFirst` is never batched and avoids this entire class of issues. 
+ +## React Patterns + +- Only use `useCallback`/`useMemo` for context provider values, expensive derived data that is a dependency elsewhere, or stable refs required by a dependency array. Don't wrap ordinary event handlers or trivial computations. +- Use named constants for sentinel/placeholder values (e.g. `const UNSET_VALUE = "__unset__"`) instead of raw string literals scattered across comparisons. diff --git a/apps/webapp/app/assets/icons/AIMetricsIcon.tsx b/apps/webapp/app/assets/icons/AIMetricsIcon.tsx new file mode 100644 index 00000000000..038eea70b49 --- /dev/null +++ b/apps/webapp/app/assets/icons/AIMetricsIcon.tsx @@ -0,0 +1,16 @@ +export function AIMetricsIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AIPromptsIcon.tsx b/apps/webapp/app/assets/icons/AIPromptsIcon.tsx new file mode 100644 index 00000000000..dd434df9931 --- /dev/null +++ b/apps/webapp/app/assets/icons/AIPromptsIcon.tsx @@ -0,0 +1,10 @@ +export function AIPromptsIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AiProviderIcons.tsx b/apps/webapp/app/assets/icons/AiProviderIcons.tsx new file mode 100644 index 00000000000..85a01b98d63 --- /dev/null +++ b/apps/webapp/app/assets/icons/AiProviderIcons.tsx @@ -0,0 +1,177 @@ +type IconProps = { className?: string }; + +export function OpenAIIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function AnthropicIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function GeminiIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function LlamaIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function DeepseekIcon({ className }: IconProps) { + return ( + + + + + + + + + + + ); +} + +export function XAIIcon({ className }: IconProps) { + return ( + + + + + + + ); +} + +export function PerplexityIcon({ className }: IconProps) { + return ( + + + + ); +} + 
+export function CerebrasIcon({ className }: IconProps) { + return ( + + + + + + + + ); +} + +export function MistralIcon({ className }: IconProps) { + return ( + + + + + + + + + + + + + ); +} + +export function AzureIcon({ className }: IconProps) { + return ( + + + + ); +} + diff --git a/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx b/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx new file mode 100644 index 00000000000..3e647284cce --- /dev/null +++ b/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx @@ -0,0 +1,12 @@ +export function AnthropicLogoIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/SlackMonoIcon.tsx b/apps/webapp/app/assets/icons/SlackMonoIcon.tsx new file mode 100644 index 00000000000..666393a229d --- /dev/null +++ b/apps/webapp/app/assets/icons/SlackMonoIcon.tsx @@ -0,0 +1,10 @@ +export function SlackMonoIcon({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/components/BackgroundWrapper.tsx b/apps/webapp/app/components/BackgroundWrapper.tsx index ecff3af6dd4..aaf06d56aaf 100644 --- a/apps/webapp/app/components/BackgroundWrapper.tsx +++ b/apps/webapp/app/components/BackgroundWrapper.tsx @@ -5,10 +5,9 @@ import blurredDashboardBackgroundTable from "~/assets/images/blurred-dashboard-b export function BackgroundWrapper({ children }: { children: ReactNode }) { return ( -
- {/* Left menu top background - fixed width 260px, maintains aspect ratio */} +
- {/* Left menu bottom background - fixed width 260px, maintains aspect ratio */}
- {/* Right table background - fixed width 2000px, positioned next to menu */}
- {/* Content layer */}
{children}
); diff --git a/apps/webapp/app/components/BlankStatePanels.tsx b/apps/webapp/app/components/BlankStatePanels.tsx index 7423bd61ac8..fe39f6785c5 100644 --- a/apps/webapp/app/components/BlankStatePanels.tsx +++ b/apps/webapp/app/components/BlankStatePanels.tsx @@ -8,10 +8,10 @@ import { QuestionMarkCircleIcon, RectangleGroupIcon, RectangleStackIcon, - ServerStackIcon, Squares2X2Icon, } from "@heroicons/react/20/solid"; import { useLocation } from "react-use"; +import { AIPromptsIcon } from "~/assets/icons/AIPromptsIcon"; import { BranchEnvironmentIconSmall } from "~/assets/icons/EnvironmentIcons"; import { WaitpointTokenIcon } from "~/assets/icons/WaitpointTokenIcon"; import openBulkActionsPanel from "~/assets/images/open-bulk-actions-panel.png"; @@ -23,21 +23,28 @@ import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; import { type MinimumEnvironment } from "~/presenters/SelectBestEnvironmentPresenter.server"; import { NewBranchPanel } from "~/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.branches/route"; +import { GitHubSettingsPanel } from "~/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.github"; import { docsPath, v3BillingPath, v3CreateBulkActionPath, v3EnvironmentPath, - v3EnvironmentVariablesPath, v3NewProjectAlertPath, v3NewSchedulePath, } from "~/utils/pathBuilder"; import { AskAI } from "./AskAI"; +import { CodeBlock } from "./code/CodeBlock"; import { InlineCode } from "./code/InlineCode"; import { environmentFullTitle, EnvironmentIcon } from "./environments/EnvironmentLabel"; import { Feedback } from "./Feedback"; import { EnvironmentSelector } from "./navigation/EnvironmentSelector"; import { Button, LinkButton } from "./primitives/Buttons"; +import { + ClientTabs, + ClientTabsContent, + ClientTabsList, + ClientTabsTrigger, +} from "./primitives/ClientTabs"; import { Header1 } from "./primitives/Headers"; import { InfoPanel } from 
"./primitives/InfoPanel"; import { Paragraph } from "./primitives/Paragraph"; @@ -52,13 +59,6 @@ import { } from "./SetupCommands"; import { StepContentContainer } from "./StepContentContainer"; import { V4Badge } from "./V4Badge"; -import { - ClientTabs, - ClientTabsContent, - ClientTabsList, - ClientTabsTrigger, -} from "./primitives/ClientTabs"; -import { GitHubSettingsPanel } from "~/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.github"; export function HasNoTasksDev() { return ( @@ -601,7 +601,9 @@ function DeploymentOnboardingSteps() {
- Deploy your tasks to {environmentFullTitle(environment)} + + Deploy your tasks to {environmentFullTitle(environment)} +
@@ -686,3 +688,50 @@ function DeploymentOnboardingSteps() { ); } + +export function PromptsNone() { + return ( + + Prompts docs + + } + > + + Managed prompts let you define AI prompts in code with typesafe variables, then edit and + version them from the dashboard without redeploying. + + + Add a prompt to your project using prompts.define() + : + + + + Deploy your project and your prompts will appear here with version history and a live + editor. + + + ); +} diff --git a/apps/webapp/app/components/BulkActionFilterSummary.tsx b/apps/webapp/app/components/BulkActionFilterSummary.tsx index 073940d7d0a..a230e70b346 100644 --- a/apps/webapp/app/components/BulkActionFilterSummary.tsx +++ b/apps/webapp/app/components/BulkActionFilterSummary.tsx @@ -228,6 +228,18 @@ export function BulkActionFilterSummary({ /> ); } + case "errorId": { + return ( + + ); + } default: { assertNever(typedKey); } diff --git a/apps/webapp/app/components/GitMetadata.tsx b/apps/webapp/app/components/GitMetadata.tsx index efe3fb0efb7..fb53ee6bfea 100644 --- a/apps/webapp/app/components/GitMetadata.tsx +++ b/apps/webapp/app/components/GitMetadata.tsx @@ -25,9 +25,10 @@ export function GitMetadataBranch({ } + leadingIconClassName="group-hover/table-row:text-text-bright" iconSpacing="gap-x-1" to={git.branchUrl} - className="pl-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" > {git.branchName} @@ -49,8 +50,9 @@ export function GitMetadataCommit({ variant="minimal/small" to={git.commitUrl} LeadingIcon={} + leadingIconClassName="group-hover/table-row:text-text-bright" iconSpacing="gap-x-1" - className="pl-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" > {`${git.shortSha} / ${git.commitMessage}`} @@ -74,8 +76,9 @@ export function GitMetadataPullRequest({ variant="minimal/small" to={git.pullRequestUrl} LeadingIcon={} + leadingIconClassName="group-hover/table-row:text-text-bright" 
iconSpacing="gap-x-1" - className="pl-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" > #{git.pullRequestNumber} {git.pullRequestTitle} diff --git a/apps/webapp/app/components/LoginPageLayout.tsx b/apps/webapp/app/components/LoginPageLayout.tsx index d9ac7ceb4d7..3e42cd6894f 100644 --- a/apps/webapp/app/components/LoginPageLayout.tsx +++ b/apps/webapp/app/components/LoginPageLayout.tsx @@ -46,10 +46,10 @@ export function LoginPageLayout({ children }: { children: React.ReactNode }) { }, []); return ( -
-
-
-
+
+
+
+
@@ -63,12 +63,12 @@ export function LoginPageLayout({ children }: { children: React.ReactNode }) {
{children}
- Having login issues? Email us{" "} + Having login issues? Email us{" "} or ask us in Discord
-
+
{randomQuote?.quote} diff --git a/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx b/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx new file mode 100644 index 00000000000..df8669d36dd --- /dev/null +++ b/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx @@ -0,0 +1,290 @@ +import { useFetcher } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import stableStringify from "json-stable-stringify"; +import { + Dialog, + DialogContent, + DialogHeader, + DialogDescription, + DialogFooter, +} from "~/components/primitives/Dialog"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { LockClosedIcon } from "@heroicons/react/20/solid"; +import { CheckboxWithLabel } from "~/components/primitives/Checkbox"; +import { cn } from "~/utils/cn"; +import { FEATURE_FLAG, ORG_LOCKED_FLAGS, type FlagControlType } from "~/v3/featureFlags"; +import { + UNSET_VALUE, + BooleanControl, + EnumControl, + StringControl, + WorkerGroupControl, + type WorkerGroup, +} from "./FlagControls"; + +type LoaderData = { + org: { id: string; title: string; slug: string }; + orgFlags: Record; + globalFlags: Record; + controlTypes: Record; + workerGroupName?: string; + workerGroups?: WorkerGroup[]; + isManagedCloud?: boolean; +}; + +type ActionData = { + success?: boolean; + error?: string; +}; + +type FeatureFlagsDialogProps = { + orgId: string | null; + orgTitle: string; + open: boolean; + onOpenChange: (open: boolean) => void; +}; + +export function FeatureFlagsDialog({ + orgId, + orgTitle, + open, + onOpenChange, +}: FeatureFlagsDialogProps) { + const loadFetcher = useFetcher(); + const saveFetcher = useFetcher(); + + const [overrides, setOverrides] = useState>({}); + const [initialOverrides, setInitialOverrides] = useState>({}); + const [saveError, setSaveError] = useState(null); + const [unlocked, setUnlocked] = useState(false); + + const isLocked = (key: string) => !unlocked && 
ORG_LOCKED_FLAGS.includes(key); + + useEffect(() => { + if (open && orgId) { + setSaveError(null); + setOverrides({}); + setInitialOverrides({}); + loadFetcher.load(`/admin/api/v2/orgs/${orgId}/feature-flags`); + } + }, [open, orgId]); + + useEffect(() => { + if (loadFetcher.data) { + const loaded = loadFetcher.data.orgFlags ?? {}; + setOverrides({ ...loaded }); + setInitialOverrides({ ...loaded }); + } + }, [loadFetcher.data]); + + useEffect(() => { + if (saveFetcher.data?.success) { + onOpenChange(false); + } else if (saveFetcher.data?.error) { + setSaveError(saveFetcher.data.error); + } + }, [saveFetcher.data]); + + const isDirty = stableStringify(overrides) !== stableStringify(initialOverrides); + + const setFlagValue = (key: string, value: unknown) => { + setOverrides((prev) => ({ ...prev, [key]: value })); + }; + + const unsetFlag = (key: string) => { + setOverrides((prev) => { + const next = { ...prev }; + delete next[key]; + return next; + }); + }; + + const handleSave = () => { + if (!orgId) return; + const body = Object.keys(overrides).length === 0 ? null : overrides; + saveFetcher.submit(JSON.stringify(body), { + method: "POST", + action: `/admin/api/v2/orgs/${orgId}/feature-flags`, + encType: "application/json", + }); + }; + + const data = loadFetcher.data; + const isLoading = loadFetcher.state === "loading"; + const isSaving = saveFetcher.state === "submitting"; + + const jsonPreview = + Object.keys(overrides).length === 0 ? "null" : JSON.stringify(overrides, null, 2); + + const sortedFlagKeys = data ? Object.keys(data.controlTypes).sort() : []; + + return ( + + + Feature flags - {orgTitle} + + Org-level overrides. Unset flags inherit from global defaults. + + + {data && ( +
+ +
+ )} + +
+ {isLoading ? ( +
Loading flags...
+ ) : data ? ( +
+ {sortedFlagKeys.map((key) => { + const control = data.controlTypes[key]; + const locked = isLocked(key); + const globalValue = data.globalFlags[key as keyof typeof data.globalFlags]; + const isWorkerGroup = key === FEATURE_FLAG.defaultWorkerInstanceGroupId; + const globalDisplay = + isWorkerGroup && data.workerGroupName && globalValue !== undefined + ? `${data.workerGroupName} (${String(globalValue).slice(0, 8)}...)` + : globalValue !== undefined + ? String(globalValue) + : "unset"; + + if (locked) { + return ( +
+
+
{key}
+
global: {globalDisplay}
+
+ +
+ ); + } + + const isOverridden = key in overrides; + + return ( +
+
+
+ {key} +
+
global: {globalDisplay}
+
+ +
+ + + {isWorkerGroup && data.workerGroups ? ( + { + if (val === UNSET_VALUE) { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : control.type === "boolean" ? ( + setFlagValue(key, val)} + dimmed={!isOverridden} + /> + ) : control.type === "enum" ? ( + { + if (val === UNSET_VALUE) { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : control.type === "string" ? ( + { + if (val === "") { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : null} +
+
+ ); + })} +
+ ) : null} +
+ + {data && ( +
+ + Preview JSON + +
+              {jsonPreview}
+            
+
+ )} + + {saveError && {saveError}} + + + + + +
+
+ ); +} diff --git a/apps/webapp/app/components/admin/FlagControls.tsx b/apps/webapp/app/components/admin/FlagControls.tsx new file mode 100644 index 00000000000..b08f925dd90 --- /dev/null +++ b/apps/webapp/app/components/admin/FlagControls.tsx @@ -0,0 +1,120 @@ +import { Switch } from "~/components/primitives/Switch"; +import { Select, SelectItem } from "~/components/primitives/Select"; +import { Input } from "~/components/primitives/Input"; +import { cn } from "~/utils/cn"; + +export const UNSET_VALUE = "__unset__"; + +export function BooleanControl({ + value, + onChange, + dimmed, +}: { + value: boolean | undefined; + onChange: (val: boolean) => void; + dimmed: boolean; +}) { + return ( + + ); +} + +export function EnumControl({ + value, + options, + onChange, + dimmed, +}: { + value: string | undefined; + options: string[]; + onChange: (val: string) => void; + dimmed: boolean; +}) { + const items = [UNSET_VALUE, ...options]; + + return ( + + ); +} + +export type WorkerGroup = { id: string; name: string }; + +export function WorkerGroupControl({ + value, + workerGroups, + onChange, + dimmed, +}: { + value: string | undefined; + workerGroups: WorkerGroup[]; + onChange: (val: string) => void; + dimmed: boolean; +}) { + const items = [UNSET_VALUE, ...workerGroups.map((wg) => wg.id)]; + + return ( + + ); +} + +export function StringControl({ + value, + onChange, + dimmed, +}: { + value: string; + onChange: (val: string) => void; + dimmed: boolean; +}) { + return ( + onChange(e.target.value)} + placeholder="unset" + className={cn("w-40", dimmed && "opacity-50")} + /> + ); +} diff --git a/apps/webapp/app/components/code/CodeBlock.tsx b/apps/webapp/app/components/code/CodeBlock.tsx index 8757acc324e..cc28c8142d3 100644 --- a/apps/webapp/app/components/code/CodeBlock.tsx +++ b/apps/webapp/app/components/code/CodeBlock.tsx @@ -265,7 +265,10 @@ export const CodeBlock = forwardRef( return ( <>
(
{ const statusColor = groupByIsRunStatus ? getRunStatusHexColor(s) : undefined; + const originalIndex = config.yAxisColumns.indexOf(s); + const colorIndex = originalIndex >= 0 ? originalIndex : i; cfg[s] = { label: s, - color: statusColor ?? config.seriesColors?.[s] ?? getSeriesColor(i), + color: statusColor ?? config.seriesColors?.[s] ?? getSeriesColor(colorIndex), }; }); return cfg; - }, [sortedSeries, groupByIsRunStatus, config.seriesColors]); + }, [sortedSeries, groupByIsRunStatus, config.seriesColors, config.yAxisColumns]); // Custom tooltip label formatter for better date display const tooltipLabelFormatter = useMemo(() => { @@ -1207,9 +1209,19 @@ function createYAxisFormatter( formatDurationMilliseconds(value * 1000, { style: "short" }); } + if (format === "durationNs") { + return (value: number): string => + formatDurationMilliseconds(value / 1_000_000, { style: "short" }); + } + if (format === "costInDollars" || format === "cost") { return (value: number): string => { const dollars = format === "cost" ? 
value / 100 : value; + if (dollars === 0) return "$0"; + if (Math.abs(dollars) >= 1000) return `$${(dollars / 1000).toFixed(1)}K`; + if (Math.abs(dollars) >= 1) return `$${dollars.toFixed(2)}`; + if (Math.abs(dollars) >= 0.01) return `$${dollars.toFixed(4)}`; + if (Math.abs(dollars) >= 0.0001) return `$${dollars.toFixed(6)}`; return formatCurrencyAccurate(dollars); }; } diff --git a/apps/webapp/app/components/code/TSQLResultsTable.tsx b/apps/webapp/app/components/code/TSQLResultsTable.tsx index dae045bc4b8..73ca07180bf 100644 --- a/apps/webapp/app/components/code/TSQLResultsTable.tsx +++ b/apps/webapp/app/components/code/TSQLResultsTable.tsx @@ -81,6 +81,11 @@ function getFormattedValue(value: unknown, column: OutputColumnMetadata): string return formatDurationMilliseconds(value * 1000, { style: "short" }); } break; + case "durationNs": + if (typeof value === "number") { + return formatDurationMilliseconds(value / 1_000_000, { style: "short" }); + } + break; case "cost": if (typeof value === "number") { return formatCurrencyAccurate(value / 100); @@ -133,6 +138,7 @@ function getFormattedValue(value: unknown, column: OutputColumnMetadata): string hour: "2-digit", minute: "2-digit", second: "2-digit", + timeZone: "UTC", }); } catch { return String(value); @@ -281,6 +287,12 @@ function getDisplayLength(value: unknown, column: OutputColumnMetadata): number return formatted.length; } return 10; + case "durationNs": + if (typeof value === "number") { + const formatted = formatDurationMilliseconds(value / 1_000_000, { style: "short" }); + return formatted.length; + } + return 10; case "cost": case "costInDollars": // Currency format: "$1,234.56" @@ -448,10 +460,12 @@ function CellValueWrapper({ value, column, prettyFormatting, + row, }: { value: unknown; column: OutputColumnMetadata; prettyFormatting: boolean; + row?: Record; }) { const [hovered, setHovered] = useState(false); @@ -466,6 +480,7 @@ function CellValueWrapper({ column={column} 
prettyFormatting={prettyFormatting} hovered={hovered} + row={row} /> ); @@ -479,11 +494,13 @@ function CellValue({ column, prettyFormatting = true, hovered = false, + row, }: { value: unknown; column: OutputColumnMetadata; prettyFormatting?: boolean; hovered?: boolean; + row?: Record; }) { // Plain text mode - render everything as monospace text with truncation if (!prettyFormatting) { @@ -550,12 +567,20 @@ function CellValue({ switch (column.customRenderType) { case "runId": { if (typeof value === "string") { + const spanId = row?.["span_id"]; + const runPath = v3RunPathFromFriendlyId(value); + const href = typeof spanId === "string" && spanId + ? `${runPath}?span=${spanId}` + : runPath; + const tooltip = typeof spanId === "string" && spanId + ? "Jump to span" + : "Jump to run"; return (