` — + - **[obsolete]** `` — + + ## Rules + + - Maximum 3 suggestions per audit. Pick the highest-signal ones. + - Only flag things that would actually mislead a future reviewer. Style and wording do not count. + - Do NOT review the PR itself. Do NOT propose rules outside REVIEW.md's existing sections. + - Do NOT propose rules for one-off PR specifics that don't generalize to future PRs. + - If REVIEW.md does not exist in the repo, respond with `(skip)` and stop. + - When in doubt between "one more file read" and "finish now" — finish now. diff --git a/.github/workflows/claude-md-audit.yml b/.github/workflows/claude-md-audit.yml index a80bbca0f52..32240ba5ea8 100644 --- a/.github/workflows/claude-md-audit.yml +++ b/.github/workflows/claude-md-audit.yml @@ -8,7 +8,6 @@ on: - ".changeset/**" - ".server-changes/**" - "**/*.md" - - "references/**" concurrency: group: claude-md-audit-${{ github.event.pull_request.number }} @@ -16,7 +15,9 @@ concurrency: jobs: audit: - if: github.event.pull_request.draft == false + if: >- + github.event.pull_request.draft == false && + github.event.pull_request.head.repo.full_name == github.repository runs-on: ubuntu-latest permissions: contents: read @@ -25,16 +26,18 @@ jobs: id-token: write steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: Run Claude Code id: claude - uses: anthropics/claude-code-action@v1 + uses: anthropics/claude-code-action@787c5a0ce96a9a6cfb050ea0c8f4c05f2447c251 # v1.0.133 with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} use_sticky_comment: true + allowed_bots: "devin-ai-integration[bot]" claude_args: | --max-turns 15 diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index cadbe31773f..1c783e7ef6d 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml 
@@ -19,26 +19,27 @@ jobs: (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) runs-on: ubuntu-latest permissions: - contents: read - pull-requests: read - issues: read + contents: write + pull-requests: write + issues: write id-token: write actions: read # Required for Claude to read CI results on PRs steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" - name: 📥 Download deps @@ -49,9 +50,9 @@ jobs: - name: Run Claude Code id: claude - uses: anthropics/claude-code-action@v1 + uses: anthropics/claude-code-action@787c5a0ce96a9a6cfb050ea0c8f4c05f2447c251 # v1.0.133 with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} # This is an optional setting that allows Claude to read CI results on PRs additional_permissions: | diff --git a/.github/workflows/dependabot-critical-alerts.yml b/.github/workflows/dependabot-critical-alerts.yml new file mode 100644 index 00000000000..a71b14bebf9 --- /dev/null +++ b/.github/workflows/dependabot-critical-alerts.yml @@ -0,0 +1,83 @@ +name: Dependabot Critical Alerts + +on: + schedule: + - cron: "0 8 * * *" # Daily 08:00 UTC + workflow_dispatch: + inputs: + severity: + description: "Severity to alert on" + type: choice + options: + - critical + - high + - medium + - low + default: critical + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read 
+ +jobs: + alert: + name: Post critical alerts + runs-on: ubuntu-latest + environment: dependabot-summary + env: + SEVERITY: ${{ inputs.severity || 'critical' }} + steps: + - name: Fetch alerts + id: alerts + env: + GH_TOKEN: ${{ secrets.DEPENDABOT_ALERTS_TOKEN }} + REPO: ${{ github.repository }} + run: | + set -euo pipefail + gh api -X GET "/repos/$REPO/dependabot/alerts" \ + -F state=open -F severity="$SEVERITY" --paginate > pages.json + jq -s 'add' pages.json > alerts.json + TOTAL=$(jq 'length' alerts.json) + echo "total=$TOTAL" >> "$GITHUB_OUTPUT" + if [ "$TOTAL" = "0" ]; then + exit 0 + fi + LIST=$(jq -r ' + map("• <\(.html_url)|#\(.number)> *\(.dependency.package.name)* - \(.security_advisory.summary)") + | join("\n") + ' alerts.json) + { + echo "list<> "$GITHUB_OUTPUT" + + - name: Build Slack payload + if: steps.alerts.outputs.total != '0' + env: + REPO: ${{ github.repository }} + CHANNEL: ${{ vars.SLACK_CHANNEL_ID }} + TOTAL: ${{ steps.alerts.outputs.total }} + LIST: ${{ steps.alerts.outputs.list }} + run: | + jq -n \ + --arg channel "$CHANNEL" \ + --arg repo "$REPO" \ + --arg total "$TOTAL" \ + --arg list "$LIST" \ + --arg severity "$SEVERITY" \ + '{ + channel: $channel, + text: ":bufo-alarma: `\($repo)` - *\($total) open \($severity) alert(s)*\n\($list)\n\n" + }' > payload.json + + - name: Post Slack alert + if: steps.alerts.outputs.total != '0' + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload-file-path: payload.json diff --git a/.github/workflows/dependabot-weekly-summary.yml b/.github/workflows/dependabot-weekly-summary.yml new file mode 100644 index 00000000000..fb2717e2fb0 --- /dev/null +++ b/.github/workflows/dependabot-weekly-summary.yml @@ -0,0 +1,206 @@ +name: Dependabot Weekly Summary + +on: + schedule: + - cron: "0 8 * * 1" # Mon 08:00 UTC + workflow_dispatch: + +# Single-purpose monitoring workflow; serialise on workflow 
name only - we never +# want two concurrent summary runs racing to post the same digest. +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read # gh CLI baseline + pull-requests: read # gh pr list (open dependabot PRs) + actions: read # gh run list / view (parse latest dependabot run logs) + +jobs: + summary: + name: Post weekly Dependabot summary + runs-on: ubuntu-latest + environment: dependabot-summary + env: + # Severities surface in the actions list when their remaining TTR drops + # below this many days. Override via repo/env var ACTION_THRESHOLD_DAYS. + THRESHOLD_DAYS: ${{ vars.ACTION_THRESHOLD_DAYS || '7' }} + steps: + - name: Fetch alerts and compute summaries + id: alerts + env: + GH_TOKEN: ${{ secrets.DEPENDABOT_ALERTS_TOKEN }} + REPO: ${{ github.repository }} + run: | + if ! gh api -X GET "/repos/$REPO/dependabot/alerts" --paginate > pages.json 2> err.txt; then + echo "total=?" >> "$GITHUB_OUTPUT" + ERR=$(head -c 200 err.txt | tr '\n' ' ') + echo "by_severity=:x: _failed to fetch alerts: ${ERR}_" >> "$GITHUB_OUTPUT" + echo "actions=:x: _alerts unavailable_" >> "$GITHUB_OUTPUT" + exit 0 + fi + jq -s '[.[][] | select(.state == "open")]' pages.json > open.json + + TOTAL=$(jq 'length' open.json) + echo "total=$TOTAL" >> "$GITHUB_OUTPUT" + + if [ "$TOTAL" = "0" ]; then + echo "by_severity=:white_check_mark: No open alerts." >> "$GITHUB_OUTPUT" + echo "actions=_None_" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Severity breakdown - real newlines so jq --arg in the payload + # builder encodes them as proper \n in JSON (Slack renders as breaks). 
+ BY_SEV=$(jq -r ' + group_by(.security_advisory.severity) + | map({sev: .[0].security_advisory.severity, + count: length, + weight: ({"critical":0,"high":1,"medium":2,"low":3}[.[0].security_advisory.severity])}) + | sort_by(.weight) + | map("• *\(.count)* \(.sev)") + | join("\n") + ' open.json) + { + echo "by_severity<> "$GITHUB_OUTPUT" + + # Actions: alerts within THRESHOLD_DAYS of their TTR (P0=7d, P1=30d, P2=90d, P3=no deadline) + # Grouped by (package, severity); shows earliest deadline per group. + ACTIONS=$(jq -r --argjson threshold "$THRESHOLD_DAYS" ' + [.[] + | (.security_advisory.severity) as $sev + | ({"critical":7,"high":30,"medium":90,"low":null}[$sev]) as $ttr + | select($ttr != null) + | ((now - (.created_at | fromdateiso8601)) / 86400 | floor) as $age + | {pkg: .dependency.package.name, sev: $sev, remaining: ($ttr - $age)} + ] + | group_by([.pkg, .sev]) + | map({pkg: .[0].pkg, sev: .[0].sev, count: length, min_remaining: ([.[].remaining] | min)}) + | map(select(.min_remaining < $threshold)) + | sort_by(.min_remaining) + | if length == 0 then "_None_" + else (map( + "• *\(.pkg)* (\(.sev))" + + (if .count > 1 then " ×\(.count)" else "" end) + " - " + + (if .min_remaining < 0 then "*OVERDUE* by \(-.min_remaining)d" + else "\(.min_remaining)d remaining" end) + ) | join("\n")) + end + ' open.json) + { + echo "actions<> "$GITHUB_OUTPUT" + + - name: Fetch open dependabot PRs + id: prs + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + REPO_URL: https://github.com/${{ github.repository }} + run: | + if ! 
PR_JSON=$(gh pr list --repo "$REPO" --state open --author "app/dependabot" --json number,title 2> err.txt); then + ERR=$(head -c 200 err.txt | tr '\n' ' ') + echo "list=:x: _failed to fetch PRs: ${ERR}_" >> "$GITHUB_OUTPUT" + exit 0 + fi + LIST=$(echo "$PR_JSON" | jq -r --arg url "$REPO_URL" ' + if length == 0 then "_None_" + else (map("• <\($url)/pull/\(.number)|#\(.number)> \(.title)") | join("\n")) + end + ') + { + echo "list<> "$GITHUB_OUTPUT" + + - name: Find latest npm dependabot run + id: latest + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + run: | + # Repos without a dependabot.yml have no "Dependabot Updates" workflow; + # treat the lookup failure as "no recent run found" rather than failing. + if ! RUN_ID=$(gh run list --repo "$REPO" --workflow "Dependabot Updates" --status success --limit 30 --json databaseId,name --jq 'first(.[] | select(.name | startswith("npm_and_yarn")) | .databaseId) // empty' 2>/dev/null); then + RUN_ID="" + fi + echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT" + + - name: Extract stuck deps (only if actions pending) + id: stuck + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + RUN_ID: ${{ steps.latest.outputs.run_id }} + ACTIONS: ${{ steps.alerts.outputs.actions }} + run: | + # Skip the stuck section entirely when nothing in the actions list + # - keeps the digest tidy when there's nothing to actually act on. 
+ if [ "$ACTIONS" = "_None_" ]; then + echo "section=" >> "$GITHUB_OUTPUT" + exit 0 + fi + HEADER=$'\n\n*Couldn\'t auto-fix (need manual `pnpm.overrides`):*\n' + if [ -z "$RUN_ID" ]; then + { + echo "section<> "$GITHUB_OUTPUT" + exit 0 + fi + gh run view "$RUN_ID" --repo "$REPO" --log > log.txt 2>&1 || true + STUCK=$(grep -oE "No update possible for [^[:space:]]+ [0-9][^[:space:]]*" log.txt | sed 's/No update possible for //' | sort -u || true) + if [ -z "$STUCK" ]; then + { + echo "section<> "$GITHUB_OUTPUT" + exit 0 + fi + LIST=$(echo "$STUCK" | awk 'NR>1{printf "\n"} {printf "• *%s* %s", $1, $2}') + { + echo "section<> "$GITHUB_OUTPUT" + + - name: Build Slack payload + env: + REPO: ${{ github.repository }} + CHANNEL: ${{ vars.SLACK_CHANNEL_ID }} + TOTAL: ${{ steps.alerts.outputs.total }} + BY_SEVERITY: ${{ steps.alerts.outputs.by_severity }} + PRS_LIST: ${{ steps.prs.outputs.list }} + ACTIONS: ${{ steps.alerts.outputs.actions }} + STUCK: ${{ steps.stuck.outputs.section }} + run: | + # Build payload via jq so PR titles or error strings containing + # quotes/backslashes/newlines can't break the JSON. 
+ jq -n \ + --arg channel "$CHANNEL" \ + --arg repo "$REPO" \ + --arg total "$TOTAL" \ + --arg by_severity "$BY_SEVERITY" \ + --arg prs_list "$PRS_LIST" \ + --arg actions "$ACTIONS" \ + --arg stuck "$STUCK" \ + --arg threshold "$THRESHOLD_DAYS" \ + '{ + channel: $channel, + text: ":calendar: *Weekly Dependabot summary* - `\($repo)`\n\n*Open alerts (\($total)):*\n\($by_severity)\n\n*Open Dependabot PRs:*\n\($prs_list)\n\n*Actions needed (<\($threshold)d remaining):*\n\($actions)\($stuck)\n\n" + }' > payload.json + + - name: Post Slack summary + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload-file-path: payload.json diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index bef575c353a..0cac7c8595f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -26,10 +26,12 @@ jobs: working-directory: ./docs steps: - name: 📥 Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: 📦 Cache npm - uses: actions/cache@v4 + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: path: | ~/.npm diff --git a/.github/workflows/e2e-webapp-auth-full.yml b/.github/workflows/e2e-webapp-auth-full.yml new file mode 100644 index 00000000000..de9d66c07e9 --- /dev/null +++ b/.github/workflows/e2e-webapp-auth-full.yml @@ -0,0 +1,120 @@ +name: "🛡️ E2E Tests: Webapp Auth (full)" + +# Comprehensive RBAC auth test suite — see TRI-8731. Runs separately from +# the smoke e2e-webapp.yml because it covers every route family with a +# pass/fail matrix and would otherwise dominate per-PR CI time. +# +# Triggered: +# - Manually via workflow_dispatch. +# - Nightly via schedule. +# - On pull requests touching auth-relevant files only (paths filter). 
+ +permissions: + contents: read + +on: + workflow_dispatch: + schedule: + - cron: "0 4 * * *" # 04:00 UTC daily + pull_request: + paths: + - "apps/webapp/app/services/routeBuilders/**" + - "apps/webapp/app/services/rbac.server.ts" + - "apps/webapp/app/services/apiAuth.server.ts" + - "apps/webapp/app/services/personalAccessToken.server.ts" + - "apps/webapp/app/services/sessionStorage.server.ts" + - "apps/webapp/app/routes/api.v*.**" + - "apps/webapp/app/routes/realtime.v*.**" + - "apps/webapp/test/**/*.e2e.full.test.ts" + - "apps/webapp/test/setup/global-e2e-full-setup.ts" + - "apps/webapp/test/helpers/sharedTestServer.ts" + - "apps/webapp/test/helpers/seedTestSession.ts" + - "apps/webapp/vitest.e2e.full.config.ts" + - "internal-packages/rbac/**" + - "packages/plugins/**" + - ".github/workflows/e2e-webapp-auth-full.yml" + +jobs: + e2eAuthFull: + name: "🛡️ E2E Auth Tests (full)" + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + # Don't leave the GITHUB_TOKEN in .git/config — this job + # doesn't need to push and the persisted creds would be + # readable from any subsequent step (zizmor/artipacked). 
+ persist-credentials: false + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 + with: + version: 10.33.2 + + - name: ⎔ Setup node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: 20.20.2 + cache: "pnpm" + + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + docker pull postgres:14 + docker pull redis:7.2 + docker pull testcontainers/ryuk:0.11.0 + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🏗️ Build Webapp + run: pnpm run build --filter webapp + + - name: 🛡️ Run Webapp Full Auth E2E Tests + run: cd apps/webapp && pnpm exec vitest run --config vitest.e2e.full.config.ts --reporter=default + env: + WEBAPP_TEST_VERBOSE: "1" diff --git a/.github/workflows/e2e-webapp.yml b/.github/workflows/e2e-webapp.yml new file mode 100644 index 00000000000..f306a86cd28 --- /dev/null +++ b/.github/workflows/e2e-webapp.yml @@ -0,0 +1,97 @@ +name: "🧪 E2E Tests: Webapp" + +permissions: + contents: read + +on: + workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false + +jobs: + e2eTests: + name: "🧪 E2E Tests: Webapp" + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w 
net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 + with: + version: 10.33.2 + + - name: ⎔ Setup node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: 20.20.2 + cache: "pnpm" + + # ..to avoid rate limits when pulling images + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + echo "Pre-pulling Docker images with authenticated session..." 
+ docker pull postgres:14 + docker pull redis:7.2 + docker pull testcontainers/ryuk:0.11.0 + echo "Image pre-pull complete" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🏗️ Build Webapp + run: pnpm run build --filter webapp + + - name: 🧪 Run Webapp E2E Tests + run: cd apps/webapp && pnpm exec vitest run --config vitest.e2e.config.ts --reporter=default + env: + WEBAPP_TEST_VERBOSE: "1" diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 9518ca6157c..a70f0400e0a 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -24,19 +24,20 @@ jobs: package-manager: ["npm", "pnpm"] steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 - name: 📥 Download deps run: pnpm install --frozen-lockfile --filter trigger.dev... @@ -48,7 +49,7 @@ jobs: run: pnpm run build --filter trigger.dev^... 
- name: 🔧 Build worker template files - run: pnpm --filter trigger.dev run build:workers + run: pnpm --filter trigger.dev run --if-present build:workers - name: Enable corepack run: corepack enable diff --git a/.github/workflows/helm-pr-prerelease.yml b/.github/workflows/helm-pr-prerelease.yml deleted file mode 100644 index 8df045945e6..00000000000 --- a/.github/workflows/helm-pr-prerelease.yml +++ /dev/null @@ -1,138 +0,0 @@ -name: 🧭 Helm Chart PR Prerelease - -on: - pull_request: - types: [opened, synchronize, reopened] - paths: - - "hosting/k8s/helm/**" - -concurrency: - group: helm-prerelease-${{ github.event.pull_request.number }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - CHART_NAME: trigger - -jobs: - lint-and-test: - runs-on: ubuntu-latest - permissions: - contents: read - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Helm - uses: azure/setup-helm@v4 - with: - version: "3.18.3" - - - name: Build dependencies - run: helm dependency build ./hosting/k8s/helm/ - - - name: Extract dependency charts - run: | - cd ./hosting/k8s/helm/ - for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done - - - name: Lint Helm Chart - run: | - helm lint ./hosting/k8s/helm/ - - - name: Render templates - run: | - helm template test-release ./hosting/k8s/helm/ \ - --values ./hosting/k8s/helm/values.yaml \ - --output-dir ./helm-output - - - name: Validate manifests - uses: docker://ghcr.io/yannh/kubeconform:v0.7.0 - with: - entrypoint: "/kubeconform" - args: "-summary -output json ./helm-output" - - prerelease: - needs: lint-and-test - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - pull-requests: write - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Helm - uses: azure/setup-helm@v4 - with: - version: "3.18.3" - - - name: Build dependencies - run: helm dependency build ./hosting/k8s/helm/ - - - name: Extract dependency charts - run: | - cd 
./hosting/k8s/helm/ - for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done - - - name: Log in to Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Generate prerelease version - id: version - run: | - BASE_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') - PR_NUMBER=${{ github.event.pull_request.number }} - SHORT_SHA=$(echo "${{ github.event.pull_request.head.sha }}" | cut -c1-7) - PRERELEASE_VERSION="${BASE_VERSION}-pr${PR_NUMBER}.${SHORT_SHA}" - echo "version=$PRERELEASE_VERSION" >> $GITHUB_OUTPUT - echo "Prerelease version: $PRERELEASE_VERSION" - - - name: Update Chart.yaml with prerelease version - run: | - sed -i "s/^version:.*/version: ${{ steps.version.outputs.version }}/" ./hosting/k8s/helm/Chart.yaml - - - name: Package Helm Chart - run: | - helm package ./hosting/k8s/helm/ --destination /tmp/ - - - name: Push Helm Chart to GHCR - run: | - VERSION="${{ steps.version.outputs.version }}" - CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" - - # Push to GHCR OCI registry - helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" - - - name: Find existing comment - uses: peter-evans/find-comment@v3 - id: find-comment - with: - issue-number: ${{ github.event.pull_request.number }} - comment-author: "github-actions[bot]" - body-includes: "Helm Chart Prerelease Published" - - - name: Create or update PR comment - uses: peter-evans/create-or-update-comment@v4 - with: - comment-id: ${{ steps.find-comment.outputs.comment-id }} - issue-number: ${{ github.event.pull_request.number }} - body: | - ### 🧭 Helm Chart Prerelease Published - - **Version:** `${{ steps.version.outputs.version }}` - - **Install:** - ```bash - helm upgrade --install trigger \ - oci://ghcr.io/${{ github.repository_owner }}/charts/trigger \ - --version "${{ 
steps.version.outputs.version }}" - ``` - - > ⚠️ This is a prerelease for testing. Do not use in production. - edit-mode: replace diff --git a/.github/workflows/helm-prerelease.yml b/.github/workflows/helm-prerelease.yml new file mode 100644 index 00000000000..ff2c8f5a614 --- /dev/null +++ b/.github/workflows/helm-prerelease.yml @@ -0,0 +1,200 @@ +name: 🧭 Helm Chart Prerelease + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - "hosting/k8s/helm/**" + push: + branches: + - main + paths: + - "hosting/k8s/helm/**" + workflow_dispatch: + inputs: + app_version: + description: "Override appVersion (e.g. 'main', 'v4.4.4'). Leave empty to keep Chart.yaml value." + required: false + type: string + default: "" + +concurrency: + group: helm-prerelease-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + REGISTRY: ghcr.io + CHART_NAME: trigger + +jobs: + lint-and-test: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Lint Helm Chart + run: | + helm lint ./hosting/k8s/helm/ + + - name: Render templates + run: | + helm template test-release ./hosting/k8s/helm/ \ + --values ./hosting/k8s/helm/values.yaml \ + --output-dir ./helm-output + + - name: Validate manifests + uses: docker://ghcr.io/yannh/kubeconform:v0.7.0@sha256:85dbef6b4b312b99133decc9c6fc9495e9fc5f92293d4ff3b7e1b30f5611823c + with: + entrypoint: "/kubeconform" + args: "-summary -output json ./helm-output" + + prerelease: + needs: 
lint-and-test + if: | + (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || + github.event_name == 'push' || + github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + pull-requests: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Log in to Container Registry + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate prerelease version + id: version + run: | + BASE_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + PR_NUMBER=${{ github.event.pull_request.number }} + SHORT_SHA=$(echo "${{ github.event.pull_request.head.sha }}" | cut -c1-7) + PRERELEASE_VERSION="${BASE_VERSION}-pr${PR_NUMBER}.${SHORT_SHA}" + elif [[ "${{ github.event_name }}" == "push" ]]; then + SHORT_SHA=$(echo "${GITHUB_SHA}" | cut -c1-7) + PRERELEASE_VERSION="${BASE_VERSION}-main.${SHORT_SHA}" + else + SHORT_SHA=$(echo "${GITHUB_SHA}" | cut -c1-7) + REF_SLUG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-' | tr -cd 'a-zA-Z0-9-') + if [[ -z "$REF_SLUG" ]]; then + REF_SLUG="manual" + fi + PRERELEASE_VERSION="${BASE_VERSION}-${REF_SLUG}.${SHORT_SHA}" + fi + echo "version=$PRERELEASE_VERSION" >> "$GITHUB_OUTPUT" + echo "Prerelease version: $PRERELEASE_VERSION" + + - name: Update 
Chart.yaml with prerelease version + run: | + sed -i "s/^version:.*/version: ${STEPS_VERSION_OUTPUTS_VERSION}/" ./hosting/k8s/helm/Chart.yaml + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} + + - name: Override appVersion + if: github.event_name == 'workflow_dispatch' && inputs.app_version != '' + env: + APP_VERSION: ${{ inputs.app_version }} + run: | + yq -i '.appVersion = strenv(APP_VERSION)' ./hosting/k8s/helm/Chart.yaml + + - name: Package Helm Chart + run: | + helm package ./hosting/k8s/helm/ --destination /tmp/ + + - name: Push Helm Chart to GHCR + run: | + VERSION="${STEPS_VERSION_OUTPUTS_VERSION}" + CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" + + # Push to GHCR OCI registry + helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} + + - name: Write run summary + run: | + { + echo "### 🧭 Helm Chart Prerelease Published" + echo "" + echo "**Version:** \`${STEPS_VERSION_OUTPUTS_VERSION}\`" + echo "" + echo "**Install:**" + echo '```bash' + echo "helm upgrade --install trigger \\" + echo " oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts/${{ env.CHART_NAME }} \\" + echo " --version \"${STEPS_VERSION_OUTPUTS_VERSION}\"" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} + + - name: Find existing comment + if: github.event_name == 'pull_request' + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4.0.0 + id: find-comment + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: "github-actions[bot]" + body-includes: "Helm Chart Prerelease Published" + + - name: Create or update PR comment + if: github.event_name == 'pull_request' + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0 + with: + comment-id: ${{ 
steps.find-comment.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body: | + ### 🧭 Helm Chart Prerelease Published + + **Version:** `${{ steps.version.outputs.version }}` + + **Install:** + ```bash + helm upgrade --install trigger \ + oci://ghcr.io/${{ github.repository_owner }}/charts/trigger \ + --version "${{ steps.version.outputs.version }}" + ``` + + > ⚠️ This is a prerelease for testing. Do not use in production. + edit-mode: replace diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index dab18223e35..95805539807 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -3,9 +3,6 @@ name: 🤖 PR Checks on: pull_request: types: [opened, synchronize, reopened] - paths-ignore: - - "docs/**" - - ".changeset/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -13,23 +10,169 @@ concurrency: permissions: contents: read - id-token: write + pull-requests: read jobs: + changes: + name: Detect changes + runs-on: ubuntu-latest + outputs: + code: ${{ steps.code_filter.outputs.code }} + typecheck_self: ${{ steps.filter.outputs.typecheck_self }} + webapp: ${{ steps.filter.outputs.webapp }} + packages: ${{ steps.filter.outputs.packages }} + internal: ${{ steps.filter.outputs.internal }} + cli: ${{ steps.filter.outputs.cli }} + sdk: ${{ steps.filter.outputs.sdk }} + steps: + # `code` uses `every` semantics so the negation patterns actually subtract. + # With the default `some` quantifier, `**` matches every file and the + # subsequent `!...` patterns are no-ops (each pattern is OR'd, not AND'd). 
+ - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: code_filter + with: + predicate-quantifier: every + filters: | + code: + - '**' + - '!docs/**' + - '!.changeset/**' + - '!hosting/**' + - '!.github/**' + - '!**/*.md' + - '!**/.env.example' + - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: filter + with: + filters: | + typecheck_self: + - '.github/workflows/pr_checks.yml' + - '.github/workflows/typecheck.yml' + webapp: + - 'apps/webapp/**' + - 'packages/**' + - 'internal-packages/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/unit-tests-webapp.yml' + - '.github/workflows/e2e-webapp.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + packages: + - 'packages/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/unit-tests-packages.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + internal: + - 'internal-packages/**' + - 'packages/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/unit-tests-internal.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + cli: + - 'packages/cli-v3/**' + - 'packages/build/**' + - 'packages/core/**' + - 'packages/schema-to-json/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/e2e.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + sdk: + - 'packages/trigger-sdk/**' + - 'packages/core/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/sdk-compat.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + typecheck: + needs: changes + if: needs.changes.outputs.code == 'true' || needs.changes.outputs.typecheck_self == 'true' uses: ./.github/workflows/typecheck.yml - secrets: inherit - units: - uses: ./.github/workflows/unit-tests.yml - 
secrets: inherit + webapp: + needs: changes + if: needs.changes.outputs.webapp == 'true' + uses: ./.github/workflows/unit-tests-webapp.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + e2e-webapp: + needs: changes + if: needs.changes.outputs.webapp == 'true' + uses: ./.github/workflows/e2e-webapp.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + packages: + needs: changes + if: needs.changes.outputs.packages == 'true' + uses: ./.github/workflows/unit-tests-packages.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + internal: + needs: changes + if: needs.changes.outputs.internal == 'true' + uses: ./.github/workflows/unit-tests-internal.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} e2e: + needs: changes + if: needs.changes.outputs.cli == 'true' uses: ./.github/workflows/e2e.yml with: package: cli-v3 - secrets: inherit sdk-compat: + needs: changes + if: needs.changes.outputs.sdk == 'true' uses: ./.github/workflows/sdk-compat.yml - secrets: inherit + + all-checks: + name: All PR Checks + needs: + - changes + - typecheck + - webapp + - e2e-webapp + - packages + - internal + - e2e + - sdk-compat + if: always() + runs-on: ubuntu-latest + steps: + - name: Verify all checks + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more checks failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more checks were cancelled" + exit 1 + fi + echo "All checks passed or were skipped due to path filters" diff --git a/.github/workflows/preview-dispatch.yml b/.github/workflows/preview-dispatch.yml new file mode 100644 index 00000000000..3f26c66cf33 --- /dev/null +++ 
b/.github/workflows/preview-dispatch.yml @@ -0,0 +1,76 @@ +name: 🌱 Preview environment dispatch + +# Opt-in per-PR preview environments + +on: + pull_request: + types: [opened, reopened, synchronize, closed, labeled, unlabeled] + +# Serialize a PR's events so dispatches arrive in order. Cloud-side concurrency +# collapses by branch but can't fix out-of-order arrival — e.g. a push racing a +# close could cancel the in-flight destroy and leak the preview. One short API +# call, so queuing is cheap; cancel-in-progress: false lets an in-flight +# dispatch finish (GitHub keeps only the latest pending, the desired behavior). +concurrency: + group: preview-dispatch-${{ github.event.pull_request.number }} + cancel-in-progress: false + +permissions: {} + +jobs: + dispatch: + name: Dispatch preview-deploy to cloud + runs-on: ubuntu-latest + # label added -> create + # new commit while labeled -> update + # label removed / PR closed -> destroy + if: >- + github.event.pull_request.head.repo.full_name == github.repository && + ( + (github.event.action == 'labeled' && github.event.label.name == 'preview') || + (github.event.action == 'unlabeled' && github.event.label.name == 'preview') || + ( + contains(github.event.pull_request.labels.*.name, 'preview') && + contains(fromJSON('["opened","reopened","synchronize","closed"]'), github.event.action) + ) + ) + steps: + - name: Build dispatch payload + id: payload + env: + ACTION: ${{ github.event.action }} + BRANCH: ${{ github.event.pull_request.head.ref }} + COMMIT: ${{ github.event.pull_request.head.sha }} + run: | + set -euo pipefail + # Map the GitHub PR action to the cloud pipeline's lifecycle event. + case "$ACTION" in + labeled | opened | reopened) EVENT=opened ;; + synchronize) EVENT=synchronize ;; + unlabeled | closed) EVENT=closed ;; + *) echo "unexpected action: $ACTION" >&2; exit 1 ;; + esac + # jq --arg JSON-escapes every value, so a branch name containing + # quotes/braces can't break or inject into the client payload. 
+ payload=$(jq -nc \ + --arg b "$BRANCH" \ + --arg c "$COMMIT" \ + --arg e "$EVENT" \ + '{branch_name: $b, commit: $c, pull_request_event: $e}') + { + echo "client_payload=$payload" + echo "summary=$EVENT for $BRANCH @ ${COMMIT:0:7}" + } >> "$GITHUB_OUTPUT" + + - name: Log dispatch + env: + SUMMARY: ${{ steps.payload.outputs.summary }} + run: echo "Dispatching preview-deploy event ($SUMMARY)" + + - name: Send repository_dispatch + uses: peter-evans/repository-dispatch@28959ce8df70de7be546dd1250a005dd32156697 # v4.0.1 + with: + token: ${{ secrets.CROSS_REPO_PAT }} + repository: triggerdotdev/cloud + event-type: preview-deploy + client-payload: ${{ steps.payload.outputs.client_payload }} diff --git a/.github/workflows/preview-packages.yml b/.github/workflows/preview-packages.yml new file mode 100644 index 00000000000..f4dd5b39930 --- /dev/null +++ b/.github/workflows/preview-packages.yml @@ -0,0 +1,83 @@ +name: 📦 Preview packages (pkg.pr.new) + +# Publishes installable preview builds of the public @trigger.dev/* packages +# for every push to a branch, via https://pkg.pr.new. These are NOT published +# to npm — pkg.pr.new serves them by commit SHA and drops install instructions +# in a comment on the associated PR, e.g. +# npm i https://pkg.pr.new/@trigger.dev/sdk@ +# +# Prerequisites: +# - The pkg.pr.new GitHub App must be installed on triggerdotdev/trigger.dev +# (https://github.com/apps/pkg-pr-new). Publishing fails until it is. +# +# Fork note: pkg.pr.new authenticates with a GitHub Actions OIDC token, which +# GitHub does not issue to pull_request workflows from forks. This `push` +# trigger therefore covers branches pushed to this repo (the core team), not +# external fork PRs. Adding fork coverage would require a workflow_run two-stage +# setup. 
+ +on: + push: + branches-ignore: + - main + - changeset-release/main + paths: + - "package.json" + - "packages/**" + - "pnpm-lock.yaml" + - "pnpm-workspace.yaml" + - "turbo.json" + - ".github/workflows/preview-packages.yml" + - "scripts/stamp-preview-version.mjs" + - "scripts/updateVersion.ts" + +concurrency: + group: preview-packages-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + id-token: write # OIDC token used by pkg.pr.new to authenticate the publish + +jobs: + publish: + name: Build and publish previews + runs-on: ubuntu-latest + if: github.repository == 'triggerdotdev/trigger.dev' + steps: + - name: ⬇️ Checkout repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 + with: + version: 10.33.2 + + - name: ⎔ Setup node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: 📥 Install dependencies + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma client + run: pnpm run generate + + # Stamp a unique 0.0.0-preview- version before building so it can't + # collide with real npm versions and so updateVersion.ts bakes it into the + # runtime VERSION constant. See scripts/stamp-preview-version.mjs. 
+ - name: 🏷️ Stamp preview version + run: node scripts/stamp-preview-version.mjs + env: + GITHUB_SHA: ${{ github.sha }} + + - name: 🔨 Build packages + run: pnpm run build --filter "@trigger.dev/*" --filter "trigger.dev" + + - name: 🚀 Publish previews to pkg.pr.new + run: pnpm exec pkg-pr-new publish --pnpm --compact --commentWithSha './packages/*' diff --git a/.github/workflows/publish-webapp.yml b/.github/workflows/publish-webapp.yml index 6fcc30209ab..5a604e26082 100644 --- a/.github/workflows/publish-webapp.yml +++ b/.github/workflows/publish-webapp.yml @@ -4,6 +4,7 @@ permissions: contents: read packages: write id-token: write + attestations: write on: workflow_call: @@ -13,6 +14,27 @@ on: type: string required: false default: "" + image_registry: + description: The registry namespace to publish under (e.g. ghcr.io/) + type: string + required: false + default: "" + outputs: + version: + description: The published image tag + value: ${{ jobs.publish.outputs.version }} + short_sha: + description: Short commit SHA of the published build + value: ${{ jobs.publish.outputs.short_sha }} + image_repo: + description: The image repository the build was published to (without tag) + value: ${{ jobs.publish.outputs.image_repo }} + digest: + description: Multi-arch index digest (sha256:...) 
of the published image + value: ${{ jobs.publish.outputs.digest }} + secrets: + SENTRY_AUTH_TOKEN: + required: false jobs: publish: @@ -22,14 +44,17 @@ jobs: outputs: version: ${{ steps.get_tag.outputs.tag }} short_sha: ${{ steps.get_commit.outputs.sha_short }} + image_repo: ${{ steps.set_tags.outputs.image_repo }} + digest: ${{ steps.build_push.outputs.digest }} steps: - name: 🏭 Setup Depot CLI - uses: depot/setup-action@v1 + uses: depot/setup-action@15c09a5f77a0840ad4bce955686522a257853461 # v1.7.1 - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive + persist-credentials: false - name: "#️⃣ Get the image tag" id: get_tag @@ -40,42 +65,57 @@ jobs: - name: 🔢 Get the commit hash id: get_commit run: | - echo "sha_short=$(echo ${{ github.sha }} | cut -c1-7)" >> "$GITHUB_OUTPUT" + echo "sha_short=$(echo "${GITHUB_SHA}" | cut -c1-7)" >> "$GITHUB_OUTPUT" - name: 📛 Set the tags id: set_tags run: | - ref_without_tag=ghcr.io/triggerdotdev/trigger.dev - image_tags=$ref_without_tag:${{ steps.get_tag.outputs.tag }} + # The registry namespace is resolved by the caller (defaulting to + # ghcr.io/, overridable via the IMAGE_REGISTRY repository + # variable); the webapp image lives at /. A fork + # therefore publishes to its own package automatically. 
+ image_tags=$REF_WITHOUT_TAG:${STEPS_GET_TAG_OUTPUTS_TAG} - # if tag is a semver, also tag it as v4 - if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then - # TODO: switch to v4 tag on GA - image_tags=$image_tags,$ref_without_tag:v4-beta + # when pushing the mutable main tag, also push an immutable-by-convention + # full-commit-sha tag so a commit can be resolved to a specific digest + if [[ "${STEPS_GET_TAG_OUTPUTS_TAG}" == "main" ]]; then + image_tags=$image_tags,$REF_WITHOUT_TAG:${GITHUB_SHA} fi echo "image_tags=${image_tags}" >> "$GITHUB_OUTPUT" + echo "image_repo=${REF_WITHOUT_TAG}" >> "$GITHUB_OUTPUT" + env: + REF_WITHOUT_TAG: ${{ format('{0}/{1}', inputs.image_registry || vars.IMAGE_REGISTRY || format('ghcr.io/{0}', github.repository_owner), github.event.repository.name) }} + STEPS_GET_TAG_OUTPUTS_TAG: ${{ steps.get_tag.outputs.tag }} + STEPS_GET_TAG_OUTPUTS_IS_SEMVER: ${{ steps.get_tag.outputs.is_semver }} - name: 📝 Set the build info id: set_build_info run: | - tag=${{ steps.get_tag.outputs.tag }} - if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then - echo "BUILD_APP_VERSION=${tag}" >> "$GITHUB_OUTPUT" - fi - echo "BUILD_GIT_SHA=${{ github.sha }}" >> "$GITHUB_OUTPUT" - echo "BUILD_GIT_REF_NAME=${{ github.ref_name }}" >> "$GITHUB_OUTPUT" - echo "BUILD_TIMESTAMP_SECONDS=$(date +%s)" >> "$GITHUB_OUTPUT" + { + tag="${STEPS_GET_TAG_OUTPUTS_TAG}" + if [[ "${STEPS_GET_TAG_OUTPUTS_IS_SEMVER}" == true ]]; then + echo "BUILD_APP_VERSION=${tag}" + fi + echo "BUILD_GIT_SHA=${GITHUB_SHA}" + echo "BUILD_GIT_REF_NAME=${GITHUB_REF_NAME}" + echo "BUILD_TIMESTAMP_SECONDS=$(date +%s)" + echo "BUILD_TIMESTAMP_RFC3339=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + } >> "$GITHUB_OUTPUT" + env: + STEPS_GET_TAG_OUTPUTS_TAG: ${{ steps.get_tag.outputs.tag }} + STEPS_GET_TAG_OUTPUTS_IS_SEMVER: ${{ steps.get_tag.outputs.is_semver }} - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: 
docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: 🐳 Build image and push to GitHub Container Registry - uses: depot/build-push-action@v1 + id: build_push + uses: depot/build-push-action@98e78adca7817480b8185f474a400b451d74e287 # v1.18.0 with: file: ./docker/Dockerfile platforms: linux/amd64,linux/arm64 @@ -86,8 +126,20 @@ jobs: BUILD_GIT_SHA=${{ steps.set_build_info.outputs.BUILD_GIT_SHA }} BUILD_GIT_REF_NAME=${{ steps.set_build_info.outputs.BUILD_GIT_REF_NAME }} BUILD_TIMESTAMP_SECONDS=${{ steps.set_build_info.outputs.BUILD_TIMESTAMP_SECONDS }} + BUILD_TIMESTAMP_RFC3339=${{ steps.set_build_info.outputs.BUILD_TIMESTAMP_RFC3339 }} SENTRY_RELEASE=${{ steps.set_build_info.outputs.BUILD_GIT_SHA }} SENTRY_ORG=triggerdev SENTRY_PROJECT=trigger-cloud secrets: | sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} + + - name: 🪪 Attest build provenance + # Image is already pushed by this point — don't fail releases (and the + # downstream publish-helm job) on a Sigstore/GHCR-referrer hiccup. Real + # config errors still surface as a step warning in the workflow run. + continue-on-error: true + uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0 + with: + subject-name: ${{ steps.set_tags.outputs.image_repo }} + subject-digest: ${{ steps.build_push.outputs.digest }} + push-to-registry: true diff --git a/.github/workflows/publish-worker-v4.yml b/.github/workflows/publish-worker-v4.yml index 4a2853da081..85ca903a8d6 100644 --- a/.github/workflows/publish-worker-v4.yml +++ b/.github/workflows/publish-worker-v4.yml @@ -8,6 +8,11 @@ on: type: string required: false default: "" + image_registry: + description: The registry namespace to publish under (e.g. 
ghcr.io/) + type: string + required: false + default: "" push: tags: - "re2-test-*" @@ -37,19 +42,22 @@ jobs: DOCKER_BUILDKIT: "1" steps: - name: 🏭 Setup Depot CLI - uses: depot/setup-action@v1 + uses: depot/setup-action@15c09a5f77a0840ad4bce955686522a257853461 # v1.7.1 - name: ⬇️ Checkout git repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: 📦 Get image repo id: get_repository + env: + PACKAGE: ${{ matrix.package }} run: | - if [[ "${{ matrix.package }}" == *-provider ]]; then - provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1) - repo=provider/${provider_type} + if [[ "$PACKAGE" == *-provider ]]; then + repo="provider/${PACKAGE%-provider}" else - repo="${{ matrix.package }}" + repo="$PACKAGE" fi echo "repo=${repo}" >> "$GITHUB_OUTPUT" @@ -62,26 +70,28 @@ jobs: - name: 📛 Set tags to push id: set_tags run: | - ref_without_tag=ghcr.io/triggerdotdev/${{ steps.get_repository.outputs.repo }} - image_tags=$ref_without_tag:${{ steps.get_tag.outputs.tag }} - - # if tag is a semver, also tag it as v4 - if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then - # TODO: switch to v4 tag on GA - image_tags=$image_tags,$ref_without_tag:v4-beta - fi + # Resolved by the caller when invoked from publish.yml; falls back to the + # IMAGE_REGISTRY repository variable (or ghcr.io/) for the direct + # push triggers above, so a fork publishes to its own namespace. 
+ ref_without_tag=${IMAGE_REGISTRY}/${STEPS_GET_REPOSITORY_OUTPUTS_REPO} + image_tags=$ref_without_tag:${STEPS_GET_TAG_OUTPUTS_TAG} echo "image_tags=${image_tags}" >> "$GITHUB_OUTPUT" + env: + IMAGE_REGISTRY: ${{ inputs.image_registry || vars.IMAGE_REGISTRY || format('ghcr.io/{0}', github.repository_owner) }} + STEPS_GET_REPOSITORY_OUTPUTS_REPO: ${{ steps.get_repository.outputs.repo }} + STEPS_GET_TAG_OUTPUTS_TAG: ${{ steps.get_tag.outputs.tag }} + STEPS_GET_TAG_OUTPUTS_IS_SEMVER: ${{ steps.get_tag.outputs.is_semver }} - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: 🐳 Build image and push to GitHub Container Registry - uses: depot/build-push-action@v1 + uses: depot/build-push-action@98e78adca7817480b8185f474a400b451d74e287 # v1.18.0 with: file: ./apps/${{ matrix.package }}/Containerfile platforms: linux/amd64,linux/arm64 diff --git a/.github/workflows/publish-worker.yml b/.github/workflows/publish-worker.yml index 74a70d83667..f443e5dab1e 100644 --- a/.github/workflows/publish-worker.yml +++ b/.github/workflows/publish-worker.yml @@ -8,6 +8,16 @@ on: type: string required: false default: "" + image_registry: + description: The registry namespace to publish under (e.g. 
ghcr.io/) + type: string + required: false + default: "" + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false push: tags: - "infra-dev-*" @@ -26,18 +36,22 @@ jobs: runs-on: ubuntu-latest env: DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} steps: - name: ⬇️ Checkout git repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: 📦 Get image repo id: get_repository + env: + PACKAGE: ${{ matrix.package }} run: | - if [[ "${{ matrix.package }}" == *-provider ]]; then - provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1) - repo=provider/${provider_type} + if [[ "$PACKAGE" == *-provider ]]; then + repo="provider/${PACKAGE%-provider}" else - repo="${{ matrix.package }}" + repo="$PACKAGE" fi echo "repo=${repo}" >> "$GITHUB_OUTPUT" @@ -47,11 +61,12 @@ jobs: tag: ${{ inputs.image_tag }} - name: 🐋 Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0 # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub - uses: docker/login-action@v3 + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -62,7 +77,7 @@ jobs: # ..to push image - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: registry: ghcr.io username: ${{ github.repository_owner }} @@ -73,7 +88,10 @@ jobs: docker tag infra_image "$REGISTRY/$REPOSITORY:$IMAGE_TAG" docker push "$REGISTRY/$REPOSITORY:$IMAGE_TAG" env: - REGISTRY: ghcr.io/triggerdotdev + # Resolved by the caller when invoked from publish.yml; falls back to the + # IMAGE_REGISTRY repository variable (or ghcr.io/) for the direct + # 
push triggers above, so a fork publishes to its own namespace. + REGISTRY: ${{ inputs.image_registry || vars.IMAGE_REGISTRY || format('ghcr.io/{0}', github.repository_owner) }} REPOSITORY: ${{ steps.get_repository.outputs.repo }} IMAGE_TAG: ${{ steps.get_tag.outputs.tag }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6213499c5ad..2f2744c7702 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,6 +8,15 @@ on: description: The image tag to publish required: true type: string + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false + SENTRY_AUTH_TOKEN: + required: false + CROSS_REPO_PAT: + required: false push: branches: - main @@ -37,8 +46,6 @@ on: - "tests/**" permissions: - id-token: write - packages: write contents: read concurrency: @@ -50,29 +57,107 @@ env: jobs: typecheck: uses: ./.github/workflows/typecheck.yml - secrets: inherit units: uses: ./.github/workflows/unit-tests.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} publish-webapp: needs: [typecheck] + permissions: + contents: read + packages: write + id-token: write + attestations: write uses: ./.github/workflows/publish-webapp.yml - secrets: inherit + secrets: + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} with: image_tag: ${{ inputs.image_tag }} + # Target registry namespace. Defaults to ghcr.io/ so a fork publishes + # to its own namespace; set the IMAGE_REGISTRY repository variable to override. 
+ image_registry: ${{ vars.IMAGE_REGISTRY || format('ghcr.io/{0}', github.repository_owner) }} publish-worker: needs: [typecheck] + permissions: + contents: read + packages: write uses: ./.github/workflows/publish-worker.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} with: image_tag: ${{ inputs.image_tag }} + image_registry: ${{ vars.IMAGE_REGISTRY || format('ghcr.io/{0}', github.repository_owner) }} publish-worker-v4: needs: [typecheck] + permissions: + contents: read + packages: write + id-token: write uses: ./.github/workflows/publish-worker-v4.yml - secrets: inherit with: image_tag: ${{ inputs.image_tag }} + image_registry: ${{ vars.IMAGE_REGISTRY || format('ghcr.io/{0}', github.repository_owner) }} + + # OS-level CVE scan of the image just published above. Report-only (writes to + # the run summary); runs alongside the worker publishes and never blocks them. + scan-webapp: + needs: [publish-webapp] + permissions: + contents: read + packages: read # pull the just-published image from GHCR + uses: ./.github/workflows/trivy-image-webapp.yml + with: + image-ref: ${{ needs.publish-webapp.outputs.image_repo }}:${{ needs.publish-webapp.outputs.version }} + + # Announce the freshly published mutable `main` webapp image to subscriber + # repos via repository_dispatch, handing them a digest-pinned ref to build or + # deploy from. The repo, ref prefix, and dispatch target all default to the + # canonical values and can be overridden by repository variables. + # + # `push` only: release builds reach publish.yml via workflow_call (from + # release.yml) with an explicit image_tag while github.ref_name is still + # `main`, so gate on the event to avoid dispatching — and failing on the + # absent CROSS_REPO_PAT — during a release. 
+ dispatch-main-image: + name: 📣 Dispatch main image + needs: [publish-webapp] + if: github.repository == (vars.MAIN_IMAGE_DISPATCH_REPO || 'triggerdotdev/trigger.dev') && github.event_name == 'push' && startsWith(github.ref_name, vars.MAIN_IMAGE_DISPATCH_REF_PREFIX || 'main') + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Build dispatch payload + id: payload + env: + IMAGE_REPO: ${{ needs.publish-webapp.outputs.image_repo }} + DIGEST: ${{ needs.publish-webapp.outputs.digest }} + COMMIT: ${{ github.sha }} + run: | + set -euo pipefail + # Pin to the exact multi-arch index just pushed so subscribers resolve a + # single immutable artifact rather than chasing the moving `main` tag. + if [[ -z "${DIGEST}" ]]; then + echo "::error::publish-webapp produced no image digest; refusing to dispatch" + exit 1 + fi + image="${IMAGE_REPO}@${DIGEST}" + # jq --arg JSON-escapes every value, so the ref/commit can't break out of + # or inject into the client payload. + payload=$(jq -nc \ + --arg img "$image" \ + --arg c "$COMMIT" \ + '{image: $img, commit: $c}') + echo "client_payload=$payload" >> "$GITHUB_OUTPUT" + + - name: Send repository_dispatch + uses: peter-evans/repository-dispatch@28959ce8df70de7be546dd1250a005dd32156697 # v4.0.1 + with: + token: ${{ secrets.CROSS_REPO_PAT }} + repository: ${{ vars.MAIN_IMAGE_DISPATCH_TARGET || 'triggerdotdev/cloud' }} + event-type: main-image-published + client-payload: ${{ steps.payload.outputs.client_payload }} diff --git a/.github/workflows/release-helm.yml b/.github/workflows/release-helm.yml index c6efd382ff6..13d28545e7f 100644 --- a/.github/workflows/release-helm.yml +++ b/.github/workflows/release-helm.yml @@ -4,6 +4,12 @@ on: push: tags: - 'helm-v*' + workflow_call: + inputs: + chart_version: + description: 'Chart version to release' + required: true + type: string workflow_dispatch: inputs: chart_version: @@ -22,10 +28,12 @@ jobs: contents: read steps: - name: Checkout - uses: actions/checkout@v4 + uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Set up Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 with: version: "3.18.3" @@ -48,7 +56,7 @@ jobs: --output-dir ./helm-output - name: Validate manifests - uses: docker://ghcr.io/yannh/kubeconform:v0.7.0 + uses: docker://ghcr.io/yannh/kubeconform:v0.7.0@sha256:85dbef6b4b312b99133decc9c6fc9495e9fc5f92293d4ff3b7e1b30f5611823c with: entrypoint: '/kubeconform' args: "-summary -output json ./helm-output" @@ -61,10 +69,12 @@ jobs: packages: write steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Set up Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 with: version: "3.18.3" @@ -77,7 +87,7 @@ jobs: for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done - name: Log in to Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -86,18 +96,20 @@ jobs: - name: Extract version from tag or input id: version run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ github.event.inputs.chart_version }}" + if [ -n "${INPUTS_CHART_VERSION}" ]; then + VERSION="${INPUTS_CHART_VERSION}" else - VERSION="${{ github.ref_name }}" + VERSION="${GITHUB_REF_NAME}" VERSION="${VERSION#helm-v}" fi - echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "version=$VERSION" >> "$GITHUB_OUTPUT" echo "Releasing version: $VERSION" + env: + INPUTS_CHART_VERSION: ${{ inputs.chart_version }} - name: Check Chart.yaml version matches release version run: | - VERSION="${{ steps.version.outputs.version }}" + VERSION="${STEPS_VERSION_OUTPUTS_VERSION}" 
CHART_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') echo "Chart.yaml version: $CHART_VERSION" echo "Release version: $VERSION" @@ -106,6 +118,8 @@ jobs: exit 1 fi echo "✅ Chart.yaml version matches release version." + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} - name: Package Helm Chart run: | @@ -113,18 +127,19 @@ jobs: - name: Push Helm Chart to GHCR run: | - VERSION="${{ steps.version.outputs.version }}" + VERSION="${STEPS_VERSION_OUTPUTS_VERSION}" CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" # Push to GHCR OCI registry helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} - name: Create GitHub Release id: release - uses: softprops/action-gh-release@v1 - if: github.event_name == 'push' + uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3.0.0 with: - tag_name: ${{ github.ref_name }} + tag_name: helm-v${{ steps.version.outputs.version }} name: "Helm Chart ${{ steps.version.outputs.version }}" body: | ### Installation diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 79b113b0f2a..e3b339dfca7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,6 +33,7 @@ jobs: show-release-summary: name: 📋 Release Summary runs-on: ubuntu-latest + permissions: {} if: | github.repository == 'triggerdotdev/trigger.dev' && github.event_name == 'pull_request' && @@ -43,7 +44,7 @@ jobs: env: PR_BODY: ${{ github.event.pull_request.body }} run: | - echo "$PR_BODY" | sed -n '/^# Releases/,$p' >> $GITHUB_STEP_SUMMARY + echo "$PR_BODY" | sed -n '/^# Releases/,$p' >> "$GITHUB_STEP_SUMMARY" release: name: 🚀 Release npm packages @@ -63,9 +64,10 @@ jobs: published: ${{ steps.changesets.outputs.published }} published_packages: ${{ steps.changesets.outputs.publishedPackages }} published_package_version: ${{ 
steps.get_version.outputs.package_version }} + is_prerelease: ${{ steps.get_version.outputs.is_prerelease }} steps: - name: Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 # zizmor: ignore[artipacked] needs persisted git creds for tag push; no artifact upload here so no leak path with: fetch-depth: 0 ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.sha }} @@ -73,20 +75,22 @@ jobs: - name: Verify ref is on main if: github.event_name == 'workflow_dispatch' run: | - if ! git merge-base --is-ancestor ${{ github.event.inputs.ref }} origin/main; then + if ! git merge-base --is-ancestor "${GITHUB_EVENT_INPUTS_REF}" origin/main; then echo "Error: ref must be an ancestor of main (i.e., already merged)" exit 1 fi + env: + GITHUB_EVENT_INPUTS_REF: ${{ github.event.inputs.ref }} - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" # npm v11.5.1 or newer is required for OIDC support @@ -108,7 +112,7 @@ jobs: - name: Publish id: changesets - uses: changesets/action@v1 + uses: changesets/action@63a615b9cd06ba9a3e6d13796c7fbcb080a60a0b # v1.8.0 with: publish: pnpm run changeset:release createGithubReleases: false @@ -119,28 +123,54 @@ jobs: if: steps.changesets.outputs.published == 'true' id: get_version run: | - package_version=$(echo '${{ steps.changesets.outputs.publishedPackages }}' | jq -r '.[0].version') + package_version=$(echo "${STEPS_CHANGESETS_OUTPUTS_PUBLISHEDPACKAGES}" | jq -r '.[0].version') echo "package_version=${package_version}" >> "$GITHUB_OUTPUT" + # Any semver with a hyphen is a prerelease (e.g. 4.5.0-rc.0, 0.0.0-snapshot-...) 
+ if [[ "${package_version}" == *-* ]]; then + echo "is_prerelease=true" >> "$GITHUB_OUTPUT" + else + echo "is_prerelease=false" >> "$GITHUB_OUTPUT" + fi + env: + STEPS_CHANGESETS_OUTPUTS_PUBLISHEDPACKAGES: ${{ steps.changesets.outputs.publishedPackages }} - name: Create unified GitHub release if: steps.changesets.outputs.published == 'true' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} RELEASE_PR_BODY: ${{ github.event.pull_request.body }} + STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION: ${{ steps.get_version.outputs.package_version }} + STEPS_GET_VERSION_OUTPUTS_IS_PRERELEASE: ${{ steps.get_version.outputs.is_prerelease }} run: | - VERSION="${{ steps.get_version.outputs.package_version }}" + VERSION="${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" node scripts/generate-github-release.mjs "$VERSION" > /tmp/release-body.md + PRERELEASE_FLAG="" + if [ "${STEPS_GET_VERSION_OUTPUTS_IS_PRERELEASE}" = "true" ]; then + PRERELEASE_FLAG="--prerelease" + fi gh release create "v${VERSION}" \ --title "trigger.dev v${VERSION}" \ --notes-file /tmp/release-body.md \ - --target main + --target main \ + $PRERELEASE_FLAG - name: Create and push Docker tag if: steps.changesets.outputs.published == 'true' run: | set -e - git tag "v.docker.${{ steps.get_version.outputs.package_version }}" - git push origin "v.docker.${{ steps.get_version.outputs.package_version }}" + git tag "v.docker.${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + git push origin "v.docker.${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + env: + STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION: ${{ steps.get_version.outputs.package_version }} + + - name: Create and push Helm chart tag + if: steps.changesets.outputs.published == 'true' + run: | + set -e + git tag "helm-v${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + git push origin "helm-v${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + env: + STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION: ${{ steps.get_version.outputs.package_version }} # Trigger Docker builds directly via workflow_call 
since tags pushed with # GITHUB_TOKEN don't trigger other workflows (GitHub Actions limitation). @@ -148,11 +178,33 @@ jobs: name: 🐳 Publish Docker images needs: release if: needs.release.outputs.published == 'true' + permissions: + contents: read + packages: write + id-token: write + attestations: write uses: ./.github/workflows/publish.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} with: image_tag: v${{ needs.release.outputs.published_package_version }} + # Trigger Helm chart release directly via workflow_call (same GITHUB_TOKEN + # limitation as the Docker path). Runs after Docker images are published so + # the chart never references images that don't exist yet. + publish-helm: + name: 🧭 Publish Helm chart + needs: [release, publish-docker] + if: needs.release.outputs.published == 'true' + permissions: + contents: write + packages: write + uses: ./.github/workflows/release-helm.yml + with: + chart_version: ${{ needs.release.outputs.published_package_version }} + # After Docker images are published, update the GitHub release with the exact GHCR tag URL. # The GHCR package version ID is only known after the image is pushed, so we query for it here. 
update-release: @@ -167,9 +219,10 @@ jobs: - name: Update GitHub release with Docker image link env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NEEDS_RELEASE_OUTPUTS_PUBLISHED_PACKAGE_VERSION: ${{ needs.release.outputs.published_package_version }} run: | set -e - VERSION="${{ needs.release.outputs.published_package_version }}" + VERSION="${NEEDS_RELEASE_OUTPUTS_PUBLISHED_PACKAGE_VERSION}" TAG="v${VERSION}" # Query GHCR for the version ID matching this tag @@ -199,10 +252,11 @@ jobs: dispatch-changelog: name: 📝 Dispatch changelog PR needs: [release, update-release] - if: needs.release.outputs.published == 'true' + if: needs.release.outputs.published == 'true' && needs.release.outputs.is_prerelease != 'true' runs-on: ubuntu-latest + permissions: {} steps: - - uses: peter-evans/repository-dispatch@v3 + - uses: peter-evans/repository-dispatch@28959ce8df70de7be546dd1250a005dd32156697 # v4.0.1 with: token: ${{ secrets.CROSS_REPO_PAT }} repository: triggerdotdev/trigger.dev-site-v3 @@ -220,20 +274,21 @@ jobs: if: github.repository == 'triggerdotdev/trigger.dev' && github.event_name == 'workflow_dispatch' && github.event.inputs.type == 'prerelease' steps: - name: Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 ref: ${{ github.event.inputs.ref }} + persist-credentials: false - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" # npm v11.5.1 or newer is required for OIDC support @@ -247,10 +302,18 @@ jobs: - name: Generate Prisma Client run: pnpm run generate + - name: Exit changeset pre mode (if active) + run: | + if [ -f .changeset/pre.json ]; then + echo "Repo is in changeset pre 
mode; exiting so snapshot release can run" + pnpm exec changeset pre exit + fi + - name: Snapshot version - run: pnpm exec changeset version --snapshot ${{ github.event.inputs.prerelease_tag }} + run: pnpm exec changeset version --snapshot "${GITHUB_EVENT_INPUTS_PRERELEASE_TAG}" env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_EVENT_INPUTS_PRERELEASE_TAG: ${{ github.event.inputs.prerelease_tag }} - name: Clean run: pnpm run clean --filter "@trigger.dev/*" --filter "trigger.dev" @@ -259,6 +322,7 @@ jobs: run: pnpm run build --filter "@trigger.dev/*" --filter "trigger.dev" - name: Publish prerelease - run: pnpm exec changeset publish --no-git-tag --snapshot --tag ${{ github.event.inputs.prerelease_tag }} + run: pnpm exec changeset publish --no-git-tag --snapshot --tag "${GITHUB_EVENT_INPUTS_PRERELEASE_TAG}" env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_EVENT_INPUTS_PRERELEASE_TAG: ${{ github.event.inputs.prerelease_tag }} diff --git a/.github/workflows/sdk-compat.yml b/.github/workflows/sdk-compat.yml index eb347c0f771..1510af23181 100644 --- a/.github/workflows/sdk-compat.yml +++ b/.github/workflows/sdk-compat.yml @@ -18,17 +18,18 @@ jobs: steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: ${{ matrix.node }} cache: "pnpm" @@ -56,23 +57,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + 
uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" - name: 🥟 Setup Bun - uses: oven-sh/setup-bun@v2 + uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2.2.0 with: bun-version: latest @@ -97,23 +99,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" - name: 🦕 Setup Deno - uses: denoland/setup-deno@v2 + uses: denoland/setup-deno@667a34cdef165d8d2b2e98dde39547c9daac7282 # v2.0.4 with: deno-version: v2.x @@ -142,19 +145,20 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" - name: 📥 Download deps diff --git a/.github/workflows/trivy-image-webapp.yml b/.github/workflows/trivy-image-webapp.yml new file mode 100644 index 00000000000..7dae65ef2bf 
--- /dev/null +++ b/.github/workflows/trivy-image-webapp.yml @@ -0,0 +1,75 @@ +name: Trivy Image Scan (webapp) + +# OS-level CVE scan of a published webapp image. Called by the publish pipeline +# (publish.yml) to scan each build right after it's pushed to GHCR — so every +# main build and every release is scanned, not rebuilt. Also runnable ad-hoc +# via workflow_dispatch against any image ref. +# +# Report-only: writes a table to the run summary. No SARIF upload, no gate. +# Library/dependency CVEs are covered by Dependabot, so this is restricted to +# OS packages (`vuln-type: os`) to avoid double-reporting. + +on: + workflow_call: + inputs: + image-ref: + description: "Full image ref to scan (e.g. ghcr.io/triggerdotdev/trigger.dev:main)" + type: string + required: true + workflow_dispatch: + inputs: + image-ref: + description: "Full image ref to scan" + type: string + required: false + default: "ghcr.io/triggerdotdev/trigger.dev:main" + +permissions: {} + +concurrency: + group: trivy-image-webapp-${{ inputs.image-ref }} + cancel-in-progress: true + +jobs: + scan: + name: Scan + runs-on: ubuntu-latest + permissions: + contents: read + packages: read # pull the image from GHCR + steps: + # Authenticate to GHCR so the scan also works for private images + # (GITHUB_TOKEN isn't forwarded to Docker automatically). Harmless for + # public images. Pairs with the packages: read permission above. + - name: Log in to GitHub Container Registry + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run Trivy image scan + uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0 + with: + scan-type: image + image-ref: ${{ inputs.image-ref }} + # vuln-type maps to --pkg-types: OS packages only (library deps are + # Dependabot's job). ignore-unfixed drops vulns with no patch yet. 
+ vuln-type: os + ignore-unfixed: true + severity: HIGH,CRITICAL + format: table + output: trivy-image-webapp.txt + + - name: Job summary + if: always() + env: + IMAGE_REF: ${{ inputs.image-ref }} + run: | + { + echo "## Trivy Image Scan (webapp) — \`${IMAGE_REF}\`" + echo '```' + # GitHub step summary is capped at 1 MiB; truncate large reports. + head -c 900000 trivy-image-webapp.txt 2>/dev/null || echo "(no report produced)" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/typecheck.yml b/.github/workflows/typecheck.yml index 665d54b2563..91ec46f3a9a 100644 --- a/.github/workflows/typecheck.yml +++ b/.github/workflows/typecheck.yml @@ -12,19 +12,20 @@ jobs: steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" - name: 📥 Download deps diff --git a/.github/workflows/unit-tests-internal.yml b/.github/workflows/unit-tests-internal.yml index 92b951e8aa0..e2aae11b846 100644 --- a/.github/workflows/unit-tests-internal.yml +++ b/.github/workflows/unit-tests-internal.yml @@ -5,15 +5,22 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: unitTests: name: "🧪 Unit Tests: Internal" runs-on: ubuntu-latest strategy: + # one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard + fail-fast: false matrix: - shardIndex: [1, 2, 3, 4, 5, 6, 7, 8] - shardTotal: [8] + shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + shardTotal: [12] env: DOCKERHUB_USERNAME: ${{ 
secrets.DOCKERHUB_USERNAME }} SHARD_INDEX: ${{ matrix.shardIndex }} @@ -46,25 +53,26 @@ jobs: run: sudo systemctl restart docker - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub if: ${{ env.DOCKERHUB_USERNAME }} - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -75,12 +83,22 @@ jobs: - name: 🐳 Pre-pull testcontainer images if: ${{ env.DOCKERHUB_USERNAME }} run: | + # Retry each pull - DockerHub registry timeouts are a recurring transient CI flake. + pull() { + for attempt in 1 2 3; do + docker pull "$1" && return 0 + echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s" + sleep 10 + done + echo "::error::docker pull $1 failed after 3 attempts" + return 1 + } echo "Pre-pulling Docker images with authenticated session..." 
- docker pull postgres:14 - docker pull clickhouse/clickhouse-server:25.4-alpine - docker pull redis:7-alpine - docker pull testcontainers/ryuk:0.11.0 - docker pull electricsql/electric:1.2.4 + pull postgres:14 + pull clickhouse/clickhouse-server:25.4-alpine + pull redis:7.2 + pull testcontainers/ryuk:0.14.0 + pull electricsql/electric:1.2.4 echo "Image pre-pull complete" - name: 📥 Download deps @@ -90,7 +108,7 @@ jobs: run: pnpm run generate - name: 🧪 Run Internal Unit Tests - run: pnpm run test:internal --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} + run: pnpm run test:internal --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} --passWithNoTests - name: Gather all reports if: ${{ !cancelled() }} @@ -101,7 +119,7 @@ jobs: - name: Upload blob reports to GitHub Actions Artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: internal-blob-report-${{ matrix.shardIndex }} path: .vitest-reports/* @@ -115,27 +133,28 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 # no cache enabled, we're not installing deps - name: Download blob reports from GitHub Actions Artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: path: .vitest-reports pattern: internal-blob-report-* 
merge-multiple: true - name: Merge reports - run: pnpm dlx vitest@3.1.4 run --merge-reports --pass-with-no-tests + run: pnpm dlx vitest@4.1.7 run --merge-reports --pass-with-no-tests diff --git a/.github/workflows/unit-tests-packages.yml b/.github/workflows/unit-tests-packages.yml index 78474e03f27..6642f2443c4 100644 --- a/.github/workflows/unit-tests-packages.yml +++ b/.github/workflows/unit-tests-packages.yml @@ -5,15 +5,22 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: unitTests: name: "🧪 Unit Tests: Packages" runs-on: ubuntu-latest strategy: + # one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard + fail-fast: false matrix: - shardIndex: [1] - shardTotal: [1] + shardIndex: [1, 2, 3] + shardTotal: [3] env: DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} SHARD_INDEX: ${{ matrix.shardIndex }} @@ -46,25 +53,26 @@ jobs: run: sudo systemctl restart docker - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub if: ${{ env.DOCKERHUB_USERNAME }} - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -75,12 +83,22 @@ jobs: - name: 🐳 Pre-pull testcontainer images if: ${{ env.DOCKERHUB_USERNAME }} run: | + # Retry each pull - DockerHub 
registry timeouts are a recurring transient CI flake. + pull() { + for attempt in 1 2 3; do + docker pull "$1" && return 0 + echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s" + sleep 10 + done + echo "::error::docker pull $1 failed after 3 attempts" + return 1 + } echo "Pre-pulling Docker images with authenticated session..." - docker pull postgres:14 - docker pull clickhouse/clickhouse-server:25.4-alpine - docker pull redis:7-alpine - docker pull testcontainers/ryuk:0.11.0 - docker pull electricsql/electric:1.2.4 + pull postgres:14 + pull clickhouse/clickhouse-server:25.4-alpine + pull redis:7.2 + pull testcontainers/ryuk:0.14.0 + pull electricsql/electric:1.2.4 echo "Image pre-pull complete" - name: 📥 Download deps @@ -90,7 +108,7 @@ jobs: run: pnpm run generate - name: 🧪 Run Package Unit Tests - run: pnpm run test:packages --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} + run: pnpm run test:packages --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} --passWithNoTests - name: Gather all reports if: ${{ !cancelled() }} @@ -101,7 +119,7 @@ jobs: - name: Upload blob reports to GitHub Actions Artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: packages-blob-report-${{ matrix.shardIndex }} path: .vitest-reports/* @@ -115,27 +133,28 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: 
actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 # no cache enabled, we're not installing deps - name: Download blob reports from GitHub Actions Artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: path: .vitest-reports pattern: packages-blob-report-* merge-multiple: true - name: Merge reports - run: pnpm dlx vitest@3.1.4 run --merge-reports --pass-with-no-tests + run: pnpm dlx vitest@4.1.7 run --merge-reports --pass-with-no-tests diff --git a/.github/workflows/unit-tests-webapp.yml b/.github/workflows/unit-tests-webapp.yml index 523a1887db8..dc1cc978f35 100644 --- a/.github/workflows/unit-tests-webapp.yml +++ b/.github/workflows/unit-tests-webapp.yml @@ -5,15 +5,22 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: unitTests: name: "🧪 Unit Tests: Webapp" runs-on: ubuntu-latest strategy: + # one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard + fail-fast: false matrix: - shardIndex: [1, 2, 3, 4, 5, 6, 7, 8] - shardTotal: [8] + shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + shardTotal: [10] env: DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} SHARD_INDEX: ${{ matrix.shardIndex }} @@ -46,25 +53,26 @@ jobs: run: sudo systemctl restart docker - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 cache: "pnpm" # 
..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub if: ${{ env.DOCKERHUB_USERNAME }} - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -75,12 +83,23 @@ jobs: - name: 🐳 Pre-pull testcontainer images if: ${{ env.DOCKERHUB_USERNAME }} run: | + # Retry each pull - DockerHub registry timeouts are a recurring transient CI flake. + pull() { + for attempt in 1 2 3; do + docker pull "$1" && return 0 + echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s" + sleep 10 + done + echo "::error::docker pull $1 failed after 3 attempts" + return 1 + } echo "Pre-pulling Docker images with authenticated session..." - docker pull postgres:14 - docker pull clickhouse/clickhouse-server:25.4-alpine - docker pull redis:7-alpine - docker pull testcontainers/ryuk:0.11.0 - docker pull electricsql/electric:1.2.4 + pull postgres:14 + pull clickhouse/clickhouse-server:25.4-alpine + pull redis:7.2 + pull testcontainers/ryuk:0.14.0 + pull electricsql/electric:1.2.4 + pull minio/minio:latest echo "Image pre-pull complete" - name: 📥 Download deps @@ -90,7 +109,7 @@ jobs: run: pnpm run generate - name: 🧪 Run Webapp Unit Tests - run: pnpm run test:webapp --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} + run: pnpm run test:webapp --reporter=default --reporter=blob --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} --passWithNoTests env: DATABASE_URL: postgresql://postgres:postgres@localhost:5432/postgres DIRECT_URL: postgresql://postgres:postgres@localhost:5432/postgres @@ -109,7 +128,7 @@ jobs: - name: Upload blob reports to GitHub Actions Artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: webapp-blob-report-${{ matrix.shardIndex }} path: 
.vitest-reports/* @@ -123,27 +142,28 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: - node-version: 20.20.0 + node-version: 20.20.2 # no cache enabled, we're not installing deps - name: Download blob reports from GitHub Actions Artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: path: .vitest-reports pattern: webapp-blob-report-* merge-multiple: true - name: Merge reports - run: pnpm dlx vitest@3.1.4 run --merge-reports --pass-with-no-tests + run: pnpm dlx vitest@4.1.7 run --merge-reports --pass-with-no-tests diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7c90a5a30ad..96e76279c82 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -5,14 +5,30 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: webapp: uses: ./.github/workflows/unit-tests-webapp.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + e2e-webapp: + uses: ./.github/workflows/e2e-webapp.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} packages: uses: ./.github/workflows/unit-tests-packages.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} 
internal: uses: ./.github/workflows/unit-tests-internal.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/vouch-check-pr.yml b/.github/workflows/vouch-check-pr.yml index 21597cf467a..d854b1e0ce6 100644 --- a/.github/workflows/vouch-check-pr.yml +++ b/.github/workflows/vouch-check-pr.yml @@ -1,17 +1,18 @@ name: Vouch - Check PR on: - pull_request_target: + pull_request_target: # zizmor: ignore[dangerous-triggers] needed to comment/close fork PRs; safe because we never check out PR HEAD ref so no fork-controlled code runs types: [opened, reopened] -permissions: - contents: read - pull-requests: write - issues: read +permissions: {} jobs: check-vouch: runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write # auto-close unvouched PRs + issues: read steps: - uses: mitchellh/vouch/action/check-pr@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 with: @@ -23,11 +24,16 @@ jobs: require-draft: needs: check-vouch + permissions: + pull-requests: write # close non-draft PRs with a comment if: > github.event.pull_request.draft == false && github.event.pull_request.author_association != 'MEMBER' && github.event.pull_request.author_association != 'OWNER' && - github.event.pull_request.author_association != 'COLLABORATOR' + github.event.pull_request.author_association != 'COLLABORATOR' && + github.event.pull_request.user.login != 'devin-ai-integration[bot]' && + github.event.pull_request.user.login != 'dependabot[bot]' && + github.event.pull_request.user.login != 'github-actions[bot]' runs-on: ubuntu-latest steps: - name: Close non-draft PR diff --git a/.github/workflows/workflow-checks.yml b/.github/workflows/workflow-checks.yml new file mode 100644 index 00000000000..a11918c04fe --- /dev/null +++ b/.github/workflows/workflow-checks.yml @@ -0,0 +1,51 @@ +name: Workflow Checks + +on: + push: + branches: [main] + paths: + - 
'.github/workflows/**' + - '.github/actions/**' + - '.github/zizmor.yml' + pull_request: + paths: + - '.github/workflows/**' + - '.github/actions/**' + - '.github/zizmor.yml' + +permissions: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + actionlint: + name: Actionlint + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run actionlint + uses: docker://rhysd/actionlint:1.7.12@sha256:b1934ee5f1c509618f2508e6eb47ee0d3520686341fec936f3b79331f9315667 + + zizmor: + name: Zizmor + runs-on: ubuntu-latest + permissions: + security-events: write # Upload SARIF to GitHub Security tab + contents: read # Read workflow files for analysis + actions: read # Read workflow run metadata + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run zizmor + uses: zizmorcore/zizmor-action@5f14fd08f7cf1cb1609c1e344975f152c7ee938d # v0.5.6 diff --git a/.github/zizmor.yml b/.github/zizmor.yml new file mode 100644 index 00000000000..2fcbb540127 --- /dev/null +++ b/.github/zizmor.yml @@ -0,0 +1,5 @@ +rules: + unpinned-uses: + config: + policies: + '*': hash-pin diff --git a/.gitignore b/.gitignore index 5f6adddba0a..d5f0c945ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -65,7 +65,13 @@ apps/**/public/build /packages/trigger-sdk/src/package.json /packages/python/src/package.json **/.claude/settings.local.json +.claude/architecture/ +.claude/docs-plans/ +.claude/review-guides/ +.claude/scheduled_tasks.lock .mcp.log .mcp.json .cursor/debug.log -ailogger-output.log \ No newline at end of file +ailogger-output.log +# per-package vitest timing capture (transient; merged into root test-timings.json) +.vitest-timing.json diff --git a/.nvmrc b/.nvmrc index 
7c663e0a0bd..c675bca8de0 100644 --- a/.nvmrc +++ b/.nvmrc @@ -1 +1 @@ -v20.20.0 \ No newline at end of file +v20.20.2 diff --git a/.server-changes/README.md b/.server-changes/README.md index 82716de981c..2b0eeade36b 100644 --- a/.server-changes/README.md +++ b/.server-changes/README.md @@ -38,6 +38,14 @@ Speed up batch queue processing by removing stalls and fixing retry race The body text (below the frontmatter) is a one-line description of the change. Keep it concise — it will appear in release notes. +### Writing guidance + +These entries are public-facing - they ship verbatim in user-visible release notes. A few rules to keep them clean: + +- **One sentence is usually enough.** The body is the bullet in the changelog. If you need a paragraph, you're probably describing the implementation rather than the change. +- **Describe behavior, not implementation.** Skip internal scopes, middleware names, library specifics, framework internals. Users care about what's different for them, not how it's wired. +- **Never name internal tools or infra.** Observability stacks, internal services, infra components, monitoring backends, CI surfaces, AWS specifics - none of these belong in user-facing notes. + ## Lifecycle 1. Engineer adds a `.server-changes/` file in their PR diff --git a/.server-changes/batch-fast-fail-queue-size-limit.md b/.server-changes/batch-fast-fail-queue-size-limit.md deleted file mode 100644 index 77b926a5a80..00000000000 --- a/.server-changes/batch-fast-fail-queue-size-limit.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -area: webapp -type: fix ---- - -Batch items that hit the environment queue size limit now fast-fail without -retries and without creating pre-failed TaskRuns. 
diff --git a/.server-changes/bulk-action-cursor-pagination.md b/.server-changes/bulk-action-cursor-pagination.md new file mode 100644 index 00000000000..5f506493d11 --- /dev/null +++ b/.server-changes/bulk-action-cursor-pagination.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix run pagination that could duplicate or skip runs: the query orders by `(created_at, run_id)` but the cursor cut on `run_id` alone, which diverges when run_id order doesn't match created_at order (e.g. bulk replay re-processing runs). Cursors now encode the composite key as an opaque token and cut on the matching tuple; legacy bare-run_id cursors stay supported for in-flight pagination. diff --git a/.server-changes/cancel-stale-delayed-snapshots.md b/.server-changes/cancel-stale-delayed-snapshots.md new file mode 100644 index 00000000000..9a167c613b1 --- /dev/null +++ b/.server-changes/cancel-stale-delayed-snapshots.md @@ -0,0 +1,6 @@ +--- +area: supervisor +type: fix +--- + +Cancel pending delayed snapshots when a run completes or disconnects, preventing stale snapshots from pausing microVMs that have moved on to new work. diff --git a/.server-changes/compute-network-labels.md b/.server-changes/compute-network-labels.md new file mode 100644 index 00000000000..874081885d5 --- /dev/null +++ b/.server-changes/compute-network-labels.md @@ -0,0 +1,6 @@ +--- +area: supervisor +type: feature +--- + +Forward per-run identity labels to the compute provider on create and restore, letting network policy select runs (e.g. private link). diff --git a/.server-changes/compute-org-label.md b/.server-changes/compute-org-label.md new file mode 100644 index 00000000000..9306a0e2dc3 --- /dev/null +++ b/.server-changes/compute-org-label.md @@ -0,0 +1,8 @@ +--- +area: supervisor +type: improvement +--- + +Compute workload manager now sets an `org` label on every run (create + +restore) for network-policy selection, instead of a plan-gated label. The +Kubernetes workload manager is unchanged. 
diff --git a/.server-changes/env-vars-page-scope-values-to-visible-environments.md b/.server-changes/env-vars-page-scope-values-to-visible-environments.md new file mode 100644 index 00000000000..067c04661b7 --- /dev/null +++ b/.server-changes/env-vars-page-scope-values-to-visible-environments.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Speed up the environment variables page for projects with many archived preview branches. The page now only loads variable values for the environments it displays instead of every value ever created, including those left behind by archived branches. diff --git a/.server-changes/hipaa-addon-pricing-cta.md b/.server-changes/hipaa-addon-pricing-cta.md new file mode 100644 index 00000000000..8dc4a41f8b2 --- /dev/null +++ b/.server-changes/hipaa-addon-pricing-cta.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Request a HIPAA BAA add-on directly from any paid pricing tier in the dashboard. diff --git a/.server-changes/include-prisma-cli-in-prod-image.md b/.server-changes/include-prisma-cli-in-prod-image.md new file mode 100644 index 00000000000..888544239fe --- /dev/null +++ b/.server-changes/include-prisma-cli-in-prod-image.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix database migrations failing to run in the production image because the Prisma CLI was missing from the build. diff --git a/.server-changes/mollifier-decision-enrolled-org-labels.md b/.server-changes/mollifier-decision-enrolled-org-labels.md new file mode 100644 index 00000000000..b9e8a11f84a --- /dev/null +++ b/.server-changes/mollifier-decision-enrolled-org-labels.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Add bounded `enrolled` and `org` labels to the `mollifier.decisions` metric so per-enrolled-org pass-through vs mollify is visible (the `org` label is attached only for the enrolled cohort to keep cardinality bounded). 
diff --git a/.server-changes/react-router-route-matching-perf.md b/.server-changes/react-router-route-matching-perf.md new file mode 100644 index 00000000000..a264835af55 --- /dev/null +++ b/.server-changes/react-router-route-matching-perf.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Speed up the dashboard and API under high request load by memoizing react-router's per-request route matching, which previously re-flattened, re-ranked, and recompiled the entire route table on every request. diff --git a/.server-changes/realtime-replica-read-consistency.md b/.server-changes/realtime-replica-read-consistency.md new file mode 100644 index 00000000000..d23c73e682d --- /dev/null +++ b/.server-changes/realtime-replica-read-consistency.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Realtime feed reads now wait out measured read-replica lag and retry stale reads, so subscribers receive each change's current content instead of trailing one change behind when a read replica races the write. diff --git a/.server-changes/realtime-runs-subscription-scalability.md b/.server-changes/realtime-runs-subscription-scalability.md new file mode 100644 index 00000000000..5de00aae675 --- /dev/null +++ b/.server-changes/realtime-runs-subscription-scalability.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add a new backend for the realtime runs feed (single runs, tags, and batches) that scales under high concurrency, available behind a feature flag. diff --git a/.server-changes/require-plugins-fail-fast.md b/.server-changes/require-plugins-fail-fast.md new file mode 100644 index 00000000000..591cd47c402 --- /dev/null +++ b/.server-changes/require-plugins-fail-fast.md @@ -0,0 +1,8 @@ +--- +area: webapp +type: feature +--- + +Add `REQUIRE_PLUGINS=1` env var. When set, the RBAC plugin loader throws instead of silently falling back to the default implementation if the plugin module fails to load (missing, broken transitive dep, etc.). 
The webapp's `/healthcheck` route now resolves the lazy plugin controller so the throw surfaces during readiness probes — a deploy where the plugin didn't load fails the probe and is rolled back. + +Self-hosters leave `REQUIRE_PLUGINS` unset and continue to use the fallback when no plugin is installed. diff --git a/.server-changes/retry-transient-instance-create-failures.md b/.server-changes/retry-transient-instance-create-failures.md new file mode 100644 index 00000000000..f7b9c7afd11 --- /dev/null +++ b/.server-changes/retry-transient-instance-create-failures.md @@ -0,0 +1,6 @@ +--- +area: supervisor +type: fix +--- + +Retry transient instance create failures during cold starts instead of waiting minutes for the run to be requeued. diff --git a/.server-changes/runs-backward-pagination-slice.md b/.server-changes/runs-backward-pagination-slice.md new file mode 100644 index 00000000000..41695f4e159 --- /dev/null +++ b/.server-changes/runs-backward-pagination-slice.md @@ -0,0 +1,14 @@ +--- +area: webapp +type: fix +--- + +Fix an off-by-one in `ClickHouseRunsRepository.listRunIds` backward pagination. +When paging backward with more rows before the page (`hasMore`), the displayed +page was sliced as `rows.slice(1, size + 1)`, which dropped the row closest to +the cursor and kept the extra "has-more" sentinel — returning a page that +straddled two logical pages (one row from the correct previous page plus one +from the page before it). The result set is always the first `page.size` rows +(the sentinel is the trailing element in both directions), so the slice is now +`rows.slice(0, size)` for forward and backward alike. Forward pagination and the +cursor values were already correct and are unchanged. 
diff --git a/.server-changes/runs-bulk-action-no-reload.md b/.server-changes/runs-bulk-action-no-reload.md new file mode 100644 index 00000000000..1926ab20b75 --- /dev/null +++ b/.server-changes/runs-bulk-action-no-reload.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Stop reloading the runs list when opening or closing the bulk action inspector. diff --git a/.server-changes/sanitize-agent-view-urls.md b/.server-changes/sanitize-agent-view-urls.md new file mode 100644 index 00000000000..c534a03623d --- /dev/null +++ b/.server-changes/sanitize-agent-view-urls.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Sanitize URLs from streamed agent and tool data before rendering them in the dashboard's Agent view, so an unsafe scheme such as `javascript:` can no longer produce a clickable link or image source. diff --git a/.server-changes/scheduled-run-region-display.md b/.server-changes/scheduled-run-region-display.md new file mode 100644 index 00000000000..dca41c4341e --- /dev/null +++ b/.server-changes/scheduled-run-region-display.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Scheduled runs now show under their correct region in the dashboard, run details, and the API, and match region filters, instead of appearing under a separate region. diff --git a/.server-changes/session-route-hardening.md b/.server-changes/session-route-hardening.md new file mode 100644 index 00000000000..2734b35a784 --- /dev/null +++ b/.server-changes/session-route-hardening.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Hardening fixes for realtime sessions: stricter authorization on snapshot URLs and out-channel appends, environment-scoped message delivery for waiting runs, and idempotent appends via the X-Part-Id header. Session creation now rejects expired sessions, externalId can no longer be changed after creation, and the sessions list returns friendly run ids. 
diff --git a/.server-changes/snapshots-since-replica-primary-fallback.md b/.server-changes/snapshots-since-replica-primary-fallback.md new file mode 100644 index 00000000000..9b8257f6410 --- /dev/null +++ b/.server-changes/snapshots-since-replica-primary-fallback.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Run snapshot polling no longer errors or pays extra latency when the database read replica hasn't yet replicated the snapshot the runner is polling from (`RUN_ENGINE_READ_REPLICA_SNAPSHOTS_SINCE_ENABLED`): the read is briefly retried on the replica and served from the primary if it still hasn't caught up. Polling also now rejects a since-snapshot id that doesn't belong to the run being polled. diff --git a/.server-changes/stop-creating-taskruntag-records.md b/.server-changes/stop-creating-taskruntag-records.md deleted file mode 100644 index 0b068d3c3ac..00000000000 --- a/.server-changes/stop-creating-taskruntag-records.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -area: webapp -type: improvement ---- - -Stop creating TaskRunTag records and _TaskRunToTaskRunTag join table entries during task triggering. The denormalized runTags string array on TaskRun already stores tag names, making the M2M relation redundant write overhead. diff --git a/.server-changes/trace-export-formats.md b/.server-changes/trace-export-formats.md new file mode 100644 index 00000000000..ff15483003f --- /dev/null +++ b/.server-changes/trace-export-formats.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Export a run's full trace from the run page as a downloadable Log, Markdown, or JSON Lines file, or copy it to the clipboard for pasting into an AI assistant. The export streams straight from the store, so even very large runs export reliably. 
diff --git a/.server-changes/trace-page-payload-diet.md b/.server-changes/trace-page-payload-diet.md new file mode 100644 index 00000000000..9f84e4b22db --- /dev/null +++ b/.server-changes/trace-page-payload-diet.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Shrinks the run trace page loader payload by keeping raw span events server-side and makes large trace trees render more efficiently. Also adds an optional `TRACE_VIEW_EMERGENCY_SPAN_CAP` env var that clamps trace summary and detailed summary span limits on both event store paths. diff --git a/.server-changes/trigger-worker-queue-db-error-leak.md b/.server-changes/trigger-worker-queue-db-error-leak.md new file mode 100644 index 00000000000..9725ef9f2eb --- /dev/null +++ b/.server-changes/trigger-worker-queue-db-error-leak.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Stop `trigger()` from leaking raw database connection errors to API clients during a database outage; infrastructure errors now return a generic, retryable 500. 
diff --git a/.server-changes/vercel-auto-promote-toggle.md b/.server-changes/vercel-auto-promote-toggle.md deleted file mode 100644 index bb5f25a21c1..00000000000 --- a/.server-changes/vercel-auto-promote-toggle.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -area: webapp -type: feature ---- - -Vercel integration option to disable auto promotions diff --git a/.vouch.yml b/.vouch.yml index 8a9668392d3..ec6e85aa705 100644 --- a/.vouch.yml +++ b/.vouch.yml @@ -1,2 +1,4 @@ vouch: - github: edosrecki + - github: GautamBytes + - github: ConProgramming diff --git a/.vscode/launch.json b/.vscode/launch.json index 71a76904a2b..1044443e197 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -47,78 +47,6 @@ "url": "http://localhost:3030", "webRoot": "${workspaceFolder}/apps/webapp/app" }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 init CLI", - "command": "pnpm exec trigger init", - "cwd": "${workspaceFolder}/references/init-shell", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 init dev CLI", - "command": "pnpm exec trigger dev", - "cwd": "${workspaceFolder}/references/init-shell", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 Dev CLI", - "command": "pnpm exec trigger dev", - "cwd": "${workspaceFolder}/references/hello-world", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug Dev Next.js Realtime", - "command": "pnpm exec trigger dev", - "cwd": "${workspaceFolder}/references/nextjs-realtime", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug prisma-catalog deploy CLI", - "command": "pnpm exec trigger deploy --self-hosted --load-image", - "cwd": "${workspaceFolder}/references/prisma-catalog", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 Deploy CLI", - "command": "pnpm exec trigger deploy 
--self-hosted --load-image", - "cwd": "${workspaceFolder}/references/hello-world", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 list-profiles CLI", - "command": "pnpm exec trigger list-profiles --log-level debug", - "cwd": "${workspaceFolder}/references/hello-world", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 update CLI", - "command": "pnpm exec trigger update", - "cwd": "${workspaceFolder}/references/hello-world", - "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug V3 Management", - "command": "pnpm run management", - "cwd": "${workspaceFolder}/references/hello-world", - "sourceMaps": true - }, { "type": "node", "request": "attach", @@ -135,14 +63,6 @@ "cwd": "${workspaceFolder}/packages/cli-v3", "sourceMaps": true }, - { - "type": "node-terminal", - "request": "launch", - "name": "debug v3 hello-world dev", - "command": "pnpm exec trigger dev", - "cwd": "${workspaceFolder}/references/hello-world", - "sourceMaps": true - }, { "type": "node-terminal", "request": "launch", @@ -158,14 +78,6 @@ "command": "pnpm run test ./src/run-queue/index.test.ts --run", "cwd": "${workspaceFolder}/internal-packages/run-engine", "sourceMaps": true - }, - { - "type": "node-terminal", - "request": "launch", - "name": "Debug d3-demo", - "command": "pnpm exec trigger dev", - "cwd": "${workspaceFolder}/references/d3-demo", - "sourceMaps": true } ] } diff --git a/.vscode/settings.json b/.vscode/settings.json index fd9f3dcde0c..f969bb6d5de 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,5 @@ { - "deno.enablePaths": ["references/deno-reference", "runtime_tests/tests/deno"], + "deno.enablePaths": ["runtime_tests/tests/deno"], "debug.toolBarLocation": "commandCenter", "typescript.tsdk": "node_modules/typescript/lib", "search.exclude": { diff --git a/AGENTS.md b/AGENTS.md index 99496f91bde..8ff9f18663c 100644 --- 
a/AGENTS.md +++ b/AGENTS.md @@ -7,18 +7,19 @@ This repository is a pnpm monorepo managed with Turbo. It contains multiple apps - `apps/supervisor` – Node application for executing built tasks. - `packages/*` – Published packages such as `@trigger.dev/sdk`, the CLI (`trigger.dev`), and shared libraries. - `internal-packages/*` – Internal-only packages used by the webapp and other apps. -- `references/*` – Example projects for manual testing and development of new features. +- Example/reference projects for manual testing live in a separate repo: [`triggerdotdev/references`](https://github.com/triggerdotdev/references). - `ai/references` – Contains additional documentation including an overview (`repo.md`) and testing guidelines (`tests.md`). See `ai/references/repo.md` for a more complete explanation of the workspaces. ## Development setup -1. Install dependencies with `pnpm i` (pnpm `10.23.0` and Node.js `20.20.0` are required). +1. Install dependencies with `pnpm i` (pnpm `10.33.2` and Node.js `20.20.2` are required). 2. Copy `.env.example` to `.env` and generate a random 16 byte hex string for `ENCRYPTION_KEY` (`openssl rand -hex 16`). Update other secrets if needed. 3. Start the local services with Docker: ```bash pnpm run docker ``` + Add `:full` (`pnpm run docker:full`) for the optional observability + chaos tooling. See `docker/docker-compose.extras.yml`. 4. Run database migrations: ```bash pnpm run db:migrate @@ -64,5 +65,5 @@ Refer to `ai/references/tests.md` for details on writing tests. Tests should avo ```bash pnpm run dev --filter docs ``` -- `references/README.md` explains how to create new reference projects for manual testing. +- The [`triggerdotdev/references`](https://github.com/triggerdotdev/references) repo's README explains how to create new reference projects for manual testing. 
diff --git a/CLAUDE.md b/CLAUDE.md index 0a54cced672..c0fd82fb368 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,10 +4,13 @@ This file provides guidance to Claude Code when working with this repository. Su ## Build and Development Commands -This is a pnpm 10.23.0 monorepo using Turborepo. Run commands from root with `pnpm run`. +This is a pnpm 10.33.2 monorepo using Turborepo. Run commands from root with `pnpm run`. + +**Adding dependencies:** Edit `package.json` directly instead of using `pnpm add`, then run `pnpm i` from the repo root. See `.claude/rules/package-installation.md` for the full process. ```bash -pnpm run docker # Start Docker services (PostgreSQL, Redis, Electric) +pnpm run docker # Core dev services (Postgres, Redis, Electric, MinIO, ClickHouse, s2-lite) +# pnpm run docker:full # Same + observability stack (Prometheus, Grafana, OTEL) and chaos tooling pnpm run db:migrate # Run database migrations pnpm run db:seed # Seed the database (required for reference projects) @@ -66,6 +69,17 @@ containerTest("should use both", async ({ prisma, redisOptions }) => { }); ``` +## Code Style + +### Imports + +**Prefer static imports over dynamic imports.** Only use dynamic `import()` when: +- Circular dependencies cannot be resolved otherwise +- Code splitting is genuinely needed for performance +- The module must be loaded conditionally at runtime + +Dynamic imports add unnecessary overhead in hot paths and make code harder to analyze. If you find yourself using `await import()`, ask if a regular `import` statement would work instead. + ## Changesets and Server Changes When modifying any public package (`packages/*` or `integrations/*`), add a changeset: @@ -92,7 +106,7 @@ User API call -> Webapp routes -> Services -> RunEngine -> Redis Queue -> Superv ### Apps -- **apps/webapp**: Remix 2.1.0 app - main API, dashboard, orchestration. Uses Express server. +- **apps/webapp**: Remix 2.17.4 app - main API, dashboard, orchestration. Uses Express server. 
- **apps/supervisor**: Manages task execution containers (Docker/Kubernetes). ### Public Packages @@ -124,7 +138,7 @@ Docs live in `docs/` as a Mintlify site (MDX format). See `docs/CLAUDE.md` for c ### Reference Projects -The `references/` directory contains test workspaces for testing SDK and platform features. Use `references/hello-world` to manually test changes before submitting PRs. +Reference/example projects for testing SDK and platform features live in a separate repo: [`triggerdotdev/references`](https://github.com/triggerdotdev/references). Clone it alongside this repo and use its `projects/hello-world` to manually test changes before submitting PRs. See that repo's README for setup and linking to a local monorepo build. ## Docker Image Guidelines @@ -153,15 +167,17 @@ export const myTask = task({ The `rules/` directory contains versioned SDK documentation distributed via the SDK installer. Current version: `rules/manifest.json`. Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked - these are maintained in separate dedicated passes. -## Testing with hello-world Reference Project +## Testing with the hello-world Reference Project + +The reference projects live in the separate [`triggerdotdev/references`](https://github.com/triggerdotdev/references) repo - clone it alongside this repo. First-time setup: -1. `pnpm run db:seed` to seed the database -2. Build CLI: `pnpm run build --filter trigger.dev && pnpm i` -3. Authorize: `cd references/hello-world && pnpm exec trigger login -a http://localhost:3030` +1. `pnpm run db:seed` to seed the database (creates the References org + hello-world project) +2. Build the CLI/packages you want to test: `pnpm run build --filter trigger.dev` +3. 
In your `references` clone, follow its README to link to your local monorepo build, then authorize: `cd projects/hello-world && pnpm exec trigger login -a http://localhost:3030` -Running: `cd references/hello-world && pnpm exec trigger dev` +Running (from your `references` clone): `cd projects/hello-world && pnpm exec trigger dev` ## Local Task Testing Workflow @@ -176,7 +192,8 @@ curl -s http://localhost:3030/healthcheck # Verify running ### Step 2: Start Trigger Dev in Background ```bash -cd references/hello-world && pnpm exec trigger dev +# in your triggerdotdev/references clone +cd projects/hello-world && pnpm exec trigger dev # Wait for "Local worker ready [node]" ``` diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 88e24cba4f0..cddb974417d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,8 +29,8 @@ branch are tagged into a release periodically. ### Prerequisites -- [Node.js](https://nodejs.org/en) version 20.20.0 -- [pnpm package manager](https://pnpm.io/installation) version 10.23.0 +- [Node.js](https://nodejs.org/en) version 20.20.2 +- [pnpm package manager](https://pnpm.io/installation) version 10.33.2 - [Docker](https://www.docker.com/get-started/) - [protobuf](https://github.com/protocolbuffers/protobuf) @@ -49,9 +49,9 @@ branch are tagged into a release periodically. ``` cd trigger.dev ``` -3. Ensure you are on the correct version of Node.js (20.20.0). If you are using `nvm`, there is an `.nvmrc` file that will automatically select the correct version of Node.js when you navigate to the repository. +3. Ensure you are on the correct version of Node.js (20.20.2). If you are using `nvm`, there is an `.nvmrc` file that will automatically select the correct version of Node.js when you navigate to the repository. -4. Run `corepack enable` to use the correct version of pnpm (`10.23.0`) as specified in the root `package.json` file. +4. Run `corepack enable` to use the correct version of pnpm (`10.33.2`) as specified in the root `package.json` file. 
5. Install the required packages using pnpm. ``` @@ -71,21 +71,27 @@ branch are tagged into a release periodically. Feel free to update `SESSION_SECRET` and `MAGIC_LINK_SECRET` as well using the same method. -8. Start Docker. This starts the required services like Postgres & Redis. If this is your first time using Docker, consider going through this [guide](DOCKER_INSTALLATION.md) +8. Start Docker. This starts the core dev services (Postgres, Redis, Electric, MinIO, ClickHouse, s2-lite) and runs the ClickHouse migrator once on first start. If this is your first time using Docker, consider going through this [guide](DOCKER_INSTALLATION.md). ``` pnpm run docker ``` + For the observability stack (Prometheus, Grafana, OTEL collector) and other optional tooling (Toxiproxy, nginx-h2, ch-ui, extra electric shard), use `pnpm run docker:full` instead. See `docker/docker-compose.extras.yml` for the full list. + 9. Migrate the database ``` pnpm run db:migrate ``` -10. Build everything +10. Build the webapp, CLI, and SDK + ``` + pnpm run build --filter webapp --filter trigger.dev --filter @trigger.dev/sdk ``` - pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk +11. Seed the database. This creates a local user, a `References` org, and the reference projects (including `hello-world`) with stable IDs. ``` -11. Run the app. See the section below. + pnpm run db:seed + ``` +12. Run the app. See the section below. ## Running @@ -101,29 +107,32 @@ branch are tagged into a release periodically. ## Manual testing using hello-world -We use the `/references/hello-world` subdirectory as a staging ground for testing changes to the SDK (`@trigger.dev/sdk` at `/packages/trigger-sdk`), the Core package (`@trigger.dev/core` at `packages/core`), the CLI (`trigger.dev` at `/packages/cli-v3`) and the platform (The remix app at `/apps/webapp`). 
The instructions below will get you started on using the `hello-world` for local development of Trigger.dev. +The `hello-world` reference project (and the others) live in a separate repo: +[`triggerdotdev/references`](https://github.com/triggerdotdev/references). Clone it +alongside this repo. It's the staging ground for testing changes to the SDK +(`@trigger.dev/sdk` at `/packages/trigger-sdk`), the Core package +(`@trigger.dev/core` at `/packages/core`), the CLI (`trigger.dev` at +`/packages/cli-v3`) and the platform (the Remix app at `/apps/webapp`). +To exercise your local monorepo changes, the reference project links to your local +build — see the references repo's README for the `pnpm run link` flow. -### First-time setup +> Paths below such as `projects/hello-world` are relative to your `references` +> clone, not this repo. -First, make sure you are running the webapp according to the instructions above. Then: - -1. Visit http://localhost:3030 in your browser and create a new project called "hello-world". +### First-time setup -2. In Postgres go to the "Projects" table and for the project you create change the `externalRef` to `proj_rrkpdguyagvsoktglnod`. +First, make sure you are running the webapp according to the instructions above. The seed step from setup already created a `hello-world` project under the `References` org with the stable ref `proj_rrkpdguyagvsoktglnod` — log in at http://localhost:3030 with any email to access it. Then: -3. Build the CLI +1. Build the CLI and packages (skip if you already ran the build step in setup) ```sh -# Build the CLI -pnpm run build --filter trigger.dev -# Make it accessible to `pnpm exec` -pnpm i +pnpm run build --filter trigger.dev --filter "@trigger.dev/*" ``` -4. Change into the `/references/hello-world` directory and authorize the CLI to the local server: +2. 
In your `references` clone, link to your local monorepo build (see its README), then change into `projects/hello-world` and authorize the CLI to the local server: ```sh -cd references/hello-world +cd projects/hello-world cp .env.example .env pnpm exec trigger login -a http://localhost:3030 ``` @@ -133,7 +142,7 @@ This will open a new browser window and authorize the CLI against your local use You can optionally pass a `--profile` flag to the `login` command, which will allow you to use the CLI with separate accounts/servers. We suggest using a profile called `local` for your local development: ```sh -cd references/hello-world +cd projects/hello-world pnpm exec trigger login -a http://localhost:3030 --profile local # later when you run the dev or deploy command: pnpm exec trigger dev --profile local @@ -146,46 +155,46 @@ The following steps should be followed any time you start working on a new featu 1. Make sure the webapp is running on localhost:3030 -2. Open a terminal window and build the CLI and packages and watch for changes +2. In this repo, open a terminal window and build the CLI and packages and watch for changes (the reference project links against this build) ```sh pnpm run dev --filter trigger.dev --filter "@trigger.dev/*" ``` -3. Open another terminal window, and change into the `/references/hello-world` directory. +3. Open another terminal window, and change into `projects/hello-world` in your `references` clone. 4. Run the `dev` command, which will register all the local tasks with the platform and allow you to start testing task execution: ```sh -# in /references/hello-world +# in /projects/hello-world pnpm exec trigger dev ``` If you want additional debug logging, you can use the `--log-level debug` flag: ```sh -# in /references/hello-world +# in /projects/hello-world pnpm exec trigger dev --log-level debug ``` -6. If you make any changes in the CLI/Core/SDK, you'll need to `CTRL+C` to exit the `dev` command and restart it to pickup changes. 
Any changes to the files inside of the `hello-world/src/trigger` dir will automatically be rebuilt by the `dev` command. +5. If you make any changes in the CLI/Core/SDK, you'll need to `CTRL+C` to exit the `dev` command and restart it to pick up changes. Any changes to the files inside the reference project's `src/trigger` dir will automatically be rebuilt by the `dev` command. -7. Navigate to the `hello-world` project in your local dashboard at localhost:3030 and you should see the list of tasks. +6. Navigate to the `hello-world` project in your local dashboard at localhost:3030 and you should see the list of tasks. -8. Go to the "Test" page in the sidebar and select a task. Then enter a payload and click "Run test". You can tell what the payloads should be by looking at the relevant task file inside the `/references/hello-world/src/trigger` folder. Many of them accept an empty payload. +7. Go to the "Test" page in the sidebar and select a task. Then enter a payload and click "Run test". You can tell what the payloads should be by looking at the relevant task file inside the reference project's `src/trigger` folder. Many of them accept an empty payload. -9. Feel free to add additional files in `hello-world/src/trigger` to test out specific aspects of the system, or add in edge cases. +8. Feel free to add additional files in the reference project's `src/trigger` dir to test out specific aspects of the system, or add in edge cases. ## Adding and running migrations -1. Modify internal-packages/database/prisma/schema.prisma file -2. Change directory to the packages/database folder +1. Modify `internal-packages/database/prisma/schema.prisma`. +2. Change directory to the database package: ```sh - cd packages/database + cd internal-packages/database ``` -3. Create a migration +3. Create a migration: ``` pnpm run db:migrate:dev:create @@ -193,50 +202,17 @@ pnpm exec trigger dev --log-level debug This creates a migration file. 
Check the migration file does only what you want. If you're adding any database indexes they must use `CONCURRENTLY`, otherwise they'll lock the table when executed. -4. Run the migration. - -``` -pnpm run db:migrate:deploy -pnpm run generate -``` - -This executes the migrations against your database and applies changes to the database schema(s), and then regenerates the Prisma client. - -4. Commit generated migrations as well as changes to the schema.prisma file -5. If you're using VSCode you may need to restart the Typescript server in the webapp to get updated type inference. Open a TypeScript file, then open the Command Palette (View > Command Palette) and run `TypeScript: Restart TS server`. - -## Add sample jobs - -The [references/job-catalog](./references/job-catalog/) project defines simple jobs you can get started with. - -1. `cd` into `references/job-catalog` -2. Create a `.env` file with the following content, - replacing `` with an actual key: +4. Run the migration: -```env -TRIGGER_API_KEY=[TRIGGER_DEV_API_KEY] -TRIGGER_API_URL=http://localhost:3030 -``` - -`TRIGGER_API_URL` is used to configure the URL for your Trigger.dev instance, -where the jobs will be registered. - -3. Run one of the the `job-catalog` files: - -```sh -pnpm run events -``` - -This will open up a local server using `express` on port 8080. Then in a new terminal window you can run the trigger-cli dev command: - -```sh -pnpm run dev:trigger -``` + ``` + pnpm run db:migrate:deploy + pnpm run generate + ``` -See the [Job Catalog](./references/job-catalog/README.md) file for more. + This executes the migrations against your database and applies changes to the database schema(s), and then regenerates the Prisma client. -4. Navigate to your trigger.dev instance ([http://localhost:3030](http://localhost:3030/)), to see the jobs. - You can use the test feature to trigger them. +5. Commit the generated migration files as well as the changes to `schema.prisma`. +6. 
If you're using VSCode you may need to restart the TypeScript server in the webapp to get updated type inference. Open a TypeScript file, then open the Command Palette (View > Command Palette) and run `TypeScript: Restart TS server`. ## Making a pull request @@ -334,3 +310,7 @@ The process running on port `3030` should be destroyed. ```sh sudo kill -9 ``` + +### Running two clones side by side (worktree, branch experiment) + +The default `pnpm run docker` uses the project name `triggerdotdev-docker` and the standard host ports (5432, 6379, 3060, 4566, 8123, 9000, 9005, 9006). To stand up a second instance in another clone without clashing, set a different `COMPOSE_PROJECT_NAME` and the offset host ports in that clone's `.env`. The "Running multiple instances side by side" block in `.env.example` lists every overridable env var with its default for reference; uncomment the lines you need and update `DATABASE_URL` / `CLICKHOUSE_URL` / `REDIS_PORT` / `APP_ORIGIN` / `LOGIN_ORIGIN` / `ELECTRIC_ORIGIN` / `REALTIME_STREAMS_S2_ENDPOINT` to match. diff --git a/ai/references/repo.md b/ai/references/repo.md index 4f67bde2b4b..6e0ff056716 100644 --- a/ai/references/repo.md +++ b/ai/references/repo.md @@ -1,6 +1,6 @@ ## Repo Overview -This is a pnpm 10.23.0 monorepo that uses turborepo @turbo.json. The following workspaces are relevant +This is a pnpm 10.33.2 monorepo that uses turborepo @turbo.json. The following workspaces are relevant ## Apps diff --git a/apps/supervisor/Containerfile b/apps/supervisor/Containerfile index d5bb5862e96..5b3b148a7cb 100644 --- a/apps/supervisor/Containerfile +++ b/apps/supervisor/Containerfile @@ -16,7 +16,7 @@ COPY --from=pruner --chown=node:node /app/out/json/ . 
COPY --from=pruner --chown=node:node /app/out/pnpm-lock.yaml ./pnpm-lock.yaml COPY --from=pruner --chown=node:node /app/out/pnpm-workspace.yaml ./pnpm-workspace.yaml -RUN corepack enable && corepack prepare pnpm@10.23.0 --activate +RUN corepack enable && corepack prepare pnpm@10.33.2 --activate FROM base AS deps-fetcher RUN apk add --no-cache python3-dev py3-setuptools make g++ gcc linux-headers diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index 7456d421850..2725fe2b729 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -18,6 +18,7 @@ "@kubernetes/client-node": "^1.0.0", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", + "ioredis": "~5.6.0", "p-limit": "^6.2.0", "prom-client": "^15.1.0", "socket.io": "4.7.4", @@ -25,6 +26,7 @@ "zod": "3.25.76" }, "devDependencies": { + "@internal/testcontainers": "workspace:*", "@types/dockerode": "^3.3.33" } } diff --git a/apps/supervisor/src/backpressure/backpressureMetrics.ts b/apps/supervisor/src/backpressure/backpressureMetrics.ts new file mode 100644 index 00000000000..ffe57628548 --- /dev/null +++ b/apps/supervisor/src/backpressure/backpressureMetrics.ts @@ -0,0 +1,34 @@ +import { Counter, Gauge, type Registry } from "prom-client"; + +/** Prometheus metrics for dequeue backpressure. */ +export class BackpressureMetrics { + /** 1 while backpressure is engaged (computed signal, set even in dry-run). */ + readonly engaged: Gauge; + /** 1 when running in dry-run (gates inert). */ + readonly dryRun: Gauge; + /** Dequeue attempts the gate skipped - or would have, in dry-run (labelled). */ + readonly skipsTotal: Counter; + + constructor(opts: { register: Registry; prefix?: string }) { + const prefix = opts.prefix ?? 
"supervisor_backpressure"; + + this.engaged = new Gauge({ + name: `${prefix}_engaged`, + help: "1 while dequeue backpressure is engaged (computed signal, regardless of dry-run)", + registers: [opts.register], + }); + + this.dryRun = new Gauge({ + name: `${prefix}_dry_run`, + help: "1 when dequeue backpressure is in dry-run mode (gates inert)", + registers: [opts.register], + }); + + this.skipsTotal = new Counter({ + name: `${prefix}_skipped_dequeues_total`, + help: "Dequeue attempts skipped by backpressure (or would be, in dry-run)", + labelNames: ["dry_run"], + registers: [opts.register], + }); + } +} diff --git a/apps/supervisor/src/backpressure/backpressureMonitor.test.ts b/apps/supervisor/src/backpressure/backpressureMonitor.test.ts new file mode 100644 index 00000000000..7af28ffc9f5 --- /dev/null +++ b/apps/supervisor/src/backpressure/backpressureMonitor.test.ts @@ -0,0 +1,353 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { Registry } from "prom-client"; +import { BackpressureMonitor, type BackpressureSignalSource } from "./backpressureMonitor.js"; +import { BackpressureMetrics } from "./backpressureMetrics.js"; + +function countingSource(verdict: { engaged: boolean } | null): { + source: BackpressureSignalSource; + reads: () => number; +} { + let reads = 0; + return { + source: { + read: async () => { + reads++; + return verdict; + }, + }, + reads: () => reads, + }; +} + +describe("BackpressureMonitor", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("when disabled, never skips dequeue and never reads the signal source", () => { + // Even though the source would report "engaged", a disabled monitor must be + // a complete no-op: this is the backwards-compatibility guarantee. 
+ const { source, reads } = countingSource({ engaged: true }); + const monitor = new BackpressureMonitor({ enabled: false, source }); + + monitor.start(); + + expect(monitor.shouldSkipDequeue()).toBe(false); + expect(reads()).toBe(0); + + monitor.stop(); + }); + + it("when enabled and the source reports engaged, skips dequeue after a refresh", async () => { + const { source } = countingSource({ engaged: true }); + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); // flush the initial async read + + expect(monitor.shouldSkipDequeue()).toBe(true); + + monitor.stop(); + }); + + it("when enabled and the source reports clear, does not skip dequeue", async () => { + const { source } = countingSource({ engaged: false }); + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + + expect(monitor.shouldSkipDequeue()).toBe(false); + + monitor.stop(); + }); + + it("fails open (stops skipping) when the source throws", async () => { + let call = 0; + const source: BackpressureSignalSource = { + read: async () => { + call++; + if (call === 1) { + return { engaged: true }; + } + throw new Error("signal source unreachable"); + }, + }; + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(monitor.shouldSkipDequeue()).toBe(true); // engaged from the first read + + await vi.advanceTimersByTimeAsync(1000); // next refresh throws + expect(monitor.shouldSkipDequeue()).toBe(false); // fail-open: a dead source must not pin the brake + + monitor.stop(); + }); + + it("fails open when the source reports unknown (null)", async () => { + const { source } = countingSource(null); + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + 
monitor.start(); + await vi.advanceTimersByTimeAsync(0); + + expect(monitor.shouldSkipDequeue()).toBe(false); + + monitor.stop(); + }); + + it("fails open when the cached verdict goes stale (older than max age)", async () => { + // Source stops updating (e.g. hangs) after the first read; the verdict ages out. + const source: BackpressureSignalSource = { + read: async () => ({ engaged: true, ts: Date.now() }), + }; + const monitor = new BackpressureMonitor({ + enabled: true, + source, + refreshIntervalMs: 1_000_000, // effectively only the initial read fires + maxVerdictAgeMs: 15_000, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(monitor.shouldSkipDequeue()).toBe(true); + + await vi.advanceTimersByTimeAsync(15_001); // verdict now older than max age + expect(monitor.shouldSkipDequeue()).toBe(false); + + monitor.stop(); + }); + + it("does not read the source on the hot path (reads are driven by the refresh tick)", async () => { + const { source, reads } = countingSource({ engaged: true }); + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(reads()).toBe(1); // just the initial refresh + + for (let i = 0; i < 1000; i++) { + monitor.shouldSkipDequeue(); + } + + expect(reads()).toBe(1); // hot-path calls performed zero I/O + + monitor.stop(); + }); + + it("does not start an overlapping refresh while one is in flight", async () => { + let reads = 0; + const source: BackpressureSignalSource = { + // Never resolves - simulates a hung read. 
+ read: () => { + reads++; + return new Promise<{ engaged: boolean } | null>(() => {}); + }, + }; + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(3000); // several intervals while the first read hangs + + expect(reads).toBe(1); // in-flight guard prevents stacking + + monitor.stop(); + }); + + it("stops refreshing after stop()", async () => { + const { source, reads } = countingSource({ engaged: true }); + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + const readsAtStop = reads(); + + monitor.stop(); + await vi.advanceTimersByTimeAsync(5000); + + expect(reads()).toBe(readsAtStop); + }); + + it("isEngaged reflects the hard engaged state (the signal for freezing scale-up)", async () => { + const { source } = countingSource({ engaged: true }); + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + + expect(monitor.isEngaged()).toBe(true); + + monitor.stop(); + }); + + it("isEngaged is false when clear and when stale", async () => { + const source: BackpressureSignalSource = { + read: async () => ({ engaged: true, ts: Date.now() }), + }; + const monitor = new BackpressureMonitor({ + enabled: true, + source, + refreshIntervalMs: 1_000_000, + maxVerdictAgeMs: 15_000, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(monitor.isEngaged()).toBe(true); + + await vi.advanceTimersByTimeAsync(15_001); // stale → fail-open + expect(monitor.isEngaged()).toBe(false); + + monitor.stop(); + }); + + it("ramps the dequeue gate after release instead of resuming instantly", async () => { + let engaged = true; + let rnd = 0.5; + const source: BackpressureSignalSource = { read: async () => ({ engaged }) }; + const monitor = new BackpressureMonitor({ + 
enabled: true, + source, + refreshIntervalMs: 1000, + rampMs: 10_000, + random: () => rnd, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(monitor.shouldSkipDequeue()).toBe(true); // hard engaged + + // Release: the next refresh observes the clear verdict and starts the ramp. + engaged = false; + await vi.advanceTimersByTimeAsync(1000); + expect(monitor.isEngaged()).toBe(false); + + // Just after release (progress ~0): skip probability ~1, so skip regardless. + rnd = 0.99; + expect(monitor.shouldSkipDequeue()).toBe(true); + + // Halfway through the ramp (progress 0.5): skip probability 0.5. + await vi.advanceTimersByTimeAsync(5000); + rnd = 0.4; + expect(monitor.shouldSkipDequeue()).toBe(true); // 0.4 < 0.5 → skip + rnd = 0.6; + expect(monitor.shouldSkipDequeue()).toBe(false); // 0.6 ≥ 0.5 → allow + + // Past the ramp window: never skip. + await vi.advanceTimersByTimeAsync(5000); + rnd = 0.0; + expect(monitor.shouldSkipDequeue()).toBe(false); + + monitor.stop(); + }); + + it("fails open on an engaged verdict with no timestamp when staleness is enforced", async () => { + // A verdict claiming engaged but carrying no ts can't be checked for freshness; + // when maxVerdictAgeMs is set we must not trust it (else a dead producer could + // pin the brake forever). 
+ const { source } = countingSource({ engaged: true }); // no ts + const monitor = new BackpressureMonitor({ + enabled: true, + source, + refreshIntervalMs: 1000, + maxVerdictAgeMs: 15_000, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + + expect(monitor.computeEngaged()).toBe(false); + expect(monitor.shouldSkipDequeue()).toBe(false); + + monitor.stop(); + }); + + it("in dry-run, the gates are inert but computeEngaged still reflects the real signal", async () => { + const { source } = countingSource({ engaged: true }); + const monitor = new BackpressureMonitor({ + enabled: true, + source, + refreshIntervalMs: 1000, + dryRun: true, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + + expect(monitor.computeEngaged()).toBe(true); // real signal, for observability/metrics + expect(monitor.isEngaged()).toBe(false); // inert: no scale-up freeze + expect(monitor.shouldSkipDequeue()).toBe(false); // inert: no dequeue skip + + monitor.stop(); + }); + + it("logs on verdict transitions", async () => { + let engaged = true; + const source: BackpressureSignalSource = { read: async () => ({ engaged }) }; + const logs: Array<{ message: string; meta?: Record }> = []; + const logger = { + info: (message: string, meta?: Record) => logs.push({ message, meta }), + }; + const monitor = new BackpressureMonitor({ + enabled: true, + source, + refreshIntervalMs: 1000, + logger, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(logs.some((l) => l.meta?.engaged === true)).toBe(true); + + engaged = false; + await vi.advanceTimersByTimeAsync(1000); + expect(logs.some((l) => l.meta?.engaged === false)).toBe(true); + + monitor.stop(); + }); + + it("records prometheus metrics", async () => { + const { source } = countingSource({ engaged: true }); + const register = new Registry(); + const metrics = new BackpressureMetrics({ register }); + const monitor = new BackpressureMonitor({ + enabled: true, + source, + refreshIntervalMs: 1000, 
+ metrics, + }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + + expect(await register.metrics()).toContain("supervisor_backpressure_engaged 1"); + + monitor.shouldSkipDequeue(); + expect(await register.metrics()).toMatch( + /supervisor_backpressure_skipped_dequeues_total\{dry_run="false"\} [1-9]/ + ); + + monitor.stop(); + }); + + it("resumes instantly when no ramp is configured", async () => { + let engaged = true; + const source: BackpressureSignalSource = { read: async () => ({ engaged }) }; + const monitor = new BackpressureMonitor({ enabled: true, source, refreshIntervalMs: 1000 }); + + monitor.start(); + await vi.advanceTimersByTimeAsync(0); + expect(monitor.shouldSkipDequeue()).toBe(true); + + engaged = false; + await vi.advanceTimersByTimeAsync(1000); + expect(monitor.shouldSkipDequeue()).toBe(false); // no ramp → instant resume + + monitor.stop(); + }); +}); diff --git a/apps/supervisor/src/backpressure/backpressureMonitor.ts b/apps/supervisor/src/backpressure/backpressureMonitor.ts new file mode 100644 index 00000000000..6b4170697e5 --- /dev/null +++ b/apps/supervisor/src/backpressure/backpressureMonitor.ts @@ -0,0 +1,179 @@ +import type { BackpressureMetrics } from "./backpressureMetrics.js"; + +export interface BackpressureLogger { + info(message: string, meta?: Record): void; +} + +export type BackpressureVerdict = { + engaged: boolean; + /** Epoch ms the verdict was produced. Used for consumer-side staleness fail-open. */ + ts?: number; +}; + +/** + * Source of the current backpressure verdict. `read()` returns `null` when the + * verdict is unknown (missing/unreadable) - the monitor treats unknown as + * "not engaged" (fail-open). + */ +export interface BackpressureSignalSource { + read(): Promise; +} + +export type BackpressureMonitorOptions = { + enabled: boolean; + source: BackpressureSignalSource; + refreshIntervalMs?: number; + /** + * If set, a cached verdict older than this is treated as unknown (fail-open). 
+ * Guards against the source silently going stale (e.g. hanging reads). + */ + maxVerdictAgeMs?: number; + /** + * If set, after backpressure releases the dequeue gate stays partially engaged + * for this long, skipping a linearly-decaying fraction of attempts so the + * aggregate dequeue rate ramps from ~0 to full instead of snapping to full and + * re-flooding a freshly-recovered cluster. 0/unset = instant resume. + */ + rampMs?: number; + /** Injectable RNG for the resume ramp; defaults to Math.random. */ + random?: () => number; + /** + * When true, the gates are inert (never skip dequeues, never freeze scale-up). + * computeEngaged() still reflects the real signal so it can be observed. + */ + dryRun?: boolean; + logger?: BackpressureLogger; + metrics?: BackpressureMetrics; +}; + +const DEFAULT_REFRESH_INTERVAL_MS = 1000; + +export class BackpressureMonitor { + private verdict: BackpressureVerdict | null = null; + private timer?: ReturnType; + private refreshInFlight = false; + private wasEngaged = false; + private releasedAt?: number; + + constructor(private readonly opts: BackpressureMonitorOptions) { + this.opts.metrics?.dryRun.set(this.opts.dryRun ? 1 : 0); + } + + start(): void { + if (!this.opts.enabled) { + return; + } + + void this.refreshTick(); + this.timer = setInterval( + () => void this.refreshTick(), + this.opts.refreshIntervalMs ?? DEFAULT_REFRESH_INTERVAL_MS + ); + } + + /** Skip a tick if the previous refresh is still in flight, so slow/hung reads can't stack. */ + private async refreshTick(): Promise { + if (this.refreshInFlight) { + return; + } + this.refreshInFlight = true; + try { + await this.refresh(); + } finally { + this.refreshInFlight = false; + } + } + + stop(): void { + if (this.timer) { + clearInterval(this.timer); + this.timer = undefined; + } + } + + /** + * Raw hard backpressure state: true while the (fresh) verdict says engaged, + * ignoring dry-run. 
Used for observability/metrics so the real signal is + * visible even when the gates are inert. + */ + computeEngaged(): boolean { + const verdict = this.verdict; + if (verdict?.engaged !== true) { + return false; + } + + // When staleness enforcement is on, an engaged verdict must carry a fresh + // timestamp. A missing or stale ts can't be trusted (a dead producer could + // otherwise pin the brake forever), so fail open. + const maxAge = this.opts.maxVerdictAgeMs; + if (maxAge !== undefined) { + if (verdict.ts === undefined || Date.now() - verdict.ts > maxAge) { + return false; + } + } + + return true; + } + + /** + * Effective hard state: the signal for freezing consumer-pool scale-up. Inert + * (false) in dry-run. Hot-path read, no I/O. + */ + isEngaged(): boolean { + return this.opts.dryRun ? false : this.computeEngaged(); + } + + /** Hot-path read: synchronous, never performs I/O. Inert (false) in dry-run. */ + shouldSkipDequeue(): boolean { + const wouldSkip = this.computeShouldSkip(); + if (wouldSkip) { + this.opts.metrics?.skipsTotal.inc({ dry_run: this.opts.dryRun ? "true" : "false" }); + } + return this.opts.dryRun ? false : wouldSkip; + } + + private computeShouldSkip(): boolean { + if (this.computeEngaged()) { + return true; + } + + // Post-release ramp: skip a linearly-decaying fraction of attempts so the + // aggregate dequeue rate climbs back to full over rampMs rather than snapping. + const rampMs = this.opts.rampMs; + if (rampMs && this.releasedAt !== undefined) { + const elapsed = Date.now() - this.releasedAt; + if (elapsed < rampMs) { + const skipProbability = 1 - elapsed / rampMs; + return (this.opts.random ?? Math.random)() < skipProbability; + } + } + + return false; + } + + private async refresh(): Promise { + try { + this.verdict = await this.opts.source.read(); + } catch { + // Fail-open: a dead/unreachable source must never pin the brake. Treat as + // unknown (no verdict) so dequeue resumes as if backpressure were off. 
+ this.verdict = null; + } + + // Track the engaged→released transition to anchor the resume ramp. Use the + // staleness-aware state so a stale verdict doesn't pin wasEngaged / the gauge. + const nowEngaged = this.computeEngaged(); + this.opts.metrics?.engaged.set(nowEngaged ? 1 : 0); + + if (nowEngaged !== this.wasEngaged) { + this.opts.logger?.info("backpressure verdict changed", { + engaged: nowEngaged, + dryRun: !!this.opts.dryRun, + }); + } + if (this.wasEngaged && !nowEngaged) { + this.releasedAt = Date.now(); + } + this.wasEngaged = nowEngaged; + } +} diff --git a/apps/supervisor/src/backpressure/redisBackpressureSignalSource.test.ts b/apps/supervisor/src/backpressure/redisBackpressureSignalSource.test.ts new file mode 100644 index 00000000000..77a7457b13c --- /dev/null +++ b/apps/supervisor/src/backpressure/redisBackpressureSignalSource.test.ts @@ -0,0 +1,62 @@ +import { redisTest } from "@internal/testcontainers"; +import { Redis } from "ioredis"; +import { describe, expect } from "vitest"; +import { RedisBackpressureSignalSource } from "./redisBackpressureSignalSource.js"; + +const KEY = "backpressure:test"; + +describe("RedisBackpressureSignalSource", () => { + redisTest("returns null when the key is absent", async ({ redisOptions }) => { + const redis = new Redis(redisOptions); + try { + const source = new RedisBackpressureSignalSource(redis, KEY); + expect(await source.read()).toBeNull(); + } finally { + await redis.quit(); + } + }); + + redisTest("parses a valid engaged verdict", async ({ redisOptions }) => { + const redis = new Redis(redisOptions); + try { + await redis.set(KEY, JSON.stringify({ engaged: true, ts: 1_700_000_000_000 })); + const source = new RedisBackpressureSignalSource(redis, KEY); + expect(await source.read()).toEqual({ engaged: true, ts: 1_700_000_000_000 }); + } finally { + await redis.quit(); + } + }); + + redisTest("parses a clear verdict", async ({ redisOptions }) => { + const redis = new Redis(redisOptions); + try { + await 
redis.set(KEY, JSON.stringify({ engaged: false })); + const source = new RedisBackpressureSignalSource(redis, KEY); + expect(await source.read()).toEqual({ engaged: false }); + } finally { + await redis.quit(); + } + }); + + redisTest("returns null for malformed JSON (fail-open)", async ({ redisOptions }) => { + const redis = new Redis(redisOptions); + try { + await redis.set(KEY, "not json {"); + const source = new RedisBackpressureSignalSource(redis, KEY); + expect(await source.read()).toBeNull(); + } finally { + await redis.quit(); + } + }); + + redisTest("returns null for valid JSON of the wrong shape (fail-open)", async ({ redisOptions }) => { + const redis = new Redis(redisOptions); + try { + await redis.set(KEY, JSON.stringify({ foo: "bar" })); + const source = new RedisBackpressureSignalSource(redis, KEY); + expect(await source.read()).toBeNull(); + } finally { + await redis.quit(); + } + }); +}); diff --git a/apps/supervisor/src/backpressure/redisBackpressureSignalSource.ts b/apps/supervisor/src/backpressure/redisBackpressureSignalSource.ts new file mode 100644 index 00000000000..4f8a54c6247 --- /dev/null +++ b/apps/supervisor/src/backpressure/redisBackpressureSignalSource.ts @@ -0,0 +1,35 @@ +import type { Redis } from "ioredis"; +import { z } from "zod"; +import type { BackpressureSignalSource, BackpressureVerdict } from "./backpressureMonitor.js"; + +const VerdictSchema = z.object({ + engaged: z.boolean(), + ts: z.number().optional(), +}); + +/** Reads the backpressure verdict from a Redis key written by the cluster-side aggregator. */ +export class RedisBackpressureSignalSource implements BackpressureSignalSource { + constructor( + private readonly redis: Redis, + private readonly key: string + ) {} + + async read(): Promise { + const raw = await this.redis.get(this.key); + if (raw === null) { + return null; + } + + // A malformed or wrong-shaped value is treated as unknown (null) so the + // monitor fails open rather than acting on garbage. 
+ let json: unknown; + try { + json = JSON.parse(raw); + } catch { + return null; + } + + const parsed = VerdictSchema.safeParse(json); + return parsed.success ? parsed.data : null; + } +} diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index b69fb24d73f..3919e73a7ee 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -34,6 +34,10 @@ const Env = z // Dequeue settings (provider mode) TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), + // Which worker-queue class this supervisor fleet serves. "default" pulls the + // region queue (standard/agent runs); "scheduled" pulls the dedicated + // scheduled-lineage queue. Run a separate fleet per class for isolation. + TRIGGER_WORKER_QUEUE_CLASS: z.enum(["default", "scheduled"]).default("default"), TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), @@ -47,6 +51,29 @@ const Env = z TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) + // Dequeue backpressure - off by default. When enabled, the supervisor reads a + // verdict from Redis (written by the cluster-side aggregator) and pauses dequeues + // while the worker cluster can't schedule pods. Disabled = total no-op: no Redis + // client is created, no reads happen, and the dequeue loop is unaffected. + TRIGGER_DEQUEUE_BACKPRESSURE_ENABLED: BoolEnv.default(false), + // Safety default: even when enabled, backpressure only logs what it would do. + // Set to false to actually skip dequeues / freeze scale-up. 
+ TRIGGER_DEQUEUE_BACKPRESSURE_DRY_RUN: BoolEnv.default(true), + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY: z.string().default("engine:dequeue:backpressure"), + TRIGGER_DEQUEUE_BACKPRESSURE_REFRESH_MS: z.coerce.number().int().positive().default(1000), + TRIGGER_DEQUEUE_BACKPRESSURE_RAMP_MS: z.coerce.number().int().min(0).default(30_000), // Resume ramp window after release; 0 = instant resume + + TRIGGER_DEQUEUE_BACKPRESSURE_MAX_VERDICT_AGE_MS: z.coerce + .number() + .int() + .positive() + .default(15_000), // Stale verdict → fail-open (treat as not engaged) + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST: z.string().optional(), + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PORT: z.coerce.number().int().optional(), + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_USERNAME: z.string().optional(), + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PASSWORD: z.string().optional(), + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_TLS_DISABLED: BoolEnv.default(false), + // Optional services TRIGGER_WARM_START_URL: z.string().optional(), TRIGGER_CHECKPOINT_URL: z.string().optional(), @@ -87,6 +114,14 @@ const Env = z COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset) COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10), + // Instance create retries for transient placement failures (1 = no retries) + COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS: z.coerce.number().int().min(1).max(10).default(3), + COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS: z.coerce + .number() + .int() + .min(0) + .max(10_000) + .default(250), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), @@ -121,6 +156,16 @@ const Env = z KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods + + // Pod DNS config — 
override the cluster default ndots to `KUBERNETES_POD_DNS_NDOTS`. + // Default k8s ndots is 5: any name with fewer than 5 dots (e.g. `api.example.com`, 2 dots) is first walked + // through every entry in the cluster search list (`.svc.cluster.local`, `svc.cluster.local`, `cluster.local`) + // before being tried as-is, turning one resolution into 4+ CoreDNS queries (×2 with A+AAAA). + // Overriding the default can be useful to cut CoreDNS query amplification for external domains. + // Note: before enabling, make sure no code path relies on search-list expansion for names with dots ≥ the value + // set here — those names will now hit their as-is form first and could resolve externally before falling back. + KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED: BoolEnv.default(false), + KUBERNETES_POD_DNS_NDOTS: z.coerce.number().int().min(1).max(15).default(2), // Large machine affinity settings - large-* presets prefer a dedicated pool KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false), KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z @@ -189,7 +234,9 @@ const Env = z if (!validEffects.includes(effect)) { ctx.addIssue({ code: z.ZodIssueCode.custom, - message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join(", ")}`, + message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join( + ", " + )}`, }); return z.NEVER; } @@ -244,6 +291,15 @@ const Env = z // Debug DEBUG: BoolEnv.default(false), SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), + + // Wide-event observability - off by default. Emits one flat-keyed JSON + // line per natural unit of work (dequeue iteration, HTTP request, socket + // lifecycle). High-QPS hotpath, so the kill switch must be honoured. + TRIGGER_WIDE_EVENTS_ENABLED: BoolEnv.default(false), + // When true, also emit wide events for high-frequency HTTP routes + // (heartbeat, snapshots-since, logs/debug). 
Off in prod to keep event + // volume manageable; on in test environments for full-fidelity debugging. + TRIGGER_WIDE_EVENTS_NOISY_ROUTES: BoolEnv.default(false), }) .superRefine((data, ctx) => { if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) { @@ -260,6 +316,14 @@ const Env = z path: ["TRIGGER_WORKLOAD_API_DOMAIN"], }); } + if (data.TRIGGER_DEQUEUE_BACKPRESSURE_ENABLED && !data.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: + "TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST is required when TRIGGER_DEQUEUE_BACKPRESSURE_ENABLED is true", + path: ["TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST"], + }); + } }) .transform((data) => ({ ...data, diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 6f5913c47ca..e97c4c7bb96 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -28,6 +28,18 @@ import { FailedPodHandler } from "./services/failedPodHandler.js"; import { getWorkerToken } from "./workerToken.js"; import { OtlpTraceService } from "./services/otlpTraceService.js"; import { extractTraceparent, getRestoreRunnerId } from "./util.js"; +import { Redis } from "ioredis"; +import { BackpressureMonitor } from "./backpressure/backpressureMonitor.js"; +import { RedisBackpressureSignalSource } from "./backpressure/redisBackpressureSignalSource.js"; +import { BackpressureMetrics } from "./backpressure/backpressureMetrics.js"; +import { + fromContext, + recordPhaseSince, + runWideEvent, + setExtra, + setMeta, + type WideEventOptions, +} from "./wideEvents/index.js"; if (env.METRICS_COLLECT_DEFAULTS) { collectDefaultMetrics({ register }); @@ -46,15 +58,28 @@ class ManagedSupervisor { private readonly podCleaner?: PodCleaner; private readonly failedPodHandler?: FailedPodHandler; private readonly tracing?: OtlpTraceService; + private readonly backpressureMonitor?: BackpressureMonitor; + private readonly backpressureRedis?: Redis; private readonly isKubernetes = 
isKubernetesEnvironment(env.KUBERNETES_FORCE_ENABLED); private readonly warmStartUrl = env.TRIGGER_WARM_START_URL; + private readonly wideEventOpts: WideEventOptions = { + service: "supervisor", + env: { nodeId: env.TRIGGER_WORKER_INSTANCE_NAME }, + enabled: env.TRIGGER_WIDE_EVENTS_ENABLED, + }; + private readonly wideEventsNoisyRoutes = env.TRIGGER_WIDE_EVENTS_NOISY_ROUTES; + constructor() { + // Strip secret-like env vars before debug-logging the rest. Add any new + // secret env var here so it never lands in the DEBUG "Starting up" log. const { TRIGGER_WORKER_TOKEN, MANAGED_WORKER_SECRET, COMPUTE_GATEWAY_AUTH_TOKEN, + DOCKER_REGISTRY_PASSWORD, + TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PASSWORD, ...envWithoutSecrets } = env; @@ -119,6 +144,10 @@ class ManagedSupervisor { otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT, prettyLogs: env.RUNNER_PRETTY_LOGS, }, + createRetry: { + maxAttempts: env.COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS, + baseDelayMs: env.COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS, + }, }); this.computeManager = computeManager; this.workloadManager = computeManager; @@ -166,6 +195,42 @@ class ManagedSupervisor { ); } + if (env.TRIGGER_DEQUEUE_BACKPRESSURE_ENABLED) { + this.backpressureRedis = new Redis({ + host: env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST, + port: env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PORT, + username: env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_USERNAME, + password: env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PASSWORD, + ...(env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_TLS_DISABLED ? 
{} : { tls: {} }), + maxRetriesPerRequest: null, + }); + this.backpressureRedis.on("error", (error) => + this.logger.error("Backpressure redis error", { error: error.message }) + ); + + this.backpressureMonitor = new BackpressureMonitor({ + enabled: true, + source: new RedisBackpressureSignalSource( + this.backpressureRedis, + env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY + ), + refreshIntervalMs: env.TRIGGER_DEQUEUE_BACKPRESSURE_REFRESH_MS, + maxVerdictAgeMs: env.TRIGGER_DEQUEUE_BACKPRESSURE_MAX_VERDICT_AGE_MS, + rampMs: env.TRIGGER_DEQUEUE_BACKPRESSURE_RAMP_MS, + dryRun: env.TRIGGER_DEQUEUE_BACKPRESSURE_DRY_RUN, + logger: this.logger, + metrics: new BackpressureMetrics({ register }), + }); + + this.logger.log("🛑 Dequeue backpressure enabled", { + key: env.TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY, + refreshIntervalMs: env.TRIGGER_DEQUEUE_BACKPRESSURE_REFRESH_MS, + maxVerdictAgeMs: env.TRIGGER_DEQUEUE_BACKPRESSURE_MAX_VERDICT_AGE_MS, + rampMs: env.TRIGGER_DEQUEUE_BACKPRESSURE_RAMP_MS, + dryRun: env.TRIGGER_DEQUEUE_BACKPRESSURE_DRY_RUN, + }); + } + this.workerSession = new SupervisorSession({ workerToken: getWorkerToken(), apiUrl: env.TRIGGER_API_URL, @@ -175,6 +240,7 @@ class ManagedSupervisor { dequeueIdleIntervalMs: env.TRIGGER_DEQUEUE_IDLE_INTERVAL_MS, queueConsumerEnabled: env.TRIGGER_DEQUEUE_ENABLED, maxRunCount: env.TRIGGER_DEQUEUE_MAX_RUN_COUNT, + queueClass: env.TRIGGER_WORKER_QUEUE_CLASS, metricsRegistry: register, scaling: { strategy: env.TRIGGER_DEQUEUE_SCALING_STRATEGY, @@ -186,18 +252,20 @@ class ManagedSupervisor { ewmaAlpha: env.TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA, batchWindowMs: env.TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS, dampingFactor: env.TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR, + // Freeze scale-up while backpressure is hard-engaged (not during the resume + // ramp). Undefined when backpressure is disabled → no effect on scaling. + shouldPauseScaling: () => this.backpressureMonitor?.isEngaged() ?? 
false, }, runNotificationsEnabled: env.TRIGGER_WORKLOAD_API_ENABLED, heartbeatIntervalSeconds: env.TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS, sendRunDebugLogs: env.SEND_RUN_DEBUG_LOGS, preDequeue: async () => { - if (!env.RESOURCE_MONITOR_ENABLED) { - return {}; - } + // Synchronous, hot-path-safe cached read; undefined when backpressure is disabled. + const skipForBackpressure = this.backpressureMonitor?.shouldSkipDequeue() ?? false; - if (this.isKubernetes) { - // Not used in k8s for now - return {}; + if (!env.RESOURCE_MONITOR_ENABLED || this.isKubernetes) { + // Resource monitor is not used in k8s; backpressure is the only gate there. + return { skipDequeue: skipForBackpressure }; } const resources = await this.resourceMonitor.getNodeResources(); @@ -207,7 +275,10 @@ class ManagedSupervisor { cpu: resources.cpuAvailable, memory: resources.memoryAvailable, }, - skipDequeue: resources.cpuAvailable < 0.25 || resources.memoryAvailable < 0.25, + skipDequeue: + skipForBackpressure || + resources.cpuAvailable < 0.25 || + resources.memoryAvailable < 0.25, }; }, preSkip: async () => { @@ -239,149 +310,205 @@ class ManagedSupervisor { async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { this.logger.verbose(`Received message with timestamp ${time.toLocaleString()}`, message); - if (message.completedWaitpoints.length > 0) { - this.logger.debug("Run has completed waitpoints", { - runId: message.run.id, - completedWaitpoints: message.completedWaitpoints.length, - }); - } - - if (!message.image) { - this.logger.error("Run has no image", { runId: message.run.id }); - return; - } - - const { checkpoint, ...rest } = message; - - // Register trace context early so snapshot spans work for all paths - // (cold create, restore, warm start). Re-registration on restore is safe - // since dequeue always provides fresh context. 
- if (this.computeManager?.traceSpansEnabled) { - const traceparent = extractTraceparent(message.run.traceContext); - - if (traceparent) { - this.workloadServer.registerRunTraceContext(message.run.friendlyId, { - traceparent, - envId: message.environment.id, - orgId: message.organization.id, - projectId: message.project.id, - }); - } - } + const traceparent = extractTraceparent(message.run.traceContext); + + await runWideEvent( + { + ...this.wideEventOpts, + op: "dequeue", + kind: "inbound", + traceparent, + setup: (state) => { + setMeta(state, "run_id", message.run.friendlyId); + setMeta(state, "env_id", message.environment.id); + setMeta(state, "org_id", message.organization.id); + setMeta(state, "project_id", message.project.id); + if (message.deployment.friendlyId) { + setMeta(state, "deployment_id", message.deployment.friendlyId); + } + setMeta(state, "machine_preset", message.run.machine.name); + state.extras.iteration = "dequeue"; + state.extras.dequeue_response_ms = dequeueResponseMs; + state.extras.polling_interval_ms = pollingIntervalMs; + state.extras.completed_waitpoints = message.completedWaitpoints.length; + }, + }, + async () => { + if (message.completedWaitpoints.length > 0) { + this.logger.debug("Run has completed waitpoints", { + runId: message.run.id, + completedWaitpoints: message.completedWaitpoints.length, + }); + } - if (checkpoint) { - this.logger.debug("Restoring run", { runId: message.run.id }); + if (!message.image) { + setExtra(fromContext(), "path_taken", "skipped_no_image"); + this.logger.error("Run has no image", { runId: message.run.id }); + return; + } - if (this.computeManager) { - try { - const runnerId = getRestoreRunnerId(message.run.friendlyId, checkpoint.id); + const { checkpoint, ...rest } = message; - const didRestore = await this.computeManager.restore({ - snapshotId: checkpoint.location, - runnerId, - runFriendlyId: message.run.friendlyId, - snapshotFriendlyId: message.snapshot.friendlyId, - machine: message.run.machine, - 
traceContext: message.run.traceContext, + // Register trace context early so snapshot spans work for all paths + // (cold create, restore, warm start). Re-registration on restore is safe + // since dequeue always provides fresh context. + if (this.computeManager?.traceSpansEnabled && traceparent) { + this.workloadServer.registerRunTraceContext(message.run.friendlyId, { + traceparent, envId: message.environment.id, orgId: message.organization.id, projectId: message.project.id, - dequeuedAt: message.dequeuedAt, }); + } - if (didRestore) { - this.logger.debug("Compute restore successful", { - runId: message.run.id, - runnerId, + if (checkpoint) { + setExtra(fromContext(), "path_taken", "restore"); + this.logger.debug("Restoring run", { runId: message.run.id }); + + if (this.computeManager) { + const restoreStart = performance.now(); + try { + const runnerId = getRestoreRunnerId(message.run.friendlyId, checkpoint.id); + + const didRestore = await this.computeManager.restore({ + snapshotId: checkpoint.location, + runnerId, + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + machine: message.run.machine, + traceContext: message.run.traceContext, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + hasPrivateLink: message.organization.hasPrivateLink, + dequeuedAt: message.dequeuedAt, + }); + recordPhaseSince("restore", restoreStart, undefined); + setExtra(fromContext(), "did_restore", didRestore); + + if (didRestore) { + this.logger.debug("Compute restore successful", { + runId: message.run.id, + runnerId, + }); + } else { + this.logger.error("Compute restore failed", { + runId: message.run.id, + runnerId, + }); + } + } catch (error) { + recordPhaseSince( + "restore", + restoreStart, + error instanceof Error ? 
error : new Error(String(error)) + ); + this.logger.error("Failed to restore run (compute)", { error }); + } + + return; + } + + if (!this.checkpointClient) { + this.logger.error("No checkpoint client", { runId: message.run.id }); + return; + } + + const restoreStart = performance.now(); + try { + const didRestore = await this.checkpointClient.restoreRun({ + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + body: { + ...rest, + checkpoint, + }, }); - } else { - this.logger.error("Compute restore failed", { runId: message.run.id, runnerId }); + recordPhaseSince("restore", restoreStart, undefined); + setExtra(fromContext(), "did_restore", didRestore); + + if (didRestore) { + this.logger.debug("Restore successful", { runId: message.run.id }); + } else { + this.logger.error("Restore failed", { runId: message.run.id }); + } + } catch (error) { + recordPhaseSince( + "restore", + restoreStart, + error instanceof Error ? error : new Error(String(error)) + ); + this.logger.error("Failed to restore run", { error }); } - } catch (error) { - this.logger.error("Failed to restore run (compute)", { error }); + + return; } - return; - } + this.logger.debug("Scheduling run", { runId: message.run.id }); - if (!this.checkpointClient) { - this.logger.error("No checkpoint client", { runId: message.run.id }); - return; - } + const warmStartStart = performance.now(); + const didWarmStart = await this.tryWarmStart(message, traceparent); + const warmStartCheckMs = Math.round(performance.now() - warmStartStart); + recordPhaseSince("warm_start", warmStartStart, undefined); + setExtra(fromContext(), "did_warm_start", didWarmStart); - try { - const didRestore = await this.checkpointClient.restoreRun({ - runFriendlyId: message.run.friendlyId, - snapshotFriendlyId: message.snapshot.friendlyId, - body: { - ...rest, - checkpoint, - }, - }); - - if (didRestore) { - this.logger.debug("Restore successful", { runId: message.run.id }); - } else { - 
this.logger.error("Restore failed", { runId: message.run.id }); + if (didWarmStart) { + setExtra(fromContext(), "path_taken", "warm_start"); + this.logger.debug("Warm start successful", { runId: message.run.id }); + return; } - } catch (error) { - this.logger.error("Failed to restore run", { error }); - } - - return; - } - this.logger.debug("Scheduling run", { runId: message.run.id }); + setExtra(fromContext(), "path_taken", "cold_create"); - const warmStartStart = performance.now(); - const didWarmStart = await this.tryWarmStart(message); - const warmStartCheckMs = Math.round(performance.now() - warmStartStart); + const createStart = performance.now(); + try { + if (!message.deployment.friendlyId) { + // mostly a type guard, deployments always exists for deployed environments + // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. + throw new Error("Deployment is missing"); + } - if (didWarmStart) { - this.logger.debug("Warm start successful", { runId: message.run.id }); - return; - } + await this.workloadManager.create({ + dequeuedAt: message.dequeuedAt, + dequeueResponseMs, + pollingIntervalMs, + warmStartCheckMs, + envId: message.environment.id, + envType: message.environment.type, + image: message.image, + machine: message.run.machine, + orgId: message.organization.id, + projectId: message.project.id, + deploymentFriendlyId: message.deployment.friendlyId, + deploymentVersion: message.backgroundWorker.version, + runId: message.run.id, + runFriendlyId: message.run.friendlyId, + version: message.version, + nextAttemptNumber: message.run.attemptNumber, + snapshotId: message.snapshot.id, + snapshotFriendlyId: message.snapshot.friendlyId, + placementTags: message.placementTags, + traceContext: message.run.traceContext, + annotations: message.run.annotations, + hasPrivateLink: message.organization.hasPrivateLink, + }); + recordPhaseSince("workload_create", createStart, undefined); - 
try { - if (!message.deployment.friendlyId) { - // mostly a type guard, deployments always exists for deployed environments - // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. - throw new Error("Deployment is missing"); + // Disabled for now + // this.resourceMonitor.blockResources({ + // cpu: message.run.machine.cpu, + // memory: message.run.machine.memory, + // }); + } catch (error) { + recordPhaseSince( + "workload_create", + createStart, + error instanceof Error ? error : new Error(String(error)) + ); + this.logger.error("Failed to create workload", { error }); + } } - - await this.workloadManager.create({ - dequeuedAt: message.dequeuedAt, - dequeueResponseMs, - pollingIntervalMs, - warmStartCheckMs, - envId: message.environment.id, - envType: message.environment.type, - image: message.image, - machine: message.run.machine, - orgId: message.organization.id, - projectId: message.project.id, - deploymentFriendlyId: message.deployment.friendlyId, - deploymentVersion: message.backgroundWorker.version, - runId: message.run.id, - runFriendlyId: message.run.friendlyId, - version: message.version, - nextAttemptNumber: message.run.attemptNumber, - snapshotId: message.snapshot.id, - snapshotFriendlyId: message.snapshot.friendlyId, - placementTags: message.placementTags, - traceContext: message.run.traceContext, - annotations: message.run.annotations, - hasPrivateLink: message.organization.hasPrivateLink, - }); - - // Disabled for now - // this.resourceMonitor.blockResources({ - // cpu: message.run.machine.cpu, - // memory: message.run.machine.memory, - // }); - } catch (error) { - this.logger.error("Failed to create workload", { error }); - } + ); } ); @@ -404,6 +531,8 @@ class ManagedSupervisor { checkpointClient: this.checkpointClient, computeManager: this.computeManager, tracing: this.tracing, + wideEventOpts: this.wideEventOpts, + wideEventsNoisyRoutes: this.wideEventsNoisyRoutes, 
}); this.workloadServer.on("runConnected", this.onRunConnected.bind(this)); @@ -420,19 +549,31 @@ class ManagedSupervisor { this.workerSession.unsubscribeFromRunNotifications([run.friendlyId]); } - private async tryWarmStart(dequeuedMessage: DequeuedMessage): Promise { + private async tryWarmStart( + dequeuedMessage: DequeuedMessage, + traceparent: string | undefined + ): Promise { if (!this.warmStartUrl) { return false; } const warmStartUrlWithPath = new URL("/warm-start", this.warmStartUrl); + const headers: Record = { + "Content-Type": "application/json", + }; + // Propagate the inbound W3C traceparent so the upstream warm-start + // receiver continues the same trace instead of minting a new one. Gated + // by the same kill switch as the wide-event emission so the whole PR is + // a no-op on the wire when disabled. + if (this.wideEventOpts.enabled && traceparent) { + headers.traceparent = traceparent; + } + try { const res = await fetch(warmStartUrlWithPath.href, { method: "POST", - headers: { - "Content-Type": "application/json", - }, + headers, body: JSON.stringify({ dequeuedMessage }), }); @@ -468,6 +609,7 @@ class ManagedSupervisor { this.logger.log("Starting up"); // Optional services + this.backpressureMonitor?.start(); await this.podCleaner?.start(); await this.failedPodHandler?.start(); await this.metricsServer?.start(); @@ -492,6 +634,8 @@ class ManagedSupervisor { await this.workerSession.stop(); // Optional services + this.backpressureMonitor?.stop(); + await this.backpressureRedis?.quit(); await this.podCleaner?.stop(); await this.failedPodHandler?.stop(); await this.metricsServer?.stop(); diff --git a/apps/supervisor/src/services/computeSnapshotService.test.ts b/apps/supervisor/src/services/computeSnapshotService.test.ts new file mode 100644 index 00000000000..b039b63bd4d --- /dev/null +++ b/apps/supervisor/src/services/computeSnapshotService.test.ts @@ -0,0 +1,130 @@ +import { describe, expect, it, vi } from "vitest"; +import { setTimeout as sleep 
} from "node:timers/promises"; +import { ComputeSnapshotService } from "./computeSnapshotService.js"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import type { SupervisorHttpClient } from "@trigger.dev/core/v3/workers"; + +// The TimerWheel ticks every 100ms, so a 200ms delay dispatches within ~300ms. +const DELAY_MS = 200; +// Long enough that a pending snapshot would certainly have dispatched. +const SETTLE_MS = 600; + +function createService() { + const snapshot = vi.fn(async (_opts: { runnerId: string; metadata: Record }) => true); + + const computeManager = { + snapshotDelayMs: DELAY_MS, + snapshotDispatchLimit: 1, + snapshot, + } as unknown as ComputeWorkloadManager; + + const service = new ComputeSnapshotService({ + computeManager, + workerClient: {} as SupervisorHttpClient, + wideEventOpts: { service: "supervisor-test", env: {}, enabled: false }, + }); + + return { service, snapshot }; +} + +function delayedSnapshot(runnerId = "runner-1") { + return { + runnerId, + runFriendlyId: "run_1", + snapshotFriendlyId: "snapshot_1", + }; +} + +describe("ComputeSnapshotService", () => { + it("dispatches a scheduled snapshot after the delay", async () => { + const { service, snapshot } = createService(); + try { + service.schedule("run_1", delayedSnapshot()); + + await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 }); + expect(snapshot).toHaveBeenCalledWith({ + runnerId: "runner-1", + metadata: { runId: "run_1", snapshotFriendlyId: "snapshot_1" }, + }); + } finally { + service.stop(); + } + }); + + it("cancel before the delay expires prevents the dispatch", async () => { + const { service, snapshot } = createService(); + try { + service.schedule("run_1", delayedSnapshot()); + + expect(service.cancel("run_1")).toBe(true); + + await sleep(SETTLE_MS); + expect(snapshot).not.toHaveBeenCalled(); + } finally { + service.stop(); + } + }); + + it("cancel returns false when nothing is pending", () => { + const { 
service } = createService(); + try { + expect(service.cancel("run_1")).toBe(false); + } finally { + service.stop(); + } + }); + + it("cancel with a matching runnerId cancels the pending snapshot", async () => { + const { service, snapshot } = createService(); + try { + service.schedule("run_1", delayedSnapshot("runner-a")); + + expect(service.cancel("run_1", "runner-a")).toBe(true); + + await sleep(SETTLE_MS); + expect(snapshot).not.toHaveBeenCalled(); + } finally { + service.stop(); + } + }); + + it("cancel with a different runnerId leaves the pending snapshot alone", async () => { + const { service, snapshot } = createService(); + try { + service.schedule("run_1", delayedSnapshot("runner-a")); + + // A stale runner for a reassigned run must not cancel the new runner's snapshot. + expect(service.cancel("run_1", "runner-b")).toBe(false); + + await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 }); + expect(snapshot).toHaveBeenCalledWith( + expect.objectContaining({ runnerId: "runner-a" }) + ); + } finally { + service.stop(); + } + }); + + it("re-scheduling the same run replaces the pending snapshot", async () => { + const { service, snapshot } = createService(); + try { + service.schedule("run_1", delayedSnapshot()); + service.schedule("run_1", { + runnerId: "runner-1", + runFriendlyId: "run_1", + snapshotFriendlyId: "snapshot_2", + }); + + await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 }); + await sleep(SETTLE_MS); + + expect(snapshot).toHaveBeenCalledTimes(1); + expect(snapshot).toHaveBeenCalledWith({ + runnerId: "runner-1", + metadata: { runId: "run_1", snapshotFriendlyId: "snapshot_2" }, + }); + } finally { + service.stop(); + } + }); +}); diff --git a/apps/supervisor/src/services/computeSnapshotService.ts b/apps/supervisor/src/services/computeSnapshotService.ts index 041e2902c75..216753fc12d 100644 --- a/apps/supervisor/src/services/computeSnapshotService.ts +++ 
b/apps/supervisor/src/services/computeSnapshotService.ts @@ -6,6 +6,15 @@ import { type SnapshotCallbackPayload } from "@internal/compute"; import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; import { TimerWheel } from "./timerWheel.js"; import type { OtlpTraceService } from "./otlpTraceService.js"; +import { + emitOneShot, + fromContext, + recordPhaseSince, + runWideEvent, + setExtra, + setMeta, + type WideEventOptions, +} from "../wideEvents/index.js"; type DelayedSnapshot = { runnerId: string; @@ -24,6 +33,7 @@ export type ComputeSnapshotServiceOptions = { computeManager: ComputeWorkloadManager; workerClient: SupervisorHttpClient; tracing?: OtlpTraceService; + wideEventOpts: WideEventOptions; }; export class ComputeSnapshotService { @@ -37,11 +47,13 @@ export class ComputeSnapshotService { private readonly computeManager: ComputeWorkloadManager; private readonly workerClient: SupervisorHttpClient; private readonly tracing?: OtlpTraceService; + private readonly wideEventOpts: WideEventOptions; constructor(opts: ComputeSnapshotServiceOptions) { this.computeManager = opts.computeManager; this.workerClient = opts.workerClient; this.tracing = opts.tracing; + this.wideEventOpts = opts.wideEventOpts; this.dispatchLimit = pLimit(this.computeManager.snapshotDispatchLimit); this.timerWheel = new TimerWheel({ @@ -62,6 +74,17 @@ export class ComputeSnapshotService { /** Schedule a delayed snapshot for a run. Replaces any pending snapshot for the same run. 
*/ schedule(runFriendlyId: string, data: DelayedSnapshot) { this.timerWheel.submit(runFriendlyId, data); + emitOneShot({ + ...this.wideEventOpts, + op: "snapshot.schedule", + kind: "event", + populate: (state) => { + state.meta.run_id = runFriendlyId; + state.meta.snapshot_id = data.snapshotFriendlyId; + state.extras.runner_id = data.runnerId; + state.extras.delay_ms = this.computeManager.snapshotDelayMs; + }, + }); this.logger.debug("Snapshot scheduled", { runFriendlyId, snapshotFriendlyId: data.snapshotFriendlyId, @@ -69,10 +92,29 @@ export class ComputeSnapshotService { }); } - /** Cancel a pending delayed snapshot. Returns true if one was cancelled. */ - cancel(runFriendlyId: string): boolean { + /** + * Cancel a pending delayed snapshot. Returns true if one was cancelled. + * When `runnerId` is given, only a snapshot scheduled for that same runner + * is cancelled - a stale runner for a run that has since been reassigned + * must not cancel the new runner's pending snapshot. + */ + cancel(runFriendlyId: string, runnerId?: string): boolean { + if (runnerId) { + const pending = this.timerWheel.peek(runFriendlyId); + if (pending && pending.data.runnerId !== runnerId) { + return false; + } + } const cancelled = this.timerWheel.cancel(runFriendlyId); if (cancelled) { + emitOneShot({ + ...this.wideEventOpts, + op: "snapshot.canceled", + kind: "event", + populate: (state) => { + state.meta.run_id = runFriendlyId; + }, + }); this.logger.debug("Snapshot cancelled", { runFriendlyId }); } return cancelled; @@ -81,6 +123,23 @@ export class ComputeSnapshotService { /** Handle the callback from the gateway after a snapshot completes or fails. */ async handleCallback(body: SnapshotCallbackPayload) { const snapshotId = body.status === "completed" ? body.snapshot_id : undefined; + const runId = body.metadata?.runId; + const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; + + // Enrich the wrapping route's wide event with snapshot metadata. 
The + // `/api/v1/compute/snapshot-complete` route is registered with `wideRoute`, + // so `fromContext()` returns the State of that route and these calls + // become extras/meta on the same wide event - no nested emission. + const state = fromContext(); + if (state) { + state.extras["snapshot.status"] = body.status; + if (body.instance_id) state.extras["snapshot.instance_id"] = body.instance_id; + if (body.duration_ms !== undefined) state.extras["snapshot.duration_ms"] = body.duration_ms; + if (snapshotId) state.extras["snapshot.id"] = snapshotId; + if (body.status === "failed" && body.error) state.extras["snapshot.error"] = body.error; + } + if (runId) setMeta(state, "run_id", runId); + if (snapshotFriendlyId) setMeta(state, "snapshot_id", snapshotFriendlyId); this.logger.debug("Snapshot callback", { snapshotId, @@ -91,9 +150,6 @@ export class ComputeSnapshotService { durationMs: body.duration_ms, }); - const runId = body.metadata?.runId; - const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; - if (!runId || !snapshotFriendlyId) { this.logger.error("Snapshot callback missing metadata", { body }); return { ok: false as const, status: 400 }; @@ -102,6 +158,7 @@ export class ComputeSnapshotService { this.#emitSnapshotSpan(runId, body.duration_ms, snapshotId); if (body.status === "completed") { + const submitStart = performance.now(); const result = await this.workerClient.submitSuspendCompletion({ runId, snapshotId: snapshotFriendlyId, @@ -113,6 +170,11 @@ export class ComputeSnapshotService { }, }, }); + recordPhaseSince( + "submit_completion", + submitStart, + result.success ? 
undefined : new Error(String(result.error)) + ); if (result.success) { this.logger.debug("Suspend completion submitted", { @@ -121,6 +183,7 @@ export class ComputeSnapshotService { snapshotId: body.snapshot_id, }); } else { + setExtra(state, "submit_completion.error", String(result.error)); this.logger.error("Failed to submit suspend completion", { runId, snapshotFriendlyId, @@ -128,6 +191,7 @@ export class ComputeSnapshotService { }); } } else { + const submitStart = performance.now(); const result = await this.workerClient.submitSuspendCompletion({ runId, snapshotId: snapshotFriendlyId, @@ -136,8 +200,14 @@ export class ComputeSnapshotService { error: body.error ?? "Snapshot failed", }, }); + recordPhaseSince( + "submit_completion", + submitStart, + result.success ? undefined : new Error(String(result.error)) + ); if (!result.success) { + setExtra(state, "submit_completion.error", String(result.error)); this.logger.error("Failed to submit suspend failure", { runId, snapshotFriendlyId, @@ -184,20 +254,31 @@ export class ComputeSnapshotService { /** Dispatch a snapshot request to the gateway. 
*/ private async dispatch(snapshot: DelayedSnapshot): Promise { - const result = await this.computeManager.snapshot({ - runnerId: snapshot.runnerId, - metadata: { - runId: snapshot.runFriendlyId, - snapshotFriendlyId: snapshot.snapshotFriendlyId, + await runWideEvent( + { + ...this.wideEventOpts, + op: "snapshot.dispatch", + kind: "scheduled", + setup: (state) => { + state.meta.run_id = snapshot.runFriendlyId; + state.meta.snapshot_id = snapshot.snapshotFriendlyId; + state.extras.runner_id = snapshot.runnerId; + }, }, - }); + async () => { + const result = await this.computeManager.snapshot({ + runnerId: snapshot.runnerId, + metadata: { + runId: snapshot.runFriendlyId, + snapshotFriendlyId: snapshot.snapshotFriendlyId, + }, + }); - if (!result) { - this.logger.error("Failed to request snapshot", { - runId: snapshot.runFriendlyId, - runnerId: snapshot.runnerId, - }); - } + if (!result) { + throw new Error("Snapshot dispatch returned no result"); + } + } + ); } #emitSnapshotSpan(runFriendlyId: string, durationMs?: number, snapshotId?: string) { diff --git a/apps/supervisor/src/services/timerWheel.test.ts b/apps/supervisor/src/services/timerWheel.test.ts index 3f6bb9aa19b..e685a26b1b4 100644 --- a/apps/supervisor/src/services/timerWheel.test.ts +++ b/apps/supervisor/src/services/timerWheel.test.ts @@ -51,6 +51,23 @@ describe("TimerWheel", () => { wheel.stop(); }); + it("peek returns the pending item without removing it", () => { + const wheel = new TimerWheel({ delayMs: 3000, onExpire: () => {} }); + + wheel.start(); + wheel.submit("run-1", "data"); + + expect(wheel.peek("run-1")).toEqual({ key: "run-1", data: "data" }); + expect(wheel.size).toBe(1); + expect(wheel.peek("run-2")).toBeUndefined(); + + // Dispatched items are no longer peekable + vi.advanceTimersByTime(3100); + expect(wheel.peek("run-1")).toBeUndefined(); + + wheel.stop(); + }); + it("cancel returns false for unknown key", () => { const wheel = new TimerWheel({ delayMs: 3000, diff --git 
a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts index 9584423824d..cab5a5d7a25 100644 --- a/apps/supervisor/src/services/timerWheel.ts +++ b/apps/supervisor/src/services/timerWheel.ts @@ -121,6 +121,12 @@ export class TimerWheel { return true; } + /** Look up a pending item without removing it. */ + peek(key: string): TimerWheelItem | undefined { + const entry = this.entries.get(key); + return entry ? { key, data: entry.data } : undefined; + } + /** Number of pending items in the wheel. */ get size(): number { return this.entries.size; diff --git a/apps/supervisor/src/wideEvents/baggage.test.ts b/apps/supervisor/src/wideEvents/baggage.test.ts new file mode 100644 index 00000000000..0579533345e --- /dev/null +++ b/apps/supervisor/src/wideEvents/baggage.test.ts @@ -0,0 +1,44 @@ +import { describe, it, expect } from "vitest"; +import { encodeBaggage } from "./baggage.js"; + +describe("encodeBaggage", () => { + it("returns empty string for an empty map", () => { + expect(encodeBaggage({})).toBe(""); + }); + + it("encodes a single entry as k=v", () => { + expect(encodeBaggage({ run_id: "run-1" })).toBe("run_id=run-1"); + }); + + it("sorts keys for stable output across hops", () => { + expect(encodeBaggage({ b: "2", a: "1", c: "3" })).toBe("a=1,b=2,c=3"); + }); + + it("skips empty keys and empty values", () => { + expect(encodeBaggage({ "": "v", k: "", real: "x" })).toBe("real=x"); + }); + + it("truncates values longer than the cap", () => { + const long = "x".repeat(1024); + const got = encodeBaggage({ k: long }); + const value = got.slice("k=".length); + expect(value.length).toBe(256); + }); + + it("caps multibyte values by UTF-8 bytes, not code units", () => { + const long = "あ".repeat(512); // 3 UTF-8 bytes each + const got = encodeBaggage({ k: long }); + const value = got.slice("k=".length); + expect(Buffer.byteLength(value, "utf8")).toBeLessThanOrEqual(256); + }); + + it("caps the number of entries", () => { + const meta: 
Record = {}; + for (let i = 0; i < 50; i++) { + // Sortable two-digit keys so we know which 32 survive. + meta[`k${String(i).padStart(2, "0")}`] = "v"; + } + const got = encodeBaggage(meta); + expect(got.split(",").length).toBe(32); + }); +}); diff --git a/apps/supervisor/src/wideEvents/baggage.ts b/apps/supervisor/src/wideEvents/baggage.ts new file mode 100644 index 00000000000..7750ac79303 --- /dev/null +++ b/apps/supervisor/src/wideEvents/baggage.ts @@ -0,0 +1,45 @@ +/** + * W3C Baggage (https://www.w3.org/TR/baggage/) encoding for outbound peer + * calls. Serialises a State's `meta` map into a `Baggage` header value so + * the downstream service auto-stamps the same labels onto its own wide + * events - even on early-error paths that bail before parsing the request + * body. + * + * Outbound discipline: only call this on peer-to-peer hops within the trust + * boundary. External-endpoint calls (image registries, cloud-provider + * APIs, third-party webhooks) must not include the Baggage header. + */ + +import { truncateUtf8 } from "./truncate.js"; + +/** + * Cap the number of entries serialised onto the header. A misbehaving + * caller's `meta` map shouldn't blow up downstream event width. + */ +const MAX_BAGGAGE_ENTRIES = 32; + +/** + * Cap each value's length. Defense against an upstream that stuffs + * unbounded payloads into a meta value. + */ +const MAX_BAGGAGE_VALUE_BYTES = 256; + +/** + * Encode a `meta` map as a Baggage header value (`k1=v1,k2=v2`). Keys are + * sorted for stable output across hops; an empty input yields the empty + * string so the caller can skip emitting the header entirely. + */ +export function encodeBaggage(meta: Record): string { + const entries = Object.entries(meta).filter(([k, v]) => k && v); + if (entries.length === 0) return ""; + + entries.sort(([a], [b]) => (a < b ? -1 : a > b ? 
1 : 0)); + + const out: string[] = []; + for (const [k, raw] of entries) { + if (out.length >= MAX_BAGGAGE_ENTRIES) break; + const v = truncateUtf8(raw, MAX_BAGGAGE_VALUE_BYTES); + out.push(`${k}=${v}`); + } + return out.join(","); +} diff --git a/apps/supervisor/src/wideEvents/context.ts b/apps/supervisor/src/wideEvents/context.ts new file mode 100644 index 00000000000..a89859c2707 --- /dev/null +++ b/apps/supervisor/src/wideEvents/context.ts @@ -0,0 +1,14 @@ +import { AsyncLocalStorage } from "node:async_hooks"; +import type { State } from "./state.js"; + +/** + * AsyncLocalStorage threading per-operation `State` through the call stack. + * Wrappers enter a state via `wideEventStorage.run(state, () => fn())` and + * any code in the async call tree retrieves it via `fromContext()`. + */ +export const wideEventStorage = new AsyncLocalStorage(); + +/** Returns the State attached to the current async context, or null. */ +export function fromContext(): State | null { + return wideEventStorage.getStore() ?? 
null; +} diff --git a/apps/supervisor/src/wideEvents/emit.test.ts b/apps/supervisor/src/wideEvents/emit.test.ts new file mode 100644 index 00000000000..0daefa64873 --- /dev/null +++ b/apps/supervisor/src/wideEvents/emit.test.ts @@ -0,0 +1,134 @@ +import { describe, it, expect } from "vitest"; +import { emit, EmitMessage } from "./emit.js"; +import { newState } from "./new.js"; + +function captureEmit(state: Parameters[0]): Record { + const captured: string[] = []; + const origWrite = process.stdout.write; + process.stdout.write = ((chunk: unknown) => { + captured.push(String(chunk)); + return true; + }) as typeof process.stdout.write; + try { + emit(state); + } finally { + process.stdout.write = origWrite; + } + expect(captured).toHaveLength(1); + const line = captured[0]; + if (!line) throw new Error("no captured line"); + return JSON.parse(line) as Record; +} + +describe("emit", () => { + it("emits a single line with the stable message + request_id", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.durationMs = 5; + const out = captureEmit(s); + expect(out.msg).toBe(EmitMessage); + expect(out.request_id).toBe(s.requestId); + expect(out.service).toBe("supervisor"); + expect(out.ok).toBe(true); + expect(out.status).toBe(200); + expect(out.duration_ms).toBe(5); + }); + + it("emits start_time as an ISO timestamp set by newState", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + const out = captureEmit(s); + expect(typeof out.start_time).toBe("string"); + // Microsecond-precision RFC3339 (6 fractional digits), parseable as a date. 
+ expect(out.start_time).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z$/); + expect(Number.isNaN(new Date(out.start_time as string).getTime())).toBe(false); + }); + + it("omits start_time when unset", () => { + const s = newState({ service: "supervisor", env: {} }); + delete s.startTime; + s.statusCode = 200; + s.ok = true; + const out = captureEmit(s); + expect(out).not.toHaveProperty("start_time"); + }); + + it("omits empty optional fields", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + const out = captureEmit(s); + expect(out).not.toHaveProperty("trace_id"); + expect(out).not.toHaveProperty("version"); + expect(out).not.toHaveProperty("commit_sha"); + expect(out).not.toHaveProperty("error.code"); + }); + + it("flattens meta keys as meta.", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.meta.run_id = "run_abc"; + s.meta.deployment_id = "dep_xyz"; + const out = captureEmit(s); + expect(out["meta.run_id"]).toBe("run_abc"); + expect(out["meta.deployment_id"]).toBe("dep_xyz"); + expect(out).not.toHaveProperty("meta"); + }); + + it("flattens phases as phase..", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.phases.push({ name: "warm_start", durationMs: 12, ok: true, attempts: 1 }); + s.phases.push({ + name: "workload_create", + durationMs: 3, + ok: false, + attempts: 2, + errorCode: "Error", + errorMsg: "boom", + sub: { create_ms: 1 }, + }); + const out = captureEmit(s); + expect(out["phase.warm_start.duration_ms"]).toBe(12); + expect(out["phase.warm_start.ok"]).toBe(true); + expect(out["phase.warm_start.attempts"]).toBe(1); + expect(out["phase.workload_create.duration_ms"]).toBe(3); + expect(out["phase.workload_create.ok"]).toBe(false); + expect(out["phase.workload_create.attempts"]).toBe(2); + expect(out["phase.workload_create.error_code"]).toBe("Error"); + 
expect(out["phase.workload_create.error_message"]).toBe("boom"); + expect(out["phase.workload_create.create_ms"]).toBe(1); + }); + + it("includes error.code/message/kind when state.error is set", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 500; + s.error = { code: "InternalError", message: "kaboom", kind: "internal" }; + const out = captureEmit(s); + expect(out["error.code"]).toBe("InternalError"); + expect(out["error.message"]).toBe("kaboom"); + expect(out["error.kind"]).toBe("internal"); + }); + + it("truncates very long error messages", () => { + const s = newState({ service: "supervisor", env: {} }); + s.error = { code: "Big", message: "x".repeat(2000), kind: "internal" }; + const out = captureEmit(s); + expect((out["error.message"] as string).length).toBe(512); + }); + + it("flattens extras at the top level", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.extras.route = "/health"; + s.extras["dispatch.result"] = "hit"; + const out = captureEmit(s); + expect(out.route).toBe("/health"); + expect(out["dispatch.result"]).toBe("hit"); + }); +}); diff --git a/apps/supervisor/src/wideEvents/emit.ts b/apps/supervisor/src/wideEvents/emit.ts new file mode 100644 index 00000000000..bfb03ad36c4 --- /dev/null +++ b/apps/supervisor/src/wideEvents/emit.ts @@ -0,0 +1,84 @@ +import type { State } from "./state.js"; +import { truncateUtf8 } from "./truncate.js"; + +/** + * Stable slog message string for every wide event. Downstream filters (jq, + * Axiom queries, Vector pipelines) pin to this constant. The `service` field + * disambiguates which service emitted it. + */ +export const EmitMessage = "wide_event"; + +const MAX_ERROR_MSG_BYTES = 512; + +/** + * Serializes a State as a single flat-keyed JSON line on stdout. Keys are + * flat (no nested objects) to keep jq filtering and Axiom indexing cheap. + * Empty optional fields are omitted. 
+ */ +export function emit(state: State): void { + // Best-effort: an observability failure (serialization, stdout write) must + // never break or mask the caller's operation. Every call site relies on this. + try { + const out: Record = { + msg: EmitMessage, + request_id: state.requestId, + }; + + if (state.traceId) out.trace_id = state.traceId; + appendIfSet(out, "start_time", state.startTime); + appendIfSet(out, "service", state.service); + appendIfSet(out, "version", state.version); + appendIfSet(out, "commit_sha", state.commitSha); + appendIfSet(out, "region", state.region); + appendIfSet(out, "node_id", state.nodeId); + + appendIfSet(out, "op", state.op); + appendIfSet(out, "kind", state.kind); + + out.ok = state.ok; + if (state.statusCode !== 0) out.status = state.statusCode; + out.duration_ms = state.durationMs; + + if (state.error) { + appendIfSet(out, "error.code", state.error.code); + appendIfSet(out, "error.message", truncateUtf8(state.error.message, MAX_ERROR_MSG_BYTES)); + appendIfSet(out, "error.kind", state.error.kind); + } + + for (const [k, v] of Object.entries(state.meta)) { + out["meta." + k] = v; + } + + for (const p of state.phases) { + const prefix = "phase." + p.name + "."; + out[prefix + "duration_ms"] = p.durationMs; + out[prefix + "ok"] = p.ok; + out[prefix + "attempts"] = p.attempts; + if (p.errorCode) out[prefix + "error_code"] = p.errorCode; + if (p.errorMsg) out[prefix + "error_message"] = p.errorMsg; + if (p.sub) { + for (const [sk, sv] of Object.entries(p.sub)) { + out[prefix + sk] = sv; + } + } + } + + for (const [k, v] of Object.entries(state.extras)) { + out[k] = v; + } + + process.stdout.write(JSON.stringify(out) + "\n"); + } catch (err) { + try { + process.stderr.write( + `wide_event_emit_failed: ${err instanceof Error ? 
err.message : String(err)}\n` + ); + } catch { + // last resort - drop the event rather than throw + } + } +} + +function appendIfSet(out: Record, key: string, value: string | undefined): void { + if (value) out[key] = value; +} diff --git a/apps/supervisor/src/wideEvents/index.ts b/apps/supervisor/src/wideEvents/index.ts new file mode 100644 index 00000000000..4eda429a50a --- /dev/null +++ b/apps/supervisor/src/wideEvents/index.ts @@ -0,0 +1,29 @@ +/** + * Wide-event observability surface for the supervisor. One flat-keyed JSON + * line per natural unit of work (HTTP request, dequeue iteration, socket + * lifecycle event). Events join across services via `trace_id` (parsed from + * the inbound W3C `traceparent`) and `meta.run_id`. + * + * Off by default behind a kill switch - the dispatch hotpath runs at high + * QPS, so logging pressure must be cleanly removable. + */ +export { type Env, isValidRequestId, newState, type NewStateOptions } from "./new.js"; +export { emit, EmitMessage } from "./emit.js"; +export { parseTraceId } from "./traceparent.js"; +export { fromContext, wideEventStorage } from "./context.js"; +export { + type PhaseOpt, + recordPhase, + recordPhaseSince, + timePhase, +} from "./record.js"; +export { + emitOneShot, + runWideEvent, + setExtra, + setMeta, + type WideEventLifecycleOptions, + type WideEventOptions, +} from "./middleware.js"; +export type { ErrorInfo, PhaseRecord, State } from "./state.js"; +export { encodeBaggage } from "./baggage.js"; diff --git a/apps/supervisor/src/wideEvents/middleware.test.ts b/apps/supervisor/src/wideEvents/middleware.test.ts new file mode 100644 index 00000000000..afb59f43d6e --- /dev/null +++ b/apps/supervisor/src/wideEvents/middleware.test.ts @@ -0,0 +1,207 @@ +import { describe, it, expect } from "vitest"; +import { fromContext } from "./context.js"; +import { emitOneShot, runWideEvent, setMeta } from "./middleware.js"; + +function captureStdout(fn: () => Promise | unknown): Promise { + const captured: 
string[] = []; + const orig = process.stdout.write; + process.stdout.write = ((chunk: unknown) => { + captured.push(String(chunk)); + return true; + }) as typeof process.stdout.write; + return Promise.resolve(fn()) + .finally(() => { + process.stdout.write = orig; + }) + .then(() => captured); +} + +describe("runWideEvent", () => { + it("emits one event with ok=true when no statusCode is set", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test", route: "/x", method: "POST" }, + async () => undefined + ); + }); + expect(lines).toHaveLength(1); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(true); + expect(ev.service).toBe("supervisor"); + expect(ev.route).toBe("/x"); + expect(ev.method).toBe("POST"); + expect(typeof ev.duration_ms).toBe("number"); + expect(typeof ev.request_id).toBe("string"); + }); + + it("derives ok from statusCode set via finalize", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test" }, + async () => undefined, + (state) => { + state.statusCode = 200; + } + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(true); + expect(ev.status).toBe(200); + }); + + it("treats 4xx as ok=false", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test" }, + async () => undefined, + (state) => { + state.statusCode = 400; + } + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(false); + expect(ev.status).toBe(400); + }); + + it("emits ok=false with error.kind=internal on throw", async () => { + const lines = await captureStdout(async 
() => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test" }, + async () => { + throw new Error("boom"); + } + ).catch(() => undefined); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(false); + expect(ev.status).toBe(500); + expect(ev["error.kind"]).toBe("internal"); + expect(ev["error.message"]).toBe("boom"); + }); + + it("threads state through AsyncLocalStorage", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test" }, + async () => { + setMeta(fromContext(), "run_id", "run_abc"); + } + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev["meta.run_id"]).toBe("run_abc"); + expect(ev.ok).toBe(true); + }); + + it("picks up inbound traceparent for trace_id", async () => { + const tp = "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"; + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test", traceparent: tp }, + async () => undefined + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.trace_id).toBe("4bf92f3577b34da6a3ce929d0e0e4736"); + }); + + it("honours setup() to attach meta and extras before fn runs", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { + service: "supervisor", + env: {}, + enabled: true, op: "test", + setup: (state) => { + state.meta.run_id = "run_abc"; + state.extras.iteration = "dequeue"; + }, + }, + async () => undefined + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev["meta.run_id"]).toBe("run_abc"); + expect(ev.iteration).toBe("dequeue"); + }); + + it("short-circuits to 
pass-through when enabled=false", async () => { + let seenState: ReturnType = null; + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: false, op: "test" }, + async () => { + seenState = fromContext(); + } + ); + }); + expect(lines).toHaveLength(0); + expect(seenState).toBe(null); + }); + + it("isolates state across concurrent invocations", async () => { + const lines = await captureStdout(async () => { + await Promise.all( + ["a", "b", "c"].map((tag) => + runWideEvent( + { service: "supervisor", env: {}, enabled: true, op: "test" }, + async () => { + const s = fromContext(); + if (!s) throw new Error("no state"); + s.meta.tag = tag; + await new Promise((r) => setTimeout(r, 5)); + expect(s.meta.tag).toBe(tag); + } + ) + ) + ); + }); + const tags = lines.map((l) => (JSON.parse(l) as Record)["meta.tag"]); + expect(tags.sort()).toEqual(["a", "b", "c"]); + }); +}); + +describe("emitOneShot", () => { + it("emits a single event with populated meta when enabled", async () => { + const lines = await captureStdout(() => { + emitOneShot({ + service: "supervisor", + env: {}, + enabled: true, op: "test", + populate: (s) => { + s.meta.run_id = "run_abc"; + s.extras.event = "run:start"; + }, + }); + }); + expect(lines).toHaveLength(1); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(true); + expect(ev["meta.run_id"]).toBe("run_abc"); + expect(ev.event).toBe("run:start"); + }); + + it("emits nothing when disabled", async () => { + const lines = await captureStdout(() => { + emitOneShot({ service: "supervisor", env: {}, enabled: false, op: "test" }); + }); + expect(lines).toHaveLength(0); + }); +}); diff --git a/apps/supervisor/src/wideEvents/middleware.ts b/apps/supervisor/src/wideEvents/middleware.ts new file mode 100644 index 00000000000..034c136414f --- /dev/null +++ b/apps/supervisor/src/wideEvents/middleware.ts @@ -0,0 +1,132 @@ 
+import { emit } from "./emit.js"; +import { newState, type Env } from "./new.js"; +import { wideEventStorage } from "./context.js"; +import type { State } from "./state.js"; + +/** Options common to every wide-event lifecycle. */ +export type WideEventOptions = { + service: string; + env: Env; + /** + * Kill switch. When false, lifecycles degenerate into transparent + * pass-through - no State allocation, no AsyncLocalStorage run, no emit. + * Important for the dispatch hotpath where logging pressure must be + * cleanly removable. + */ + enabled: boolean; +}; + +/** Per-invocation options layered on top of `WideEventOptions`. */ +export type WideEventLifecycleOptions = WideEventOptions & { + /** Operation discriminator (`instance.create`, `dequeue`, ...). Required. */ + op: string; + /** Event shape: `inbound` | `outbound` | `event` | `scheduled`. Optional. */ + kind?: string; + /** Route template (HTTP only) captured into `extras.route`. */ + route?: string; + /** HTTP method captured into `extras.method`. */ + method?: string; + /** Inbound W3C traceparent (HTTP header, queue message field). */ + traceparent?: string; + /** Inbound request id (e.g. `x-request-id` header). */ + inboundRequestId?: string; + /** Runs after the state is built, before the wrapped fn. Use to attach meta. */ + setup?: (state: State) => void; +}; + +/** + * Runs `fn` inside an AsyncLocalStorage state and emits one wide event on + * completion or error. `finalize` runs after `fn` returns but before emit - + * use it to read out-of-band outcome info (e.g. `res.statusCode` for an HTTP + * route) and assign to `state.statusCode`. The wrapper computes `ok` from + * `statusCode` if it's set; otherwise it defaults to true on success. + * + * Returns the original `fn` result. When `enabled=false`, `fn` runs unchanged + * with no event emitted. 
+ */ +export async function runWideEvent( + opts: WideEventLifecycleOptions, + fn: () => Promise | T, + finalize?: (state: State) => void +): Promise { + if (!opts.enabled) { + return fn(); + } + + const state = newState({ + service: opts.service, + env: opts.env, + inboundRequestId: opts.inboundRequestId, + traceparent: opts.traceparent, + op: opts.op, + kind: opts.kind, + }); + if (opts.route) state.extras.route = opts.route; + if (opts.method) state.extras.method = opts.method; + + const start = performance.now(); + try { + if (opts.setup) opts.setup(state); + const result = await wideEventStorage.run(state, () => Promise.resolve(fn())); + state.durationMs = Math.round(performance.now() - start); + if (finalize) finalize(state); + if (state.statusCode !== 0) { + state.ok = state.statusCode >= 200 && state.statusCode < 300; + } else { + state.ok = true; + } + emit(state); + return result; + } catch (err) { + state.durationMs = Math.round(performance.now() - start); + const e = err instanceof Error ? err : new Error(String(err)); + if (state.statusCode === 0) state.statusCode = 500; + state.ok = false; + state.error = { + code: e.name || "Error", + message: e.message, + kind: "internal", + }; + emit(state); + throw err; + } +} + +/** + * One-shot wide event with no wrapped operation. Use for socket lifecycle + * events (`run:start`, `run:stop`) where there is no surrounding async unit + * of work to time. `populate` runs synchronously to attach meta/extras + * before emit. + */ +export function emitOneShot( + opts: WideEventOptions & { + op: string; + kind?: string; + traceparent?: string; + populate?: (state: State) => void; + } +): void { + if (!opts.enabled) return; + const state = newState({ + service: opts.service, + env: opts.env, + traceparent: opts.traceparent, + op: opts.op, + kind: opts.kind, + }); + if (opts.populate) opts.populate(state); + state.ok = true; + emit(state); +} + +/** Convenience accessor for in-handler meta mutation. 
*/ +export function setMeta(state: State | null, key: string, value: string): void { + if (!state) return; + state.meta[key] = value; +} + +/** Convenience for free-form fields (did_warm_start, dispatch.result, ...). */ +export function setExtra(state: State | null, key: string, value: unknown): void { + if (!state) return; + state.extras[key] = value; +} diff --git a/apps/supervisor/src/wideEvents/new.test.ts b/apps/supervisor/src/wideEvents/new.test.ts new file mode 100644 index 00000000000..476c49c3d0e --- /dev/null +++ b/apps/supervisor/src/wideEvents/new.test.ts @@ -0,0 +1,81 @@ +import { describe, it, expect } from "vitest"; +import { isValidRequestId, newState } from "./new.js"; + +describe("isValidRequestId", () => { + it("accepts visible ASCII", () => { + expect(isValidRequestId("req-abc-123_456.7")).toBe(true); + expect(isValidRequestId("a")).toBe(true); + }); + + it("rejects empty string", () => { + expect(isValidRequestId("")).toBe(false); + }); + + it("rejects overlong strings (>128 bytes)", () => { + expect(isValidRequestId("a".repeat(128))).toBe(true); + expect(isValidRequestId("a".repeat(129))).toBe(false); + }); + + it("rejects whitespace, newlines, control chars", () => { + expect(isValidRequestId("has space")).toBe(false); + expect(isValidRequestId("has\ttab")).toBe(false); + expect(isValidRequestId("has\nnewline")).toBe(false); + expect(isValidRequestId("\x00null")).toBe(false); + }); + + it("rejects high-bit / non-ASCII", () => { + expect(isValidRequestId("café")).toBe(false); + expect(isValidRequestId("a\x7f")).toBe(false); + }); +}); + +describe("newState", () => { + const env = { version: "1.0.0", commitSha: "abc123", region: "us-east-1", nodeId: "node-1" }; + + it("populates service identity from env", () => { + const s = newState({ service: "supervisor", env }); + expect(s.service).toBe("supervisor"); + expect(s.version).toBe("1.0.0"); + expect(s.commitSha).toBe("abc123"); + expect(s.region).toBe("us-east-1"); + 
expect(s.nodeId).toBe("node-1"); + }); + + it("mints a fresh request id when none provided", () => { + const s = newState({ service: "test", env: {} }); + expect(s.requestId).toMatch(/^req-[0-9a-f]{32}$/); + }); + + it("honours a valid inbound request id", () => { + const s = newState({ service: "test", env: {}, inboundRequestId: "trace-abc-123" }); + expect(s.requestId).toBe("trace-abc-123"); + }); + + it("rejects unsafe inbound request id and mints a fresh one", () => { + const s = newState({ service: "test", env: {}, inboundRequestId: "has space" }); + expect(s.requestId).toMatch(/^req-[0-9a-f]{32}$/); + }); + + it("parses traceparent into traceId and preserves the raw header", () => { + const tp = "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"; + const s = newState({ service: "test", env: {}, traceparent: tp }); + expect(s.traceId).toBe("4bf92f3577b34da6a3ce929d0e0e4736"); + expect(s.traceparent).toBe(tp); + }); + + it("leaves traceId empty when no traceparent provided", () => { + const s = newState({ service: "test", env: {} }); + expect(s.traceId).toBe(""); + expect(s.traceparent).toBe(""); + }); + + it("initialises empty meta/extras/phases", () => { + const s = newState({ service: "test", env: {} }); + expect(s.meta).toEqual({}); + expect(s.extras).toEqual({}); + expect(s.phases).toEqual([]); + expect(s.ok).toBe(false); + expect(s.statusCode).toBe(0); + expect(s.durationMs).toBe(0); + }); +}); diff --git a/apps/supervisor/src/wideEvents/new.ts b/apps/supervisor/src/wideEvents/new.ts new file mode 100644 index 00000000000..7a4dba8a09c --- /dev/null +++ b/apps/supervisor/src/wideEvents/new.ts @@ -0,0 +1,96 @@ +import { randomBytes } from "node:crypto"; +import { parseTraceId } from "./traceparent.js"; +import type { State } from "./state.js"; + +const MAX_REQUEST_ID_LEN = 128; + +/** + * Validates an inbound request id. Non-empty, no longer than 128 bytes, + * composed entirely of visible ASCII (0x21..0x7E). 
Rejects newlines, control + * characters, whitespace, DEL, high-bit bytes - any of which could poison the + * log pipeline if echoed back verbatim. + */ +export function isValidRequestId(s: string): boolean { + if (s.length === 0 || s.length > MAX_REQUEST_ID_LEN) return false; + for (let i = 0; i < s.length; i++) { + const c = s.charCodeAt(i); + if (c < 0x21 || c > 0x7e) return false; + } + return true; +} + +/** + * Service-level identity that's constant for the lifetime of the process. + * Populated once at startup, copied into every State. + */ +export type Env = { + version?: string; + commitSha?: string; + region?: string; + nodeId?: string; +}; + +export type NewStateOptions = { + service: string; + env: Env; + /** Optional inbound request id (e.g. from `x-request-id`). If unsafe or absent, a fresh `req-` is minted. */ + inboundRequestId?: string; + /** Optional inbound W3C traceparent (HTTP header, queue message field). */ + traceparent?: string; + /** Operation discriminator. Dotted `noun.verb`. Defaults to empty (set later). */ + op?: string; + /** Event shape: `inbound` | `outbound` | `event` | `scheduled`. Defaults to empty. */ + kind?: string; +}; + +/** + * Builds a State for a wide-event lifecycle. + * + * - requestId: honours `inboundRequestId` if present and safe; otherwise + * mints a fresh `req-` id. + * - traceId: parsed from the provided traceparent (graceful empty if + * absent or malformed). + * - traceparent: preserved verbatim for downstream propagation. + */ +export function newState(opts: NewStateOptions): State { + const traceparent = opts.traceparent ?? ""; + const inbound = opts.inboundRequestId ?? ""; + const requestId = isValidRequestId(inbound) ? inbound : newRequestId(); + + return { + startTime: nowRfc3339(), + requestId, + traceId: parseTraceId(traceparent), + traceparent, + service: opts.service, + version: opts.env.version, + commitSha: opts.env.commitSha, + region: opts.env.region, + nodeId: opts.env.nodeId, + op: opts.op ?? 
"", + kind: opts.kind ?? "", + meta: {}, + phases: [], + ok: false, + statusCode: 0, + durationMs: 0, + extras: {}, + }; +} + +function newRequestId(): string { + return "req-" + randomBytes(16).toString("hex"); +} + +/** + * Current wall-clock time as an RFC3339 string with microsecond precision. + * `Date.toISOString()` only has millisecond resolution, which is too coarse to + * order multiple wide events emitted within the same millisecond. + * `performance.timeOrigin + performance.now()` gives a sub-millisecond wall-clock + * reading; we append the microsecond digits to the millisecond ISO string. + */ +function nowRfc3339(): string { + const ms = performance.timeOrigin + performance.now(); + const micros = Math.floor((ms % 1) * 1000); // microseconds within the millisecond (0..999) + return new Date(ms).toISOString().slice(0, -1) + String(micros).padStart(3, "0") + "Z"; +} diff --git a/apps/supervisor/src/wideEvents/record.test.ts b/apps/supervisor/src/wideEvents/record.test.ts new file mode 100644 index 00000000000..beeb0fff221 --- /dev/null +++ b/apps/supervisor/src/wideEvents/record.test.ts @@ -0,0 +1,112 @@ +import { describe, it, expect } from "vitest"; +import { fromContext, wideEventStorage } from "./context.js"; +import { recordPhase, recordPhaseSince, timePhase } from "./record.js"; +import { newState } from "./new.js"; +import type { State } from "./state.js"; + +function makeState(): State { + return newState({ service: "test", env: {} }); +} + +describe("recordPhase", () => { + it("appends a successful phase", () => { + const s = makeState(); + recordPhase(s, "lookup", performance.now() - 50, undefined); + expect(s.phases).toHaveLength(1); + const phase = s.phases[0]; + if (!phase) throw new Error("missing phase"); + expect(phase.name).toBe("lookup"); + expect(phase.ok).toBe(true); + expect(phase.attempts).toBe(1); + expect(phase.durationMs).toBeGreaterThanOrEqual(45); + }); + + it("appends a failed phase with error code/message", () => { + const s 
= makeState(); + recordPhase(s, "dispatch", performance.now(), new Error("nope")); + const phase = s.phases[0]; + if (!phase) throw new Error("missing phase"); + expect(phase.ok).toBe(false); + expect(phase.errorCode).toBe("Error"); + expect(phase.errorMsg).toBe("nope"); + }); + + it("truncates very long error messages", () => { + const s = makeState(); + recordPhase(s, "x", performance.now(), new Error("y".repeat(2000))); + const phase = s.phases[0]; + if (!phase) throw new Error("missing phase"); + expect(phase.errorMsg?.length).toBe(512); + }); + + it("honours opts.attempts", () => { + const s = makeState(); + recordPhase(s, "retry", performance.now(), undefined, { attempts: 3 }); + expect(s.phases[0]?.attempts).toBe(3); + }); + + it("attaches sub-timings", () => { + const s = makeState(); + recordPhase(s, "complex", performance.now(), undefined, { sub: { setup_ms: 10, work_ms: 5 } }); + expect(s.phases[0]?.sub).toEqual({ setup_ms: 10, work_ms: 5 }); + }); + + it("is a no-op when state is null", () => { + expect(() => recordPhase(null, "x", performance.now(), undefined)).not.toThrow(); + }); +}); + +describe("timePhase + AsyncLocalStorage threading", () => { + it("records via fromContext on success", async () => { + const s = makeState(); + const value = await wideEventStorage.run(s, () => timePhase("work", async () => 42)); + expect(value).toBe(42); + expect(s.phases).toHaveLength(1); + expect(s.phases[0]?.ok).toBe(true); + }); + + it("records via fromContext on error and rethrows", async () => { + const s = makeState(); + await expect( + wideEventStorage.run(s, () => + timePhase("work", async () => { + throw new Error("boom"); + }) + ) + ).rejects.toThrow("boom"); + expect(s.phases).toHaveLength(1); + expect(s.phases[0]?.ok).toBe(false); + expect(s.phases[0]?.errorMsg).toBe("boom"); + }); + + it("runs fn unchanged when no state on context", async () => { + const value = await timePhase("work", async () => "ok"); + expect(value).toBe("ok"); + }); +}); + 
+describe("recordPhaseSince", () => { + it("records using a caller-captured start time", async () => { + const s = makeState(); + await wideEventStorage.run(s, async () => { + const start = performance.now(); + await new Promise((r) => setTimeout(r, 10)); + recordPhaseSince("spanning", start, undefined); + }); + expect(s.phases).toHaveLength(1); + expect(s.phases[0]?.durationMs).toBeGreaterThanOrEqual(8); + }); +}); + +describe("fromContext", () => { + it("returns null when no state attached", () => { + expect(fromContext()).toBe(null); + }); + + it("returns the state when inside wideEventStorage.run", () => { + const s = makeState(); + wideEventStorage.run(s, () => { + expect(fromContext()).toBe(s); + }); + }); +}); diff --git a/apps/supervisor/src/wideEvents/record.ts b/apps/supervisor/src/wideEvents/record.ts new file mode 100644 index 00000000000..b7b59089a0e --- /dev/null +++ b/apps/supervisor/src/wideEvents/record.ts @@ -0,0 +1,82 @@ +import { fromContext } from "./context.js"; +import type { PhaseRecord, State } from "./state.js"; +import { truncateUtf8 } from "./truncate.js"; + +const MAX_ERROR_MSG_BYTES = 512; + +/** Optional knobs for a phase record. */ +export type PhaseOpt = { + /** Attempt count for the phase (default 1). */ + attempts?: number; + /** Sub-timings to fold into `phase..`. */ + sub?: Record; +}; + +/** + * Appends a phase outcome to `state.phases`. Safe to call on success + * (`err === undefined`) and error paths. `errorMsg` is truncated to 512 bytes + * to keep the wide event compact. No-op if state is null. + */ +export function recordPhase( + state: State | null, + name: string, + startMs: number, + err: Error | undefined, + opts: PhaseOpt = {} +): void { + if (!state) return; + const p: PhaseRecord = { + name, + durationMs: Math.round(performance.now() - startMs), + ok: err === undefined, + attempts: opts.attempts ?? 
1, + }; + if (err) { + p.errorCode = err.name || "Error"; + p.errorMsg = truncateUtf8(err.message, MAX_ERROR_MSG_BYTES); + } + if (opts.sub) p.sub = opts.sub; + state.phases.push(p); +} + +/** + * Runs `fn` and appends a phase outcome to the State attached to the current + * async context. If no state is on context (test paths, background work), + * `fn` runs unchanged. The phase is recorded on both success and error paths + * so failed phases still appear in the wide event with duration_ms + + * error_code. + */ +export async function timePhase( + name: string, + fn: () => Promise | T, + opts: PhaseOpt = {} +): Promise { + const start = performance.now(); + try { + const result = await fn(); + recordPhase(fromContext(), name, start, undefined, opts); + return result; + } catch (err) { + recordPhase(fromContext(), name, start, asError(err), opts); + throw err; + } +} + +/** + * Appends a phase outcome to the State attached to the current async context + * using a `startMs` captured by the caller. Use when the phase boundary spans + * multiple calls with intermediate error handling that can't fit inside a + * single `timePhase` closure. Nil-state safe. + */ +export function recordPhaseSince( + name: string, + startMs: number, + err: Error | undefined, + opts: PhaseOpt = {} +): void { + recordPhase(fromContext(), name, startMs, err, opts); +} + +function asError(e: unknown): Error { + return e instanceof Error ? e : new Error(String(e)); +} diff --git a/apps/supervisor/src/wideEvents/state.ts b/apps/supervisor/src/wideEvents/state.ts new file mode 100644 index 00000000000..dece3a3f5fd --- /dev/null +++ b/apps/supervisor/src/wideEvents/state.ts @@ -0,0 +1,84 @@ +/** + * Per-event accumulator backing a single wide event. The supervisor emits one + * flat-keyed JSON line per natural unit of work (dequeue iteration, HTTP + * request, socket lifecycle event). Optional fields are omitted on emit so + * events stay compact. 
+ */ +export type State = { + /** + * Wall-clock time the event began, as an ISO-8601 string. Emitted as + * `start_time` so log collection orders events by when work started rather + * than by the collector's ingestion time. + */ + startTime?: string; + + // Cross-stack correlation. + requestId: string; + traceId: string; + /** + * Raw inbound W3C `traceparent`, preserved verbatim so outbound calls can + * propagate the same trace context without losing the parent span-id. + * Empty when no inbound traceparent was set. + */ + traceparent: string; + + // Service identity (set by `newState` from Env). + service: string; + version?: string; + commitSha?: string; + region?: string; + nodeId?: string; + + /** + * Operation discriminator. Dotted `noun.verb` (e.g. `instance.create`, + * `snapshot.dispatch`). Low cardinality - bounded set per service, not + * unbounded. Empty allowed during construction but expected to be set + * before emit. + */ + op: string; + + /** + * Event shape. `inbound` for received requests, `outbound` for outgoing + * calls, `event` for ambient occurrences with no meaningful duration, + * `scheduled` for timer-driven work. Empty allowed; omitted from emit + * when empty. + */ + kind: string; + + // Caller-attached opaque metadata, flattened to `meta.` on emit. + meta: Record; + + // Per-phase outcomes, in completion order. + phases: PhaseRecord[]; + + // Top-level outcome (set after the wrapped operation returns). + ok: boolean; + statusCode: number; + durationMs: number; + error?: ErrorInfo; + + // Free-form ad-hoc additions (route, method, did_warm_start, ...). + extras: Record; +}; + +/** + * Single named phase outcome. Retries collapse into `attempts > 1` with the + * last error reflected in errorCode/errorMsg. + */ +export type PhaseRecord = { + name: string; + durationMs: number; + ok: boolean; + attempts: number; + errorCode?: string; + errorMsg?: string; + sub?: Record; +}; + +/** Top-level error summary for a failed operation. 
*/ +export type ErrorInfo = { + code: string; + message: string; + /** Coarse classification - "client" | "upstream" | "internal" | "timeout". */ + kind: string; +}; diff --git a/apps/supervisor/src/wideEvents/traceparent.test.ts b/apps/supervisor/src/wideEvents/traceparent.test.ts new file mode 100644 index 00000000000..85ed31c3b6f --- /dev/null +++ b/apps/supervisor/src/wideEvents/traceparent.test.ts @@ -0,0 +1,43 @@ +import { describe, it, expect } from "vitest"; +import { parseTraceId } from "./traceparent.js"; + +describe("parseTraceId", () => { + const validTraceId = "4bf92f3577b34da6a3ce929d0e0e4736"; + const validHeader = `00-${validTraceId}-00f067aa0ba902b7-01`; + + it("extracts the trace-id from a valid W3C traceparent", () => { + expect(parseTraceId(validHeader)).toBe(validTraceId); + }); + + it("returns empty string for empty/null/undefined input", () => { + expect(parseTraceId("")).toBe(""); + expect(parseTraceId(null)).toBe(""); + expect(parseTraceId(undefined)).toBe(""); + }); + + it("returns empty for wrong segment count", () => { + expect(parseTraceId("00-abc-def")).toBe(""); + expect(parseTraceId("00-abc-def-01-extra")).toBe(""); + }); + + it("returns empty for non-zero version byte", () => { + expect(parseTraceId(`01-${validTraceId}-00f067aa0ba902b7-01`)).toBe(""); + }); + + it("returns empty for wrong-length trace-id", () => { + expect(parseTraceId("00-abc-00f067aa0ba902b7-01")).toBe(""); + }); + + it("returns empty for non-hex trace-id", () => { + expect(parseTraceId("00-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-00f067aa0ba902b7-01")).toBe(""); + }); + + it("returns empty for all-zero trace-id", () => { + expect(parseTraceId("00-00000000000000000000000000000000-00f067aa0ba902b7-01")).toBe(""); + }); + + it("accepts uppercase hex", () => { + const tid = "4BF92F3577B34DA6A3CE929D0E0E4736"; + expect(parseTraceId(`00-${tid}-00f067aa0ba902b7-01`)).toBe(tid); + }); +}); diff --git a/apps/supervisor/src/wideEvents/traceparent.ts 
b/apps/supervisor/src/wideEvents/traceparent.ts new file mode 100644 index 00000000000..9e84294f067 --- /dev/null +++ b/apps/supervisor/src/wideEvents/traceparent.ts @@ -0,0 +1,39 @@ +/** + * Extracts the trace-id from a W3C `traceparent` header. Returns "" when the + * header is absent, malformed, or carries an all-zero trace-id. + * + * Format: `---` + * version : 2 hex chars, must be "00" + * trace-id: 32 hex chars, non-zero + * span-id : 16 hex chars (not validated - we only need trace-id) + * flags : 2 hex chars (not validated) + */ +export function parseTraceId(header: string | null | undefined): string { + if (!header) return ""; + const parts = header.split("-"); + if (parts.length !== 4) return ""; + if (parts[0] !== "00") return ""; + const tid = parts[1]; + if (!tid || tid.length !== 32) return ""; + if (!isHex(tid)) return ""; + if (isAllZero(tid)) return ""; + return tid; +} + +function isHex(s: string): boolean { + for (let i = 0; i < s.length; i++) { + const c = s.charCodeAt(i); + const isDigit = c >= 0x30 && c <= 0x39; + const isLower = c >= 0x61 && c <= 0x66; + const isUpper = c >= 0x41 && c <= 0x46; + if (!isDigit && !isLower && !isUpper) return false; + } + return true; +} + +function isAllZero(s: string): boolean { + for (let i = 0; i < s.length; i++) { + if (s.charCodeAt(i) !== 0x30) return false; + } + return true; +} diff --git a/apps/supervisor/src/wideEvents/truncate.test.ts b/apps/supervisor/src/wideEvents/truncate.test.ts new file mode 100644 index 00000000000..4eb272f1e60 --- /dev/null +++ b/apps/supervisor/src/wideEvents/truncate.test.ts @@ -0,0 +1,30 @@ +import { describe, it, expect } from "vitest"; +import { truncateUtf8 } from "./truncate.js"; + +describe("truncateUtf8", () => { + it("returns short ASCII unchanged", () => { + expect(truncateUtf8("hello", 512)).toBe("hello"); + }); + + it("truncates ASCII to the byte cap", () => { + expect(truncateUtf8("x".repeat(1024), 256)).toBe("x".repeat(256)); + }); + + it("never exceeds the 
byte cap for multibyte input", () => { + // "あ" is 3 UTF-8 bytes; 200 of them = 600 bytes. + const got = truncateUtf8("あ".repeat(200), 256); + expect(Buffer.byteLength(got, "utf8")).toBeLessThanOrEqual(256); + }); + + it("does not split a multibyte sequence", () => { + // 256 / 3 bytes = 85 whole chars (255 bytes), the 86th would overflow. + expect(truncateUtf8("あ".repeat(200), 256)).toBe("あ".repeat(85)); + }); + + it("does not split a surrogate pair", () => { + // "😀" is 2 UTF-16 units / 4 UTF-8 bytes; only one fits under a 5-byte cap. + const got = truncateUtf8("😀😀", 5); + expect(got).toBe("😀"); + expect(Buffer.byteLength(got, "utf8")).toBe(4); + }); +}); diff --git a/apps/supervisor/src/wideEvents/truncate.ts b/apps/supervisor/src/wideEvents/truncate.ts new file mode 100644 index 00000000000..417735fe77d --- /dev/null +++ b/apps/supervisor/src/wideEvents/truncate.ts @@ -0,0 +1,20 @@ +/** + * Truncate `value` to at most `maxBytes` UTF-8 bytes without splitting a + * multi-byte sequence or surrogate pair. Plain `.slice()` counts UTF-16 code + * units, so multibyte text can blow past a byte cap and cutting mid-pair + * leaves a lone surrogate that downstream JSON / Postgres consumers reject. + */ +export function truncateUtf8(value: string, maxBytes: number): string { + if (Buffer.byteLength(value, "utf8") <= maxBytes) return value; + + let bytes = 0; + let end = 0; + // `for..of` yields whole code points, so a surrogate pair is never split. 
+ for (const ch of value) { + const size = Buffer.byteLength(ch, "utf8"); + if (bytes + size > maxBytes) break; + bytes += size; + end += ch.length; + } + return value.slice(0, end); +} diff --git a/apps/supervisor/src/workloadManager/compute.test.ts b/apps/supervisor/src/workloadManager/compute.test.ts new file mode 100644 index 00000000000..ea5ddabf285 --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from "vitest"; +import { ComputeClientError } from "@internal/compute"; +import { isRetryableCreateError, runnerNameForAttempt } from "./compute.js"; + +describe("runnerNameForAttempt", () => { + it("keeps the unsuffixed name for the first attempt", () => { + expect(runnerNameForAttempt("runner-abc123", 1)).toBe("runner-abc123"); + }); + + it("suffixes retry attempts deterministically", () => { + expect(runnerNameForAttempt("runner-abc123", 2)).toBe("runner-abc123-r2"); + expect(runnerNameForAttempt("runner-abc123", 3)).toBe("runner-abc123-r3"); + }); +}); + +describe("isRetryableCreateError", () => { + it("retries statuses where the create definitely did not commit", () => { + expect(isRetryableCreateError(new ComputeClientError(500, "tap busy", "http://gw"))).toBe( + true + ); + expect(isRetryableCreateError(new ComputeClientError(503, "no placement", "http://gw"))).toBe( + true + ); + }); + + it("does not retry lost-response statuses (create may have committed)", () => { + expect(isRetryableCreateError(new ComputeClientError(502, "bad gateway", "http://gw"))).toBe( + false + ); + expect( + isRetryableCreateError(new ComputeClientError(504, "gateway timeout", "http://gw")) + ).toBe(false); + }); + + it("does not retry 4xx responses", () => { + expect(isRetryableCreateError(new ComputeClientError(400, "bad request", "http://gw"))).toBe( + false + ); + expect(isRetryableCreateError(new ComputeClientError(409, "conflict", "http://gw"))).toBe( + false + ); + }); + + it("does not retry timeouts 
(instance may still be provisioning)", () => { + expect(isRetryableCreateError(new DOMException("timed out", "TimeoutError"))).toBe(false); + }); + + it("retries network-level fetch failures", () => { + expect(isRetryableCreateError(new TypeError("fetch failed"))).toBe(true); + }); + + it("does not retry unknown errors", () => { + expect(isRetryableCreateError(new Error("something else"))).toBe(false); + expect(isRetryableCreateError("string error")).toBe(false); + }); +}); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 1c00f33aad3..88c7645bbdf 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -6,10 +6,57 @@ import { type WorkloadManagerCreateOptions, type WorkloadManagerOptions, } from "./types.js"; -import { ComputeClient, stripImageDigest } from "@internal/compute"; +import { ComputeClient, ComputeClientError, stripImageDigest } from "@internal/compute"; +import { setTimeout as sleep } from "node:timers/promises"; import { extractTraceparent, getRunnerId } from "../util.js"; import type { OtlpTraceService } from "../services/otlpTraceService.js"; import { tryCatch } from "@trigger.dev/core"; +import { encodeBaggage, fromContext } from "../wideEvents/index.js"; + +const DEFAULT_CREATE_MAX_ATTEMPTS = 3; +const DEFAULT_CREATE_RETRY_BASE_DELAY_MS = 250; + +/** + * TEMPORARY (TRI-10293): a failed create can leave its instance name + * registered gateway/fcrun-side until async cleanup runs, so a same-name + * retry can 409 against our own residue. Until the gateway cleans up + * failed-create registrations properly, retry attempts get a deterministic + * suffix. Attempt 1 keeps the unsuffixed name so the non-retry path is + * unchanged; the suffixed name flows into both the instance name and + * TRIGGER_RUNNER_ID, which downstream flows treat as one opaque + * self-reported token. 
Only attempts following a ComputeClientError are + * suffixed - network-failure retries keep the same name on purpose, because + * the gateway's name-collision 409 is their safety net against + * double-creating an instance whose create response was lost. + */ +export function runnerNameForAttempt(runnerId: string, attempt: number): string { + return attempt === 1 ? runnerId : `${runnerId}-r${attempt}`; +} + +/** + * Whether a failed instance create is worth retrying. Only statuses where + * the create definitely did NOT commit are retried: 500 means the agent or + * fcrun returned a create error (e.g. a netns slot holding the tap busy, a + * full node disk - placement may differ on retry), 503 means the gateway + * had nowhere to place it. 502/504 are excluded: the gateway emits those + * when it fails to reach the node or read its response, which can happen + * AFTER the agent committed the create - and the gateway only records the + * instance name on a clean 201, so a same-name retry would miss the + * collision check and could double-create the VM on another node. 4xx won't + * heal on retry, and timeouts may still be provisioning. Network-level + * fetch failures are safe: if the gateway processed the create, its name + * index is populated and the retry 409s harmlessly. 
+ */ +export function isRetryableCreateError(error: unknown): boolean { + if (error instanceof ComputeClientError) { + return error.status === 500 || error.status === 503; + } + if (error instanceof DOMException && error.name === "TimeoutError") { + return false; + } + // Network-level fetch failures (gateway briefly unreachable) + return error instanceof TypeError; +} type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { gateway: { @@ -29,13 +76,23 @@ type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { otelEndpoint: string; prettyLogs: boolean; }; + createRetry?: { + maxAttempts: number; + baseDelayMs: number; + }; }; export class ComputeWorkloadManager implements WorkloadManager { private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); private readonly compute: ComputeClient; + private readonly createMaxAttempts: number; + private readonly createRetryBaseDelayMs: number; constructor(private opts: ComputeWorkloadManagerOptions) { + this.createMaxAttempts = opts.createRetry?.maxAttempts ?? DEFAULT_CREATE_MAX_ATTEMPTS; + this.createRetryBaseDelayMs = + opts.createRetry?.baseDelayMs ?? DEFAULT_CREATE_RETRY_BASE_DELAY_MS; + if (opts.workloadApiDomain) { this.logger.warn("⚠️ Custom workload API domain", { domain: opts.workloadApiDomain, @@ -46,6 +103,26 @@ export class ComputeWorkloadManager implements WorkloadManager { gatewayUrl: opts.gateway.url, authToken: opts.gateway.authToken, timeoutMs: opts.gateway.timeoutMs, + // Forward the current wide-event scope's traceparent + request_id so the + // downstream service continues the same trace and joins its own wide + // events to ours. Additionally serialize caller-supplied meta labels + // into the W3C Baggage header so the downstream service auto-stamps + // them even on early-error paths that bail before parsing the body. + // When called outside a wide-event scope (or when wide events are + // disabled), `fromContext` returns undefined and propagation is skipped. 
+ getPropagationHeaders: () => { + const state = fromContext(); + if (!state) return {}; + const headers: Record = { "x-request-id": state.requestId }; + if (state.traceparent) { + headers.traceparent = state.traceparent; + } + const baggage = encodeBaggage(state.meta); + if (baggage) { + headers.baggage = baggage; + } + return headers; + }, }); } @@ -112,6 +189,12 @@ export class ComputeWorkloadManager implements WorkloadManager { // Strip image digest - resolve by tag, not digest const imageRef = stripImageDigest(opts.image); + // Labels forwarded to the compute provider for network-policy selection. + // `org` is always set so every run carries its org identity. + const labels: Record = { + org: opts.orgId, + }; + // Wide event: single canonical log line emitted in finally const event: Record = { // High-cardinality identifiers @@ -136,26 +219,71 @@ export class ComputeWorkloadManager implements WorkloadManager { const startMs = performance.now(); try { - const [error, data] = await tryCatch( - this.compute.instances.create({ - name: runnerId, - image: imageRef, - env: envVars, - cpu: opts.machine.cpu, - memory_gb: opts.machine.memory, - metadata: { - runId: opts.runFriendlyId, - envId: opts.envId, - envType: opts.envType, - orgId: opts.orgId, - projectId: opts.projectId, - deploymentVersion: opts.deploymentVersion, - machine: opts.machine.name, - }, - }) - ); + const createRequest = { + name: runnerId, + image: imageRef, + env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + }, + ...(Object.keys(labels).length > 0 ? 
{ labels } : {}), + }; + + // Retry transient placement failures instead of abandoning the run: a + // swallowed create error leaves the run waiting for the run engine's + // PENDING_EXECUTING timeout (minutes) before it is redriven, while a + // retried create typically succeeds in under a second (TRI-10293). + let error: unknown; + let data: Awaited> | null | undefined; + let attempt = 1; + // Set after a ComputeClientError: the failed create may have left its + // name registered, so subsequent attempts use a suffixed name. + let suffixAttempts = false; + for (; attempt <= this.createMaxAttempts; attempt++) { + const attemptRunnerId = suffixAttempts + ? runnerNameForAttempt(runnerId, attempt) + : runnerId; + [error, data] = await tryCatch( + this.compute.instances.create( + attemptRunnerId === runnerId + ? createRequest + : { + ...createRequest, + name: attemptRunnerId, + env: { ...envVars, TRIGGER_RUNNER_ID: attemptRunnerId }, + } + ) + ); + + if (!error) { + event.runnerId = attemptRunnerId; + break; + } + + if (error instanceof ComputeClientError) { + suffixAttempts = true; + } + + this.logger.warn("create instance attempt failed", { + runnerId: attemptRunnerId, + attempt, + error: error instanceof Error ? error.message : String(error), + }); + + if (!isRetryableCreateError(error) || attempt === this.createMaxAttempts) break; + await sleep(this.createRetryBaseDelayMs * attempt); + } + event.createAttempts = attempt; - if (error) { + if (error || !data) { event.error = error instanceof Error ? error.message : String(error); event.errorType = error instanceof DOMException && error.name === "TimeoutError" ? 
"timeout" : "fetch"; @@ -276,6 +404,7 @@ export class ComputeWorkloadManager implements WorkloadManager { envId?: string; orgId?: string; projectId?: string; + hasPrivateLink?: boolean; dequeuedAt?: Date; }): Promise { const metadata: Record = { @@ -288,6 +417,13 @@ export class ComputeWorkloadManager implements WorkloadManager { TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, }; + // Resupply labels on restore (the provider doesn't persist them across a + // snapshot). orgId is optional on the restore opts type, so guard it. + const labels: Record = {}; + if (opts.orgId) { + labels.org = opts.orgId; + } + this.logger.verbose("restore request body", { snapshotId: opts.snapshotId, runnerId: opts.runnerId, @@ -301,6 +437,7 @@ export class ComputeWorkloadManager implements WorkloadManager { metadata, cpu: opts.machine.cpu, memory_gb: opts.machine.memory, + ...(Object.keys(labels).length > 0 ? { labels } : {}), }) ); diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index ec089267219..b2ed05c9f11 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -321,6 +321,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { }, } : {}), + ...(env.KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED + ? 
{ + dnsConfig: { + options: [{ name: "ndots", value: `${env.KUBERNETES_POD_DNS_NDOTS}` }], + }, + } + : {}), }; } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index bd38cc8700f..bf4b38012d5 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -31,6 +31,14 @@ import { } from "../services/computeSnapshotService.js"; import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; import type { OtlpTraceService } from "../services/otlpTraceService.js"; +import type { ServerResponse } from "node:http"; +import { + emitOneShot, + runWideEvent, + setMeta, + type State, + type WideEventOptions, +} from "../wideEvents/index.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -43,6 +51,18 @@ const WorkloadActionParams = z.object({ snapshotFriendlyId: z.string(), }); +// Workloads bundled into customer task images before CLI v4.4.4 use a strict +// zod enum for checkpoint type that only allows DOCKER and KUBERNETES. The +// workload never reads this field - it only validates the response shape - so +// rewriting it to a known value keeps older runners working without affecting +// the value stored in the database or seen by internal services. +function legacifyCheckpointType(item: T): T { + if (item.checkpoint?.type === "COMPUTE") { + return { ...item, checkpoint: { ...item.checkpoint, type: "KUBERNETES" } } as T; + } + return item; +} + type WorkloadServerEvents = { runConnected: [ { @@ -67,6 +87,9 @@ type WorkloadServerOptions = { checkpointClient?: CheckpointClient; computeManager?: ComputeWorkloadManager; tracing?: OtlpTraceService; + wideEventOpts: WideEventOptions; + /** When true, high-frequency HTTP routes also emit wide events. 
*/ + wideEventsNoisyRoutes: boolean; }; export class WorkloadServer extends EventEmitter { @@ -74,6 +97,8 @@ export class WorkloadServer extends EventEmitter { private readonly snapshotService?: ComputeSnapshotService; private readonly logger = new SimpleStructuredLogger("workload-server"); + private readonly wideEventOpts: WideEventOptions; + private readonly wideEventsNoisyRoutes: boolean; private readonly httpServer: HttpServer; private readonly websocketServer: Namespace< @@ -103,12 +128,15 @@ export class WorkloadServer extends EventEmitter { this.workerClient = opts.workerClient; this.checkpointClient = opts.checkpointClient; + this.wideEventOpts = opts.wideEventOpts; + this.wideEventsNoisyRoutes = opts.wideEventsNoisyRoutes; if (opts.computeManager?.snapshotsEnabled) { this.snapshotService = new ComputeSnapshotService({ computeManager: opts.computeManager, workerClient: opts.workerClient, tracing: opts.tracing, + wideEventOpts: this.wideEventOpts, }); } @@ -142,6 +170,59 @@ export class WorkloadServer extends EventEmitter { return this.headerValueFromRequest(req, WORKLOAD_HEADERS.PROJECT_REF); } + /** + * Sets common route meta on the wide-event state from URL params. + */ + private attachRouteMeta(state: State, params: unknown): void { + if (!params || typeof params !== "object") return; + const p = params as Record; + if (typeof p.runFriendlyId === "string") setMeta(state, "run_id", p.runFriendlyId); + if (typeof p.snapshotFriendlyId === "string") { + setMeta(state, "snapshot_id", p.snapshotFriendlyId); + } + if (typeof p.deploymentId === "string") setMeta(state, "deployment_id", p.deploymentId); + } + + /** + * Wraps an HTTP route handler body with the wide-event lifecycle. Reads + * `traceparent` and `x-request-id` from `req.headers`, attaches `run_id` / + * `snapshot_id` / `deployment_id` meta from `params` when present, and + * captures the response status from `res.statusCode` after `fn` returns. 
+ * + * Pass `highFrequency: true` for noisy routes (heartbeat, polling). Those + * still go through the wrapper but only emit when + * `TRIGGER_WIDE_EVENTS_NOISY_ROUTES` is on, so prod can keep them dark + * while test envs capture full-fidelity traffic for debugging. + */ + private wideRoute( + ctx: { req: IncomingMessage; res: ServerResponse; params?: unknown }, + op: string, + route: string, + method: string, + fn: () => Promise | T, + routeOpts: { highFrequency?: boolean } = {} + ): Promise { + const enabled = + this.wideEventOpts.enabled && (!routeOpts.highFrequency || this.wideEventsNoisyRoutes); + return runWideEvent( + { + ...this.wideEventOpts, + enabled, + op, + kind: "inbound", + route, + method, + traceparent: this.headerValueFromRequest(ctx.req, "traceparent"), + inboundRequestId: this.headerValueFromRequest(ctx.req, "x-request-id"), + setup: (state) => this.attachRouteMeta(state, ctx.params), + }, + fn, + (state) => { + state.statusCode = ctx.res.statusCode; + } + ); + } + private createHttpServer({ host, port }: { host: string; port: number }) { const httpServer = new HttpServer({ port, @@ -162,26 +243,34 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, bodySchema: WorkloadRunAttemptStartRequestBody, - handler: async ({ req, reply, params, body }) => { - const startResponse = await this.workerClient.startRunAttempt( - params.runFriendlyId, - params.snapshotFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - - if (!startResponse.success) { - this.logger.error("Failed to start run", { - params, - error: startResponse.error, - }); - reply.empty(500); - return; - } - - reply.json(startResponse.data satisfies WorkloadRunAttemptStartResponseBody); - return; - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "attempt.start", + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/start", + "POST", + async () => { + const { req, reply, params, body } = ctx; + const 
startResponse = await this.workerClient.startRunAttempt( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!startResponse.success) { + this.logger.error("Failed to start run", { + params, + error: startResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(startResponse.data satisfies WorkloadRunAttemptStartResponseBody); + return; + } + ), } ) .route( @@ -190,26 +279,50 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, bodySchema: WorkloadRunAttemptCompleteRequestBody, - handler: async ({ req, reply, params, body }) => { - const completeResponse = await this.workerClient.completeRunAttempt( - params.runFriendlyId, - params.snapshotFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - - if (!completeResponse.success) { - this.logger.error("Failed to complete run", { - params, - error: completeResponse.error, - }); - reply.empty(500); - return; - } - - reply.json(completeResponse.data satisfies WorkloadRunAttemptCompleteResponseBody); - return; - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "attempt.complete", + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/complete", + "POST", + async () => { + const { req, reply, params, body } = ctx; + const runnerId = this.runnerIdFromRequest(req); + + // A completion attempt invalidates any pending delayed snapshot + // regardless of outcome: the runner has finished executing, so the + // suspended state the snapshot was scheduled to capture no longer + // exists. Cancel BEFORE the async completion call - the timer + // wheel can tick during the await, so cancelling after it leaves + // a real window for a due snapshot to dispatch and pause a VM + // that has moved on. 
The runnerId guard keeps a stale duplicate + // runner's completion from cancelling a fresh runner's snapshot, + // and the runner can't schedule a new suspend until it receives + // this route's reply, so nothing legitimate can be cancelled here. + this.snapshotService?.cancel(params.runFriendlyId, runnerId); + + const completeResponse = await this.workerClient.completeRunAttempt( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + runnerId + ); + + if (!completeResponse.success) { + this.logger.error("Failed to complete run", { + params, + error: completeResponse.error, + }); + reply.empty(500); + return; + } + + reply.json( + completeResponse.data satisfies WorkloadRunAttemptCompleteResponseBody + ); + return; + } + ), } ) .route( @@ -218,27 +331,36 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, bodySchema: WorkloadHeartbeatRequestBody, - handler: async ({ req, reply, params, body }) => { - const heartbeatResponse = await this.workerClient.heartbeatRun( - params.runFriendlyId, - params.snapshotFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - - if (!heartbeatResponse.success) { - this.logger.error("Failed to heartbeat run", { - params, - error: heartbeatResponse.error, - }); - reply.empty(500); - return; - } - - reply.json({ - ok: true, - } satisfies WorkloadHeartbeatResponseBody); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "heartbeat", + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/heartbeat", + "POST", + async () => { + const { req, reply, params, body } = ctx; + const heartbeatResponse = await this.workerClient.heartbeatRun( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!heartbeatResponse.success) { + this.logger.error("Failed to heartbeat run", { + params, + error: heartbeatResponse.error, + }); + reply.empty(500); + return; + } + + reply.json({ + ok: true, + } satisfies 
WorkloadHeartbeatResponseBody); + }, + { highFrequency: true } + ), } ) .route( @@ -246,87 +368,95 @@ export class WorkloadServer extends EventEmitter { "GET", { paramsSchema: WorkloadActionParams, - handler: async ({ reply, params, req }) => { - const runnerId = this.runnerIdFromRequest(req); - const deploymentVersion = this.deploymentVersionFromRequest(req); - const projectRef = this.projectRefFromRequest(req); - - this.logger.debug("Suspend request", { - params, - runnerId, - deploymentVersion, - projectRef, - }); - - if (!runnerId || !deploymentVersion || !projectRef) { - this.logger.error("Invalid headers for suspend request", { - ...params, - runnerId, - deploymentVersion, - projectRef, - }); - reply.json( - { - ok: false, - error: "Invalid headers", - } satisfies WorkloadSuspendRunResponseBody, - false, - 400 - ); - return; - } - - if (this.snapshotService) { - // Compute mode: delay snapshot to avoid wasted work on short-lived waitpoints. - // If the run continues before the delay expires, the snapshot is cancelled. 
- reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - - this.snapshotService.schedule(params.runFriendlyId, { - runnerId, - runFriendlyId: params.runFriendlyId, - snapshotFriendlyId: params.snapshotFriendlyId, - }); - - return; - } - - if (!this.checkpointClient) { - reply.json( - { - ok: false, - error: "Checkpoints disabled", - } satisfies WorkloadSuspendRunResponseBody, - false, - 400 - ); - return; - } - - reply.json( - { - ok: true, - } satisfies WorkloadSuspendRunResponseBody, - false, - 202 - ); - - const suspendResult = await this.checkpointClient.suspendRun({ - runFriendlyId: params.runFriendlyId, - snapshotFriendlyId: params.snapshotFriendlyId, - body: { - runnerId, - runId: params.runFriendlyId, - snapshotId: params.snapshotFriendlyId, - projectRef, - deploymentVersion, - }, - }); - - if (!suspendResult) { - this.logger.error("Failed to suspend run", { params }); - return; - } - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "suspend", + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/suspend", + "GET", + async () => { + const { reply, params, req } = ctx; + const runnerId = this.runnerIdFromRequest(req); + const deploymentVersion = this.deploymentVersionFromRequest(req); + const projectRef = this.projectRefFromRequest(req); + + this.logger.debug("Suspend request", { + params, + runnerId, + deploymentVersion, + projectRef, + }); + + if (!runnerId || !deploymentVersion || !projectRef) { + this.logger.error("Invalid headers for suspend request", { + ...params, + runnerId, + deploymentVersion, + projectRef, + }); + reply.json( + { + ok: false, + error: "Invalid headers", + } satisfies WorkloadSuspendRunResponseBody, + false, + 400 + ); + return; + } + + if (this.snapshotService) { + // Compute mode: delay snapshot to avoid wasted work on short-lived waitpoints. + // If the run continues before the delay expires, the snapshot is cancelled. 
+ reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); + + this.snapshotService.schedule(params.runFriendlyId, { + runnerId, + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + }); + + return; + } + + if (!this.checkpointClient) { + reply.json( + { + ok: false, + error: "Checkpoints disabled", + } satisfies WorkloadSuspendRunResponseBody, + false, + 400 + ); + return; + } + + reply.json( + { + ok: true, + } satisfies WorkloadSuspendRunResponseBody, + false, + 202 + ); + + const suspendResult = await this.checkpointClient.suspendRun({ + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + body: { + runnerId, + runId: params.runFriendlyId, + snapshotId: params.snapshotFriendlyId, + projectRef, + deploymentVersion, + }, + }); + + if (!suspendResult) { + this.logger.error("Failed to suspend run", { params }); + return; + } + } + ), } ) .route( @@ -334,33 +464,41 @@ export class WorkloadServer extends EventEmitter { "GET", { paramsSchema: WorkloadActionParams, - handler: async ({ req, reply, params }) => { - this.logger.debug("Run continuation request", { params }); - - // Cancel any pending delayed snapshot for this run - this.snapshotService?.cancel(params.runFriendlyId); - - const continuationResult = await this.workerClient.continueRunExecution( - params.runFriendlyId, - params.snapshotFriendlyId, - this.runnerIdFromRequest(req) - ); - - if (!continuationResult.success) { - this.logger.error("Failed to continue run execution", { params }); - reply.json( - { - ok: false, - error: "Failed to continue run execution", - }, - false, - 400 - ); - return; - } - - reply.json(continuationResult.data as WorkloadContinueRunExecutionResponseBody); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "continue", + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/continue", + "GET", + async () => { + const { req, reply, params } = ctx; + 
this.logger.debug("Run continuation request", { params }); + + // Cancel any pending delayed snapshot for this run + this.snapshotService?.cancel(params.runFriendlyId); + + const continuationResult = await this.workerClient.continueRunExecution( + params.runFriendlyId, + params.snapshotFriendlyId, + this.runnerIdFromRequest(req) + ); + + if (!continuationResult.success) { + this.logger.error("Failed to continue run execution", { params }); + reply.json( + { + ok: false, + error: "Failed to continue run execution", + }, + false, + 400 + ); + return; + } + + reply.json(continuationResult.data as WorkloadContinueRunExecutionResponseBody); + } + ), } ) .route( @@ -368,24 +506,35 @@ export class WorkloadServer extends EventEmitter { "GET", { paramsSchema: WorkloadActionParams, - handler: async ({ req, reply, params }) => { - const sinceSnapshotResponse = await this.workerClient.getSnapshotsSince( - params.runFriendlyId, - params.snapshotFriendlyId, - this.runnerIdFromRequest(req) - ); - - if (!sinceSnapshotResponse.success) { - this.logger.error("Failed to get snapshots since", { - runId: params.runFriendlyId, - error: sinceSnapshotResponse.error, - }); - reply.empty(500); - return; - } - - reply.json(sinceSnapshotResponse.data satisfies WorkloadRunSnapshotsSinceResponseBody); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "snapshots.since", + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/since/:snapshotFriendlyId", + "GET", + async () => { + const { req, reply, params } = ctx; + const sinceSnapshotResponse = await this.workerClient.getSnapshotsSince( + params.runFriendlyId, + params.snapshotFriendlyId, + this.runnerIdFromRequest(req) + ); + + if (!sinceSnapshotResponse.success) { + this.logger.error("Failed to get snapshots since", { + runId: params.runFriendlyId, + error: sinceSnapshotResponse.error, + }); + reply.empty(500); + return; + } + + reply.json({ + snapshots: sinceSnapshotResponse.data.snapshots.map(legacifyCheckpointType), + } satisfies 
WorkloadRunSnapshotsSinceResponseBody); + }, + { highFrequency: true } + ), } ) .route("/api/v1/workload-actions/deployments/:deploymentId/dequeue", "GET", { @@ -393,61 +542,90 @@ export class WorkloadServer extends EventEmitter { deploymentId: z.string(), }), - handler: async ({ req, reply, params }) => { - const dequeueResponse = await this.workerClient.dequeueFromVersion( - params.deploymentId, - 1, - this.runnerIdFromRequest(req) - ); - - if (!dequeueResponse.success) { - this.logger.error("Failed to get latest snapshot", { - deploymentId: params.deploymentId, - error: dequeueResponse.error, - }); - reply.empty(500); - return; - } + handler: async (ctx) => + this.wideRoute( + ctx, + "deployment.dequeue", + "/api/v1/workload-actions/deployments/:deploymentId/dequeue", + "GET", + async () => { + const { req, reply, params } = ctx; + const dequeueResponse = await this.workerClient.dequeueFromVersion( + params.deploymentId, + 1, + this.runnerIdFromRequest(req) + ); - reply.json(dequeueResponse.data satisfies WorkloadDequeueFromVersionResponseBody); - }, + if (!dequeueResponse.success) { + this.logger.error("Failed to get latest snapshot", { + deploymentId: params.deploymentId, + error: dequeueResponse.error, + }); + reply.empty(500); + return; + } + + reply.json( + dequeueResponse.data.map(legacifyCheckpointType) satisfies WorkloadDequeueFromVersionResponseBody + ); + } + ), }); if (env.SEND_RUN_DEBUG_LOGS) { httpServer.route("/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", "POST", { paramsSchema: WorkloadActionParams.pick({ runFriendlyId: true }), bodySchema: WorkloadDebugLogRequestBody, - handler: async ({ req, reply, params, body }) => { - reply.empty(204); - - await this.workerClient.sendDebugLog( - params.runFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "logs.debug", + "/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", + "POST", + async () => { + const { req, reply, params, 
body } = ctx; + reply.empty(204); + + await this.workerClient.sendDebugLog( + params.runFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + }, + { highFrequency: true } + ), }); } else { // Lightweight mock route without schemas httpServer.route("/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", "POST", { - handler: async ({ reply }) => { - reply.empty(204); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "logs.debug", + "/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", + "POST", + async () => { + ctx.reply.empty(204); + }, + { highFrequency: true } + ), }); } - // Compute snapshot callback endpoint + // Snapshot callback endpoint (inbound from compute path) httpServer.route("/api/v1/compute/snapshot-complete", "POST", { bodySchema: SnapshotCallbackPayloadSchema, - handler: async ({ reply, body }) => { - if (!this.snapshotService) { - reply.empty(404); - return; - } + handler: async (ctx) => + this.wideRoute(ctx, "snapshot.callback", "/api/v1/compute/snapshot-complete", "POST", async () => { + const { reply, body } = ctx; + if (!this.snapshotService) { + reply.empty(404); + return; + } - const result = await this.snapshotService.handleCallback(body); - reply.empty(result.status); - }, + const result = await this.snapshotService.handleCallback(body); + reply.empty(result.status); + }), }); return httpServer; @@ -520,6 +698,28 @@ export class WorkloadServer extends EventEmitter { }; }; + const emitSocketLifecycle = ( + event: "run_connected" | "run_disconnected", + friendlyId: string, + disconnectReason?: string + ) => { + emitOneShot({ + ...this.wideEventOpts, + op: event === "run_connected" ? 
"socket.run.connected" : "socket.run.disconnected", + kind: "event", + populate: (state) => { + state.extras.event = event; + setMeta(state, "run_id", friendlyId); + if (socket.data.deploymentId) { + setMeta(state, "deployment_id", socket.data.deploymentId); + } + if (socket.data.runnerId) setMeta(state, "runner_id", socket.data.runnerId); + state.extras.socket_id = socket.id; + if (disconnectReason) state.extras.disconnect_reason = disconnectReason; + }, + }); + }; + const runConnected = (friendlyId: string) => { socketLogger.debug("runConnected", { ...getSocketMetadata() }); @@ -530,20 +730,35 @@ export class WorkloadServer extends EventEmitter { newRunId: friendlyId, oldRunId: socket.data.runFriendlyId, }); - runDisconnected(socket.data.runFriendlyId); + runDisconnected(socket.data.runFriendlyId, "socket_run_replaced"); } this.runSockets.set(friendlyId, socket); this.emit("runConnected", { run: { friendlyId } }); socket.data.runFriendlyId = friendlyId; + emitSocketLifecycle("run_connected", friendlyId); }; - const runDisconnected = (friendlyId: string) => { + const runDisconnected = (friendlyId: string, reason: string) => { socketLogger.debug("runDisconnected", { ...getSocketMetadata() }); + // The run is gone from this runner (crash, exit, or replaced by a new + // run), so a pending delayed snapshot for it is stale. Genuine + // waitpoint suspensions keep the socket connected, so this doesn't + // cancel a snapshot that's still wanted; the runnerId match guards + // against a stale duplicate runner cancelling a fresh runner's + // snapshot after the run was reassigned. Caveat: socket.data.runnerId + // is frozen at the websocket handshake, so after a same-supervisor + // restore (new runner id, socket not recreated) this guard refuses + // the cancel - a missed cancel, never a wrong one. The + // attempt.complete cancel uses the runner's current HTTP header id + // and is unaffected. 
+ this.snapshotService?.cancel(friendlyId, socket.data.runnerId); + this.runSockets.delete(friendlyId); this.emit("runDisconnected", { run: { friendlyId } }); socket.data.runFriendlyId = undefined; + emitSocketLifecycle("run_disconnected", friendlyId, reason); }; socketLogger.debug("wsServer socket connected", { ...getSocketMetadata() }); @@ -561,7 +776,7 @@ export class WorkloadServer extends EventEmitter { }); if (socket.data.runFriendlyId) { - runDisconnected(socket.data.runFriendlyId); + runDisconnected(socket.data.runFriendlyId, `socket_disconnecting:${reason}`); } }); @@ -606,7 +821,7 @@ export class WorkloadServer extends EventEmitter { log.debug("Handling run:stop"); try { - runDisconnected(message.run.friendlyId); + runDisconnected(message.run.friendlyId, "run_stop_message"); // Don't delete trace context here - run:stop fires after each snapshot/shutdown // but the run may be restored on a new VM and snapshot again. Trace context is // re-populated on dequeue, and entries are small (4 strings per run). diff --git a/apps/webapp/CLAUDE.md b/apps/webapp/CLAUDE.md index b0f5e09b829..a4de6ab57b7 100644 --- a/apps/webapp/CLAUDE.md +++ b/apps/webapp/CLAUDE.md @@ -1,6 +1,6 @@ # Webapp -Remix 2.1.0 app serving as the main API, dashboard, and orchestration engine. Uses an Express server (`server.ts`). +Remix 2.17.4 app serving as the main API, dashboard, and orchestration engine. Uses an Express server (`server.ts`). ## Verifying Changes @@ -59,6 +59,17 @@ Use the `chrome-devtools` MCP server to visually verify local dashboard changes. Routes use Remix flat-file convention with dot-separated segments: `api.v1.tasks.$taskId.trigger.ts` -> `/api/v1/tasks/:taskId/trigger` +## Abort Signals + +**Never use `request.signal`** for detecting client disconnects. It is broken due to a Node.js bug ([nodejs/node#55428](https://github.com/nodejs/node/issues/55428)) where the AbortSignal chain is severed when Remix internally clones the Request object. 
Instead, use `getRequestAbortSignal()` from `app/services/httpAsyncStorage.server.ts`, which is wired directly to Express `res.on("close")` and fires reliably. + +```typescript +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; + +// In route handlers, SSE streams, or any server-side code: +const signal = getRequestAbortSignal(); +``` + ## Environment Variables Access via `env` export from `app/env.server.ts`. **Never use `process.env` directly.** diff --git a/apps/webapp/app/clientBeforeFirstRender.ts b/apps/webapp/app/clientBeforeFirstRender.ts new file mode 100644 index 00000000000..3275c54423a --- /dev/null +++ b/apps/webapp/app/clientBeforeFirstRender.ts @@ -0,0 +1,38 @@ +/** + * Runs once on the client, synchronously, before React hydrates the app. + * Reserved for housekeeping that must happen before any component mounts. + */ +export function clientBeforeFirstRender() { + cleanupLegacyResizablePanelStorage(); +} + +/** + * Earlier versions of the resizable panel library wrote a per-session + * localStorage entry for every PanelGroup, including ones without an + * `autosaveId`. The keys look like `panel-group-react-aria-::` + * and accumulate without bound across sessions until they exhaust the + * ~5 MB origin quota and break subsequent `setItem` calls. + * + * The library no longer behaves this way, but existing users still carry + * the residue. Evict it (plus the orphaned `panel-run-parent-v2` key from + * the v2→v3 autosaveId bump) once on load. 
+ */ +function cleanupLegacyResizablePanelStorage() { + try { + const toRemove: string[] = []; + for (let i = 0; i < window.localStorage.length; i++) { + const key = window.localStorage.key(i); + if ( + key && + (key.startsWith("panel-group-react-aria") || key === "panel-run-parent-v2") + ) { + toRemove.push(key); + } + } + for (const key of toRemove) { + window.localStorage.removeItem(key); + } + } catch { + // localStorage may be disabled (private browsing, security policy) + } +} diff --git a/apps/webapp/app/components/BlankStatePanels.tsx b/apps/webapp/app/components/BlankStatePanels.tsx index fe39f6785c5..9a4884e09d3 100644 --- a/apps/webapp/app/components/BlankStatePanels.tsx +++ b/apps/webapp/app/components/BlankStatePanels.tsx @@ -1,4 +1,5 @@ import { + ArrowsRightLeftIcon, BeakerIcon, BellAlertIcon, BookOpenIcon, @@ -189,6 +190,28 @@ export function BatchesNone() { ); } +export function SessionsNone() { + return ( + + Sessions docs + + } + > + + You have no sessions in this environment. Sessions are durable, typed, bidirectional I/O + primitives that outlive a single run — used by chat.agent and any + long-running task that needs streaming input and output. + + + ); +} + export function TestHasNoTasks() { const organization = useOrganization(); const project = useProject(); diff --git a/apps/webapp/app/components/BulkActionFilterSummary.tsx b/apps/webapp/app/components/BulkActionFilterSummary.tsx index a230e70b346..a2eabc879de 100644 --- a/apps/webapp/app/components/BulkActionFilterSummary.tsx +++ b/apps/webapp/app/components/BulkActionFilterSummary.tsx @@ -215,6 +215,19 @@ export function BulkActionFilterSummary({ /> ); } + case "regions": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } case "machines": { const values = Array.isArray(value) ? value : [`${value}`]; return ( @@ -240,6 +253,19 @@ export function BulkActionFilterSummary({ /> ); } + case "sources": { + const values = Array.isArray(value) ? 
value : [`${value}`]; + return ( + + ); + } default: { assertNever(typedKey); } diff --git a/apps/webapp/app/components/DefinitionTooltip.tsx b/apps/webapp/app/components/DefinitionTooltip.tsx index 5bb3a713997..d91cce92c99 100644 --- a/apps/webapp/app/components/DefinitionTooltip.tsx +++ b/apps/webapp/app/components/DefinitionTooltip.tsx @@ -6,14 +6,16 @@ export function DefinitionTip({ content, children, title, + disableHoverableContent = true, }: { content: React.ReactNode; children: React.ReactNode; title: React.ReactNode; + disableHoverableContent?: boolean; }) { return ( - + {children} diff --git a/apps/webapp/app/components/Feedback.tsx b/apps/webapp/app/components/Feedback.tsx index ecfd4e88c9a..0848359e219 100644 --- a/apps/webapp/app/components/Feedback.tsx +++ b/apps/webapp/app/components/Feedback.tsx @@ -1,10 +1,10 @@ import { conform, useForm } from "@conform-to/react"; import { parse } from "@conform-to/zod"; import { InformationCircleIcon, ArrowUpCircleIcon } from "@heroicons/react/20/solid"; -import { EnvelopeIcon } from "@heroicons/react/24/solid"; +import { EnvelopeIcon, ShieldCheckIcon } from "@heroicons/react/24/solid"; import { Form, useActionData, useLocation, useNavigation, useSearchParams } from "@remix-run/react"; import { type ReactNode, useEffect, useState } from "react"; -import { type FeedbackType, feedbackTypeLabel, schema } from "~/routes/resources.feedback"; +import { type FeedbackType, feedbackTypes, schema } from "~/routes/resources.feedback"; import { Button } from "./primitives/Buttons"; import { Dialog, DialogContent, DialogHeader, DialogTrigger } from "./primitives/Dialog"; import { Fieldset } from "./primitives/Fieldset"; @@ -84,9 +84,12 @@ export function Feedback({ button, defaultValue = "bug", onOpenChange }: Feedbac How can we help? We read every message and will respond as quickly as we can. - {!(type === "feature" || type === "help" || type === "concurrency") && ( -

- )} + {!( + type === "feature" || + type === "help" || + type === "concurrency" || + type === "hipaa" + ) &&

}

@@ -132,6 +135,19 @@ export function Feedback({ button, defaultValue = "bug", onOpenChange }: Feedbac )} + {type === "hipaa" && ( + + + We offer a signed Business Associate Agreement (BAA) as a paid add-on on any + paid plan. To help us get back to you quickly, please include your company + name, and a brief description of the PHI workload you plan to run. + + + )} diff --git a/apps/webapp/app/components/MachineLabelCombo.tsx b/apps/webapp/app/components/MachineLabelCombo.tsx index 3d22ca527d0..485f6094cf0 100644 --- a/apps/webapp/app/components/MachineLabelCombo.tsx +++ b/apps/webapp/app/components/MachineLabelCombo.tsx @@ -31,7 +31,9 @@ export function MachineLabel({ className?: string; }) { return ( - {formatMachinePresetName(preset)} + + {formatMachinePresetName(preset)} + ); } diff --git a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts new file mode 100644 index 00000000000..4855c4c2465 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts @@ -0,0 +1,55 @@ +import { prisma } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { type Duration } from "~/services/rateLimiter.server"; +import { API_RATE_LIMIT_INTENT } from "./ApiRateLimitSection"; +import { + handleRateLimitAction, + resolveEffectiveRateLimit, + type RateLimitActionResult, + type RateLimitDomain, +} from "./RateLimitSection.server"; +import type { EffectiveRateLimit } from "./RateLimitSection"; + +export const apiRateLimitDomain: RateLimitDomain = { + intent: API_RATE_LIMIT_INTENT, + systemDefault: () => ({ + type: "tokenBucket", + refillRate: env.API_RATE_LIMIT_REFILL_RATE, + interval: env.API_RATE_LIMIT_REFILL_INTERVAL as Duration, + maxTokens: env.API_RATE_LIMIT_MAX, + }), + apply: async (orgId, next, adminUserId) => { + const existing = await prisma.organization.findFirst({ + where: { id: 
orgId }, + select: { apiRateLimiterConfig: true }, + }); + if (!existing) { + throw new Response(null, { status: 404 }); + } + await prisma.organization.update({ + where: { id: orgId }, + data: { apiRateLimiterConfig: next as any }, + }); + logger.info("admin.backOffice.apiRateLimit", { + adminUserId, + orgId, + previous: existing.apiRateLimiterConfig, + next, + }); + }, +}; + +export function resolveEffectiveApiRateLimit( + override: unknown +): EffectiveRateLimit { + return resolveEffectiveRateLimit(override, apiRateLimitDomain); +} + +export function handleApiRateLimitAction( + formData: FormData, + orgId: string, + adminUserId: string +): Promise { + return handleRateLimitAction(formData, orgId, adminUserId, apiRateLimitDomain); +} diff --git a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.tsx b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.tsx new file mode 100644 index 00000000000..b27956f4360 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.tsx @@ -0,0 +1,17 @@ +import { + RateLimitSection, + type RateLimitWrapperProps, +} from "./RateLimitSection"; + +export const API_RATE_LIMIT_INTENT = "set-rate-limit"; +export const API_RATE_LIMIT_SAVED_VALUE = "rate-limit"; + +export function ApiRateLimitSection(props: RateLimitWrapperProps) { + return ( + + ); +} diff --git a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts new file mode 100644 index 00000000000..83a368094a9 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts @@ -0,0 +1,55 @@ +import { prisma } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { type Duration } from "~/services/rateLimiter.server"; +import { BATCH_RATE_LIMIT_INTENT } from "./BatchRateLimitSection"; +import { + handleRateLimitAction, + 
resolveEffectiveRateLimit, + type RateLimitActionResult, + type RateLimitDomain, +} from "./RateLimitSection.server"; +import type { EffectiveRateLimit } from "./RateLimitSection"; + +export const batchRateLimitDomain: RateLimitDomain = { + intent: BATCH_RATE_LIMIT_INTENT, + systemDefault: () => ({ + type: "tokenBucket", + refillRate: env.BATCH_RATE_LIMIT_REFILL_RATE, + interval: env.BATCH_RATE_LIMIT_REFILL_INTERVAL as Duration, + maxTokens: env.BATCH_RATE_LIMIT_MAX, + }), + apply: async (orgId, next, adminUserId) => { + const existing = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { batchRateLimitConfig: true }, + }); + if (!existing) { + throw new Response(null, { status: 404 }); + } + await prisma.organization.update({ + where: { id: orgId }, + data: { batchRateLimitConfig: next as any }, + }); + logger.info("admin.backOffice.batchRateLimit", { + adminUserId, + orgId, + previous: existing.batchRateLimitConfig, + next, + }); + }, +}; + +export function resolveEffectiveBatchRateLimit( + override: unknown +): EffectiveRateLimit { + return resolveEffectiveRateLimit(override, batchRateLimitDomain); +} + +export function handleBatchRateLimitAction( + formData: FormData, + orgId: string, + adminUserId: string +): Promise { + return handleRateLimitAction(formData, orgId, adminUserId, batchRateLimitDomain); +} diff --git a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.tsx b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.tsx new file mode 100644 index 00000000000..0e52124d290 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.tsx @@ -0,0 +1,17 @@ +import { + RateLimitSection, + type RateLimitWrapperProps, +} from "./RateLimitSection"; + +export const BATCH_RATE_LIMIT_INTENT = "set-batch-rate-limit"; +export const BATCH_RATE_LIMIT_SAVED_VALUE = "batch-rate-limit"; + +export function BatchRateLimitSection(props: RateLimitWrapperProps) { + return ( + + ); +} diff --git 
a/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.server.ts b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.server.ts new file mode 100644 index 00000000000..ec27234a306 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.server.ts @@ -0,0 +1,48 @@ +import { z } from "zod"; +import { prisma } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { MAX_PROJECTS_INTENT } from "./MaxProjectsSection"; + +const SetMaxProjectsSchema = z.object({ + intent: z.literal(MAX_PROJECTS_INTENT), + // Capped at PostgreSQL INTEGER max (Prisma Int) so oversized input fails + // validation cleanly instead of crashing the update. + maximumProjectCount: z.coerce.number().int().min(1).max(2_147_483_647), +}); + +export type MaxProjectsActionResult = + | { ok: true } + | { ok: false; errors: Record }; + +export async function handleMaxProjectsAction( + formData: FormData, + orgId: string, + adminUserId: string +): Promise { + const submission = SetMaxProjectsSchema.safeParse(Object.fromEntries(formData)); + if (!submission.success) { + return { ok: false, errors: submission.error.flatten().fieldErrors }; + } + + const existing = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { maximumProjectCount: true }, + }); + if (!existing) { + throw new Response(null, { status: 404 }); + } + + await prisma.organization.update({ + where: { id: orgId }, + data: { maximumProjectCount: submission.data.maximumProjectCount }, + }); + + logger.info("admin.backOffice.maxProjects", { + adminUserId, + orgId, + previous: existing.maximumProjectCount, + next: submission.data.maximumProjectCount, + }); + + return { ok: true }; +} diff --git a/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.tsx b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.tsx new file mode 100644 index 00000000000..bf8ecf83161 --- /dev/null +++ 
b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.tsx @@ -0,0 +1,115 @@ +import { Form } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import { Button } from "~/components/primitives/Buttons"; +import { FormError } from "~/components/primitives/FormError"; +import { Header2 } from "~/components/primitives/Headers"; +import { Input } from "~/components/primitives/Input"; +import { Label } from "~/components/primitives/Label"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import * as Property from "~/components/primitives/PropertyTable"; + +export const MAX_PROJECTS_INTENT = "set-max-projects"; +export const MAX_PROJECTS_SAVED_VALUE = "max-projects"; + +type FieldErrors = Record | null; + +type Props = { + maximumProjectCount: number; + errors: FieldErrors; + savedJustNow: boolean; + isSubmitting: boolean; +}; + +export function MaxProjectsSection({ + maximumProjectCount, + errors, + savedJustNow, + isSubmitting, +}: Props) { + const hasFieldErrors = !!errors && Object.keys(errors).length > 0; + const fieldError = (field: string) => + errors && field in errors ? errors[field]?.[0] : undefined; + + const [isEditing, setIsEditing] = useState(hasFieldErrors); + const [value, setValue] = useState(String(maximumProjectCount)); + + useEffect(() => { + if (hasFieldErrors) setIsEditing(true); + }, [hasFieldErrors]); + + useEffect(() => { + if (savedJustNow && !hasFieldErrors) setIsEditing(false); + }, [savedJustNow, hasFieldErrors]); + + return ( +

+ Maximum projects + {!isEditing && ( + + )} +

+ + {savedJustNow && ( +

+ + Saved. + +

+ )} + + {!isEditing ? ( + + + Limit + + {maximumProjectCount.toLocaleString()} + + + + ) : ( + + +

+ Maximum projects + setValue(e.target.value)} + required + /> + {fieldError("maximumProjectCount")} +

+ + +

+ + )} +

+ ); +} diff --git a/apps/webapp/app/components/admin/backOffice/RateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/RateLimitSection.server.ts new file mode 100644 index 00000000000..799fc3605df --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/RateLimitSection.server.ts @@ -0,0 +1,76 @@ +import { z } from "zod"; +import { + RateLimitTokenBucketConfig, + RateLimiterConfig, +} from "~/services/authorizationRateLimitMiddleware.server"; +import { + parseDurationToMs, + type EffectiveRateLimit, +} from "./RateLimitSection"; + +export type RateLimitDomain = { + intent: string; + systemDefault: () => RateLimiterConfig; + apply: ( + orgId: string, + next: RateLimitTokenBucketConfig, + adminUserId: string + ) => Promise; +}; + +export function resolveEffectiveRateLimit( + override: unknown, + domain: RateLimitDomain +): EffectiveRateLimit { + if (override == null) { + return { source: "default", config: domain.systemDefault() }; + } + const parsed = RateLimiterConfig.safeParse(override); + if (parsed.success) { + return { source: "override", config: parsed.data }; + } + // Column holds malformed JSON — fall back silently. Admin must investigate + // at the DB level; this UI can't recover it. 
+ return { source: "default", config: domain.systemDefault() }; +} + +export type RateLimitActionResult = + | { ok: true } + | { ok: false; errors: Record }; + +export async function handleRateLimitAction( + formData: FormData, + orgId: string, + adminUserId: string, + domain: RateLimitDomain +): Promise { + const schema = z.object({ + intent: z.literal(domain.intent), + refillRate: z.coerce.number().int().min(1), + interval: z + .string() + .trim() + .refine((v) => parseDurationToMs(v) > 0, { + message: "Must be a duration like 10s, 1m, 500ms.", + }), + maxTokens: z.coerce.number().int().min(1), + }); + + const submission = schema.safeParse(Object.fromEntries(formData)); + if (!submission.success) { + return { ok: false, errors: submission.error.flatten().fieldErrors }; + } + + const built = RateLimitTokenBucketConfig.safeParse({ + type: "tokenBucket", + refillRate: submission.data.refillRate, + interval: submission.data.interval, + maxTokens: submission.data.maxTokens, + }); + if (!built.success) { + return { ok: false, errors: built.error.flatten().fieldErrors }; + } + + await domain.apply(orgId, built.data, adminUserId); + return { ok: true }; +} diff --git a/apps/webapp/app/components/admin/backOffice/RateLimitSection.tsx b/apps/webapp/app/components/admin/backOffice/RateLimitSection.tsx new file mode 100644 index 00000000000..1af8abab3d9 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/RateLimitSection.tsx @@ -0,0 +1,306 @@ +import { Form } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import { Button } from "~/components/primitives/Buttons"; +import { FormError } from "~/components/primitives/FormError"; +import { Header2 } from "~/components/primitives/Headers"; +import { Input } from "~/components/primitives/Input"; +import { Label } from "~/components/primitives/Label"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import * as Property from "~/components/primitives/PropertyTable"; + +// Local 
shape mirrors the server-side discriminated union just enough for this +// view. Decoupled from the .server module so the component stays client-safe. +// Duration fields are always suffixed strings — the server's DurationSchema +// rejects anything else, so non-string overrides fall back to the default. +export type RateLimitConfig = + | { + type: "tokenBucket"; + refillRate: number; + interval: string; + maxTokens: number; + } + | { + type: "fixedWindow" | "slidingWindow"; + window: string; + tokens: number; + }; + +export type EffectiveRateLimit = { + source: "override" | "default"; + config: RateLimitConfig; +}; + +export type FieldErrors = Record | null; + +// Props shared by every per-domain wrapper (Api / Batch / future ones). +export type RateLimitWrapperProps = { + effective: EffectiveRateLimit; + errors: FieldErrors; + savedJustNow: boolean; + isSubmitting: boolean; +}; + +type Props = RateLimitWrapperProps & { + title: string; + intent: string; +}; + +export function RateLimitSection({ + title, + intent, + effective, + errors, + savedJustNow, + isSubmitting, +}: Props) { + const hasFieldErrors = !!errors && Object.keys(errors).length > 0; + const fieldError = (field: string) => + errors && field in errors ? errors[field]?.[0] : undefined; + + const current = + effective.config.type === "tokenBucket" ? effective.config : null; + + const [isEditing, setIsEditing] = useState(hasFieldErrors); + const [refillRate, setRefillRate] = useState( + current ? String(current.refillRate) : "" + ); + const [intervalStr, setIntervalStr] = useState( + current ? String(current.interval) : "" + ); + const [maxTokens, setMaxTokens] = useState( + current ? String(current.maxTokens) : "" + ); + + useEffect(() => { + if (hasFieldErrors) setIsEditing(true); + }, [hasFieldErrors]); + + useEffect(() => { + if (savedJustNow && !hasFieldErrors) setIsEditing(false); + }, [savedJustNow, hasFieldErrors]); + + const currentDescription = current + ? 
describeRateLimit( + current.refillRate, + parseDurationToMs(String(current.interval)), + current.maxTokens + ) + : null; + + const previewDescription = describeRateLimit( + Number(refillRate) || 0, + parseDurationToMs(intervalStr), + Number(maxTokens) || 0 + ); + + const cancelEdit = () => { + setRefillRate(current ? String(current.refillRate) : ""); + setIntervalStr(current ? String(current.interval) : ""); + setMaxTokens(current ? String(current.maxTokens) : ""); + setIsEditing(false); + }; + + return ( +

+ {title} + {!isEditing && ( + + )} +

+ + {savedJustNow && ( +

+ + Saved. + +

+ )} + + + Status:{" "} + {effective.source === "override" + ? "Custom override active." + : "Using system default."} + + + {!isEditing ? ( + <> + + {effective.config.type === "tokenBucket" ? ( + currentDescription ? ( + <> + + Sustained rate + + {currentDescription.sustained} + + + + Burst allowance + {currentDescription.burst} + + + ) : ( + + + Invalid interval on the stored config. + + + ) + ) : ( + <> + + Type + {effective.config.type} + + + Window + {String(effective.config.window)} + + + Tokens + + {effective.config.tokens.toLocaleString()} + + + + )} + + {effective.config.type !== "tokenBucket" && ( + + This override is a {effective.config.type} limit and can't be + edited from this form. Change it in the database directly. + + )} + + ) : ( +

+ + +

+ Refill rate (tokens per interval) + setRefillRate(e.target.value)} + required + /> + {fieldError("refillRate")} +

+ +

+ Interval (e.g. 10s, 1m) + setIntervalStr(e.target.value)} + required + /> + {fieldError("interval")} +

+ +

+ Max tokens (burst allowance) + setMaxTokens(e.target.value)} + required + /> + {fieldError("maxTokens")} +

+ + + {previewDescription + ? `Preview: ${previewDescription.sustained} · ${previewDescription.burst}.` + : "Preview: enter valid values to see the effective limit."} + + +

+ + +

+ )} +

+ ); +} + +export function parseDurationToMs(duration: string): number { + const match = duration.trim().match(/^(\d+)\s*(ms|s|m|h|d)$/); + if (!match) return 0; + const value = parseInt(match[1], 10); + switch (match[2]) { + case "ms": + return value; + case "s": + return value * 1_000; + case "m": + return value * 60_000; + case "h": + return value * 3_600_000; + case "d": + return value * 86_400_000; + default: + return 0; + } +} + +function describeRateLimit( + refillRate: number, + intervalMs: number, + maxTokens: number +): { sustained: string; burst: string } | null { + if (refillRate <= 0 || intervalMs <= 0 || maxTokens <= 0) return null; + const perMin = (refillRate * 60_000) / intervalMs; + let sustained: string; + if (perMin >= 1) { + sustained = `${formatRateValue(perMin)} requests per minute`; + } else { + const perHour = perMin * 60; + if (perHour >= 1) { + sustained = `${formatRateValue(perHour)} requests per hour`; + } else { + const perDay = perHour * 24; + sustained = `${formatRateValue(perDay)} requests per day`; + } + } + return { + sustained, + burst: `${maxTokens.toLocaleString()} request burst allowance`, + }; +} + +function formatRateValue(value: number): string { + return value >= 10 ? 
Math.round(value).toLocaleString() : value.toFixed(1); +} diff --git a/apps/webapp/app/components/code/AIQueryInput.tsx b/apps/webapp/app/components/code/AIQueryInput.tsx index 0775ec2c2a0..cd5e9db3bd8 100644 --- a/apps/webapp/app/components/code/AIQueryInput.tsx +++ b/apps/webapp/app/components/code/AIQueryInput.tsx @@ -1,25 +1,15 @@ import { CheckIcon, PencilSquareIcon, PlusIcon, XMarkIcon } from "@heroicons/react/20/solid"; import { AnimatePresence, motion } from "framer-motion"; -import { Suspense, lazy, useCallback, useEffect, useRef, useState } from "react"; +import { Suspense, useCallback, useEffect, useRef, useState } from "react"; import { Button } from "~/components/primitives/Buttons"; import { Spinner } from "~/components/primitives/Spinner"; +import { StreamdownRenderer } from "~/components/code/StreamdownRenderer"; import { useEnvironment } from "~/hooks/useEnvironment"; import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; import type { AITimeFilter } from "~/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.query/types"; import { cn } from "~/utils/cn"; -// Lazy load streamdown components to avoid SSR issues -const StreamdownRenderer = lazy(() => - import("streamdown").then((mod) => ({ - default: ({ children, isAnimating }: { children: string; isAnimating: boolean }) => ( - - {children} - - ), - })) -); - type StreamEventType = | { type: "thinking"; content: string } | { type: "tool_call"; tool: string; args: unknown } diff --git a/apps/webapp/app/components/code/StreamdownRenderer.tsx b/apps/webapp/app/components/code/StreamdownRenderer.tsx new file mode 100644 index 00000000000..996234ab180 --- /dev/null +++ b/apps/webapp/app/components/code/StreamdownRenderer.tsx @@ -0,0 +1,29 @@ +import { lazy } from "react"; +import type { CodeHighlighterPlugin } from "streamdown"; + +export const StreamdownRenderer = lazy(() => + Promise.all([import("streamdown"), 
import("@streamdown/code"), import("./shikiTheme")]).then( + ([{ Streamdown }, { createCodePlugin }, { triggerDarkTheme }]) => { + // Type assertion needed: @streamdown/code and streamdown resolve different shiki + // versions under pnpm, so the two otherwise-equivalent CodeHighlighterPlugin types + // are considered incompatible (their BundledLanguage string unions differ). + const codePlugin = createCodePlugin({ + themes: [triggerDarkTheme, triggerDarkTheme], + }) as unknown as CodeHighlighterPlugin; + + return { + default: ({ + children, + isAnimating = false, + }: { + children: string; + isAnimating?: boolean; + }) => ( + + {children} + + ), + }; + } + ) +); diff --git a/apps/webapp/app/components/code/shikiTheme.ts b/apps/webapp/app/components/code/shikiTheme.ts new file mode 100644 index 00000000000..5d47155b979 --- /dev/null +++ b/apps/webapp/app/components/code/shikiTheme.ts @@ -0,0 +1,222 @@ +import type { ThemeRegistrationAny } from "streamdown"; + +// Custom Shiki theme matching the Trigger.dev VS Code dark theme. +// Colors taken directly from the VS Code extension's tokenColors. 
+export const triggerDarkTheme: ThemeRegistrationAny = { + name: "trigger-dark", + type: "dark", + colors: { + "editor.background": "#212327", + "editor.foreground": "#878C99", + "editorLineNumber.foreground": "#484c54", + }, + tokenColors: [ + // Control flow keywords: pink-purple + { + scope: [ + "keyword.control", + "keyword.operator.delete", + "keyword.other.using", + "keyword.other.operator", + "entity.name.operator", + ], + settings: { foreground: "#E888F8" }, + }, + // Storage type (const, let, var, function, class): purple + { + scope: "storage.type", + settings: { foreground: "#8271ED" }, + }, + // Storage modifiers (async, export, etc.): purple + { + scope: ["storage.modifier", "keyword.operator.noexcept"], + settings: { foreground: "#8271ED" }, + }, + // Keyword operator expressions (new, typeof, instanceof, etc.): purple + { + scope: [ + "keyword.operator.new", + "keyword.operator.expression", + "keyword.operator.cast", + "keyword.operator.sizeof", + "keyword.operator.instanceof", + "keyword.operator.logical.python", + "keyword.operator.wordlike", + ], + settings: { foreground: "#8271ED" }, + }, + // Types and namespaces: hot pink + { + scope: [ + "support.class", + "support.type", + "entity.name.type", + "entity.name.namespace", + "entity.name.scope-resolution", + "entity.name.class", + "entity.other.inherited-class", + ], + settings: { foreground: "#F770C6" }, + }, + // Functions: lime/yellow-green + { + scope: ["entity.name.function", "support.function"], + settings: { foreground: "#D9F07C" }, + }, + // Variables and parameters: light lavender + { + scope: [ + "variable", + "meta.definition.variable.name", + "support.variable", + "entity.name.variable", + "constant.other.placeholder", + ], + settings: { foreground: "#CCCBFF" }, + }, + // Constants and enums: medium purple + { + scope: ["variable.other.constant", "variable.other.enummember"], + settings: { foreground: "#9C9AF2" }, + }, + // this/self: purple-blue + { + scope: "variable.language", + 
settings: { foreground: "#9B99FF" }, + }, + // Object literal keys: medium purple-blue + { + scope: "meta.object-literal.key", + settings: { foreground: "#8B89FF" }, + }, + // Strings: sage green + { + scope: ["string", "meta.embedded.assembly"], + settings: { foreground: "#AFEC73" }, + }, + // String interpolation punctuation: blue-purple + { + scope: [ + "punctuation.definition.template-expression.begin", + "punctuation.definition.template-expression.end", + "punctuation.section.embedded", + ], + settings: { foreground: "#7A78EA" }, + }, + // Template expression reset + { + scope: "meta.template.expression", + settings: { foreground: "#d4d4d4" }, + }, + // Operators: gray (same as foreground) + { + scope: "keyword.operator", + settings: { foreground: "#878C99" }, + }, + // Comments: olive gray + { + scope: "comment", + settings: { foreground: "#6f736d" }, + }, + // Language constants (true, false, null, undefined): purple-blue + { + scope: "constant.language", + settings: { foreground: "#9B99FF" }, + }, + // Numeric constants: light green + { + scope: [ + "constant.numeric", + "keyword.operator.plus.exponent", + "keyword.operator.minus.exponent", + ], + settings: { foreground: "#b5cea8" }, + }, + // Regex constants: muted indigo + { + scope: "constant.regexp", + settings: { foreground: "#646695" }, + }, + // HTML/JSX tags: purple-blue + { + scope: "entity.name.tag", + settings: { foreground: "#9B99FF" }, + }, + // Tag brackets: dark gray + { + scope: "punctuation.definition.tag", + settings: { foreground: "#5F6570" }, + }, + // HTML/JSX attributes: light purple + { + scope: "entity.other.attribute-name", + settings: { foreground: "#C39EFF" }, + }, + // Escape characters: gold + { + scope: "constant.character.escape", + settings: { foreground: "#d7ba7d" }, + }, + // Regex string: dark red + { + scope: "string.regexp", + settings: { foreground: "#d16969" }, + }, + // Storage: purple-blue + { + scope: "storage", + settings: { foreground: "#9B99FF" }, + }, + // TS-specific: type 
casts, math/dom/json constants + { + scope: [ + "meta.type.cast.expr", + "meta.type.new.expr", + "support.constant.math", + "support.constant.dom", + "support.constant.json", + ], + settings: { foreground: "#9B99FF" }, + }, + // Markdown headings: purple-blue bold + { + scope: "markup.heading", + settings: { foreground: "#9B99FF", fontStyle: "bold" }, + }, + // Markup bold: purple-blue + { + scope: "markup.bold", + settings: { foreground: "#9B99FF", fontStyle: "bold" }, + }, + // Markup inline raw: sage green + { + scope: "markup.inline.raw", + settings: { foreground: "#AFEC73" }, + }, + // Markup inserted: light green + { + scope: "markup.inserted", + settings: { foreground: "#b5cea8" }, + }, + // Markup deleted: sage green + { + scope: "markup.deleted", + settings: { foreground: "#AFEC73" }, + }, + // Markup changed: purple-blue + { + scope: "markup.changed", + settings: { foreground: "#9B99FF" }, + }, + // Invalid: red + { + scope: "invalid", + settings: { foreground: "#f44747" }, + }, + // JSX text content + { + scope: ["meta.jsx.children"], + settings: { foreground: "#D7D9DD" }, + }, + ], +}; diff --git a/apps/webapp/app/components/environments/RegenerateApiKeyModal.tsx b/apps/webapp/app/components/environments/RegenerateApiKeyModal.tsx index 439fd892f91..52e1f499cbe 100644 --- a/apps/webapp/app/components/environments/RegenerateApiKeyModal.tsx +++ b/apps/webapp/app/components/environments/RegenerateApiKeyModal.tsx @@ -75,8 +75,9 @@ const RegenerateApiKeyModalContent = ({ return (

- {`Regenerating the keys for this environment will temporarily break any live tasks in the - ${title} environment until the new API keys are set in the relevant environment variables.`} + {`A new API key will be issued for the ${title} environment. The previous key stays valid + for 24 hours so you can roll out the new key in your environment variables without downtime. + After 24 hours, the previous key stops working.`} Select a Slack channel} heading="Filter channels…" - defaultValue={selectedSlackChannelValue} + value={selectedSlackChannelValue ?? ""} dropdownIcon variant="tertiary/medium" items={slack.channels} @@ -218,6 +218,15 @@ export function ConfigureErrorAlerts({ > {(matches) => ( <> + +

+ + No channel +

+ {matches?.map((channel) => ( >; autoPromote?: boolean; onAutoPromoteChange?: (value: boolean) => void; + /** The currently pinned TRIGGER_VERSION on Vercel production, if any. Shown under the + * Atomic deployments toggle so the user knows what version is set on Vercel right now. */ + currentTriggerVersion?: string | null; + /** True when the Vercel lookup for TRIGGER_VERSION failed. We show this so the user knows + * the pin status is unknown — distinct from "not set". */ + currentTriggerVersionFetchFailed?: boolean; + /** Hide the section-level master toggles for "Pull env vars" and "Discover new env vars". */ + hideSectionToggles?: boolean; }; export function BuildSettingsFields({ @@ -37,6 +45,9 @@ export function BuildSettingsFields({ disabledEnvSlugs, autoPromote, onAutoPromoteChange, + currentTriggerVersion, + currentTriggerVersionFetchFailed, + hideSectionToggles, }: BuildSettingsFieldsProps) { const isSlugDisabled = (slug: EnvSlug) => !!disabledEnvSlugs?.[slug]; const enabledSlugs = availableEnvSlugs.filter((s) => !isSlugDisabled(s)); @@ -48,7 +59,7 @@ export function BuildSettingsFields({

Pull env vars before build - {availableEnvSlugs.length > 1 && ( + {!hideSectionToggles && availableEnvSlugs.length > 1 && (

Discover new env vars - {availableEnvSlugs.length > 1 && ( + {!hideSectionToggles && availableEnvSlugs.length > 1 && ( . + {currentTriggerVersion && ( + + Currently pinned to{" "} + {currentTriggerVersion} in Vercel + production. + + )} + {!currentTriggerVersion && currentTriggerVersionFetchFailed && ( + + Couldn't read{" "} + TRIGGER_VERSION from Vercel — + check the Vercel dashboard to confirm the production pin. + + )}

{/* Auto promotion — only visible when atomic deployments are on */} diff --git a/apps/webapp/app/components/integrations/VercelOnboardingModal.tsx b/apps/webapp/app/components/integrations/VercelOnboardingModal.tsx index 7ff99d7d448..21734c5c038 100644 --- a/apps/webapp/app/components/integrations/VercelOnboardingModal.tsx +++ b/apps/webapp/app/components/integrations/VercelOnboardingModal.tsx @@ -600,6 +600,20 @@ export function VercelOnboardingModal({ } }, [completeOnboardingFetcher.data, completeOnboardingFetcher.state, state]); + useEffect(() => { + if (state === "github-connection" && isGitHubConnectedForOnboarding) { + trackOnboarding("vercel onboarding github completed"); + if (fromMarketplaceContext && nextUrl) { + const validUrl = safeRedirectUrl(nextUrl); + if (validUrl) { + window.location.href = validUrl; + return; + } + } + setState("completed"); + } + }, [state, isGitHubConnectedForOnboarding, fromMarketplaceContext, nextUrl, trackOnboarding]); + useEffect(() => { if (state === "completed" && !hasTrackedCompletionRef.current) { hasTrackedCompletionRef.current = true; @@ -1114,6 +1128,7 @@ export function VercelOnboardingModal({ redirectParams.set("next", nextUrl); } const redirectUrlWithContext = `${baseSettingsPath}?${redirectParams.toString()}`; + const nextDirectRedirect = nextUrl ? safeRedirectUrl(nextUrl) : null; return gitHubAppInstallations.length === 0 ? (

@@ -1137,7 +1152,10 @@ export function VercelOnboardingModal({ organizationSlug={organizationSlug} projectSlug={projectSlug} environmentSlug={environmentSlug} - redirectUrl={redirectUrlWithContext} + redirectUrl={ + nextDirectRedirect ?? + (fromMarketplaceContext ? redirectUrlWithContext : baseSettingsPath) + } preventDismiss={fromMarketplaceContext} /> diff --git a/apps/webapp/app/components/logs/LogsLevelFilter.tsx b/apps/webapp/app/components/logs/LogsLevelFilter.tsx index 947bef88fcc..c61da4d3084 100644 --- a/apps/webapp/app/components/logs/LogsLevelFilter.tsx +++ b/apps/webapp/app/components/logs/LogsLevelFilter.tsx @@ -53,7 +53,7 @@ export function LogsLevelFilter() { const hasLevels = selectedLevels.length > 0 && selectedLevels.some((v) => v !== ""); if (hasLevels) { - return ; + return ; } return ( @@ -64,19 +64,16 @@ export function LogsLevelFilter() { variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by level" + className="pl-1.5" > - Level + Level } /> ); } -function LevelDropdown({ - trigger, -}: { - trigger: ReactNode; -}) { +function LevelDropdown({ trigger }: { trigger: ReactNode }) { const { values, replace } = useSearchParams(); const handleChange = (values: string[]) => { diff --git a/apps/webapp/app/components/logs/LogsRunIdFilter.tsx b/apps/webapp/app/components/logs/LogsRunIdFilter.tsx index 857e623d7c9..e23c39534a6 100644 --- a/apps/webapp/app/components/logs/LogsRunIdFilter.tsx +++ b/apps/webapp/app/components/logs/LogsRunIdFilter.tsx @@ -6,11 +6,7 @@ import { Button } from "~/components/primitives/Buttons"; import { FormError } from "~/components/primitives/FormError"; import { Input } from "~/components/primitives/Input"; import { Label } from "~/components/primitives/Label"; -import { - SelectPopover, - SelectProvider, - SelectTrigger, -} from "~/components/primitives/Select"; +import { SelectPopover, SelectProvider, SelectTrigger } from "~/components/primitives/Select"; import { useSearchParams } from 
"~/hooks/useSearchParam"; import { FilterMenuProvider } from "~/components/runs/v3/SharedFilters"; @@ -34,8 +30,9 @@ export function LogsRunIdFilter() { variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by run ID" + className="pl-1.5" > - Run ID + Run ID } clearSearchValue={() => setSearch("")} diff --git a/apps/webapp/app/components/logs/LogsTaskFilter.tsx b/apps/webapp/app/components/logs/LogsTaskFilter.tsx index fa64eff7bd3..6c15464cc49 100644 --- a/apps/webapp/app/components/logs/LogsTaskFilter.tsx +++ b/apps/webapp/app/components/logs/LogsTaskFilter.tsx @@ -4,6 +4,8 @@ import { useMemo } from "react"; import * as Ariakit from "@ariakit/react"; import { ComboBox, + SelectGroup, + SelectGroupLabel, SelectItem, SelectList, SelectPopover, @@ -21,6 +23,7 @@ const shortcut = { key: "t" }; type TaskOption = { slug: string; triggerSource: TaskTriggerSource; + isInLatestDeployment: boolean; }; interface LogsTaskFilterProps { @@ -42,8 +45,9 @@ export function LogsTaskFilter({ possibleTasks }: LogsTaskFilterProps) { variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by task" + className="pl-1.5" > - Tasks + Tasks } searchValue={search} @@ -126,17 +130,44 @@ function TasksDropdown({ > - {filtered.map((item, index) => ( - - } - > - {item.slug} - - ))} + {filtered + .filter((item) => item.isInLatestDeployment) + .map((item) => ( + + } + > + {item.slug} + + ))} + {filtered.some((item) => !item.isInLatestDeployment) && ( + + Archived + {filtered + .filter((item) => !item.isInLatestDeployment) + .map((item) => ( + + + + } + > + {item.slug} + + ))} + + )} diff --git a/apps/webapp/app/components/logs/LogsVersionFilter.tsx b/apps/webapp/app/components/logs/LogsVersionFilter.tsx index 4cc10545060..a5a83f6eda4 100644 --- a/apps/webapp/app/components/logs/LogsVersionFilter.tsx +++ b/apps/webapp/app/components/logs/LogsVersionFilter.tsx @@ -22,8 +22,9 @@ export function LogsVersionFilter() { variant="secondary/small" shortcut={shortcut} 
tooltipTitle="Filter by version" + className="pl-1.5" > - Versions + Versions } searchValue={search} diff --git a/apps/webapp/app/components/metrics/ModelsFilter.tsx b/apps/webapp/app/components/metrics/ModelsFilter.tsx index e641f826ae3..9b330834c84 100644 --- a/apps/webapp/app/components/metrics/ModelsFilter.tsx +++ b/apps/webapp/app/components/metrics/ModelsFilter.tsx @@ -16,7 +16,7 @@ import { tablerIcons } from "~/utils/tablerIcons"; import tablerSpritePath from "~/components/primitives/tabler-sprite.svg"; import { AnthropicLogoIcon } from "~/assets/icons/AnthropicLogoIcon"; -const shortcut = { key: "l" }; +const shortcut = { key: "m" }; export type ModelOption = { model: string; @@ -38,19 +38,19 @@ function modelIcon(system: string, model: string): ReactNode { // Special case: Anthropic uses a custom SVG icon if (provider === "anthropic") { - return ; + return ; } const iconName = `tabler-brand-${provider}`; if (tablerIcons.has(iconName)) { return ( - Models + Models } searchValue={search} @@ -147,7 +148,7 @@ function ModelsDropdown({ {filtered.map((m) => ( - + {m.model} ))} diff --git a/apps/webapp/app/components/metrics/OperationsFilter.tsx b/apps/webapp/app/components/metrics/OperationsFilter.tsx index 679332fc3c4..679e73ccb7f 100644 --- a/apps/webapp/app/components/metrics/OperationsFilter.tsx +++ b/apps/webapp/app/components/metrics/OperationsFilter.tsx @@ -13,7 +13,7 @@ import { import { useSearchParams } from "~/hooks/useSearchParam"; import { appliedSummary, FilterMenuProvider } from "~/components/runs/v3/SharedFilters"; -const shortcut = { key: "n" }; +const shortcut = { key: "o" }; interface OperationsFilterProps { possibleOperations: string[]; @@ -45,8 +45,9 @@ export function OperationsFilter({ possibleOperations }: OperationsFilterProps) variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by operation" + className="pl-1.5" > - Operations + Operations } searchValue={search} @@ -125,7 +126,7 @@ function OperationsDropdown({ 
{filtered.map((op) => ( - }> + }> {formatOperation(op)} ))} diff --git a/apps/webapp/app/components/metrics/PromptsFilter.tsx b/apps/webapp/app/components/metrics/PromptsFilter.tsx index a4ad8a00045..09a91f4f1fd 100644 --- a/apps/webapp/app/components/metrics/PromptsFilter.tsx +++ b/apps/webapp/app/components/metrics/PromptsFilter.tsx @@ -34,8 +34,9 @@ export function PromptsFilter({ possiblePrompts }: PromptsFilterProps) { variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by prompt" + className="pl-1.5" > - Prompts + Prompts } searchValue={search} @@ -113,7 +114,7 @@ function PromptsDropdown({ {filtered.map((slug) => ( - }> + }> {slug} ))} diff --git a/apps/webapp/app/components/metrics/ProvidersFilter.tsx b/apps/webapp/app/components/metrics/ProvidersFilter.tsx index fe018eefb98..d22bec8f70b 100644 --- a/apps/webapp/app/components/metrics/ProvidersFilter.tsx +++ b/apps/webapp/app/components/metrics/ProvidersFilter.tsx @@ -34,8 +34,9 @@ export function ProvidersFilter({ possibleProviders }: ProvidersFilterProps) { variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by provider" + className="pl-1.5" > - Providers + Providers } searchValue={search} @@ -111,7 +112,7 @@ function ProvidersDropdown({ {filtered.map((provider) => ( - }> + }> {provider} ))} diff --git a/apps/webapp/app/components/metrics/QueuesFilter.tsx b/apps/webapp/app/components/metrics/QueuesFilter.tsx index 87d7a612547..3da71e0c7d0 100644 --- a/apps/webapp/app/components/metrics/QueuesFilter.tsx +++ b/apps/webapp/app/components/metrics/QueuesFilter.tsx @@ -39,6 +39,7 @@ export function QueuesFilter() { variant="secondary/small" shortcut={shortcut} tooltipTitle="Filter by queue" + className="pl-1.5" > Queues @@ -190,6 +191,7 @@ function QueuesDropdown({ diff --git a/apps/webapp/app/components/metrics/ScopeFilter.tsx b/apps/webapp/app/components/metrics/ScopeFilter.tsx index 1bf6b685676..0cdaa4adb32 100644 --- a/apps/webapp/app/components/metrics/ScopeFilter.tsx +++ 
b/apps/webapp/app/components/metrics/ScopeFilter.tsx @@ -1,14 +1,17 @@ import * as Ariakit from "@ariakit/react"; -import { EnvironmentLabel } from "~/components/environments/EnvironmentLabel"; +import { FolderIcon } from "@heroicons/react/20/solid"; +import { useRef } from "react"; +import { EnvironmentIcon, EnvironmentLabel } from "~/components/environments/EnvironmentLabel"; import { AppliedFilter } from "~/components/primitives/AppliedFilter"; +import { Avatar } from "~/components/primitives/Avatar"; import { SelectItem, SelectPopover, SelectProvider } from "~/components/primitives/Select"; +import { ShortcutKey } from "~/components/primitives/ShortcutKey"; import { useEnvironment } from "~/hooks/useEnvironment"; import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; import { useSearchParams } from "~/hooks/useSearchParam"; +import { type ShortcutDefinition, useShortcutKeys } from "~/hooks/useShortcutKeys"; import type { QueryScope } from "~/services/queryService.server"; -import { CubeTransparentIcon, GlobeAltIcon } from "@heroicons/react/20/solid"; -import { IconListLetters } from "@tabler/icons-react"; const scopeOptions = [ { value: "environment", label: "Environment" }, @@ -16,29 +19,76 @@ const scopeOptions = [ { value: "organization", label: "Organization" }, ] as const; -export function ScopeFilter() { - const { value, replace } = useSearchParams(); - const scope = (value("scope") as QueryScope) ?? "environment"; +export type ScopeFilterProps = { + shortcut?: ShortcutDefinition; + /** Controlled value. If provided, the filter uses controlled mode and ignores search params. */ + value?: QueryScope; + /** Called when the user selects a new scope. Required when `value` is provided. 
*/ + onValueChange?: (scope: QueryScope) => void; +}; + +export function ScopeFilter({ shortcut, value, onValueChange }: ScopeFilterProps = {}) { + const { value: paramValue, replace } = useSearchParams(); + const isControlled = value !== undefined; + const scope: QueryScope = isControlled + ? value + : ((paramValue("scope") as QueryScope) ?? "environment"); + const triggerRef = useRef(null); const handleChange = (newScope: string) => { + if (isControlled) { + onValueChange?.(newScope as QueryScope); + return; + } replace({ scope: newScope === "environment" ? undefined : newScope }); }; + useShortcutKeys({ + shortcut, + action: (e) => { + e.preventDefault(); + e.stopPropagation(); + triggerRef.current?.click(); + }, + disabled: !shortcut, + }); + return ( - }> - } - value={} - removable={false} - variant="secondary/small" - /> - + + } + /> + } + > + } + removable={false} + variant="secondary/small" + /> + + {shortcut && ( + +

+ Change scope + +

+ + )} + {scopeOptions.map((option) => ( - - + } + > + ))} @@ -46,19 +96,44 @@ export function ScopeFilter() { ); } -function ScopeItem({ scope }: { scope: QueryScope }) { +function ScopeIcon({ scope }: { scope: QueryScope }) { + const organization = useOrganization(); + const environment = useEnvironment(); + + switch (scope) { + case "organization": + return ; + case "project": + return ; + case "environment": + return ; + default: + return null; + } +} + +function ScopeLabel({ scope }: { scope: QueryScope }) { const organization = useOrganization(); const project = useProject(); const environment = useEnvironment(); switch (scope) { case "organization": - return `Org: ${organization.title}`; + return {organization.title}; case "project": - return `Project: ${project.name}`; + return {project.name}; case "environment": - return ; + return ; default: return scope; } } + +function ScopeItem({ scope }: { scope: QueryScope }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/components/navigation/DashboardDialogs.tsx b/apps/webapp/app/components/navigation/DashboardDialogs.tsx index f0cdd0406e0..6466038400c 100644 --- a/apps/webapp/app/components/navigation/DashboardDialogs.tsx +++ b/apps/webapp/app/components/navigation/DashboardDialogs.tsx @@ -4,6 +4,7 @@ import { motion } from "framer-motion"; import { ArrowUpCircleIcon } from "@heroicons/react/24/outline"; import { PlusIcon } from "@heroicons/react/20/solid"; import { useEffect, useState } from "react"; +import { type ShortcutDefinition } from "~/hooks/useShortcutKeys"; import { type MatchedOrganization, useDashboardLimits } from "~/hooks/useOrganizations"; import { useCurrentPlan } from "~/routes/_app.orgs.$organizationSlug/route"; import { Feedback } from "~/components/Feedback"; @@ -118,17 +119,19 @@ export function CreateDashboardPageButton({ organization, project, environment, + shortcut, }: { organization: { slug: string }; project: { slug: string }; environment: { slug: string }; + shortcut?: 
ShortcutDefinition; }) { const dashboard = useCreateDashboard({ organization, project, environment }); return (