# Pipeline Watchdog #251
# Note: the hosting site flagged this file as containing hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open the file in an editor that
# reveals hidden Unicode characters, and read up on bidirectional Unicode characters before trusting the diff.
# Pipeline Watchdog
#
# Runs every 30 minutes (and on manual dispatch), looks for issues and PRs whose
# labels indicate that an agent workflow should be handling them, and
# re-dispatches the matching agent workflow (or removes a stale label).
#
# Tokens:
#   GH_TOKEN       - GitHub App token; used by every `gh` call unless overridden.
#   DISPATCH_TOKEN - the workflow's own github.token; used only for
#                    `gh workflow run` (hence `actions: write` below).
name: Pipeline Watchdog

on:
  schedule:
    # Run every 30 minutes
    - cron: "*/30 * * * *"
  workflow_dispatch: {}

permissions:
  contents: read
  issues: write
  pull-requests: write
  actions: write

jobs:
  watchdog:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - name: Generate GitHub App token
        id: app-token
        uses: actions/create-github-app-token@v2
        with:
          app-id: ${{ secrets.APP_ID }}
          private-key: ${{ secrets.APP_PRIVATE_KEY }}

      - name: Check for stalled issues and PRs
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
          DISPATCH_TOKEN: ${{ github.token }}
          # Passed through the environment instead of being templated into the
          # script text. gh also reads GH_REPO as its default repository.
          GH_REPO: ${{ github.repository }}
        run: |
          # This job has no checkout step, so gh cannot infer the repository
          # from a git remote; every call below names it explicitly.
          REPO="${GH_REPO:?GH_REPO must be set}"
          echo "=== Pipeline Watchdog — $(date -u) ==="
          FIXES=0

          # ---------------------------------------------------------------
          # Helpers
          # ---------------------------------------------------------------

          # Dispatch an agent workflow for one issue/PR.
          #   $1 - workflow file name (e.g. agent-product.yml)
          #   $2 - workflow input name (issue_number or pr_number)
          #   $3 - issue/PR number
          # Best-effort: a failed dispatch is reported as a warning and does
          # not abort the remaining checks. Only successful dispatches are
          # counted in FIXES.
          dispatch() {
            local workflow=$1 input=$2 number=$3
            if GH_TOKEN="$DISPATCH_TOKEN" gh workflow run "$workflow" --repo "$REPO" -f "$input=$number"; then
              FIXES=$((FIXES + 1))
            else
              echo "::warning::Could not dispatch $workflow for #$number"
            fi
            # Space out consecutive dispatches.
            sleep 2
          }

          # Print how many of the 5 most recent runs of workflow $1 are
          # queued or in progress. A failed lookup prints 0, i.e. it is
          # treated as "no active run".
          active_runs() {
            gh run list --repo "$REPO" --workflow "$1" --limit 5 --json status \
              -q '[.[] | select(.status == "in_progress" or .status == "queued")] | length' 2>/dev/null \
              || echo "0"
          }

          # Succeed when an open PR exists whose head branch is
          # agent/issue-<$1>. A failed lookup counts as "no open PR".
          has_open_pr() {
            local count
            count=$(gh pr list --repo "$REPO" --state open --head "agent/issue-$1" \
              --json number -q 'length' 2>/dev/null || echo "0")
            [ "$count" != "0" ]
          }

          # NOTE: every list below is read with `mapfile ... < <(cmd)` and then
          # iterated with a plain `for`. Piping into `while read` would run
          # the loop body in a subshell and silently drop the FIXES updates.

          # --- 1. ops-feedback issues: Operator sent back but Product Agent didn't pick up ---
          echo "Checking ops-feedback issues..."
          mapfile -t ISSUES < <(gh issue list --repo "$REPO" --label "ops-feedback" --state open \
            --json number -q '.[].number')
          for NUM in "${ISSUES[@]}"; do
            # Always re-dispatch — the Product Agent is idempotent and will skip if already reviewing
            echo "  Stalled: #$NUM (ops-feedback) — dispatching Product Agent"
            dispatch agent-product.yml issue_number "$NUM"
          done

          # --- 2. needs-clarification with recent human response ---
          echo "Checking needs-clarification issues..."
          mapfile -t ISSUES < <(gh issue list --repo "$REPO" --label "needs-clarification" --state open \
            --json number -q '.[].number')
          for NUM in "${ISSUES[@]}"; do
            # Check if a non-bot commented after the clarification was requested.
            # --paginate walks every page of comments (oldest first); the filter
            # emits at most one timestamp per page, so `tail -n 1` keeps the
            # newest. The trailing pipe also keeps a failed lookup from
            # aborting the script: the value is simply empty.
            LAST_HUMAN=$(gh api --paginate "repos/$REPO/issues/$NUM/comments" \
              --jq '[.[] | select(.user.login | contains("[bot]") | not)] | last | .created_at // empty' \
              2>/dev/null | tail -n 1)
            LAST_BOT=$(gh api --paginate "repos/$REPO/issues/$NUM/comments" \
              --jq '[.[] | select(.user.login | contains("[bot]"))] | last | .created_at // empty' \
              2>/dev/null | tail -n 1)
            # ISO-8601 UTC timestamps order correctly as plain strings.
            if [ -n "$LAST_HUMAN" ] && [ "$LAST_HUMAN" \> "$LAST_BOT" ]; then
              echo "  Stalled: #$NUM (needs-clarification but human responded) — dispatching"
              gh issue edit "$NUM" --repo "$REPO" --remove-label "needs-clarification" \
                || echo "::warning::Could not remove needs-clarification from #$NUM"
              dispatch agent-product.yml issue_number "$NUM"
            fi
          done

          # --- 3. Stale in-dev labels (no active Developer Agent run) ---
          echo "Checking stale in-dev labels..."
          ACTIVE_DEV=$(active_runs "Developer Agent")
          if [ "$ACTIVE_DEV" = "0" ]; then
            mapfile -t ISSUES < <(gh issue list --repo "$REPO" --label "in-dev" --state open \
              --json number -q '.[].number')
            for NUM in "${ISSUES[@]}"; do
              echo "  Stale in-dev: #$NUM — removing label"
              if gh issue edit "$NUM" --repo "$REPO" --remove-label "in-dev"; then
                FIXES=$((FIXES + 1))
              else
                echo "::warning::Could not remove in-dev from #$NUM"
              fi
            done
          fi

          # --- 4. PRs with only agent-pr label (stuck before security) ---
          echo "Checking PRs stuck before security..."
          mapfile -t PRS < <(gh pr list --repo "$REPO" --state open --json number,labels \
            -q '.[] | select([.labels[].name] == ["agent-pr"]) | .number')
          for PR in "${PRS[@]}"; do
            echo "  Stalled PR: #$PR (no security label) — dispatching security"
            dispatch agent-security.yml pr_number "$PR"
          done

          # --- 5. PRs with security-passed but not yet approved (stuck between security and review) ---
          echo "Checking PRs stuck between security and review..."
          ACTIVE_REVIEW=$(active_runs "Review Agent")
          if [ "$ACTIVE_REVIEW" = "0" ]; then
            # Labels are compared by exact name (jq `contains` on strings is a
            # substring match and would also hit look-alike labels).
            mapfile -t PRS < <(gh pr list --repo "$REPO" --state open --json number,labels \
              -q '.[]
                  | select(any(.labels[]; .name == "security-passed")
                      and (any(.labels[]; .name == "review-approved") | not)
                      and (any(.labels[]; .name == "review-changes-needed") | not))
                  | .number')
            for PR in "${PRS[@]}"; do
              echo "  Stalled PR: #$PR (security-passed, no active review) — dispatching review"
              # Deliberately best-effort and silent: the PR may not carry in-review.
              gh pr edit "$PR" --repo "$REPO" --remove-label "in-review" 2>/dev/null || true
              dispatch agent-review.yml pr_number "$PR"
            done
          fi

          # --- 6. PRs with review-approved but not merged (stuck before merge) ---
          echo "Checking PRs stuck before merge..."
          mapfile -t PRS < <(gh pr list --repo "$REPO" --state open --json number,labels \
            -q '.[] | select(any(.labels[]; .name == "review-approved")) | .number')
          for PR in "${PRS[@]}"; do
            echo "  Stalled PR: #$PR (review-approved, not merged) — dispatching merge"
            dispatch agent-merge.yml pr_number "$PR"
          done

          # --- 7. PRs with review-changes-needed but no active revise run ---
          echo "Checking PRs stuck waiting for revision..."
          ACTIVE_REVISE=$(active_runs "Revise Agent")
          if [ "$ACTIVE_REVISE" = "0" ]; then
            mapfile -t PRS < <(gh pr list --repo "$REPO" --state open --json number,labels \
              -q '.[]
                  | select(any(.labels[]; .name == "review-changes-needed" or .name == "security-failed"))
                  | .number')
            for PR in "${PRS[@]}"; do
              echo "  Stalled PR: #$PR (needs revision, no active revise run) — dispatching revise"
              dispatch agent-revise.yml pr_number "$PR"
            done
          fi

          # --- 8. Approved issues with no active Developer Agent run and no open PR ---
          # Check both ops-approved (CSC) and product-approved (python-docx/docxjs which skip Operator)
          echo "Checking approved issues not in development..."
          if [ "$ACTIVE_DEV" = "0" ]; then
            for LABEL in "ops-approved" "product-approved"; do
              mapfile -t ISSUES < <(gh issue list --repo "$REPO" --label "$LABEL" --state open \
                --json number,labels \
                -q '.[] | select(any(.labels[]; .name == "in-dev") | not) | .number')
              for NUM in "${ISSUES[@]}"; do
                # Skip issues that already have an OPEN PR
                if ! has_open_pr "$NUM"; then
                  echo "  Stalled: #$NUM ($LABEL, no open PR, no active dev) — dispatching developer"
                  dispatch agent-develop.yml issue_number "$NUM"
                fi
              done
            done
          fi

          # --- 9. Issues with agent label but no pipeline progress (no other labels, no PR) ---
          echo "Checking agent-labeled issues with no pipeline progress..."
          if [ "$ACTIVE_DEV" = "0" ]; then
            mapfile -t ISSUES < <(gh issue list --repo "$REPO" --label "agent" --state open \
              --json number,labels \
              -q '.[] | select((.labels | length) <= 1) | .number')
            for NUM in "${ISSUES[@]}"; do
              if ! has_open_pr "$NUM"; then
                echo "  Stalled: #$NUM (agent label only, no progress) — dispatching Product Agent"
                dispatch agent-product.yml issue_number "$NUM"
              fi
            done
          fi

          echo ""
          echo "=== Watchdog complete: $FIXES fixes applied ==="