# Pipeline Watchdog
#
# Pipeline Watchdog #251

name: Pipeline Watchdog
# Triggers: a recurring schedule plus manual runs from the Actions UI/API.
on:
  schedule:
    # Run every 30 minutes
    - cron: "*/30 * * * *"
  workflow_dispatch: {}
# Scopes granted to the workflow's automatic token (github.token).
permissions:
  contents: read
  issues: write
  pull-requests: write
  # Write access to Actions is what allows this token to dispatch workflows.
  actions: write
jobs:
  # Scans open issues and PRs for pipeline stages that look stuck (based on
  # their labels and on whether the relevant agent workflow is running) and
  # re-dispatches the workflow responsible for the next stage.
  watchdog:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - name: Generate GitHub App token
        id: app-token
        uses: actions/create-github-app-token@v2
        with:
          app-id: ${{ secrets.APP_ID }}
          private-key: ${{ secrets.APP_PRIVATE_KEY }}

      - name: Check for stalled issues and PRs
        env:
          # Default token for every gh call in the script (GitHub App token).
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
          # Workflow token, substituted only for `gh workflow run` dispatches.
          DISPATCH_TOKEN: ${{ github.token }}
          # This job has no checkout step, so gh cannot infer the repository
          # from a git remote; GH_REPO tells it which repository to target.
          GH_REPO: ${{ github.repository }}
        run: |
          REPO="$GH_REPO"
          echo "=== Pipeline Watchdog — $(date -u) ==="
          FIXES=0

          # dispatch WORKFLOW_FILE INPUT_NAME NUMBER
          # Triggers WORKFLOW_FILE via workflow_dispatch with INPUT_NAME=NUMBER,
          # using the workflow token. A fix is counted only when the dispatch
          # succeeds; failures surface as warnings instead of being discarded.
          # Always pauses 2 seconds afterwards to space out dispatches.
          dispatch() {
            if GH_TOKEN="$DISPATCH_TOKEN" gh workflow run "$1" --repo "$REPO" -f "$2=$3"; then
              FIXES=$((FIXES + 1))
            else
              echo "::warning::Watchdog could not dispatch $1 for #$3"
            fi
            sleep 2
          }

          # active_runs WORKFLOW_NAME
          # Prints how many of the 5 most recent runs of WORKFLOW_NAME are
          # queued or in progress.
          # NOTE(review): prints "0" when the lookup itself fails, so a failed
          # lookup is treated the same as "nothing running" — confirm intended.
          active_runs() {
            gh run list --repo "$REPO" --workflow "$1" --limit 5 --json status \
              -q '[.[] | select(.status == "in_progress" or .status == "queued")] | length' 2>/dev/null || echo "0"
          }

          # --- 1. ops-feedback issues: Operator sent back but Product Agent didn't pick up ---
          echo "Checking ops-feedback issues..."
          for NUM in $(gh issue list --repo "$REPO" --label "ops-feedback" --state open --json number -q '.[].number'); do
            # Always re-dispatch — the Product Agent is idempotent and will skip if already reviewing
            echo " Stalled: #$NUM (ops-feedback) — dispatching Product Agent"
            dispatch agent-product.yml issue_number "$NUM"
          done

          # --- 2. needs-clarification with recent human response ---
          echo "Checking needs-clarification issues..."
          for NUM in $(gh issue list --repo "$REPO" --label "needs-clarification" --state open --json number -q '.[].number'); do
            # One API call decides whether the newest non-bot comment is more
            # recent than the newest bot comment (ISO-8601 timestamps compare
            # correctly as strings). Any lookup failure yields "false" so a
            # single bad issue cannot abort the run under `bash -e`.
            # NOTE: only the first 100 comments of the issue are inspected.
            HUMAN_REPLIED=$(gh api "repos/$REPO/issues/$NUM/comments?per_page=100" --jq '
              ([.[] | select(.user.login | contains("[bot]") | not)] | last | .created_at // "") as $human
              | ([.[] | select(.user.login | contains("[bot]"))] | last | .created_at // "") as $bot
              | ($human != "" and $human > $bot)
            ' 2>/dev/null || echo "false")
            if [ "$HUMAN_REPLIED" = "true" ]; then
              echo " Stalled: #$NUM (needs-clarification but human responded) — dispatching"
              gh issue edit "$NUM" --repo "$REPO" --remove-label "needs-clarification" \
                || echo "::warning::Watchdog could not remove needs-clarification from #$NUM"
              dispatch agent-product.yml issue_number "$NUM"
            fi
          done

          # --- 3. Stale in-dev labels (no active Developer Agent run) ---
          echo "Checking stale in-dev labels..."
          ACTIVE_DEV=$(active_runs "Developer Agent")
          if [ "$ACTIVE_DEV" = "0" ]; then
            for NUM in $(gh issue list --repo "$REPO" --label "in-dev" --state open --json number -q '.[].number'); do
              echo " Stale in-dev: #$NUM — removing label"
              if gh issue edit "$NUM" --repo "$REPO" --remove-label "in-dev"; then
                FIXES=$((FIXES + 1))
              else
                echo "::warning::Watchdog could not remove in-dev from #$NUM"
              fi
            done
          fi

          # Sections 4-7 iterate with `for ... in $(...)` rather than piping into
          # `while read`: a piped loop runs in a subshell, which would discard
          # every FIXES increment made inside it.

          # --- 4. PRs with only agent-pr label (stuck before security) ---
          echo "Checking PRs stuck before security..."
          for PR in $(gh pr list --repo "$REPO" --state open --json number,labels \
            -q '.[] | select([.labels[].name] == ["agent-pr"]) | .number'); do
            echo " Stalled PR: #$PR (no security label) — dispatching security"
            dispatch agent-security.yml pr_number "$PR"
          done

          # --- 5. PRs with security-passed but not yet approved (stuck between security and review) ---
          echo "Checking PRs stuck between security and review..."
          ACTIVE_REVIEW=$(active_runs "Review Agent")
          if [ "$ACTIVE_REVIEW" = "0" ]; then
            for PR in $(gh pr list --repo "$REPO" --state open --json number,labels \
              -q '.[] | select(([.labels[].name] | contains(["security-passed"])) and ([.labels[].name] | contains(["review-approved"]) | not) and ([.labels[].name] | contains(["review-changes-needed"]) | not)) | .number'); do
              echo " Stalled PR: #$PR (security-passed, no active review) — dispatching review"
              # Best-effort cleanup: the PR may not carry in-review at all.
              gh pr edit "$PR" --repo "$REPO" --remove-label "in-review" 2>/dev/null || true
              dispatch agent-review.yml pr_number "$PR"
            done
          fi

          # --- 6. PRs with review-approved but not merged (stuck before merge) ---
          echo "Checking PRs stuck before merge..."
          for PR in $(gh pr list --repo "$REPO" --state open --json number,labels \
            -q '.[] | select([.labels[].name] | contains(["review-approved"])) | .number'); do
            echo " Stalled PR: #$PR (review-approved, not merged) — dispatching merge"
            dispatch agent-merge.yml pr_number "$PR"
          done

          # --- 7. PRs with review-changes-needed but no active revise run ---
          echo "Checking PRs stuck waiting for revision..."
          ACTIVE_REVISE=$(active_runs "Revise Agent")
          if [ "$ACTIVE_REVISE" = "0" ]; then
            for PR in $(gh pr list --repo "$REPO" --state open --json number,labels \
              -q '.[] | select(([.labels[].name] | contains(["review-changes-needed"])) or ([.labels[].name] | contains(["security-failed"]))) | .number'); do
              echo " Stalled PR: #$PR (needs revision, no active revise run) — dispatching revise"
              dispatch agent-revise.yml pr_number "$PR"
            done
          fi

          # --- 8. Approved issues with no active Developer Agent run and no open PR ---
          # Check both ops-approved (CSC) and product-approved (python-docx/docxjs which skip Operator)
          echo "Checking approved issues not in development..."
          if [ "$ACTIVE_DEV" = "0" ]; then
            for LABEL in "ops-approved" "product-approved"; do
              for NUM in $(gh issue list --repo "$REPO" --label "$LABEL" --state open --json number,labels \
                -q '.[] | select([.labels[].name] | contains(["in-dev"]) | not) | .number' 2>/dev/null); do
                # Check if an OPEN PR already exists for this issue
                PR_EXISTS=$(gh pr list --repo "$REPO" --state open --head "agent/issue-$NUM" --json number -q 'length' 2>/dev/null || echo "0")
                if [ "$PR_EXISTS" = "0" ]; then
                  echo " Stalled: #$NUM ($LABEL, no open PR, no active dev) — dispatching developer"
                  dispatch agent-develop.yml issue_number "$NUM"
                fi
              done
            done
          fi

          # --- 9. Issues with agent label but no pipeline progress (no other labels, no PR) ---
          echo "Checking agent-labeled issues with no pipeline progress..."
          if [ "$ACTIVE_DEV" = "0" ]; then
            for NUM in $(gh issue list --repo "$REPO" --label "agent" --state open --json number,labels \
              -q '.[] | select(([.labels[].name] | length) <= 1) | .number' 2>/dev/null); do
              PR_EXISTS=$(gh pr list --repo "$REPO" --state open --head "agent/issue-$NUM" --json number -q 'length' 2>/dev/null || echo "0")
              if [ "$PR_EXISTS" = "0" ]; then
                echo " Stalled: #$NUM (agent label only, no progress) — dispatching Product Agent"
                dispatch agent-product.yml issue_number "$NUM"
              fi
            done
          fi

          echo ""
          echo "=== Watchdog complete: $FIXES fixes applied ==="