-
Notifications
You must be signed in to change notification settings - Fork 66.8k
360 lines (312 loc) · 14.5 KB
/
index-general-search.yml
File metadata and controls
360 lines (312 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
name: Index general search in Elasticsearch

# **What it does**: It scrapes the whole site and dumps the records in a
# temp directory. Then it indexes that into Elasticsearch.
# **Why we have it**: We want our search indexes kept up to date.
# **Who does it impact**: Anyone using search on docs.

on:
  workflow_dispatch:
    inputs:
      version:
        description: "Version to exclusively generate the search index for. E.g. 'dotcom', 'ghes-3.12'"
        required: false
        default: ''
      languages:
        description: "Comma separated languages. E.g. 'en,es,ja,pt,zh,ru,fr,ko,de' (defaults to all)"
        required: false
        default: ''
  schedule:
    - cron: '20 16 * * 1-5' # Run Mon-Fri at 16:20 UTC / 8:20 PST
  workflow_run:
    workflows: ['Purge Fastly']
    types:
      - completed

permissions:
  contents: read

# This allows a subsequently queued workflow run to cancel previous runs.
# Include the triggering workflow's conclusion in the group so that runs triggered
# by skipped Purge Fastly workflows don't cancel runs triggered by successful ones.
concurrency:
  group: '${{ github.workflow }} @ ${{ github.head_ref }} ${{ github.event_name }} ${{ github.event.workflow_run.conclusion }}'
  cancel-in-progress: true

env:
  ELASTICSEARCH_URL: ${{ secrets.ELASTICSEARCH_URL }}
  # Since we'll run in NODE_ENV=production, we need to be explicit that
  # we don't want Hydro configured.
  HYDRO_ENDPOINT: ''
  HYDRO_SECRET: ''
jobs:
  # Decides which languages to (re)index, exposed as a JSON array in the
  # `matrix` output for the downstream job's strategy matrix.
  figureOutMatrix:
    # Skip immediately if triggered by a non-successful Purge Fastly run.
    # This prevents skipped runs from canceling valid indexing runs via concurrency.
    if: ${{ github.repository == 'github/docs-internal' && (github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success') }}
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.result }}
    steps:
      - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        id: set-matrix
        with:
          script: |
            // Edit this list for the definitive list of languages
            // (other than English) we want to index in Elasticsearch.
            const allNonEnglish = 'es,ja,pt,zh,ru,fr,ko,de'.split(',')
            const allPossible = ["en", ...allNonEnglish]
            if (context.eventName === "workflow_run") {
              // Job-level `if` already ensures we only get here for successful runs,
              // but keep this as a safety check.
              if (context.payload.workflow_run.conclusion === "success") {
                return ["en"]
              }
              // This shouldn't happen due to job-level filter, but handle gracefully.
              console.warn(`Unexpected: workflow_run with conclusion '${context.payload.workflow_run.conclusion}'`)
              return []
            }
            if (context.eventName === "workflow_dispatch") {
              if (context.payload.inputs.languages) {
                const clean = context.payload.inputs.languages.split(',').map(x => x.trim()).filter(Boolean)
                const notRecognized = clean.find(x => !allPossible.includes(x))
                if (notRecognized) {
                  throw new Error(`'${notRecognized}' is not a recognized language code`)
                }
                return clean
              }
              return allPossible
            }
            if (context.eventName === "schedule") {
              return allNonEnglish
            }
            console.log(context)
            throw new Error(`Unable to figure out what languages to run (${context.eventName})`)
      - name: Debug output
        run: echo "${{ steps.set-matrix.outputs.result }}"
      # Checkout is needed so the local composite actions below are on disk
      # when this job fails and we need to alert.
      - name: Check out repo
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
      - uses: ./.github/actions/slack-alert
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        with:
          slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
          slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
      - uses: ./.github/actions/create-workflow-failure-issue
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        with:
          token: ${{ secrets.DOCS_BOT_PAT_BASE }}
updateElasticsearchIndexes:
needs: figureOutMatrix
name: Update indexes
if: ${{ github.repository == 'github/docs-internal' && needs.figureOutMatrix.outputs.matrix != '[]' }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
# When it's only English (i.e. a simple array of ['en']), this value
# does not matter. If it's ALL the languages, then we know we can
# be patient because it's a daily scheduled run and it's run by bots
# while humans are asleep. So there's no rush and no need to finish
# the whole job fast.
# As of June 2023, it takes about 10+ minutes to index one whole
# language and we have 8 non-English languages.
# As of May 2025, we index so many pages that we are being rate-limited by
# Elasticsearch. So we are shrinking this value to 2, down from 3
max-parallel: 2
matrix:
language: ${{ fromJSON(needs.figureOutMatrix.outputs.matrix) }}
steps:
- name: Check out repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Clone docs-internal-data
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
repository: github/docs-internal-data
# This works because user `docs-bot` has read access to that private repo.
token: ${{ secrets.DOCS_BOT_PAT_BASE }}
path: docs-internal-data
- name: Clone all translations
if: ${{ matrix.language != 'en' }}
uses: ./.github/actions/clone-translations
with:
token: ${{ secrets.DOCS_BOT_PAT_BASE }}
- uses: ./.github/actions/node-npm-setup
- uses: ./.github/actions/cache-nextjs
- name: Run build scripts
run: npm run build
- name: Start the server in the background
env:
ENABLE_DEV_LOGGING: false
run: |
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
curl --retry-connrefused --retry 6 -I http://localhost:4002/
- if: ${{ failure() }}
name: Debug server outputs on errors
run: |
echo "____STDOUT____"
cat /tmp/stdout.log
echo "____STDERR____"
cat /tmp/stderr.log
- name: Scrape records into a temp directory
env:
# If a reusable, or anything in the `data/*` directory is deleted
# you might get a
#
# RenderError: Can't find the key 'site.data.reusables...' in the scope
#
# But that'll get fixed in the next translation pipeline. For now,
# let's just accept an empty string instead.
THROW_ON_EMPTY: false
# Note that by default, this is '' (empty string) and that means
# the same as not set within the script.
VERSION: ${{ inputs.version }}
DOCS_INTERNAL_DATA: docs-internal-data
run: |
mkdir /tmp/records
npm run general-search-scrape -- /tmp/records \
--language ${{ matrix.language }}
ls -lh /tmp/records
- name: Check for scraping failures
id: check-failures
run: |
if [ -f /tmp/records/failures-summary.json ]; then
FAILED_PAGES=$(jq -r '.totalFailedPages' /tmp/records/failures-summary.json)
echo "failed_pages=$FAILED_PAGES" >> $GITHUB_OUTPUT
echo "has_failures=true" >> $GITHUB_OUTPUT
echo "⚠️ Warning: $FAILED_PAGES page(s) failed to scrape"
else
echo "has_failures=false" >> $GITHUB_OUTPUT
echo "✅ All pages scraped successfully"
fi
- name: Check that Elasticsearch is accessible
run: |
curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
- name: Index into Elasticsearch
env:
# Must match what we used when scraping (npm run general-search-scrape)
# otherwise the script will seek other versions from disk that might
# not exist.
VERSION: ${{ inputs.version }}
run: |
npm run index-general-search -- /tmp/records \
--language ${{ matrix.language }} \
--stagger-seconds 5 \
--retries 5
- name: Check created indexes and aliases
run: |
# Not using `--fail` here because I've observed that it can fail
# with a rather cryptic 404 error when it should, if anything, be
# a 200 OK with a list of no indices.
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v
- name: Purge Fastly edge cache
env:
FASTLY_TOKEN: ${{ secrets.FASTLY_TOKEN }}
FASTLY_SERVICE_ID: ${{ secrets.FASTLY_SERVICE_ID }}
FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
run: npm run purge-fastly-edge-cache
- name: Upload failures artifact
if: ${{ steps.check-failures.outputs.has_failures == 'true' }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: search-failures-${{ matrix.language }}
path: /tmp/records/failures-summary.json
retention-days: 1
- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
- uses: ./.github/actions/create-workflow-failure-issue
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
with:
token: ${{ secrets.DOCS_BOT_PAT_BASE }}
notifyScrapingFailures:
name: Notify scraping failures
needs: updateElasticsearchIndexes
if: ${{ always() && github.repository == 'github/docs-internal' && github.event_name != 'workflow_dispatch' && needs.updateElasticsearchIndexes.result != 'cancelled' }}
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Download all failure artifacts
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
with:
pattern: search-failures-*
path: /tmp/failures
continue-on-error: true
- name: Check if any failures were downloaded
id: check-artifacts
run: |
if [ -d /tmp/failures ] && [ "$(ls -A /tmp/failures 2>/dev/null)" ]; then
echo "has_artifacts=true" >> $GITHUB_OUTPUT
else
echo "has_artifacts=false" >> $GITHUB_OUTPUT
fi
- uses: ./.github/actions/node-npm-setup
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' }}
- name: Aggregate failures and format message
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' }}
id: aggregate
run: |
RESULT=$(npx tsx src/search/scripts/aggregate-search-index-failures.ts /tmp/failures \
--workflow-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}")
{
echo 'result<<EOF'
echo "$RESULT"
echo 'EOF'
} >> "$GITHUB_OUTPUT"
- name: Close previous scraping failure issues
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' && fromJSON(steps.aggregate.outputs.result || '{"hasFailures":false}').hasFailures }}
env:
GH_TOKEN: ${{ secrets.DOCS_BOT_PAT_BASE }}
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
run: |
open_issues=$(gh issue list \
--repo github/docs-engineering \
--label "search-scraping-failures" \
--state open \
--json number \
--jq '.[].number')
for issue in $open_issues; do
gh issue close "$issue" \
--repo github/docs-engineering \
--comment "Closing in favor of a newer scraping failure report from $RUN_URL"
done
- name: Create scraping failure issue
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' && fromJSON(steps.aggregate.outputs.result || '{"hasFailures":false}').hasFailures }}
env:
GH_TOKEN: ${{ secrets.DOCS_BOT_PAT_BASE }}
FAILURE_MESSAGE: ${{ fromJSON(steps.aggregate.outputs.result || '{"message":""}').message }}
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
FILE_URL: ${{ github.server_url }}/${{ github.repository }}/blob/main/.github/workflows/index-general-search.yml
WORKFLOW_NAME: ${{ github.workflow }}
run: |
body=$(cat <<EOF
### Search index scraping failures
$FAILURE_MESSAGE
---
**Workflow run:** $RUN_URL
**Workflow file:** $FILE_URL
This issue was automatically created by the \`$WORKFLOW_NAME\` workflow.
EOF
)
gh issue create \
--repo github/docs-engineering \
--label "search-scraping-failures" \
--title "[Search Scraping Failures] $(date -u +%Y-%m-%d)" \
--body "$body"
- name: Send consolidated Slack notification
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' && fromJSON(steps.aggregate.outputs.result || '{"hasFailures":false}').hasFailures }}
uses: ./.github/actions/slack-alert
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
color: warning
message: ${{ fromJSON(steps.aggregate.outputs.result || '{"message":""}').message }}
- uses: ./.github/actions/slack-alert
if: ${{ failure() }}
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
- uses: ./.github/actions/create-workflow-failure-issue
if: ${{ failure() }}
with:
token: ${{ secrets.DOCS_BOT_PAT_BASE }}