-
Notifications
You must be signed in to change notification settings - Fork 66.8k
360 lines (312 loc) · 14.5 KB
/
index-general-search.yml
File metadata and controls
360 lines (312 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
name: Index general search in Elasticsearch

# **What it does**: It scrapes the whole site and dumps the records in a
# temp directory. Then it indexes that into Elasticsearch.
# **Why we have it**: We want our search indexes kept up to date.
# **Who does it impact**: Anyone using search on docs.

on:
  workflow_dispatch:
    inputs:
      version:
        description: "Version to exclusively generate the search index for. E.g. 'dotcom', 'ghes-3.12'"
        required: false
        default: ''
      languages:
        description: "Comma separated languages. E.g. 'en,es,ja,pt,zh,ru,fr,ko,de' (defaults to all)"
        required: false
        default: ''
  schedule:
    - cron: '20 16 * * 1-5' # Run Mon-Fri at 16:20 UTC / 8:20 PST
  workflow_run:
    workflows: ['Purge Fastly']
    types:
      - completed

permissions:
  contents: read

# This allows a subsequently queued workflow run to cancel previous runs.
# Include the triggering workflow's conclusion in the group so that runs triggered
# by skipped Purge Fastly workflows don't cancel runs triggered by successful ones.
concurrency:
  group: '${{ github.workflow }} @ ${{ github.head_ref }} ${{ github.event_name }} ${{ github.event.workflow_run.conclusion }}'
  cancel-in-progress: true

env:
  ELASTICSEARCH_URL: ${{ secrets.ELASTICSEARCH_URL }}
  # Since we'll run in NODE_ENV=production, we need to be explicit that
  # we don't want Hydro configured.
  HYDRO_ENDPOINT: ''
  HYDRO_SECRET: ''
jobs:
  # Decides which languages to (re)index, exposed as a JSON array in the
  # `matrix` output for the downstream job's strategy matrix.
  figureOutMatrix:
    # Skip immediately if triggered by a non-successful Purge Fastly run.
    # This prevents skipped runs from canceling valid indexing runs via concurrency.
    if: ${{ github.repository == 'github/docs-internal' && (github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success') }}
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.result }}
    steps:
      - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        id: set-matrix
        with:
          script: |
            // Edit this list for the definitive list of languages
            // (other than English) we want to index in Elasticsearch.
            const allNonEnglish = 'es,ja,pt,zh,ru,fr,ko,de'.split(',')
            const allPossible = ["en", ...allNonEnglish]
            if (context.eventName === "workflow_run") {
              // Job-level `if` already ensures we only get here for successful runs,
              // but keep this as a safety check.
              if (context.payload.workflow_run.conclusion === "success") {
                return ["en"]
              }
              // This shouldn't happen due to job-level filter, but handle gracefully.
              console.warn(`Unexpected: workflow_run with conclusion '${context.payload.workflow_run.conclusion}'`)
              return []
            }
            if (context.eventName === "workflow_dispatch") {
              if (context.payload.inputs.languages) {
                const clean = context.payload.inputs.languages.split(',').map(x => x.trim()).filter(Boolean)
                const notRecognized = clean.find(x => !allPossible.includes(x))
                if (notRecognized) {
                  throw new Error(`'${notRecognized}' is not a recognized language code`)
                }
                return clean
              }
              return allPossible
            }
            if (context.eventName === "schedule") {
              return allNonEnglish
            }
            console.log(context)
            throw new Error(`Unable to figure out what languages to run (${context.eventName})`)
      - name: Debug output
        run: echo "${{ steps.set-matrix.outputs.result }}"
      # Checkout is needed so the local composite actions below are on disk
      # when this job fails and we need to alert.
      - name: Check out repo
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
      - uses: ./.github/actions/slack-alert
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        with:
          slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
          slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
      - uses: ./.github/actions/create-workflow-failure-issue
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        with:
          token: ${{ secrets.DOCS_BOT_PAT_BASE }}
updateElasticsearchIndexes:
needs: figureOutMatrix
name: Update indexes
if: ${{ github.repository == 'github/docs-internal' && needs.figureOutMatrix.outputs.matrix != '[]' }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
# When it's only English (i.e. a simple array of ['en']), this value
# does not matter. If it's ALL the languages, then we know we can
# be patient because it's a daily scheduled run and it's run by bots
# while humans are asleep. So there's no rush and no need to finish
# the whole job fast.
# As of June 2023, it takes about 10+ minutes to index one whole
# language and we have 8 non-English languages.
# As of May 2025, we index so many pages that we are being rate-limited by
# Elasticsearch. So we are shrinking this value to 2, down from 3
max-parallel: 2
matrix:
language: ${{ fromJSON(needs.figureOutMatrix.outputs.matrix) }}
steps:
- name: Check out repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Clone docs-internal-data
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
repository: github/docs-internal-data
# This works because user `docs-bot` has read access to that private repo.
token: ${{ secrets.DOCS_BOT_PAT_BASE }}
path: docs-internal-data
- name: Clone all translations
if: ${{ matrix.language != 'en' }}
uses: ./.github/actions/clone-translations
with:
token: ${{ secrets.DOCS_BOT_PAT_BASE }}
- uses: ./.github/actions/node-npm-setup
- uses: ./.github/actions/cache-nextjs
- name: Run build scripts
run: npm run build
- name: Start the server in the background
env:
ENABLE_DEV_LOGGING: false
run: |
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
curl --retry-connrefused --retry 6 -I http://localhost:4002/
- if: ${{ failure() }}
name: Debug server outputs on errors
run: |
echo "____STDOUT____"
cat /tmp/stdout.log
echo "____STDERR____"
cat /tmp/stderr.log
- name: Scrape records into a temp directory
env:
# If a reusable, or anything in the `data/*` directory is deleted
# you might get a
#
# RenderError: Can't find the key 'site.data.reusables...' in the scope
#
# But that'll get fixed in the next translation pipeline. For now,
# let's just accept an empty string instead.
THROW_ON_EMPTY: false
# Note that by default, this is '' (empty string) and that means
# the same as not set within the script.
VERSION: ${{ inputs.version }}
DOCS_INTERNAL_DATA: docs-internal-data
run: |
mkdir /tmp/records
npm run general-search-scrape -- /tmp/records \
--language ${{ matrix.language }}
ls -lh /tmp/records
- name: Check for scraping failures
id: check-failures
run: |
if [ -f /tmp/records/failures-summary.json ]; then
FAILED_PAGES=$(jq -r '.totalFailedPages' /tmp/records/failures-summary.json)
echo "failed_pages=$FAILED_PAGES" >> $GITHUB_OUTPUT
echo "has_failures=true" >> $GITHUB_OUTPUT
echo "⚠️ Warning: $FAILED_PAGES page(s) failed to scrape"
else
echo "has_failures=false" >> $GITHUB_OUTPUT
echo "✅ All pages scraped successfully"
fi
- name: Check that Elasticsearch is accessible
run: |
curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
- name: Index into Elasticsearch
env:
# Must match what we used when scraping (npm run general-search-scrape)
# otherwise the script will seek other versions from disk that might
# not exist.
VERSION: ${{ inputs.version }}
run: |
npm run index-general-search -- /tmp/records \
--language ${{ matrix.language }} \
--stagger-seconds 5 \
--retries 5
- name: Check created indexes and aliases
run: |
# Not using `--fail` here because I've observed that it can fail
# with a rather cryptic 404 error when it should, if anything, be
# a 200 OK with a list of no indices.
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v
curl --retry-connrefused --retry 5 ${{ env.ELASTICSEARCH_URL }}/_cat/indices?v
- name: Purge Fastly edge cache
env:
FASTLY_TOKEN: ${{ secrets.FASTLY_TOKEN }}
FASTLY_SERVICE_ID: ${{ secrets.FASTLY_SERVICE_ID }}
FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
run: npm run purge-fastly-edge-cache
- name: Upload failures artifact
if: ${{ steps.check-failures.outputs.has_failures == 'true' }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: search-failures-${{ matrix.language }}
path: /tmp/records/failures-summary.json
retention-days: 1
- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
- uses: ./.github/actions/create-workflow-failure-issue
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
with:
token: ${{ secrets.DOCS_BOT_PAT_BASE }}
notifyScrapingFailures:
name: Notify scraping failures
needs: updateElasticsearchIndexes
if: ${{ always() && github.repository == 'github/docs-internal' && github.event_name != 'workflow_dispatch' && needs.updateElasticsearchIndexes.result != 'cancelled' }}
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Download all failure artifacts
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
with:
pattern: search-failures-*
path: /tmp/failures
continue-on-error: true
- name: Check if any failures were downloaded
id: check-artifacts
run: |
if [ -d /tmp/failures ] && [ "$(ls -A /tmp/failures 2>/dev/null)" ]; then
echo "has_artifacts=true" >> $GITHUB_OUTPUT
else
echo "has_artifacts=false" >> $GITHUB_OUTPUT
fi
- uses: ./.github/actions/node-npm-setup
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' }}
- name: Aggregate failures and format message
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' }}
id: aggregate
run: |
RESULT=$(npx tsx src/search/scripts/aggregate-search-index-failures.ts /tmp/failures \
--workflow-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}")
{
echo 'result<<EOF'
echo "$RESULT"
echo 'EOF'
} >> "$GITHUB_OUTPUT"
- name: Close previous scraping failure issues
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' && fromJSON(steps.aggregate.outputs.result || '{"hasFailures":false}').hasFailures }}
env:
GH_TOKEN: ${{ secrets.DOCS_BOT_PAT_BASE }}
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
run: |
open_issues=$(gh issue list \
--repo github/docs-engineering \
--label "search-scraping-failures" \
--state open \
--json number \
--jq '.[].number')
for issue in $open_issues; do
gh issue close "$issue" \
--repo github/docs-engineering \
--comment "Closing in favor of a newer scraping failure report from $RUN_URL"
done
- name: Create scraping failure issue
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' && fromJSON(steps.aggregate.outputs.result || '{"hasFailures":false}').hasFailures }}
env:
GH_TOKEN: ${{ secrets.DOCS_BOT_PAT_BASE }}
FAILURE_MESSAGE: ${{ fromJSON(steps.aggregate.outputs.result || '{"message":""}').message }}
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
FILE_URL: ${{ github.server_url }}/${{ github.repository }}/blob/main/.github/workflows/index-general-search.yml
WORKFLOW_NAME: ${{ github.workflow }}
run: |
body=$(cat <<EOF
### Search index scraping failures
$FAILURE_MESSAGE
---
**Workflow run:** $RUN_URL
**Workflow file:** $FILE_URL
This issue was automatically created by the \`$WORKFLOW_NAME\` workflow.
EOF
)
gh issue create \
--repo github/docs-engineering \
--label "search-scraping-failures" \
--title "[Search Scraping Failures] $(date -u +%Y-%m-%d)" \
--body "$body"
- name: Send consolidated Slack notification
if: ${{ steps.check-artifacts.outputs.has_artifacts == 'true' && fromJSON(steps.aggregate.outputs.result || '{"hasFailures":false}').hasFailures }}
uses: ./.github/actions/slack-alert
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
color: warning
message: ${{ fromJSON(steps.aggregate.outputs.result || '{"message":""}').message }}
- uses: ./.github/actions/slack-alert
if: ${{ failure() }}
with:
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
- uses: ./.github/actions/create-workflow-failure-issue
if: ${{ failure() }}
with:
token: ${{ secrets.DOCS_BOT_PAT_BASE }}