66//
77// [end-readme]
88
9+ import fs from 'fs/promises'
10+
11+ import got , { RequestError } from 'got'
12+
913import { getContents , getPathsWithMatchingStrings } from './helpers/git-utils.js'
10- import got from 'got'
1114
1215if ( ! process . env . GITHUB_TOKEN ) {
13- console . error ( 'Error! You must have a GITHUB_TOKEN set in an .env file to run this script.' )
14- process . exit ( 1 )
16+ throw new Error ( 'Error! You must have a GITHUB_TOKEN set in an .env file to run this script.' )
1517}
1618
17- const sleep = ( ms ) => new Promise ( ( resolve ) => setTimeout ( resolve , ms ) )
19+ const FORCE_DOWNLOAD = Boolean ( JSON . parse ( process . env . FORCE_DOWNLOAD || 'false' ) )
20+ const BATCH_SIZE = JSON . parse ( process . env . BATCH_SIZE || '10' )
21+ const BASE_URL = process . env . BASE_URL || 'http://localhost:4000'
1822
1923main ( )
2024
25+ // The way `got` does retries:
26+ //
27+ // sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
28+ //
29+ // So, it means:
30+ //
31+ // 1. ~1000ms
32+ // 2. ~2000ms
33+ // 3. ~4000ms
34+ //
35+ // ...if the limit we set is 3.
36+ // Our own timeout, in ./middleware/timeout.js, defaults to 10 seconds.
37+ // So there's no point in trying more attempts than 3 because it would
38+ // just time out at the 10s mark. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
39+ const retryConfiguration = {
40+ limit : 3 ,
41+ }
42+ // According to our Datadog metrics, the *average* time for
43+ // the 'archive_enterprise_proxy' metric is ~70ms (excluding spikes),
44+ // which is much less than 500ms (the timeout below is set to 1000ms).
45+ const timeoutConfiguration = 1000
46+
2147async function main ( ) {
2248 const searchStrings = [ 'https://docs.github.com' , 'GitHub help_url' , 'GitHub developer_help_url' ]
23- const foundFiles = await getPathsWithMatchingStrings ( searchStrings , 'github' , 'github' )
24- const searchFiles = [ ...foundFiles ]
49+
50+ const foundFiles = [ ]
51+ try {
52+ foundFiles . push ( ...JSON . parse ( await fs . readFile ( '/tmp/foundFiles.json' , 'utf-8' ) ) )
53+ } catch ( error ) {
54+ if ( ! ( error . code && error . code === 'ENOENT' ) ) {
55+ throw error
56+ }
57+ }
58+ if ( ! foundFiles . length || FORCE_DOWNLOAD ) {
59+ foundFiles . push ( ...( await getPathsWithMatchingStrings ( searchStrings , 'github' , 'github' ) ) )
60+ await fs . writeFile ( '/tmp/foundFiles.json' , JSON . stringify ( foundFiles , undefined , 2 ) , 'utf-8' )
61+ }
62+ const searchFiles = [ ...new Set ( foundFiles ) ] // filters out dupes
2563 . filter ( ( file ) => endsWithAny ( [ '.rb' , '.yml' , '.yaml' , '.txt' , '.pdf' , '.erb' , '.js' ] , file ) )
2664 . filter (
2765 ( file ) =>
@@ -35,79 +73,106 @@ async function main() {
3573 const urlRegEx =
3674 / h t t p s ? : \/ \/ ( w w w \. ) ? [ - a - z A - Z 0 - 9 @ : % . _ + ~ # = ] { 1 , 256 } \. [ a - z A - Z 0 - 9 ( ) ] { 1 , 6 } \b ( [ - a - z A - Z 0 - 9 ( ) @ : % _ + . ~ # ? & / / = ] * ) / g
3775
38- for ( const file of searchFiles ) {
39- const contents = await getContents ( 'github' , 'github' , 'master' , file )
40-
41- if (
42- contents . includes ( 'https://docs.github.com' ) ||
43- contents . includes ( 'GitHub.help_url' ) ||
44- contents . includes ( 'GitHub.developer_help_url' )
45- ) {
46- const docsIndices = getIndicesOf ( 'https://docs.github.com' , contents )
47- const helpIndices = getIndicesOf ( 'GitHub.help_url' , contents )
48- helpIndices . push ( ...getIndicesOf ( 'GitHub.developer_help_url' , contents ) )
49- if ( docsIndices . length > 0 ) {
50- docsIndices . forEach ( ( numIndex ) => {
51- // Assuming we don't have links close to 500 characters long
52- const docsLink = contents . substring ( numIndex , numIndex + 500 ) . match ( urlRegEx )
53- docsLinksFiles . push ( [ docsLink [ 0 ] . toString ( ) . replace ( / [ ^ a - z A - Z 0 - 9 ] * $ | \\ n $ / g, '' ) , file ] )
54- } )
55- }
76+ try {
77+ docsLinksFiles . push ( ...JSON . parse ( await fs . readFile ( '/tmp/docsLinksFiles.json' , 'utf-8' ) ) )
78+ } catch ( error ) {
79+ if ( ! ( error . code && error . code === 'ENOENT' ) ) {
80+ throw error
81+ }
82+ }
5683
57- if ( helpIndices . length > 0 ) {
58- helpIndices . forEach ( ( numIndex ) => {
59- // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
60- if (
61- ( contents . substring ( numIndex , numIndex + 11 ) === 'GitHub.help' &&
62- contents . charAt ( numIndex + 16 ) === '#' ) ||
63- ( contents . substring ( numIndex , numIndex + 16 ) === 'GitHub.developer' &&
64- contents . charAt ( numIndex + 26 ) === '#' )
65- ) {
66- return
67- }
84+ if ( ! docsLinksFiles . length || FORCE_DOWNLOAD ) {
85+ for ( const file of searchFiles ) {
86+ const contents = await getContents ( 'github' , 'github' , 'master' , file )
87+
88+ if (
89+ contents . includes ( 'https://docs.github.com' ) ||
90+ contents . includes ( 'GitHub.help_url' ) ||
91+ contents . includes ( 'GitHub.developer_help_url' )
92+ ) {
93+ const docsIndices = getIndicesOf ( 'https://docs.github.com' , contents )
94+ const helpIndices = getIndicesOf ( 'GitHub.help_url' , contents )
95+ helpIndices . push ( ...getIndicesOf ( 'GitHub.developer_help_url' , contents ) )
96+ if ( docsIndices . length > 0 ) {
97+ docsIndices . forEach ( ( numIndex ) => {
98+ // Assuming we don't have links close to 500 characters long
99+ const docsLink = contents . substring ( numIndex , numIndex + 500 ) . match ( urlRegEx )
100+ const linkURL = new URL ( docsLink [ 0 ] . toString ( ) . replace ( / [ ^ a - z A - Z 0 - 9 ] * $ | \\ n $ / g, '' ) )
101+ const linkPath = linkURL . pathname + linkURL . hash
102+ docsLinksFiles . push ( { linkPath, file } )
103+ } )
104+ }
105+
106+ if ( helpIndices . length > 0 ) {
107+ helpIndices . forEach ( ( numIndex ) => {
108+ // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
109+ if (
110+ ( contents . substring ( numIndex , numIndex + 11 ) === 'GitHub.help' &&
111+ contents . charAt ( numIndex + 16 ) === '#' ) ||
112+ ( contents . substring ( numIndex , numIndex + 16 ) === 'GitHub.developer' &&
113+ contents . charAt ( numIndex + 26 ) === '#' )
114+ ) {
115+ return
116+ }
68117
69- const startSearchIndex = contents . indexOf ( '/' , numIndex )
70- // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
71- // There are certain links that don't start with `/` so we want to skip those.
72- // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
73- if ( startSearchIndex - numIndex < 30 ) {
74- const linkPath = contents
75- . substring (
76- startSearchIndex ,
77- regexIndexOf (
78- contents ,
79- / \n | " \) | { @ e m a i l _ t r a c k i n g _ p a r a m s } | \^ h t t p | A h t t p s | e x a m p l e | T h i s | T O D O " | [ { } | " % > < . , ' ) * ] / ,
80- startSearchIndex + 1
118+ const startSearchIndex = contents . indexOf ( '/' , numIndex )
119+ // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
120+ // There are certain links that don't start with `/` so we want to skip those.
121+ // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
122+ if ( startSearchIndex - numIndex < 30 ) {
123+ const linkPath = contents
124+ . substring (
125+ startSearchIndex ,
126+ regexIndexOf (
127+ contents ,
128+ / \n | " \) | { @ e m a i l _ t r a c k i n g _ p a r a m s } | \^ h t t p | A h t t p s | e x a m p l e | T h i s | T O D O " | [ { } | " % > < . , ' ) * ] / ,
129+ startSearchIndex + 1
130+ )
81131 )
82- )
83- . trim ( )
132+ . trim ( )
84133
85- // Certain specific links can be ignored as well
86- if ( [ '/deprecation-1' ] . includes ( linkPath ) ) {
87- return
88- }
134+ // Certain specific links can be ignored as well
135+ if ( [ '/deprecation-1' ] . includes ( linkPath ) ) {
136+ return
137+ }
89138
90- docsLinksFiles . push ( [ `https://docs.github.com${ linkPath } ` , file ] )
91- }
92- } )
139+ docsLinksFiles . push ( { linkPath, file } )
140+ }
141+ } )
142+ }
93143 }
94144 }
145+ await fs . writeFile (
146+ '/tmp/docsLinksFiles.json' ,
147+ JSON . stringify ( docsLinksFiles , undefined , 2 ) ,
148+ 'utf-8'
149+ )
95150 }
96-
97151 const brokenLinks = [ ]
98- // Done serially with delay to avoid hitting the rate limiter
99- for ( const file of docsLinksFiles ) {
100- try {
101- await got ( file [ 0 ] , {
102- headers : {
103- 'X-WAF-TOKEN' : process . env . WAF_TOKEN ,
104- } ,
152+
153+ // Break up the long list of URLs to test into batches
154+ for ( const batch of [ ...Array ( Math . floor ( docsLinksFiles . length / BATCH_SIZE ) ) . keys ( ) ] ) {
155+ const slice = docsLinksFiles . slice ( batch * BATCH_SIZE , batch * BATCH_SIZE + BATCH_SIZE )
156+ await Promise . all (
157+ slice . map ( async ( { linkPath, file } ) => {
158+ // This isn't necessary but if it can't be constructed, it'll
159+ // fail in quite a nice way and not "blame got".
160+ const url = new URL ( BASE_URL + linkPath )
161+ try {
162+ await got ( url . href , {
163+ retry : retryConfiguration ,
164+ timeout : timeoutConfiguration ,
165+ } )
166+ } catch ( error ) {
167+ if ( error instanceof RequestError ) {
168+ brokenLinks . push ( { linkPath, file } )
169+ } else {
170+ console . warn ( `URL when it threw: ${ url } ` )
171+ throw error
172+ }
173+ }
105174 } )
106- } catch ( e ) {
107- brokenLinks . push ( file )
108- } finally {
109- await sleep ( 300 )
110- }
175+ )
111176 }
112177
113178 if ( ! brokenLinks . length ) {
0 commit comments