forked from EFForg/https-everywhere
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.js
More file actions
216 lines (178 loc) · 6.34 KB
/
index.js
File metadata and controls
216 lines (178 loc) · 6.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"use strict";
var fs = require('fs');
var readline = require('readline');
var GitHubApi = require('github');
var _ = require('lodash');
var parseXML = require('xml2js').parseString;
var async = require('async');
var request = require('request');
var unzip = require('unzip');
var ProgressBar = require('progress');
var config = require('./config');
// Fetch the Alexa top 1M sites and collect the domains into an array via streams.
//
// Downloads the zipped CSV, unzips it on the fly and reads it line by line.
// Each line is expected to look like `<digits>,<domain>`; the domain part is
// appended to the result in the order the lines appear in the file.
//
// alexa_cb(err, alexa) is invoked at most once: with the error if the download
// or the unzip stream fails, otherwise with the array of domains once the
// line reader for a zip entry closes.
function get_alexa(alexa_cb){
  var alexa = [];
  var csv_regex = /^[0-9]+,(.+)/;
  var finished = false;

  // Several event sources (download error, unzip error, entry close) can all
  // try to complete; make sure the caller only ever hears from us once.
  function finish(err){
    if(finished) return;
    finished = true;
    if(err) return alexa_cb(err);
    alexa_cb(null, alexa);
  }

  request.get('https://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    .on('error', finish)
    .pipe(unzip.Parse())
    .on('error', finish)
    .on('entry', function (entry) {
      var bar = new ProgressBar('Processing Alexa Top 1M [:bar] :percent :etas', {
        total: 100
      });
      var lineReader = readline.createInterface({
        input: entry
      });
      var x = 0;
      lineReader.on('line', function (line) {
        var match = line.match(csv_regex);
        // Blank or malformed lines (e.g. a trailing newline) carry no domain.
        if(!match) return;
        alexa.push(match[1]);
        // One tick per 10000 domains: 100 ticks for a full 1M-line file.
        if(x % 10000 === 0) bar.tick();
        x++;
      });
      lineReader.on('close', function(){
        finish();
      });
    });
}
// Read the number of the most recently checked PR from config.state_file.
//
// alexa     - passed through untouched so the next step receives it.
// recent_cb - called as recent_cb(err) on failure, or
//             recent_cb(null, [alexa, most_recent_pr_number]) on success.
//
// If the state file does not exist yet it is created containing '0'. If it
// exists but does not hold a finite number, 0 is used so that every open PR is
// considered again instead of none ever matching a NaN comparison.
function get_most_recent_pr(alexa, recent_cb){
  fs.readFile(config.state_file, 'utf8', function(err, data){
    if(err){
      // Only a missing file means "first run"; other read failures are real
      // errors and must not silently reset the checkpoint.
      if(err.code !== 'ENOENT') return recent_cb(err);
      return fs.writeFile(config.state_file, '0', function(err){
        if(err) return recent_cb(err);
        recent_cb(null, [alexa, 0]);
      });
    }
    var most_recent = Number(data);
    if(!Number.isFinite(most_recent)) most_recent = 0;
    recent_cb(null, [alexa, most_recent]);
  });
}
// Label open pull requests newer than the stored checkpoint with an Alexa tier.
//
// res   - [alexa, most_recent_pr_checked]: `alexa` is the array of domains
//         (a domain's array index is its placing, so index 0 is the best
//         placing) and `most_recent_pr_checked` is the PR number read from
//         the state file.
// pr_cb - invoked at most once, and only with an error. Nothing in this
//         function signals successful completion.
//
// For every open PR whose number is greater than the checkpoint, the ruleset
// files under src/chrome/content/rules/ are downloaded, their <target host>
// values are looked up in `alexa`, and the best placing found decides which of
// the labels top-100 / top-1k / top-10k / top-100k / top-1m is added.
function github_process_prs(res, pr_cb){
  var alexa = res[0],
    most_recent_pr_checked = res[1];
  var github = new GitHubApi();
  var wildcard_www_regex = /^(www|\*)\.(.+)/;
  var rules_path_regex = /^src\/chrome\/content\/rules\//;
  var httpse = {
    user: config.github_user,
    repo: config.github_repo
  };

  // domain -> placing. Built once so each host lookup is O(1) instead of a
  // linear scan of the whole list. The first occurrence wins, like indexOf.
  var alexa_index = new Map();
  alexa.forEach(function(domain, placing){
    if(!alexa_index.has(domain)) alexa_index.set(domain, placing);
  });

  // Many requests run concurrently; report only the first failure upstream.
  var failed = false;
  function fail(err){
    if(failed) return;
    failed = true;
    pr_cb(err);
  }

  // Build request parameters on a fresh object so concurrent calls never
  // share (and overwrite) each other's `number`, `state` or `body`.
  function with_repo(extra){
    return Object.assign({}, httpse, extra);
  }

  github.authenticate({
    type: "oauth",
    token: config.github_token || process.env.GITHUB_TOKEN
  });

  // Rank a list of target hosts, returning the minimum alexa placing, or
  // undefined when none of the hosts is listed. A leading `www.` or `*.` is
  // stripped before the lookup.
  function rank_targets(targets){
    var minimum_placing;
    targets.forEach(function(host){
      var stripped = host.match(wildcard_www_regex);
      if(stripped) host = stripped[2];
      var alexa_placing = alexa_index.get(host);
      if(alexa_placing !== undefined &&
          (minimum_placing === undefined || alexa_placing < minimum_placing)){
        minimum_placing = alexa_placing;
      }
    });
    return minimum_placing;
  }

  // Given the url of an HTTPSE ruleset, return a list of targets to fetch_cb
  function fetch_url_and_parse_targets(url, fetch_cb){
    request({url: url}, function(err, response, body){
      if(err) return fetch_cb(err);
      parseXML(body, function(err, root){
        if(err) return fetch_cb(err);
        // Tolerate rulesets without targets and targets without a host
        // attribute rather than crashing on a property of undefined.
        var targets = [].concat((root && root.ruleset && root.ruleset.target) || []);
        var hosts = [];
        targets.forEach(function(target){
          if(target && target.$ && typeof target.$.host === 'string'){
            hosts.push(target.$.host);
          }
        });
        fetch_cb(null, hosts);
      });
    });
  }

  // Map a placing to its tier label.
  function label_for_placing(placing){
    if(placing < 100) return "top-100";
    if(placing < 1000) return "top-1k";
    if(placing < 10000) return "top-10k";
    if(placing < 100000) return "top-100k";
    return "top-1m";
  }

  // Fetch the files of one PR, rank every ruleset file and apply the label
  // for the best placing found (if any).
  function process_pr(pull_request){
    github.pullRequests.getFiles(with_repo({
      number: pull_request.number
    }), function(err, files){
      if(err) return fail(err);
      var file_fetches = [];
      // Out of the list of files for this PR, figure out the minimum Alexa ranking for each
      files.forEach(function(file){
        if(!rules_path_regex.test(file.filename)) return;
        file_fetches.push(function(file_cb){
          fetch_url_and_parse_targets(file.raw_url, function(err, targets){
            if(err) return file_cb(err);
            console.log("Processing PR: " + pull_request.number + ", file: " + file.filename);
            var ranking = rank_targets(targets);
            // Compare against undefined: a placing of 0 is the best possible
            // result, not "unranked".
            if(ranking === undefined) return file_cb();
            file_cb(null, {
              alexa_placing: ranking,
              pr_number: pull_request.number
            });
          });
        });
      });
      async.parallel(file_fetches, function(err, file_results){
        if(err) return fail(err);
        var reduced_pr_ranking;
        file_results.forEach(function(file_res){
          if(!file_res) return;
          if(!reduced_pr_ranking ||
              file_res.alexa_placing < reduced_pr_ranking.alexa_placing){
            reduced_pr_ranking = file_res;
          }
        });
        if(!reduced_pr_ranking) return;
        var label = label_for_placing(reduced_pr_ranking.alexa_placing);
        console.log("Applying label `" + label + "` to PR: " + reduced_pr_ranking.pr_number);
        github.issues.addLabels(with_repo({
          number: reduced_pr_ranking.pr_number,
          body: [label]
        }), function(err){
          // A failed label is logged but does not abort the run.
          if(err) console.log(err);
        });
      });
    });
  }

  // Label all PRs which meet the criteria for labelling
  function github_process_pr_page(first_page){
    return function(err, pull_requests){
      if(err) return fail(err);
      if(first_page && pull_requests.length > 0){
        // NOTE(review): this assumes the first PR of the first page has the
        // highest number, and it records the checkpoint before the PRs below
        // have finished processing - confirm both are intended.
        // The number is stringified because fs.writeFile wants a string or buffer.
        fs.writeFile(config.state_file, String(pull_requests[0].number), function(err){
          if(err) return fail(err);
        });
      }
      pull_requests.forEach(function(pull_request){
        if(pull_request.number > most_recent_pr_checked){
          process_pr(pull_request);
        }
      });
      if(github.hasNextPage(pull_requests)){
        github.getNextPage(pull_requests, github_process_pr_page(false));
      }
    };
  }

  github.pullRequests.getAll(with_repo({
    state: "open",
    per_page: 100
  }), github_process_pr_page(true));
}
// Run the pipeline: load the Alexa list, read the PR checkpoint, then label
// the pull requests. Each step hands its result to the next one.
async.waterfall([
  get_alexa,
  get_most_recent_pr,
  github_process_prs
], function(err){
  if(err){
    // Report failures on stderr and make the process exit non-zero so that
    // whatever schedules this script can notice the run did not succeed.
    console.error(err);
    process.exitCode = 1;
  }
});