Skip to content

Commit d39f16c

Browse files
authored
Merge pull request EFForg#6581 from Hainish/labeller
Add labeller to utils (closes EFForg#6424)
2 parents 8d375cd + 64934fd commit d39f16c

File tree

7 files changed

+296
-0
lines changed

7 files changed

+296
-0
lines changed

utils/labeller/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
config.json
2+
node_modules
3+
*.swp

utils/labeller/Dockerfile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
FROM node
# MAINTAINER is deprecated; the maintainer is recorded as image metadata instead.
LABEL maintainer="William Budington <bill@eff.org>"

WORKDIR /opt

# Install dependencies before copying the script so this layer stays cached
# until package.json changes.
COPY package.json .
RUN npm install
COPY index.js .
# Ship the example config as the image's config.json. index.js falls back to
# $GITHUB_TOKEN when `github_token` is empty, so the token is supplied at run time.
COPY config.json.example config.json

CMD ["node", "index.js"]

utils/labeller/README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# HTTPS Everywhere Labeller
2+
3+
This compares open pull requests that contain ruleset changes against the Alexa top 1M sites, and labels each one according to the best Alexa placing among its ruleset targets. Current labels are:
4+
5+
1. `top-100`
6+
2. `top-1k`
7+
3. `top-10k`
8+
4. `top-100k`
9+
5. `top-1m`
10+
11+
This will work for admins of HTTPS Everywhere who generate a [GitHub token](https://github.com/settings/tokens).
12+
13+
## Setup
14+
15+
### With Docker
16+
17+
docker build -t labeller .
18+
19+
### Without Docker
20+
21+
Download and install `node` and `npm`, then
22+
23+
npm install
24+
cp config.json.example config.json
25+
26+
Enter your GitHub token info into `config.json`.
27+
28+
## Running
29+
30+
### With Docker
31+
32+
Set your `$GITHUB_TOKEN`, and run
33+
34+
docker run -it -v $(pwd)/state_dir:/opt/state_dir -e GITHUB_TOKEN=$GITHUB_TOKEN labeller
35+
36+
### Without Docker
37+
38+
node index.js

utils/labeller/config.json.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"github_token": "",
3+
"github_user": "efforg",
4+
"github_repo": "https-everywhere",
5+
"state_file": "state_dir/most_recent_pr_checked.state"
6+
}

utils/labeller/index.js

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
"use strict";
2+
3+
var fs = require('fs');
4+
var readline = require('readline');
5+
6+
var GitHubApi = require('github');
7+
var _ = require('lodash');
8+
var parseXML = require('xml2js').parseString;
9+
var async = require('async');
10+
var request = require('request');
11+
var unzip = require('unzip');
12+
var ProgressBar = require('progress');
13+
14+
var config = require('./config');
15+
16+
/**
 * Download the Alexa top 1M zip, stream-unzip it and collect the domain column
 * of the CSV into an array, in file order.
 *
 * @param {function(Error=, string[]=)} alexa_cb - called exactly once, with an
 *   error or with the array of domains.
 */
function get_alexa(alexa_cb){

  var alexa = [];
  var csv_regex = /^[0-9]+,(.+)/;

  // Several independent events (request error, unzip error, bad status, close)
  // can end this function; async.waterfall throws if its callback fires twice.
  var finished = false;
  function finish(err, result){
    if(finished) return;
    finished = true;
    alexa_cb(err, result);
  }

  // Fetched over HTTPS so the ranking data cannot be tampered with in transit.
  request.get('https://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    .on('error', finish)
    .on('response', function(response){
      if(response.statusCode !== 200){
        finish(new Error("Unexpected status " + response.statusCode + " fetching Alexa top 1M"));
      }
    })
    .pipe(unzip.Parse())
    .on('error', finish)
    .on('entry', function(entry){

      var bar = new ProgressBar('Processing Alexa Top 1M [:bar] :percent :etas', {
        total: 100
      });

      var lineReader = readline.createInterface({
        input: entry
      });

      var x = 0;
      lineReader.on('line', function(line){
        var fields = line.match(csv_regex);
        // Skip blank or malformed lines instead of throwing on a null match.
        if(!fields) return;
        alexa.push(fields[1]);

        // One tick per 10,000 rows: 100 ticks for a 1M-row file.
        if(x % 10000 === 0) bar.tick();
        x++;
      });

      lineReader.on('close', function(){
        finish(null, alexa);
      });

    });
}
52+
53+
/**
 * Read the number of the most recent PR already checked from `config.state_file`.
 *
 * If the file cannot be read it is (re)created containing `0`. If it can be read
 * but does not hold a non-negative integer, 0 is used, so a corrupt file cannot
 * silently disable labelling (every `number > NaN` comparison would be false).
 *
 * @param {string[]} alexa - passed through untouched to the next waterfall step.
 * @param {function(Error=, Array=)} recent_cb - receives `[alexa, most_recent_pr]`.
 */
function get_most_recent_pr(alexa, recent_cb){
  fs.readFile(config.state_file, function(err, data){
    if(err){
      // No readable state yet: start from PR 0 and create the state file.
      return fs.writeFile(config.state_file, '0', function(err){
        if(err) return recent_cb(err);
        recent_cb(null, [alexa, 0]);
      });
    }

    var most_recent_pr = Number(String(data).trim());
    if(!Number.isInteger(most_recent_pr) || most_recent_pr < 0){
      console.log("Ignoring unparseable state file `" + config.state_file + "`, checking all open PRs");
      most_recent_pr = 0;
    }

    recent_cb(null, [alexa, most_recent_pr]);
  });
}
65+
66+
/**
 * Walk every page of open pull requests and, for each PR newer than the last
 * one checked, label it with the Alexa bucket of its best-ranked ruleset target.
 *
 * @param {Array} res - `[alexa, most_recent_pr_checked]`: the array of Alexa
 *   domains (index = 0-based placing) and the highest PR number handled so far.
 * @param {function(Error=)} pr_cb - called at most once, and only on failure;
 *   as in the original flow there is no completion signal on success.
 */
function github_process_prs(res, pr_cb){
  var alexa = res[0],
      most_recent_pr_checked = res[1];

  // Many independent async branches below can fail; async.waterfall throws if
  // its callback is invoked more than once, so only the first error is reported.
  var fail = _.once(pr_cb);

  var github = new GitHubApi();
  var wildcard_www_regex = /^(www|\*)\.(.+)/;
  var rules_path_regex = /^src\/chrome\/content\/rules\//;

  var httpse = {
    user: config.github_user,
    repo: config.github_repo
  };

  // Build a domain -> placing lookup once, instead of scanning the ~1M element
  // array with indexOf for every target. Like indexOf, the first occurrence wins.
  var alexa_placings = new Map();
  _.each(alexa, function(domain, placing){
    if(!alexa_placings.has(domain)) alexa_placings.set(domain, placing);
  });

  github.authenticate({
    type: "oauth",
    token: config.github_token || process.env.GITHUB_TOKEN
  });

  // Rank a list of target hosts, returning the minimum (best) 0-based Alexa
  // placing, or undefined when none of the hosts appear in the list.
  function rank_targets(targets){
    var minimum_placing;

    _.each(targets, function(host){
      if(typeof host !== 'string') return;

      // Strip a leading `www.` or `*.`; the remaining domain is capture group 2
      // (group 1 is only the `www` / `*` prefix).
      var stripped = host.match(wildcard_www_regex);
      var domain = stripped ? stripped[2] : host;

      var alexa_placing = alexa_placings.get(domain);
      if(alexa_placing !== undefined &&
         (minimum_placing === undefined || alexa_placing < minimum_placing)){
        minimum_placing = alexa_placing;
      }
    });

    return minimum_placing;
  }

  // Map a 0-based Alexa placing to its label.
  function label_for_placing(alexa_placing){
    if(alexa_placing < 100) return "top-100";
    if(alexa_placing < 1000) return "top-1k";
    if(alexa_placing < 10000) return "top-10k";
    if(alexa_placing < 100000) return "top-100k";
    return "top-1m";
  }

  // Given the url of an HTTPSE ruleset, return a list of target hosts to fetch_cb
  function fetch_url_and_parse_targets(url, fetch_cb){
    request({url: url}, function(err, response, body){
      if(err) return fetch_cb(err);

      parseXML(body, function(err, root){
        if(err) return fetch_cb(err);

        // Tolerate XML without a <ruleset>, or a ruleset with no <target>s.
        var targets = (root && root.ruleset && root.ruleset.target) || [];
        fetch_cb(null, _.compact(_.map(targets, function(target){
          return target && target.$ && target.$.host;
        })));
      });
    });
  }

  // Fetch the files of one PR, rank every changed ruleset and apply the label
  // for the best placing found (if any).
  function label_pull_request(pull_request){
    // Extend a fresh object: extending `httpse` itself would leak `number` (and
    // later `body`) into every other in-flight request sharing that object.
    github.pullRequests.getFiles(_.extend({}, httpse, {
      number: pull_request.number
    }), function(err, files){
      if(err) return fail(err);

      var file_fetches = [];

      // Out of the list of files for this PR, figure out the minimum Alexa ranking for each
      _.each(files, function(file){
        if(!rules_path_regex.test(file.filename)) return;

        file_fetches.push(function(file_cb){
          fetch_url_and_parse_targets(file.raw_url, function(err, targets){
            if(err) return file_cb(err);

            console.log("Processing PR: " + pull_request.number + ", file: " + file.filename);

            file_cb(null, rank_targets(targets));
          });
        });
      });

      async.parallel(file_fetches, function(err, placings){
        if(err) return fail(err);

        // Keep only ranked files. Placing 0 (the #1 site) is a valid ranking,
        // so filter on type rather than truthiness.
        var ranked = _.filter(placings, _.isNumber);
        if(ranked.length === 0) return;

        var label = label_for_placing(Math.min.apply(null, ranked));
        console.log("Applying label `" + label + "` to PR: " + pull_request.number);

        github.issues.addLabels(_.extend({}, httpse, {
          number: pull_request.number,
          body: [label]
        }), function(err){
          // Best effort: a failed label is logged but does not abort the run.
          if(err) console.log(err);
        });
      });
    });
  }

  // Label all PRs which meet the criteria for labelling
  function github_process_pr_page(first_page){
    return function(err, pull_requests){
      if(err) return fail(err);

      // NOTE(review): the state file is advanced before the PRs on this page
      // have been processed, so a run that fails part-way skips them next time.
      if(first_page && pull_requests.length > 0){
        // fs.writeFile requires string/Buffer data, so stringify the number.
        fs.writeFile(config.state_file, String(pull_requests[0].number), function(err){
          if(err) return fail(err);
        });
      }

      _.each(pull_requests, function(pull_request){
        if(pull_request.number > most_recent_pr_checked){
          label_pull_request(pull_request);
        }
      });

      if(github.hasNextPage(pull_requests)){
        github.getNextPage(pull_requests, github_process_pr_page(false));
      }
    };
  }

  github.pullRequests.getAll(_.extend({}, httpse, {
    state: "open",
    per_page: 100
  }), github_process_pr_page(true));
}
209+
210+
// Entry point: fetch the Alexa list, load the last-checked PR number, then
// label the open PRs. Each step passes its result on to the next.
async.waterfall([
  get_alexa,
  get_most_recent_pr,
  github_process_prs
], function(err, result){
  if(err){
    // Report on stderr and exit non-zero so callers (e.g. `docker run`, cron)
    // can tell a failed run from a successful one.
    console.error(err);
    process.exitCode = 1;
  }
});

utils/labeller/package.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"name": "https-everywhere-labeller",
3+
"version": "1.0.0",
4+
"description": "",
5+
"main": "index.js",
6+
"scripts": {
7+
"test": "echo \"Error: no test specified\" && exit 1"
8+
},
9+
"author": "William Budington",
10+
"license": "GPL-3.0",
11+
"dependencies": {
12+
"async": "^2.0.1",
13+
"github": "^2.5.1",
14+
"lodash": "^4.15.0",
15+
"progress": "^1.1.8",
16+
"request": "^2.74.0",
17+
"unzip": "^0.1.11",
18+
"xml2js": "^0.4.17"
19+
}
20+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
most_recent_pr_checked.state

0 commit comments

Comments
 (0)