Skip to content

Commit cc4a6fc

Browse files
committed
Added ruleset merger script
Merger script added. TODO: fix it to work with the correct tree
1 parent 2048767 commit cc4a6fc

File tree

1 file changed

+63
-0
lines changed

1 file changed

+63
-0
lines changed

utils/merger.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#! /usr/bin/env python3.3
2+
3+
# Copyright 2014 Claudio Moretti
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU Affero General Public License as
6+
# published by the Free Software Foundation, either version 3 of the
7+
# License, or (at your option) any later version.
8+
9+
#
10+
# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py
11+
# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff
12+
#
13+
14+
import csv
import sys
import xml.etree.ElementTree as etree
16+
17+
# Variables and constants
# Host names read from the Alexa list; filled in by the CSV loading code below.
sitesList = []

# Functions
def ruleLookup(target):
    """Tell whether *target* is one of the host names stored in sitesList.

    Returns 1 when target is present in sitesList and 0 otherwise. The
    integer result (rather than a bool) is kept because callers compare
    the return value against 1.
    """
    # A membership test avoids using list.index() plus a catch-all
    # ``except`` (which also swallowed unrelated errors such as
    # KeyboardInterrupt) just to detect "not found".
    return 1 if target in sitesList else 0
27+
28+
# Handles reading the Alexa Top 1M and pushing all sites in a list
29+
# Read the Alexa Top 1M CSV and push the site of every row into sitesList.
# Each row is expected to hold at least two columns, the site being the
# second one.
sitesFile = 'top-1m.csv'
# newline='' is what the csv module asks for when it is handed a file object;
# the with-block guarantees the file is closed even if parsing fails.
with open(sitesFile, newline='') as sitesHandle:
    sitesReader = csv.reader(sitesHandle, delimiter=',', quotechar='"')
    try:
        # csv.Error is raised while iterating the reader, so the try has to
        # wrap the loop itself and not only its body.
        for row in sitesReader:
            if len(row) < 2:
                # Blank or malformed line: there is no site column to read.
                continue
            # Since some Alexa sites are not FQDNs, split where there's a "/"
            # and keep only the first part
            sitesList.append(row[1].split("/", 1)[0])
    except csv.Error as e:
        sys.exit('file %s, line %d: %s' % (sitesFile, sitesReader.line_num, e))
37+
38+
# TODO: Somebody needs to write a function that generates a diff between the STABLE and UNSTABLE branches
39+
# I'll go manually with `git diff --name-status master..remotes/origin/stable src/chrome/content/rules` and call the file "newRules.diff"
40+
# Scan newRules.diff and report every added rule file that has at least one
# <target host="..."> known to ruleLookup().
# Each useful line is expected to look like "<status> <path>".
with open('newRules.diff', 'r') as rulesList:
    for line in rulesList:
        # Split into "file mode in commit" + "file path". Splitting only once
        # keeps paths that contain whitespace in one piece.
        ruleFile = line.strip().split(None, 1)
        # Skip blank/short lines and anything that is not an "A" (added) entry.
        if len(ruleFile) < 2 or ruleFile[0] != "A":
            continue
        try:
            ruleText = etree.parse(ruleFile[1])
        except FileNotFoundError:
            # There are some problems with file name encoding. So, for now,
            # just print an error and move on to the next entry.
            print("File not found:", ruleFile[1])
            continue
        # The host attribute of each <target> is looked up in the sites list;
        # any() stops at the first hit, like the original break did.
        found = any(
            ruleLookup(target.get('host')) == 1
            for target in ruleText.findall('target')
        )
        # If found, print it; else ignore
        if found:
            print("FOUND: ", ruleFile[1])
62+
63+

0 commit comments

Comments
 (0)