Skip to content

Commit 23ad914

Browse files
committed
Merge branch 'master' of https://github.com/flyingstar16/https-everywhere into flyingstar16-master
2 parents 7ae2aaf + 421be4b commit 23ad914

1 file changed

Lines changed: 178 additions & 0 deletions

File tree

utils/alexa-ruleset-checker.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#! /usr/bin/env python3.3
2+
3+
# Copyright 2014 Claudio Moretti <flyingstar16@gmail.com>
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU Affero General Public License as
6+
# published by the Free Software Foundation, either version 3 of the
7+
# License, or (at your option) any later version.
8+
9+
#
10+
# This little piece of software works by downloading the Alexa Top 1M website list, which is freely available,
11+
# then it uses `git diff` to generate a list of XML ruleset files that are in the master branch but not in stable.
12+
# Finally, it compares the two and prints the file name and path of every ruleset file that
13+
# a) is in master but not in stable and
14+
# b) has a target in the Alexa Top1M list
15+
#
16+
17+
import sys
18+
import csv
19+
import xml.etree.ElementTree as etree
20+
import subprocess
21+
import random
22+
import urllib.request
23+
import urllib.error
24+
import zipfile
25+
import os
26+
import time
27+
28+
# Variables and constants
29+
# Hostnames taken from the Alexa list; populated by the CSV-reading step further down
sitesList = []

# Scratch file that receives the `git diff` between master and stable.
# The random suffix only lowers the chance of a name clash; widen the range if needed.
tmpRulesFileName = "/tmp/rulesDiff-" + str(random.randrange(1, 65535))

# Location the Alexa Top 1M archive is downloaded from.
# NOTE(review): this is a loopback address; the public location used to be
# "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" -- confirm which one is intended.
alexaTop1MURL = "http://127.0.0.1/top-1m.csv.zip"

# Randomised name for the extracted CSV, to avoid conflicts
tmpAlexaFileName = "/tmp/alexa-top1M-" + str(random.randrange(1, 65535)) + ".csv"

# Log file; records the same results the script prints
logFileName = "/tmp/alexa-ruleset-log-" + str(random.randrange(1, 65535)) + ".log"

# Name of the CSV member stored inside the Alexa zip archive
tmpAlexaZipFileContents = 'top-1m.csv'

# Absolute path of the git repo (the folder containing src/), ending in a slash
# because rule paths are appended to it by plain string concatenation.
# It is computed as the parent of the current working directory, so the script
# presumably expects to be launched from utils/ -- adjust if you move or relocate it.
gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) + "/"

# How many lines of the Alexa list to read (so it is no longer 1M but this many).
# Set to -1 for 'unlimited'.
maxSitesNumber = 1000
61+
62+
# Functions
63+
def ruleLookup(target):
    """Report whether *target* is one of the hostnames in ``sitesList``.

    Returns 1 when the host is present and 0 otherwise; the integer result is
    kept because the caller compares it with ``== 1``.
    """
    # A plain membership test replaces list.index() wrapped in a bare
    # ``except:``, which also swallowed unrelated errors such as
    # KeyboardInterrupt.
    return 1 if target in sitesList else 0
69+
70+
# Fetch the Alexa Top 1M - http://stackoverflow.com/questions/1517616/stream-large-binary-files-with-urllib2-to-file
71+
# urlretrieve() chooses the temporary file name; it is reported once the download succeeds
print("Retrieving Alexa Top1M from", alexaTop1MURL)
try:
    tmpAlexaZipFileName, headers = urllib.request.urlretrieve(alexaTop1MURL)
except urllib.error.URLError as e:
    # Nothing can be checked without the list, so stop here
    print("Failed to download Alexa Top 1M")
    sys.exit('Error message: %s' % e)
else:
    print("File downloaded and stored in %s" % tmpAlexaZipFileName)
78+
79+
# Now unzip it
80+
# Extract every member of the downloaded archive into /tmp/
print("Start extracting %s" % tmpAlexaZipFileName)
try:
    # The context manager closes the archive even if extraction fails
    with zipfile.ZipFile(tmpAlexaZipFileName, 'r') as tmpAlexaZipFile:
        tmpAlexaZipFile.extractall('/tmp/')
except zipfile.BadZipfile:
    # sys.exit() accepts a single argument, so the file name has to be
    # interpolated into the message rather than passed as a second argument
    sys.exit("The zip file %s is corrupted." % tmpAlexaZipFileName)
87+
88+
# Move the extracted CSV to the randomised name so that parallel runs do not clash
extractedAlexaPath = '/tmp/' + tmpAlexaZipFileContents
try:
    os.rename(extractedAlexaPath, tmpAlexaFileName)
except OSError as e:
    # Report the path that was actually used instead of a hard-coded file name
    print("Failed to rename %s to %s." % (extractedAlexaPath, tmpAlexaFileName))
    sys.exit('Error message: %s' % (e))
else:
    print("Alexa Top1M retrieved and stored in %s" % tmpAlexaFileName)
95+
96+
# Handles reading the Alexa Top 1M and pushing all sites in a list
97+
# Read the Alexa CSV (rank,site) and collect the host part of every site.
# The file is opened with newline='' as the csv module recommends, and is
# closed as soon as reading is over.
with open(tmpAlexaFileName, newline='') as alexaFile:
    sitesReader = csv.reader(alexaFile, delimiter=',', quotechar='"')
    try:
        # csv.Error is raised while the reader advances, so the whole loop
        # has to sit inside the try block for the handler to be reachable
        for row in sitesReader:
            if len(row) >= 2:
                # Some Alexa entries are not bare FQDNs: keep only what precedes the first "/"
                sitesList.append(row[1].split("/", 1)[0])
            # A non-positive maxSitesNumber (e.g. -1) means 'unlimited'
            if 0 < maxSitesNumber <= sitesReader.line_num:
                break
    except csv.Error as e:
        sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e))
107+
108+
# `git diff` the master revision against stable, rules folder only
109+
# Write the name/status list of rule files that differ between stable and master
print("Create git diff between master and stable in %s" % tmpRulesFileName)
try:
    # The context manager closes the output file even when git cannot be started
    with open(tmpRulesFileName, "w") as tmpRulesFile:
        gitDiffStatus = subprocess.call(
            ['git', 'diff', '--name-status', 'remotes/origin/stable..master',
             '../src/chrome/content/rules'],
            stdout=tmpRulesFile)
except OSError as e:
    sys.exit('An OSError exception was raised: %s' % (e))

# A failing git would otherwise leave an empty list and a misleading "nothing found" run
if gitDiffStatus != 0:
    sys.exit('git diff exited with status %d' % gitDiffStatus)
117+
118+
rulesList = open(tmpRulesFileName, 'r')
logFile = open(logFileName, 'w')
logFile.write("Log file generated on %s.\nPaths are relative to the root directory of the git repo.\n\n" % time.strftime("%Y-%m-%d %H:%M:%S"))

# Running totals of matching rulesets, reported once the loop is over
countAddedRules = 0
countEditedRules = 0

# Label printed for each git status letter handled here: "A" (added) and "M" (modified)
ruleLabels = {"A": "NEW", "M": "EDITED"}

for line in rulesList:
    # `git diff --name-status` separates status and path with a tab; splitting
    # on the first tab only keeps paths that contain spaces in one piece
    ruleFile = line.rstrip("\n").split("\t", 1)
    if len(ruleFile) < 2 or ruleFile[0] not in ruleLabels:
        continue  # blank line, or a status other than added/modified

    try:
        # Binary mode lets the XML parser apply the encoding declared by the
        # document; the context manager closes the rule file after parsing
        with open(gitRepositoryPath + ruleFile[1], 'rb') as ruleFileObject:
            ruleText = etree.parse(ruleFileObject)
    except (FileNotFoundError, IOError) as e:
        # There are some problems with file name encoding, so report and move on.
        # Any other I/O failure is treated the same as a missing file.
        print("File not found:", ruleFile[1])
        logFile.write("%s\n" % e)
        continue

    # One target host in the Alexa list is enough to flag the whole ruleset
    found = None
    for target in ruleText.findall('target'):
        if ruleLookup(target.get('host')) == 1:
            found = ruleLabels[ruleFile[0]]
            break

    if found is None:
        continue

    if ruleFile[0] == "A":
        countAddedRules = countAddedRules + 1
    else:
        countEditedRules = countEditedRules + 1

    # Print it tabulated; the log gets the same line, newline-terminated
    print("%s:\t%s" % (found, ruleFile[1]))
    logFile.write("%s:\t%s\n" % (found, ruleFile[1]))
167+
168+
169+
170+
# Print our simple statistics
171+
# Format the summary once and send the same text to both the console and the log.
# NOTE(review): the "Parsed rules" figure is maxSitesNumber (the Alexa line limit),
# not a count of rule files -- confirm the intended label.
statisticsText = "\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (maxSitesNumber, countAddedRules, countEditedRules)
print(statisticsText)
logFile.write(statisticsText)
print("\n\nLog file can be found at %s" % logFileName)

# Release the rules list first, then the log file
for openHandle in (rulesList, logFile):
    openHandle.close()

0 commit comments

Comments
 (0)