Skip to content
This repository was archived by the owner on Nov 4, 2020. It is now read-only.

Commit cf80a67

Browse files
committed
Alexa ruleset checker working
The ruleset checker seems to be working: it downloads and unzips the Alexa Top1M and automatically generates the git diff. Comparing the two seems to be working as well. Manually checked some rules: they seem to have been correctly identified as in the Top 1M and not in stable
1 parent f71fe37 commit cf80a67

1 file changed

Lines changed: 78 additions & 10 deletions

File tree

utils/alexa-ruleset-checker.py

Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,61 @@
11
#! /usr/bin/env python3.3
22

3-
# Copyright 2014 Claudio Moretti
3+
# Copyright 2014 Claudio Moretti <flyingstar16@gmail.com>
44
# This program is free software: you can redistribute it and/or modify
55
# it under the terms of the GNU Affero General Public License as
66
# published by the Free Software Foundation, either version 3 of the
77
# License, or (at your option) any later version.
88

99
#
10-
# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py
11-
# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff
10+
# This little piece of software works by downloading the Alexa Top 1M website list, which is freely available,
11+
# then it uses `git diff` to generate a list of XML ruleset files that are in the master branch but not in stable.
12+
# Finally, it compares the two and prints the file name and path of every ruleset file that
13+
# a) is in master but not in stable and
14+
# b) has a target in the Alexa Top1M list
1215
#
1316

17+
import sys
1418
import csv
1519
import xml.etree.ElementTree as etree
1620
import subprocess
1721
import random
22+
import urllib.request
23+
import urllib.error
24+
import zipfile
25+
import os
26+
import time
1827

1928
# Variables and constants
2029
sitesList = []
30+
31+
# Temporary file containing the `git diff` between master and stable
2132
tmpRulesFileName = "/tmp/rulesDiff-" + format(random.randrange(1,65535)) # Feel free to enlarge if needed
2233

34+
# URL of the Alexa Top1M
35+
alexaTop1MURL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
36+
# alexaTop1MURL = "http://127.0.0.1/top-1m.csv.zip"
37+
38+
# Temporary file name, to avoid conflicts
39+
tmpAlexaFileName = "/tmp/alexa-top1M-" + format(random.randrange(1,65535)) + ".csv"
40+
41+
# Logfile. Records the same output as the script (FOUND and "File not found" messages)
42+
logFileName = "/tmp/alexa-ruleset-log-" + format(random.randrange(1,65535)) + ".log"
43+
44+
# Filename of the CSV file contained in the Alexa zipfile
45+
tmpAlexaZipFileContents = 'top-1m.csv'
46+
47+
# Absolute path of the git repo (the folder containing src/)
48+
# Remember to change this according to your system, if you ever move the script
49+
#
50+
# By default, it refers to the parent directory of the one containing the script
51+
# because the script was put in utils/
52+
#
53+
# __NEEDS A TRAILING SLASH__
54+
#
55+
# gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir))
56+
gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) + "/"
57+
58+
2359
# Functions
2460
def ruleLookup(target):
2561
try: # list.index(value) throws an exception for a "not found", so if it throws, the value is not in the list
@@ -28,36 +64,63 @@ def ruleLookup(target):
2864
except:
2965
return 0
3066

31-
# Fetch the Alexa Top 1M
67+
# Fetch the Alexa Top 1M - http://stackoverflow.com/questions/1517616/stream-large-binary-files-with-urllib2-to-file
68+
# Download the Alexa Top 1M archive. urlretrieve() chooses the temporary
# location itself and returns that path together with the response headers.
try:
    print("Retrieving Alexa Top1M from", alexaTop1MURL)
    tmpAlexaZipFileName, headers = urllib.request.urlretrieve(alexaTop1MURL)
except urllib.error.URLError as downloadError:
    # Without the list there is nothing to compare against, so stop here.
    print("Failed to download Alexa Top 1M")
    sys.exit('Error message: {}'.format(downloadError))
else:
    print("File downloaded and stored in {}".format(tmpAlexaZipFileName))
75+
76+
# Now unzip it
77+
# Unpack the downloaded archive into /tmp/; the extracted CSV is renamed
# to its randomised name in the next step.
try:
    print("Start extracting %s" % tmpAlexaZipFileName)
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(tmpAlexaZipFileName, 'r') as tmpAlexaZipFile:
        tmpAlexaZipFile.extractall('/tmp/')
except zipfile.BadZipfile:
    # sys.exit() takes a single argument, so the message has to be
    # formatted before it is passed in (two arguments raise TypeError).
    sys.exit("The zip file %s is corrupted." % tmpAlexaZipFileName)
3284

85+
# Path of the CSV as it was extracted from the archive into /tmp/.
extractedAlexaCsvPath = '/tmp/' + tmpAlexaZipFileContents

try:
    # Rename the file to match the file with the random in it
    os.rename(extractedAlexaCsvPath, tmpAlexaFileName)
    print("Alexa Top1M retrieved and stored in %s" % tmpAlexaFileName)
except OSError as e:
    # Report the path that was actually used, not a hard-coded guess.
    print("Failed to rename %s to %s." % (extractedAlexaCsvPath, tmpAlexaFileName))
    sys.exit('Error message: %s' % (e))
3392

3493
# Handles reading the Alexa Top 1M and pushing all sites in a list
35-
sitesReader = csv.reader(open('top-1m.csv'), delimiter=',', quotechar='"')
94+
# Read the Alexa CSV and push the second column of every row into sitesList.
# newline='' is what the csv module asks for when it is given a file object;
# the with-block makes sure the file is closed once reading is done.
with open(tmpAlexaFileName, newline='') as alexaCsvFile:
    sitesReader = csv.reader(alexaCsvFile, delimiter=',', quotechar='"')
    try:
        # csv.Error is raised while the reader is being iterated, so the
        # whole loop (not just the append) has to sit inside the try.
        for row in sitesReader:
            # Blank or truncated rows have no second column; skip them
            # instead of failing with an IndexError.
            if len(row) < 2:
                continue
            # Since some Alexa sites are not FQDNs, split where there's a "/" and keep only the first part
            sitesList.append(row[1].split("/", 1)[0])
    except csv.Error as e:
        sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e))
43101

44102
# `git diff` the master revision against stable, rules folder only
45103
try:
104+
print("Create git diff between master and stable in %s" % tmpRulesFileName)
46105
tmpRulesFile = open(tmpRulesFileName,"w")
47-
subprocess.call(['git', 'diff', '--name-status', 'master..remotes/origin/stable', '../src/chrome/content/rules'], stdout=tmpRulesFile)
106+
#subprocess.call(['git', 'diff', '--name-status', 'master..remotes/origin/stable', '../src/chrome/content/rules'], stdout=tmpRulesFile)
107+
subprocess.call(['git', 'diff', '--name-status', 'remotes/origin/stable..master', '../src/chrome/content/rules'], stdout=tmpRulesFile)
48108
tmpRulesFile.close()
49109
except OSError as e:
50110
sys.exit('An OSError exception was raised: %s' % (e))
51111

52112
rulesList = open(tmpRulesFileName, 'r')
113+
logFile = open(logFileName,'w')
114+
logFile.write("Log file generated on %s.\nPaths are relative to the root directory of the git repo.\n\n" % time.strftime("%Y-%m-%d %H:%M:%S"))
115+
53116
for line in rulesList:
54117
try:
55118
# Split into "file mode in commit + file path"
56119
ruleFile = line.split()
57120
found = 0
58121
# If file mode is "A" (add)
59122
if ruleFile[0] == "A": # If file was "added", parse
60-
ruleText = etree.parse(ruleFile[1])
123+
ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../' IF YOU MOVE THE SCRIPT
61124
for target in ruleText.findall('target'):
62125
FQDN = target.get('host') # URL of the website
63126
if ruleLookup(FQDN) == 1: # Look it up in the sitesList
@@ -66,11 +129,16 @@ def ruleLookup(target):
66129
# If found, print it
67130
if found == 1:
68131
print("FOUND: ", ruleFile[1])
132+
logFile.write("FOUND: %s\n" % ruleFile[1])
69133
# else ignore
70134
# There are some problems with file name encoding. So, for now, just print an error and pass
71-
except FileNotFoundError: # Won't happen before line.split() is invoked
135+
except FileNotFoundError as e: # Won't happen before line.split() is invoked
72136
print("File not found:", ruleFile[1])
137+
# logFile.write ("File not found: %s\n" % ruleFile[1])
138+
logFile.write("%s\n" % e)
73139
pass
74140

75141
# Close the rules file
76142
rulesList.close()
143+
# And the log file
144+
logFile.close()

0 commit comments

Comments
 (0)