11#! /usr/bin/env python3.3
22
3- # Copyright 2014 Claudio Moretti
3+ # Copyright 2014 Claudio Moretti <flyingstar16@gmail.com>
44# This program is free software: you can redistribute it and/or modify
55# it under the terms of the GNU Affero General Public License as
66# published by the Free Software Foundation, either version 3 of the
77# License, or (at your option) any later version.
88
99#
10- # You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py
11- # git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff
10+ # This little piece of software works by downloading the Alexa Top 1M website list, which is freely available,
11+ # then it uses `git diff` to generate a list of XML ruleset files that are in the master branch but not in stable.
12+ # Finally, it compares the two and prints the file name and path of every ruleset file that
13+ # a) is in master but not in stable and
14+ # b) has a target in the Alexa Top1M list
1215#
1316
17+ import sys
1418import csv
1519import xml .etree .ElementTree as etree
1620import subprocess
1721import random
22+ import urllib .request
23+ import urllib .error
24+ import zipfile
25+ import os
26+ import time
1827
# Variables and constants

# Host names taken from the Alexa Top 1M list; filled while the CSV is read
sitesList = []

# Temporary file containing the `git diff` between master and stable
tmpRulesFileName = "/tmp/rulesDiff-" + format(random.randrange(1, 65535))  # Feel free to enlarge if needed

# URL of the Alexa Top1M
# (for offline testing, point this at a local mirror of the same zip file)
alexaTop1MURL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"

# Temporary file name, to avoid conflicts
tmpAlexaFileName = "/tmp/alexa-top1M-" + format(random.randrange(1, 65535)) + ".csv"

# Logfile. Records the same output as the script (FOUND and "File not found" messages)
logFileName = "/tmp/alexa-ruleset-log-" + format(random.randrange(1, 65535)) + ".log"

# Filename of the CSV file contained in the Alexa zipfile
tmpAlexaZipFileContents = 'top-1m.csv'

# Absolute path of the git repo (the folder containing src/), WITH a trailing slash.
#
# It is the parent directory of the one containing this script, because the
# script lives in utils/. It is derived from the script's own location rather
# than from the current working directory, so it stays correct no matter where
# the script is launched from. Adjust it if the script is ever moved to a
# different depth inside the repository.
try:
    _scriptDirectory = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined (e.g. code pasted into an interactive session):
    # fall back to the working directory, as the script historically did
    _scriptDirectory = os.path.abspath(os.curdir)
# Joining with "" appends the trailing separator only when it is missing
gitRepositoryPath = os.path.join(os.path.dirname(_scriptDirectory), "")
57+
58+
2359# Functions
2460def ruleLookup (target ):
2561 try : # list.index(value) throws an exception for a "not found", so if it throws it, it's not found
@@ -28,36 +64,63 @@ def ruleLookup(target):
2864 except :
2965 return 0
3066
# Download the Alexa Top 1M archive; urlretrieve() chooses the temporary file itself
# and hands back its name together with the response headers.
# Background: http://stackoverflow.com/questions/1517616/stream-large-binary-files-with-urllib2-to-file
print("Retrieving Alexa Top1M from", alexaTop1MURL)
try:
    tmpAlexaZipFileName, headers = urllib.request.urlretrieve(alexaTop1MURL)
except urllib.error.URLError as downloadError:
    # Nothing else can be done without the list: report and stop
    print("Failed to download Alexa Top 1M")
    sys.exit('Error message: %s' % downloadError)
else:
    print("File downloaded and stored in %s" % tmpAlexaZipFileName)
75+
# Now unzip it: extract the downloaded archive in /tmp/
print("Start extracting %s" % tmpAlexaZipFileName)
try:
    # The context manager closes the archive even if extraction fails
    with zipfile.ZipFile(tmpAlexaZipFileName, 'r') as tmpAlexaZipFile:
        tmpAlexaZipFile.extractall('/tmp/')
except zipfile.BadZipFile:
    # sys.exit() accepts a single argument, so the message has to be
    # formatted with % before it is handed over
    sys.exit("The zip file %s is corrupted." % tmpAlexaZipFileName)
3284
# Rename the extracted CSV to the file name carrying the random suffix,
# so that it cannot clash with another copy of top-1m.csv in /tmp/
extractedAlexaFileName = '/tmp/' + tmpAlexaZipFileContents
try:
    os.rename(extractedAlexaFileName, tmpAlexaFileName)
except OSError as e:
    # Report the path that was actually used instead of a hard-coded one
    print("Failed to rename %s to %s." % (extractedAlexaFileName, tmpAlexaFileName))
    sys.exit('Error message: %s' % (e))
else:
    print("Alexa Top1M retrieved and stored in %s" % tmpAlexaFileName)
3392
# Handles reading the Alexa Top 1M and pushing all sites in a list.
# Each row holds the site in its second column.
try:
    # newline='' is what the csv module expects of the file objects it reads;
    # the with-block closes the file once the list has been loaded
    with open(tmpAlexaFileName, newline='') as alexaFile:
        sitesReader = csv.reader(alexaFile, delimiter=',', quotechar='"')
        # csv.Error is raised while iterating the reader, so the whole loop
        # has to sit inside the try block
        for row in sitesReader:
            if len(row) < 2:
                # Blank or truncated line: there is no site column to read
                continue
            # Since some Alexa sites are not FQDNs, split where there's a "/" and keep only the first part
            sitesList.append(row[1].split("/", 1)[0])
except csv.Error as e:
    sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e))
43101
# `git diff` the master revision against stable, rules folder only.
# --name-status writes one "<status> <path>" line per changed file; with the
# range stable..master, a file present only in master is reported as "A".
try:
    print("Create git diff between master and stable in %s" % tmpRulesFileName)
    # The with-block closes the output file even if git cannot be started
    with open(tmpRulesFileName, "w") as tmpRulesFile:
        gitDiffStatus = subprocess.call(
            ['git', 'diff', '--name-status', 'remotes/origin/stable..master',
             '../src/chrome/content/rules'],
            stdout=tmpRulesFile)
except OSError as e:
    sys.exit('An OSError exception was raised: %s' % (e))

# A failed `git diff` leaves an empty or partial list behind; stop here
# instead of silently reporting that nothing was found
if gitDiffStatus != 0:
    sys.exit('git diff exited with status %d' % gitDiffStatus)
51111
52112rulesList = open (tmpRulesFileName , 'r' )
113+ logFile = open (logFileName ,'w' )
114+ logFile .write ("Log file generated on %s.\n Paths are relative to the root directory of the git repo.\n \n " % time .strftime ("%Y-%m-%d %H:%M:%S" ))
115+
53116for line in rulesList :
54117 try :
55118 # Split into "file mode in commit + file path"
56119 ruleFile = line .split ()
57120 found = 0
58121 # If file mode is "A" (add)
59122 if ruleFile [0 ] == "A" : # If file was "added", parse
60- ruleText = etree .parse (ruleFile [1 ])
123+ ruleText = etree .parse (gitRepositoryPath + ruleFile [1 ]) # ADJUST FILE PATH (here is '../' IF YOU MOVE THE SCRIPT
61124 for target in ruleText .findall ('target' ):
62125 FQDN = target .get ('host' ) # URL of the website
63126 if ruleLookup (FQDN ) == 1 : # Look it up in the sitesList
@@ -66,11 +129,16 @@ def ruleLookup(target):
66129 # If found, print it
67130 if found == 1 :
68131 print ("FOUND: " , ruleFile [1 ])
132+ logFile .write ("FOUND: %s\n " % ruleFile [1 ])
69133 # else ignore
70134 # There are some problems with file name encoding. So, for now, just print an error and pass
71- except FileNotFoundError : # Won't happen before line.split() is invoked
135+ except FileNotFoundError as e : # Won't happen before line.split() is invoked
72136 print ("File not found:" , ruleFile [1 ])
137+ # logFile.write ("File not found: %s\n" % ruleFile[1])
138+ logFile .write ("%s\n " % e )
73139 pass
74140
75141# Close the rules file
76142rulesList .close ()
143+ # And the log file
144+ logFile .close ()
0 commit comments