|
| 1 | +#! /usr/bin/env python3.3 |
| 2 | + |
| 3 | +# Copyright 2014 Claudio Moretti <flyingstar16@gmail.com> |
| 4 | +# This program is free software: you can redistribute it and/or modify |
| 5 | +# it under the terms of the GNU Affero General Public License as |
| 6 | +# published by the Free Software Foundation, either version 3 of the |
| 7 | +# License, or (at your option) any later version. |
| 8 | + |
| 9 | +# |
| 10 | +# This little piece of software works by downloading the Alexa Top 1M website list, which is freely available, |
| 11 | +# then it uses `git diff` to generate a list of XML ruleset files that are in the master branch but not in stable. |
| 12 | +# Finally, it compares the two and prints the file name and path of every ruleset file that |
| 13 | +# a) is in master but not in stable and |
| 14 | +# b) has a target in the Alexa Top1M list |
| 15 | +# |
| 16 | + |
import csv
import os
import random
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as etree
import zipfile
| 27 | + |
# Variables and constants

# Host names taken from the Alexa list; filled in further down the script.
sitesList = []


def _reserveTempFile(prefix, suffix=""):
    """Create an empty, uniquely named file in /tmp/ and return its path.

    The name is chosen by tempfile.mkstemp(), which creates the file itself,
    so two runs of the script cannot end up with the same path and the name
    cannot be guessed in advance (unlike a small random number appended to a
    fixed prefix). The descriptor is closed right away because the rest of
    the script re-opens or replaces the file by path.
    """
    fileDescriptor, path = tempfile.mkstemp(prefix=prefix, suffix=suffix, dir="/tmp")
    os.close(fileDescriptor)
    return path


# Temporary file containing the `git diff` between master and stable
tmpRulesFileName = _reserveTempFile("rulesDiff-")

# URL of the Alexa Top1M
# alexaTop1MURL = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
# NOTE: the active value points at a copy served from 127.0.0.1; swap it for
# the line above to download from the public location.
alexaTop1MURL = "http://127.0.0.1/top-1m.csv.zip"

# Temporary file name, to avoid conflicts between concurrent runs
tmpAlexaFileName = _reserveTempFile("alexa-top1M-", ".csv")

# Logfile. Records the same output as the script
logFileName = _reserveTempFile("alexa-ruleset-log-", ".log")

# Filename of the CSV file contained in the Alexa zipfile
tmpAlexaZipFileContents = 'top-1m.csv'

# Absolute path of the git repo (the folder containing src/)
# Remember to change this accordingly to your system, if you ever move the script
#
# By default, it refers to the parent of the CURRENT WORKING DIRECTORY, so the
# script has to be started from inside utils/ (where it was put) for the
# default to point at the repository root.
#
# __NEEDS A TRAILING SLASH__
#
gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) + "/"

# Maximum number of websites to use in the Alexa Top 1M (i.e. it's no longer 1M but maxSitesNumber)
# Set to -1 for 'unlimited'
maxSitesNumber = 1000
| 61 | + |
| 62 | +# Functions |
def ruleLookup(target, sites=None):
    """Tell whether *target* is one of the known site host names.

    target -- host name to look for (may be None, which is never found).
    sites  -- optional collection to search; when omitted, the module-level
              sitesList is used, which is what existing callers rely on.

    Returns 1 when the host name is present and 0 otherwise.
    """
    if sites is None:
        sites = sitesList
    # A plain membership test replaces list.index() wrapped in a bare
    # `except:`, which also hid unrelated errors such as NameError.
    return 1 if target in sites else 0
| 69 | + |
# Download the Alexa Top 1M archive.
# (see http://stackoverflow.com/questions/1517616/stream-large-binary-files-with-urllib2-to-file)
# No target file name is passed to urlretrieve(), so the location of the
# downloaded file is whatever comes back as the first element of its result.
print("Retrieving Alexa Top1M from", alexaTop1MURL)
try:
    tmpAlexaZipFileName, headers = urllib.request.urlretrieve(alexaTop1MURL)
except urllib.error.URLError as e:
    # Nothing can be done without the list: report and stop.
    print("Failed to download Alexa Top 1M")
    sys.exit('Error message: %s' % e)
else:
    print("File downloaded and stored in %s" % tmpAlexaZipFileName)
| 78 | + |
# Now unzip it.
# Only the expected CSV member is extracted (into /tmp/): the archive comes
# from the network, so its other members, if any, are not written to disk.
print("Start extracting %s" % tmpAlexaZipFileName)
try:
    # The context manager closes the archive even when extraction fails.
    with zipfile.ZipFile(tmpAlexaZipFileName, 'r') as tmpAlexaZipFile:
        # extract() returns the path of the file it created.
        extractedAlexaFileName = tmpAlexaZipFile.extract(tmpAlexaZipFileContents, '/tmp/')
except zipfile.BadZipFile:
    # The message must be formatted here: sys.exit() accepts one argument only.
    sys.exit("The zip file %s is corrupted." % tmpAlexaZipFileName)
except KeyError:
    # Raised by extract() when the archive has no member with that name.
    sys.exit("The zip file %s does not contain %s." % (tmpAlexaZipFileName, tmpAlexaZipFileContents))

try:
    # Rename the file to match the per-run temporary file name
    os.rename(extractedAlexaFileName, tmpAlexaFileName)
    print("Alexa Top1M retrieved and stored in %s" % tmpAlexaFileName)
except OSError as e:
    print("Failed to rename %s to %s." % (extractedAlexaFileName, tmpAlexaFileName))
    sys.exit('Error message: %s' % (e))
| 95 | + |
# Handles reading the Alexa Top 1M and pushing all sites in a list.
# Each row is read as "<something>,<site>"; only the second column is used.
try:
    # newline='' is what the csv module asks for when it is given a file
    # object; the `with` block closes the file once the list is loaded.
    with open(tmpAlexaFileName, newline='') as sitesFile:
        sitesReader = csv.reader(sitesFile, delimiter=',', quotechar='"')
        for row in sitesReader:
            # Blank or truncated lines have no second column: skip them
            # instead of failing with an IndexError.
            if len(row) >= 2:
                # Since some Alexa sites are not FQDNs, split where there's
                # a "/" and keep only the first part
                sitesList.append(row[1].split("/", 1)[0])
            # maxSitesNumber == -1 never matches a line number, i.e. no limit
            if sitesReader.line_num == maxSitesNumber:
                break
except csv.Error as e:
    # csv.Error is raised while the reader advances, so the handler has to
    # wrap the loop itself rather than the loop body.
    sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e))
| 107 | + |
# `git diff` the master revision against stable, rules folder only.
# The range stable..master makes --name-status describe each file as seen
# when going from stable to master ("A" = added, "M" = modified).
print("Create git diff between master and stable in %s" % tmpRulesFileName)
try:
    # `with` guarantees the output file is closed even if git cannot be run.
    with open(tmpRulesFileName, "w") as tmpRulesFile:
        # Run git from the repository root so the rules path no longer
        # depends on the directory the script was started from; "--" keeps
        # git from mistaking the path for a revision.
        gitExitCode = subprocess.call(
            ['git', 'diff', '--name-status', 'remotes/origin/stable..master',
             '--', 'src/chrome/content/rules'],
            stdout=tmpRulesFile, cwd=gitRepositoryPath)
except OSError as e:
    sys.exit('An OSError exception was raised: %s' % (e))

# A failed diff leaves an empty or partial file; stop instead of silently
# reporting that no ruleset changed.
if gitExitCode != 0:
    sys.exit('git diff exited with status %d' % gitExitCode)
| 117 | + |
# Walk the `git diff --name-status` listing and report every ruleset file that
# was added or modified AND has at least one <target host="..."> known to
# ruleLookup(). Everything printed for a match is also written to the log.

# Let's keep track of how many rules were parsed, added and modified
countParsedRules = 0
countAddedRules = 0
countEditedRules = 0

# Both files are closed automatically when the `with` block ends.
with open(tmpRulesFileName, 'r') as rulesList, open(logFileName, 'w') as logFile:
    logFile.write("Log file generated on %s.\nPaths are relative to the root directory of the git repo.\n\n" % time.strftime("%Y-%m-%d %H:%M:%S"))

    for line in rulesList:
        # Each line is "<file mode>\t<file path>". Splitting on the first
        # tab only keeps paths containing spaces in one piece; blank lines
        # simply yield an empty mode and are skipped below.
        fileMode, _, rulePath = line.rstrip("\n").partition("\t")

        # Only "A" (added) and "M" (edited) entries are of interest
        if fileMode not in ("A", "M") or not rulePath:
            continue

        try:
            # Given a path, etree.parse() opens and closes the file itself
            ruleText = etree.parse(os.path.join(gitRepositoryPath, rulePath))
        except OSError as e:
            # There are some problems with file name encoding, so a listed
            # file may not be found under the name git printed. Report it
            # and carry on with the next entry.
            print("File not found:", rulePath)
            logFile.write("%s\n" % e)
            continue
        except etree.ParseError as e:
            # One malformed ruleset should not abort the whole report
            print("Could not parse:", rulePath)
            logFile.write("Could not parse %s: %s\n" % (rulePath, e))
            continue

        countParsedRules += 1

        # One matching target is enough to report the file
        if not any(ruleLookup(target.get('host')) == 1
                   for target in ruleText.findall('target')):
            continue

        # Message different according to file mode
        if fileMode == "A":
            found = "NEW"
            countAddedRules += 1
        else:
            found = "EDITED"
            countEditedRules += 1

        # Print it TABULATED; the log gets the same text, one entry per line
        print("%s:\t%s" % (found, rulePath))
        logFile.write("%s:\t%s\n" % (found, rulePath))

    # Print our simple statistics. "Parsed rules" is the number of ruleset
    # files actually parsed, not the cap on the number of sites.
    statistics = "\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (countParsedRules, countAddedRules, countEditedRules)
    print(statistics)
    logFile.write(statistics + "\n")

print("\n\nLog file can be found at %s" % logFileName)
0 commit comments