|
| 1 | +#! /usr/bin/env python3.3 |
| 2 | + |
| 3 | +# Copyright 2014 Claudio Moretti |
| 4 | +# This program is free software: you can redistribute it and/or modify |
| 5 | +# it under the terms of the GNU Affero General Public License as |
| 6 | +# published by the Free Software Foundation, either version 3 of the |
| 7 | +# License, or (at your option) any later version. |
| 8 | + |
| 9 | +# |
| 10 | +# You NEED: 'top-1m.csv' and 'newRules.diff' in the same directory as merger.py |
| 11 | +# git diff --name-status master..remotes/origin/stable src/chrome/content/rules >> newRules.diff |
| 12 | +# |
| 13 | + |
import csv
import sys
import xml.etree.ElementTree as etree
| 16 | + |
# Variables and constants
# Host names collected from the Alexa list; filled in by the CSV-reading code.
sitesList = []


# Functions
def ruleLookup(target):
    """Tell whether *target* is one of the host names in ``sitesList``.

    Args:
        target: Host name to look up (the value of a rule's ``host``
            attribute; may be ``None`` when the attribute is absent).

    Returns:
        1 if *target* is in ``sitesList``, 0 otherwise.
    """
    # A plain membership test replaces list.index() wrapped in a bare
    # "except:", which also swallowed unrelated errors (KeyboardInterrupt...).
    return 1 if target in sitesList else 0
| 27 | + |
# Handles reading the Alexa Top 1M and pushing all sites in a list
topSitesPath = 'top-1m.csv'
# newline='' is what the csv module asks for when it is handed a file object;
# the "with" block closes the file even when parsing fails.
with open(topSitesPath, newline='') as sitesFile:
    sitesReader = csv.reader(sitesFile, delimiter=',', quotechar='"')
    try:
        for row in sitesReader:
            # The site is read from the second column; skip blank or short rows.
            if len(row) < 2:
                continue
            # Since some Alexa sites are not FQDNs, split where there's a "/"
            # and keep only the first part
            sitesList.append(row[1].split("/", 1)[0])
    except csv.Error as e:
        # csv.Error is raised while the reader advances to the next row, so
        # the try has to wrap the whole loop rather than just the append.
        sys.exit('file %s, line %d: %s' % (topSitesPath, sitesReader.line_num, e))
| 37 | + |
# TODO: Somebody needs to write a function that generates a diff from the STABLE and UNSTABLE branch
# Until then, run `git diff --name-status master..remotes/origin/stable src/chrome/content/rules`
# by hand and save its output as "newRules.diff"
#
# For every rule file the diff marks as added, print its path when at least
# one of its <target> hosts is found by ruleLookup().
with open('newRules.diff', 'r') as rulesList:
    for line in rulesList:
        # Split into "file mode in commit" + "file path"; splitting only once
        # keeps paths that contain whitespace in one piece.
        ruleFile = line.strip().split(None, 1)
        # Skip blank/malformed lines and every mode other than "A" (added).
        if len(ruleFile) < 2 or ruleFile[0] != "A":
            continue
        try:
            ruleText = etree.parse(ruleFile[1])
        # There are some problems with file name encoding. So, for now, just
        # print an error and move on to the next file
        except FileNotFoundError:
            print("File not found:", ruleFile[1])
            continue
        # One malformed rule file should not abort the whole run.
        except etree.ParseError as e:
            print("Could not parse:", ruleFile[1], e)
            continue
        for target in ruleText.findall('target'):
            # 'host' holds the site name the rule applies to.
            if ruleLookup(target.get('host')) == 1:
                # One match is enough: report the file once and stop looking.
                print("FOUND: ", ruleFile[1])
                break
| 62 | + |
| 63 | + |
0 commit comments