Skip to content

Commit 6f56e58

Browse files
committed
Added rule limit and recognition of edited rules
Rule limit was implemented using csvReader.max_lines and breaking the loop when it's hit. Implemented recognition of edited rules via the "M" flag of git diff. Output was tidied up a bit to account for the different wording (used tabulation).
1 parent 802e1a0 commit 6f56e58

File tree

1 file changed

+36
-10
lines changed

1 file changed

+36
-10
lines changed

utils/alexa-ruleset-checker.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
# Temporary file name, to avoid conflicts
3939
tmpAlexaFileName = "/tmp/alexa-top1M-" + format(random.randrange(1,65535)) + ".csv"
4040

41-
# Logfile. Records the same output as the script (FOUND and "File not found" messages)
41+
# Logfile. Records the same output as the script
4242
logFileName = "/tmp/alexa-ruleset-log-" + format(random.randrange(1,65535)) + ".log"
4343

4444
# Filename of the CSV file contained in the Alexa zipfile
@@ -55,6 +55,9 @@
5555
# gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir))
5656
gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir)) + "/"
5757

58+
# Maximum number of websites to use in the Alexa Top 1M (i.e. it's no longer 1M but maxSitesNumber)
59+
# Set to -1 for 'unlimited'
60+
maxSitesNumber = 1000
5861

5962
# Functions
6063
def ruleLookup(target):
@@ -96,6 +99,9 @@ def ruleLookup(target):
9699
try:
97100
# Since some Alexa sites are not FQDNs, split where there's a "/" and keep only the first part
98101
siteFQDN = sitesList.append(row[1].split("/",1)[0])
102+
# print("Line %s: %s" % (sitesReader.line_num, sitesList[len(sitesList) - 1])) # Outputs the current line
103+
if sitesReader.line_num == maxSitesNumber:
104+
break
99105
except csv.Error as e:
100106
sys.exit('file %s, line %d: %s' % (tmpAlexaFileName, sitesReader.line_num, e))
101107

@@ -113,23 +119,38 @@ def ruleLookup(target):
113119
logFile = open(logFileName,'w')
114120
logFile.write("Log file generated on %s.\nPaths are relative to the root directory of the git repo.\n\n" % time.strftime("%Y-%m-%d %H:%M:%S"))
115121

122+
# Let's keep track of how many rules were added and how many were modified
123+
# Must be declared here or won't be available at the end of the loop
124+
countAddedRules = 0
125+
countEditedRules = 0
126+
127+
# Start parsing the list
116128
for line in rulesList:
117129
try:
118130
# Split into "file mode in commit + file path"
119131
ruleFile = line.split()
120132
found = 0
121-
# If file mode is "A" (add)
122-
if ruleFile[0] == "A": # If file was "added", parse
123-
ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../' IF YOU MOVE THE SCRIPT
133+
# If file mode is "A" (add) or "M" (edited)
134+
if ruleFile[0] == "A" or ruleFile[0] == "M": # If file was added or edited between stable and master, parse
135+
ruleText = etree.parse(gitRepositoryPath + ruleFile[1]) # ADJUST FILE PATH (here is '../') IF YOU MOVE THE SCRIPT - XXX: Obsolete warning?
124136
for target in ruleText.findall('target'):
125137
FQDN = target.get('host') # URL of the website
126138
if ruleLookup(FQDN) == 1: # Look it up in the sitesList
127-
found = 1
128-
break
129-
# If found, print it
130-
if found == 1:
131-
print("FOUND: ", ruleFile[1])
132-
logFile.write("FOUND: %s\n" % ruleFile[1])
139+
# Message different according to file mode
140+
if ruleFile[0] == "A": # New
141+
found = "NEW"
142+
countAddedRules = countAddedRules + 1
143+
break
144+
elif ruleFile[0] == "M": # Edited
145+
found = "EDITED"
146+
countEditedRules = countEditedRules + 1
147+
break
148+
149+
# If found, print it TABULATED
150+
if found != 0:
151+
print("%s:\t%s" % (found, ruleFile[1]))
152+
logFile.write("%s:\t%s" % (found, ruleFile[1]))
153+
133154
# else ignore
134155
# There are some problems with file name encoding. So, for now, just print an error and pass
135156
except FileNotFoundError as e: # Won't happen before line.split() is invoked
@@ -138,6 +159,11 @@ def ruleLookup(target):
138159
logFile.write("%s\n" % e)
139160
pass
140161

162+
# Print our simple statistics
163+
print("\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (maxSitesNumber, countAddedRules, countEditedRules))
164+
logFile.write("\n\nStatistics:\nParsed rules: %s\nNewly added rules: %s\nEdited rules: %d" % (maxSitesNumber, countAddedRules, countEditedRules))
165+
print("\n\nLog file can be found at %s" % logFileName)
166+
141167
# Close the rules file
142168
rulesList.close()
143169
# And the log file

0 commit comments

Comments
 (0)