3838# Temporary file name, to avoid conflicts
3939tmpAlexaFileName = "/tmp/alexa-top1M-" + format (random .randrange (1 ,65535 )) + ".csv"
4040
41- # Logfile. Records the same output as the script (FOUND and "File not found" messages)
41+ # Logfile. Records the same output as the script
4242logFileName = "/tmp/alexa-ruleset-log-" + format (random .randrange (1 ,65535 )) + ".log"
4343
4444# Filename of the CSV file contained in the Alexa zipfile
5555# gitRepositoryPath = os.path.abspath(os.path.join(os.curdir, os.pardir))
5656gitRepositoryPath = os .path .abspath (os .path .join (os .curdir , os .pardir )) + "/"
5757
58+ # Maximum number of websites to use in the Alexa Top 1M (i.e. it's no longer 1M but maxSitesNumber)
59+ # Set to -1 for 'unlimited'
60+ maxSitesNumber = 1000
5861
5962# Functions
6063def ruleLookup (target ):
@@ -96,6 +99,9 @@ def ruleLookup(target):
9699 try :
97100 # Since some Alexa sites are not FQDNs, split where there's a "/" and keep only the first part
98101 siteFQDN = sitesList .append (row [1 ].split ("/" ,1 )[0 ])
102+ # print("Line %s: %s" % (sitesReader.line_num, sitesList[len(sitesList) - 1])) # Outputs the current line
103+ if sitesReader .line_num == maxSitesNumber :
104+ break
99105 except csv .Error as e :
100106 sys .exit ('file %s, line %d: %s' % (tmpAlexaFileName , sitesReader .line_num , e ))
101107
@@ -113,23 +119,38 @@ def ruleLookup(target):
113119logFile = open (logFileName ,'w' )
114120logFile .write ("Log file generated on %s.\n Paths are relative to the root directory of the git repo.\n \n " % time .strftime ("%Y-%m-%d %H:%M:%S" ))
115121
122+ # Let's keep track of how many rules were added and how many were modified
123+ # Must be declared here or won't be available at the end of the loop
124+ countAddedRules = 0
125+ countEditedRules = 0
126+
127+ # Start parsing the list
116128for line in rulesList :
117129 try :
118130 # Split into "file mode in commit + file path"
119131 ruleFile = line .split ()
120132 found = 0
121- # If file mode is "A" (add)
122- if ruleFile [0 ] == "A" : # If file was " added" , parse
123- ruleText = etree .parse (gitRepositoryPath + ruleFile [1 ]) # ADJUST FILE PATH (here is '../' IF YOU MOVE THE SCRIPT
133+ # If file mode is "A" (add) or "M" (edited)
134+ if ruleFile [0 ] == "A" or ruleFile [ 0 ] == "M" : # If file was added or edited between stable and master , parse
135+ ruleText = etree .parse (gitRepositoryPath + ruleFile [1 ]) # ADJUST FILE PATH (here is '../') IF YOU MOVE THE SCRIPT - XXX: Obsolete warning?
124136 for target in ruleText .findall ('target' ):
125137 FQDN = target .get ('host' ) # URL of the website
126138 if ruleLookup (FQDN ) == 1 : # Look it up in the sitesList
127- found = 1
128- break
129- # If found, print it
130- if found == 1 :
131- print ("FOUND: " , ruleFile [1 ])
132- logFile .write ("FOUND: %s\n " % ruleFile [1 ])
139+ # Message different according to file mode
140+ if ruleFile [0 ] == "A" : # New
141+ found = "NEW"
142+ countAddedRules = countAddedRules + 1
143+ break
144+ elif ruleFile [0 ] == "M" : # Edited
145+ found = "EDITED"
146+ countEditedRules = countEditedRules + 1
147+ break
148+
149+ # If found, print it TABULATED
150+ if found != 0 :
151+ print ("%s:\t %s" % (found , ruleFile [1 ]))
152+ logFile .write ("%s:\t %s" % (found , ruleFile [1 ]))
153+
133154 # else ignore
134155 # There are some problems with file name encoding. So, for now, just print an error and pass
135156 except FileNotFoundError as e : # Won't happen before line.split() is invoked
@@ -138,6 +159,11 @@ def ruleLookup(target):
138159 logFile .write ("%s\n " % e )
139160 pass
140161
162+ # Print our simple statistics
163+ print ("\n \n Statistics:\n Parsed rules: %s\n Newly added rules: %s\n Edited rules: %d" % (maxSitesNumber , countAddedRules , countEditedRules ))
164+ logFile .write ("\n \n Statistics:\n Parsed rules: %s\n Newly added rules: %s\n Edited rules: %d" % (maxSitesNumber , countAddedRules , countEditedRules ))
165+ print ("\n \n Log file can be found at %s" % logFileName )
166+
141167# Close the rules file
142168rulesList .close ()
143169# And the log file
0 commit comments