@@ -121,69 +121,78 @@ def queue_result(self, result, details, fname, url, https_url=None):
121121 res ["https_url" ] = https_url
122122 self .resQueue .put (res )
123123
124- def processUrl (self , plainUrl , task ):
124+ def fetchUrl (self , plainUrl , transformedUrl , fetcherPlain , fetcherRewriting , ruleFname ):
125+ logging .debug ("=**= Start %s => %s ****" , plainUrl , transformedUrl )
126+ logging .debug ("Fetching transformed page %s" , transformedUrl )
127+ transformedRcode , transformedPage = fetcherRewriting .fetchHtml (transformedUrl )
128+ logging .debug ("Fetching plain page %s" , plainUrl )
129+ # If we get an exception (e.g. connection refused,
130+ # connection timeout) on the plain page, don't treat
131+ # that as a failure.
132+ plainRcode , plainPage = None , None
125133 try :
126- transformedUrl = task . ruleset . apply (plainUrl )
134+ plainRcode , plainPage = fetcherPlain . fetchHtml (plainUrl )
127135 except Exception , e :
128- self .queue_result ("regex_error" , str (e ), task .ruleFname , plainUrl )
129- logging .error ("%s: Regex Error %s" % (task .ruleFname , str (e )))
130- return
136+ logging .debug ("Non-fatal fetch error for plain page %s: %s" % (plainUrl , e ))
137+
138+ # Compare HTTP return codes - if original page returned 2xx,
139+ # but the transformed didn't, consider it an error in ruleset
140+ # (note this is not symmetric, we don't care if orig page is broken).
141+ # We don't handle 1xx codes for now.
142+ if plainRcode and plainRcode // 100 == 2 and transformedRcode // 100 != 2 :
143+ message = "Non-2xx HTTP code: %s (%d) => %s (%d)" % (
144+ plainUrl , plainRcode , transformedUrl , transformedRcode )
145+ self .queue_result ("error" , "non-2xx http code" , ruleFname , plainUrl , https_url = transformedUrl )
146+ logging .debug (message )
147+ return message
148+
149+ # If the plain page fetch got an exception, we don't
150+ # need to do the distance comparison. Intuitively, if a
151+ # plain page is fetchable people expect it to have the
152+ # same content as the HTTPS page. But if the plain page
153+ # is unreachable, there's nothing to compare to.
154+ if plainPage :
155+ distance = self .metric .distanceNormed (plainPage , transformedPage )
131156
157+ logging .debug ("==== D: %0.4f; %s (%d) -> %s (%d) =====" ,
158+ distance , plainUrl , len (plainPage ), transformedUrl , len (transformedPage ))
159+ if distance >= self .thresholdDistance :
160+ logging .info ("Big distance %0.4f: %s (%d) -> %s (%d). Rulefile: %s =====" ,
161+ distance , plainUrl , len (plainPage ), transformedUrl , len (transformedPage ), ruleFname )
162+
163+ self .queue_result ("success" , "" , ruleFname , plainUrl )
164+
165+ def processUrl (self , plainUrl , task ):
132166 fetcherPlain = task .fetcherPlain
133167 fetcherRewriting = task .fetcherRewriting
134168 ruleFname = task .ruleFname
135-
169+
170+ try :
171+ transformedUrl = task .ruleset .apply (plainUrl )
172+ except Exception , e :
173+ self .queue_result ("regex_error" , str (e ), ruleFname , plainUrl )
174+ logging .error ("%s: Regex Error %s" % (ruleFname , str (e )))
175+ return
176+
136177 try :
137- logging .debug ("=**= Start %s => %s ****" , plainUrl , transformedUrl )
138- logging .debug ("Fetching transformed page %s" , transformedUrl )
139- transformedRcode , transformedPage = fetcherRewriting .fetchHtml (transformedUrl )
140- logging .debug ("Fetching plain page %s" , plainUrl )
141- # If we get an exception (e.g. connection refused,
142- # connection timeout) on the plain page, don't treat
143- # that as a failure.
144- plainRcode , plainPage = None , None
178+ message = self .fetchUrl (plainUrl , transformedUrl , fetcherPlain , fetcherRewriting , ruleFname )
179+
180+ except :
181+ # Try once more before sending an error result
145182 try :
146- plainRcode , plainPage = fetcherPlain . fetchHtml (plainUrl )
183+ message = self . fetchUrl (plainUrl , transformedUrl , fetcherPlain , fetcherRewriting , ruleFname )
147184 except Exception , e :
148- logging .debug ("Non-fatal fetch error for plain page %s: %s" % (plainUrl , e ))
149-
150- # Compare HTTP return codes - if original page returned 2xx,
151- # but the transformed didn't, consider it an error in ruleset
152- # (note this is not symmetric, we don't care if orig page is broken).
153- # We don't handle 1xx codes for now.
154- if plainRcode and plainRcode // 100 == 2 and transformedRcode // 100 != 2 :
155- message = "Non-2xx HTTP code: %s (%d) => %s (%d)" % (
156- plainUrl , plainRcode , transformedUrl , transformedRcode )
157- self .queue_result ("error" , "non-2xx http code" , task .ruleFname , plainUrl , https_url = transformedUrl )
158- logging .debug (message )
159- return message
160-
161- # If the plain page fetch got an exception, we don't
162- # need to do the distance comparison. Intuitively, if a
163- # plain page is fetchable people expect it to have the
164- # same content as the HTTPS page. But if the plain page
165- # is unreachable, there's nothing to compare to.
166- if plainPage :
167- distance = self .metric .distanceNormed (plainPage , transformedPage )
168-
169- logging .debug ("==== D: %0.4f; %s (%d) -> %s (%d) =====" ,
170- distance , plainUrl , len (plainPage ), transformedUrl , len (transformedPage ))
171- if distance >= self .thresholdDistance :
172- logging .info ("Big distance %0.4f: %s (%d) -> %s (%d). Rulefile: %s =====" ,
173- distance , plainUrl , len (plainPage ), transformedUrl , len (transformedPage ), ruleFname )
174-
175- self .queue_result ("success" , "" , task .ruleFname , plainUrl )
185+ message = "Fetch error: %s => %s: %s" % (
186+ plainUrl , transformedUrl , e )
187+ self .queue_result ("error" , "fetch-error %s" % e , ruleFname , plainUrl , https_url = transformedUrl )
188+ logging .debug (message )
176189
177- except Exception , e :
178- message = "Fetch error: %s => %s: %s" % (
179- plainUrl , transformedUrl , e )
180- self .queue_result ("error" , "fetch-error %s" % e , task .ruleFname , plainUrl , https_url = transformedUrl )
181- logging .debug (message )
182- return message
183190 finally :
184191 logging .info ("Finished comparing %s -> %s. Rulefile: %s." ,
185192 plainUrl , transformedUrl , ruleFname )
186193
194+ return message
195+
187196def disableRuleset (ruleset , problems ):
188197 logging .info ("Disabling ruleset %s" , ruleset .filename )
189198 contents = open (ruleset .filename ).read ()
0 commit comments