Skip to content

Commit e2e20ff

Browse files
Hainish authored and jsha committed
When a fetch error occurs, try the URL once more before reporting an error (EFForg#5015)
When a fetch error occurs, try the URL once more before reporting an error
1 parent cf53344 commit e2e20ff

File tree

1 file changed

+58
-49
lines changed

1 file changed

+58
-49
lines changed

test/rules/src/https_everywhere_checker/check_rules.py

Lines changed: 58 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -121,69 +121,78 @@ def queue_result(self, result, details, fname, url, https_url=None):
121121
res["https_url"] = https_url
122122
self.resQueue.put(res)
123123

124-
def processUrl(self, plainUrl, task):
124+
def fetchUrl(self, plainUrl, transformedUrl, fetcherPlain, fetcherRewriting, ruleFname):
125+
logging.debug("=**= Start %s => %s ****", plainUrl, transformedUrl)
126+
logging.debug("Fetching transformed page %s", transformedUrl)
127+
transformedRcode, transformedPage = fetcherRewriting.fetchHtml(transformedUrl)
128+
logging.debug("Fetching plain page %s", plainUrl)
129+
# If we get an exception (e.g. connection refused,
130+
# connection timeout) on the plain page, don't treat
131+
# that as a failure.
132+
plainRcode, plainPage = None, None
125133
try:
126-
transformedUrl = task.ruleset.apply(plainUrl)
134+
plainRcode, plainPage = fetcherPlain.fetchHtml(plainUrl)
127135
except Exception, e:
128-
self.queue_result("regex_error", str(e), task.ruleFname, plainUrl)
129-
logging.error("%s: Regex Error %s" % (task.ruleFname, str(e)))
130-
return
136+
logging.debug("Non-fatal fetch error for plain page %s: %s" % (plainUrl, e))
137+
138+
# Compare HTTP return codes - if original page returned 2xx,
139+
# but the transformed didn't, consider it an error in ruleset
140+
# (note this is not symmetric, we don't care if orig page is broken).
141+
# We don't handle 1xx codes for now.
142+
if plainRcode and plainRcode//100 == 2 and transformedRcode//100 != 2:
143+
message = "Non-2xx HTTP code: %s (%d) => %s (%d)" % (
144+
plainUrl, plainRcode, transformedUrl, transformedRcode)
145+
self.queue_result("error", "non-2xx http code", ruleFname, plainUrl, https_url=transformedUrl)
146+
logging.debug(message)
147+
return message
148+
149+
# If the plain page fetch got an exception, we don't
150+
# need to do the distance comparison. Intuitively, if a
151+
# plain page is fetchable people expect it to have the
152+
# same content as the HTTPS page. But if the plain page
153+
# is unreachable, there's nothing to compare to.
154+
if plainPage:
155+
distance = self.metric.distanceNormed(plainPage, transformedPage)
131156

157+
logging.debug("==== D: %0.4f; %s (%d) -> %s (%d) =====",
158+
distance, plainUrl, len(plainPage), transformedUrl, len(transformedPage))
159+
if distance >= self.thresholdDistance:
160+
logging.info("Big distance %0.4f: %s (%d) -> %s (%d). Rulefile: %s =====",
161+
distance, plainUrl, len(plainPage), transformedUrl, len(transformedPage), ruleFname)
162+
163+
self.queue_result("success", "", ruleFname, plainUrl)
164+
165+
def processUrl(self, plainUrl, task):
132166
fetcherPlain = task.fetcherPlain
133167
fetcherRewriting = task.fetcherRewriting
134168
ruleFname = task.ruleFname
135-
169+
170+
try:
171+
transformedUrl = task.ruleset.apply(plainUrl)
172+
except Exception, e:
173+
self.queue_result("regex_error", str(e), ruleFname, plainUrl)
174+
logging.error("%s: Regex Error %s" % (ruleFname, str(e)))
175+
return
176+
136177
try:
137-
logging.debug("=**= Start %s => %s ****", plainUrl, transformedUrl)
138-
logging.debug("Fetching transformed page %s", transformedUrl)
139-
transformedRcode, transformedPage = fetcherRewriting.fetchHtml(transformedUrl)
140-
logging.debug("Fetching plain page %s", plainUrl)
141-
# If we get an exception (e.g. connection refused,
142-
# connection timeout) on the plain page, don't treat
143-
# that as a failure.
144-
plainRcode, plainPage = None, None
178+
message = self.fetchUrl(plainUrl, transformedUrl, fetcherPlain, fetcherRewriting, ruleFname)
179+
180+
except:
181+
# Try once more before sending an error result
145182
try:
146-
plainRcode, plainPage = fetcherPlain.fetchHtml(plainUrl)
183+
message = self.fetchUrl(plainUrl, transformedUrl, fetcherPlain, fetcherRewriting, ruleFname)
147184
except Exception, e:
148-
logging.debug("Non-fatal fetch error for plain page %s: %s" % (plainUrl, e))
149-
150-
# Compare HTTP return codes - if original page returned 2xx,
151-
# but the transformed didn't, consider it an error in ruleset
152-
# (note this is not symmetric, we don't care if orig page is broken).
153-
# We don't handle 1xx codes for now.
154-
if plainRcode and plainRcode//100 == 2 and transformedRcode//100 != 2:
155-
message = "Non-2xx HTTP code: %s (%d) => %s (%d)" % (
156-
plainUrl, plainRcode, transformedUrl, transformedRcode)
157-
self.queue_result("error", "non-2xx http code", task.ruleFname, plainUrl, https_url=transformedUrl)
158-
logging.debug(message)
159-
return message
160-
161-
# If the plain page fetch got an exception, we don't
162-
# need to do the distance comparison. Intuitively, if a
163-
# plain page is fetchable people expect it to have the
164-
# same content as the HTTPS page. But if the plain page
165-
# is unreachable, there's nothing to compare to.
166-
if plainPage:
167-
distance = self.metric.distanceNormed(plainPage, transformedPage)
168-
169-
logging.debug("==== D: %0.4f; %s (%d) -> %s (%d) =====",
170-
distance, plainUrl, len(plainPage), transformedUrl, len(transformedPage))
171-
if distance >= self.thresholdDistance:
172-
logging.info("Big distance %0.4f: %s (%d) -> %s (%d). Rulefile: %s =====",
173-
distance, plainUrl, len(plainPage), transformedUrl, len(transformedPage), ruleFname)
174-
175-
self.queue_result("success", "", task.ruleFname, plainUrl)
185+
message = "Fetch error: %s => %s: %s" % (
186+
plainUrl, transformedUrl, e)
187+
self.queue_result("error", "fetch-error %s"% e, ruleFname, plainUrl, https_url=transformedUrl)
188+
logging.debug(message)
176189

177-
except Exception, e:
178-
message = "Fetch error: %s => %s: %s" % (
179-
plainUrl, transformedUrl, e)
180-
self.queue_result("error", "fetch-error %s"% e, task.ruleFname, plainUrl, https_url=transformedUrl)
181-
logging.debug(message)
182-
return message
183190
finally:
184191
logging.info("Finished comparing %s -> %s. Rulefile: %s.",
185192
plainUrl, transformedUrl, ruleFname)
186193

194+
return message
195+
187196
def disableRuleset(ruleset, problems):
188197
logging.info("Disabling ruleset %s", ruleset.filename)
189198
contents = open(ruleset.filename).read()

0 commit comments

Comments
 (0)