Skip to content

Commit 1c83b99

Browse files
committed
Update robotparser and its test to CPython 2.7 latest.
test_robotparser occasionally times out in CI; this upgrade may help resolve this problem.
1 parent 982ea5e commit 1c83b99

3 files changed

Lines changed: 48 additions & 4 deletions

File tree

Lib/test/test_support.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,19 @@ def _is_ipv6_enabled():
433433

434434
IPV6_ENABLED = False #_is_ipv6_enabled()
435435

436+
def system_must_validate_cert(f):
437+
"""Skip the test on TLS certificate validation failures."""
438+
@functools.wraps(f)
439+
def dec(*args, **kwargs):
440+
try:
441+
f(*args, **kwargs)
442+
except IOError as e:
443+
if "CERTIFICATE_VERIFY_FAILED" in str(e):
444+
raise unittest.SkipTest("system does not contain "
445+
"necessary certificates")
446+
raise
447+
return dec
448+
436449
FUZZ = 1e-6
437450

438451
def fcmp(x, y): # fuzzy comparison function

lib-python/2.7/robotparser.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
2) PSF license for Python 2.2
88
99
The robots.txt Exclusion Protocol is implemented as specified in
10-
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
10+
http://www.robotstxt.org/norobots-rfc.txt
11+
1112
"""
1213
import urlparse
1314
import urllib
@@ -60,7 +61,7 @@ def read(self):
6061
self.errcode = opener.errcode
6162
if self.errcode in (401, 403):
6263
self.disallow_all = True
63-
elif self.errcode >= 400:
64+
elif self.errcode >= 400 and self.errcode < 500:
6465
self.allow_all = True
6566
elif self.errcode == 200 and lines:
6667
self.parse(lines)
@@ -86,6 +87,7 @@ def parse(self, lines):
8687
linenumber = 0
8788
entry = Entry()
8889

90+
self.modified()
8991
for line in lines:
9092
linenumber += 1
9193
if not line:
@@ -131,6 +133,14 @@ def can_fetch(self, useragent, url):
131133
return False
132134
if self.allow_all:
133135
return True
136+
137+
# Until the robots.txt file has been read or found not
138+
# to exist, we must assume that no url is allowable.
139+
# This prevents false positives when a user erroneously
140+
# calls can_fetch() before calling read().
141+
if not self.last_checked:
142+
return False
143+
134144
# search for given user agent matches
135145
# the first match counts
136146
parsed_url = urlparse.urlparse(urllib.unquote(url))
@@ -160,6 +170,7 @@ def __init__(self, path, allowance):
160170
if path == '' and not allowance:
161171
# an empty value means allow all
162172
allowance = True
173+
path = urlparse.urlunparse(urlparse.urlparse(path))
163174
self.path = urllib.quote(path)
164175
self.allowance = allowance
165176

lib-python/2.7/test/test_robotparser.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
from test import test_support
33
from urllib2 import urlopen, HTTPError
44

5+
HAVE_HTTPS = True
6+
try:
7+
from urllib2 import HTTPSHandler
8+
except ImportError:
9+
HAVE_HTTPS = False
10+
511
class RobotTestCase(unittest.TestCase):
612
def __init__(self, index, parser, url, good, agent):
713
unittest.TestCase.__init__(self)
@@ -228,6 +234,18 @@ def RobotTest(index, robots_txt, good_urls, bad_urls,
228234

229235
RobotTest(15, doc, good, bad)
230236

237+
# 16. Empty query (issue #17403). Normalizing the url first.
238+
doc = """
239+
User-agent: *
240+
Allow: /some/path?
241+
Disallow: /another/path?
242+
"""
243+
244+
good = ['/some/path?']
245+
bad = ['/another/path?']
246+
247+
RobotTest(16, doc, good, bad)
248+
231249

232250
class NetworkTestCase(unittest.TestCase):
233251

@@ -257,14 +275,16 @@ def testPasswordProtectedSite(self):
257275
self.skipTest('%s is unavailable' % url)
258276
self.assertEqual(parser.can_fetch("*", robots_url), False)
259277

278+
@unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
279+
@test_support.system_must_validate_cert
260280
def testPythonOrg(self):
261281
test_support.requires('network')
262282
with test_support.transient_internet('www.python.org'):
263283
parser = robotparser.RobotFileParser(
264-
"http://www.python.org/robots.txt")
284+
"https://www.python.org/robots.txt")
265285
parser.read()
266286
self.assertTrue(
267-
parser.can_fetch("*", "http://www.python.org/robots.txt"))
287+
parser.can_fetch("*", "https://www.python.org/robots.txt"))
268288

269289

270290
def test_main():

0 commit comments

Comments
 (0)