Skip to content

Commit 1c83b99

Browse files
committed
Update robotparser and its test to CPython 2.7 latest.
test_robotparser occasionally times out in CI; this upgrade may help resolve this problem.
1 parent 982ea5e commit 1c83b99

3 files changed

Lines changed: 48 additions & 4 deletions

File tree

Lib/test/test_support.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,19 @@ def _is_ipv6_enabled():
433433

434434
IPV6_ENABLED = False #_is_ipv6_enabled()
435435

436+
def system_must_validate_cert(f):
437+
"""Skip the test on TLS certificate validation failures."""
438+
@functools.wraps(f)
439+
def dec(*args, **kwargs):
440+
try:
441+
f(*args, **kwargs)
442+
except IOError as e:
443+
if "CERTIFICATE_VERIFY_FAILED" in str(e):
444+
raise unittest.SkipTest("system does not contain "
445+
"necessary certificates")
446+
raise
447+
return dec
448+
436449
FUZZ = 1e-6
437450

438451
def fcmp(x, y): # fuzzy comparison function

lib-python/2.7/robotparser.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
2) PSF license for Python 2.2
88
99
The robots.txt Exclusion Protocol is implemented as specified in
10-
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
10+
http://www.robotstxt.org/norobots-rfc.txt
11+
1112
"""
1213
import urlparse
1314
import urllib
@@ -60,7 +61,7 @@ def read(self):
6061
self.errcode = opener.errcode
6162
if self.errcode in (401, 403):
6263
self.disallow_all = True
63-
elif self.errcode >= 400:
64+
elif self.errcode >= 400 and self.errcode < 500:
6465
self.allow_all = True
6566
elif self.errcode == 200 and lines:
6667
self.parse(lines)
@@ -86,6 +87,7 @@ def parse(self, lines):
8687
linenumber = 0
8788
entry = Entry()
8889

90+
self.modified()
8991
for line in lines:
9092
linenumber += 1
9193
if not line:
@@ -131,6 +133,14 @@ def can_fetch(self, useragent, url):
131133
return False
132134
if self.allow_all:
133135
return True
136+
137+
# Until the robots.txt file has been read or found not
138+
# to exist, we must assume that no url is allowable.
139+
# This prevents false positives when a user erroneously
140+
# calls can_fetch() before calling read().
141+
if not self.last_checked:
142+
return False
143+
134144
# search for given user agent matches
135145
# the first match counts
136146
parsed_url = urlparse.urlparse(urllib.unquote(url))
@@ -160,6 +170,7 @@ def __init__(self, path, allowance):
160170
if path == '' and not allowance:
161171
# an empty value means allow all
162172
allowance = True
173+
path = urlparse.urlunparse(urlparse.urlparse(path))
163174
self.path = urllib.quote(path)
164175
self.allowance = allowance
165176

lib-python/2.7/test/test_robotparser.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
from test import test_support
33
from urllib2 import urlopen, HTTPError
44

5+
HAVE_HTTPS = True
6+
try:
7+
from urllib2 import HTTPSHandler
8+
except ImportError:
9+
HAVE_HTTPS = False
10+
511
class RobotTestCase(unittest.TestCase):
612
def __init__(self, index, parser, url, good, agent):
713
unittest.TestCase.__init__(self)
@@ -228,6 +234,18 @@ def RobotTest(index, robots_txt, good_urls, bad_urls,
228234

229235
RobotTest(15, doc, good, bad)
230236

237+
# 16. Empty query (issue #17403). Normalizing the url first.
238+
doc = """
239+
User-agent: *
240+
Allow: /some/path?
241+
Disallow: /another/path?
242+
"""
243+
244+
good = ['/some/path?']
245+
bad = ['/another/path?']
246+
247+
RobotTest(16, doc, good, bad)
248+
231249

232250
class NetworkTestCase(unittest.TestCase):
233251

@@ -257,14 +275,16 @@ def testPasswordProtectedSite(self):
257275
self.skipTest('%s is unavailable' % url)
258276
self.assertEqual(parser.can_fetch("*", robots_url), False)
259277

278+
@unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
279+
@test_support.system_must_validate_cert
260280
def testPythonOrg(self):
261281
test_support.requires('network')
262282
with test_support.transient_internet('www.python.org'):
263283
parser = robotparser.RobotFileParser(
264-
"http://www.python.org/robots.txt")
284+
"https://www.python.org/robots.txt")
265285
parser.read()
266286
self.assertTrue(
267-
parser.can_fetch("*", "http://www.python.org/robots.txt"))
287+
parser.can_fetch("*", "https://www.python.org/robots.txt"))
268288

269289

270290
def test_main():

0 commit comments

Comments
 (0)