diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index f063e463753e0b..d89ac27083709c 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -23,11 +23,14 @@ web site that published the :file:`robots.txt` file. For more details on the structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. -.. class:: RobotFileParser(url='') +.. class:: RobotFileParser(url='', sslcontext=None) This class provides methods to read, parse and answer questions about the :file:`robots.txt` file at *url*. + It also supports overriding the default *sslcontext* used to fetch the + :file:`robots.txt` file. + .. method:: set_url(url) Sets the URL referring to a :file:`robots.txt` file. diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 8d89e2a8224452..a18510d3536955 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -6,8 +6,11 @@ from test import support from test.support import socket_helper from test.support import threading_helper +from test.support import import_helper from http.server import BaseHTTPRequestHandler, HTTPServer +ssl = import_helper.import_module("ssl") + class BaseRobotTest: robots_txt = '' @@ -388,5 +391,29 @@ def test_read_404(self): self.assertIsNone(parser.crawl_delay('*')) self.assertIsNone(parser.request_rate('*')) + +class SSLContextOverrideTestCase(unittest.TestCase): + base_url = 'https://www.pythontest.net/' + robots_txt = '{}elsewhere/robots.txt'.format(base_url) + + @classmethod + def setUpClass(cls): + support.requires('network') + # The URL has a self-signed CA. Ignore validation errors. 
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
+        with socket_helper.transient_internet(cls.base_url):
+            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt, sslcontext=ctx)
+            cls.parser.read()
+
+    def test_basic(self):
+        self.assertFalse(self.parser.disallow_all)
+        self.assertFalse(self.parser.allow_all)
+        self.assertGreater(self.parser.mtime(), 0)
+        self.assertFalse(self.parser.crawl_delay('*'))
+        self.assertFalse(self.parser.request_rate('*'))
+
+
 if __name__=='__main__':
     unittest.main()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index c58565e3945146..6a10b689d1eda3 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -25,7 +25,7 @@ class RobotFileParser:

     """

-    def __init__(self, url=''):
+    def __init__(self, url='', sslcontext=None):
         self.entries = []
         self.sitemaps = []
         self.default_entry = None
@@ -33,6 +33,7 @@ def __init__(self, url=''):
         self.allow_all = False
         self.set_url(url)
         self.last_checked = 0
+        self.sslcontext = sslcontext

     def mtime(self):
         """Returns the time the robots.txt file was last fetched.
@@ -59,7 +60,7 @@ def set_url(self, url):
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
         try:
-            f = urllib.request.urlopen(self.url)
+            f = urllib.request.urlopen(self.url, context=self.sslcontext)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
                 self.disallow_all = True
diff --git a/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst b/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst
new file mode 100644
index 00000000000000..22944ababc60c8
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst
@@ -0,0 +1,3 @@
+:class:`urllib.robotparser.RobotFileParser` accepts a new optional *sslcontext*
+parameter. This allows a user to override the SSL context used when fetching
+:file:`robots.txt` from a remote URL. Patch by Tarun Chinmai Sekar.