From ad1aa10d911cafe25a4782a4fecca5a634cb9f5c Mon Sep 17 00:00:00 2001 From: Tarun Chinmai Sekar Date: Mon, 22 Mar 2021 20:19:05 -0400 Subject: [PATCH 1/2] bpo-43597: add ability to specify an SSL context in the RobotFileParser constructor --- Doc/library/urllib.robotparser.rst | 5 +++- Lib/test/test_robotparser.py | 27 +++++++++++++++++++ Lib/urllib/robotparser.py | 5 ++-- .../2021-04-06-19-47-43.bpo-43597.vQvHc8.rst | 3 +++ 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst index 544f50273dd17c..e2207c77f70e6c 100644 --- a/Doc/library/urllib.robotparser.rst +++ b/Doc/library/urllib.robotparser.rst @@ -23,11 +23,14 @@ Web site that published the :file:`robots.txt` file. For more details on the structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html. -.. class:: RobotFileParser(url='') +.. class:: RobotFileParser(url='', sslcontext=None) This class provides methods to read, parse and answer questions about the :file:`robots.txt` file at *url*. + The optional *sslcontext* argument is an :class:`ssl.SSLContext` instance used + instead of the default SSL context when fetching the :file:`robots.txt` file. + .. method:: set_url(url) Sets the URL referring to a :file:`robots.txt` file. 
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index b0bed431d4b059..278c0045d6a264 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -6,8 +6,11 @@ from test import support from test.support import socket_helper from test.support import threading_helper +from test.support import import_helper from http.server import BaseHTTPRequestHandler, HTTPServer +ssl = import_helper.import_module("ssl") + class BaseRobotTest: robots_txt = '' @@ -383,5 +386,29 @@ def test_read_404(self): self.assertIsNone(parser.crawl_delay('*')) self.assertIsNone(parser.request_rate('*')) + +class SSLContextOverrideTestCase(unittest.TestCase): + base_url = 'https://www.pythontest.net/' + robots_txt = '{}elsewhere/robots.txt'.format(base_url) + + @classmethod + def setUpClass(cls): + support.requires('network') + # The URL has a self-signed CA. Ignore validation errors. + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + with socket_helper.transient_internet(cls.base_url): + cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt, sslcontext=ctx) + cls.parser.read() + + def test_basic(self): + self.assertFalse(self.parser.disallow_all) + self.assertFalse(self.parser.allow_all) + self.assertGreater(self.parser.mtime(), 0) + self.assertFalse(self.parser.crawl_delay('*')) + self.assertFalse(self.parser.request_rate('*')) + + if __name__=='__main__': unittest.main() diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index c58565e3945146..6a10b689d1eda3 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -25,7 +25,7 @@ class RobotFileParser: """ - def __init__(self, url=''): + def __init__(self, url='', sslcontext=None): self.entries = [] self.sitemaps = [] self.default_entry = None @@ -33,6 +33,7 @@ def __init__(self, url=''): self.allow_all = False self.set_url(url) self.last_checked = 0 + self.sslcontext = sslcontext def mtime(self): """Returns the 
time the robots.txt file was last fetched. @@ -59,7 +60,7 @@ def set_url(self, url): def read(self): """Reads the robots.txt URL and feeds it to the parser.""" try: - f = urllib.request.urlopen(self.url) + f = urllib.request.urlopen(self.url, context=self.sslcontext) except urllib.error.HTTPError as err: if err.code in (401, 403): self.disallow_all = True diff --git a/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst b/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst new file mode 100644 index 00000000000000..a22750467f62ba --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst @@ -0,0 +1,3 @@ +The :class:`urllib.robotparser.RobotFileParser` constructor accepts an optional +*sslcontext* argument. This allows a user to override the SSL context used when +reading a robots.txt file from a remote URL. Patch by Tarun Chinmai Sekar. \ No newline at end of file From 3e5faf614b9429a79efc6eb81d8f58dc2291e5dd Mon Sep 17 00:00:00 2001 From: Oleg Iarygin Date: Sat, 1 Apr 2023 09:47:05 +0400 Subject: [PATCH 2/2] Fix the failed `Docs / Docs (pull_request)` CI check --- .../next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst b/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst index a22750467f62ba..22944ababc60c8 100644 --- a/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst +++ b/Misc/NEWS.d/next/Library/2021-04-06-19-47-43.bpo-43597.vQvHc8.rst @@ -1,3 +1,3 @@ The :class:`urllib.robotparser.RobotFileParser` constructor accepts an optional *sslcontext* argument. This allows a user to override the SSL context used when -reading a robots.txt file from a remote URL. Patch by Tarun Chinmai Sekar. \ No newline at end of file +reading a robots.txt file from a remote URL. Patch by Tarun Chinmai Sekar.