Skip to content

Commit 45d6547

Browse files
miss-islington and Rémi Lapeyre authored
bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)
Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
(cherry picked from commit 8047e0e)
Co-authored-by: Rémi Lapeyre <remi.lapeyre@henki.fr>
1 parent 159ae24 commit 45d6547

3 files changed

Lines changed: 26 additions & 14 deletions

File tree

Lib/test/test_robotparser.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -76,30 +76,38 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
7676

7777

7878
class BaseRequestRateTest(BaseRobotTest):
79+
request_rate = None
80+
crawl_delay = None
7981

8082
def test_request_rate(self):
83+
parser = self.parser
8184
for url in self.good + self.bad:
8285
agent, url = self.get_agent_and_url(url)
8386
with self.subTest(url=url, agent=agent):
84-
if self.crawl_delay:
85-
self.assertEqual(
86-
self.parser.crawl_delay(agent), self.crawl_delay
87-
)
88-
if self.request_rate:
87+
self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
88+
89+
parsed_request_rate = parser.request_rate(agent)
90+
self.assertEqual(parsed_request_rate, self.request_rate)
91+
if self.request_rate is not None:
8992
self.assertIsInstance(
90-
self.parser.request_rate(agent),
93+
parsed_request_rate,
9194
urllib.robotparser.RequestRate
9295
)
9396
self.assertEqual(
94-
self.parser.request_rate(agent).requests,
97+
parsed_request_rate.requests,
9598
self.request_rate.requests
9699
)
97100
self.assertEqual(
98-
self.parser.request_rate(agent).seconds,
101+
parsed_request_rate.seconds,
99102
self.request_rate.seconds
100103
)
101104

102105

106+
class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
107+
robots_txt = ''
108+
good = ['/foo']
109+
110+
103111
class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
104112
robots_txt = """\
105113
User-agent: figtree
@@ -120,10 +128,6 @@ class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
120128

121129
class DifferentAgentTest(CrawlDelayAndRequestRateTest):
122130
agent = 'FigTree Robot libwww-perl/5.04'
123-
# these are not actually tested, but we still need to parse it
124-
# in order to accommodate the input parameters
125-
request_rate = None
126-
crawl_delay = None
127131

128132

129133
class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):

Lib/urllib/robotparser.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,15 +179,19 @@ def crawl_delay(self, useragent):
179179
for entry in self.entries:
180180
if entry.applies_to(useragent):
181181
return entry.delay
182-
return self.default_entry.delay
182+
if self.default_entry:
183+
return self.default_entry.delay
184+
return None
183185

184186
def request_rate(self, useragent):
185187
if not self.mtime():
186188
return None
187189
for entry in self.entries:
188190
if entry.applies_to(useragent):
189191
return entry.req_rate
190-
return self.default_entry.req_rate
192+
if self.default_entry:
193+
return self.default_entry.req_rate
194+
return None
191195

192196
def __str__(self):
193197
entries = self.entries
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix :meth:`RobotFileParser.crawl_delay` and
2+
:meth:`RobotFileParser.request_rate` to return ``None`` rather than
3+
raise :exc:`AttributeError` when no relevant rule is defined in the
4+
robots.txt file. Patch by Rémi Lapeyre.

0 commit comments

Comments
 (0)