Skip to content

Commit 448658e

Browse files
committed
Update urllib from v3.14.2
1 parent 77add04 commit 448658e

10 files changed

Lines changed: 1248 additions & 1286 deletions

Lib/test/test_robotparser.py

Lines changed: 146 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,14 @@ class BaseRobotTest:
1616
bad = []
1717
site_maps = None
1818

19+
def __init_subclass__(cls):
    """Disable URL tests whose fixture list is empty on the new subclass.

    A subclass with a falsy ``good`` (or ``bad``) attribute gets
    ``test_good_urls`` (or ``test_bad_urls``) set to ``None`` so that the
    test does nothing is not left on the class as a callable.
    """
    super().__init_subclass__()
    # Pair each fixture attribute with the test method that consumes it.
    for fixture, test_name in (('good', 'test_good_urls'),
                               ('bad', 'test_bad_urls')):
        if not getattr(cls, fixture):
            setattr(cls, test_name, None)
26+
1927
def setUp(self):
2028
lines = io.StringIO(self.robots_txt).readlines()
2129
self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
231239
robots_txt = """\
232240
User-agent: *
233241
Disallow: /some/path?name=value
242+
Disallow: /another/path?
243+
Disallow: /yet/one/path?name=value&more
234244
"""
235-
good = ['/some/path']
236-
bad = ['/some/path?name=value']
245+
good = ['/some/path', '/some/path?',
246+
'/some/path%3Fname=value', '/some/path?name%3Dvalue',
247+
'/another/path', '/another/path%3F',
248+
'/yet/one/path?name=value%26more']
249+
# URLs that the Disallow rules of this test case must block.  Each entry
# needs its own trailing comma: without it, adjacent string literals are
# silently concatenated into a single (wrong) URL.
bad = ['/some/path?name=value',
       '/another/path?', '/another/path?name=value',
       '/yet/one/path?name=value&more']
237252

238253

239254
class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,19 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
249264
bad = ['/some/path']
250265

251266

252-
class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
253-
# normalize the URL first (#17403)
267+
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
254268
robots_txt = """\
255269
User-agent: *
256-
Allow: /some/path?
257-
Disallow: /another/path?
258-
"""
259-
good = ['/some/path?']
260-
bad = ['/another/path?']
261-
262-
@unittest.expectedFailure # TODO: RUSTPYTHON; self.assertFalse(self.parser.can_fetch(agent, url))\nAssertionError: True is not false
263-
def test_bad_urls(self):
264-
super().test_bad_urls()
270+
Disallow: /a1/Z-._~ # unreserved characters
271+
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
272+
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
273+
Disallow: /u2/%f0%9f%90%8d
274+
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
275+
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
276+
Disallow: /v2/%f0
277+
Disallow: /v3/\udcf0 # raw non-ASCII octet
278+
Disallow: /p1%xy # raw percent
279+
Disallow: /p2%
280+
Disallow: /p3%25xy # percent-encoded percent
281+
Disallow: /p4%2525xy # double percent-encoded percent
282+
Disallow: /john%20smith # space
283+
Disallow: /john doe
284+
Disallow: /trailingspace%20
285+
Disallow: /question%3Fq=v # not query
286+
Disallow: /hash%23f # not fragment
287+
Disallow: /dollar%24
288+
Disallow: /asterisk%2A
289+
Disallow: /sub/dir
290+
Disallow: /slash%2F
291+
Disallow: /query/question?q=%3F
292+
Disallow: /query/raw/question?q=?
293+
Disallow: /query/eq?q%3Dv
294+
Disallow: /query/amp?q=v%26a
295+
"""
296+
good = [
297+
'/u1/%F0', '/u1/%f0',
298+
'/u2/%F0', '/u2/%f0',
299+
'/u3/%F0', '/u3/%f0',
300+
'/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
301+
'/question?q=v',
302+
'/dollar', '/asterisk',
303+
'/query/eq?q=v',
304+
'/query/amp?q=v&a',
305+
]
306+
bad = [
307+
'/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
308+
'/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
309+
'/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
310+
'/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
311+
'/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
312+
'/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
313+
'/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
314+
'/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
315+
'/p1%xy', '/p1%25xy',
316+
'/p2%', '/p2%25', '/p2%2525', '/p2%xy',
317+
'/p3%xy', '/p3%25xy',
318+
'/p4%2525xy',
319+
'/john%20smith', '/john smith',
320+
'/john%20doe', '/john doe',
321+
'/trailingspace%20', '/trailingspace ',
322+
'/question%3Fq=v',
323+
'/hash#f', '/hash%23f',
324+
'/dollar$', '/dollar%24',
325+
'/asterisk*', '/asterisk%2A',
326+
'/sub/dir', '/sub%2Fdir',
327+
'/slash%2F', '/slash/',
328+
'/query/question?q=?', '/query/question?q=%3F',
329+
'/query/raw/question?q=?', '/query/raw/question?q=%3F',
330+
'/query/eq?q%3Dv',
331+
'/query/amp?q=v%26a',
332+
]
333+
# other reserved characters: add one rule with the character written raw
# and one with it percent-encoded, then expect both rules to block both
# spellings of the URL.
for c in ":/#[]@!$&'()*+,;=":
    robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
    bad += [
        f'/raw{c}',
        f'/raw%{ord(c):02X}',
        f'/pc{c}',
        f'/pc%{ord(c):02X}',
    ]
265340

266341

267342
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -303,22 +378,17 @@ def test_string_formatting(self):
303378
self.assertEqual(str(self.parser), self.expected_output)
304379

305380

306-
class RobotHandler(BaseHTTPRequestHandler):
307-
308-
def do_GET(self):
309-
self.send_error(403, "Forbidden access")
310-
311-
def log_message(self, format, *args):
312-
pass
313-
314-
315-
class PasswordProtectedSiteTestCase(unittest.TestCase):
381+
@unittest.skipUnless(
382+
support.has_socket_support,
383+
"Socket server requires working socket."
384+
)
385+
class BaseLocalNetworkTestCase:
316386

317387
def setUp(self):
318388
# clear _opener global variable
319389
self.addCleanup(urllib.request.urlcleanup)
320390

321-
self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
391+
self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
322392

323393
self.t = threading.Thread(
324394
name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
335405
self.t.join()
336406
self.server.server_close()
337407

408+
409+
SAMPLE_ROBOTS_TXT = b'''\
410+
User-agent: test_robotparser
411+
Disallow: /utf8/\xf0\x9f\x90\x8d
412+
Disallow: /non-utf8/\xf0
413+
Disallow: //[spam]/path
414+
'''
415+
416+
417+
class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    """Fetch SAMPLE_ROBOTS_TXT from the local test server and query it."""

    class RobotHandler(BaseHTTPRequestHandler):
        """Answer every GET request with SAMPLE_ROBOTS_TXT."""

        def do_GET(self):
            self.send_response(200)
            self.end_headers()
            self.wfile.write(SAMPLE_ROBOTS_TXT)

        def log_message(self, format, *args):
            # Overridden to do nothing so requests are not logged.
            pass

    @threading_helper.reap_threads
    def testRead(self):
        # Test that reading a weird robots.txt doesn't fail.
        addr = self.server.server_address
        url = f'http://{socket_helper.HOST}:{addr[1]}'
        robots_url = url + '/robots.txt'
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        # And it can even interpret the weird paths in some reasonable way.
        agent = 'test_robotparser'
        allowed = [
            '/robots.txt',
            '/utf8/',
            '/non-utf8/',
        ]
        disallowed = [
            '/utf8/\U0001f40d',
            '/utf8/%F0%9F%90%8D',
            '/non-utf8/%F0',
            '/non-utf8/\U0001f40d',
            '/%2F[spam]/path',
        ]
        # subTest reports every failing path instead of stopping at the
        # first one.
        for path in allowed:
            with self.subTest(path=path, expected='allowed'):
                self.assertTrue(parser.can_fetch(agent, url + path))
        for path in disallowed:
            with self.subTest(path=path, expected='disallowed'):
                self.assertFalse(parser.can_fetch(agent, url + path))
448+
449+
450+
class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
451+
class RobotHandler(BaseHTTPRequestHandler):
452+
453+
def do_GET(self):
454+
self.send_error(403, "Forbidden access")
455+
456+
def log_message(self, format, *args):
457+
pass
458+
338459
@threading_helper.reap_threads
339460
def testPasswordProtectedSite(self):
340461
addr = self.server.server_address
@@ -346,6 +467,7 @@ def testPasswordProtectedSite(self):
346467
self.assertFalse(parser.can_fetch("*", robots_url))
347468

348469

470+
@support.requires_working_socket()
349471
class NetworkTestCase(unittest.TestCase):
350472

351473
base_url = 'http://www.pythontest.net/'

0 commit comments

Comments
 (0)