Skip to content
Merged
Next Next commit
issue43882 - urllib.parse should sanitize urls containing ASCII newli…
…ne and tabs.
  • Loading branch information
orsenthil committed Apr 25, 2021
commit 60c9b553f29a9854e244ea21e412d7928bd57bcb
6 changes: 6 additions & 0 deletions Doc/library/urllib.parse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,10 @@ or on combining URL components into a URL string.
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
decomposed before parsing, no error will be raised.

Following the WHATWG specification, which updates RFC 3986, ASCII newline
Comment thread
gpshead marked this conversation as resolved.
Outdated
``\n``, ``\r`` or ``\r\n`` and tab ``\t`` characters are stripped from
Comment thread
gpshead marked this conversation as resolved.
Outdated
the url.

.. versionchanged:: 3.6
Out-of-range port numbers now raise :exc:`ValueError`, instead of
returning :const:`None`.
Expand All @@ -320,6 +324,8 @@ or on combining URL components into a URL string.
Characters that affect netloc parsing under NFKC normalization will
now raise :exc:`ValueError`.

.. versionchanged:: 3.10
ASCII newline and tab characters are stripped from the URL.
Comment thread
orsenthil marked this conversation as resolved.
Outdated

.. function:: urlunsplit(parts)

Expand Down
28 changes: 28 additions & 0 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,21 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), url)

# Remove ASCII tabs and newlines from input
Comment thread
gpshead marked this conversation as resolved.
Outdated

url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "www.python.org")
self.assertEqual(p.path, "/javascript:alert('msg')/")
self.assertEqual(p.query, "")
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, None)
self.assertEqual(p.password, None)
self.assertEqual(p.hostname, "www.python.org")
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag")

# And check them all again, only with bytes this time
url = b"HTTP://WWW.PYTHON.ORG/doc/#frag"
p = urllib.parse.urlsplit(url)
Expand Down Expand Up @@ -606,6 +621,19 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), url)

url = b"http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
self.assertEqual(p.netloc, b"www.python.org")
self.assertEqual(p.path, b"/javascript:alert('msg')/")
self.assertEqual(p.query, b"")
self.assertEqual(p.fragment, b"frag")
self.assertEqual(p.username, None)
self.assertEqual(p.password, None)
self.assertEqual(p.hostname, b"www.python.org")
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/#frag")

# Verify an illegal port raises ValueError
url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag"
p = urllib.parse.urlsplit(url)
Expand Down
3 changes: 3 additions & 0 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,9 @@ def urlsplit(url, scheme='', allow_fragments=True):
else:
scheme, url = url[:i].lower(), url[i+1:]

_unsafe_chars_to_remove = ['\t', '\r', '\n']
Comment thread
gpshead marked this conversation as resolved.
Outdated
Comment thread
gpshead marked this conversation as resolved.
Outdated
url = url.translate({ord(c): None for c in _unsafe_chars_to_remove})

if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
if (('[' in netloc and ']' not in netloc) or
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
The presence of newline or tab characters in a URL allowed attackers to write
scripts in the URL and hijack the web server.

Following the controlling specification for URLs defined by WHATWG,
urllib.parse strips ASCII newlines and tabs from the URL, preventing such
attacks.