Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
gh-88500: Reduce memory use of urllib.unquote
`urllib.unquote_to_bytes` and `urllib.unquote` could both potentially
generate `O(len(string))` intermediate `bytes` or `str` objects while
computing the unquoted final result depending on the input provided. As
Python objects are relatively large, this could consume a lot of RAM.

This switches the implementation to using an expanding `bytearray` and a
generator internally instead of precomputed `split()` style operations.
  • Loading branch information
gpshead committed Sep 12, 2022
commit c02c35744f99bf4d5dab9fdff5740da225d89279
2 changes: 2 additions & 0 deletions Lib/test/test_urllib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,8 @@ def test_unquoting(self):
self.assertEqual(result.count('%'), 1,
"using unquote(): not all characters escaped: "
"%s" % result)

def test_unquote_rejects_none_and_tuple(self):
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())

Expand Down
39 changes: 23 additions & 16 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,9 @@ def urldefrag(url):

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Delegate the actual decoding to the bytearray-based helper and
    # freeze its mutable result into an immutable bytes object.
    unquoted = _unquote_to_bytearray(string)
    return bytes(unquoted)

def _unquote_to_bytearray(string):
# Note: strings are encoded as UTF-8. This is only an issue if it contains
# unescaped non-ASCII characters, which URIs should not.
if not string:
Expand All @@ -611,8 +614,8 @@ def unquote_to_bytes(string):
bits = string.split(b'%')
if len(bits) == 1:
return string
res = [bits[0]]
append = res.append
res = bytearray(bits[0])
add_data = res.extend
# Delay the initialization of the table to not waste memory
# if the function is never called
global _hextobyte
Expand All @@ -621,15 +624,25 @@ def unquote_to_bytes(string):
for a in _hexdig for b in _hexdig}
for item in bits[1:]:
try:
append(_hextobyte[item[:2]])
append(item[2:])
add_data(_hextobyte[item[:2]])
add_data(item[2:])
except KeyError:
append(b'%')
append(item)
return b''.join(res)
add_data(b'%')
add_data(item)
return res

_asciire = re.compile('([\x00-\x7f]+)')

def _generate_unquoted_parts(string, encoding, errors):
    # Lazily yield the decoded pieces of *string*: runs of ASCII text are
    # percent-unquoted and decoded with the given encoding/errors, while
    # any non-ASCII text between those runs is passed through unchanged.
    # Streaming the parts keeps peak memory low for large inputs.
    tail_start = 0
    for match in _asciire.finditer(string):
        begin, end = match.span()
        # Non-ASCII text sitting before this ASCII run (may be empty).
        yield string[tail_start:begin]
        # match[1] is the captured ASCII run, i.e. string[begin:end].
        yield _unquote_to_bytearray(match[1]).decode(encoding, errors)
        tail_start = end
    # Whatever follows the final ASCII run is non-ASCII (may be empty).
    yield string[tail_start:]

def unquote(string, encoding='utf-8', errors='replace'):
"""Replace %xx escapes by their single-character equivalent. The optional
encoding and errors parameters specify how to decode percent-encoded
Expand All @@ -641,22 +654,16 @@ def unquote(string, encoding='utf-8', errors='replace'):
unquote('abc%20def') -> 'abc def'.
"""
if isinstance(string, bytes):
return unquote_to_bytes(string).decode(encoding, errors)
return _unquote_to_bytearray(string).decode(encoding, errors)
if '%' not in string:
# Is it a string-like object?
string.split
return string
if encoding is None:
encoding = 'utf-8'
if errors is None:
errors = 'replace'
bits = _asciire.split(string)
res = [bits[0]]
append = res.append
for i in range(1, len(bits), 2):
append(unquote_to_bytes(bits[i]).decode(encoding, errors))
append(bits[i + 1])
return ''.join(res)

return ''.join(_generate_unquoted_parts(string, encoding, errors))

def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
Expand Down