Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3185,6 +3185,24 @@ def test_mbcs_alias(self):
codec = codecs.lookup('cp123')
self.assertEqual(codec.name, 'mbcs')

@support.bigmemtest(size=2**31, memuse=7, dry_run=False)
def test_large_input(self, size):
    # Test input longer than INT_MAX.
    # Input should contain undecodable bytes before and after
    # the INT_MAX limit.
    #
    # bigmemtest invokes the decorated function as f(self, size), so the
    # method must accept the size argument; the input length is derived
    # from it instead of repeating the decorator's constant.
    encoded = (b'01234567' * ((size//8)-1) +
               b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
    self.assertEqual(len(encoded), size+2)
    decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
    # The decoder reports the number of bytes consumed as the second item.
    self.assertEqual(decoded[1], len(encoded))
    # Drop the input before the remaining checks to lower peak memory use.
    del encoded
    # Each input byte must map to exactly one output character: ASCII bytes
    # decode to themselves and every trailing byte is escaped to a single
    # lone surrogate by the 'surrogateescape' handler.
    self.assertEqual(len(decoded[0]), decoded[1])
    self.assertEqual(decoded[0][:10], '0123456701')
    self.assertEqual(decoded[0][-20:],
                     '6701234567'
                     '\udc85\udc86\udcea\udceb\udcec'
                     '\udcef\udcfc\udcfd\udcfe\udcff')


class ASCIITest(unittest.TestCase):
def test_encode(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fixed the code page decoder for input longer than 2 GiB containing
undecodable bytes.
9 changes: 4 additions & 5 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -7178,7 +7178,7 @@ decode_code_page_errors(UINT code_page,
"in the target code page.";
/* each step cannot decode more than 1 character, but a character can be
represented as a surrogate pair */
wchar_t buffer[2], *startout, *out;
wchar_t buffer[2], *out;
int insize;
Py_ssize_t outsize;
PyObject *errorHandler = NULL;
Expand Down Expand Up @@ -7215,7 +7215,7 @@ decode_code_page_errors(UINT code_page,
*v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
if (*v == NULL)
goto error;
startout = PyUnicode_AS_UNICODE(*v);
out = PyUnicode_AS_UNICODE(*v);
}
else {
/* Extend unicode object */
Expand All @@ -7226,11 +7226,10 @@ decode_code_page_errors(UINT code_page,
}
if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
goto error;
startout = PyUnicode_AS_UNICODE(*v) + n;
out = PyUnicode_AS_UNICODE(*v) + n;
}

/* Decode the byte string character per character */
out = startout;
while (in < endin)
{
/* Decode a character */
Expand Down Expand Up @@ -7285,7 +7284,7 @@ decode_code_page_errors(UINT code_page,
*out = 0;

/* Extend unicode object */
outsize = out - startout;
outsize = out - PyUnicode_AS_UNICODE(*v);
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
if (unicode_resize(v, outsize) < 0)
goto error;
Expand Down