Skip to content

Commit bdeb56c

Browse files
bpo-35372: Fix the code page decoder for input > 2 GiB. (GH-10848)
(cherry picked from commit 4013c17)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 1ef06c6 commit bdeb56c

3 files changed

Lines changed: 24 additions & 5 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3185,6 +3185,24 @@ def test_mbcs_alias(self):
31853185
codec = codecs.lookup('cp123')
31863186
self.assertEqual(codec.name, 'mbcs')
31873187

3188+
@support.bigmemtest(size=2**31, memuse=7, dry_run=False)
3189+
def test_large_input(self):
3190+
# Test input longer than INT_MAX.
3191+
# Input should contain undecodable bytes before and after
3192+
# the INT_MAX limit.
3193+
encoded = (b'01234567' * (2**28-1) +
3194+
b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
3195+
self.assertEqual(len(encoded), 2**31+2)
3196+
decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
3197+
self.assertEqual(decoded[1], len(encoded))
3198+
del encoded
3199+
self.assertEqual(len(decoded[0]), decoded[1])
3200+
self.assertEqual(decoded[0][:10], '0123456701')
3201+
self.assertEqual(decoded[0][-20:],
3202+
'6701234567'
3203+
'\udc85\udc86\udcea\udceb\udcec'
3204+
'\udcef\udcfc\udcfd\udcfe\udcff')
3205+
31883206

31893207
class ASCIITest(unittest.TestCase):
31903208
def test_encode(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed the code page decoder for input longer than 2 GiB containing
2+
undecodable bytes.

Objects/unicodeobject.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7178,7 +7178,7 @@ decode_code_page_errors(UINT code_page,
71787178
"in the target code page.";
71797179
/* each step cannot decode more than 1 character, but a character can be
71807180
represented as a surrogate pair */
7181-
wchar_t buffer[2], *startout, *out;
7181+
wchar_t buffer[2], *out;
71827182
int insize;
71837183
Py_ssize_t outsize;
71847184
PyObject *errorHandler = NULL;
@@ -7215,7 +7215,7 @@ decode_code_page_errors(UINT code_page,
72157215
*v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
72167216
if (*v == NULL)
72177217
goto error;
7218-
startout = PyUnicode_AS_UNICODE(*v);
7218+
out = PyUnicode_AS_UNICODE(*v);
72197219
}
72207220
else {
72217221
/* Extend unicode object */
@@ -7226,11 +7226,10 @@ decode_code_page_errors(UINT code_page,
72267226
}
72277227
if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
72287228
goto error;
7229-
startout = PyUnicode_AS_UNICODE(*v) + n;
7229+
out = PyUnicode_AS_UNICODE(*v) + n;
72307230
}
72317231

72327232
/* Decode the byte string character per character */
7233-
out = startout;
72347233
while (in < endin)
72357234
{
72367235
/* Decode a character */
@@ -7285,7 +7284,7 @@ decode_code_page_errors(UINT code_page,
72857284
*out = 0;
72867285

72877286
/* Extend unicode object */
7288-
outsize = out - startout;
7287+
outsize = out - PyUnicode_AS_UNICODE(*v);
72897288
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
72907289
if (unicode_resize(v, outsize) < 0)
72917290
goto error;

0 commit comments

Comments
 (0)