Skip to content

Commit 292f72d

Browse files
author
alexandre.vassalotti
committed
Fix issue #4730: cPickle corrupts high-unicode strings.
Update the outdated copy of PyUnicode_EncodeRawUnicodeEscape in cPickle and add a test case.
git-svn-id: http://svn.python.org/projects/python/trunk@67934 6015fed2-1504-0410-9fe1-9d1591cc4771
1 parent e26be87 commit 292f72d

2 files changed

Lines changed: 88 additions & 32 deletions

File tree

Lib/test/pickletester.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,14 +480,21 @@ def test_insecure_strings(self):
480480

481481
if have_unicode:
482482
def test_unicode(self):
483-
endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
484-
unicode('<\n>'), unicode('<\\>')]
483+
endcases = [u'', u'<\\u>', u'<\\\\u1234>', u'<\n>',
484+
u'<\\>', u'<\\\\U00012345>']
485485
for proto in protocols:
486486
for u in endcases:
487487
p = self.dumps(u, proto)
488488
u2 = self.loads(p)
489489
self.assertEqual(u2, u)
490490

491+
def test_unicode_high_plane(self):
492+
t = u'\U00012345'
493+
for proto in protocols:
494+
p = self.dumps(t, proto)
495+
t2 = self.loads(p)
496+
self.assertEqual(t2, t)
497+
491498
def test_ints(self):
492499
import sys
493500
for proto in protocols:

Modules/cPickle.c

Lines changed: 79 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,41 +1255,90 @@ save_string(Picklerobject *self, PyObject *args, int doput)
12551255
/* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
12561256
backslash and newline characters to \uXXXX escapes. */
12571257
static PyObject *
1258-
modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
1258+
modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
12591259
{
1260-
PyObject *repr;
1261-
char *p;
1262-
char *q;
1260+
PyObject *repr;
1261+
char *p;
1262+
char *q;
12631263

1264-
static const char *hexdigit = "0123456789ABCDEF";
1264+
static const char *hexdigit = "0123456789abcdef";
1265+
#ifdef Py_UNICODE_WIDE
1266+
const Py_ssize_t expandsize = 10;
1267+
#else
1268+
const Py_ssize_t expandsize = 6;
1269+
#endif
12651270

1266-
repr = PyString_FromStringAndSize(NULL, 6 * size);
1267-
if (repr == NULL)
1268-
return NULL;
1269-
if (size == 0)
1270-
return repr;
1271-
1272-
p = q = PyString_AS_STRING(repr);
1273-
while (size-- > 0) {
1274-
Py_UNICODE ch = *s++;
1275-
/* Map 16-bit characters to '\uxxxx' */
1276-
if (ch >= 256 || ch == '\\' || ch == '\n') {
1277-
*p++ = '\\';
1278-
*p++ = 'u';
1279-
*p++ = hexdigit[(ch >> 12) & 0xf];
1280-
*p++ = hexdigit[(ch >> 8) & 0xf];
1281-
*p++ = hexdigit[(ch >> 4) & 0xf];
1282-
*p++ = hexdigit[ch & 15];
1283-
}
1284-
/* Copy everything else as-is */
1285-
else
1286-
*p++ = (char) ch;
1287-
}
1288-
*p = '\0';
1289-
_PyString_Resize(&repr, p - q);
1271+
if (size > PY_SSIZE_T_MAX / expandsize)
1272+
return PyErr_NoMemory();
1273+
1274+
repr = PyString_FromStringAndSize(NULL, expandsize * size);
1275+
if (repr == NULL)
1276+
return NULL;
1277+
if (size == 0)
12901278
return repr;
1291-
}
12921279

1280+
p = q = PyString_AS_STRING(repr);
1281+
while (size-- > 0) {
1282+
Py_UNICODE ch = *s++;
1283+
#ifdef Py_UNICODE_WIDE
1284+
/* Map 32-bit characters to '\Uxxxxxxxx' */
1285+
if (ch >= 0x10000) {
1286+
*p++ = '\\';
1287+
*p++ = 'U';
1288+
*p++ = hexdigit[(ch >> 28) & 0xf];
1289+
*p++ = hexdigit[(ch >> 24) & 0xf];
1290+
*p++ = hexdigit[(ch >> 20) & 0xf];
1291+
*p++ = hexdigit[(ch >> 16) & 0xf];
1292+
*p++ = hexdigit[(ch >> 12) & 0xf];
1293+
*p++ = hexdigit[(ch >> 8) & 0xf];
1294+
*p++ = hexdigit[(ch >> 4) & 0xf];
1295+
*p++ = hexdigit[ch & 15];
1296+
}
1297+
else
1298+
#else
1299+
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
1300+
if (ch >= 0xD800 && ch < 0xDC00) {
1301+
Py_UNICODE ch2;
1302+
Py_UCS4 ucs;
1303+
1304+
ch2 = *s++;
1305+
size--;
1306+
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1307+
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1308+
*p++ = '\\';
1309+
*p++ = 'U';
1310+
*p++ = hexdigit[(ucs >> 28) & 0xf];
1311+
*p++ = hexdigit[(ucs >> 24) & 0xf];
1312+
*p++ = hexdigit[(ucs >> 20) & 0xf];
1313+
*p++ = hexdigit[(ucs >> 16) & 0xf];
1314+
*p++ = hexdigit[(ucs >> 12) & 0xf];
1315+
*p++ = hexdigit[(ucs >> 8) & 0xf];
1316+
*p++ = hexdigit[(ucs >> 4) & 0xf];
1317+
*p++ = hexdigit[ucs & 0xf];
1318+
continue;
1319+
}
1320+
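/* Fall through: an unpaired high surrogate is not combined; step back so the next code unit is processed normally, and let the '\uxxxx' branch below escape the surrogate on its own */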
1321+
s--;
1322+
size++;
1323+
}
1324+
#endif
1325+
/* Map 16-bit characters to '\uxxxx' */
1326+
if (ch >= 256 || ch == '\\' || ch == '\n') {
1327+
*p++ = '\\';
1328+
*p++ = 'u';
1329+
*p++ = hexdigit[(ch >> 12) & 0xf];
1330+
*p++ = hexdigit[(ch >> 8) & 0xf];
1331+
*p++ = hexdigit[(ch >> 4) & 0xf];
1332+
*p++ = hexdigit[ch & 15];
1333+
}
1334+
/* Copy everything else as-is */
1335+
else
1336+
*p++ = (char) ch;
1337+
}
1338+
*p = '\0';
1339+
_PyString_Resize(&repr, p - q);
1340+
return repr;
1341+
}
12931342

12941343
static int
12951344
save_unicode(Picklerobject *self, PyObject *args, int doput)

0 commit comments

Comments
 (0)