Skip to content

Commit b2e796a

Browse files
committed
In wide builds, avoid storing high (non-BMP) Unicode characters from source code as surrogate pairs
This is accomplished by decoding with UTF-32 instead of UTF-16 on all builds. The patch is by Adam Olsen.
1 parent 7b1b094 commit b2e796a

3 files changed

Lines changed: 25 additions & 9 deletions

File tree

Lib/test/test_pep263.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ def test_issue4626(self):
3636
exec(c, d)
3737
self.assertEquals(d['\xc6'], '\xc6')
3838

39+
def test_issue3297(self):
40+
c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
41+
d = {}
42+
exec(c, d)
43+
self.assertEqual(d['a'], d['b'])
44+
self.assertEqual(len(d['a']), len(d['b']))
45+
self.assertEqual(ascii(d['a']), ascii(d['b']))
46+
3947
def test_main():
4048
support.run_unittest(PEP263Test)
4149

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #3297: On wide unicode builds, do not split unicode characters into
16+
surrogates.
17+
1518
- Remove length limitation when constructing a complex number from a string.
1619

1720
- Issue #1087418: Boost performance of bitwise operations for longs.

Python/ast.c

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
32463246
u = NULL;
32473247
} else {
32483248
/* check for integer overflow */
3249-
if (len > PY_SIZE_MAX / 4)
3249+
if (len > PY_SIZE_MAX / 6)
32503250
return NULL;
3251-
/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
3252-
u = PyBytes_FromStringAndSize((char *)NULL, len * 4);
3251+
/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
3252+
"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
3253+
u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
32533254
if (u == NULL)
32543255
return NULL;
32553256
p = buf = PyBytes_AsString(u);
@@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
32663267
PyObject *w;
32673268
char *r;
32683269
Py_ssize_t rn, i;
3269-
w = decode_utf8(c, &s, end, "utf-16-be");
3270+
w = decode_utf8(c, &s, end, "utf-32-be");
32703271
if (w == NULL) {
32713272
Py_DECREF(u);
32723273
return NULL;
32733274
}
32743275
r = PyBytes_AS_STRING(w);
32753276
rn = Py_SIZE(w);
3276-
assert(rn % 2 == 0);
3277-
for (i = 0; i < rn; i += 2) {
3278-
sprintf(p, "\\u%02x%02x",
3277+
assert(rn % 4 == 0);
3278+
for (i = 0; i < rn; i += 4) {
3279+
sprintf(p, "\\U%02x%02x%02x%02x",
32793280
r[i + 0] & 0xFF,
3280-
r[i + 1] & 0xFF);
3281-
p += 6;
3281+
r[i + 1] & 0xFF,
3282+
r[i + 2] & 0xFF,
3283+
r[i + 3] & 0xFF);
3284+
p += 10;
32823285
}
3286+
/* Should be impossible to overflow */
3287+
assert(p - buf <= Py_SIZE(u));
32833288
Py_DECREF(w);
32843289
} else {
32853290
*p++ = *s++;

0 commit comments

Comments
 (0)