Skip to content

Commit 8faf821

Browse files
author
Victor Stinner
committed
PyUnicode_FromWideChar() and PyUnicode_FromUnicode() raise a ValueError if a
character is not in range [U+0000; U+10ffff].
1 parent bc9f0c6 commit 8faf821

1 file changed

Lines changed: 34 additions & 33 deletions

File tree

Objects/unicodeobject.c

Lines changed: 34 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
6666
extern "C" {
6767
#endif
6868

69+
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70+
#define MAX_UNICODE 0x10ffff
71+
6972
#ifdef Py_DEBUG
7073
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
7174
#else
@@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
393396
}
394397
else {
395398
assert(maxchar >= 0x10000);
396-
/* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
397-
return characters outside the range U+0000-U+10FFFF. */
398-
/* assert(maxchar <= 0x10FFFF); */
399+
assert(maxchar <= MAX_UNICODE);
399400
}
400401
}
401402
return 1;
@@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
12951296
Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
12961297
{
12971298
const wchar_t *iter;
1299+
Py_UCS4 ch;
12981300

12991301
assert(num_surrogates != NULL && maxchar != NULL);
13001302
*num_surrogates = 0;
13011303
*maxchar = 0;
13021304

13031305
for (iter = begin; iter < end; ) {
1304-
if (*iter > *maxchar) {
1305-
*maxchar = *iter;
1306-
#if SIZEOF_WCHAR_T != 2
1307-
if (*maxchar >= 0x10000)
1308-
return 0;
1309-
#endif
1310-
}
13111306
#if SIZEOF_WCHAR_T == 2
13121307
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
13131308
&& (iter+1) < end
13141309
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
13151310
{
1316-
Py_UCS4 surrogate_val;
1317-
surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1311+
ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
13181312
++(*num_surrogates);
1319-
if (surrogate_val > *maxchar)
1320-
*maxchar = surrogate_val;
13211313
iter += 2;
13221314
}
13231315
else
1324-
iter++;
1325-
#else
1326-
iter++;
13271316
#endif
1317+
{
1318+
ch = *iter;
1319+
iter++;
1320+
}
1321+
if (ch > *maxchar) {
1322+
*maxchar = ch;
1323+
if (*maxchar > MAX_UNICODE) {
1324+
PyErr_Format(PyExc_ValueError,
1325+
"character U+%x is not in range [U+0000; U+10ffff]",
1326+
ch);
1327+
return -1;
1328+
}
1329+
}
13281330
}
13291331
return 0;
13301332
}
@@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
16691671
&maxchar, &num_surrogates) == -1)
16701672
return NULL;
16711673

1672-
unicode = PyUnicode_New(size - num_surrogates,
1673-
maxchar);
1674+
unicode = PyUnicode_New(size - num_surrogates, maxchar);
16741675
if (!unicode)
16751676
return NULL;
16761677

@@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind)
18081809
return 0x10000;
18091810
default:
18101811
assert(0 && "invalid kind");
1811-
return 0x10ffff;
1812+
return MAX_UNICODE;
18121813
}
18131814
}
18141815

@@ -2796,7 +2797,7 @@ PyObject *
27962797
PyUnicode_FromOrdinal(int ordinal)
27972798
{
27982799
PyObject *v;
2799-
if (ordinal < 0 || ordinal > 0x10ffff) {
2800+
if (ordinal < 0 || ordinal > MAX_UNICODE) {
28002801
PyErr_SetString(PyExc_ValueError,
28012802
"chr() arg not in range(0x110000)");
28022803
return NULL;
@@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
34723473
four_bytes = PyUnicode_4BYTE_DATA(unicode);
34733474
for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
34743475
if (*four_bytes > 0xFFFF) {
3475-
assert(*four_bytes <= 0x10FFFF);
3476+
assert(*four_bytes <= MAX_UNICODE);
34763477
/* encode surrogate pair in this case */
34773478
*w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
34783479
*w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
@@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str,
41184119
continue;
41194120
encode_char:
41204121
if (ch >= 0x10000) {
4121-
assert(ch <= 0x10FFFF);
4122+
assert(ch <= MAX_UNICODE);
41224123

41234124
/* code first surrogate */
41244125
base64bits += 16;
@@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
45774578
}
45784579
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
45794580
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4580-
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4581+
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
45814582

45824583
WRITE_MAYBE_FAIL(i++, ch);
45834584
break;
@@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
47144715
}
47154716
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
47164717
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4717-
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4718+
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
47184719

47194720
#if SIZEOF_WCHAR_T == 4
47204721
*p++ = (wchar_t)ch;
@@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
48844885
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
48854886
*p++ = (char)(0x80 | (ch & 0x3f));
48864887
} else /* ch >= 0x10000 */ {
4887-
assert(ch <= 0x10FFFF);
4888+
assert(ch <= MAX_UNICODE);
48884889
/* Encode UCS4 Unicode ordinals */
48894890
*p++ = (char)(0xf0 | (ch >> 18));
48904891
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
@@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
57925793
break;
57935794
store:
57945795
/* when we get here, chr is a 32-bit unicode character */
5795-
if (chr <= 0x10ffff) {
5796+
if (chr <= MAX_UNICODE) {
57965797
WRITECHAR(chr);
57975798
} else {
57985799
endinpos = s-starts;
@@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
59575958

59585959
/* Map 21-bit characters to '\U00xxxxxx' */
59595960
else if (ch >= 0x10000) {
5960-
assert(ch <= 0x10FFFF);
5961+
assert(ch <= MAX_UNICODE);
59615962
*p++ = '\\';
59625963
*p++ = 'U';
59635964
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
61086109
else
61096110
x += 10 + c - 'A';
61106111
}
6111-
if (x <= 0x10ffff) {
6112+
if (x <= MAX_UNICODE) {
61126113
if (unicode_putchar(&v, &outpos, x) < 0)
61136114
goto onError;
61146115
} else {
@@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
61756176
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
61766177
/* Map 32-bit characters to '\Uxxxxxxxx' */
61776178
if (ch >= 0x10000) {
6178-
assert(ch <= 0x10FFFF);
6179+
assert(ch <= MAX_UNICODE);
61796180
*p++ = '\\';
61806181
*p++ = 'U';
61816182
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode,
65366537
else if (ch < 1000000)
65376538
repsize += 2+6+1;
65386539
else {
6539-
assert(ch <= 0x10FFFF);
6540+
assert(ch <= MAX_UNICODE);
65406541
repsize += 2+7+1;
65416542
}
65426543
}
@@ -9275,7 +9276,7 @@ fixup(PyObject *self,
92759276
else if (maxchar_new <= 65535)
92769277
maxchar_new = 65535;
92779278
else
9278-
maxchar_new = 1114111; /* 0x10ffff */
9279+
maxchar_new = MAX_UNICODE;
92799280

92809281
if (!maxchar_new && PyUnicode_CheckExact(self)) {
92819282
/* fixfct should return TRUE if it modified the buffer. If
@@ -13059,7 +13060,7 @@ formatchar(PyObject *v)
1305913060
if (x == -1 && PyErr_Occurred())
1306013061
goto onError;
1306113062

13062-
if (x < 0 || x > 0x10ffff) {
13063+
if (x < 0 || x > MAX_UNICODE) {
1306313064
PyErr_SetString(PyExc_OverflowError,
1306413065
"%c arg not in range(0x110000)");
1306513066
return (Py_UCS4) -1;

0 commit comments

Comments
 (0)