@@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
6666extern "C" {
6767#endif
6868
69+ /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70+ #define MAX_UNICODE 0x10ffff
71+
6972#ifdef Py_DEBUG
7073# define _PyUnicode_CHECK (op ) _PyUnicode_CheckConsistency(op, 0)
7174#else
@@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
393396 }
394397 else {
395398 assert (maxchar >= 0x10000 );
396- /* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
397- return characters outside the range U+0000-U+10FFFF. */
398- /* assert(maxchar <= 0x10FFFF); */
399+ assert (maxchar <= MAX_UNICODE );
399400 }
400401 }
401402 return 1 ;
@@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
12951296 Py_UCS4 * maxchar , Py_ssize_t * num_surrogates )
12961297{
12971298 const wchar_t * iter ;
1299+ Py_UCS4 ch ;
12981300
12991301 assert (num_surrogates != NULL && maxchar != NULL );
13001302 * num_surrogates = 0 ;
13011303 * maxchar = 0 ;
13021304
13031305 for (iter = begin ; iter < end ; ) {
1304- if (* iter > * maxchar ) {
1305- * maxchar = * iter ;
1306- #if SIZEOF_WCHAR_T != 2
1307- if (* maxchar >= 0x10000 )
1308- return 0 ;
1309- #endif
1310- }
13111306#if SIZEOF_WCHAR_T == 2
13121307 if (Py_UNICODE_IS_HIGH_SURROGATE (iter [0 ])
13131308 && (iter + 1 ) < end
13141309 && Py_UNICODE_IS_LOW_SURROGATE (iter [1 ]))
13151310 {
1316- Py_UCS4 surrogate_val ;
1317- surrogate_val = Py_UNICODE_JOIN_SURROGATES (iter [0 ], iter [1 ]);
1311+ ch = Py_UNICODE_JOIN_SURROGATES (iter [0 ], iter [1 ]);
13181312 ++ (* num_surrogates );
1319- if (surrogate_val > * maxchar )
1320- * maxchar = surrogate_val ;
13211313 iter += 2 ;
13221314 }
13231315 else
1324- iter ++ ;
1325- #else
1326- iter ++ ;
13271316#endif
1317+ {
1318+ ch = * iter ;
1319+ iter ++ ;
1320+ }
1321+ if (ch > * maxchar ) {
1322+ * maxchar = ch ;
1323+ if (* maxchar > MAX_UNICODE ) {
1324+ PyErr_Format (PyExc_ValueError ,
1325+ "character U+%x is not in range [U+0000; U+10ffff]" ,
1326+ ch );
1327+ return -1 ;
1328+ }
1329+ }
13281330 }
13291331 return 0 ;
13301332}
@@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
16691671 & maxchar , & num_surrogates ) == -1 )
16701672 return NULL ;
16711673
1672- unicode = PyUnicode_New (size - num_surrogates ,
1673- maxchar );
1674+ unicode = PyUnicode_New (size - num_surrogates , maxchar );
16741675 if (!unicode )
16751676 return NULL ;
16761677
@@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind)
18081809 return 0x10000 ;
18091810 default :
18101811 assert (0 && "invalid kind" );
1811- return 0x10ffff ;
1812+ return MAX_UNICODE ;
18121813 }
18131814}
18141815
@@ -2796,7 +2797,7 @@ PyObject *
27962797PyUnicode_FromOrdinal (int ordinal )
27972798{
27982799 PyObject * v ;
2799- if (ordinal < 0 || ordinal > 0x10ffff ) {
2800+ if (ordinal < 0 || ordinal > MAX_UNICODE ) {
28002801 PyErr_SetString (PyExc_ValueError ,
28012802 "chr() arg not in range(0x110000)" );
28022803 return NULL ;
@@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
34723473 four_bytes = PyUnicode_4BYTE_DATA (unicode );
34733474 for (; four_bytes < ucs4_end ; ++ four_bytes , ++ w ) {
34743475 if (* four_bytes > 0xFFFF ) {
3475- assert (* four_bytes <= 0x10FFFF );
3476+ assert (* four_bytes <= MAX_UNICODE );
34763477 /* encode surrogate pair in this case */
34773478 * w ++ = Py_UNICODE_HIGH_SURROGATE (* four_bytes );
34783479 * w = Py_UNICODE_LOW_SURROGATE (* four_bytes );
@@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str,
41184119 continue ;
41194120encode_char :
41204121 if (ch >= 0x10000 ) {
4121- assert (ch <= 0x10FFFF );
4122+ assert (ch <= MAX_UNICODE );
41224123
41234124 /* code first surrogate */
41244125 base64bits += 16 ;
@@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
45774578 }
45784579 ch = ((s [0 ] & 0x7 ) << 18 ) + ((s [1 ] & 0x3f ) << 12 ) +
45794580 ((s [2 ] & 0x3f ) << 6 ) + (s [3 ] & 0x3f );
4580- assert ((ch > 0xFFFF ) && (ch <= 0x10ffff ));
4581+ assert ((ch > 0xFFFF ) && (ch <= MAX_UNICODE ));
45814582
45824583 WRITE_MAYBE_FAIL (i ++ , ch );
45834584 break ;
@@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
47144715 }
47154716 ch = ((s [0 ] & 0x7 ) << 18 ) + ((s [1 ] & 0x3f ) << 12 ) +
47164717 ((s [2 ] & 0x3f ) << 6 ) + (s [3 ] & 0x3f );
4717- assert ((ch > 0xFFFF ) && (ch <= 0x10ffff ));
4718+ assert ((ch > 0xFFFF ) && (ch <= MAX_UNICODE ));
47184719
47194720#if SIZEOF_WCHAR_T == 4
47204721 * p ++ = (wchar_t )ch ;
@@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
48844885 * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
48854886 * p ++ = (char )(0x80 | (ch & 0x3f ));
48864887 } else /* ch >= 0x10000 */ {
4887- assert (ch <= 0x10FFFF );
4888+ assert (ch <= MAX_UNICODE );
48884889 /* Encode UCS4 Unicode ordinals */
48894890 * p ++ = (char )(0xf0 | (ch >> 18 ));
48904891 * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
@@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
57925793 break ;
57935794 store :
57945795 /* when we get here, chr is a 32-bit unicode character */
5795- if (chr <= 0x10ffff ) {
5796+ if (chr <= MAX_UNICODE ) {
57965797 WRITECHAR (chr );
57975798 } else {
57985799 endinpos = s - starts ;
@@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
59575958
59585959 /* Map 21-bit characters to '\U00xxxxxx' */
59595960 else if (ch >= 0x10000 ) {
5960- assert (ch <= 0x10FFFF );
5961+ assert (ch <= MAX_UNICODE );
59615962 * p ++ = '\\' ;
59625963 * p ++ = 'U' ;
59635964 * p ++ = Py_hexdigits [(ch >> 28 ) & 0x0000000F ];
@@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
61086109 else
61096110 x += 10 + c - 'A' ;
61106111 }
6111- if (x <= 0x10ffff ) {
6112+ if (x <= MAX_UNICODE ) {
61126113 if (unicode_putchar (& v , & outpos , x ) < 0 )
61136114 goto onError ;
61146115 } else {
@@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
61756176 Py_UCS4 ch = PyUnicode_READ (kind , data , pos );
61766177 /* Map 32-bit characters to '\Uxxxxxxxx' */
61776178 if (ch >= 0x10000 ) {
6178- assert (ch <= 0x10FFFF );
6179+ assert (ch <= MAX_UNICODE );
61796180 * p ++ = '\\' ;
61806181 * p ++ = 'U' ;
61816182 * p ++ = Py_hexdigits [(ch >> 28 ) & 0xf ];
@@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode,
65366537 else if (ch < 1000000 )
65376538 repsize += 2 + 6 + 1 ;
65386539 else {
6539- assert (ch <= 0x10FFFF );
6540+ assert (ch <= MAX_UNICODE );
65406541 repsize += 2 + 7 + 1 ;
65416542 }
65426543 }
@@ -9275,7 +9276,7 @@ fixup(PyObject *self,
92759276 else if (maxchar_new <= 65535 )
92769277 maxchar_new = 65535 ;
92779278 else
9278- maxchar_new = 1114111 ; /* 0x10ffff */
9279+ maxchar_new = MAX_UNICODE ;
92799280
92809281 if (!maxchar_new && PyUnicode_CheckExact (self )) {
92819282 /* fixfct should return TRUE if it modified the buffer. If
@@ -13059,7 +13060,7 @@ formatchar(PyObject *v)
1305913060 if (x == -1 && PyErr_Occurred ())
1306013061 goto onError ;
1306113062
13062- if (x < 0 || x > 0x10ffff ) {
13063+ if (x < 0 || x > MAX_UNICODE ) {
1306313064 PyErr_SetString (PyExc_OverflowError ,
1306413065 "%c arg not in range(0x110000)" );
1306513066 return (Py_UCS4 ) - 1 ;
0 commit comments