Skip to content

Commit d1e202f

Browse files
author
doerwalter
committed
SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
about illegal code points. The codec now supports PEP 293 style error handlers. (This is a variant of Nik Haldimann's patch that detects truncated data.) git-svn-id: http://svn.python.org/projects/python/trunk@39448 6015fed2-1504-0410-9fe1-9d1591cc4771
1 parent b674472 commit d1e202f

6 files changed

Lines changed: 173 additions & 5 deletions

File tree

Include/unicodeobject.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
797797
int length /* Number of Py_UNICODE chars to encode */
798798
);
799799

800+
/* --- Unicode Internal Codec ---------------------------------------------
801+
802+
Only for internal use in _codecsmodule.c */
803+
804+
PyObject *_PyUnicode_DecodeUnicodeInternal(
805+
const char *string,
806+
int length,
807+
const char *errors
808+
);
809+
800810
/* --- Latin-1 Codecs -----------------------------------------------------
801811
802812
Note: Latin-1 corresponds to the first 256 Unicode ordinals.

Lib/test/test_codeccallbacks.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def test_backslashescape(self):
111111
sout += "\\U%08x" % sys.maxunicode
112112
self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
113113

114-
def test_relaxedutf8(self):
114+
def test_decoderelaxedutf8(self):
115115
# This is the test for a decoding callback handler,
116116
# that relaxes the UTF-8 minimal encoding restriction.
117117
# A null byte that is encoded as "\xc0\x80" will be
@@ -158,6 +158,35 @@ def test_charmapencode(self):
158158
charmap[ord("?")] = u"XYZ"
159159
self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
160160

161+
def test_decodeunicodeinternal(self):
    # Check the "unicode-internal" decoder against strict, built-in and
    # custom PEP 293 error handlers.
    # Five bytes is never a whole number of 2- or 4-byte code units, so
    # strict decoding has to fail on the trailing byte.
    truncated = "\x00\x00\x00\x00\x00"
    self.assertRaises(
        UnicodeDecodeError,
        truncated.decode,
        "unicode-internal",
    )

    # The expected results below assume 4-byte code units.
    if sys.maxunicode <= 0xffff:
        return

    def handler_unicodeinternal(exc):
        # Emit U+0001 and resume decoding at input offset 1.
        if not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle %r" % exc)
        return (u"\x01", 1)

    # Built-in handlers: "ignore" drops the stray byte, "replace" turns
    # it into U+FFFD.
    for errors, expected in (
        ("ignore", u"\u0000"),
        ("replace", u"\u0000\ufffd"),
    ):
        self.assertEqual(
            truncated.decode("unicode-internal", errors),
            expected
        )

    codecs.register_error("test.hui", handler_unicodeinternal)

    # The custom handler restarts at offset 1, so the four bytes that
    # follow decode as one more U+0000 after the replacement.
    self.assertEqual(
        truncated.decode("unicode-internal", "test.hui"),
        u"\u0000\u0001\u0000"
    )
189+
161190
def test_callbacks(self):
162191
def handler1(exc):
163192
if not isinstance(exc, UnicodeEncodeError) \
@@ -503,7 +532,8 @@ def test_badhandlerresults(self):
503532
for (enc, bytes) in (
504533
("ascii", "\xff"),
505534
("utf-8", "\xff"),
506-
("utf-7", "+x-")
535+
("utf-7", "+x-"),
536+
("unicode-internal", "\x00"),
507537
):
508538
self.assertRaises(
509539
TypeError,

Lib/test/test_codecs.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from test import test_support
22
import unittest
33
import codecs
4-
import StringIO
4+
import sys, StringIO
55

66
class Queue(object):
77
"""
@@ -453,6 +453,54 @@ def test_decode(self):
453453
for uni, puny in punycode_testcases:
454454
self.assertEquals(uni, puny.decode("punycode"))
455455

456+
class UnicodeInternalTest(unittest.TestCase):
457+
def test_bug1251300(self):
458+
# Decoding with unicode_internal used to not correctly handle "code
459+
# points" above 0x10ffff on UCS-4 builds.
460+
if sys.maxunicode > 0xffff:
461+
ok = [
462+
("\x00\x10\xff\xff", u"\U0010ffff"),
463+
("\x00\x00\x01\x01", u"\U00000101"),
464+
("", u""),
465+
]
466+
not_ok = [
467+
"\x7f\xff\xff\xff",
468+
"\x80\x00\x00\x00",
469+
"\x81\x00\x00\x00",
470+
"\x00",
471+
"\x00\x00\x00\x00\x00",
472+
]
473+
for internal, uni in ok:
474+
if sys.byteorder == "little":
475+
internal = "".join(reversed(internal))
476+
self.assertEquals(uni, internal.decode("unicode_internal"))
477+
for internal in not_ok:
478+
if sys.byteorder == "little":
479+
internal = "".join(reversed(internal))
480+
self.assertRaises(UnicodeDecodeError, internal.decode,
481+
"unicode_internal")
482+
483+
def test_decode_error_attributes(self):
484+
if sys.maxunicode > 0xffff:
485+
try:
486+
"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
487+
except UnicodeDecodeError, ex:
488+
self.assertEquals("unicode_internal", ex.encoding)
489+
self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
490+
self.assertEquals(4, ex.start)
491+
self.assertEquals(8, ex.end)
492+
else:
493+
self.fail()
494+
495+
def test_decode_callback(self):
496+
if sys.maxunicode > 0xffff:
497+
codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
498+
decoder = codecs.getdecoder("unicode_internal")
499+
ab = u"ab".encode("unicode_internal")
500+
ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
501+
"UnicodeInternalTest")
502+
self.assertEquals((u"ab", 12), ignored)
503+
456504
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
457505
nameprep_tests = [
458506
# 3.1 Map to nothing.
@@ -885,6 +933,7 @@ def test_main():
885933
EscapeDecodeTest,
886934
RecodingTest,
887935
PunycodeTest,
936+
UnicodeInternalTest,
888937
NameprepTest,
889938
CodecTest,
890939
CodecsModuleTest,

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,10 @@ Library
435435
line ending. Remove the special handling of a "\r\n" that has been split
436436
between two lines.
437437

438+
- Bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
439+
about illegal code points. The codec now supports PEP 293 style error
440+
handlers.
441+
438442

439443
Build
440444
-----

Modules/_codecsmodule.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self,
254254
else {
255255
if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
256256
return NULL;
257-
return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
258-
size / sizeof(Py_UNICODE)),
257+
258+
return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
259259
size);
260260
}
261261
}

Objects/unicodeobject.c

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
22732273
PyUnicode_GET_SIZE(unicode));
22742274
}
22752275

2276+
/* --- Unicode Internal Codec ------------------------------------------- */
2277+
2278+
PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2279+
int size,
2280+
const char *errors)
2281+
{
2282+
const char *starts = s;
2283+
int startinpos;
2284+
int endinpos;
2285+
int outpos;
2286+
Py_UNICODE unimax;
2287+
PyUnicodeObject *v;
2288+
Py_UNICODE *p;
2289+
const char *end;
2290+
const char *reason;
2291+
PyObject *errorHandler = NULL;
2292+
PyObject *exc = NULL;
2293+
2294+
unimax = PyUnicode_GetMax();
2295+
v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2296+
if (v == NULL)
2297+
goto onError;
2298+
if (PyUnicode_GetSize((PyObject *)v) == 0)
2299+
return (PyObject *)v;
2300+
p = PyUnicode_AS_UNICODE(v);
2301+
end = s + size;
2302+
2303+
while (s < end) {
2304+
*p = *(Py_UNICODE *)s;
2305+
/* We have to sanity check the raw data, otherwise doom looms for
2306+
some malformed UCS-4 data. */
2307+
if (
2308+
#ifdef Py_UNICODE_WIDE
2309+
*p > unimax || *p < 0 ||
2310+
#endif
2311+
end-s < Py_UNICODE_SIZE
2312+
)
2313+
{
2314+
startinpos = s - starts;
2315+
if (end-s < Py_UNICODE_SIZE) {
2316+
endinpos = end-starts;
2317+
reason = "truncated input";
2318+
}
2319+
else {
2320+
endinpos = s - starts + Py_UNICODE_SIZE;
2321+
reason = "illegal code point (> 0x10FFFF)";
2322+
}
2323+
outpos = p - PyUnicode_AS_UNICODE(v);
2324+
if (unicode_decode_call_errorhandler(
2325+
errors, &errorHandler,
2326+
"unicode_internal", reason,
2327+
starts, size, &startinpos, &endinpos, &exc, &s,
2328+
(PyObject **)&v, &outpos, &p)) {
2329+
goto onError;
2330+
}
2331+
}
2332+
else {
2333+
p++;
2334+
s += Py_UNICODE_SIZE;
2335+
}
2336+
}
2337+
2338+
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2339+
goto onError;
2340+
Py_XDECREF(errorHandler);
2341+
Py_XDECREF(exc);
2342+
return (PyObject *)v;
2343+
2344+
onError:
2345+
Py_XDECREF(v);
2346+
Py_XDECREF(errorHandler);
2347+
Py_XDECREF(exc);
2348+
return NULL;
2349+
}
2350+
22762351
/* --- Latin-1 Codec ------------------------------------------------------ */
22772352

22782353
PyObject *PyUnicode_DecodeLatin1(const char *s,

0 commit comments

Comments
 (0)