Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
string. */
);

/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */

/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding.

   If consumed is not NULL, a truncated escape sequence at the end of the
   input is left undecoded: decoding stops in front of it and *consumed is
   set to the offset at which that sequence starts.  Passing NULL for
   consumed gives the behavior of PyUnicode_DecodeRawUnicodeEscape. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
const char *string, /* Raw-Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed /* bytes consumed; may be NULL */
);

/* --- Latin-1 Codecs ----------------------------------------------------- */

PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
Expand Down
9 changes: 5 additions & 4 deletions Lib/encodings/raw_unicode_escape.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.raw_unicode_escape_encode(input, self.errors)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
return codecs.raw_unicode_escape_decode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def _buffer_decode(self, input, errors, final):
return codecs.raw_unicode_escape_decode(input, errors, final)

class StreamWriter(Codec,codecs.StreamWriter):
pass

class StreamReader(Codec,codecs.StreamReader):
pass
def decode(self, input, errors='strict'):
return codecs.raw_unicode_escape_decode(input, errors, False)

### encodings module API

Expand Down
35 changes: 34 additions & 1 deletion Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2483,7 +2483,11 @@ def test_partial(self):
]
)

class RawUnicodeEscapeTest(unittest.TestCase):
class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
encoding = "raw-unicode-escape"

test_lone_surrogates = None

def test_empty(self):
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
Expand Down Expand Up @@ -2532,6 +2536,35 @@ def test_decode_errors(self):
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

def test_partial(self):
# The expected list holds the cumulative decoded text after each step of
# feeding the encoded input (presumably one byte per step -- check_partial
# is defined outside this view; confirm there).  A repeated entry means
# the decoder is still waiting for the rest of an escape sequence.
self.check_partial(
"\x00\t\n\r\\\xff\uffff\U00010000",
[
'\x00',
'\x00\t',
'\x00\t\n',
'\x00\t\n\r',
# Lone backslash: nothing new is emitted until the next byte shows
# whether it starts a \u or \U escape.
'\x00\t\n\r',
# The byte after the backslash is not 'u' or 'U', so both are emitted.
'\x00\t\n\r\\\xff',
# Backslash, 'u' and the first three hex digits: still incomplete.
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
# Fourth hex digit completes the \u escape.
'\x00\t\n\r\\\xff\uffff',
# Backslash, 'U' and the first seven hex digits: still incomplete.
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
# Eighth hex digit completes the \U escape.
'\x00\t\n\r\\\xff\uffff\U00010000',
]
)


class EscapeEncodeTest(unittest.TestCase):

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix incremental decoder and stream reader in the "raw-unicode-escape" codec.
Previously they failed if the escape sequence was split.
13 changes: 8 additions & 5 deletions Modules/_codecsmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
_codecs.raw_unicode_escape_decode
data: Py_buffer(accept={str, buffer})
errors: str(accept={str, NoneType}) = None
final: bool(accept={int}) = True
/
[clinic start generated code]*/

static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
const char *errors)
/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/
const char *errors, int final)
/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
{
PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
errors);
return codec_tuple(decoded, data->len);
Py_ssize_t consumed = data->len;
PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
errors,
final ? NULL : &consumed);
return codec_tuple(decoded, consumed);
}

/*[clinic input]
Expand Down
18 changes: 13 additions & 5 deletions Modules/clinic/_codecsmodule.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 44 additions & 20 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message;

#define WRITE_ASCII_CHAR(ch) \
Expand All @@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
continue;
}

startinpos = s - starts - 1;
Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
Expand Down Expand Up @@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
*consumed = startinpos;
break;
}
error:
endinpos = s-starts;
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
Expand Down Expand Up @@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* --- Raw Unicode Escape Codec ------------------------------------------- */

PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
_PyUnicodeWriter writer;
Expand All @@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *exc = NULL;

if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}

Expand All @@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message;

#define WRITE_CHAR(ch) \
Expand All @@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
} while(0)

/* Non-escape characters are interpreted as Unicode ordinals */
if (c != '\\' || s >= end) {
if (c != '\\' || (s >= end && !consumed)) {
WRITE_CHAR(c);
continue;
}

Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
assert(consumed);
// Set message to silence a compiler warning.
// It is never actually used.
message = "\\ at end of string";
goto incomplete;
}

c = (unsigned char) *s++;
if (c == 'u') {
count = 4;
Expand All @@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
WRITE_CHAR(c);
continue;
}
startinpos = s - starts - 2;

/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
for (ch = 0; count && s < end; ++s, --count) {
for (ch = 0; count; ++s, --count) {
if (s >= end) {
goto incomplete;
}
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
Expand All @@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
break;
goto error;
}
}
if (!count) {
if (ch <= MAX_UNICODE) {
WRITE_CHAR(ch);
continue;
}
if (ch > MAX_UNICODE) {
message = "\\Uxxxxxxxx out of range";
goto error;
}
WRITE_CHAR(ch);
continue;

endinpos = s-starts;
incomplete:
if (consumed) {
*consumed = startinpos;
break;
}
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
Expand All @@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
}

/* Public, non-incremental entry point of the Raw-Unicode-Escape decoder.
   It forwards to the stateful variant with a NULL consumed pointer, so no
   partial-decoding position is reported back to the caller. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    Py_ssize_t *no_partial_decoding = NULL;
    PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(
        s, size, errors, no_partial_decoding);
    return decoded;
}


Expand Down