Skip to content

Commit 5646648

Browse files
committed
Issue 28128: Print out better error/warning messages for invalid string escapes. Backport to 3.6.
1 parent 7f0514a commit 5646648

File tree

8 files changed

+173
-22
lines changed

8 files changed

+173
-22
lines changed

Include/bytesobject.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
7474
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
7575
const char *, Py_ssize_t,
7676
const char *);
77+
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
78+
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
79+
const char *, Py_ssize_t,
80+
const char *,
81+
const char **);
7782

7883
/* Macro, trading safety for speed */
7984
#ifndef Py_LIMITED_API

Include/unicodeobject.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
14861486
const char *errors /* error handling */
14871487
);
14881488

1489+
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
1490+
chars. */
1491+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
1492+
const char *string, /* Unicode-Escape encoded string */
1493+
Py_ssize_t length, /* size of string */
1494+
const char *errors, /* error handling */
1495+
const char **first_invalid_escape /* on return, points to first
1496+
invalid escaped char in
1497+
string. */
1498+
);
1499+
14891500
PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
14901501
PyObject *unicode /* Unicode object */
14911502
);

Lib/test/test_string_literals.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import sys
3232
import shutil
3333
import tempfile
34+
import warnings
3435
import unittest
3536

3637

@@ -104,6 +105,19 @@ def test_eval_str_incomplete(self):
104105
self.assertRaises(SyntaxError, eval, r""" '\U000000' """)
105106
self.assertRaises(SyntaxError, eval, r""" '\U0000000' """)
106107

108+
def test_eval_str_invalid_escape(self):
109+
for b in range(1, 128):
110+
if b in b"""\n\r"'01234567NU\\abfnrtuvx""":
111+
continue
112+
with self.assertWarns(DeprecationWarning):
113+
self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b))
114+
with warnings.catch_warnings(record=True) as w:
115+
warnings.simplefilter('always', category=DeprecationWarning)
116+
eval("'''\n\\z'''")
117+
self.assertEqual(len(w), 1)
118+
self.assertEqual(w[0].filename, '<string>')
119+
self.assertEqual(w[0].lineno, 2)
120+
107121
def test_eval_str_raw(self):
108122
self.assertEqual(eval(""" r'x' """), 'x')
109123
self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@@ -130,6 +144,19 @@ def test_eval_bytes_incomplete(self):
130144
self.assertRaises(SyntaxError, eval, r""" b'\x' """)
131145
self.assertRaises(SyntaxError, eval, r""" b'\x0' """)
132146

147+
def test_eval_bytes_invalid_escape(self):
148+
for b in range(1, 128):
149+
if b in b"""\n\r"'01234567\\abfnrtvx""":
150+
continue
151+
with self.assertWarns(DeprecationWarning):
152+
self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b]))
153+
with warnings.catch_warnings(record=True) as w:
154+
warnings.simplefilter('always', category=DeprecationWarning)
155+
eval("b'''\n\\z'''")
156+
self.assertEqual(len(w), 1)
157+
self.assertEqual(w[0].filename, '<string>')
158+
self.assertEqual(w[0].lineno, 2)
159+
133160
def test_eval_bytes_raw(self):
134161
self.assertEqual(eval(""" br'x' """), b'x')
135162
self.assertEqual(eval(""" rb'x' """), b'x')

Lib/test/test_unicode.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2413,13 +2413,6 @@ def test_free_after_iterating(self):
24132413
support.check_free_after_iterating(self, iter, str)
24142414
support.check_free_after_iterating(self, reversed, str)
24152415

2416-
def test_invalid_sequences(self):
2417-
for letter in string.ascii_letters + "89": # 0-7 are octal escapes
2418-
if letter in "abfnrtuvxNU":
2419-
continue
2420-
with self.assertWarns(DeprecationWarning):
2421-
eval(r"'\%s'" % letter)
2422-
24232416

24242417
class CAPITest(unittest.TestCase):
24252418

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ What's New in Python 3.6.0 beta 3
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #28128: Deprecation warning for invalid str and byte escape
14+
sequences now prints better information about where the error
15+
occurs. Patch by Serhiy Storchaka and Eric Smith.
16+
1317
- Issue #28509: dict.update() no longer allocates unnecessarily large memory.
1418

1519
- Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug

Objects/bytesobject.c

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,11 +1105,12 @@ _PyBytes_DecodeEscapeRecode(const char **s, const char *end,
11051105
return p;
11061106
}
11071107

1108-
PyObject *PyBytes_DecodeEscape(const char *s,
1108+
PyObject *_PyBytes_DecodeEscape(const char *s,
11091109
Py_ssize_t len,
11101110
const char *errors,
11111111
Py_ssize_t unicode,
1112-
const char *recode_encoding)
1112+
const char *recode_encoding,
1113+
const char **first_invalid_escape)
11131114
{
11141115
int c;
11151116
char *p;
@@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const char *s,
11231124
return NULL;
11241125
writer.overallocate = 1;
11251126

1127+
*first_invalid_escape = NULL;
1128+
11261129
end = s + len;
11271130
while (s < end) {
11281131
if (*s != '\\') {
@@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const char *s,
12071210
break;
12081211

12091212
default:
1210-
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0)
1211-
goto failed;
1213+
if (*first_invalid_escape == NULL) {
1214+
*first_invalid_escape = s-1; /* Back up one char, since we've
1215+
already incremented s. */
1216+
}
12121217
*p++ = '\\';
1218+
s--;
12131219
goto non_esc; /* an arbitrary number of unescaped
12141220
UTF-8 bytes may follow. */
12151221
}
@@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const char *s,
12221228
return NULL;
12231229
}
12241230

1231+
PyObject *PyBytes_DecodeEscape(const char *s,
1232+
Py_ssize_t len,
1233+
const char *errors,
1234+
Py_ssize_t unicode,
1235+
const char *recode_encoding)
1236+
{
1237+
const char* first_invalid_escape;
1238+
PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode,
1239+
recode_encoding,
1240+
&first_invalid_escape);
1241+
if (result == NULL)
1242+
return NULL;
1243+
if (first_invalid_escape != NULL) {
1244+
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
1245+
"invalid escape sequence '\\%c'",
1246+
*first_invalid_escape) < 0) {
1247+
Py_DECREF(result);
1248+
return NULL;
1249+
}
1250+
}
1251+
return result;
1252+
1253+
}
12251254
/* -------------------------------------------------------------------- */
12261255
/* object api */
12271256

Objects/unicodeobject.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5896,16 +5896,20 @@ PyUnicode_AsUTF16String(PyObject *unicode)
58965896
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
58975897

58985898
PyObject *
5899-
PyUnicode_DecodeUnicodeEscape(const char *s,
5900-
Py_ssize_t size,
5901-
const char *errors)
5899+
_PyUnicode_DecodeUnicodeEscape(const char *s,
5900+
Py_ssize_t size,
5901+
const char *errors,
5902+
const char **first_invalid_escape)
59025903
{
59035904
const char *starts = s;
59045905
_PyUnicodeWriter writer;
59055906
const char *end;
59065907
PyObject *errorHandler = NULL;
59075908
PyObject *exc = NULL;
59085909

5910+
// so we can remember if we've seen an invalid escape char or not
5911+
*first_invalid_escape = NULL;
5912+
59095913
if (size == 0) {
59105914
_Py_RETURN_UNICODE_EMPTY();
59115915
}
@@ -6080,9 +6084,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
60806084
goto error;
60816085

60826086
default:
6083-
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6084-
"invalid escape sequence '\\%c'", c) < 0)
6085-
goto onError;
6087+
if (*first_invalid_escape == NULL) {
6088+
*first_invalid_escape = s-1; /* Back up one char, since we've
6089+
already incremented s. */
6090+
}
60866091
WRITE_ASCII_CHAR('\\');
60876092
WRITE_CHAR(c);
60886093
continue;
@@ -6117,6 +6122,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
61176122
return NULL;
61186123
}
61196124

6125+
PyObject *
6126+
PyUnicode_DecodeUnicodeEscape(const char *s,
6127+
Py_ssize_t size,
6128+
const char *errors)
6129+
{
6130+
const char *first_invalid_escape;
6131+
PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6132+
&first_invalid_escape);
6133+
if (result == NULL)
6134+
return NULL;
6135+
if (first_invalid_escape != NULL) {
6136+
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6137+
"invalid escape sequence '\\%c'",
6138+
*first_invalid_escape) < 0) {
6139+
Py_DECREF(result);
6140+
return NULL;
6141+
}
6142+
}
6143+
return result;
6144+
}
6145+
61206146
/* Return a Unicode-Escape string version of the Unicode object.
61216147
61226148
If quotes is true, the string is enclosed in u"" or u'' quotes as

Python/ast.c

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const char **sPtr, const char *end)
41134113
return PyUnicode_DecodeUTF8(t, s - t, NULL);
41144114
}
41154115

4116+
static int
4117+
warn_invalid_escape_sequence(struct compiling *c, const node *n,
4118+
char first_invalid_escape_char)
4119+
{
4120+
PyObject *msg = PyUnicode_FromFormat("invalid escape sequence \\%c",
4121+
first_invalid_escape_char);
4122+
if (msg == NULL) {
4123+
return -1;
4124+
}
4125+
if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg,
4126+
c->c_filename, LINENO(n),
4127+
NULL, NULL) < 0 &&
4128+
PyErr_ExceptionMatches(PyExc_DeprecationWarning))
4129+
{
4130+
const char *s = PyUnicode_AsUTF8(msg);
4131+
if (s != NULL) {
4132+
ast_error(c, n, s);
4133+
}
4134+
Py_DECREF(msg);
4135+
return -1;
4136+
}
4137+
Py_DECREF(msg);
4138+
return 0;
4139+
}
4140+
41164141
static PyObject *
4117-
decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
4142+
decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s,
4143+
size_t len)
41184144
{
41194145
PyObject *v, *u;
41204146
char *buf;
@@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
41674193
len = p - buf;
41684194
s = buf;
41694195

4170-
v = PyUnicode_DecodeUnicodeEscape(s, len, NULL);
4196+
const char *first_invalid_escape;
4197+
v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
4198+
4199+
if (v != NULL && first_invalid_escape != NULL) {
4200+
if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
4201+
/* u has not been decref'ed yet because first_invalid_escape points
4202+
inside u. */
4203+
Py_XDECREF(u);
4204+
Py_DECREF(v);
4205+
return NULL;
4206+
}
4207+
}
41714208
Py_XDECREF(u);
41724209
return v;
41734210
}
41744211

4212+
static PyObject *
4213+
decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s,
4214+
size_t len)
4215+
{
4216+
const char *first_invalid_escape;
4217+
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL,
4218+
&first_invalid_escape);
4219+
if (result == NULL)
4220+
return NULL;
4221+
4222+
if (first_invalid_escape != NULL) {
4223+
if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
4224+
Py_DECREF(result);
4225+
return NULL;
4226+
}
4227+
}
4228+
return result;
4229+
}
4230+
41754231
/* Compile this expression in to an expr_ty. Add parens around the
41764232
expression, in order to allow leading spaces in the expression. */
41774233
static expr_ty
@@ -4310,7 +4366,7 @@ fstring_find_literal(const char **str, const char *end, int raw,
43104366
literal_end-literal_start,
43114367
NULL, NULL);
43124368
else
4313-
*literal = decode_unicode_with_escapes(c, literal_start,
4369+
*literal = decode_unicode_with_escapes(c, n, literal_start,
43144370
literal_end-literal_start);
43154371
if (!*literal)
43164372
return -1;
@@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node *n, int *bytesmode, int *rawmode,
50485104
if (*rawmode)
50495105
*result = PyBytes_FromStringAndSize(s, len);
50505106
else
5051-
*result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL);
5107+
*result = decode_bytes_with_escapes(c, n, s, len);
50525108
} else {
50535109
if (*rawmode)
50545110
*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
50555111
else
5056-
*result = decode_unicode_with_escapes(c, s, len);
5112+
*result = decode_unicode_with_escapes(c, n, s, len);
50575113
}
50585114
return *result == NULL ? -1 : 0;
50595115
}

0 commit comments

Comments
 (0)