Skip to content

Commit 2f61d22

Browse files
author
Victor Stinner
committed
Issue #7820: The parser tokenizer restores all bytes in the right order if the BOM
check fails. Fix an assertion in pydebug mode.
1 parent 6842351 commit 2f61d22

3 files changed

Lines changed: 46 additions & 22 deletions

File tree

Lib/test/test_pep263.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@ def test_issue3297(self):
3030
self.assertEqual(d['a'], d['b'])
3131
self.assertEqual(len(d['a']), len(d['b']))
3232

33+
def test_issue7820(self):
34+
# Ensure that check_bom() restores all bytes in the right order if
35+
# check_bom() fails in pydebug mode: a buffer starts with the first
36+
# byte of a valid BOM, but next bytes are different
37+
38+
# one byte in common with the UTF-16-LE BOM
39+
self.assertRaises(SyntaxError, eval, '\xff\x20')
40+
41+
# two bytes in common with the UTF-8 BOM
42+
self.assertRaises(SyntaxError, eval, '\xef\xbb\x20')
43+
3344
def test_main():
3445
test_support.run_unittest(PEP263Test)
3546

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 2.7 alpha 4?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #7820: The parser tokenizer restores all bytes in the right order if
16+
the BOM check fails.
17+
1518
- Issue #7309: Fix unchecked attribute access when converting
1619
UnicodeEncodeError, UnicodeDecodeError, and UnicodeTranslateError to
1720
strings.

Parser/tokenizer.c

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -312,47 +312,57 @@ check_bom(int get_char(struct tok_state *),
312312
int set_readline(struct tok_state *, const char *),
313313
struct tok_state *tok)
314314
{
315-
int ch = get_char(tok);
315+
int ch1, ch2, ch3;
316+
ch1 = get_char(tok);
316317
tok->decoding_state = 1;
317-
if (ch == EOF) {
318+
if (ch1 == EOF) {
318319
return 1;
319-
} else if (ch == 0xEF) {
320-
ch = get_char(tok);
321-
if (ch != 0xBB)
322-
goto NON_BOM;
323-
ch = get_char(tok);
324-
if (ch != 0xBF)
325-
goto NON_BOM;
320+
} else if (ch1 == 0xEF) {
321+
ch2 = get_char(tok);
322+
if (ch2 != 0xBB) {
323+
unget_char(ch2, tok);
324+
unget_char(ch1, tok);
325+
return 1;
326+
}
327+
ch3 = get_char(tok);
328+
if (ch3 != 0xBF) {
329+
unget_char(ch3, tok);
330+
unget_char(ch2, tok);
331+
unget_char(ch1, tok);
332+
return 1;
333+
}
326334
#if 0
327335
/* Disable support for UTF-16 BOMs until a decision
328336
is made whether this needs to be supported. */
329-
} else if (ch == 0xFE) {
330-
ch = get_char(tok);
331-
if (ch != 0xFF)
332-
goto NON_BOM;
337+
} else if (ch1 == 0xFE) {
338+
ch2 = get_char(tok);
339+
if (ch2 != 0xFF) {
340+
unget_char(ch2, tok);
341+
unget_char(ch1, tok);
342+
return 1;
343+
}
333344
if (!set_readline(tok, "utf-16-be"))
334345
return 0;
335346
tok->decoding_state = -1;
336-
} else if (ch == 0xFF) {
337-
ch = get_char(tok);
338-
if (ch != 0xFE)
339-
goto NON_BOM;
347+
} else if (ch1 == 0xFF) {
348+
ch2 = get_char(tok);
349+
if (ch2 != 0xFE) {
350+
unget_char(ch2, tok);
351+
unget_char(ch1, tok);
352+
return 1;
353+
}
340354
if (!set_readline(tok, "utf-16-le"))
341355
return 0;
342356
tok->decoding_state = -1;
343357
#endif
344358
} else {
345-
unget_char(ch, tok);
359+
unget_char(ch1, tok);
346360
return 1;
347361
}
348362
if (tok->encoding != NULL)
349363
PyMem_FREE(tok->encoding);
350364
tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
351365
return 1;
352-
NON_BOM:
353-
/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
354-
unget_char(0xFF, tok); /* XXX this will cause a syntax error */
355-
return 1;
356366
}
357367

358368
/* Read a line of text from TOK into S, using the stream in TOK.

0 commit comments

Comments
 (0)