Skip to content

Commit c4ef489

Browse files
ammaraskar authored and taleinat committed
bpo-33899: Make tokenize module mirror "end-of-file is end-of-line" behavior (GH-7891)
Most of the change involves fixing up the test suite, which previously made the assumption that there wouldn't be a new line if the input didn't end in one. Contributed by Ammar Askar.
1 parent 3c8aae9 commit c4ef489

3 files changed

Lines changed: 60 additions & 24 deletions

File tree

Lib/test/test_tokenize.py

Lines changed: 47 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from test import support
22
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
33
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
4-
open as tokenize_open, Untokenizer, generate_tokens)
4+
open as tokenize_open, Untokenizer, generate_tokens,
5+
NEWLINE)
56
from io import BytesIO, StringIO
67
import unittest
78
from unittest import TestCase, mock
@@ -11,27 +12,51 @@
1112
import token
1213

1314

15+
# Converts a source string into a list of textual representations
16+
# of the tokens such as:
17+
# ` NAME 'if' (1, 0) (1, 2)`
18+
# to make writing tests easier.
19+
def stringify_tokens_from_source(token_generator, source_string):
20+
result = []
21+
num_lines = len(source_string.splitlines())
22+
missing_trailing_nl = source_string[-1] not in '\r\n'
23+
24+
for type, token, start, end, line in token_generator:
25+
if type == ENDMARKER:
26+
break
27+
# Ignore the new line on the last line if the input lacks one
28+
if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
29+
continue
30+
type = tok_name[type]
31+
result.append(f" {type:10} {token!r:13} {start} {end}")
32+
33+
return result
34+
1435
class TokenizeTest(TestCase):
1536
# Tests for the tokenize module.
1637

1738
# The tests can be really simple. Given a small fragment of source
18-
# code, print out a table with tokens. The ENDMARKER is omitted for
19-
# brevity.
39+
# code, print out a table with tokens. The ENDMARKER, ENCODING and
40+
# final NEWLINE are omitted for brevity.
2041

2142
def check_tokenize(self, s, expected):
2243
# Format the tokens in s in a table format.
23-
# The ENDMARKER is omitted.
24-
result = []
44+
# The ENDMARKER and final NEWLINE are omitted.
2545
f = BytesIO(s.encode('utf-8'))
26-
for type, token, start, end, line in tokenize(f.readline):
27-
if type == ENDMARKER:
28-
break
29-
type = tok_name[type]
30-
result.append(f" {type:10} {token!r:13} {start} {end}")
46+
result = stringify_tokens_from_source(tokenize(f.readline), s)
47+
3148
self.assertEqual(result,
3249
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
3350
expected.rstrip().splitlines())
3451

52+
def test_implicit_newline(self):
53+
# Make sure that the tokenizer puts in an implicit NEWLINE
54+
# when the input lacks a trailing new line.
55+
f = BytesIO("x".encode('utf-8'))
56+
tokens = list(tokenize(f.readline))
57+
self.assertEqual(tokens[-2].type, NEWLINE)
58+
self.assertEqual(tokens[-1].type, ENDMARKER)
59+
3560
def test_basic(self):
3661
self.check_tokenize("1 + 1", """\
3762
NUMBER '1' (1, 0) (1, 1)
@@ -922,14 +947,9 @@ async def bar(): pass
922947
class GenerateTokensTest(TokenizeTest):
923948
def check_tokenize(self, s, expected):
924949
# Format the tokens in s in a table format.
925-
# The ENDMARKER is omitted.
926-
result = []
950+
# The ENDMARKER and final NEWLINE are omitted.
927951
f = StringIO(s)
928-
for type, token, start, end, line in generate_tokens(f.readline):
929-
if type == ENDMARKER:
930-
break
931-
type = tok_name[type]
932-
result.append(f" {type:10} {token!r:13} {start} {end}")
952+
result = stringify_tokens_from_source(generate_tokens(f.readline), s)
933953
self.assertEqual(result, expected.rstrip().splitlines())
934954

935955

@@ -1022,8 +1042,8 @@ def readline():
10221042
else:
10231043
return b''
10241044

1025-
# skip the initial encoding token and the end token
1026-
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
1045+
# skip the initial encoding token and the end tokens
1046+
tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
10271047
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
10281048
self.assertEqual(tokens, expected_tokens,
10291049
"bytes not decoded with encoding")
@@ -1039,8 +1059,8 @@ def readline():
10391059
else:
10401060
return b''
10411061

1042-
# skip the end token
1043-
tokens = list(_tokenize(readline, encoding=None))[:-1]
1062+
# skip the end tokens
1063+
tokens = list(_tokenize(readline, encoding=None))[:-2]
10441064
expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
10451065
self.assertEqual(tokens, expected_tokens,
10461066
"string not tokenized when encoding is None")
@@ -1351,18 +1371,21 @@ def test_oneline_defs(self):
13511371

13521372
# Test that 500 consecutive, one-line defs are OK
13531373
toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1354-
self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
1374+
self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
1375+
# [-2] is always NEWLINE
13551376

13561377
def assertExactTypeEqual(self, opstr, *optypes):
13571378
tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
13581379
num_optypes = len(optypes)
1359-
self.assertEqual(len(tokens), 2 + num_optypes)
1380+
self.assertEqual(len(tokens), 3 + num_optypes)
13601381
self.assertEqual(tok_name[tokens[0].exact_type],
13611382
tok_name[ENCODING])
13621383
for i in range(num_optypes):
13631384
self.assertEqual(tok_name[tokens[i + 1].exact_type],
13641385
tok_name[optypes[i]])
13651386
self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
1387+
tok_name[token.NEWLINE])
1388+
self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
13661389
tok_name[token.ENDMARKER])
13671390

13681391
def test_exact_type(self):
@@ -1515,7 +1538,7 @@ def test_roundtrip(self):
15151538
self.check_roundtrip("if x == 1:\n"
15161539
" print(x)\n")
15171540
self.check_roundtrip("# This is a comment\n"
1518-
"# This also")
1541+
"# This also\n")
15191542

15201543
# Some people use different formatting conventions, which makes
15211544
# untokenize a little trickier. Note that this test involves trailing

Lib/tokenize.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
492492
# BOM will already have been stripped.
493493
encoding = "utf-8"
494494
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
495+
last_line = b''
496+
line = b''
495497
while True: # loop over lines in stream
496498
try:
499+
# We capture the value of the line variable here because
500+
# readline uses the empty string '' to signal end of input,
501+
# hence `line` itself will always be overwritten at the end
502+
# of this loop.
503+
last_line = line
497504
line = readline()
498505
except StopIteration:
499506
line = b''
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
648655
(lnum, pos), (lnum, pos+1), line)
649656
pos += 1
650657

658+
# Add an implicit NEWLINE if the input doesn't end in one
659+
if last_line and last_line[-1] not in '\r\n':
660+
yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
651661
for indent in indents[1:]: # pop remaining indent levels
652662
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
653663
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The tokenize module now implicitly emits a NEWLINE token when provided with input that
2+
does not have a trailing new line. This behavior now matches what the C
3+
tokenizer does internally. Contributed by Ammar Askar.

0 commit comments

Comments
 (0)