11from test import support
22from tokenize import (tokenize , _tokenize , untokenize , NUMBER , NAME , OP ,
33 STRING , ENDMARKER , ENCODING , tok_name , detect_encoding ,
4- open as tokenize_open , Untokenizer , generate_tokens )
4+ open as tokenize_open , Untokenizer , generate_tokens ,
5+ NEWLINE )
56from io import BytesIO , StringIO
67import unittest
78from unittest import TestCase , mock
1112import token
1213
1314
def stringify_tokens_from_source(token_generator, source_string):
    """Render a token stream as a list of one-line textual descriptions.

    Each entry is a fixed-width row such as
    ``    NAME       'if'          (1, 0) (1, 2)``
    which makes expected-token tables in tests easier to write.

    *token_generator* yields 5-tuples ``(type, string, start, end, line)``.
    Iteration stops at the first ENDMARKER, which is not included.  When
    *source_string* does not end with a newline character, a NEWLINE token
    ending on its last line is left out as well.
    """
    num_lines = len(source_string.splitlines())
    # endswith() also copes with an empty source, where indexing [-1]
    # would raise IndexError.
    missing_trailing_nl = not source_string.endswith(('\r', '\n'))

    result = []
    for tok_type, tok_string, start, end, _line in token_generator:
        if tok_type == ENDMARKER:
            break
        # Ignore the new line on the last line if the input lacks one
        if missing_trailing_nl and tok_type == NEWLINE and end[0] == num_lines:
            continue
        result.append(
            f"    {tok_name[tok_type]:10} {tok_string!r:13} {start} {end}")

    return result
34+
1435class TokenizeTest (TestCase ):
1536 # Tests for the tokenize module.
1637
    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER, ENCODING and
    # final NEWLINE are omitted for brevity.
2041
2142 def check_tokenize (self , s , expected ):
2243 # Format the tokens in s in a table format.
23- # The ENDMARKER is omitted.
24- result = []
44+ # The ENDMARKER and final NEWLINE are omitted.
2545 f = BytesIO (s .encode ('utf-8' ))
26- for type , token , start , end , line in tokenize (f .readline ):
27- if type == ENDMARKER :
28- break
29- type = tok_name [type ]
30- result .append (f" { type :10} { token !r:13} { start } { end } " )
46+ result = stringify_tokens_from_source (tokenize (f .readline ), s )
47+
3148 self .assertEqual (result ,
3249 [" ENCODING 'utf-8' (0, 0) (0, 0)" ] +
3350 expected .rstrip ().splitlines ())
3451
52+ def test_implicit_newline (self ):
53+ # Make sure that the tokenizer puts in an implicit NEWLINE
54+ # when the input lacks a trailing new line.
55+ f = BytesIO ("x" .encode ('utf-8' ))
56+ tokens = list (tokenize (f .readline ))
57+ self .assertEqual (tokens [- 2 ].type , NEWLINE )
58+ self .assertEqual (tokens [- 1 ].type , ENDMARKER )
59+
3560 def test_basic (self ):
3661 self .check_tokenize ("1 + 1" , """\
3762 NUMBER '1' (1, 0) (1, 1)
@@ -922,14 +947,9 @@ async def bar(): pass
class GenerateTokensTest(TokenizeTest):
    # Overrides check_tokenize() so that the inherited tests which call it
    # go through generate_tokens() (str input) instead of tokenize().

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format and compare them with
        # the rows in expected.
        # The ENDMARKER and final NEWLINE are omitted.
        f = StringIO(s)
        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
        self.assertEqual(result, expected.rstrip().splitlines())
934954
935955
@@ -1022,8 +1042,8 @@ def readline():
10221042 else :
10231043 return b''
10241044
1025- # skip the initial encoding token and the end token
1026- tokens = list (_tokenize (readline , encoding = 'utf-8' ))[1 :- 1 ]
1045+ # skip the initial encoding token and the end tokens
1046+ tokens = list (_tokenize (readline , encoding = 'utf-8' ))[1 :- 2 ]
10271047 expected_tokens = [(3 , '"ЉЊЈЁЂ"' , (1 , 0 ), (1 , 7 ), '"ЉЊЈЁЂ"' )]
10281048 self .assertEqual (tokens , expected_tokens ,
10291049 "bytes not decoded with encoding" )
@@ -1039,8 +1059,8 @@ def readline():
10391059 else :
10401060 return b''
10411061
1042- # skip the end token
1043- tokens = list (_tokenize (readline , encoding = None ))[:- 1 ]
1062+ # skip the end tokens
1063+ tokens = list (_tokenize (readline , encoding = None ))[:- 2 ]
10441064 expected_tokens = [(3 , '"ЉЊЈЁЂ"' , (1 , 0 ), (1 , 7 ), '"ЉЊЈЁЂ"' )]
10451065 self .assertEqual (tokens , expected_tokens ,
10461066 "string not tokenized when encoding is None" )
@@ -1351,18 +1371,21 @@ def test_oneline_defs(self):
13511371
13521372 # Test that 500 consequent, one-line defs is OK
13531373 toks = list (tokenize (BytesIO (buf .encode ('utf-8' )).readline ))
1354- self .assertEqual (toks [- 2 ].string , 'OK' ) # [-1] is always ENDMARKER
1374+ self .assertEqual (toks [- 3 ].string , 'OK' ) # [-1] is always ENDMARKER
1375+ # [-2] is always NEWLINE
13551376
13561377 def assertExactTypeEqual (self , opstr , * optypes ):
13571378 tokens = list (tokenize (BytesIO (opstr .encode ('utf-8' )).readline ))
13581379 num_optypes = len (optypes )
1359- self .assertEqual (len (tokens ), 2 + num_optypes )
1380+ self .assertEqual (len (tokens ), 3 + num_optypes )
13601381 self .assertEqual (tok_name [tokens [0 ].exact_type ],
13611382 tok_name [ENCODING ])
13621383 for i in range (num_optypes ):
13631384 self .assertEqual (tok_name [tokens [i + 1 ].exact_type ],
13641385 tok_name [optypes [i ]])
13651386 self .assertEqual (tok_name [tokens [1 + num_optypes ].exact_type ],
1387+ tok_name [token .NEWLINE ])
1388+ self .assertEqual (tok_name [tokens [2 + num_optypes ].exact_type ],
13661389 tok_name [token .ENDMARKER ])
13671390
13681391 def test_exact_type (self ):
@@ -1515,7 +1538,7 @@ def test_roundtrip(self):
15151538 self .check_roundtrip ("if x == 1:\n "
15161539 " print(x)\n " )
15171540 self .check_roundtrip ("# This is a comment\n "
1518- "# This also" )
1541+ "# This also\n " )
15191542
15201543 # Some people use different formatting conventions, which makes
15211544 # untokenize a little trickier. Note that this test involves trailing
0 commit comments