Skip to content

Commit 2998647

Browse files
tomlogic authored and dpgeorge committed
py/lexer: Simplify lexer startup by using dummy bytes and next_char().
Lexer startup now consistently uses the EOL processing ("\r" and "\r\n" are converted to "\n") and the EOF processing (a "\n" is ensured before EOF) provided by next_char(). In particular, the lexer can now correctly handle input that starts with CR.
1 parent e711e2d commit 2998647

1 file changed

Lines changed: 7 additions & 22 deletions

File tree

py/lexer.c

Lines changed: 7 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -677,7 +677,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
677677
lex->source_name = src_name;
678678
lex->reader = reader;
679679
lex->line = 1;
680-
lex->column = 1;
680+
lex->column = -2; // account for 3 dummy bytes
681681
lex->emit_dent = 0;
682682
lex->nested_bracket_level = 0;
683683
lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
@@ -688,27 +688,12 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
688688
// store sentinel for first indentation level
689689
lex->indent_level[0] = 0;
690690

691-
// preload characters
692-
lex->chr0 = reader.readbyte(reader.data);
693-
lex->chr1 = reader.readbyte(reader.data);
694-
lex->chr2 = reader.readbyte(reader.data);
695-
696-
// if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
697-
if (lex->chr0 == MP_LEXER_EOF) {
698-
lex->chr0 = '\n';
699-
} else if (lex->chr1 == MP_LEXER_EOF) {
700-
if (lex->chr0 == '\r') {
701-
lex->chr0 = '\n';
702-
} else if (lex->chr0 != '\n') {
703-
lex->chr1 = '\n';
704-
}
705-
} else if (lex->chr2 == MP_LEXER_EOF) {
706-
if (lex->chr1 == '\r') {
707-
lex->chr1 = '\n';
708-
} else if (lex->chr1 != '\n') {
709-
lex->chr2 = '\n';
710-
}
711-
}
691+
// load lexer with start of file, advancing lex->column to 1
692+
// start with dummy bytes and use next_char() for proper EOL/EOF handling
693+
lex->chr0 = lex->chr1 = lex->chr2 = 0;
694+
next_char(lex);
695+
next_char(lex);
696+
next_char(lex);
712697

713698
// preload first token
714699
mp_lexer_to_next(lex);

0 commit comments

Comments
 (0)