Skip to content

Commit 32bade1

Browse files
committed
py: Convert CR to LF and CR LF to LF in lexer.
The only noticeable difference is how newlines are encoded in triple-quoted strings. The behaviour now matches CPython 3.
1 parent 3da677e commit 32bade1

3 files changed

Lines changed: 33 additions & 23 deletions

File tree

py/lexer.c

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ STATIC bool is_end(mp_lexer_t *lex) {
5555
}
5656

5757
STATIC bool is_physical_newline(mp_lexer_t *lex) {
58-
return lex->chr0 == '\n' || lex->chr0 == '\r';
58+
return lex->chr0 == '\n';
5959
}
6060

6161
STATIC bool is_char(mp_lexer_t *lex, char c) {
@@ -123,20 +123,10 @@ STATIC void next_char(mp_lexer_t *lex) {
123123
return;
124124
}
125125

126-
mp_uint_t advance = 1;
127-
128126
if (lex->chr0 == '\n') {
129-
// LF is a new line
130-
++lex->line;
131-
lex->column = 1;
132-
} else if (lex->chr0 == '\r') {
133-
// CR is a new line
127+
// a new line
134128
++lex->line;
135129
lex->column = 1;
136-
if (lex->chr1 == '\n') {
137-
// CR LF is a single new line
138-
advance = 2;
139-
}
140130
} else if (lex->chr0 == '\t') {
141131
// a tab
142132
lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
@@ -145,15 +135,26 @@ STATIC void next_char(mp_lexer_t *lex) {
145135
++lex->column;
146136
}
147137

148-
for (; advance > 0; advance--) {
149-
lex->chr0 = lex->chr1;
150-
lex->chr1 = lex->chr2;
151-
lex->chr2 = lex->stream_next_byte(lex->stream_data);
152-
if (lex->chr2 == MP_LEXER_EOF) {
153-
// EOF
154-
if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
155-
lex->chr2 = '\n'; // insert newline at end of file
156-
}
138+
lex->chr0 = lex->chr1;
139+
lex->chr1 = lex->chr2;
140+
lex->chr2 = lex->stream_next_byte(lex->stream_data);
141+
142+
if (lex->chr0 == '\r') {
143+
// CR is a new line, converted to LF
144+
lex->chr0 = '\n';
145+
if (lex->chr1 == '\n') {
146+
// CR LF is a single new line
147+
lex->chr1 = lex->chr2;
148+
lex->chr2 = lex->stream_next_byte(lex->stream_data);
149+
}
150+
}
151+
152+
if (lex->chr2 == MP_LEXER_EOF) {
153+
// EOF, check if we need to insert a newline at end of file
154+
if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
155+
// if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
156+
// otherwise it just inserts a LF
157+
lex->chr2 = '\n';
157158
}
158159
}
159160
}
@@ -721,11 +722,15 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
721722
if (lex->chr0 == MP_LEXER_EOF) {
722723
lex->chr0 = '\n';
723724
} else if (lex->chr1 == MP_LEXER_EOF) {
724-
if (lex->chr0 != '\n' && lex->chr0 != '\r') {
725+
if (lex->chr0 == '\r') {
726+
lex->chr0 = '\n';
727+
} else if (lex->chr0 != '\n') {
725728
lex->chr1 = '\n';
726729
}
727730
} else if (lex->chr2 == MP_LEXER_EOF) {
728-
if (lex->chr1 != '\n' && lex->chr1 != '\r') {
731+
if (lex->chr1 == '\r') {
732+
lex->chr1 = '\n';
733+
} else if (lex->chr1 != '\n') {
729734
lex->chr2 = '\n';
730735
}
731736
}

tests/basics/string_cr_conversion.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# this file has CR line endings to test lexer's conversion of them to LF
# in triple quoted strings
print(repr("""abc
def"""))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# this file has CRLF line endings to test lexer's conversion of them to LF
2+
# in triple quoted strings
3+
print(repr("""abc
4+
def"""))

0 commit comments

Comments
 (0)