Skip to content
Next commit
fix(tokenizer): Include CRLF lines in strings and column numbers
  • Loading branch information
mgmacias95 committed May 27, 2023
commit ad130da3f980daed1f1f8d8ca63cffc605749869
19 changes: 14 additions & 5 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,20 @@ def test_basic(self):
DEDENT '' (5, 0) (5, 0)
""")

self.check_tokenize("foo='bar'\r\n", """\
NAME 'foo' (1, 0) (1, 3)
OP '=' (1, 3) (1, 4)
STRING "'bar'" (1, 4) (1, 9)
NEWLINE '\\n' (1, 9) (1, 10)
self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\
NAME 'if' (1, 0) (1, 2)
NAME 'True' (1, 3) (1, 7)
OP ':' (1, 7) (1, 8)
NEWLINE '\\r\\n' (1, 8) (1, 10)
COMMENT '# NL' (2, 4) (2, 8)
NL '\\r\\n' (2, 8) (2, 10)
INDENT ' ' (3, 0) (3, 4)
NAME 'foo' (3, 4) (3, 7)
OP '=' (3, 7) (3, 8)
STRING "\'bar\'" (3, 8) (3, 13)
NEWLINE '\\r\\n' (3, 13) (3, 15)
NL '\\r\\n' (4, 0) (4, 2)
DEDENT '' (5, 0) (5, 0)
""")

indent_error_file = b"""\
Expand Down
23 changes: 8 additions & 15 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,6 @@ translate_into_utf8(const char* str, const char* enc) {

static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
int skip_next_lf = 0;
size_t needed_length = strlen(s) + 2, final_length;
char *buf, *current;
char c = '\0';
Expand All @@ -784,18 +783,8 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
}
for (current = buf; *s; s++, current++) {
c = *s;
if (skip_next_lf) {
skip_next_lf = 0;
if (c == '\n') {
c = *++s;
if (!c)
break;
}
}
if (c == '\r') {
skip_next_lf = 1;
c = '\n';
}
if (!c)
break;
*current = c;
}
/* If this is exec input, add a newline to the end of the string if
Expand Down Expand Up @@ -1693,7 +1682,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
tok_backup(tok, c);
if (c == '#' || c == '\n') {
if (c == '#' || c == '\n' || c == '\r') {
/* Lines with only whitespace and/or comments
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
Expand Down Expand Up @@ -1822,7 +1811,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
const char *prefix, *type_start;
int current_starting_col_offset;

while (c != EOF && c != '\n') {
while (c != EOF && c != '\n' && c != '\r') {
c = tok_nextc(tok);
}

Expand Down Expand Up @@ -2002,6 +1991,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(NAME);
}

if (c == '\r') {
c = tok_nextc(tok);
}

/* Newline */
if (c == '\n') {
tok->atbol = 1;
Expand Down
6 changes: 5 additions & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,11 @@ tokenizeriter_next(tokenizeriterobject *it)
type = NAME;
}
else if (type == NEWLINE) {
str = PyUnicode_FromString("\n");
if (it->tok->start[0] == '\r') {
str = PyUnicode_FromString("\r\n");
Comment thread
mgmacias95 marked this conversation as resolved.
} else {
str = PyUnicode_FromString("\n");
}
end_col_offset++;
}
}
Expand Down