fix(tokenizer): Preserve CRLF line endings in NEWLINE/NL token strings and column numbers

python · pablogsal · May 28, 2023 · May 27, 2023 · May 27, 2023 · May 27, 2023
commit ad130da3f980daed1f1f8d8ca63cffc605749869
@@ -85,11 +85,20 @@ def test_basic(self):
     DEDENT     ''            (5, 0) (5, 0)
     """)
 
-        self.check_tokenize("foo='bar'\r\n", """\
-    NAME       'foo'         (1, 0) (1, 3)
-    OP         '='           (1, 3) (1, 4)
-    STRING     "'bar'"       (1, 4) (1, 9)
-    NEWLINE    '\\n'          (1, 9) (1, 10)
+        self.check_tokenize("if True:\r\n    # NL\r\n    foo='bar'\r\n\r\n", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NAME       'True'        (1, 3) (1, 7)
+    OP         ':'           (1, 7) (1, 8)
+    NEWLINE    '\\r\\n'        (1, 8) (1, 10)
+    COMMENT    '# NL'        (2, 4) (2, 8)
+    NL         '\\r\\n'        (2, 8) (2, 10)
+    INDENT     '    '        (3, 0) (3, 4)
+    NAME       'foo'         (3, 4) (3, 7)
+    OP         '='           (3, 7) (3, 8)
+    STRING     "\'bar\'"       (3, 8) (3, 13)
+    NEWLINE    '\\r\\n'        (3, 13) (3, 15)
+    NL         '\\r\\n'        (4, 0) (4, 2)
+    DEDENT     ''            (5, 0) (5, 0)
             """)
 
         indent_error_file = b"""\

@@ -773,7 +773,6 @@ translate_into_utf8(const char* str, const char* enc) {
 
 static char *
 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
-    int skip_next_lf = 0;
     size_t needed_length = strlen(s) + 2, final_length;
     char *buf, *current;
     char c = '\0';
@@ -784,18 +783,8 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
     }
     for (current = buf; *s; s++, current++) {
         c = *s;
-        if (skip_next_lf) {
-            skip_next_lf = 0;
-            if (c == '\n') {
-                c = *++s;
-                if (!c)
-                    break;
-            }
-        }
-        if (c == '\r') {
-            skip_next_lf = 1;
-            c = '\n';
-        }
+        if (!c)
+            break;
         *current = c;
     }
     /* If this is exec input, add a newline to the end of the string if
@@ -1693,7 +1682,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             }
         }
         tok_backup(tok, c);
-        if (c == '#' || c == '\n') {
+        if (c == '#' || c == '\n' || c == '\r') {
             /* Lines with only whitespace and/or comments
                shouldn't affect the indentation and are
                not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1811,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         const char *prefix, *type_start;
         int current_starting_col_offset;
 
-        while (c != EOF && c != '\n') {
+        while (c != EOF && c != '\n' && c != '\r') {
             c = tok_nextc(tok);
         }
 
@@ -2002,6 +1991,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         return MAKE_TOKEN(NAME);
     }
 
+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
+
     /* Newline */
     if (c == '\n') {
         tok->atbol = 1;

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
@@ -240,7 +240,11 @@ tokenizeriter_next(tokenizeriterobject *it)
             type = NAME;
         }
         else if (type == NEWLINE) {
-            str = PyUnicode_FromString("\n");
+            if (it->tok->start[0] == '\r') {
+                str = PyUnicode_FromString("\r\n");
+            } else {
+                str = PyUnicode_FromString("\n");
+            }
             end_col_offset++;
         }
     }