From ad130da3f980daed1f1f8d8ca63cffc605749869 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= <mgmacias@google.com>
Date: Sat, 27 May 2023 23:45:28 +0200
Subject: [PATCH 1/9] fix(tokenizer): Include CRLF lines in strings and column
 numbers

---
 Lib/test/test_tokenize.py | 19 ++++++++++++++-----
 Parser/tokenizer.c        | 23 ++++++++---------------
 Python/Python-tokenize.c  |  6 +++++-
 3 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 293592b3fd13db..b76d1e000978d5 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -85,11 +85,20 @@ def test_basic(self):
     DEDENT     ''            (5, 0) (5, 0)
     """)
 
-        self.check_tokenize("foo='bar'\r\n", """\
-    NAME       'foo'         (1, 0) (1, 3)
-    OP         '='           (1, 3) (1, 4)
-    STRING     "'bar'"       (1, 4) (1, 9)
-    NEWLINE    '\\n'          (1, 9) (1, 10)
+        self.check_tokenize("if True:\r\n    # NL\r\n    foo='bar'\r\n\r\n", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NAME       'True'        (1, 3) (1, 7)
+    OP         ':'           (1, 7) (1, 8)
+    NEWLINE    '\\r\\n'        (1, 8) (1, 10)
+    COMMENT    '# NL'        (2, 4) (2, 8)
+    NL         '\\r\\n'        (2, 8) (2, 10)
+    INDENT     '    '        (3, 0) (3, 4)
+    NAME       'foo'         (3, 4) (3, 7)
+    OP         '='           (3, 7) (3, 8)
+    STRING     "\'bar\'"       (3, 8) (3, 13)
+    NEWLINE    '\\r\\n'        (3, 13) (3, 15)
+    NL         '\\r\\n'        (4, 0) (4, 2)
+    DEDENT     ''            (5, 0) (5, 0)
             """)
 
         indent_error_file = b"""\
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index a84c2492b6b17a..da37da0d4ec38d 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -773,7 +773,6 @@ translate_into_utf8(const char* str, const char* enc) {
 
 static char *
 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
-    int skip_next_lf = 0;
     size_t needed_length = strlen(s) + 2, final_length;
     char *buf, *current;
     char c = '\0';
@@ -784,18 +783,8 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
     }
     for (current = buf; *s; s++, current++) {
         c = *s;
-        if (skip_next_lf) {
-            skip_next_lf = 0;
-            if (c == '\n') {
-                c = *++s;
-                if (!c)
-                    break;
-            }
-        }
-        if (c == '\r') {
-            skip_next_lf = 1;
-            c = '\n';
-        }
+        if (!c)
+            break;
         *current = c;
     }
     /* If this is exec input, add a newline to the end of the string if
@@ -1693,7 +1682,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             }
         }
         tok_backup(tok, c);
-        if (c == '#' || c == '\n') {
+        if (c == '#' || c == '\n' || c == '\r') {
             /* Lines with only whitespace and/or comments
                shouldn't affect the indentation and are
                not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1811,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         const char *prefix, *type_start;
         int current_starting_col_offset;
 
-        while (c != EOF && c != '\n') {
+        while (c != EOF && c != '\n' && c != '\r') {
             c = tok_nextc(tok);
         }
 
@@ -2002,6 +1991,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         return MAKE_TOKEN(NAME);
     }
 
+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
+
     /* Newline */
     if (c == '\n') {
         tok->atbol = 1;
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 01c2215366a736..40d21cd6d588b1 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -240,7 +240,11 @@ tokenizeriter_next(tokenizeriterobject *it)
             type = NAME;
         }
         else if (type == NEWLINE) {
-            str = PyUnicode_FromString("\n");
+            if (it->tok->start[0] == '\r') {
+                str = PyUnicode_FromString("\r\n");
+            } else {
+                str = PyUnicode_FromString("\n");
+            }
             end_col_offset++;
         }
     }

From e5293bcc26b74239bf3c1cb23d8315d2c30a287a Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Sat, 27 May 2023 21:50:49 +0000
Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst               | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst
new file mode 100644
index 00000000000000..02d653c2d658eb
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst	
@@ -0,0 +1 @@
+Show CRLF line endings in the ``string`` attribute of both ``NL`` and ``NEWLINE`` tokens generated by :mod:`tokenize`. Patch by Marta Gómez.

From 37f77adc0f865a579e22a8c9db3196ae7330cdab Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 28 May 2023 00:07:55 +0100
Subject: [PATCH 3/9] Only preserve crlf in the Python-tokenize extension

---
 Parser/pegen.c           |  4 ++--
 Parser/tokenizer.c       | 32 ++++++++++++++++++++++----------
 Parser/tokenizer.h       |  4 ++--
 Python/Python-tokenize.c |  2 +-
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/Parser/pegen.c b/Parser/pegen.c
index b031a6f5d440e8..b9894dd0acc546 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
 
     struct tok_state *tok;
     if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
-        tok = _PyTokenizer_FromUTF8(str, exec_input);
+        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
     } else {
-        tok = _PyTokenizer_FromString(str, exec_input);
+        tok = _PyTokenizer_FromString(str, exec_input, 0);
     }
     if (tok == NULL) {
         if (PyErr_Occurred()) {
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index da37da0d4ec38d..6e563851d6ed45 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -772,7 +772,9 @@ translate_into_utf8(const char* str, const char* enc) {
 
 
 static char *
-translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
+translate_newlines(const char *s, int exec_input, int preserve_crlf,
+                   struct tok_state *tok) {
+    int skip_next_lf = 0;
     size_t needed_length = strlen(s) + 2, final_length;
     char *buf, *current;
     char c = '\0';
@@ -783,8 +785,18 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
     }
     for (current = buf; *s; s++, current++) {
         c = *s;
-        if (!c)
-            break;
+        if (skip_next_lf) {
+            skip_next_lf = 0;
+            if (c == '\n') {
+                c = *++s;
+                if (!c)
+                    break;
+            }
+        }
+        if (!preserve_crlf && c == '\r') {
+            skip_next_lf = 1;
+            c = '\n';
+        }
         *current = c;
     }
     /* If this is exec input, add a newline to the end of the string if
@@ -811,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    inside TOK.  */
 
 static char *
-decode_str(const char *input, int single, struct tok_state *tok)
+decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
 {
     PyObject* utf8 = NULL;
     char *str;
     const char *s;
     const char *newl[2] = {NULL, NULL};
     int lineno = 0;
-    tok->input = str = translate_newlines(input, single, tok);
+    tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
     if (str == NULL)
         return NULL;
     tok->enc = NULL;
@@ -870,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
 /* Set up tokenizer for string */
 
 struct tok_state *
-_PyTokenizer_FromString(const char *str, int exec_input)
+_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
 {
     struct tok_state *tok = tok_new();
     char *decoded;
 
     if (tok == NULL)
         return NULL;
-    decoded = decode_str(str, exec_input, tok);
+    decoded = decode_str(str, exec_input, tok, preserve_crlf);
     if (decoded == NULL) {
         _PyTokenizer_Free(tok);
         return NULL;
@@ -891,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
 {
     struct tok_state *tok = tok_new();
     char *translated;
     if (tok == NULL)
         return NULL;
-    tok->input = translated = translate_newlines(str, exec_input, tok);
+    tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
     if (translated == NULL) {
         _PyTokenizer_Free(tok);
         return NULL;
@@ -1039,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) {
     }
     char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
     if (newtok != NULL) {
-        char *translated = translate_newlines(newtok, 0, tok);
+        char *translated = translate_newlines(newtok, 0, 0, tok);
         PyMem_Free(newtok);
         if (translated == NULL) {
             return 0;
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 019f533ef2a260..02749e355da812 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -135,8 +135,8 @@ struct tok_state {
 #endif
 };
 
-extern struct tok_state *_PyTokenizer_FromString(const char *, int);
-extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
+extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                               const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 40d21cd6d588b1..ba37b891a09fd8 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1);
+    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;

From 68e7b36f8b32d141d4ab6e0dc914b3a96167f7d8 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 28 May 2023 12:32:40 +0100
Subject: [PATCH 4/9] Handle \r\n in continuation lines

---
 Lib/test/test_tokenize.py | 11 +++++++++++
 Parser/tokenizer.c        |  3 +++
 2 files changed, 14 insertions(+)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index b76d1e000978d5..ecb2f2d1b86389 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -101,6 +101,17 @@ def test_basic(self):
     DEDENT     ''            (5, 0) (5, 0)
             """)
 
+        self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
+    NAME       'x'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    OP         '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (2, 0) (2, 1)
+    NEWLINE    '\\r\\n'        (2, 1) (2, 3)
+            """)
+
+
+
         indent_error_file = b"""\
 def k(x):
     x += 2
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 6e563851d6ed45..93aca41a84f6a1 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1595,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok)
 static inline int
 tok_continuation_line(struct tok_state *tok) {
     int c = tok_nextc(tok);
+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
     if (c != '\n') {
         tok->done = E_LINECONT;
         return -1;

From 969060f693541a472e905285383bed7f76b433ec Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 28 May 2023 13:13:01 +0100
Subject: [PATCH 5/9] fixup! Handle \r\n in continuation lines

---
 Parser/tokenizer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 93aca41a84f6a1..0f2fef8464606d 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -2413,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             else {
                 end_quote_size = 0;
                 if (c == '\\') {
-                    tok_nextc(tok);  /* skip escaped char */
+                    c = tok_nextc(tok);  /* skip escaped char */
+                    if (c == '\r') {
+                        c = tok_nextc(tok);
+                    }
                 }
             }
         }

From 559487c15052c3fba40d50a546c184734a5ddee6 Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 28 May 2023 13:19:31 +0100
Subject: [PATCH 6/9] fixup! fixup! Handle \r\n in continuation lines

---
 Lib/test/test_tokenize.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ecb2f2d1b86389..3e30dfcb37d46b 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1804,9 +1804,9 @@ def test_random_files(self):
             if support.verbose >= 2:
                 print('tokenize', testfile)
             with open(testfile, 'rb') as f:
-                # with self.subTest(file=testfile):
-                self.check_roundtrip(f)
-                self.check_line_extraction(f)
+                with self.subTest(file=testfile):
+                    self.check_roundtrip(f)
+                    self.check_line_extraction(f)
 
 
     def roundtrip(self, code):

From 363de2890385979c70a784a6971cef26d54e4f0c Mon Sep 17 00:00:00 2001
From: Pablo Galindo <pablogsal@gmail.com>
Date: Sun, 28 May 2023 13:47:51 +0100
Subject: [PATCH 7/9] fixup! fixup! fixup! Handle \r\n in continuation lines

---
 Lib/test/test_tokenize.py | 10 ++++++++++
 Parser/tokenizer.c        |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 3e30dfcb37d46b..6c8f3df091ae09 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -2104,6 +2104,10 @@ def test_string(self):
 b\
 c"""', """\
     STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
+    """)
+
+        self.check_tokenize(r'"hola\\\r\ndfgf"', """\
+    STRING     \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
     """)
 
         self.check_tokenize('f"abc"', """\
@@ -2140,6 +2144,12 @@ def test_string(self):
     FSTRING_START 'Rf"'         (1, 0) (1, 3)
     FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 3) (2, 3)
     FSTRING_END '"'           (2, 3) (2, 4)
+    """)
+
+        self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
+    FSTRING_START \'f"\'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
+    FSTRING_END \'"\'           (1, 16) (1, 17)
     """)
 
     def test_function(self):
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 0f2fef8464606d..59c817293fbfcd 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -2707,6 +2707,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
             return MAKE_TOKEN(FSTRING_MIDDLE);
         } else if (c == '\\') {
             int peek = tok_nextc(tok);
+            if (peek == '\r') {
+                peek = tok_nextc(tok);
+            }
             // Special case when the backslash is right before a curly
             // brace. We have to restore and return the control back
             // to the loop for the next iteration.

From 67b3a9ccd8608d7c447f4c94c88fa6d5bd5fe09c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= <mgmacias@google.com>
Date: Sun, 28 May 2023 14:55:01 +0200
Subject: [PATCH 8/9] Update Lib/test/test_tokenize.py

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
---
 Lib/test/test_tokenize.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 6c8f3df091ae09..cd11dddd0fe51a 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -110,8 +110,6 @@ def test_basic(self):
     NEWLINE    '\\r\\n'        (2, 1) (2, 3)
             """)
 
-
-
         indent_error_file = b"""\
 def k(x):
     x += 2

From d3d4ff5fd74338b143c04befb4e7214f05b3ff55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= <mgmacias@google.com>
Date: Sun, 28 May 2023 15:33:22 +0200
Subject: [PATCH 9/9] Prevent leak

---
 Python/Python-tokenize.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index ba37b891a09fd8..4eced66b617708 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -240,6 +240,7 @@ tokenizeriter_next(tokenizeriterobject *it)
             type = NAME;
         }
         else if (type == NEWLINE) {
+            Py_DECREF(str);
             if (it->tok->start[0] == '\r') {
                 str = PyUnicode_FromString("\r\n");
             } else {