From ad130da3f980daed1f1f8d8ca63cffc605749869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sat, 27 May 2023 23:45:28 +0200 Subject: [PATCH 1/9] fix(tokenizer): Include CRLF lines in strings and column numbers --- Lib/test/test_tokenize.py | 19 ++++++++++++++----- Parser/tokenizer.c | 23 ++++++++--------------- Python/Python-tokenize.c | 6 +++++- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 293592b3fd13db..b76d1e000978d5 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -85,11 +85,20 @@ def test_basic(self): DEDENT '' (5, 0) (5, 0) """) - self.check_tokenize("foo='bar'\r\n", """\ - NAME 'foo' (1, 0) (1, 3) - OP '=' (1, 3) (1, 4) - STRING "'bar'" (1, 4) (1, 9) - NEWLINE '\\n' (1, 9) (1, 10) + self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\ + NAME 'if' (1, 0) (1, 2) + NAME 'True' (1, 3) (1, 7) + OP ':' (1, 7) (1, 8) + NEWLINE '\\r\\n' (1, 8) (1, 10) + COMMENT '# NL' (2, 4) (2, 8) + NL '\\r\\n' (2, 8) (2, 10) + INDENT ' ' (3, 0) (3, 4) + NAME 'foo' (3, 4) (3, 7) + OP '=' (3, 7) (3, 8) + STRING "\'bar\'" (3, 8) (3, 13) + NEWLINE '\\r\\n' (3, 13) (3, 15) + NL '\\r\\n' (4, 0) (4, 2) + DEDENT '' (5, 0) (5, 0) """) indent_error_file = b"""\ diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index a84c2492b6b17a..da37da0d4ec38d 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -773,7 +773,6 @@ translate_into_utf8(const char* str, const char* enc) { static char * translate_newlines(const char *s, int exec_input, struct tok_state *tok) { - int skip_next_lf = 0; size_t needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; @@ -784,18 +783,8 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { } for (current = buf; *s; s++, current++) { c = *s; - if (skip_next_lf) { - skip_next_lf = 0; - if (c == '\n') { - c = *++s; - if (!c) - break; - } - } - if (c == '\r') { - skip_next_lf = 1; - c = '\n'; - } + if (!c) + break; *current = c; } /* If this is exec input, add a newline to the end of the string if @@ -1693,7 +1682,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } tok_backup(tok, c); - if (c == '#' || c == '\n') { + if (c == '#' || c == '\n' || c == '\r') { /* Lines with only whitespace and/or comments shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, @@ -1822,7 +1811,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t const char *prefix, *type_start; int current_starting_col_offset; - while (c != EOF && c != '\n') { + while (c != EOF && c != '\n' && c != '\r') { c = tok_nextc(tok); } @@ -2002,6 +1991,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(NAME); } + if (c == '\r') { + c = tok_nextc(tok); + } + /* Newline */ if (c == '\n') { tok->atbol = 1; diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 01c2215366a736..40d21cd6d588b1 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -240,7 +240,11 @@ tokenizeriter_next(tokenizeriterobject *it) type = NAME; } else if (type == NEWLINE) { - str = PyUnicode_FromString("\n"); + if (it->tok->start[0] == '\r') { + str = PyUnicode_FromString("\r\n"); + } else { + str = PyUnicode_FromString("\n"); + } end_col_offset++; } } From e5293bcc26b74239bf3c1cb23d8315d2c30a287a Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 27 May 2023 21:50:49 +0000 Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst new file mode 100644 index 00000000000000..02d653c2d658eb --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst @@ -0,0 +1 @@ +Show CRLF lines in the tokenize string attribute in both NL and NEWLINE tokens. Patch by Marta Gómez. From 37f77adc0f865a579e22a8c9db3196ae7330cdab Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 28 May 2023 00:07:55 +0100 Subject: [PATCH 3/9] Only preserve crlf in the Python-tokenize extension --- Parser/pegen.c | 4 ++-- Parser/tokenizer.c | 32 ++++++++++++++++++++++---------- Parser/tokenizer.h | 4 ++-- Python/Python-tokenize.c | 2 +- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index b031a6f5d440e8..b9894dd0acc546 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen struct tok_state *tok; if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) { - tok = _PyTokenizer_FromUTF8(str, exec_input); + tok = _PyTokenizer_FromUTF8(str, exec_input, 0); } else { - tok = _PyTokenizer_FromString(str, exec_input); + tok = _PyTokenizer_FromString(str, exec_input, 0); } if (tok == NULL) { if (PyErr_Occurred()) { diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index da37da0d4ec38d..6e563851d6ed45 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -772,7 +772,9 @@ translate_into_utf8(const char* str, const char* enc) { static char * -translate_newlines(const char *s, int exec_input, struct tok_state *tok) { +translate_newlines(const char *s, int exec_input, int preserve_crlf, + struct tok_state *tok) { + int skip_next_lf = 0; size_t needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; @@ -783,8 +785,18 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { } for (current = buf; *s; s++, current++) { c = *s; - if (!c) - break; + if (skip_next_lf) { + skip_next_lf = 0; + if (c == '\n') { + c = *++s; + if (!c) + break; + } + } + if (!preserve_crlf && c == '\r') { + skip_next_lf = 1; + c = '\n'; + } *current = c; } /* If this is exec input, add a newline to the end of the string if @@ -811,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { inside TOK. */ static char * -decode_str(const char *input, int single, struct tok_state *tok) +decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf) { PyObject* utf8 = NULL; char *str; const char *s; const char *newl[2] = {NULL, NULL}; int lineno = 0; - tok->input = str = translate_newlines(input, single, tok); + tok->input = str = translate_newlines(input, single, preserve_crlf, tok); if (str == NULL) return NULL; tok->enc = NULL; @@ -870,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok) /* Set up tokenizer for string */ struct tok_state * -_PyTokenizer_FromString(const char *str, int exec_input) +_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) { struct tok_state *tok = tok_new(); char *decoded; if (tok == NULL) return NULL; - decoded = decode_str(str, exec_input, tok); + decoded = decode_str(str, exec_input, tok, preserve_crlf); if (decoded == NULL) { _PyTokenizer_Free(tok); return NULL; @@ -891,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input) /* Set up tokenizer for UTF-8 string */ struct tok_state * -_PyTokenizer_FromUTF8(const char *str, int exec_input) +_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf) { struct tok_state *tok = tok_new(); char *translated; if (tok == NULL) return NULL; - tok->input = translated = translate_newlines(str, exec_input, tok); + tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok); if (translated == NULL) { _PyTokenizer_Free(tok); return NULL; @@ -1039,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) { } char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt); if (newtok != NULL) { - char *translated = translate_newlines(newtok, 0, tok); + char *translated = translate_newlines(newtok, 0, 0, tok); PyMem_Free(newtok); if (translated == NULL) { return 0; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 019f533ef2a260..02749e355da812 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -135,8 +135,8 @@ struct tok_state { #endif }; -extern struct tok_state *_PyTokenizer_FromString(const char *, int); -extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int); +extern struct tok_state *_PyTokenizer_FromString(const char *, int, int); +extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int); extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void _PyTokenizer_Free(struct tok_state *); diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 40d21cd6d588b1..ba37b891a09fd8 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source, if (filename == NULL) { return NULL; } - self->tok = _PyTokenizer_FromUTF8(source, 1); + self->tok = _PyTokenizer_FromUTF8(source, 1, 1); if (self->tok == NULL) { Py_DECREF(filename); return NULL; From 68e7b36f8b32d141d4ab6e0dc914b3a96167f7d8 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 28 May 2023 12:32:40 +0100 Subject: [PATCH 4/9] Handle \r\n in continuation lines --- Lib/test/test_tokenize.py | 11 +++++++++++ Parser/tokenizer.c | 3 +++ 2 files changed, 14 insertions(+) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index b76d1e000978d5..ecb2f2d1b86389 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -101,6 +101,17 @@ def test_basic(self): DEDENT '' (5, 0) (5, 0) """) + self.check_tokenize("x = 1 + \\\r\n1\r\n", """\ + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + OP '+' (1, 6) (1, 7) + NUMBER '1' (2, 0) (2, 1) + NEWLINE '\\r\\n' (2, 1) (2, 3) + """) + + + indent_error_file = b"""\ def k(x): x += 2 diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6e563851d6ed45..93aca41a84f6a1 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1595,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok) static inline int tok_continuation_line(struct tok_state *tok) { int c = tok_nextc(tok); + if (c == '\r') { + c = tok_nextc(tok); + } if (c != '\n') { tok->done = E_LINECONT; return -1; From 969060f693541a472e905285383bed7f76b433ec Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 28 May 2023 13:13:01 +0100 Subject: [PATCH 5/9] fixup! Handle \r\n in continuation lines --- Parser/tokenizer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 93aca41a84f6a1..0f2fef8464606d 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2413,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else { end_quote_size = 0; if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + c = tok_nextc(tok); /* skip escaped char */ + if (c == '\r') { + c = tok_nextc(tok); + } } } } From 559487c15052c3fba40d50a546c184734a5ddee6 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 28 May 2023 13:19:31 +0100 Subject: [PATCH 6/9] fixup! fixup! Handle \r\n in continuation lines --- Lib/test/test_tokenize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index ecb2f2d1b86389..3e30dfcb37d46b 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1804,9 +1804,9 @@ def test_random_files(self): if support.verbose >= 2: print('tokenize', testfile) with open(testfile, 'rb') as f: - # with self.subTest(file=testfile): - self.check_roundtrip(f) - self.check_line_extraction(f) + with self.subTest(file=testfile): + self.check_roundtrip(f) + self.check_line_extraction(f) def roundtrip(self, code): From 363de2890385979c70a784a6971cef26d54e4f0c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 28 May 2023 13:47:51 +0100 Subject: [PATCH 7/9] fixup! fixup! fixup! Handle \r\n in continuation lines --- Lib/test/test_tokenize.py | 10 ++++++++++ Parser/tokenizer.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 3e30dfcb37d46b..6c8f3df091ae09 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -2104,6 +2104,10 @@ def test_string(self): b\ c"""', """\ STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) + """) + + self.check_tokenize(r'"hola\\\r\ndfgf"', """\ + STRING \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16) """) self.check_tokenize('f"abc"', """\ @@ -2140,6 +2144,12 @@ def test_string(self): FSTRING_START 'Rf"' (1, 0) (1, 3) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) FSTRING_END '"' (2, 3) (2, 4) + """) + + self.check_tokenize(r'f"hola\\\r\ndfgf"', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16) + FSTRING_END \'"\' (1, 16) (1, 17) """) def test_function(self): diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0f2fef8464606d..59c817293fbfcd 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2707,6 +2707,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct return MAKE_TOKEN(FSTRING_MIDDLE); } else if (c == '\\') { int peek = tok_nextc(tok); + if (peek == '\r') { + peek = tok_nextc(tok); + } // Special case when the backslash is right before a curly // brace. We have to restore and return the control back // to the loop for the next iteration. From 67b3a9ccd8608d7c447f4c94c88fa6d5bd5fe09c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sun, 28 May 2023 14:55:01 +0200 Subject: [PATCH 8/9] Update Lib/test/test_tokenize.py Co-authored-by: Pablo Galindo Salgado --- Lib/test/test_tokenize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 6c8f3df091ae09..cd11dddd0fe51a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -110,8 +110,6 @@ def test_basic(self): NEWLINE '\\r\\n' (2, 1) (2, 3) """) - - indent_error_file = b"""\ def k(x): x += 2 From d3d4ff5fd74338b143c04befb4e7214f05b3ff55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sun, 28 May 2023 15:33:22 +0200 Subject: [PATCH 9/9] Prevent leak --- Python/Python-tokenize.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index ba37b891a09fd8..4eced66b617708 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -240,6 +240,7 @@ tokenizeriter_next(tokenizeriterobject *it) type = NAME; } else if (type == NEWLINE) { + Py_DECREF(str); if (it->tok->start[0] == '\r') { str = PyUnicode_FromString("\r\n"); } else {