From a7cf92c4704f67fb5ce70533aebf6da137c26417 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Tue, 28 Jun 2022 14:20:58 +0100 Subject: [PATCH 1/2] gh-94360: Fix a tokenizer crash when reading encoded files with syntax errors from stdin Signed-off-by: Pablo Galindo --- .../2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst | 2 ++ Parser/pegen_errors.c | 8 ++++---- Parser/tokenizer.c | 10 +++++++++- 3 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst b/Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst new file mode 100644 index 00000000000000..0a74ba38b0ac43 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst @@ -0,0 +1,2 @@ +Fixed a tokenizer crash when reading encoded files with syntax errors from +``stdin`` with non-UTF-8 encoded text. Patch by Pablo Galindo. diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 489699679633e9..5703088443eded 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -259,15 +259,15 @@ get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp; for (int i = 0; i < relative_lineno - 1; i++) { - char *new_line = strchr(cur_line, '\n') + 1; + char *new_line = strchr(cur_line, '\n'); // The assert is here for debug builds but the conditional that // follows is there so in release builds we do not crash at the cost // to report a potentially wrong line. 
- assert(new_line != NULL && new_line <= buf_end); - if (new_line == NULL || new_line > buf_end) { + assert(new_line != NULL && new_line + 1 < buf_end); + if (new_line == NULL || new_line + 1 > buf_end) { break; } - cur_line = new_line; + cur_line = new_line + 1; } char *next_newline; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 952265eb923f9d..1bd884da41998f 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -308,6 +308,10 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; Py_ssize_t line_size = strlen(line); + char last_char = line[line_size > 0 ? line_size - 1: line_size]; + if (last_char != '\n') { + line_size += 1; + } char* new_str = tok->interactive_src_start; new_str = PyMem_Realloc(new_str, current_size + line_size + 1); @@ -321,7 +325,11 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { return -1; } strcpy(new_str + current_size, line); - + if (last_char != '\n') { + /* Last line does not end in \n, fake one */ + new_str[current_size + line_size - 1] = '\n'; + new_str[current_size + line_size] = '\0'; + } tok->interactive_src_start = new_str; tok->interactive_src_end = new_str + current_size + line_size; return 0; From a22ae03afeaeafa7a37ac226e7bd666eb62708da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 5 Jul 2022 17:43:08 +0200 Subject: [PATCH 2/2] nitty nit --- Parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 1bd884da41998f..f2606f17d14630 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -308,7 +308,7 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; Py_ssize_t line_size = strlen(line); - char last_char = line[line_size > 0 ? 
line_size - 1: line_size]; + char last_char = line[line_size > 0 ? line_size - 1 : line_size]; if (last_char != '\n') { line_size += 1; }