gh-105042: Disable unmatched parens syntax error in python tokenize

python · pablogsal · May 30, 2023 · May 29, 2023 · May 30, 2023 · May 30, 2023
commit 1c38078e132a20c66d9d17a7d613b4790a6fabf0
diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h
@@ -463,6 +463,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(id)
         STRUCT_FOR_ID(ident)
         STRUCT_FOR_ID(ignore)
+        STRUCT_FOR_ID(ignore_unmatched_parens)
         STRUCT_FOR_ID(imag)
         STRUCT_FOR_ID(importlib)
         STRUCT_FOR_ID(in_fd)

diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
@@ -447,7 +447,9 @@ def tokenize(readline):
 
 def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+    for token in _generate_tokens_from_c_tokenizer(source,
+                                                   extra_tokens=True,
+                                                   ignore_unmatched_parens=True):
         yield token
 
 def generate_tokens(readline):
@@ -531,10 +533,12 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, extra_tokens=False, ignore_unmatched_parens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    for info in c_tokenizer.TokenizerIter(source,
+                                          extra_tokens=extra_tokens,
+                                          ignore_unmatched_parens=ignore_unmatched_parens):
         yield TokenInfo._make(info)
 
 

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
@@ -113,6 +113,7 @@ tok_new(void)
     tok->report_warnings = 1;
     tok->tok_extra_tokens = 0;
     tok->comment_newline = 0;
+    tok->ignore_unmatched_parens = 0;
     tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
     tok->tok_mode_stack_index = 0;
     tok->tok_report_warnings = 1;
@@ -2496,41 +2497,42 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     case ')':
     case ']':
     case '}':
-        if (!tok->level) {
+        if (!tok->ignore_unmatched_parens && !tok->level) {
             if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
                 return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
             }
             return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
         }
-        tok->level--;
-        int opening = tok->parenstack[tok->level];
-        if (!((opening == '(' && c == ')') ||
-              (opening == '[' && c == ']') ||
-              (opening == '{' && c == '}')))
-        {
-            /* If the opening bracket belongs to an f-string's expression
-               part (e.g. f"{)}") and the closing bracket is an arbitrary
-               nested expression, then instead of matching a different
-               syntactical construct with it; we'll throw an unmatched
-               parentheses error. */
-            if (INSIDE_FSTRING(tok) && opening == '{') {
-                assert(current_tok->curly_bracket_depth >= 0);
-                int previous_bracket = current_tok->curly_bracket_depth - 1;
-                if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
-                    return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
+        if (tok->level > 0) {
+            tok->level--;
+            int opening = tok->parenstack[tok->level];
+            if (!tok->ignore_unmatched_parens && !((opening == '(' && c == ')') ||
+                                                (opening == '[' && c == ']') ||
+                                                (opening == '{' && c == '}'))) {
+                /* If the opening bracket belongs to an f-string's expression
+                part (e.g. f"{)}") and the closing bracket is an arbitrary
+                nested expression, then instead of matching a different
+                syntactical construct with it; we'll throw an unmatched
+                parentheses error. */
+                if (INSIDE_FSTRING(tok) && opening == '{') {
+                    assert(current_tok->curly_bracket_depth >= 0);
+                    int previous_bracket = current_tok->curly_bracket_depth - 1;
+                    if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
+                        return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
+                    }
+                }
+                if (tok->parenlinenostack[tok->level] != tok->lineno) {
+                    return MAKE_TOKEN(syntaxerror(tok,
+                            "closing parenthesis '%c' does not match "
+                            "opening parenthesis '%c' on line %d",
+                            c, opening, tok->parenlinenostack[tok->level]));
+                }
+                else {
+                    return MAKE_TOKEN(syntaxerror(tok,
+                            "closing parenthesis '%c' does not match "
+                            "opening parenthesis '%c'",
+                            c, opening));
                 }
-            }
-            if (tok->parenlinenostack[tok->level] != tok->lineno) {
-                return MAKE_TOKEN(syntaxerror(tok,
-                        "closing parenthesis '%c' does not match "
-                        "opening parenthesis '%c' on line %d",
-                        c, opening, tok->parenlinenostack[tok->level]));
-            }
-            else {
-                return MAKE_TOKEN(syntaxerror(tok,
-                        "closing parenthesis '%c' does not match "
-                        "opening parenthesis '%c'",
-                        c, opening));
             }
         }
 

diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
@@ -130,6 +130,7 @@ struct tok_state {
     int tok_report_warnings;
     int tok_extra_tokens;
     int comment_newline;
+    int ignore_unmatched_parens;
 #ifdef Py_DEBUG
     int debug;
 #endif

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
@@ -40,12 +40,13 @@ _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
     source: str
     *
     extra_tokens: bool
+    ignore_unmatched_parens: bool
 [clinic start generated code]*/
 
 static PyObject *
 tokenizeriter_new_impl(PyTypeObject *type, const char *source,
-                       int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+                       int extra_tokens, int ignore_unmatched_parens)
+/*[clinic end generated code: output=5437e7bbc30de3f4 input=7f6b22d7c235ffd7]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
@@ -64,6 +65,12 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    if (ignore_unmatched_parens) {
+        self->tok->ignore_unmatched_parens = 1;
+    }
+    if (ignore_unmatched_parens) {
+        self->tok->ignore_unmatched_parens = 1;
+    }
     self->done = 0;
     return (PyObject *)self;
 }
@@ -82,7 +89,7 @@ _tokenizer_error(struct tok_state *tok)
             msg = "invalid token";
             break;
         case E_EOF:
-            if (tok->level) {
+            if (tok->level > 0) {
                     PyErr_Format(PyExc_SyntaxError,
                                  "parenthesis '%c' was never closed",
                                 tok->parenstack[tok->level-1]);

diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h