8000 gh-104169: Refactor tokenizer into lexer and wrappers by lysnikolaou · Pull Request #110684 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-104169: Refactor tokenizer into lexer and wrappers #110684

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 11, 2023
Next Next commit
gh-104169: Refactor tokenizer into lexer and wrappers
* The lexer, which includes the actual lexeme-producing logic, goes into
  the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
  readline), go into the `tokenizer` directory and include logic for
  creating a lexer instance and managing the buffer for different modes.
  • Loading branch information
lysnikolaou committed Oct 11, 2023
commit 99a2b40bd5c8cbe0fe5bf818b326f2ff22aceaab
20 changes: 18 additions & 2 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -347,20 +347,36 @@ PEGEN_OBJS= \
Parser/string_parser.o \
Parser/peg_api.o

TOKENIZER_OBJS= \
Parser/lexer/buffer.o \
Parser/lexer/lexer.o \
Parser/lexer/state.o \
Parser/tokenizer/file_tokenizer.o \
Parser/tokenizer/readline_tokenizer.o \
Parser/tokenizer/string_tokenizer.o \
Parser/tokenizer/utf8_tokenizer.o \
Parser/tokenizer/helpers.o

PEGEN_HEADERS= \
$(srcdir)/Include/internal/pycore_parser.h \
$(srcdir)/Parser/pegen.h \
$(srcdir)/Parser/string_parser.h

TOKENIZER_HEADERS= \
Parser/lexer/buffer.h \
Parser/lexer/lexer.h \
Parser/lexer/state.h \
Parser/tokenizer/tokenizer.h \
Parser/tokenizer/helpers.h

POBJS= \
Parser/token.o \

PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o

PARSER_HEADERS= \
$(PEGEN_HEADERS) \
$(srcdir)/Parser/tokenizer.h
$(TOKENIZER_HEADERS)

##########################################################################
# Python
Expand Down
1 change: 0 additions & 1 deletion Parser/action_helpers.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include <Python.h>

#include "pegen.h"
#include "tokenizer.h"
#include "string_parser.h"
#include "pycore_runtime.h" // _PyRuntime

Expand Down
76 changes: 76 additions & 0 deletions Parser/lexer/buffer.c
8000
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Walk the whole tok_mode stack and convert each saved f-string pointer
   into an offset relative to tok->buf, so the pointers can be rebuilt
   after tok->buf is reallocated (see restore_fstring_buffers). */
void
remember_fstring_buffers(struct tok_state *tok)
{
    for (int i = tok->tok_mode_stack_index; i >= 0; i--) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        m->f_string_start_offset = m->f_string_start - tok->buf;
        m->f_string_multi_line_start_offset =
            m->f_string_multi_line_start - tok->buf;
    }
}

/* Inverse of remember_fstring_buffers: walk the tok_mode stack and turn
   each stored offset back into a pointer into the (reallocated) tok->buf. */
void
restore_fstring_buffers(struct tok_state *tok)
{
    for (int i = tok->tok_mode_stack_index; i >= 0; i--) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        m->f_string_start = tok->buf + m->f_string_start_offset;
        m->f_string_multi_line_start =
            tok->buf + m->f_string_multi_line_start_offset;
    }
}

/* NOTE(review): this comment documents the readline/recoding path
   (tok_readline_recode and tok->decoding_buffer), not the tok_reserve_buf
   function that follows it — consider moving it next to that function.

   Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to tok_reserve_buf.
*/
/* Ensure there are at least `size` free bytes between tok->inp and
   tok->end, growing tok->buf geometrically (by at least half its current
   used size) when necessary.  Returns 1 on success; on allocation failure
   sets tok->done = E_NOMEM and returns 0 (tok->buf is left valid). */
int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    /* Capture the positions that must survive a realloc as offsets. */
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* -1 marks "pointer was NULL" so it can be restored as NULL below.
           NOTE(review): line_start's NULL guard tests tok->start, not
           tok->line_start — presumably they are NULL together; confirm. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        /* Rebase every cached pointer onto the possibly-moved buffer. */
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        restore_fstring_buffers(tok);
    }
    return 1;
}
10 changes: 10 additions & 0 deletions Parser/lexer/buffer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
/* Buffer management helpers shared by the lexer (see Parser/lexer/buffer.c).
   Guard renamed from _LEXER_BUFFER_H_: identifiers starting with an
   underscore followed by an uppercase letter are reserved (C11 7.1.3). */
#ifndef PY_LEXER_BUFFER_H
#define PY_LEXER_BUFFER_H

#include "pyport.h"

/* Forward declaration so this header is self-contained; the full
   definition lives in the lexer's state header. */
struct tok_state;

/* Save/restore f-string pointers across a realloc of tok->buf. */
void remember_fstring_buffers(struct tok_state *tok);
void restore_fstring_buffers(struct tok_state *tok);
/* Grow tok->buf so at least `size` free bytes are available; 1 on success,
   0 on out-of-memory. */
int tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif /* PY_LEXER_BUFFER_H */
Loading
0