8000 gh-104169: Refactor tokenizer into lexer and wrappers by lysnikolaou · Pull Request #110684 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-104169: Refactor tokenizer into lexer and wrappers #110684

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 11, 2023
Next Next commit
gh-104169: Refactor tokenizer into lexer and wrappers
* The lexer, which includes the actual lexeme-producing logic, goes into
  the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
  readline), go into the `tokenizer` directory and include logic for
  creating a lexer instance and managing the buffer for different modes.
  • Loading branch information
lysnikolaou committed Oct 11, 2023
commit 99a2b40bd5c8cbe0fe5bf818b326f2ff22aceaab
20 changes: 18 additions & 2 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -347,20 +347,36 @@ PEGEN_OBJS= \
Parser/string_parser.o \
Parser/peg_api.o

TOKENIZER_OBJS= \
Parser/lexer/buffer.o \
Parser/lexer/lexer.o \
Parser/lexer/state.o \
Parser/tokenizer/file_tokenizer.o \
Parser/tokenizer/readline_tokenizer.o \
Parser/tokenizer/string_tokenizer.o \
Parser/tokenizer/utf8_tokenizer.o \
Parser/tokenizer/helpers.o

PEGEN_HEADERS= \
$(srcdir)/Include/internal/pycore_parser.h \
$(srcdir)/Parser/pegen.h \
$(srcdir)/Parser/string_parser.h

TOKENIZER_HEADERS= \
Parser/lexer/buffer.h \
Parser/lexer/lexer.h \
Parser/lexer/state.h \
Parser/tokenizer/tokenizer.h \
Parser/tokenizer/helpers.h

POBJS= \
Parser/token.o \

PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o

PARSER_HEADERS= \
$(PEGEN_HEADERS) \
$(srcdir)/Parser/tokenizer.h
$(TOKENIZER_HEADERS)

##########################################################################
# Python
Expand Down
1 change: 0 additions & 1 deletion Parser/action_helpers.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include <Python.h>

#include "pegen.h"
#include "tokenizer.h"
#include "string_parser.h"
#include "pycore_runtime.h" // _PyRuntime

Expand Down
76 changes: 76 additions & 0 deletions Parser/lexer/buffer.c
8000
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Walk the whole tok_mode stack and convert each saved f-string pointer
   into an offset relative to tok->buf, so the pointers can be rebuilt
   after tok->buf is reallocated (see restore_fstring_buffers). */
void
remember_fstring_buffers(struct tok_state *tok)
{
    for (int i = tok->tok_mode_stack_index; i >= 0; i--) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        m->f_string_start_offset = m->f_string_start - tok->buf;
        m->f_string_multi_line_start_offset =
            m->f_string_multi_line_start - tok->buf;
    }
}

/* Inverse of remember_fstring_buffers: walk the tok_mode stack and turn
   each stored offset back into a pointer into the (reallocated) tok->buf. */
void
restore_fstring_buffers(struct tok_state *tok)
{
    for (int i = tok->tok_mode_stack_index; i >= 0; i--) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        m->f_string_start = tok->buf + m->f_string_start_offset;
        m->f_string_multi_line_start =
            tok->buf + m->f_string_multi_line_start_offset;
    }
}

/* NOTE(review): this comment documents the readline/recoding path
   (tok_readline_recode and tok->decoding_buffer), not the tok_reserve_buf
   function that follows it — consider moving it next to that function.

   Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to tok_reserve_buf.
*/
/* Ensure there are at least `size` free bytes between tok->inp and
   tok->end, growing tok->buf geometrically (by at least half its current
   used size) when necessary.  Returns 1 on success; on allocation failure
   sets tok->done = E_NOMEM and returns 0 (tok->buf is left valid). */
int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    /* Capture the positions that must survive a realloc as offsets. */
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* -1 marks "pointer was NULL" so it can be restored as NULL below.
           NOTE(review): line_start's NULL guard tests tok->start, not
           tok->line_start — presumably they are NULL together; confirm. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        /* Rebase every cached pointer onto the possibly-moved buffer. */
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        restore_fstring_buffers(tok);
    }
    return 1;
}
10 changes: 10 additions & 0 deletions Parser/lexer/buffer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
/* Buffer management helpers shared by the lexer (see Parser/lexer/buffer.c).
   Guard renamed from _LEXER_BUFFER_H_: identifiers starting with an
   underscore followed by an uppercase letter are reserved (C11 7.1.3). */
#ifndef PY_LEXER_BUFFER_H
#define PY_LEXER_BUFFER_H

#include "pyport.h"

/* Forward declaration so this header is self-contained; the full
   definition lives in the lexer's state header. */
struct tok_state;

/* Save/restore f-string pointers across a realloc of tok->buf. */
void remember_fstring_buffers(struct tok_state *tok);
void restore_fstring_buffers(struct tok_state *tok);
/* Grow tok->buf so at least `size` free bytes are available; 1 on success,
   0 on out-of-memory. */
int tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif /* PY_LEXER_BUFFER_H */
Loading
0