gh-104169: Refactor tokenizer into lexer and wrappers by lysnikolaou · Pull Request #110684 · python/cpython

gh-104169: Refactor tokenizer into lexer and wrappers #110684


Merged · 12 commits · Oct 11, 2023
Renames to remove smelly symbols
lysnikolaou committed Oct 11, 2023
commit a4190a9a8678f7a2b9afd60fbd8e5df53a2cd6a6
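
Note on the commit title: CPython's `make smelly` check flags any symbol exported from libpython whose name does not start with `Py` or `_Py`. Splitting the tokenizer into a lexer and wrappers gave several previously file-local helpers external linkage, so this commit namespaces them. A minimal sketch of the distinction, with illustrative declarations that are not from this PR:

#include "pyport.h"   /* Py_ssize_t */
struct tok_state;     /* forward declaration, just for the sketch */

/* "Smelly": external linkage, so the bare name lands in libpython's
 * symbol table (visible via nm) and can clash with embedding code. */
int tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

/* Clean: namespaced with the _PyLexer_ prefix, as this commit does. */
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

/* Also clean: a helper used by a single file can instead be declared
 * static, giving it internal linkage so nothing is exported at all. */
static int reserve_buf_local(struct tok_state *tok, Py_ssize_t size);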
10 changes: 5 additions & 5 deletions Parser/lexer/buffer.c
@@ -6,7 +6,7 @@
/* Traverse and remember all f-string buffers, in order to be able to restore
   them after reallocating tok->buf */
void
-remember_fstring_buffers(struct tok_state *tok)
+_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;
@@ -20,7 +20,7 @@ remember_fstring_buffers(struct tok_state *tok)

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
-restore_fstring_buffers(struct tok_state *tok)
+_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;
@@ -47,7 +47,7 @@ restore_fstring_buffers(struct tok_state *tok)
   reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
-tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
+_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
@@ -57,7 +57,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
    Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
    Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
    Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
-    remember_fstring_buffers(tok);
+    _PyLexer_remember_fstring_buffers(tok);
    newbuf = (char *)PyMem_Realloc(newbuf, newsize);
    if (newbuf == NULL) {
        tok->done = E_NOMEM;
@@ -70,7 +70,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
    tok->start = start < 0 ? NULL : tok->buf + start;
    tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
    tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
-    restore_fstring_buffers(tok);
+    _PyLexer_restore_fstring_buffers(tok);
    }
    return 1;
}
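
The hunks above show the core idiom behind `_PyLexer_tok_reserve_buf` and the f-string buffer helpers: convert every pointer into the buffer to an offset, reallocate, then rebase the pointers onto the new block. A condensed sketch of the idiom (illustrative names, not the PR's code):

#include <Python.h>   /* PyMem_Realloc, Py_ssize_t */

/* Save offsets, realloc, rebase: raw pointers into a buffer are
 * invalidated by realloc, but offsets from the base survive. */
static int
grow(char **buf, char **cur, char **inp, Py_ssize_t newsize)
{
    Py_ssize_t cur_off = *cur - *buf;
    Py_ssize_t inp_off = *inp - *buf;
    char *newbuf = (char *)PyMem_Realloc(*buf, newsize);
    if (newbuf == NULL) {
        return 0;                /* caller sets tok->done = E_NOMEM */
    }
    *buf = newbuf;
    *cur = newbuf + cur_off;     /* rebase against the new base pointer */
    *inp = newbuf + inp_off;
    return 1;
}

The f-string helpers exist because, per the comment above, each nested f-string mode keeps its own pointers into tok->buf, so those too must be converted to offsets and back around the same realloc.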
6 changes: 3 additions & 3 deletions Parser/lexer/buffer.h
@@ -3,8 +3,8 @@

#include "pyport.h"

-void remember_fstring_buffers(struct tok_state *tok);
-void restore_fstring_buffers(struct tok_state *tok);
-int tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);
+void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
+void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
+int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif
80 changes: 40 additions & 40 deletions Parser/lexer/lexer.c

(Large diff not rendered by default.)

2 changes: 1 addition & 1 deletion Parser/lexer/lexer.h
@@ -3,7 +3,7 @@

#include "state.h"

-int update_fstring_expr(struct tok_state *tok, char cur);
+int _PyLexer_update_fstring_expr(struct tok_state *tok, char cur);

int _PyTokenizer_Get(struct tok_state *, struct token *);
6 changes: 3 additions & 3 deletions Parser/lexer/state.c
@@ -10,7 +10,7 @@

/* Create and initialize a new tok_state structure */
struct tok_state *
-tok_new(void)
+_PyTokenizer_tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
        sizeof(struct tok_state));
@@ -113,7 +113,7 @@ _PyToken_Init(struct token *token) {
}

int
-type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
+_PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                 int end_col_offset, const char *start, const char *end)
{
    token->level = tok->level;
@@ -126,7 +126,7 @@ type_comment_token_setup(struct tok_state *tok, struct token *token, int type, i
}

int
-token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
+_PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
{
    assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
    token->level = tok->level;
6 changes: 3 additions & 3 deletions Parser/lexer/state.h
@@ -128,11 +128,11 @@ struct tok_state {
#endif
};

-int type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
+int _PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                 int end_col_offset, const char *start, const char *end);
-int token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end);
+int _PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end);

-struct tok_state *tok_new(void);
+struct tok_state *_PyTokenizer_tok_new(void);
void _PyTokenizer_Free(struct tok_state *);
void _PyToken_Free(struct token *);
void _PyToken_Init(struct token *);
42 changes: 21 additions & 21 deletions Parser/tokenizer/file_tokenizer.c
@@ -56,7 +56,7 @@ static int
tok_readline_raw(struct tok_state *tok)
{
    do {
-        if (!tok_reserve_buf(tok, BUFSIZ)) {
+        if (!_PyLexer_tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
@@ -86,7 +86,7 @@ tok_readline_recode(struct tok_state *tok) {
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
-            error_ret(tok);
+            _PyTokenizer_error_ret(tok);
            goto error;
        }
    }
@@ -95,14 +95,14 @@
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
-        error_ret(tok);
+        _PyTokenizer_error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
-    if (!tok_reserve_buf(tok, buffer_size)) {
+    if (!_PyLexer_tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
@@ -132,7 +132,7 @@ static void fp_ungetc(int c, struct tok_state *tok) {
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

-   This function is called from check_bom and check_coding_spec.
+   This function is called from _PyTokenizer_check_bom and _PyTokenizer_check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.
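
Since `fp_setreadl` is invoked from `_PyTokenizer_check_bom`, a short sketch of what BOM checking amounts to may help (simplified: the real function works through the get_char/unget_char callbacks above, handles more than the UTF-8 BOM, and never seeks a FILE* directly):

#include <stdio.h>

/* Consume a UTF-8 BOM (EF BB BF) if present; otherwise rewind so the
 * lexer sees the original bytes. Illustrative, not the PR's code. */
static int
skip_utf8_bom(FILE *fp)
{
    static const int bom[3] = {0xEF, 0xBB, 0xBF};
    for (int i = 0; i < 3; i++) {
        if (getc(fp) != bom[i]) {
            fseek(fp, 0L, SEEK_SET);
            return 0;
        }
    }
    return 1;   /* BOM consumed; source treated as UTF-8 */
}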
@@ -195,7 +195,7 @@ tok_underflow_interactive(struct tok_state *tok) {
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
-        char *translated = translate_newlines(newtok, 0, 0, tok);
+        char *translated = _PyTokenizer_translate_newlines(newtok, 0, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
@@ -206,7 +206,7 @@
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
-        PyObject *u = translate_into_utf8(newtok, tok->encoding);
+        PyObject *u = _PyTokenizer_translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
@@ -240,10 +240,10 @@ tok_underflow_interactive(struct tok_state *tok) {
    }
    else if (tok->start != NULL) {
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
-        remember_fstring_buffers(tok);
+        _PyLexer_remember_fstring_buffers(tok);
        size_t size = strlen(newtok);
        ADVANCE_LINENO();
-        if (!tok_reserve_buf(tok, size + 1)) {
+        if (!_PyLexer_tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
@@ -253,18 +253,18 @@
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
-        restore_fstring_buffers(tok);
+        _PyLexer_restore_fstring_buffers(tok);
    }
    else {
-        remember_fstring_buffers(tok);
+        _PyLexer_remember_fstring_buffers(tok);
        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
-        restore_fstring_buffers(tok);
+        _PyLexer_restore_fstring_buffers(tok);
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
@@ -273,7 +273,7 @@
        return 0;
    }

-    if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
+    if (tok->tok_mode_stack_index && !_PyLexer_update_fstring_expr(tok, 0)) {
        return 0;
    }
    return 1;
@@ -288,8 +288,8 @@ tok_underflow_file(struct tok_state *tok) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
-        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
-            error_ret(tok);
+        if (!_PyTokenizer_check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
+            _PyTokenizer_error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
@@ -320,7 +320,7 @@ tok_underflow_file(struct tok_state *tok) {
        tok->implicit_newline = 1;
    }

-    if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
+    if (tok->tok_mode_stack_index && !_PyLexer_update_fstring_expr(tok, 0)) {
        return 0;
    }

@@ -329,16 +329,16 @@
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
        }
-        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
+        else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
-    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
-        error_ret(tok);
+    if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
+        _PyTokenizer_error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
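
For context, `_PyTokenizer_check_coding_spec` implements PEP 263: the first two lines may carry a comment cookie such as `# -*- coding: latin-1 -*-`; with no cookie and no BOM the source defaults to UTF-8, which is why `_PyTokenizer_ensure_utf8` then validates the raw bytes. A hedged sketch of cookie extraction (simplified; the real check also enforces comment placement and whitespace rules):

#include <ctype.h>
#include <stddef.h>
#include <string.h>

/* Extract the encoding name following "coding:" or "coding=" on a line.
 * Returns the name's length, or 0 if no cookie is found. Illustrative. */
static size_t
get_coding_cookie(const char *line, char *out, size_t outlen)
{
    const char *t = strstr(line, "coding");
    if (t == NULL || (t[6] != ':' && t[6] != '=')) {
        return 0;
    }
    t += 7;   /* skip "coding" plus the separator */
    while (*t == ' ' || *t == '\t') {
        t++;
    }
    size_t n = 0;
    /* PEP 263 encoding names: alphanumerics plus '-', '_' and '.'. */
    while (n + 1 < outlen &&
           (isalnum((unsigned char)t[n]) ||
            t[n] == '-' || t[n] == '_' || t[n] == '.')) {
        out[n] = t[n];
        n++;
    }
    out[n] = '\0';
    return n;
}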
@@ -350,7 +350,7 @@ struct tok_state *
_PyTokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
{
-    struct tok_state *tok = tok_new();
+    struct tok_state *tok = _PyTokenizer_tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
@@ -370,7 +370,7 @@ _PyTokenizer_FromFile(FILE *fp, const char* enc,
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
-        tok->encoding = new_string(enc, strlen(enc), tok);
+        tok->encoding = _PyTokenizer_new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
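
Taken together, the wrappers follow one driver pattern: build a tok_state, then pull tokens through `_PyTokenizer_Get` until the stream ends. A hedged sketch of such a caller, using only functions declared in the diffs above (a fragment with simplified error handling, not code from this PR):

/* Tokenize a file and hand each token to some consumer. */
struct tok_state *tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
if (tok == NULL) {
    return NULL;    /* allocation failed */
}
struct token t;
_PyToken_Init(&t);
for (;;) {
    int type = _PyTokenizer_Get(tok, &t);
    if (type == ERRORTOKEN || type == ENDMARKER) {
        break;      /* lexing error or end of input */
    }
    /* ...consume the token here... */
}
_PyToken_Free(&t);
_PyTokenizer_Free(tok);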