gh-97997: Add col_offset field to tokenizer and use that for AST nodes (#98000) · python/cpython@3de08ce · GitHub
[go: up one dir, main page]

Skip to content
Sign in

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 3de08ce

Browse files
authored
gh-97997: Add col_offset field to tokenizer and use that for AST nodes (#98000)
1 parent c062764 commit 3de08ce

File tree

3 files changed

+44
-11
lines changed

3 files changed

+44
-11
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic.

Parser/tokenizer.c

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
#define TABSIZE 8
3838

3939
#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
40+
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
41+
type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
42+
#define ADVANCE_LINENO() \
43+
tok->lineno++; \
44+
tok->col_offset = 0;
4045

4146
/* Forward */
4247
static struct tok_state *tok_new(void);
@@ -73,6 +78,8 @@ tok_new(void)
7378
tok->pendin = 0;
7479
tok->prompt = tok->nextprompt = NULL;
7580
tok->lineno = 0;
81+
tok->starting_col_offset = -1;
82+
tok->col_offset = -1;
7683
tok->level = 0;
7784
tok->altindstack[0] = 0;
7885
tok->decoding_state = STATE_INIT;
@@ -871,7 +878,7 @@ tok_underflow_string(struct tok_state *tok) {
871878
tok->buf = tok->cur;
872879
}
873880
tok->line_start = tok->cur;
874-
tok->lineno++;
881+
ADVANCE_LINENO();
875882
tok->inp = end;
876883
return 1;
877884
}
@@ -930,7 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) {
930937
else if (tok->start != NULL) {
931938
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
932939
size_t size = strlen(newtok);
933-
tok->lineno++;
940+
ADVANCE_LINENO();
934941
if (!tok_reserve_buf(tok, size + 1)) {
935942
PyMem_Free(tok->buf);
936943
tok->buf = NULL;
@@ -943,7 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) {
943950
tok->multi_line_start = tok->buf + cur_multi_line_start;
944951
}
945952
else {
946-
tok->lineno++;
953+
ADVANCE_LINENO();
947954
PyMem_Free(tok->buf);
948955
tok->buf = newtok;
949956
tok->cur = tok->buf;
@@ -998,7 +1005,7 @@ tok_underflow_file(struct tok_state *tok) {
9981005
*tok->inp = '\0';
9991006
}
10001007

1001-
tok->lineno++;
1008+
ADVANCE_LINENO();
10021009
if (tok->decoding_state != STATE_NORMAL) {
10031010
if (tok->lineno > 2) {
10041011
tok->decoding_state = STATE_NORMAL;
@@ -1056,6 +1063,7 @@ tok_nextc(struct tok_state *tok)
10561063
int rc;
10571064
for (;;) {
10581065
if (tok->cur != tok->inp) {
1066+
tok->col_offset++;
10591067
return Py_CHARMASK(*tok->cur++); /* Fast path */
10601068
}
10611069
if (tok->done != E_OK) {
@@ -1104,6 +1112,7 @@ tok_backup(struct tok_state *tok, int c)
11041112
if ((int)(unsigned char)*tok->cur != c) {
11051113
Py_FatalError("tok_backup: wrong character");
11061114
}
1115+
tok->col_offset--;
11071116
}
11081117
}
11091118

@@ -1390,21 +1399,33 @@ tok_continuation_line(struct tok_state *tok) {
13901399
return c;
13911400
}
13921401

1402+
static int
1403+
type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
1404+
int end_col_offset, const char *start, const char *end)
1405+
{
1406+
token->level = tok->level;
1407+
token->lineno = token->end_lineno = tok->lineno;
1408+
token->col_offset = col_offset;
1409+
token->end_col_offset = end_col_offset;
1410+
token->start = start;
1411+
token->end = end;
1412+
return type;
1413+
}
1414+
13931415
static int
13941416
token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
13951417
{
13961418
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
13971419
token->level = tok->level;
13981420
token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
13991421
token->end_lineno = tok->lineno;
1400-
token->col_offset = -1;
1401-
token->end_col_offset = -1;
1422+
token->col_offset = token->end_col_offset = -1;
14021423
token->start = start;
14031424
token->end = end;
1425+
14041426
if (start != NULL && end != NULL) {
1405-
const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
1406-
token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
1407-
token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
1427+
token->col_offset = tok->starting_col_offset;
1428+
token->end_col_offset = tok->col_offset;
14081429
}
14091430
return type;
14101431
}
@@ -1419,6 +1440,7 @@ tok_get(struct tok_state *tok, struct token *token)
14191440
const char *p_end = NULL;
14201441
nextline:
14211442
tok->start = NULL;
1443+
tok->starting_col_offset = -1;
14221444
blankline = 0;
14231445

14241446
/* Get indentation level */
@@ -1518,6 +1540,7 @@ tok_get(struct tok_state *tok, struct token *token)
15181540
}
15191541

15201542
tok->start = tok->cur;
1543+
tok->starting_col_offset = tok->col_offset;
15211544

15221545
/* Return pending indents/dedents */
15231546
if (tok->pendin != 0) {
@@ -1565,25 +1588,30 @@ tok_get(struct tok_state *tok, struct token *token)
15651588

15661589
/* Set start of current token */
15671590
tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
1591+
tok->starting_col_offset = tok->col_offset - 1;
15681592

15691593
/* Skip comment, unless it's a type comment */
15701594
if (c == '#') {
15711595
const char *prefix, *p, *type_start;
1596+
int current_starting_col_offset;
15721597

15731598
while (c != EOF && c != '\n') {
15741599
c = tok_nextc(tok);
15751600
}
15761601

15771602
if (tok->type_comments) {
15781603
p = tok->start;
1604+
current_starting_col_offset = tok->starting_col_offset;
15791605
prefix = type_comment_prefix;
15801606
while (*prefix && p < tok->cur) {
15811607
if (*prefix == ' ') {
15821608
while (*p == ' ' || *p == '\t') {
15831609
p++;
1610+
current_starting_col_offset++;
15841611
}
15851612
} else if (*prefix == *p) {
15861613
p++;
1614+
current_starting_col_offset++;
15871615
} else {
15881616
break;
15891617
}
@@ -1594,7 +1622,9 @@ tok_get(struct tok_state *tok, struct token *token)
15941622
/* This is a type comment if we matched all of type_comment_prefix. */
15951623
if (!*prefix) {
15961624
int is_type_ignore = 1;
1625+
// +6 in order to skip the word 'ignore'
15971626
const char *ignore_end = p + 6;
1627+
const int ignore_end_col_offset = current_starting_col_offset + 6;
15981628
tok_backup(tok, c); /* don't eat the newline or EOF */
15991629

16001630
type_start = p;
@@ -1615,11 +1645,11 @@ tok_get(struct tok_state *tok, struct token *token)
16151645
tok_nextc(tok);
16161646
tok->atbol = 1;
16171647
}
1618-
return MAKE_TOKEN(TYPE_IGNORE);
1648+
return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
16191649
} else {
16201650
p_start = type_start;
16211651
p_end = tok->cur;
1622-
return MAKE_TOKEN(TYPE_COMMENT);
1652+
return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
16231653
}
16241654
}
16251655
}

Parser/tokenizer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ struct tok_state {
5757
int lineno; /* Current line number */
5858
int first_lineno; /* First line of a single line or multi line string
5959
expression (cf. issue 16806) */
60+
int starting_col_offset; /* The column offset at the beginning of a token */
61+
int col_offset; /* Current col offset */
6062
int level; /* () [] {} Parentheses nesting level */
6163
/* Used to allow free continuations inside them */
6264
char parenstack[MAXLEVEL];

0 commit comments

Comments
 (0)
0