gh-102856: Tokenize performance improvement (#104731)
Commit 8817886 (1 parent: 4b107d8)

2 files changed, 17 additions and 13 deletions

Lib/tokenize.py

Lines changed: 1 addition & 12 deletions

@@ -449,16 +449,6 @@ def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
     token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        # TODO: Marta -> clean this up
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
-
         yield token
     if token is not None:
         last_line, _ = token.start
@@ -550,8 +540,7 @@ def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     import _tokenize as c_tokenizer
     for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
-        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
-        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+        yield TokenInfo._make(info)
 
 
 if __name__ == "__main__":
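
The Python-side simplification works because TokenInfo is a namedtuple with the fields (type, string, start, end, line), so TokenInfo._make(info) builds a token directly from the tuple the C iterator yields; the C side (next file) therefore has to emit those fields in exactly that order, which is what the new Py_BuildValue format "(iN(nn)(nn)N)" produces. A minimal sketch of the equivalence, using a hand-built tuple of that assumed shape rather than real tokenizer output:

from tokenize import TokenInfo
from token import NAME

# Hypothetical tuple already in TokenInfo field order: (type, string, start, end, line)
info = (NAME, 'spam', (1, 0), (1, 4), 'spam = 1\n')

# _make() is the generic namedtuple "build from iterable" constructor,
# so it gives the same result as spelling the fields out by name.
assert TokenInfo._make(info) == TokenInfo(type=NAME, string='spam',
                                          start=(1, 0), end=(1, 4),
                                          line='spam = 1\n')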

Python/Python-tokenize.c

Lines changed: 16 additions & 1 deletion

@@ -207,7 +207,22 @@ tokenizeriter_next(tokenizeriterobject *it)
         end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
     }
 
-    result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+    if (it->tok->tok_extra_tokens) {
+        // Necessary adjustments to match the original Python tokenize
+        // implementation
+        if (type > DEDENT && type < OP) {
+            type = OP;
+        }
+        else if (type == ASYNC || type == AWAIT) {
+            type = NAME;
+        }
+        else if (type == NEWLINE) {
+            str = PyUnicode_FromString("\n");
+            end_col_offset++;
+        }
+    }
+
+    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
     return result;
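
The per-token fixups that the pure-Python loop used to apply now run inside tokenizeriter_next, and only when extra_tokens is requested: exact token kinds strictly between DEDENT and OP are reported as the generic OP, ASYNC and AWAIT collapse to NAME, and a NEWLINE token gets its string normalized to "\n" with its end column bumped by one. A rough pure-Python restatement of the type remapping (an illustration against the stdlib token constants, not the committed C code):

import token as t

def _remap_exact_type(tok_type):
    # Exact operator/delimiter kinds sit between DEDENT and OP in the token
    # numbering; report them all as the generic OP, as tokenize.py used to.
    if t.DEDENT < tok_type < t.OP:
        return t.OP
    # ASYNC and AWAIT are surfaced as plain NAME tokens.
    if tok_type in (t.ASYNC, t.AWAIT):
        return t.NAME
    return tok_type

assert _remap_exact_type(t.PLUS) == t.OP     # '+' is an exact kind, reported as OP
assert _remap_exact_type(t.ASYNC) == t.NAME
assert _remap_exact_type(t.NAME) == t.NAME   # ordinary names pass through unchanged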
