-
-
Notifications
You must be signed in to change notification settings - Fork 32.5k
gh-120317: Lock around global state in the tokenize module #120318
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
900dd6a
abf568a
9dfe174
5fd07b6
814d9af
c1093be
117d256
975f01a
ca466fa
3abb8ab
601e539
6290f19
0ddde2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import io | ||
import time | ||
import unittest | ||
import tokenize | ||
from functools import partial | ||
from threading import Thread | ||
|
||
|
||
class TestTokenize(unittest.TestCase): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tested this locally: it crashes spectacularly without the fix :) |
||
def test_tokenizer_iter(self): | ||
source = io.StringIO("for _ in a:\n pass") | ||
it = tokenize._tokenize.TokenizerIter(source.readline, extra_tokens=False) | ||
|
||
tokens = [] | ||
def next_token(it): | ||
while True: | ||
try: | ||
r = next(it) | ||
tokens.append(tokenize.TokenInfo._make(r)) | ||
time.sleep(1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we make this test faster? It's going to run on every CI job. I think we should aim for 0.1 seconds or less for these sorts of tests. (It's currently ~3 seconds). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed the amount of time each thread sleeps to 0.03 secs, which means that the whole test now takes about 0.1 seconds. Also verified that the test still fails without the fix and succeeds with it. |
||
except StopIteration: | ||
return | ||
|
||
threads = [] | ||
for _ in range(5): | ||
threads.append(Thread(target=partial(next_token, it))) | ||
|
||
for thread in threads: | ||
thread.start() | ||
|
||
for thread in threads: | ||
thread.join() | ||
|
||
expected_tokens = [ | ||
tokenize.TokenInfo(type=1, string='for', start=(1, 0), end=(1, 3), line='for _ in a:\n'), | ||
tokenize.TokenInfo(type=1, string='_', start=(1, 4), end=(1, 5), line='for _ in a:\n'), | ||
tokenize.TokenInfo(type=1, string='in', start=(1, 6), end=(1, 8), line='for _ in a:\n'), | ||
tokenize.TokenInfo(type=1, string='a', start=(1, 9), end=(1, 10), line='for _ in a:\n'), | ||
tokenize.TokenInfo(type=11, string=':', start=(1, 10), end=(1, 11), line='for _ in a:\n'), | ||
tokenize.TokenInfo(type=4, string='', start=(1, 11), end=(1, 11), line='for _ in a:\n'), | ||
tokenize.TokenInfo(type=5, string='', start=(2, -1), end=(2, -1), line=' pass'), | ||
tokenize.TokenInfo(type=1, string='pass', start=(2, 2), end=(2, 6), line=' pass'), | ||
tokenize.TokenInfo(type=4, string='', start=(2, 6), end=(2, 6), line=' pass'), | ||
tokenize.TokenInfo(type=6, string='', start=(2, -1), end=(2, -1), line=' pass'), | ||
tokenize.TokenInfo(type=0, string='', start=(2, -1), end=(2, -1), line=' pass'), | ||
] | ||
|
||
for token in tokens: | ||
self.assertIn(token, expected_tokens) | ||
|
||
|
||
if __name__ == "__main__": | ||
unittest.main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,10 @@ | ||
#include "Python.h" | ||
#include "errcode.h" | ||
#include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION | ||
#include "../Parser/lexer/state.h" | ||
#include "../Parser/lexer/lexer.h" | ||
#include "../Parser/tokenizer/tokenizer.h" | ||
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() | ||
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() | ||
|
||
static struct PyModuleDef _tokenizemodule; | ||
|
||
|
@@ -175,14 +176,64 @@ _tokenizer_error(struct tok_state *tok) | |
return result; | ||
} | ||
|
||
static PyObject * | ||
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size) | ||
{ | ||
lysnikolaou marked this conversation as resolved.
Show resolved
Hide resolved
|
||
PyObject *line; | ||
if (it->tok->lineno != it->last_lineno) { | ||
// Line has changed since last token, so we fetch the new line and cache it | ||
// in the iter object. | ||
Py_XDECREF(it->last_line); | ||
line = PyUnicode_DecodeUTF8(line_start, size, "replace"); | ||
it->last_line = line; | ||
it->byte_col_offset_diff = 0; | ||
} else { | ||
lysnikolaou marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Line hasn't changed so we reuse the cached one. | ||
line = it->last_line; | ||
} | ||
return line; | ||
} | ||
|
||
static void | ||
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start, | ||
PyObject *line, Py_ssize_t lineno, Py_ssize_t end_lineno, | ||
Py_ssize_t *col_offset, Py_ssize_t *end_col_offset) | ||
{ | ||
Py_ssize_t byte_offset; | ||
lysnikolaou marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (token.start != NULL && token.start >= line_start) { | ||
byte_offset = token.start - line_start; | ||
*col_offset = byte_offset - it->byte_col_offset_diff; | ||
} | ||
|
||
if (token.end != NULL && token.end >= it->tok->line_start) { | ||
Py_ssize_t end_byte_offset = token.end - it->tok->line_start; | ||
if (lineno == end_lineno) { | ||
// If the whole token is at the same line, we can just use the token.start | ||
// buffer for figuring out the new column offset, since using line is not | ||
// performant for very long lines. | ||
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset); | ||
lysnikolaou marked this conversation as resolved.
Show resolved
Hide resolved
lysnikolaou marked this conversation as resolved.
Show resolved
Hide resolved
|
||
*end_col_offset = *col_offset + token_col_offset; | ||
it->byte_col_offset_diff += token.end - token.start - token_col_offset; | ||
} else { | ||
lysnikolaou marked this conversation as resolved.
Show resolved
Hide r
10000
esolved
|
||
*end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset); | ||
it->byte_col_offset_diff += end_byte_offset - *end_col_offset; | ||
} | ||
} | ||
it->last_lineno = lineno; | ||
} | ||
|
||
static PyObject * | ||
tokenizeriter_next(tokenizeriterobject *it) | ||
{ | ||
PyObject* result = NULL; | ||
struct token token; | ||
_PyToken_Init(&token); | ||
|
||
int type = _PyTokenizer_Get(it->tok, &token); | ||
int type; | ||
Py_BEGIN_CRITICAL_SECTION(it); | ||
type = _PyTokenizer_Get(it->tok, &token); | ||
Py_END_CRITICAL_SECTION(); | ||
|
||
if (type == ERRORTOKEN) { | ||
if(!PyErr_Occurred()) { | ||
_tokenizer_error(it->tok); | ||
|
@@ -192,7 +243,11 @@ tokenizeriter_next(tokenizeriterobject *it) | |
} | ||
if (it->done || type == ERRORTOKEN) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we need to protect the read? Otherwise we may get weird partial reads no? |
||
PyErr_SetString(PyExc_StopIteration, "EOF"); | ||
|
||
Py_BEGIN_CRITICAL_SECTION(it); | ||
it->done = 1; | ||
Py_END_CRITICAL_SECTION(); | ||
|
||
goto exit; | ||
} | ||
PyObject *str = NULL; | ||
|
@@ -220,18 +275,9 @@ tokenizeriter_next(tokenizeriterobject *it) | |
if (size >= 1 && it->tok->implicit_newline) { | ||
size -= 1; | ||
} | ||
|
||
if (it->tok->lineno != it->last_lineno) { | ||
// Line has changed since last token, so we fetch the new line and cache it | ||
// in the iter object. | ||
Py_XDECREF(it->last_line); | ||
line = PyUnicode_DecodeUTF8(line_start, size, "replace"); | ||
it->last_line = line; | ||
it->byte_col_offset_diff = 0; | ||
} else { | ||
// Line hasn't changed so we reuse the cached one. | ||
line = it->last_line; | ||
} | ||
Py_BEGIN_CRITICAL_SECTION(it); | ||
line = _get_current_line(it, line_start, size); | ||
Py_END_CRITICAL_SECTION(); | ||
} | ||
if (line == NULL) { | ||
Py_DECREF(str); | ||
|
@@ -240,29 +286,11 @@ tokenizeriter_next(tokenizeriterobject *it) | |
|
||
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; | ||
Py_ssize_t end_lineno = it->tok->lineno; | ||
it->last_lineno = lineno; | ||
|
||
Py_ssize_t col_offset = -1; | ||
Py_ssize_t end_col_offset = -1; | ||
Py_ssize_t byte_offset = -1; | ||
if (token.start != NULL && token.start >= line_start) { | ||
byte_offset = token.start - line_start; | ||
col_offset = byte_offset - it->byte_col_offset_diff; | ||
} | ||
if (token.end != NULL && token.end >= it->tok->line_start) { | ||
Py_ssize_t end_byte_offset = token.end - it->tok->line_start; | ||
if (lineno == end_lineno) { | ||
// If the whole token is at the same line, we can just use the token.start | ||
// buffer for figuring out the new column offset, since using line is not | ||
// performant for very long lines. | ||
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset); | ||
end_col_offset = col_offset + token_col_offset; | ||
it->byte_col_offset_diff += token.end - token.start - token_col_offset; | ||
} else { | ||
end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset); | ||
it->byte_col_offset_diff += end_byte_offset - end_col_offset; | ||
} | ||
} | ||
Py_BEGIN_CRITICAL_SECTION(it); | ||
_get_col_offsets(it, token, line_start, line, lineno, end_lineno, &col_offset, &end_col_offset); | ||
Py_END_CRITICAL_SECTION(); | ||
|
||
if (it->tok->tok_extra_tokens) { | ||
if (is_trailing_token) { | ||
|
@@ -302,7 +330,9 @@ tokenizeriter_next(tokenizeriterobject *it) | |
exit: | ||
_PyToken_Free(&token); | ||
if (type == ENDMARKER) { | ||
Py_BEGIN_CRITICAL_SECTION(it); | ||
it->done = 1; | ||
lysnikolaou marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Py_END_CRITICAL_SECTION(); | ||
} | ||
return result; | ||
} | ||
|
Uh oh!
There was an error while loading. Please reload this page.