[3.12] gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors (GH-105399) (#105439) · python/cpython@c84d4d1

Commit c84d4d1

[3.12] gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors (GH-105399) (#105439)
1 parent: c607551

File tree: 6 files changed (+35, −24 lines)

Doc/library/tokenize.rst

Lines changed: 0 additions & 5 deletions

@@ -139,11 +139,6 @@ function it uses to do this is available:
     2,
     3

-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
 .. _tokenize-cli:

 Command-Line Usage
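The note is removed because the behavior it described is gone: unclosed single-quoted strings are now a hard error instead of an ``ERRORTOKEN`` sequence. A minimal sketch of the new behavior (assuming CPython 3.12 with this commit applied; the exact message text is illustrative):

import io
import tokenize

# Previously this yielded an ERRORTOKEN followed by the tokenization of
# the string's contents; it now aborts tokenization with TokenError.
try:
    list(tokenize.generate_tokens(io.StringIO("s = 'abc").readline))
except tokenize.TokenError as exc:
    print(exc)  # an "unterminated string literal"-style message plus location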

Doc/whatsnew/3.12.rst

Lines changed: 6 additions & 5 deletions

@@ -1489,14 +1489,15 @@ Changes in the Python API
 Additionally, there may be some minor behavioral changes as a consecuence of the
 changes required to support :pep:`701`. Some of these changes include:

-* Some final ``DEDENT`` tokens are now emitted within the bounds of the
-  input. This means that for a file containing 3 lines, the old version of the
-  tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
-  the token in line 3.
-
 * The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
   characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.

+* Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
+  multiline strings do.
+
+* Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+  returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
+
 Build Changes
 =============
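A short sketch of the last bullet in practice (assuming CPython 3.12 with this commit applied): tokenizing code that hits end-of-file inside an open bracket now raises :exc:`tokenize.TokenError` rather than yielding ``ERRORTOKEN`` tokens. Per the Python-tokenize.c hunk later in this commit, the message should be "unexpected EOF in multi-line statement".

import io
import tokenize

# Both snippets end before their brackets are closed, so the C tokenizer
# reports EOF and tokenize converts the plain SyntaxError to TokenError.
for source in ("(1 + 2", "[a, b"):
    try:
        list(tokenize.generate_tokens(io.StringIO(source).readline))
    except tokenize.TokenError as exc:
        print(f"{source!r} -> {exc}")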

Lib/test/test_tokenize.py

Lines changed: 6 additions & 5 deletions

@@ -3,7 +3,8 @@
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+                     TokenError)
 from io import BytesIO, StringIO
 import unittest
 from textwrap import dedent

@@ -286,7 +287,7 @@ def number_token(s):
     for lit in INVALID_UNDERSCORE_LITERALS:
         try:
             number_token(lit)
-        except SyntaxError:
+        except TokenError:
             continue
         self.assertNotEqual(number_token(lit), lit)

@@ -1379,7 +1380,7 @@ def test_latin1_normalization(self):
         self.assertEqual(found, "iso-8859-1")

     def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise TokenError if the first
         # line(s) have non-UTF-8 characters
         lines = (
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S

@@ -2754,7 +2755,7 @@ def get_tokens(string):
             "]",
         ]:
             with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)

     def test_max_indent(self):
         MAXINDENT = 100

@@ -2773,7 +2774,7 @@ def generate_source(indents):

         invalid = generate_source(MAXINDENT)
         the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
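The last hunk's expectation moves from SyntaxError to IndentationError because the new wrapper in Lib/tokenize.py (next file) converts only plain SyntaxError; subclasses such as IndentationError are re-raised untouched. A sketch of that passthrough, assuming the tokenizer's MAXINDENT limit of 100 that the test exercises:

import io
import tokenize

# Nest blocks past the indentation limit; the C tokenizer's "too many
# levels of indentation" error is an IndentationError, a SyntaxError
# subclass, so the wrapper re-raises it instead of wrapping it in
# TokenError.
source = "".join("    " * i + f"if x{i}:\n" for i in range(101))
source += "    " * 101 + "pass\n"
try:
    list(tokenize.generate_tokens(io.StringIO(source).readline))
except IndentationError as exc:
    print(type(exc).__name__, exc)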

Lib/tokenize.py

Lines changed: 18 additions & 2 deletions

@@ -517,14 +517,30 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise

+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
 def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
         it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
     else:
         it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None


 if __name__ == "__main__":
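A small usage sketch of this new error path (assuming CPython 3.12 with this commit applied; the tuple layout follows the raise above). An unterminated triple-quoted string produces the C tokenizer's "unterminated triple-quoted string literal" message, which _transform_msg rewrites to the historical "EOF in multi-line string" wording:

import io
import tokenize

try:
    list(tokenize.generate_tokens(io.StringIO("'''never closed").readline))
except tokenize.TokenError as exc:
    print(exc.args[0])  # "EOF in multi-line string" after _transform_msg
    print(exc.args[1])  # (lineno, offset) taken from the original SyntaxError

Raising ``from None`` keeps the internal SyntaxError out of the traceback, so callers of the tokenize module only ever see TokenError for plain tokenization failures.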
Misc/NEWS.d entry

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo

Python/Python-tokenize.c

Lines changed: 2 additions & 7 deletions

@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
         msg = "invalid token";
         break;
     case E_EOF:
-        if (tok->level > 0) {
-            PyErr_Format(PyExc_SyntaxError,
-                         "parenthesis '%c' was never closed",
-                         tok->parenstack[tok->level-1]);
-        } else {
-            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-        }
+        PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+        PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
         return -1;
     case E_DEDENT:
         msg = "unindent does not match any outer indentation level";

0 commit comments