bpo-40593: Improve syntax errors for invalid characters in source code. by serhiy-storchaka · Pull Request #20033 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-40593: Improve syntax errors for invalid characters in source code. #20033

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
bpo-40593: Improve syntax errors for invalid characters in source code.
  • Loading branch information
serhiy-storchaka committed May 11, 2020
commit 8c069633d4fc403475b6168a09e3f4e04a9c1bc6
2 changes: 2 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);

PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);

#ifdef __cplusplus
}
#endif
1 change: 0 additions & 1 deletion Include/errcode.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ extern "C" {
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
#define E_BADSINGLE 27 /* Ill-formed single statement input */

#ifdef __cplusplus
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_fstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ def test_missing_expression(self):
])

# Different error message is raised for other whitespace characters.
self.assertAllRaise(SyntaxError, 'invalid character in identifier',
self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)",
["f'''{\xa0}'''",
"\xa0",
])
Expand Down
3 changes: 3 additions & 0 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def test_issue7820(self):
# one byte in common with the UTF-16-LE BOM
self.assertRaises(SyntaxError, eval, b'\xff\x20')

# one byte in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\x20')

# two bytes in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

Expand Down
8 changes: 5 additions & 3 deletions Lib/test/test_unicode_identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ def test_non_bmp_normalized(self):
def test_invalid(self):
try:
from test import badsyntax_3131
except SyntaxError as s:
self.assertEqual(str(s),
"invalid character in identifier (badsyntax_3131.py, line 2)")
except SyntaxError as err:
self.assertEqual(str(err),
"invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
self.assertEqual(err.lineno, 2)
self.assertEqual(err.offset, 1)
else:
self.fail("expected exception didn't occur")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improved syntax errors for invalid characters in source code.
37 changes: 37 additions & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -12309,6 +12309,43 @@ unicode_isnumeric_impl(PyObject *self)
Py_RETURN_TRUE;
}

/* Scan `self` as a candidate identifier.

   Returns -1 if PyUnicode_READY() fails.  Otherwise returns the index of
   the first character that is not acceptable at its position, or the
   length of the string when every character is acceptable.  An empty
   string yields 0. */
Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject *self)
{
    if (PyUnicode_READY(self) == -1) {
        return -1;
    }

    Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    if (length == 0) {
        /* The empty string is never a valid identifier. */
        return 0;
    }

    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* PEP 3131: the first character must be in XID_Start and every later
       character in XID_Continue; within the ASCII range the 2.x rules
       apply (i.e. start with a letter or underscore, continue with
       letters, digits or underscore).  With the current definitions of
       XID_Start and XID_Continue it is enough to test those two
       properties, except that '_' has to be accepted explicitly as a
       leading character. */
    Py_UCS4 first = PyUnicode_READ(kind, data, 0);
    if (first != 0x5F /* LOW LINE */ && !_PyUnicode_IsXidStart(first)) {
        return 0;
    }

    /* Advance past the run of XID_Continue characters; `pos` ends up on
       the first offending character, or at `length` if there is none. */
    Py_ssize_t pos = 1;
    while (pos < length) {
        Py_UCS4 current = PyUnicode_READ(kind, data, pos);
        if (!_PyUnicode_IsXidContinue(current)) {
            break;
        }
        pos++;
    }
    return pos;
}

int
PyUnicode_IsIdentifier(PyObject *self)
{
Expand Down
3 changes: 0 additions & 3 deletions Parser/pegen/pegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
case E_TOKEN:
msg = "invalid token";
break;
case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_EOFS:
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
return -1;
Expand Down
41 changes: 32 additions & 9 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1101,25 +1101,48 @@ static int
verify_identifier(struct tok_state *tok)
{
/* Check that the bytes from tok->start up to tok->cur form a valid
   identifier.  Returns 1 when they do and 0 otherwise; on the failure
   paths below either tok->done is set or syntaxerror() is called.

   NOTE(review): this listing is a flattened diff hunk, so statements from
   before and after the change appear interleaved.  The lines that use
   E_IDENTIFIER and `result` look like the pre-change code (the same change
   deletes the E_IDENTIFIER define from errcode.h) -- confirm against the
   real file before relying on the statement order shown here. */
PyObject *s;
int result;
/* Nothing to verify if a decoding error was already recorded. */
if (tok->decoding_erred)
return 0;
/* Decode the raw token bytes as UTF-8 into a temporary str object. */
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
if (s == NULL) {
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
PyErr_Clear();
tok->done = E_IDENTIFIER;
} else {
tok->done = E_DECODE;
}
else {
tok->done = E_ERROR;
}
return 0;
}
result = PyUnicode_IsIdentifier(s);
Py_DECREF(s);
if (result == 0) {
tok->done = E_IDENTIFIER;
/* `invalid` is negative on failure; otherwise it is compared with the
   length of `s` below to decide whether an offending character exists. */
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
if (invalid < 0) {
Py_DECREF(s);
tok->done = E_ERROR;
return 0;
}
return result;
assert(PyUnicode_GET_LENGTH(s) > 0);
if (invalid < PyUnicode_GET_LENGTH(s)) {
/* The character at index `invalid` is the one reported in the error. */
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
/* Determine the offset in UTF-8 encoded input */
/* The prefix ending with the offending character is re-encoded to
   UTF-8; its byte size is then used to move tok->cur to just past that
   character.  Both calls can fail, which leaves `s` NULL. */
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
if (s != NULL) {
Py_SETREF(s, PyUnicode_AsUTF8String(s));
}
if (s == NULL) {
tok->done = E_ERROR;
return 0;
}
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
}
Py_DECREF(s);
// PyUnicode_FromFormatV() does not support %X
/* The code point is therefore pre-formatted here: "%04X" pads to at
   least four hex digits, and the 9-byte buffer holds up to eight digits
   plus the terminating NUL. */
char hex[9];
snprintf(hex, sizeof(hex), "%04X", ch);
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
return 0;
}
/* No offending character was found: the token is a valid identifier. */
Py_DECREF(s);
return 1;
}

static int
Expand Down
3 changes: 0 additions & 3 deletions Python/pythonrun.c
Original file line number Diff line number Diff line change
Expand Up @@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
msg = "unexpected character after line continuation character";
break;

case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_BADSINGLE:
msg = "multiple statements found while compiling a single statement";
break;
Expand Down
0