[3.10] gh-133767: Fix use-after-free in the unicode-escape decoder wi… · python/cpython@8b528ca · GitHub
[go: up one dir, main page]

Skip to content

Commit 8b528ca

Browse files
[3.10] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944)
If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after destroying that temporary bytes object. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58) (cherry picked from commit 6279eb8) (cherry picked from commit a75953b) (cherry picked from commit 0c33e5b) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 6322edd commit 8b528ca

File tree

8 files changed

+165
-41
lines changed

8 files changed

+165
-41
lines changed

Include/cpython/bytesobject.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
2525
int use_bytearray);
2626

2727
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
28+
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
29+
const char *,
30+
int *, const char **);
31+
// Export for binary compatibility.
2832
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
2933
const char *, const char **);
3034

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
844844

845845
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
846846
chars. */
847+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
848+
const char *string, /* Unicode-Escape encoded string */
849+
Py_ssize_t length, /* size of string */
850+
const char *errors, /* error handling */
851+
Py_ssize_t *consumed, /* bytes consumed */
852+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
853+
invalid escaped char (<= 0xff) or invalid
854+
octal escape (> 0xff) in string. */
855+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
856+
point to the first invalid escaped
857+
char in string.
858+
May be NULL if errors is not NULL. */
859+
// Export for binary compatibility.
847860
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
848861
const char *string, /* Unicode-Escape encoded string */
849862
Py_ssize_t length, /* size of string */

Lib/test/test_codeccallbacks.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import codecs
22
import html.entities
33
import itertools
4+
import re
45
import sys
56
import unicodedata
67
import unittest
@@ -1124,7 +1125,7 @@ def test_bug828737(self):
11241125
text = 'abc<def>ghi'*n
11251126
text.translate(charmap)
11261127

1127-
def test_mutatingdecodehandler(self):
1128+
def test_mutating_decode_handler(self):
11281129
baddata = [
11291130
("ascii", b"\xff"),
11301131
("utf-7", b"++"),
@@ -1159,6 +1160,40 @@ def mutating(exc):
11591160
for (encoding, data) in baddata:
11601161
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
11611162

1163+
def test_mutating_decode_handler_unicode_escape(self):
1164+
decode = codecs.unicode_escape_decode
1165+
def mutating(exc):
1166+
if isinstance(exc, UnicodeDecodeError):
1167+
r = data.get(exc.object[:exc.end])
1168+
if r is not None:
1169+
exc.object = r[0] + exc.object[exc.end:]
1170+
return ('\u0404', r[1])
1171+
raise AssertionError("don't know how to handle %r" % exc)
1172+
1173+
codecs.register_error('test.mutating2', mutating)
1174+
data = {
1175+
br'\x0': (b'\\', 0),
1176+
br'\x3': (b'xxx\\', 3),
1177+
br'\x5': (b'x\\', 1),
1178+
}
1179+
def check(input, expected, msg):
1180+
with self.assertWarns(DeprecationWarning) as cm:
1181+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1182+
self.assertIn(msg, str(cm.warning))
1183+
1184+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1185+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1186+
1187+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1188+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1189+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1190+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1191+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1192+
1193+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1194+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1195+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1196+
11621197
# issue32583
11631198
def test_crashing_decode_handler(self):
11641199
# better generating one more character to fill the extra space slot

Lib/test/test_codecs.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,20 +1181,32 @@ def test_escape(self):
11811181
check(br"[\501]", b"[A]")
11821182
check(br"[\x41]", b"[A]")
11831183
check(br"[\x410]", b"[A0]")
1184+
1185+
def test_warnings(self):
1186+
decode = codecs.escape_decode
1187+
check = coding_checker(self, decode)
11841188
for i in range(97, 123):
11851189
b = bytes([i])
11861190
if b not in b'abfnrtvx':
1187-
with self.assertWarns(DeprecationWarning):
1191+
with self.assertWarnsRegex(DeprecationWarning,
1192+
r"invalid escape sequence '\\%c'" % i):
11881193
check(b"\\" + b, b"\\" + b)
1189-
with self.assertWarns(DeprecationWarning):
1194+
with self.assertWarnsRegex(DeprecationWarning,
1195+
r"invalid escape sequence '\\%c'" % (i-32)):
11901196
check(b"\\" + b.upper(), b"\\" + b.upper())
1191-
with self.assertWarns(DeprecationWarning):
1197+
with self.assertWarnsRegex(DeprecationWarning,
1198+
r"invalid escape sequence '\\8'"):
11921199
check(br"\8", b"\\8")
11931200
with self.assertWarns(DeprecationWarning):
11941201
check(br"\9", b"\\9")
1195-
with self.assertWarns(DeprecationWarning):
1202+
with self.assertWarnsRegex(DeprecationWarning,
1203+
r"invalid escape sequence '\\\xfa'") as cm:
11961204
check(b"\\\xfa", b"\\\xfa")
11971205

1206+
with self.assertWarnsRegex(DeprecationWarning,
1207+
r"invalid escape sequence '\\z'"):
1208+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1209+
11981210
def test_errors(self):
11991211
decode = codecs.escape_decode
12001212
self.assertRaises(ValueError, decode, br"\x")
@@ -2408,20 +2420,31 @@ def test_escape_decode(self):
24082420
check(br"[\x410]", "[A0]")
24092421
check(br"\u20ac", "\u20ac")
24102422
check(br"\U0001d120", "\U0001d120")
2423+
2424+
def test_decode_warnings(self):
2425+
decode = codecs.unicode_escape_decode
2426+
check = coding_checker(self, decode)
24112427
for i in range(97, 123):
24122428
b = bytes([i])
24132429
if b not in b'abfnrtuvx':
2414-
with self.assertWarns(DeprecationWarning):
2430+
with self.assertWarnsRegex(DeprecationWarning,
2431+
r"invalid escape sequence '\\%c'" % i):
24152432
check(b"\\" + b, "\\" + chr(i))
24162433
if b.upper() not in b'UN':
2417-
with self.assertWarns(DeprecationWarning):
2434+
with self.assertWarnsRegex(DeprecationWarning,
2435+
r"invalid escape sequence '\\%c'" % (i-32)):
24182436
check(b"\\" + b.upper(), "\\" + chr(i-32))
2419-
with self.assertWarns(DeprecationWarning):
2437+
with self.assertWarnsRegex(DeprecationWarning,
2438+
r"invalid escape sequence '\\8'"):
24202439
check(br"\8", "\\8")
24212440
with self.assertWarns(DeprecationWarning):
24222441
check(br"\9", "\\9")
2423-
with self.assertWarns(DeprecationWarning):
2442+
with self.assertWarnsRegex(DeprecationWarning,
2443+
r"invalid escape sequence '\\\xfa'") as cm:
24242444
check(b"\\\xfa", "\\\xfa")
2445+
with self.assertWarnsRegex(DeprecationWarning,
2446+
r"invalid escape sequence '\\z'"):
2447+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
24252448

24262449
def test_decode_errors(self):
24272450
decode = codecs.unicode_escape_decode
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

Objects/bytesobject.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,10 +1089,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
10891089
}
10901090

10911091
/* Unescape a backslash-escaped string. */
1092-
PyObject *_PyBytes_DecodeEscape(const char *s,
1092+
PyObject *_PyBytes_DecodeEscape2(const char *s,
10931093
Py_ssize_t len,
10941094
const char *errors,
1095-
const char **first_invalid_escape)
1095+
int *first_invalid_escape_char,
1096+
const char **first_invalid_escape_ptr)
10961097
{
10971098
int c;
10981099
char *p;
@@ -1106,7 +1107,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11061107
return NULL;
11071108
writer.overallocate = 1;
11081109

1109-
*first_invalid_escape = NULL;
1110+
*first_invalid_escape_char = -1;
1111+
*first_invalid_escape_ptr = NULL;
11101112

11111113
end = s + len;
11121114
while (s < end) {
@@ -1181,9 +1183,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11811183
break;
11821184

11831185
default:
1184-
if (*first_invalid_escape == NULL) {
1185-
*first_invalid_escape = s-1; /* Back up one char, since we've
1186-
already incremented s. */
1186+
if (*first_invalid_escape_char == -1) {
1187+
*first_invalid_escape_char = (unsigned char)s[-1];
1188+
/* Back up one char, since we've already incremented s. */
1189+
*first_invalid_escape_ptr = s - 1;
11871190
}
11881191
*p++ = '\\';
11891192
s--;
@@ -1197,21 +1200,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11971200
return NULL;
11981201
}
11991202

1203+
// Export for binary compatibility.
1204+
PyObject *_PyBytes_DecodeEscape(const char *s,
1205+
Py_ssize_t len,
1206+
const char *errors,
1207+
const char **first_invalid_escape)
1208+
{
1209+
int first_invalid_escape_char;
1210+
return _PyBytes_DecodeEscape2(
1211+
s, len, errors,
1212+
&first_invalid_escape_char,
1213+
first_invalid_escape);
1214+
}
1215+
12001216
PyObject *PyBytes_DecodeEscape(const char *s,
12011217
Py_ssize_t len,
12021218
const char *errors,
12031219
Py_ssize_t Py_UNUSED(unicode),
12041220
const char *Py_UNUSED(recode_encoding))
12051221
{
1206-
const char* first_invalid_escape;
1207-
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
1208-
&first_invalid_escape);
1222+
int first_invalid_escape_char;
1223+
const char *first_invalid_escape_ptr;
1224+
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
1225+
&first_invalid_escape_char,
1226+
&first_invalid_escape_ptr);
12091227
if (result == NULL)
12101228
return NULL;
1211-
if (first_invalid_escape != NULL) {
1229+
if (first_invalid_escape_char != -1) {
12121230
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
12131231
"invalid escape sequence '\\%c'",
1214-
(unsigned char)*first_invalid_escape) < 0) {
1232+
first_invalid_escape_char) < 0) {
12151233
Py_DECREF(result);
12161234
return NULL;
12171235
}

Objects/unicodeobject.c

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6432,20 +6432,23 @@ PyUnicode_AsUTF16String(PyObject *unicode)
64326432
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
64336433

64346434
PyObject *
6435-
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6435+
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
64366436
Py_ssize_t size,
64376437
const char *errors,
64386438
Py_ssize_t *consumed,
6439-
const char **first_invalid_escape)
6439+
int *first_invalid_escape_char,
6440+
const char **first_invalid_escape_ptr)
64406441
{
64416442
const char *starts = s;
6443+
const char *initial_starts = starts;
64426444
_PyUnicodeWriter writer;
64436445
const char *end;
64446446
PyObject *errorHandler = NULL;
64456447
PyObject *exc = NULL;
64466448

64476449
// so we can remember if we've seen an invalid escape char or not
6448-
*first_invalid_escape = NULL;
6450+
*first_invalid_escape_char = -1;
6451+
*first_invalid_escape_ptr = NULL;
64496452

64506453
if (size == 0) {
64516454
if (consumed) {
@@ -6628,9 +6631,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
66286631
goto error;
66296632

66306633
default:
6631-
if (*first_invalid_escape == NULL) {
6632-
*first_invalid_escape = s-1; /* Back up one char, since we've
6633-
already incremented s. */
6634+
if (*first_invalid_escape_char == -1) {
6635+
*first_invalid_escape_char = c;
6636+
if (starts == initial_starts) {
6637+
/* Back up one char, since we've already incremented s. */
6638+
*first_invalid_escape_ptr = s - 1;
6639+
}
66346640
}
66356641
WRITE_ASCII_CHAR('\\');
66366642
WRITE_CHAR(c);
@@ -6669,22 +6675,39 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
66696675
return NULL;
66706676
}
66716677

6678+
// Export for binary compatibility.
6679+
PyObject *
6680+
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6681+
Py_ssize_t size,
6682+
const char *errors,
6683+
Py_ssize_t *consumed,
6684+
const char **first_invalid_escape)
6685+
{
6686+
int first_invalid_escape_char;
6687+
return _PyUnicode_DecodeUnicodeEscapeInternal2(
6688+
s, size, errors, consumed,
6689+
&first_invalid_escape_char,
6690+
first_invalid_escape);
6691+
}
6692+
66726693
PyObject *
66736694
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
66746695
Py_ssize_t size,
66756696
const char *errors,
66766697
Py_ssize_t *consumed)
66776698
{
6678-
const char *first_invalid_escape;
6679-
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6699+
int first_invalid_escape_char;
6700+
const char *first_invalid_escape_ptr;
6701+
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
66806702
consumed,
6681-
&first_invalid_escape);
6703+
&first_invalid_escape_char,
6704+
&first_invalid_escape_ptr);
66826705
if (result == NULL)
66836706
return NULL;
6684-
if (first_invalid_escape != NULL) {
6707+
if (first_invalid_escape_char != -1) {
66856708
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
66866709
"invalid escape sequence '\\%c'",
6687-
(unsigned char)*first_invalid_escape) < 0) {
6710+
first_invalid_escape_char) < 0) {
66886711
Py_DECREF(result);
66896712
return NULL;
66906713
}

Parser/string_parser.c

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,15 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
114114
len = p - buf;
115115
s = buf;
116116

117-
const char *first_invalid_escape;
118-
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119-
120-
if (v != NULL && first_invalid_escape != NULL) {
121-
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122-
/* We have not decref u before because first_invalid_escape points
117+
int first_invalid_escape_char;
118+
const char *first_invalid_escape_ptr;
119+
v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
120+
&first_invalid_escape_char,
121+
&first_invalid_escape_ptr);
122+
123+
if (v != NULL && first_invalid_escape_ptr != NULL) {
124+
if (warn_invalid_escape_sequence(parser, *first_invalid_escape_ptr, t) < 0) {
125+
/* We have not decref u before because first_invalid_escape_ptr points
123126
inside u. */
124127
Py_XDECREF(u);
125128
Py_DECREF(v);
@@ -133,14 +136,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
133136
static PyObject *
134137
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135138
{
136-
const char *first_invalid_escape;
137-
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
139+
int first_invalid_escape_char;
140+
const char *first_invalid_escape_ptr;
141+
PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
142+
&first_invalid_escape_char,
143+
&first_invalid_escape_ptr);
138144
if (result == NULL) {
139145
return NULL;
140146
}
141147

142-
if (first_invalid_escape != NULL) {
143-
if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
148+
if (first_invalid_escape_ptr != NULL) {
149+
if (warn_invalid_escape_sequence(p, *first_invalid_escape_ptr, t) < 0) {
144150
Py_DECREF(result);
145151
return NULL;
146152
}

0 commit comments

Comments
 (0)
0