gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V · serhiy-storchaka/cpython@e969269 · GitHub
[go: up one dir, main page]

Skip to content

Commit e969269

Browse files
gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V
PyUnicode_FromFormat() no longer produces a trailing \ufffd character for a truncated C string when a precision is used with %s and %V. It now truncates the string before the start of a truncated multibyte sequence.
1 parent 86a8a1c commit e969269

File tree

3 files changed

+50
-4
lines changed

3 files changed

+50
-4
lines changed

Lib/test/test_capi/test_unicode.py

Lines changed: 40 additions & 2 deletions
< 10000 tr class="diff-line-row">
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,27 @@ def check_format(expected, format, *args):
415415
# truncated string
416416
check_format('abc',
417417
b'%.3s', b'abcdef')
418+
check_format('abc[',
419+
b'%.6s', 'abc[\u20ac]'.encode('utf8'))
420+
check_format('abc[\u20ac',
421+
b'%.7s', 'abc[\u20ac]'.encode('utf8'))
418422
check_format('abc[\ufffd',
419-
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
423+
b'%.5s', b'abc[\xff]')
424+
check_format('abc[',
425+
b'%.6s', b'abc[\xe2\x82]')
426+
check_format('abc[\ufffd]',
427+
b'%.7s', b'abc[\xe2\x82]')
428+
check_format(' abc[',
429+
b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
430+
check_format(' abc[\u20ac',
431+
b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
432+
check_format(' abc[\ufffd',
433+
b'%10.5s', b'abc[\xff]')
434+
check_format(' abc[',
435+
b'%10.6s', b'abc[\xe2\x82]')
436+
check_format(' abc[\ufffd]',
437+
b'%10.7s', b'abc[\xe2\x82]')
438+
420439
check_format("'\\u20acABC'",
421440
b'%A', '\u20acABC')
422441
check_format("'\\u20",
@@ -429,10 +448,29 @@ def check_format(expected, format, *args):
429448
b'%.3S', '\u20acABCDEF')
430449
check_format('\u20acAB',
431450
b'%.3U', '\u20acABCDEF')
451+
432452
check_format('\u20acAB',
433453
b'%.3V', '\u20acABCDEF', None)
454+
check_format('abc[',
455+
b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
456+
check_format('abc[\u20ac',
457+
b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
434458
check_format('abc[\ufffd',
435-
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
459+
b'%.5V', None, b'abc[\xff]')
460+
check_format('abc[',
461+
b'%.6V', None, b'abc[\xe2\x82]')
462+
check_format('abc[\ufffd]',
463+
b'%.7V', None, b'abc[\xe2\x82]')
464+
check_format(' abc[',
465+
b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
466+
check_format(' abc[\u20ac',
467+
b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
468+
check_format(' abc[\ufffd',
469+
b'%10.5V', None, b'abc[\xff]')
470+
check_format(' abc[',
471+
b'%10.6V', None, b'abc[\xe2\x82]')
472+
check_format(' abc[\ufffd]',
473+
b'%10.7V', None, b'abc[\xe2\x82]')
436474

437475
# following tests comes from #7330
438476
# test width modifier and precision modifier with %S
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:c:func:`PyUnicode_FromFormat` no longer produces the ending ``\ufffd``
2+
character for a truncated C string when a precision is used with ``%s`` and ``%V``.
3+
It now truncates the string before the start of truncated multibyte
4+
sequences.

Objects/unicodeobject.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2390,23 +2390,27 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
23902390
{
23912391
/* UTF-8 */
23922392
Py_ssize_t length;
2393+
Py_ssize_t consumed;
2394+
Py_ssize_t *pconsumed;
23932395
if (precision == -1) {
23942396
length = strlen(str);
2397+
pconsumed = NULL;
23952398
}
23962399
else {
23972400
length = 0;
23982401
while (length < precision && str[length]) {
23992402
length++;
24002403
}
2404+
pconsumed = &consumed;
24012405
}
24022406

24032407
if (width < 0) {
24042408
return unicode_decode_utf8_writer(writer, str, length,
2405-
_Py_ERROR_REPLACE, "replace", NULL);
2409+
_Py_ERROR_REPLACE, "replace", pconsumed);
24062410
}
24072411

24082412
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2409-
"replace", NULL);
2413+
"replace", pconsumed);
24102414
if (unicode == NULL)
24112415
return -1;
24122416

0 commit comments

Comments
 (0)
0