8000 gh-126024: optimize UTF-8 decoder for short non-ASCII string by methane · Pull Request #126025 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-126024: optimize UTF-8 decoder for short non-ASCII string #126025

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Optimize decoding of short UTF-8 sequences containing non-ASCII characters
by approximately 15%.
304 changes: 259 additions & 45 deletions Objects/unicodeobject.c
10000
Original file line number Diff line number Diff line change
Expand Up @@ -4978,39 +4978,228 @@ PyUnicode_DecodeUTF8(const char *s,
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#if (SIZEOF_SIZE_T == 8)
/* Mask to quickly check whether a C 'size_t' contains a
non-ASCII, UTF8-encoded char. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
// used to count codepoints in UTF-8 string.
# define VECTOR_0101 0x0101010101010101ULL
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101 0x01010101U
# define VECTOR_00FF 0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif

#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
return __builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
unsigned long pos;
#if SIZEOF_SIZE_T == 4
_BitScanForward(&pos, v);
#else
_BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
return pos;
}
#endif

#if HAVE_CTZ
// load p[0]..p[size-1] as a little-endian size_t
// without unaligned access nor read ahead.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
assert(size <= SIZEOF_SIZE_T);
union {
size_t s;
unsigned char b[SIZEOF_SIZE_T];
} u;
u.s = 0;
switch (size) {
case 8:
u.b[7] = p[7];
_Py_FALLTHROUGH;
case 7:
u.b[6] = p[6];
_Py_FALLTHROUGH;
case 6:
u.b[5] = p[5];
_Py_FALLTHROUGH;
case 5:
u.b[4] = p[4];
_Py_FALLTHROUGH;
case 4:
u.b[3] = p[3];
_Py_FALLTHROUGH;
case 3:
u.b[2] = p[2];
_Py_FALLTHROUGH;
case 2:
u.b[1] = p[1];
_Py_FALLTHROUGH;
case 1:
u.b[0] = p[0];
break;
case 0:
break;
default:
Py_UNREACHABLE();
}
return u.s;
}
#endif

/*
* Find the first non-ASCII character in a byte sequence.
*
* This function scans a range of bytes from `start` to `end` and returns the
* index of the first byte that is not an ASCII character (i.e., has the most
* significant bit set). If all characters in the range are ASCII, it returns
* `end - start`.
*/
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
find_first_nonascii(const unsigned char *start, const unsigned char *end)
{
const char *p = start;
const unsigned char *p = start;

if (end - start >= SIZEOF_SIZE_T) {
const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
if (p < p2) {
#if HAVE_CTZ
#if defined(_M_AMD64) || defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
// x86 and amd64 are little endian and can load unaligned memory.
size_t u = *(const size_t*)p & ASCII_CHAR_MASK;
#else
size_t u = load_unaligned(p, p2 - p) & ASCII_CHAR_MASK;
#endif
if (u) {
return p - start + (ctz(u) - 7) / 8;
}
p = p2;
}
#else
while (p < p2) {
if (*p & 0x80) {
return p - start;
}
p++;
}
#endif
const unsigned char *e = end - SIZEOF_SIZE_T;
while (p <= e) {
size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
if (u) {
#if PY_LITTLE_ENDIAN && HAVE_CTZ
return p - start + (ctz(u) - 7) / 8;
#else
// big endian and minor compilers are difficult to test.
// fallback to per byte check.
break;
#endif
}
p += SIZEOF_SIZE_T;
}
}
#if HAVE_CTZ
// we can not use *(const size_t*)p to avoid buffer overrun.
size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
if (u) {
return p - start + (ctz(u) - 7) / 8;
}
return end - start;
#else
while (p < end) {
if (*p & 0x80) {
break;
}
p++;
}
return p - start;
#endif
}

static inline int
scalar_utf8_start_char(unsigned int ch)
{
// 0xxxxxxx or 11xxxxxx are first byte.
return (~ch >> 7 | ch >> 6) & 1;
}

static inline size_t
vector_utf8_start_chars(size_t v)
{
return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
}


// Count the number of UTF-8 code points in a given byte sequence.
static Py_ssize_t
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
{
Py_ssize_t len = 0;

if (end - s >= SIZEOF_SIZE_T) {
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
len += scalar_utf8_start_char(*s++);
}

while (s + SIZEOF_SIZE_T <= end) {
const unsigned char *e = end;
if (e - s > SIZEOF_SIZE_T * 255) {
e = s + SIZEOF_SIZE_T * 255;
}
Py_ssize_t vstart = 0;
while (s + SIZEOF_SIZE_T <= e) {
size_t v = *(size_t*)s;
size_t vs = vector_utf8_start_chars(v);
vstart += vs;
s += SIZEOF_SIZE_T;
}
vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
vstart += vstart >> 16;
#if SIZEOF_SIZE_T == 8
vstart += vstart >> 32;
#endif
len += vstart & 0x7ff;
}
}
while (s < end) {
len += scalar_utf8_start_char(*s++);
}
return len;
}

static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
{
/* Fast path, see in STRINGLIB(utf8_decode) for
an explanation. */
/* Help allocation */
const char *_p = p;
Py_UCS1 * q = dest;
while (_p + SIZEOF_SIZE_T <= end) {
size_t value = *(const size_t *) _p;
const char *p = start;
Py_UCS1 *q = dest;
while (p + SIZEOF_SIZE_T <= end) {
size_t value = *(const size_t *) p;
if (value & ASCII_CHAR_MASK)
break;
*((size_t *)q) = value;
_p += SIZEOF_SIZE_T;
p += SIZEOF_SIZE_T;
q += SIZEOF_SIZE_T;
}
p = _p;
while (p < end) {
if ((unsigned char)*p & 0x80)
break;
Expand All @@ -5019,31 +5208,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
return p - start;
}
#endif
while (p < end) {
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
for an explanation. */
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
/* Help allocation */
const char *_p = p;
while (_p + SIZEOF_SIZE_T <= end) {
size_t value = *(const size_t *) _p;
if (value & ASCII_CHAR_MASK)
break;
_p += SIZEOF_SIZE_T;
}
p = _p;
if (_p == end)
break;
}
if ((unsigned char)*p & 0x80)
break;
++p;
}
memcpy(dest, start, p - start);
return p - start;
Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
(const unsigned char*)end);
memcpy(dest, start, pos);
return pos;
}


static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
const char *starts, const char *s, const char *end,
Expand Down Expand Up @@ -5187,27 +5357,69 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
return get_latin1_char((unsigned char)s[0]);
}

// fast path: try ASCII string.
const char *starts = s;
const char *end = s + size;
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
// I don't know this check is necessary or not. But there is a test
// case that requires size=PY_SSIZE_T_MAX cause MemoryError.
if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
PyErr_NoMemory();
return NULL;
}
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (decoded == size) {

const char *starts = s;
const char *end = s + size;

Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
if (pos == size) { // fast path: ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
memcpy(PyUnicode_1BYTE_DATA(u), s, size);
if (consumed) {
*consumed = size;
}
return u;
}
s += decoded;
size -= decoded;

int maxchr = 127;
Py_ssize_t maxsize = size;

unsigned char ch = (unsigned char)(s[pos]);
// error handler other than strict may remove/replace the invalid byte.
// consumed != NULL allows 1~3 bytes remainings.
// 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
// otherwise: check the input and decide the maxchr and maxsize to reduce
// reallocation and copy.
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
// we only calculate the number of codepoints and don't determine the exact maxchr.
// This is because writing fast and portable SIMD code to find maxchr is difficult.
// If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
// means that it is no longer necessary to allocate several times the required amount
// of memory.
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
if (ch < 0xc4) { // latin1
maxchr = 0xff;
}
else if (ch < 0xf0) { // ucs2
maxchr = 0xffff;
}
else { // ucs4
maxchr = 0x10ffff;
}
}
PyObject *u = PyUnicode_New(maxsize, maxchr);
if (!u) {
return NULL;
}

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = decoded;
if (maxchr <= 255) {
memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
s += pos;
size -= pos;
writer.pos = pos;
}

if (unicode_decode_utf8_impl(&writer, starts, s, end,
error_handler, errors,
Expand Down Expand Up @@ -5267,7 +5479,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
const char *errors,
Py_ssize_t *consumed)
{
return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
return unicode_decode_utf8(s, size,
errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
errors, consumed);
}


Expand Down
Loading
0