8000 gh-126024: optimize UTF-8 decoder for short non-ASCII string by methane · Pull Request #126025 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-126024: optimize UTF-8 decoder for short non-ASCII string #126025

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 29, 2024
Merged
Prev Previous commit
Next Next commit
utf8_count_codepoints
  • Loading branch information
methane committed Oct 27, 2024
commit 9b47c2b6824c3a90933d437eecf6dbeb8a371e54
74 changes: 66 additions & 8 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s,
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#if (SIZEOF_SIZE_T == 8)
/* Mask to quickly check whether a C 'size_t' contains a
non-ASCII, UTF8-encoded char. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
// used to count codepoints in UTF-8 string.
# define VECTOR_0101 0x0101010101010101ULL
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101 0x01010101U
# define VECTOR_00FF 0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
Expand Down Expand Up @@ -5056,11 +5061,13 @@ find_first_nonascii(const char *start, const char *end)
while (p <= e) {
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
if (value) {
// Optimization only for major platforms we have CI.
#if PY_LITTLE_ENDIAN && (defined(__clang__) || defined(__GNUC__))
#if SIZEOF_SIZE_T == SIZEOF_LONG
#if SIZEOF_SIZE_T == 4
// __builtin_ctzl(0x8000) == 15.
// (15-7) / 8 == 1.
// p+1 is first non-ASCII char.
return p - start + (__builtin_ctzl(value)-7) / 8;
#elif SIZEOF_SIZE_T == SIZEOF_LONG_LONG
#else
return p - start + (__builtin_ctzll(value)-7) / 8;
#endif
#elif PY_LITTLE_ENDIAN && defined(_MSC_VER)
Expand All @@ -5071,8 +5078,11 @@ find_first_nonascii(const char *start, const char *end)
_BitScanForward64(&bitpos, value);
#endif
return p - start + (bitpos-7) / 8;
#endif
#else
// big endian and minor compilers are difficult to test.
// fallback to per byte check.
break;
#endif
}
p += SIZEOF_SIZE_T;
}
Expand All @@ -5086,6 +5096,52 @@ find_first_nonascii(const char *start, const char *end)
return p - start;
}

static inline int scalar_utf8_start_char(unsigned int ch)
{
// 0xxxxxxx or 11xxxxxx are first byte.
return (~ch >> 7 | ch >> 6) & 1;
}

static inline size_t vector_utf8_start_chars(size_t v)
{
return ((~v>>7) | (v>>6)) & VECTOR_0101;
}

static Py_ssize_t utf8_count_codepoints(const unsigned char *s, Py_ssize_t size)
{
Py_ssize_t len = 0;
const unsigned char *end = s+size;

if (end - s > SIZEOF_SIZE_T*2) {
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
len += scalar_utf8_start_char(*s++);
}

while (s + SIZEOF_SIZE_T <= end) {
const unsigned char *e = end;
if (e - s > SIZEOF_SIZE_T * 255) {
e = s + SIZEOF_SIZE_T * 255;
}
Py_ssize_t vstart = 0;
while (s + SIZEOF_SIZE_T <= e) {
size_t v = *(size_t*)s;
size_t vs = vector_utf8_start_chars(v);
vstart += vs;
s += SIZEOF_SIZE_T;
}
vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
vstart += vstart >> 16;
#if SIZEOF_SIZE_T == 8
vstart += vstart >> 32;
#endif
len += vstart & 0x7ff;
}
}
while (s < end) {
len += scalar_utf8_start_char(*s++);
}
return len;
}

static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
Expand Down Expand Up @@ -5234,8 +5290,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
const char *end = s + size;

Py_ssize_t pos = find_first_nonascii(starts, end);
if (pos == size) {
// fast path: ASCII
if (pos == size) { // fast path: ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
Expand All @@ -5248,8 +5303,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
}

int maxchr = 127;
Py_ssize_t maxsize = size;

unsigned char ch = (unsigned char)s[pos];
if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2) {
maxsize = utf8_count_codepoints((const unsigned char *)s, size);
if (ch < 0xc4) { // latin1
maxchr = 255;
}
Expand All @@ -5260,7 +5318,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
maxchr = 0x10ffff;
}
}
PyObject *u = PyUnicode_New(size, maxchr);
PyObject *u = PyUnicode_New(maxsize, maxchr);
if (!u) {
return NULL;
}
Expand Down
Loading
0