8000 optimize unaligned memory load · python/cpython@800452a · GitHub
[go: up one dir, main page]

Skip to content

Commit 800452a

Browse files
committed
optimize unaligned memory load
1 parent 8e58bf2 commit 800452a

File tree

1 file changed

+103
-102
lines changed

1 file changed

+103
-102
lines changed

Objects/unicodeobject.c

Lines changed: 103 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -5036,70 +5036,15 @@ PyUnicode_DecodeUTF8(const char *s,
50365036
# error C 'size_t' size should be either 4 or 8!
50375037
#endif
50385038

5039-
static Py_ssize_t
5040-
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5041-
{
5042-
const char *p = start;
5043-
5044-
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5045-
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
5046-
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5047-
{
5048-
/* Fast path, see in STRINGLIB(utf8_decode) for
5049-
an explanation. */
5050-
/* Help allocation */
5051-
const char *_p = p;
5052-
Py_UCS1 * q = dest;
5053-
while (_p + SIZEOF_SIZE_T <= end) {
5054-
size_t value = *(const size_t *) _p;
5055-
if (value & ASCII_CHAR_MASK)
5056-
break;
5057-
*((size_t *)q) = value;
5058-
_p += SIZEOF_SIZE_T;
5059-
q += SIZEOF_SIZE_T;
5060-
}
5061-
p = _p;
5062-
while (p < end) {
5063-
if ((unsigned char)*p & 0x80)
5064-
break;
5065-
*q++ = *p++;
5066-
}
5067-
return p - start;
5068-
}
5069-
#endif
5070-
while (p < end) {
5071-
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5072-
for an explanation. */
5073-
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5074-
/* Help allocation */
5075-
const char *_p = p;
5076-
while (_p + SIZEOF_SIZE_T <= end) {
5077-
size_t value = *(const size_t *) _p;
5078-
if (value & ASCII_CHAR_MASK)
5079-
break;
5080-
_p += SIZEOF_SIZE_T;
5081-
}
5082-
p = _p;
5083-
if (_p == end)
5084-
break;
5085-
}
5086-
if ((unsigned char)*p & 0x80)
5087-
break;
5088-
++p;
5089-
}
5090-
memcpy(dest, start, p - start);
5091-
return p - start;
5092-
}
5093-
50945039
#if (defined(__clang__) || defined(__GNUC__))
5095-
#define HAS_CTZ 1
5040+
#define HAVE_CTZ 1
50965041
static inline unsigned int
50975042
ctz(size_t v)
50985043
{
50995044
return __builtin_ctzll((unsigned long long)v);
51005045
}
51015046
#elif defined(_MSC_VER)
5102-
#define HAS_CTZ 1
5047+
#define HAVE_CTZ 1
51035048
static inline unsigned int
51045049
ctz(size_t v)
51055050
{
@@ -5113,24 +5058,79 @@ ctz(size_t v)
51135058
}
51145059
#endif
51155060

5061+
#if HAVE_CTZ
5062+
// load p[0]..p[size-1] as a little-endian size_t
5063+
// without unaligned access nor read ahead.
5064+
static size_t
5065+
load_unaligned(const unsigned char *p, size_t size)
5066+
{
5067+
assert(0 <= size && size <= SIZEOF_SIZE_T);
5068+
union {
5069+
size_t s;
5070+
unsigned char b[SIZEOF_SIZE_T];
5071+
} u;
5072+
u.s = 0;
5073+
switch (size) {
5074+
case 8:
5075+
u.b[7] = p[7];
5076+
// fall through
5077+
case 7:
5078+
u.b[6] = p[6];
5079+
// fall through
5080+
case 6:
5081+
u.b[5] = p[5];
5082+
// fall through
5083+
case 5:
5084+
u.b[4] = p[4];
5085+
// fall through
5086+
case 4:
5087+
u.b[3] = p[3];
5088+
// fall through
5089+
case 3:
5090+
u.b[2] = p[2];
5091+
// fall through
5092+
case 2:
5093+
u.b[1] = p[1];
5094+
// fall through
5095+
case 1:
5096+
u.b[0] = p[0];
5097+
break;
5098+
case 0:
5099+
break;
5100+
default:
5101+
Py_UNREACHABLE();
5102+
}
5103+
return u.s;
5104+
}
5105+
#endif
5106+
51165107
static Py_ssize_t
51175108
find_first_nonascii(const unsigned char *start, const unsigned char *end)
51185109
{
51195110
const unsigned char *p = start;
51205111

5121-
if (end - start > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
5122-
while (!_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5123-
if ((unsigned char)*p & 0x80) {
5112+
if (end - start >= SIZEOF_SIZE_T) {
5113+
const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5114+
#ifdef HAVE_CTZ
5115+
size_t u = load_unaligned(p, p2 - p) & ASCII_CHAR_MASK;
5116+
if (u) {
5117+
return p - start + (ctz(u) - 7) / 8;
5118+
}
5119+
p = p2;
5120+
#else
5121+
while (p < p2) {
5122+
if (*p & 0x80) {
51245123
return p - start;
51255124
}
51265125
p++;
51275126
}
5127+
#endif
51285128
const unsigned char *e = end - SIZEOF_SIZE_T;
51295129
while (p <= e) {
5130-
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
5131-
if (value) {
5132-
#if PY_LITTLE_ENDIAN && HAS_CTZ
5133-
return p - start + (ctz(value) - 7) / 8;
5130+
size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5131+
if (u) {
5132+
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5133+
return p - start + (ctz(u) - 7) / 8;
51345134
#else
51355135
// Big-endian targets and less common compilers are hard to test,
51365136
// so fall back to a per-byte check.
@@ -5140,47 +5140,15 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
51405140
p += SIZEOF_SIZE_T;
51415141
}
51425142
}
5143-
#if HAS_CTZ
5144-
// This part looks bit tricky, but decoding short ASCII is super important.
5145-
// Since we copy from p to size_t manually, this part works fine with big endian.
5146-
while (p < end) {
5147-
size_t u = (size_t)(p[0]);
5148-
switch (end - p) {
5149-
default:
5150-
#if SIZEOF_SIZE_T == 8
5151-
u |= (size_t)(p[7]) << 56ull;
5152-
// fall through
5153-
case 7:
5154-
u |= (size_t)(p[6]) << 48ull;
5155-
// fall through
5156-
case 6:
5157-
u |= (size_t)(p[5]) << 40ull;
5158-
// fall through
5159-
case 5:
5160-
u |= (size_t)(p[4]) << 32ull;
5161-
// fall through
5162-
case 4:
5163-
#endif
5164-
u |= (size_t)(p[3]) << 24;
5165-
// fall through
5166-
case 3:
5167-
u |= (size_t)(p[2]) << 16;
5168-
// fall through
5169-
case 2:
5170-
u |= (size_t)(p[1]) << 8;
5171-
break;
5172-
case 1:
5173-
break;
5174-
}
5175-
if (u & ASCII_CHAR_MASK) {
5176-
return p - start + (ctz(u & ASCII_CHAR_MASK) - 7) / 8;
5177-
}
5178-
p += SIZEOF_SIZE_T;
5143+
#if HAVE_CTZ
5144+
size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5145+
if (u) {
5146+
return p - start + (ctz(u) - 7) / 8;
51795147
}
51805148
return end - start;
51815149
#else
51825150
while (p < end) {
5183-
if ((unsigned char)*p & 0x80) {
5151+
if (*p & 0x80) {
51845152
break;
51855153
}
51865154
p++;
@@ -5204,7 +5172,7 @@ static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned c
52045172
{
52055173
Py_ssize_t len = 0;
52065174

5207-
if (end - s > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
5175+
if (end - s >= SIZEOF_SIZE_T) {
52085176
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
52095177
len += scalar_utf8_start_char(*s++);
52105178
}
@@ -5235,6 +5203,39 @@ static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned c
52355203
return len;
52365204
}
52375205

5206+
static Py_ssize_t
5207+
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5208+
{
5209+
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5210+
if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5211+
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5212+
{
5213+
/* Fast path, see in STRINGLIB(utf8_decode) for
5214+
an explanation. */
5215+
const char *p = start;
5216+
Py_UCS1 *q = dest;
5217+
while (p + SIZEOF_SIZE_T <= end) {
5218+
size_t value = *(const size_t *) p;
5219+
if (value & ASCII_CHAR_MASK)
5220+
break;
5221+
*((size_t *)q) = value;
5222+
p += SIZEOF_SIZE_T;
5223+
q += SIZEOF_SIZE_T;
5224+
}
5225+
while (p < end) {
5226+
if ((unsigned char)*p & 0x80)
5227+
break;
5228+
*q++ = *p++;
5229+
}
5230+
return p - start;
5231+
}
5232+
#endif
5233+
Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5234+
(const unsigned char*)end);
5235+
memcpy(dest, start, pos);
5236+
return pos;
5237+
}
5238+
52385239
static int
52395240
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
52405241
const char *starts, const char *s, const char *end,

0 commit comments

Comments
 (0)
0