ENH: inline UTF-8 byte counter and make it branchless · melissawm/numpy@ed1c192 · GitHub
[go: up one dir, main page]

Skip to content

Commit ed1c192

Browse files
committed
ENH: inline UTF-8 byte counter and make it branchless
1 parent dc78e30 commit ed1c192

File tree

2 files changed

+10
-15
lines changed

2 files changed

+10
-15
lines changed

numpy/_core/src/multiarray/stringdtype/utf8_utils.c

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -55,19 +55,6 @@ find_previous_utf8_character(const unsigned char *c, size_t nchar)
5555
return c;
5656
}
5757

58-
// Return the number of bytes in the UTF-8 character that starts at c.
//
// Only the first byte, c[0], is inspected:
//   c[0] <= 0x7F  -> 1
//   c[0] <= 0xDF  -> 2
//   c[0] <= 0xEF  -> 3
//   otherwise     -> 4
// No validation is performed: a continuation byte (0x80-0xBF) falls into the
// 2-byte branch and 0xF8-0xFF fall into the 4-byte branch, so the result is
// always in the range 1..4.
NPY_NO_EXPORT int
59-
num_bytes_for_utf8_character(const unsigned char *c) {
60-
if (c[0] <= 0x7F) {
61-
return 1;
62-
}
63-
else if (c[0] <= 0xDF) {
64-
return 2;
65-
}
66-
else if (c[0] <= 0xEF) {
67-
return 3;
68-
}
69-
return 4;
70-
}
7158

7259
NPY_NO_EXPORT int
7360
num_utf8_bytes_for_codepoint(uint32_t code)

numpy/_core/src/multiarray/stringdtype/utf8_utils.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@ extern "C" {
88
NPY_NO_EXPORT size_t
99
utf8_char_to_ucs4_code(const unsigned char *c, Py_UCS4 *code);
1010

11-
NPY_NO_EXPORT int
12-
num_bytes_for_utf8_character(const unsigned char *c);
11+
static inline int num_bytes_for_utf8_character(const unsigned char *c)
{
    // Return the byte length (1..4) of the UTF-8 character whose first byte
    // is c[0]. Only c[0] is read and the sequence is not validated.
    //
    // adapted from https://github.com/skeeto/branchless-utf8
    // The first byte of a UTF-8 character encodes the length of the
    // character in its top five bits, so a 32-entry table indexed by
    // c[0] >> 3 replaces the comparison chain:
    //   0x00-0x7F -> 1, 0x80-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xFF -> 4
    //
    // Continuation bytes (0x80-0xBF) and the invalid lead bytes 0xF8-0xFF
    // deliberately map to 2 and 4 instead of 0. That is what the branchy
    // implementation this lookup replaces returned, and a non-zero result
    // guarantees that a caller advancing its pointer by the return value
    // always makes forward progress.
    //
    // unsigned char: plain char has implementation-defined signedness.
    static const unsigned char LENGTHS_LUT[32] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4
    };
    return LENGTHS_LUT[c[0] >> 3];
}
1321

1422
NPY_NO_EXPORT const unsigned char*
1523
find_previous_utf8_character(const unsigned char *c, size_t nchar);

0 commit comments

Comments
 (0)
0