@@ -785,6 +785,22 @@ unicode_result(PyObject *unicode)
785
785
static PyObject *
786
786
unicode_result_unchanged (PyObject * unicode )
787
787
{
788
+
789
+ /* Check if a Unicode string is a palindrome */
790
+ static int
791
+ unicode_is_palindrome (PyObject * unicode )
792
+ {
793
+ Py_ssize_t length = PyUnicode_GET_LENGTH (unicode );
794
+ int kind = PyUnicode_KIND (unicode );
795
+ const void * data = PyUnicode_DATA (unicode );
796
+
797
+ for (Py_ssize_t i = 0 ; i < length / 2 ; i ++ ) {
798
+ if (PyUnicode_READ (kind , data , i ) != PyUnicode_READ (kind , data , length - i - 1 )) {
799
+ return 0 ;
800
+ }
801
+ }
802
+ return 1 ;
803
+ }
788
804
if (PyUnicode_CheckExact (unicode )) {
789
805
return Py_NewRef (unicode );
790
806
}
@@ -5061,6 +5077,14 @@ load_unaligned(const unsigned char *p, size_t size)
5061
5077
}
5062
5078
#endif
5063
5079
5080
+ /*
5081
+ * Find the first non-ASCII character in a byte sequence.
5082
+ *
5083
+ * This function scans a range of bytes from `start` to `end` and returns the
5084
+ * index of the first byte that is not an ASCII character (i.e., has the most
5085
+ * significant bit set). If all characters in the range are ASCII, it returns
5086
+ * `end - start`.
5087
+ */
5064
5088
static Py_ssize_t
5065
5089
find_first_nonascii (const unsigned char * start , const unsigned char * end )
5066
5090
{
@@ -5122,18 +5146,23 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
5122
5146
#endif
5123
5147
}
5124
5148
5125
/* Return 1 if `ch` can be the first byte of a UTF-8 encoded character,
 * 0 otherwise.
 *
 * Only bits 6 and 7 of `ch` are examined.  The result is 0 exactly when
 * those bits read 10 (the 10xxxxxx continuation-byte pattern) and 1 for
 * the 0xxxxxxx and 11xxxxxx patterns.
 */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int top_two_bits = ch & 0xC0u;

    return top_two_bits != 0x80u;
}
5130
5155
5131
- static inline size_t vector_utf8_start_chars (size_t v )
5156
+ static inline size_t
5157
+ vector_utf8_start_chars (size_t v )
5132
5158
{
5133
5159
return ((~v >> 7 ) | (v >> 6 )) & VECTOR_0101 ;
5134
5160
}
5135
5161
5136
- static Py_ssize_t utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5162
+
5163
+ // Count the number of UTF-8 code points in a given byte sequence.
5164
+ static Py_ssize_t
5165
+ utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5137
5166
{
5138
5167
Py_ssize_t len = 0 ;
5139
5168
@@ -5377,6 +5406,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5377
5406
// otherwise: check the input and decide the maxchr and maxsize to reduce
5378
5407
// reallocation and copy.
5379
5408
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2 ) {
5409
+ // We only calculate the number of codepoints and don't determine the exact maxchr.
5410
+ // This is because writing fast and portable SIMD code to find maxchr is difficult.
5411
+ // If reallocation occurs for a larger maxchr, knowing the exact number of codepoints
5412
+ // means that it is no longer necessary to allocate several times the required amount
5413
+ // of memory.
5380
5414
maxsize = utf8_count_codepoints ((const unsigned char * )s , (const unsigned char * )end );
5381
5415
if (ch < 0xc4 ) { // latin1
5382
5416
maxchr = 0xff ;
0 commit comments