@@ -5077,28 +5077,32 @@ load_unaligned(const unsigned char *p, size_t size)
5077
5077
static Py_ssize_t
5078
5078
find_first_nonascii (const unsigned char * start , const unsigned char * end )
5079
5079
{
5080
+ // The search is done in `size_t` chunks.
5081
+ // The start and end might not be aligned at `size_t` boundaries,
5082
+ // so they're handled specially.
5083
+
5080
5084
const unsigned char * p = start ;
5081
5085
5082
5086
if (end - start >= SIZEOF_SIZE_T ) {
5083
- const unsigned char * p2 = _Py_ALIGN_UP ( p , SIZEOF_SIZE_T );
5087
+ // Avoid unaligned read.
5084
5088
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5085
- if (p < p2 ) {
5086
- size_t u ;
5087
- memcpy (& u , p , sizeof (size_t ));
5088
- u &= ASCII_CHAR_MASK ;
5089
- if (u ) {
5090
- return (ctz (u ) - 7 ) / 8 ;
5091
- }
5092
- p = p2 ;
5089
+ size_t u ;
5090
+ memcpy (& u , p , sizeof (size_t ));
5091
+ u &= ASCII_CHAR_MASK ;
5092
+ if (u ) {
5093
+ return (ctz (u ) - 7 ) / 8 ;
5093
5094
}
5095
+ p = _Py_ALIGN_DOWN (p + SIZEOF_SIZE_T , SIZEOF_SIZE_T );
5094
5096
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5097
+ const unsigned char * p2 = _Py_ALIGN_UP (p , SIZEOF_SIZE_T );
5095
5098
while (p < p2 ) {
5096
5099
if (* p & 0x80 ) {
5097
5100
return p - start ;
5098
5101
}
5099
5102
p ++ ;
5100
5103
}
5101
5104
#endif
5105
+
5102
5106
const unsigned char * e = end - SIZEOF_SIZE_T ;
5103
5107
while (p <= e ) {
5104
5108
size_t u = (* (const size_t * )p ) & ASCII_CHAR_MASK ;
@@ -5115,6 +5119,7 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
5115
5119
}
5116
5120
}
5117
5121
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5122
+ assert ((end - p ) < SIZEOF_SIZE_T );
5118
5123
// we can not use *(const size_t*)p to avoid buffer overrun.
5119
5124
size_t u = load_unaligned (p , end - p ) & ASCII_CHAR_MASK ;
5120
5125
if (u ) {
0 commit comments