BUG,MAINT: Fix utf-8 character stripping memory access · numpy/numpy@a9da01e · GitHub
[go: up one dir, main page]

Skip to content

Commit a9da01e

Browse files
seberg authored and charris committed
BUG,MAINT: Fix utf-8 character stripping memory access
This fixes the memory access bug. The old `if` was there for a reason (obviously...), but unfortunately only the sanitizer checks noticed that. To make the logic clear, I also had to rename/change things a bit: making the indices unsigned and using a stop index (one past the last character) rather than the index of the last character is just much clearer here, where the range can shrink to length 0, IMO. I.e. the old code had to check for `j >= 0`, not `j > 0` (`j` being the last character index), because `j` could go negative. Fixes the sanitizer tests.
1 parent 6afbbf8 commit a9da01e

File tree

1 file changed

+33
-26
lines changed

1 file changed

+33
-26
lines changed

numpy/_core/src/umath/string_buffer.h

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,49 +1149,54 @@ string_lrstrip_whitespace(Buffer<enc> buf, Buffer<enc> out, STRIPTYPE striptype)
11491149
return 0;
11501150
}
11511151

1152-
size_t i = 0;
1152+
size_t new_start = 0;
11531153

11541154
size_t num_bytes = (buf.after - buf.buf);
11551155
Buffer traverse_buf = Buffer<enc>(buf.buf, num_bytes);
11561156

11571157
if (striptype != STRIPTYPE::RIGHTSTRIP) {
1158-
while (i < len) {
1158+
while (new_start < len) {
11591159
if (!traverse_buf.first_character_isspace()) {
11601160
break;
11611161
}
11621162
num_bytes -= traverse_buf.num_bytes_next_character();
1163-
traverse_buf++;
1164-
i++;
1163+
new_start++;
1164+
traverse_buf++; // may go one beyond buffer
11651165
}
11661166
}
11671167

1168-
npy_intp j = len - 1; // Could also turn negative if we're stripping the whole string
1168+
size_t new_stop = len; // New stop is a range (beyond last char)
11691169
if (enc == ENCODING::UTF8) {
11701170
traverse_buf = Buffer<enc>(buf.after, 0) - 1;
11711171
}
11721172
else {
1173-
traverse_buf = buf + j;
1173+
traverse_buf = buf + (new_stop - 1);
11741174
}
11751175

11761176
if (striptype != STRIPTYPE::LEFTSTRIP) {
1177-
while (j >= static_cast<npy_intp>(i)) {
1177+
while (new_stop > new_start) {
11781178
if (*traverse_buf != 0 && !traverse_buf.first_character_isspace()) {
11791179
break;
11801180
}
1181+
11811182
num_bytes -= traverse_buf.num_bytes_next_character();
1182-
traverse_buf--;
1183-
j--;
1183+
new_stop--;
1184+
1185+
// Do not step to character -1: can't find its start for utf-8.
1186+
if (new_stop > 0) {
1187+
traverse_buf--;
1188+
}
11841189
}
11851190
}
11861191

1187-
Buffer offset_buf = buf + i;
1192+
Buffer offset_buf = buf + new_start;
11881193
if (enc == ENCODING::UTF8) {
11891194
offset_buf.buffer_memcpy(out, num_bytes);
11901195
return num_bytes;
11911196
}
1192-
offset_buf.buffer_memcpy(out, j - i + 1);
1193-
out.buffer_fill_with_zeros_after_index(j - i + 1);
1194-
return j - i + 1;
1197+
offset_buf.buffer_memcpy(out, new_stop - new_start);
1198+
out.buffer_fill_with_zeros_after_index(new_stop - new_start);
1199+
return new_stop - new_start;
11951200
}
11961201

11971202

@@ -1218,13 +1223,13 @@ string_lrstrip_chars(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> out, STRIPT
12181223
return len1;
12191224
}
12201225

1221-
size_t i = 0;
1226+
size_t new_start = 0;
12221227

12231228
size_t num_bytes = (buf1.after - buf1.buf);
12241229
Buffer traverse_buf = Buffer<enc>(buf1.buf, num_bytes);
12251230

12261231
if (striptype != STRIPTYPE::RIGHTSTRIP) {
1227-
while (i < len1) {
1232+
for (; new_start < len1; traverse_buf++) {
12281233
Py_ssize_t res;
12291234
switch (enc) {
12301235
case ENCODING::ASCII:
@@ -1245,21 +1250,20 @@ string_lrstrip_chars(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> out, STRIPT
12451250
break;
12461251
}
12471252
num_bytes -= traverse_buf.num_bytes_next_character();
1248-
traverse_buf++;
1249-
i++;
1253+
new_start++;
12501254
}
12511255
}
12521256

1253-
npy_intp j = len1 - 1;
1257+
size_t new_stop = len1; // New stop is a range (beyond last char)
12541258
if (enc == ENCODING::UTF8) {
12551259
traverse_buf = Buffer<enc>(buf1.after, 0) - 1;
12561260
}
12571261
else {
1258-
traverse_buf = buf1 + j;
1262+
traverse_buf = buf1 + (new_stop - 1);
12591263
}
12601264

12611265
if (striptype != STRIPTYPE::LEFTSTRIP) {
1262-
while (j >= static_cast<npy_intp>(i)) {
1266+
while (new_stop > new_start) {
12631267
Py_ssize_t res;
12641268
switch (enc) {
12651269
case ENCODING::ASCII:
@@ -1280,19 +1284,22 @@ string_lrstrip_chars(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> out, STRIPT
12801284
break;
12811285
}
12821286
num_bytes -= traverse_buf.num_bytes_next_character();
1283-
j--;
1284-
traverse_buf--;
1287+
new_stop--;
1288+
// Do not step to character -1: can't find its start for utf-8.
1289+
if (new_stop > 0) {
1290+
traverse_buf--;
1291+
}
12851292
}
12861293
}
12871294

1288-
Buffer offset_buf = buf1 + i;
1295+
Buffer offset_buf = buf1 + new_start;
12891296
if (enc == ENCODING::UTF8) {
12901297
offset_buf.buffer_memcpy(out, num_bytes);
12911298
return num_bytes;
12921299
}
1293-
offset_buf.buffer_memcpy(out, j - i + 1);
1294-
out.buffer_fill_with_zeros_after_index(j - i + 1);
1295-
return j - i + 1;
1300+
offset_buf.buffer_memcpy(out, new_stop - new_start);
1301+
out.buffer_fill_with_zeros_after_index(new_stop - new_start);
1302+
return new_stop - new_start;
12961303
}
12971304

12981305
template <typename char_type>

0 commit comments

Comments
 (0)
0