8000 BUG: three string ufunc bugs, one leading to segfault by mhvk · Pull Request #25515 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

BUG: three string ufunc bugs, one leading to segfault #25515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions numpy/_core/src/umath/string_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ struct Buffer {
{
Buffer<enc> old = *this;
operator++();
return old;
return old;
}

inline Buffer<enc>&
Expand All @@ -124,7 +124,7 @@ struct Buffer {
{
Buffer<enc> old = *this;
operator--();
return old;
return old;
}

inline npy_ucs4
Expand All @@ -151,14 +151,16 @@ struct Buffer {
inline Buffer<enc>
buffer_memchr(npy_ucs4 ch, int len)
{
Buffer<enc> newbuf = *this;
switch (enc) {
case ENCODING::ASCII:
buf = (char *) memchr(buf, ch, len);
return *this;
newbuf.buf = (char *) memchr(buf, ch, len);
break;
case ENCODING::UTF32:
buf = (char *) wmemchr((wchar_t *) buf, ch, len);
return *this;
newbuf.buf = (char *) wmemchr((wchar_t *) buf, ch, len);
break;
}
return newbuf;
}

inline int
Expand Down
38 changes: 17 additions & 21 deletions numpy/_core/src/umath/string_ufuncs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,29 +343,25 @@ string_replace(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> buf3, npy_int64 c
npy_int64 len2 = buf2.num_codepoints();
npy_int64 len3 = buf3.num_codepoints();

if (len1 < len2
|| (len2 == 0 && len3 == 0)
|| count == 0
|| buf2.strcmp(buf3) == 0) {
out.buffer_fill_with_zeros_after_index(0);
return;
}
Comment on lines -346 to -352
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this refactoring needed? If so, could you explain the difference?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, before you returned an empty out if, e.g., the input string is shorter than the replacement string, so cannot possibly match. But in this case, the input string should unchanged, i.e., copied to the output. So, for any of those cases where it is obvious no match will be found, I now fall through directly to copying the input to the output.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course! Thanks for the explanation!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aside, while looking through strcmp just to see whether a (len2 == len3) would be useful (it is and I'll include it), I noticed that the case for a longer buf3 would return the wrong sign. Almost certainly never matters, but maybe for a next fix:

while (tmp2.buf < tmp2.after) {
if (*tmp2 < 0) {
return -1;
}
if (*tmp2 > 0) {
return 1;
}
tmp2++;
(I don't think I should do it here, but can if you'd like me to).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While looking through the commits, I refactored this a little further. Hopefully, the intent of the if statement is clearer now.


Buffer<enc> end1 = buf1 + len1;
npy_int64 time = count;
npy_intp pos = findslice_for_replace(buf1, len1, buf2, len2);

while (time > 0 && pos >= 0) {
buf1.buffer_memcpy(out, pos);
out += pos;
buf1 += pos;

buf3.buffer_memcpy(out, len3);
out += len3;
buf1 += len2;
// Only try to replace if useful.
if (len1 >= len2 // Input is big enough to make a match possible.
&& len2 > 0 // Match string is not empty (so output will be finite).
&& !(len2 == len3 && buf2.strcmp(buf3) == 0) // Match and replacement differ.
) {
for (npy_int64 time = 0; time < count; time++) {
npy_intp pos = findslice_for_replace(buf1, end1 - buf1, buf2, len2);
if (pos < 0) {
break;
}
buf1.buffer_memcpy(out, pos);
out += pos;
buf1 += pos;

time--;
pos = findslice_for_replace(buf1, len1, buf2, len2);
buf3.buffer_memcpy(out, len3);
out += len3;
buf1 += len2;
}
}

buf1.buffer_memcpy(out, end1 - buf1);
Expand Down
38 changes: 32 additions & 6 deletions numpy/_core/tests/test_defchararray.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ def setup_method(self):
['12345', 'MixedCase'],
['123 \t 345 \0 ', 'UPPER']]) \
.view(np.char.chararray)
# Array with longer strings, > MEMCHR_CUT_OFF in code.
self.C = (np.array(['ABCDEFGHIJKLMNOPQRSTUVWXYZ',
'01234567890123456789012345'])
.view(np.char.chararray))

def test_len(self):
assert_(issubclass(np.char.str_len(self.A).dtype.type, np.integer))
Expand Down Expand Up @@ -240,12 +244,24 @@ def fail():

assert_raises(TypeError, fail)

def test_find(self):
assert_(issubclass(self.A.find('a').dtype.type, np.integer))
assert_array_equal(self.A.find('a'), [[1, -1], [-1, 6], [-1, -1]])
assert_array_equal(self.A.find('3'), [[-1, -1], [2, -1], [2, -1]])
assert_array_equal(self.A.find('a', 0, 2), [[1, -1], [-1, -1], [-1, -1]])
assert_array_equal(self.A.find(['1', 'P']), [[-1, -1], [0, -1], [0, 1]])
@pytest.mark.parametrize(
"dtype, encode",
[("U", str),
("S", lambda x: x.encode('ascii')),
])
def test_find(self, dtype, encode):
A = self.A.astype(dtype)
assert_(issubclass(A.find(encode('a')).dtype.type, np.integer))
assert_array_equal(A.find(encode('a')),
[[1, -1], [-1, 6], [-1, -1]])
assert_array_equal(A.find(encode('3')),
[[-1, -1], [2, -1], [2, -1]])
assert_array_equal(A.find(encode('a'), 0, 2),
[[1, -1], [-1, -1], [-1, -1]])
assert_array_equal(A.find([encode('1'), encode('P')]),
[[-1, -1], [0, -1], [0, 1]])
C = self.C.astype(dtype)
assert_array_equal(C.find(encode('M')), [12, -1])

def test_index(self):

Expand Down Expand Up @@ -438,6 +454,16 @@ def test_replace(self):
[b'12########## \t ##########45 \x00 ', b'UPPER']]
assert_(issubclass(R.dtype.type, np.bytes_))
assert_array_equal(R, tgt)
# Test special cases that should just return the input array,
# since replacements are not possible or do nothing.
S1 = self.A.replace(b'A very long byte string, longer than A', b'')
assert_array_equal(S1, self.A)
S2 = self.A.replace(b'', b'')
assert_array_equal(S2, self.A)
S3 = self.A.replace(b'3', b'3')
assert_array_equal(S3, self.A)
S4 = self.A.replace(b'3', b'', count=0)
assert_array_equal(S4, self.A)

def test_rjust(self):
assert_(issubclass(self.A.rjust(10).dtype.type, np.bytes_))
Expand Down
11 changes: 11 additions & 0 deletions numpy/_core/tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -2578,3 +2578,14 @@ def test_isin_refcnt_bug(self):
# gh-25295
for _ in range(1000):
np.isclose(np.int64(2), np.int64(2), atol=1e-15, rtol=1e-300)

def test_replace_regression(self):
# gh-25513 segfault
carr = np.char.chararray((2,), itemsize=25)
test_strings = [b' 4.52173913043478315E+00',
b' 4.95652173913043548E+00']
carr[:] = test_strings
out = carr.replace(b"E", b"D")
expected = np.char.chararray((2,), itemsize=25)
expected[:] = [s.replace(b"E", b"D") for s in test_strings]
assert_array_equal(out, expected)
0