numpy · ngoldbaum · Jan 2, 2024 · Dec 31, 2023 · Jan 2, 2024 · Dec 31, 2023
diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h
@@ -109,7 +109,7 @@ struct Buffer {
     {
         Buffer<enc> old = *this;
         operator++();
-        return old; 
+        return old;
     }
 
     inline Buffer<enc>&
@@ -124,7 +124,7 @@ struct Buffer {
     {
         Buffer<enc> old = *this;
         operator--();
        return old; 
+        return old;
     }
 
     inline npy_ucs4
@@ -151,14 +151,16 @@ struct Buffer {
     inline Buffer<enc>
     buffer_memchr(npy_ucs4 ch, int len)
     {
+        Buffer<enc> newbuf = *this;
         switch (enc) {
         case ENCODING::ASCII:
-            buf = (char *) memchr(buf, ch, len);
-            return *this;
+            newbuf.buf = (char *) memchr(buf, ch, len);
+            break;
         case ENCODING::UTF32:
-            buf = (char *) wmemchr((wchar_t *) buf, ch, len);
-            return *this;
+            newbuf.buf = (char *) wmemchr((wchar_t *) buf, ch, len);
+            break;
         }
+        return newbuf;
     }
 
     inline int

diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp
@@ -343,29 +343,25 @@ string_replace(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> buf3, npy_int64 c
     npy_int64 len2 = buf2.num_codepoints();
     npy_int64 len3 = buf3.num_codepoints();
 
-    if (len1 < len2
-        || (len2 == 0 && len3 == 0)
-        || count == 0
-        || buf2.strcmp(buf3) == 0) {
-        out.buffer_fill_with_zeros_after_index(0);
-        return;
-    }
 while (tmp2.buf < tmp2.after) { 
     if (*tmp2 < 0) { 
         return -1; 
     } 
     if (*tmp2 > 0) { 
         return 1; 
     } 
     tmp2++; 
 while (tmp2.buf < tmp2.after) { 
     if (*tmp2 < 0) { 
         return -1; 
     } 
     if (*tmp2 > 0) { 
         return 1; 
     } 
     tmp2++; 
-
     Buffer<enc> end1 = buf1 + len1;
-    npy_int64 time = count;
-    npy_intp pos = findslice_for_replace(buf1, len1, buf2, len2);
-
-    while (time > 0 && pos >= 0) {
-        buf1.buffer_memcpy(out, pos);
-        out += pos;
-        buf1 += pos;
-
-        buf3.buffer_memcpy(out, len3);
-        out += len3;
-        buf1 += len2;
+    // Only try to replace if useful.
+    if (len1 >= len2  // Input is big enough to make a match possible.
+        && len2 > 0  // Match string is not empty (so output will be finite).
+        && !(len2 == len3 && buf2.strcmp(buf3) == 0)  // Match and replacement differ.
+        ) {
+        for (npy_int64 time = 0; time < count; time++) {
+            npy_intp pos = findslice_for_replace(buf1, end1 - buf1, buf2, len2);
+            if (pos < 0) {
+                break;
+            }
+            buf1.buffer_memcpy(out, pos);
+            out += pos;
+            buf1 += pos;
 
-        time--;
-        pos = findslice_for_replace(buf1, len1, buf2, len2);
+            buf3.buffer_memcpy(out, len3);
+            out += len3;
+            buf1 += len2;
+        }
     }
 
     buf1.buffer_memcpy(out, end1 - buf1);

diff --git a/numpy/_core/tests/test_defchararray.py b/numpy/_core/tests/test_defchararray.py
@@ -213,6 +213,10 @@ def setup_method(self):
                            ['12345', 'MixedCase'],
                            ['123 \t 345 \0 ', 'UPPER']]) \
                             .view(np.char.chararray)
+        # Array with longer strings, > MEMCHR_CUT_OFF in code.
+        self.C = (np.array(['ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+                            '01234567890123456789012345'])
+                  .view(np.char.chararray))
 
     def test_len(self):
         assert_(issubclass(np.char.str_len(self.A).dtype.type, np.integer))
@@ -240,12 +244,24 @@ def fail():
 
         assert_raises(TypeError, fail)
 
-    def test_find(self):
-        assert_(issubclass(self.A.find('a').dtype.type, np.integer))
-        assert_array_equal(self.A.find('a'), [[1, -1], [-1, 6], [-1, -1]])
-        assert_array_equal(self.A.find('3'), [[-1, -1], [2, -1], [2, -1]])
-        assert_array_equal(self.A.find('a', 0, 2), [[1, -1], [-1, -1], [-1, -1]])
-        assert_array_equal(self.A.find(['1', 'P']), [[-1, -1], [0, -1], [0, 1]])
+    @pytest.mark.parametrize(
+        "dtype, encode",
+        [("U", str),
+         ("S", lambda x: x.encode('ascii')),
+         ])
+    def test_find(self, dtype, encode):
+        A = self.A.astype(dtype)
+        assert_(issubclass(A.find(encode('a')).dtype.type, np.integer))
+        assert_array_equal(A.find(encode('a')),
+                           [[1, -1], [-1, 6], [-1, -1]])
+        assert_array_equal(A.find(encode('3')),
+                           [[-1, -1], [2, -1], [2, -1]])
+        assert_array_equal(A.find(encode('a'), 0, 2),
+                           [[1, -1], [-1, -1], [-1, -1]])
+        assert_array_equal(A.find([encode('1'), encode('P')]),
+                           [[-1, -1], [0, -1], [0, 1]])
+        C = self.C.astype(dtype)
+        assert_array_equal(C.find(encode('M')), [12, -1])
 
     def test_index(self):
 
@@ -438,6 +454,16 @@ def test_replace(self):
                [b'12########## \t ##########45 \x00 ', b'UPPER']]
         assert_(issubclass(R.dtype.type, np.bytes_))
         assert_array_equal(R, tgt)
+        # Test special cases that should just return the input array,
+        # since replacements are not possible or do nothing.
+        S1 = self.A.replace(b'A very long byte string, longer than A', b'')
+        assert_array_equal(S1, self.A)
+        S2 = self.A.replace(b'', b'')
+        assert_array_equal(S2, self.A)
+        S3 = self.A.replace(b'3', b'3')
+        assert_array_equal(S3, self.A)
+        S4 = self.A.replace(b'3', b'', count=0)
+        assert_array_equal(S4, self.A)
 
     def test_rjust(self):
         assert_(issubclass(self.A.rjust(10).dtype.type, np.bytes_))

diff --git a/numpy/_core/tests/test_regression.py b/numpy/_core/tests/test_regression.py
@@ -2578,3 +2578,14 @@ def test_isin_refcnt_bug(self):
         # gh-25295
         for _ in range(1000):
             np.isclose(np.int64(2), np.int64(2), atol=1e-15, rtol=1e-300)
+
+    def test_replace_regression(self):
+        # gh-25513 segfault
+        carr = np.char.chararray((2,), itemsize=25)
+        test_strings = [b'  4.52173913043478315E+00',
+                        b'  4.95652173913043548E+00']
+        carr[:] = test_strings
+        out = carr.replace(b"E", b"D")
+        expected = np.char.chararray((2,), itemsize=25)
+        expected[:] = [s.replace(b"E", b"D") for s in test_strings]
+        assert_array_equal(out, expected)