BUG: fix find of single-character with memchr

numpy · ngoldbaum · Jan 2, 2024 · Dec 31, 2023 · Jan 2, 2024 · Dec 31, 2023
commit 925471d47f8f1ff7a972fd2335db8235c8e79334
diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h
@@ -109,7 +109,7 @@ struct Buffer {
     {
         Buffer<enc> old = *this;
         operator++();
-        return old; 
+        return old;
     }
 
     inline Buffer<enc>&
@@ -124,7 +124,7 @@ struct Buffer {
     {
         Buffer<enc> old = *this;
         operator--();
-        return old; 
+        return old;
     }
 
     inline npy_ucs4
@@ -151,14 +151,16 @@ struct Buffer {
     inline Buffer<enc>
     buffer_memchr(npy_ucs4 ch, int len)
     {
+        Buffer<enc> newbuf = *this;
         switch (enc) {
         case ENCODING::ASCII:
-            buf = (char *) memchr(buf, ch, len);
-            return *this;
+            newbuf.buf = (char *) memchr(buf, ch, len);
+            break;
         case ENCODING::UTF32:
-            buf = (char *) wmemchr((wchar_t *) buf, ch, len);
-            return *this;
+            newbuf.buf = (char *) wmemchr((wchar_t *) buf, ch, len);
+            break;
         }
+        return newbuf;
     }
 
     inline int

diff --git a/numpy/_core/tests/test_defchararray.py b/numpy/_core/tests/test_defchararray.py
@@ -213,6 +213,10 @@ def setup_method(self):
                            ['12345', 'MixedCase'],
                            ['123 \t 345 \0 ', 'UPPER']]) \
                             .view(np.char.chararray)
+        # Array with longer strings, > MEMCHR_CUT_OFF in code.
+        self.C = (np.array(['ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+                            '01234567890123456789012345'])
+                  .view(np.char.chararray))
 
     def test_len(self):
         assert_(issubclass(np.char.str_len(self.A).dtype.type, np.integer))
@@ -240,12 +244,24 @@ def fail():
 
         assert_raises(TypeError, fail)
 
-    def test_find(self):
-        assert_(issubclass(self.A.find('a').dtype.type, np.integer))
-        assert_array_equal(self.A.find('a'), [[1, -1], [-1, 6], [-1, -1]])
-        assert_array_equal(self.A.find('3'), [[-1, -1], [2, -1], [2, -1]])
-        assert_array_equal(self.A.find('a', 0, 2), [[1, -1], [-1, -1], [-1, -1]])
-        assert_array_equal(self.A.find(['1', 'P']), [[-1, -1], [0, -1], [0, 1]])
+    @pytest.mark.parametrize(
+        "dtype, encode",
+        [("U", str),
+         ("S", lambda x: x.encode('ascii')),
+         ])
+    def test_find(self, dtype, encode):
+        A = self.A.astype(dtype)
+        assert_(issubclass(A.find(encode('a')).dtype.type, np.integer))
+        assert_array_equal(A.find(encode('a')),
+                           [[1, -1], [-1, 6], [-1, -1]])
+        assert_array_equal(A.find(encode('3')),
+                           [[-1, -1], [2, -1], [2, -1]])
+        assert_array_equal(A.find(encode('a'), 0, 2),
+                           [[1, -1], [-1, -1], [-1, -1]])
+        assert_array_equal(A.find([encode('1'), encode('P')]),
+                           [[-1, -1], [0, -1], [0, 1]])
+        C = self.C.astype(dtype)
+        assert_array_equal(C.find(encode('M')), [12, -1])
 
     def test_index(self):