8000 bpo-41972: Use the two-way algorithm for string searching (GH-22904) · python/cpython@73a85c4 · GitHub
[go: up one dir, main page]

Skip to content

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 73a85c4

Browse files
sweeneydetim-one
andauthored
bpo-41972: Use the two-way algorithm for string searching (GH-22904)
Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed. Co-authored-by: Tim Peters <tim.peters@gmail.com>
1 parent 2183d06 commit 73a85c4

File tree

4 files changed

+941
-20
lines changed

4 files changed

+941
-20
lines changed

Lib/test/string_tests.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import unittest, string, sys, struct
66
from test import support
77
from collections import UserList
8+
import random
89

910
class Sequence:
1011
def __init__(self, seq='wxyz'): self.seq = seq
@@ -317,6 +318,44 @@ def test_rindex(self):
317318
else:
318319
self.checkraises(TypeError, 'hello', 'rindex', 42)
319320

321+
def test_find_periodic_pattern(self):
322+
"""Cover the special path for periodic patterns."""
323+
def reference_find(p, s):
324+
for i in range(len(s)):
325+
if s.startswith(p, i):
326+
return i
327+
return -1
328+
329+
rr = random.randrange
330+
choices = random.choices
331+
for _ in range(1000):
332+
p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20)
333+
p = p0[:len(p0) - rr(10)] # pop off some characters
334+
left = ''.join(choices('abcdef', k=rr(2000)))
335+
right = ''.join(choices('abcdef', k=rr(2000)))
336+
text = left + p + right
337+
with self.subTest(p=p, text=text):
338+
self.checkequal(reference_find(p, text),
339+
text, 'find', p)
340+
341+
def test_find_shift_table_overflow(self):
342+
"""When the table of 8-bit shifts overflows."""
343+
N = 2**8 + 100
344+
345+
# first check the periodic case
346+
# here, the shift for 'b' is N + 1.
347+
pattern1 = 'a' * N + 'b' + 'a' * N
348+
text1 = 'babbaa' * N + pattern1
349+
self.checkequal(len(text1)-len(pattern1),
350+
text1, 'find', pattern1)
351+
352+
# now check the non-periodic case
353+
# here, the shift for 'd' is 3*(N+1)+1
354+
pattern2 = 'ddd' + 'abc' * N + "eee"
355+
text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N
356+
self.checkequal(len(text2) - N*len("de") - len(pattern2),
357+
text2, 'find', pattern2)
358+
320359
def test_lower(self):
321360
self.checkequal('hello', 'HeLLo', 'lower')
322361
self.checkequal('hello', 'hello', 'lower')
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now sometimes use the "Two-Way" string comparison algorithm to avoid quadratic behavior on long strings.

0 commit comments

Comments
 (0)
0