8000 bpo-41972: Use the "Two-Way" algorithm when searching for long substrings by sweeneyde · Pull Request #22679 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-41972: Use the "Two-Way" algorithm when searching for long substrings #22679

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 21 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
initial implementation
  • Loading branch information
sweeneyde committed Oct 11, 2020
commit 899f2894c22575f12c80081aae8959af5e8ada2e
238 changes: 238 additions & 0 deletions Objects/stringlib/fastsearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@
#define FAST_SEARCH 1
#define FAST_RSEARCH 2


#if 0 && STRINGLIB_SIZEOF_CHAR == 1
#define LOG(...) printf(__VA_ARGS__)
#define LOG_STRING(s, n) printf("%.*s", n, s)
#else
#define LOG(...)
#define LOG_STRING(s, n)
#endif

#if LONG_BIT >= 128
#define STRINGLIB_BLOOM_WIDTH 128
#elif LONG_BIT >= 64
Expand Down Expand Up @@ -160,6 +169,229 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)

#undef MEMCHR_CUT_OFF


Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len,
int inverted, Py_ssize_t *return_period)
{
// We'll eventually partition needle into
// needle[:max_suffix + 1] + needle[max_suffix + 1:]
Py_ssize_t max_suffix = -1;

Py_ssize_t suffix = 0; // candidate for max_suffix
Py_ssize_t period = 1; // candidate for return_period
Py_ssize_t k = 1; // working index

while (suffix + k < needle_len) {
STRINGLIB_CHAR a = needle[suffix + k];
STRINGLIB_CHAR b = needle[max_suffix + k];
if (inverted ? (a < b) : (b < a)) {
// Suffix is smaller, period is entire prefix so far.
suffix += k;
k = 1;
period = suffix - max_suffix;
}
else if (a == b) {
// Advance through the repitition of the current period.
if (k != period) {
k++;
}
else {
suffix += period;
k = 1;
}
}
else {
// Found a bigger suffix.
max_suffix = suffix;
suffix += 1;
k = 1;
period = 1;
}
}
*return_period = period;
return max_suffix + 1;
}


Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len,
Py_ssize_t *return_period)
{
Py_ssize_t period1, period2, max_suf1, max_suf2;

// Search using both forward and inverted character-orderings
max_suf1 = STRINGLIB(_lex_search)(needle, needle_len, 0, &period1);
max_suf2 = STRINGLIB(_lex_search)(needle, needle_len, 1, &period2);

// Choose the later suffix
if (max_suf2 < max_suf1) {
*return_period = period1;
return max_suf1;
}
else {
*return_period = period2;
return max_suf2;
}
}


Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len,
const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len,
Py_ssize_t suffix, Py_ssize_t period)
{
LOG("========================\n");
LOG("Two-way with needle="); LOG_STRING(needle, needle_len);
LOG(" and haystack="); LOG_STRING(haystack, haystack_len);
LOG("\nSplit "); LOG_STRING(needle, needle_len);
LOG(" into "); LOG_STRING(needle, suffix);
LOG(" and "); LOG_STRING(needle + suffix, needle_len - suffix);
LOG(".\n");

if (memcmp(needle, needle+period, suffix * STRINGLIB_SIZEOF_CHAR) == 0) {
LOG("needle is completely periodic.\n");
// a mismatch can only advance by the period.
// use memory to avoid re-scanning known occurrences of the period.
Py_ssize_t memory = 0;
Py_ssize_t j = 0; // index into haystack
while (j <= haystack_len - needle_len) {
// Visualize the line-up:
LOG("> "); LOG_STRING(haystack, haystack_len);
LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len);
LOG("\n");

LOG("Scanning right half.\n");
Py_ssize_t i = Py_MAX(suffix, memory);
while (i < needle_len && needle[i] == haystack[j+i]) {
i++;
}
if (i >= needle_len) {
LOG("Right half matched. Scanning left half.\n");
i = suffix - 1;
while (memory < i + 1 && needle[i] == haystack[j+i]) {
i--;
}
if (i + 1 < memory + 1) {
LOG("Left half matches. Returning %d.\n", j);
return j;
}
LOG("No match.\n");
// Remember how many periods were scanned on the right
j += period;
memory = needle_len - period;
}
else {
LOG("Skip without checking left half.\n");
j += i - suffix + 1;
memory = 0;
}
}
}
else {
LOG("needle is NOT completely periodic.\n");
// The two halves are distinct;
// no extra memory is required,
// and a mismatch results in a maximal shift.
period = 1 + Py_MAX(suffix, needle_len - suffix);
STRINGLIB_CHAR suffix_start = needle[suffix];
LOG("Using period %d.\n", period);
LOG("Right half starts with %c\n", suffix_start);

Py_ssize_t j = 0;
while (j <= haystack_len - needle_len) {
// use faster code looking for first char.
Py_ssize_t find;
find = STRINGLIB(find_char)(haystack + suffix + j,
haystack_len - needle_len - j + 1,
suffix_start);
if (find == -1) {
LOG("Not found. Return -1.\n");
return -1;
}
j += find;

LOG("Found at %d.\n", j);

LOG("> "); LOG_STRING(haystack, haystack_len);
LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len);
LOG("\n");

LOG("Checking the right half.\n");
Py_ssize_t i = suffix+1;
for (; i < needle_len; i++) {
if (needle[i] != haystack[j + i]){
LOG("No match.\n");
break;
}
}

if (needle_len <= i) {
LOG("Matches. Checking the left half.\n");
i = suffix - 1;
for (i = suffix - 1; i >= 0; i--) {
if (needle[i] != haystack[j + i]) {
break;
}
}
if (i == -1) {
LOG("Match! (at %d)\n", j);
return j;
}
j += period;
}
else {
LOG("Jump forward without checking left half.\n");
j += i - suffix + 1;
}
}

}
LOG("Reached end. Returning -1.\n");
return -1;
}


Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(memmem)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len,
const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len)
{
LOG("memmem: needle="); LOG_STRING(needle, needle_len);
LOG(" and haystack="); LOG_STRING(haystack, haystack_len);
LOG("\n");
if (needle_len == 0) {
LOG("Easy out: empty.\n");
return 0;
}
STRINGLIB_CHAR first = needle[0];

Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first);
if (index == -1) {
LOG("Easy out: empty.\n");
return -1;
}

if (haystack_len - index < needle_len) {
LOG("Easy out: no room left after first found.\n");
return -1;
}

// Do a fast compare to avoid the initialization overhead
if (memcmp(haystack+index, needle, needle_len*STRINGLIB_SIZEOF_CHAR) == 0) {
LOG("Easy out: Naive guess was correct.\n");
return index;
}

// Start later.
index++;
Py_ssize_t period, suffix;
suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period);
return STRINGLIB(_two_way)(needle, needle_len,
haystack, haystack_len,
suffix, period);
}


Py_LOCAL_INLINE(Py_ssize_t)
FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
const STRINGLIB_CHAR* p, Py_ssize_t m,
Expand Down Expand Up @@ -198,6 +430,10 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
skip = mlast - 1;
mask = 0;

if (mode == FAST_SEARCH) {
return STRINGLIB(memmem)(p, m, s, n);
}

if (mode != FAST_RSEARCH) {
const STRINGLIB_CHAR *ss = s + m - 1;
const STRINGLIB_CHAR *pp = p + m - 1;
Expand Down Expand Up @@ -281,3 +517,5 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
return count;
}

#undef LOG
#undef LOG_STRING
0