8000 Feature/search 213 optimize levenshtein startswith by Dronplane · Pull Request #14744 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Feature/search 213 optimize levenshtein startswith #14744

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Sep 10, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
64944e5
group conjunction nodes
Dronplane Sep 3, 2021
c6d0ab3
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 6, 2021
4664cc4
wip
Dronplane Sep 6, 2021
b545d56
tests
Dronplane Sep 6, 2021
b026fa4
tests
Dronplane Sep 7, 2021
9dfe7b9
tests
Dronplane Sep 7, 2021
0abc986
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 7, 2021
5b8eda2
moar tests
Dronplane Sep 7, 2021
6f11b5d
add option to explicitly block merging
Dronplane Sep 7, 2021
337cd90
moar corner cases
Dronplane Sep 8, 2021
17ea36a
empty case optimization
Dronplane Sep 8, 2021
55c038c
optimization
Dronplane Sep 8, 2021
229c972
cleanup
Dronplane Sep 8, 2021
f5d8684
fix startsWith boost
Dronplane Sep 8, 2021
af37ee0
cleanup
Dronplane Sep 8, 2021
eae7ebb
wip
Dronplane Sep 8, 2021
26386d4 8000
wip
Dronplane Sep 8, 2021
37be60d
comments
Dronplane Sep 8, 2021
bcd2093
add node serialization tests
Dronplane Sep 8, 2021
d649660
Update arangod/IResearch/IResearchFilterFactory.cpp
Dronplane Sep 9, 2021
4cd8331
adress review comments
Dronplane Sep 9, 2021
a969139
address review comments
Dronplane Sep 9, 2021
0e6b6cf
add js test
Dronplane Sep 9, 2021
0c6d9ca
adress review comments
Dronplane Sep 9, 2021
09c678c
Update CHANGELOG
Dronplane Sep 9, 2021
1708a0a
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 9, 2021
d234738
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 10, 2021
f89293e
None -> NONE renaming
Dronplane Sep 10, 2021
340087d
Merge branch 'feature/SEARCH-213-Optimize-Levenshtein-startswith' of …
Dronplane Sep 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
address review comments
  • Loading branch information
Dronplane committed Sep 9, 2021
commit a969139e227ff079c84706827d82ca6ffdd24966
48 changes: 5 additions & 43 deletions arangod/IResearch/IResearchFilterFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
#include "IResearch/IResearchAnalyzerFeature.h"
#include "IResearch/IResearchCommon.h"
#include "IResearch/IResearchFeature.h"
#include "IResearch/IResearchFilterOptimization.h"
#include "IResearch/IResearchKludge.h"
#include "IResearch/IResearchPDP.h"
#include "Logger/LogMacros.h"
Expand Down Expand Up @@ -3572,49 +3573,10 @@ Result fromFuncStartsWith(

// Try to optimize us away
if (!isMultiPrefix && !prefixes.empty() &&
ctx.filterOptimization != FilterOptimization::None && filter->type() == irs::type<irs::And>::id()) {
for (auto& f : *filter) {
if (f.type() == irs::type<irs::by_edit_distance>::id()) {
auto& levenshtein = static_cast<irs::by_edit_distance&>(f);
if (levenshtein.field() == name) {
auto options = levenshtein.mutable_options();
auto startsWith = prefixes.back().second;
if (startsWith.size() <= options->prefix.size()) {
if (std::memcmp(options->prefix.data(), startsWith.c_str(),
startsWith.size()) == 0) {
// Nothing to do. We are already covered by this levenshtein prefix
return {};
}
} else {
// maybe we could enlarge prefix to cover us?
if (std::memcmp(options->prefix.data(), startsWith.c_str(),
options->prefix.size()) == 0) {
// looks promising - beginning of the levenshtein prefix is ok
auto prefixTailSize = startsWith.size() - options->prefix.size();
if (options->term.size() >= prefixTailSize &&
std::memcmp(options->term.data(),
startsWith.c_str() + options->prefix.size(),
prefixTailSize) == 0) {
// we could enlarge prefix
8000 options->prefix = irs::ref_cast<irs::byte_type>(startsWith);
options->term.erase(options->term.begin(),
options->term.begin() + prefixTailSize);
return {};
}
}
}
if ((options->term.size() +
options->prefix.size() +
options->max_distance) < startsWith.size()) {
// last optimization effort - we can't fulfill this conjunction.
// make it empty
filter->clear();
filter->add<irs::empty>();
return {};
}
}
}
}
ctx.filterOptimization != FilterOptimization::None &&
arangodb::iresearch::includeStartsWithInLevenshtein(filter, name,
prefixes.back().second)) {
return {};
}

if (isMultiPrefix) {
Expand Down
56 changes: 56 additions & 0 deletions arangod/IResearch/IResearchFilterOptimization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,59 @@
////////////////////////////////////////////////////////////////////////////////

#include "IResearch/IResearchFilterOptimization.h"
#include "search/levenshtein_filter.hpp"
#include "search/prefix_filter.hpp"

namespace arangodb {
namespace iresearch {


bool includeStartsWithInLevenshtein(irs::boolean_filter* filter,
irs::string_ref name,
irs::string_ref startsWith) {
if (filter->type() == irs::type<irs::And>::id()) {
for (auto& f : *filter) {
if (f.type() == irs::type<irs::by_edit_distance>::id()) {
auto& levenshtein = static_cast<irs::by_edit_distance&>(f);
if (levenshtein.field() == name) {
auto options = levenshtein.mutable_options();
if (startsWith.size() <= options->prefix.size()) {
if (std::memcmp(options->prefix.data(), startsWith.c_str(),
startsWith.size()) == 0) {
// Nothing to do. We are already covered by this levenshtein prefix
return true;
}
} else {
// maybe we could enlarge prefix to cover us?
if (std::memcmp(options->prefix.data(), startsWith.c_str(),
options->prefix.size()) == 0) {
// looks promising - beginning of the levenshtein prefix is ok
auto prefixTailSize = startsWith.size() - options->prefix.size();
if (options->term.size() >= prefixTailSize &&
std::memcmp(options->term.data(),
startsWith.c_str() + options->prefix.size(),
prefixTailSize) == 0) {
// we could enlarge prefix
options->prefix = irs::ref_cast<irs::byte_type>(startsWith);
options->term.erase(options->term.begin(), options->term.begin() + prefixTailSize);
return true;
}
}
}
if ((options->term.size() + options->prefix.size() + options->max_distance) <
startsWith.size()) {
// last optimization effort - we can't fulfill this conjunction.
// make it empty
filter->clear();
filter->add<irs::empty>();
return true;
}
}
}
}
}
return false;
}

} // namespace iresearch
} // namespace arangodb
10 changes: 8 additions & 2 deletions arangod/IResearch/IResearchFilterOptimization.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
////////////////////////////////////////////////////////////////////////////////
#pragma once

#include "search/boolean_filter.hpp"

namespace arangodb {
namespace iresearch {

Expand All @@ -29,5 +31,9 @@ enum class FilterOptimization : int {
None = 0
};

}
} //namespace arangodb
bool includeStartsWithInLevenshtein(irs::boolean_filter* filter,
irs::string_ref name,
irs::string_ref startsWith);

} // namespace iresearch
} // namespace arangodb
290F
0