8000 Search-161-3.8 by alexbakharew · Pull Request #14445 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Search-161-3.8 #14445

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 43 additions & 21 deletions 3rdParty/iresearch/core/search/levenshtein_filter.cpp
8000
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ template<typename Invalid, typename Term, typename Levenshtein>
inline auto executeLevenshtein(byte_type max_distance,
by_edit_distance_options::pdp_f provider,
bool with_transpositions,
Invalid inv, Term t, Levenshtein lev) {
const bytes_ref prefix, const bytes_ref target,
Invalid&& inv, Term&& t, Levenshtein&& lev) {

if (!provider) {
provider = &default_pdp;
}
Expand All @@ -74,7 +76,7 @@ inline auto executeLevenshtein(byte_type max_distance,
return inv();
}

return lev(d);
return lev(d, prefix, target);
}

template<typename StatesType>
Expand Down Expand Up @@ -186,17 +188,19 @@ template<typename Collector>
bool collect_terms(
const index_reader& index,
const string_ref& field,
const bytes_ref& prefix,
const bytes_ref& term,
const parametric_description& d,
Collector& collector) {
const auto acceptor = make_levenshtein_automaton(d, term);
const auto acceptor = make_levenshtein_automaton(d, prefix, term);

if (!validate(acceptor)) {
return false;
}

auto matcher = make_automaton_matcher(acceptor);
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(term)));
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(prefix)) +
uint32_t(utf8_utils::utf8_length(term)));
const byte_type max_distance = d.max_distance() + 1;

for (auto& segment : index) {
Expand All @@ -217,6 +221,7 @@ filter::prepared::ptr prepare_levenshtein_filter(
const order::prepared& order,
boost_t boost,
const string_ref& field,
const bytes_ref& prefix,
const bytes_ref& term,
size_t terms_limit,
const parametric_description& d) {
Expand All @@ -228,13 +233,13 @@ filter::prepared::ptr prepare_levenshtein_filter(
all_terms_collector<decltype(states)> term_collector(states, field_stats, term_stats);
term_collector.stat_index(0); // aggregate stats from different terms

if (!collect_terms(index, field, term, d, term_collector)) {
if (!collect_terms(index, field, prefix, term, d, term_collector)) {
return filter::prepared::empty();
}
} else {
top_terms_collector term_collector(terms_limit, field_stats);

if (!collect_terms(index, field, term, d, term_collector)) {
if (!collect_terms(index, field, prefix, term, d, term_collector)) {
return filter::prepared::empty();
}

Expand Down Expand Up @@ -267,23 +272,27 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)

/*static*/ field_visitor by_edit_distance::visitor(const options_type::filter_options& opts) {
return executeLevenshtein(
opts.max_distance, opts.provider, opts.with_transpositions,
opts.max_distance, opts.provider, opts.with_transpositions, opts.prefix, opts.term,
[]() -> field_visitor {
return [](const sub_reader&, const term_reader&, filter_visitor&){};
},
[&opts]() -> field_visitor {
// must copy term as it may point to temporary string
return [term = opts.term](
return [target = opts.prefix + opts.term](
const sub_reader& segment,
const term_reader& field,
filter_visitor& visitor){
return by_term::visit(segment, field, term, visitor);
return by_term::visit(segment, field, target, visitor);
};
},
[&opts](const parametric_description& d) -> field_visitor {
[](const parametric_description& d,
const bytes_ref prefix,
const bytes_ref term) -> field_visitor {
struct automaton_context : util::noncopyable {
automaton_context(const parametric_description& d, const bytes_ref& term)
: acceptor(make_levenshtein_automaton(d, term)),
automaton_context(const parametric_description& d,
const bytes_ref& prefix,
const bytes_ref& term)
: acceptor(make_levenshtein_automaton(d, prefix, term)),
matcher(make_automaton_matcher(acceptor)) {
}

Expand All @@ -292,13 +301,14 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
};

// FIXME
auto ctx = memory::make_shared<automaton_context>(d, opts.term);
auto ctx = memory::make_shared<automaton_context>(d, prefix, term);

if (!validate(ctx->acceptor)) {
return [](const sub_reader&, const term_reader&, filter_visitor&){};
}

const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(opts.term)));
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(prefix) +
utf8_utils::utf8_length(term)));
const byte_type max_distance = d.max_distance() + 1;

return [ctx, utf8_term_size, max_distance](
Expand All @@ -321,18 +331,30 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
size_t scored_terms_limit,
byte_type max_distance,
options_type::pdp_f provider,
bool with_transpositions) {
bool with_transpositions,
const bytes_ref& prefix) {

return executeLevenshtein(
max_distance, provider, with_transpositions,
max_distance, provider, with_transpositions, prefix, term,
[]() -> filter::prepared::ptr {
return prepared::empty();
},
[&index, &order, boost, &field, &term]() -> filter::prepared::ptr {
return by_term::prepare(index, order, boost, field, term);
[&index, &order, boost, &field, &prefix, &term]() -> filter::prepared::ptr {
if (!prefix.empty() && !term.empty()) {
bstring target;
target.reserve(prefix.size() + term.size());
target += prefix;
target += term;
return by_term::prepare(index, order, boost, field, target);
}

return by_term::prepare(index, order, boost, field, prefix.empty() ? term : prefix);
},
[&field, &term, scored_terms_limit, & 8000 amp;index, &order, boost](
const parametric_description& d) -> filter::prepared::ptr {
return prepare_levenshtein_filter(index, order, boost, field, term, scored_terms_limit, d);
[&field, scored_terms_limit, &index, &order, boost](
const parametric_description& d,
const bytes_ref prefix,
const bytes_ref term) -> filter::prepared::ptr {
return prepare_levenshtein_filter(index, order, boost, field, prefix, term, scored_terms_limit, d);
}
);
}
Expand Down
17 changes: 12 additions & 5 deletions 3rdParty/iresearch/core/search/levenshtein_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ struct IRESEARCH_API by_edit_distance_filter_options {
//////////////////////////////////////////////////////////////////////////////
bstring term;

//////////////////////////////////////////////////////////////////////////////
/// @brief match this number of characters from the beginning of the
/// target regardless of edit distance
//////////////////////////////////////////////////////////////////////////////
bstring prefix;

//////////////////////////////////////////////////////////////////////////////
/// @returns current parametric description provider, nullptr - use default
/// @note since creation of parametric description is expensive operation,
Expand All @@ -68,9 +74,9 @@ struct IRESEARCH_API by_edit_distance_filter_options {
}

size_t hash() const noexcept {
return hash_combine(std::hash<bool>()(with_transpositions),
hash_combine(std::hash<bstring>()(term),
std::hash<byte_type>()(max_distance)));
const auto hash0 = hash_combine(std::hash<bool>()(with_transpositions), std::hash<bstring>()(term));
const auto hash1 = hash_combine(std::hash<byte_type>()(max_distance), std::hash<bstring>()(prefix));
return hash_combine(hash0, hash1);
}
};

Expand Down Expand Up @@ -115,7 +121,8 @@ class IRESEARCH_API by_edit_distance final
size_t terms_limit,
byte_type max_distance,
options_type::pdp_f provider,
bool with_transpositions);
bool with_transpositions,
const bytes_ref& prefix);

static field_visitor visitor(
const options_type::filter_options& options);
Expand All @@ -130,7 +137,7 @@ class IRESEARCH_API by_edit_distance final
return prepare(index, order, this->boost()*boost,
field(), options().term, options().max_terms,
options().max_distance, options().provider,
options().with_transpositions);
options().with_transpositions, options().prefix);
}
}; // by_edit_distance

Expand Down
2 changes: 1 addition & 1 deletion 3rdParty/iresearch/core/search/phrase_filter.cpp
D966
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ struct prepare : util::noncopyable {
index, order, boost, field, part.term,
0, // collect all terms
part.max_distance, part.provider,
part.with_transpositions);
part.with_transpositions, part.prefix);
}

result_type operator()(const by_terms_options& /*part*/) const {
Expand Down
24 changes: 19 additions & 5 deletions 3rdParty/iresearch/core/utils/levenshtein_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ parametric_description read(data_input& in) {

automaton make_levenshtein_automaton(
const parametric_description& description,
const bytes_ref& prefix,
const bytes_ref& target) {
assert(description);

Expand Down Expand Up @@ -613,18 +614,31 @@ automaton make_levenshtein_automaton(
UNUSED(invalid_state);

// initial state
a.SetStart(a.AddState());
auto start_state = a.AddState();
a.SetStart(start_state);

auto begin = prefix.begin();
auto end = prefix.end();
decltype(start_state) to;
for (; begin != end; ) {
const byte_type* next = utf8_utils::next(begin, end);
to = a.AddState();
auto dist = std::distance(begin, next);
irs::utf8_emplace_arc(a, start_state, bytes_ref(begin, dist), to);
start_state = to;
begin = next;
}

// check if start state is final
const auto distance = description.distance(1, utf8_size);

if (distance <= description.max_distance()) {
a.SetFinal(a.Start(), {true, distance});
a.SetFinal(start_state, {true, distance});
}

// state stack
std::vector<state> stack;
stack.emplace_back(0, 1, a.Start()); // 0 offset, 1st parametric state, initial automaton state
stack.emplace_back(0, 1, start_state); // 0 offset, 1st parametric state, initial automaton state

std::vector<std::pair<bytes_ref, automaton::StateId>> arcs;
arcs.resize(utf8_size); // allocate space for max possible number of arcs
Expand Down Expand Up @@ -714,7 +728,7 @@ size_t edit_distance(const parametric_description& description,
const auto c = utf8_utils::next(rhs);

const auto begin = lhs_chars.begin() + ptrdiff_t(offset);
const auto end = std::min(begin + ptrdiff_t(description.chi_size()), lhs_chars.end());
const auto end = lhs_chars.begin() + std::min(offset + descripti F438 on.chi_size(), static_cast<uint64_t>(lhs_chars.size()));
const auto chi = ::chi(begin, end, c);
const auto& transition = description.transition(state, chi);

Expand Down Expand Up @@ -753,7 +767,7 @@ bool edit_distance(
}

const auto begin = lhs_chars.begin() + ptrdiff_t(offset);
const auto end = std::min(begin + ptrdiff_t(description.chi_size()), lhs_chars.end());
const auto end = lhs_chars.begin() + std::min(offset + description.chi_size(), static_cast<uint64_t>(lhs_chars.size()));
const auto chi = ::chi(begin, end, c);
const auto& transition = description.transition(state, chi);

Expand Down
1 change: 1 addition & 0 deletions 3rdParty/iresearch/core/utils/levenshtein_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ IRESEARCH_API parametric_description read(data_input& in);
////////////////////////////////////////////////////////////////////////////////
IRESEARCH_API automaton make_levenshtein_automaton(
const parametric_description& description,
const bytes_ref& prefix,
const bytes_ref& target);

////////////////////////////////////////////////////////////////////////////////
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
v3.8.1 (XXXX-XX-XX)
-------------------

* Add prefix parameter to LEVENSHTEIN_MATCH function in ArangoSearch
(DEVSUPP-753).

* Bug-fix: Pregel WCC algorithm could yield incorrect results if a part of the
connected component was only attached via OUTBOUND edges.
The underlying algorithm is now modified to properly retain INBOUND edges for
Expand Down
2 changes: 1 addition & 1 deletion arangod/Aql/AqlFunctionFeature.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ void AqlFunctionFeature::addStringFunctions() {
add({"ENCODE_URI_COMPONENT", ".", flags, &Functions::EncodeURIComponent});
add({"SOUNDEX", ".", flags, &Functions::Soundex});
add({"LEVENSHTEIN_DISTANCE", ".,.", flags, &Functions::LevenshteinDistance});
add({"LEVENSHTEIN_MATCH", ".,.,.|.,.", flags, &Functions::LevenshteinMatch}); // (attribute, target, max distance, [include transpositions, max terms])
add({"LEVENSHTEIN_MATCH", ".,.,.|.,.,.", flags, &Functions::LevenshteinMatch}); // (attribute, target, max distance, [include transpositions, max terms, prefix])
add({"NGRAM_MATCH", ".,.|.,.", flags, &Functions::NgramMatch}); // (attribute, target, [threshold, analyzer]) OR (attribute, target, [analyzer])
add({"NGRAM_SIMILARITY", ".,.,.", flags, &Functions::NgramSimilarity}); // (attribute, target, ngram size)
add({"NGRAM_POSITIONAL_SIMILARITY", ".,.,.", flags, &Functions::NgramPositionalSimilarity}); // (attribute, target, ngram size)
Expand Down
17 changes: 15 additions & 2 deletions arangod/IResearch/IResearchFilterFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2706,7 +2706,7 @@ Result getLevenshteinArguments(char const* funcName, bool isFilter, QueryContext
}
auto const argc = ElementTraits::numMembers(args);
constexpr size_t min = 3 - First;
constexpr size_t max = 5 - First;
constexpr size_t max = 6 - First;
if (argc < min || argc > max) {
return error::invalidArgsCount<error::Range<min, max>>(funcName).withError(
[&](result::Error& err) { err.appendErrorMessage(errorSuffix); });
Expand Down Expand Up @@ -2795,16 +2795,29 @@ Result getLevenshteinArguments(char const* funcName, bool isFilter, QueryContext
}
}

// optional (5 - First) argument defines prefix for target
irs::string_ref prefix = irs::string_ref::EMPTY;
if (5 - First < argc) {
res = ElementTraits::evaluateArg(prefix, tmpValue, funcName,
args, 5 - First, isFilter, ctx);

if (res.fail()) {
return res.withError(
[&](result::Error& err) { err.appendErrorMessage(errorSuffix); });
}
}

irs::assign(opts.term, irs::ref_cast<irs::byte_type>(target));
opts.with_transpositions = withTranspositions;
opts.max_distance = static_cast<irs::byte_type>(maxDistance);
opts.max_terms = static_cast<size_t>(maxTerms);
opts.provider = &arangodb::iresearch::getParametricDescription;
irs::assign(opts.prefix, irs::ref_cast<irs::byte_type>(prefix));

return {};
}

// {<LEVENSHTEIN_MATCH>: '[' <term>, <max_distance> [, <with_transpositions> ] ']'}
// {<LEVENSHTEIN_MATCH>: '[' <term>, <max_distance> [, <with_transpositions>, <prefix> ] ']'}
Result fromFuncPhraseLevenshteinMatch(char const* funcName,
size_t const funcArgumentPosition,
char const* subFuncName,
Expand Down
2 changes: 1 addition & 1 deletion tests/IResearch/IResearchFeature-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1577,7 +1577,7 @@ TEST_F(IResearchFeatureTest, test_start) {
{ "MIN_MATCH", { ".,.|.+", FunctionType::FILTER } },
{ "LIKE", { ".,.|.", FunctionType::FILTER } },
{ "NGRAM_MATCH", { ".,.|.,.", FunctionType::FILTER } },
{ "LEVENSHTEIN_MATCH", { ".,.,.|.,.", FunctionType::FILTER } },
{ "LEVENSHTEIN_MATCH", { ".,.,.|.,.,.", FunctionType::FILTER } },
{ "IN_RANGE", { ".,.,.,.,.", FunctionType::FILTER } },
{ "GEO_IN_RANGE", { ".,.,.,.|.,.,.", FunctionType::FILTER } },
{ "GEO_CONTAINS", { ".,.", FunctionType::FILTER } },
Expand Down
Loading
0