8000 Search-161 cherry-picked · arangodb/arangodb@6a2e430 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6a2e430

Browse files
committed
Search-161 cherry-picked
1 parent d370ced commit 6a2e430

15 files changed

+621
-42
lines changed

3rdParty/iresearch/core/search/levenshtein_filter.cpp

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ template<typename Invalid, typename Term, typename Levenshtein>
5858
inline auto executeLevenshtein(byte_type max_distance,
5959
by_edit_distance_options::pdp_f provider,
6060
bool with_transpositions,
61-
Invalid inv, Term t, Levenshtein lev) {
61+
const bytes_ref prefix, const bytes_ref target,
62+
Invalid&& inv, Term&& t, Levenshtein&& lev) {
63+
6264
if (!provider) {
6365
provider = &default_pdp;
6466
}
@@ -74,7 +76,7 @@ inline auto executeLevenshtein(byte_type max_distance,
7476
return inv();
7577
}
7678

77-
return lev(d);
79+
return lev(d, prefix, target);
7880
}
7981

8082
template<typename StatesType>
@@ -186,17 +188,19 @@ template<typename Collector>
186188
bool collect_terms(
187189
const index_reader& index,
188190
const string_ref& field,
191+
const bytes_ref& prefix,
189192
const bytes_ref& term,
190193
const parametric_description& d,
191194
Collector& collector) {
192-
const auto acceptor = make_levenshtein_automaton(d, term);
195+
const auto acceptor = make_levenshtein_automaton(d, prefix, term);
193196

194197
if (!validate(acceptor)) {
195198
return false;
196199
}
197200

198201
auto matcher = make_automaton_matcher(acceptor);
199-
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(term)));
202+
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(prefix)) +
203+
uint32_t(utf8_utils::utf8_length(term)));
200204
const byte_type max_distance = d.max_distance() + 1;
201205

202206
for (auto& segment : index) {
@@ -217,6 +221,7 @@ filter::prepared::ptr prepare_levenshtein_filter(
217221
const order::prepared& order,
218222
boost_t boost,
219223
const string_ref& field,
224+
const bytes_ref& prefix,
220225
const bytes_ref& term,
221226
size_t terms_limit,
222227
const parametric_description& d) {
@@ -228,13 +233,13 @@ filter::prepared::ptr prepare_levenshtein_filter(
228233
all_terms_collector<decltype(states)> term_collector(states, field_stats, term_stats);
229234
term_collector.stat_index(0); // aggregate stats from different terms
230235

231-
if (!collect_terms(index, field, term, d, term_collector)) {
236+
if (!collect_terms(index, field, prefix, term, d, term_collector)) {
232237
return filter::prepared::empty();
233238
}
234239
} else {
235240
top_terms_collector term_collector(terms_limit, field_stats);
236241

237-
if (!collect_terms(index, field, term, d, term_collector)) {
242+
if (!collect_terms(index, field, prefix, term, d, term_collector)) {
238243
return filter::prepared::empty();
239244
}
240245

@@ -267,23 +272,27 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
267272

268273
/*static*/ field_visitor by_edit_distance::visitor(const options_type::filter_options& opts) {
269274
return executeLevenshtein(
270-
opts.max_distance, opts.provider, opts.with_transpositions,
275+
opts.max_distance, opts.provider, opts.with_transpositions, opts.prefix, opts.term,
271276
[]() -> field_visitor {
272277
return [](const sub_reader&, const term_reader&, filter_visitor&){};
273278
},
274279
[&opts]() -> field_visitor {
275280
// must copy term as it may point to temporary string
276-
return [term = opts.term](
281+
return [target = opts.prefix + opts.term](
277282
const sub_reader& segment,
278283
const term_reader& field,
279284
filter_visitor& visitor){
280-
return by_term::visit(segment, field, term, visitor);
285+
return by_term::visit(segment, field, target, visitor);
281286
};
282287
},
283-
[&opts](const parametric_description& d) -> field_visitor {
288+
[](const parametric_description& d,
289+
const bytes_ref prefix,
290+
const bytes_ref term) -> field_visitor {
284291
struct automaton_context : util::noncopyable {
285-
automaton_context(const parametric_description& d, const bytes_ref& term)
286-
: acceptor(make_levenshtein_automaton(d, term)),
292+
automaton_context(const parametric_description& d,
293+
const bytes_ref& prefix,
294+
const bytes_ref& term)
295+
: acceptor(make_levenshtein_automaton(d, prefix, term)),
287296
matcher(make_automaton_matcher(acceptor)) {
288297
}
289298

@@ -292,13 +301,14 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
292301
};
293302

294303
// FIXME
295-
auto ctx = memory::make_shared<automaton_context>(d, opts.term);
304+
auto ctx = memory::make_shared<automaton_context>(d, prefix, term);
296305

297306
if (!validate(ctx->acceptor)) {
298307
return [](const sub_reader&, const term_reader&, filter_visitor&){};
299308
}
300309

301-
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(opts.term)));
310+
const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(prefix) +
311+
utf8_utils::utf8_length(term)));
302312
const byte_type max_distance = d.max_distance() + 1;
303313

304314
return [ctx, utf8_term_size, max_distance](
@@ -321,18 +331,30 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
321331
size_t scored_terms_limit,
322332
byte_type max_distance,
323333
options_type::pdp_f provider,
324-
bool with_transpositions) {
334+
bool with_transpositions,
335+
const bytes_ref& prefix) {
336+
325337
return executeLevenshtein(
326-
max_distance, provider, with_transpositions,
338+
max_distance, provider, with_transpositions, prefix, term,
327339
[]() -> filter::prepared::ptr {
328340
return prepared::empty();
329341
},
330-
[&index, &order, boost, &field, &term]() -> filter::prepared::ptr {
331-
return by_term::prepare(index, order, boost, field, term);
342+
[&index, &order, boost, &field, &prefix, &term]() -> filter::prepared::ptr {
343+
if (!prefix.empty() && !term.empty()) {
344+
bstring target;
345+
target.reserve(prefix.size() + term.size());
346+
target += prefix;
347+
target += term;
348+
return by_term::prepare(index, order, boost, field, target);
349+
}
350+
351+
return by_term::prepare(index, order, boost, field, prefix.empty() ? term : prefix);
332352
},
333-
[&field, &term, scored_terms_limit, &index, &order, boost](
334-
const parametric_description& d) -> filter::prepared::ptr {
335-
return prepare_levenshtein_filter(index, order, boost, field, term, scored_terms_limit, d);
353+
[&field, scored_terms_limit, &index, &order, boost](
354+
const parametric_description& d,
355+
const bytes_ref prefix,
356+
const bytes_ref term) -> filter::prepared::ptr {
357+
return prepare_levenshtein_filter(index, order, boost, field, prefix, term, scored_terms_limit, d);
336358
}
337359
);
338360
}

3rdParty/iresearch/core/search/levenshtein_filter.hpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ struct IRESEARCH_API by_edit_distance_filter_options {
4343
//////////////////////////////////////////////////////////////////////////////
4444
bstring term;
4545

46+
//////////////////////////////////////////////////////////////////////////////
47+
/// @brief match this number of characters from the beginning of the
48+
/// target regardless of edit distance
49+
//////////////////////////////////////////////////////////////////////////////
50+
bstring prefix;
51+
4652
//////////////////////////////////////////////////////////////////////////////
4753
/// @returns current parametric description provider, nullptr - use default
4854
/// @note since creation of parametric description is expensive operation,
@@ -68,9 +74,9 @@ struct IRESEARCH_API by_edit_distance_filter_options {
6874
}
6975

7076
size_t hash() const noexcept {
71-
return hash_combine(std::hash<bool>()(with_transpositions),
72-
hash_combine(std::hash<bstring>()(term),
73-
std::hash<byte_type>()(max_distance)));
77+
const auto hash0 = hash_combine(std::hash<bool>()(with_transpositions), std::hash<bstring>()(term));
78+
const auto hash1 = hash_combine(std::hash<byte_type>()(max_distance), std::hash<bstring>()(prefix));
79+
return hash_combine(hash0, hash1);
7480
}
7581
};
7682

@@ -115,7 +121,8 @@ class IRESEARCH_API by_edit_distance final
115121
size_t terms_limit,
116122
byte_type max_distance,
117123
options_type::pdp_f provider,
118-
bool with_transpositions);
124+
bool with_transpositions,
125+
const bytes_ref& prefix);
119126

120127
static field_visitor visitor(
121128
const options_type::filter_options& options);
@@ -130,7 +137,7 @@ class IRESEARCH_API by_edit_distance final
130137
return prepare(index, order, this->boost()*boost,
131138
field(), options().term, options().max_terms,
132139
options().max_distance, options().provider,
133-
options().with_transpositions);
140+
options().with_transpositions, options().prefix);
134141
}
135142
}; // by_edit_distance
136143

3rdParty/iresearch/core/search/phrase_filter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ struct prepare : util::noncopyable {
151151
index, order, boost, field, part.term,
152152
0, // collect all terms
153153
part.max_distance, part.provider,
154-
part.with_transpositions);
154+
part.with_transpositions, part.prefix);
155155
}
156156

157157
result_type operator()(const by_terms_options& /*part*/) const {

3rdParty/iresearch/core/utils/levenshtein_utils.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,7 @@ parametric_description read(data_input& in) {
582582

583583
automaton make_levenshtein_automaton(
584584
const parametric_description& description,
585+
const bytes_ref& prefix,
585586
const bytes_ref& target) {
586587
assert(description);
587588

@@ -613,18 +614,31 @@ automaton make_levenshtein_automaton(
613614
UNUSED(invalid_state);
614615

615616
// initial state
616-
a.SetStart(a.AddState());
617+
auto start_state = a.AddState();
618+
a.SetStart(start_state);
619+
620+
auto begin = prefix.begin();
621+
auto end = prefix.end();
622+
decltype(start_state) to;
623+
for (; begin != end; ) {
624+
const byte_type* next = utf8_utils::next(begin, end);
625+
to = a.AddState();
626+
auto dist = std::distance(begin, next);
627+
irs::utf8_emplace_arc(a, start_state, bytes_ref(begin, dist), to);
628+
start_state = to;
629+
begin = next;
630+
}
617631

618632
// check if start state is final
619633
const auto distance = description.distance(1, utf8_size);
620634

621635
if (distance <= description.max_distance()) {
622-
a.SetFinal(a.Start(), {true, distance});
636+
a.SetFinal(start_state, {true, distance});
623637
}
624638

625639
// state stack
626640
std::vector<state> stack;
627-
stack.emplace_back(0, 1, a.Start()); // 0 offset, 1st parametric state, initial automaton state
641+
stack.emplace_back(0, 1, start_state); // 0 offset, 1st parametric state, initial automaton state
628642

629643
std::vector<std::pair<bytes_ref, automaton::StateId>> arcs;
630644
arcs.resize(utf8_size); // allocate space for max possible number of arcs
@@ -714,7 +728,7 @@ size_t edit_distance(const parametric_description& description,
714728
const auto c = utf8_utils::next(rhs);
715729

716730
const auto begin = lhs_chars.begin() + ptrdiff_t(offset);
717-
const auto end = std::min(begin + ptrdiff_t(description.chi_size()), lhs_chars.end());
731+
const auto end = lhs_chars.begin() + std::min(offset + description.chi_size(), static_cast<uint64_t>(lhs_chars.size()));
718732
const auto chi = ::chi(begin, end, c);
719733
const auto& transition = description.transition(state, chi);
720734

@@ -753,7 +767,7 @@ bool edit_distance(
753767
}
754768

755769
const auto begin = lhs_chars.begin() + ptrdiff_t(offset);
756-
const auto end = std::min(begin + ptrdiff_t(description.chi_size()), lhs_chars.end());
770+
const auto end = lhs_chars.begin() + std::min(offset + description.chi_size(), static_cast<uint64_t>(lhs_chars.size()));
757771
const auto chi = ::chi(begin, end, c);
758772
const auto& transition = description.transition(state, chi);
759773

3rdParty/iresearch/core/utils/levenshtein_utils.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ IRESEARCH_API parametric_description read(data_input& in);
244244
////////////////////////////////////////////////////////////////////////////////
245245
IRESEARCH_API automaton make_levenshtein_automaton(
246246
const parametric_description& description,
247+
const bytes_ref& prefix,
247248
const bytes_ref& target);
248249

249250
////////////////////////////////////////////////////////////////////////////////

CHANGELOG

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,35 @@
11
v3.8.0 (XXXX-XX-XX)
22
-------------------
33

4+
* Add prefix parameter to LEVENSHTEIN_MATCH function in ArangoSearch
5+
6+
* Removed redirects from /_admin/cluster* to /_admin/cluster/*. Adjusted
7+
internal requests to use the new url.
8+
9+
* Fix display of running and slow queries in web UI when there are multiple
10+
coordinators. Previously, the display order of queries was undefined, which
11+
could lead to queries from one coordinator being display on top once and
12+
then the queries from another. That made using this UI harder than necessary.
13+
14+
Now queries are sorted for display, according to their query IDs.
15+
16+
* Updated ArangoDB Starter to 0.15.1-preview-2.
17+
18+
* Fix potential stack overflow when executing large queries. This is
19+
achieved by splitting the callstack and moving part of the execution
20+
to a separate thread. The number of execution nodes after which such
21+
a callstack split should be performed can be configured via the query
22+
option `maxNodesPerCallstack` and the command line option
23+
`--query.max-nodes-per-callstack`; the default is 250.
24+
25+
* Bug-Fix: Pregel WCC algorithm could yield incorrect results if a
26+
part of the connected component was only attached via OUTBOUND edges.
27+
The underlying algorithm is now modified to properly retain INBOUND
28+
edges for the runtime of the execution. This uses more RAM for the
29+
algorithm but guarantees correctness.
30+
31+
* Updated ArangoDB Starter to 0.15.1-preview-1.
32+
433
* Updated arangosync to 2.4.0.
534

635
* For cluster AQL queries, let the coordinator determine the query id to be used

arangod/Aql/AqlFunctionFeature.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ void AqlFunctionFeature::addStringFunctions() {
195195
add({"ENCODE_URI_COMPONENT", ".", flags, &Functions::EncodeURIComponent});
196196
add({"SOUNDEX", ".", flags, &Functions::Soundex});
197197
add({"LEVENSHTEIN_DISTANCE", ".,.", flags, &Functions::LevenshteinDistance});
198-
add({"LEVENSHTEIN_MATCH", ".,.,.|.,.", flags, &Functions::LevenshteinMatch}); // (attribute, target, max distance, [include transpositions, max terms])
198+
add({"LEVENSHTEIN_MATCH", ".,.,.|.,.,.", flags, &Functions::LevenshteinMatch}); // (attribute, target, max distance, [include transpositions, max terms, prefix])
199199
add({"NGRAM_MATCH", ".,.|.,.", flags, &Functions::NgramMatch}); // (attribute, target, [threshold, analyzer]) OR (attribute, target, [analyzer])
200200
add({"NGRAM_SIMILARITY", ".,.,.", flags, &Functions::NgramSimilarity}); // (attribute, target, ngram size)
201201
add({"NGRAM_POSITIONAL_SIMILARITY", ".,.,.", flags, &Functions::NgramPositionalSimilarity}); // (attribute, target, ngram size)

arangod/IResearch/IResearchFilterFactory.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2706,7 +2706,7 @@ Result getLevenshteinArguments(char const* funcName, bool isFilter, QueryContext
27062706
}
27072707
auto const argc = ElementTraits::numMembers(args);
27082708
constexpr size_t min = 3 - First;
2709-
constexpr size_t max = 5 - First;
2709+
constexpr size_t max = 6 - First;
27102710
if (argc < min || argc > max) {
27112711
return error::invalidArgsCount<error::Range<min, max>>(funcName).withError(
27122712
[&](result::Error& err) { err.appendErrorMessage(errorSuffix); });
@@ -2795,16 +2795,29 @@ Result getLevenshteinArguments(char const* funcName, bool isFilter, QueryContext
27952795
}
27962796
}
27972797

2798+
// optional (5 - First) argument defines prefix for target
2799+
irs::string_ref prefix = irs::string_ref::EMPTY;
2800+
if (5 - First < argc) {
2801+
res = ElementTraits::evaluateArg(prefix, tmpValue, funcName,
2802+
args, 5 - First, isFilter, ctx);
2803+
2804+
if (res.fail()) {
2805+
return res.withError(
2806+
[&](result::Error& err) { err.appendErrorMessage(errorSuffix); });
2807+
}
2808+
}
2809+
27982810
irs::assign(opts.term, irs::ref_cast<irs::byte_type>(target));
27992811
opts.with_transpositions = withTranspositions;
28002812
opts.max_distance = static_cast<irs::byte_type>(maxDistance);
28012813
opts.max_terms = static_cast<size_t>(maxTerms);
28022814
opts.provider = &arangodb::iresearch::getParametricDescription;
2815+
irs::assign(opts.prefix, irs::ref_cast<irs::byte_type>(prefix));
28032816

28042817
return {};
28052818
}
28062819

2807-
// {<LEVENSHTEIN_MATCH>: '[' <term>, <max_distance> [, <with_transpositions> ] ']'}
2820+
// {<LEVENSHTEIN_MATCH>: '[' <term>, <max_distance> [, <with_transpositions>, <prefix> ] ']'}
28082821
Result fromFuncPhraseLevenshteinMatch(char const* funcName,
28092822
size_t const funcArgumentPosition,
28102823
char const* subFuncName,

0 commit comments

Comments
 (0)
0