arangodb · KVS85 · Jul 28, 2021 · Jul 1, 2021 · Jul 1, 2021 · Jul 2, 2021
diff --git a/3rdParty/iresearch/core/search/levenshtein_filter.cpp b/3rdParty/iresearch/core/search/levenshtein_filter.cpp
@@ -58,7 +58,9 @@ template<typename Invalid, typename Term, typename Levenshtein>
 inline auto executeLevenshtein(byte_type max_distance,
                                by_edit_distance_options::pdp_f provider,
                                bool with_transpositions,
-                               Invalid inv, Term t, Levenshtein lev) {
+                               const bytes_ref prefix, const bytes_ref target,
+                               Invalid&& inv, Term&& t, Levenshtein&& lev) {
+
   if (!provider) {
     provider = &default_pdp;
   }
@@ -74,7 +76,7 @@ inline auto executeLevenshtein(byte_type max_distance,
     return inv();
   }
 
-  return lev(d);
+  return lev(d, prefix, target);
 }
 
 template<typename StatesType>
@@ -186,17 +188,19 @@ template<typename Collector>
 bool collect_terms(
     const index_reader& index,
     const string_ref& field,
+    const bytes_ref& prefix,
     const bytes_ref& term,
     const parametric_description& d,
     Collector& collector) {
-  const auto acceptor = make_levenshtein_automaton(d, term);
+  const auto acceptor = make_levenshtein_automaton(d, prefix, term);
 
   if (!validate(acceptor)) {
     return false;
   }
 
   auto matcher = make_automaton_matcher(acceptor);
-  const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(term)));
+  const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(prefix)) +
+                                               uint32_t(utf8_utils::utf8_length(term)));
   const byte_type max_distance = d.max_distance() + 1;
 
   for (auto& segment : index) {
@@ -217,6 +221,7 @@ filter::prepared::ptr prepare_levenshtein_filter(
     const order::prepared& order,
     boost_t boost,
     const string_ref& field,
+    const bytes_ref& prefix,
     const bytes_ref& term,
     size_t terms_limit,
     const parametric_description& d) {
@@ -228,13 +233,13 @@ filter::prepared::ptr prepare_levenshtein_filter(
     all_terms_collector<decltype(states)> term_collector(states, field_stats, term_stats);
     term_collector.stat_index(0); // aggregate stats from different terms
 
-    if (!collect_terms(index, field, term, d, term_collector)) {
+    if (!collect_terms(index, field, prefix, term, d, term_collector)) {
       return filter::prepared::empty();
     }
   } else {
     top_terms_collector term_collector(terms_limit, field_stats);
 
-    if (!collect_terms(index, field, term, d, term_collector)) {
+    if (!collect_terms(index, field, prefix, term, d, term_collector)) {
       return filter::prepared::empty();
     }
 
@@ -267,23 +272,27 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
 
 /*static*/ field_visitor by_edit_distance::visitor(const options_type::filter_options& opts) {
   return executeLevenshtein(
-    opts.max_distance, opts.provider, opts.with_transpositions,
+    opts.max_distance, opts.provider, opts.with_transpositions, opts.prefix, opts.term, 
     []() -> field_visitor {
       return [](const sub_reader&, const term_reader&, filter_visitor&){};
     },
     [&opts]() -> field_visitor {
       // must copy term as it may point to temporary string
-      return [term = opts.term](
+      return [target = opts.prefix + opts.term](
           const sub_reader& segment,
           const term_reader& field,
           filter_visitor& visitor){
-        return by_term::visit(segment, field, term, visitor);
+        return by_term::visit(segment, field, target, visitor);
       };
     },
-    [&opts](const parametric_description& d) -> field_visitor {
+    [](const parametric_description& d,
+            const bytes_ref prefix,
+            const bytes_ref term) -> field_visitor {
       struct automaton_context : util::noncopyable {
-        automaton_context(const parametric_description& d, const bytes_ref& term)
-          : acceptor(make_levenshtein_automaton(d, term)),
+        automaton_context(const parametric_description& d,
+                          const bytes_ref& prefix,
+                          const bytes_ref& term)
+          : acceptor(make_levenshtein_automaton(d, prefix, term)),
             matcher(make_automaton_matcher(acceptor)) {
         }
 
@@ -292,13 +301,14 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
       };
 
       // FIXME
-      auto ctx = memory::make_shared<automaton_context>(d, opts.term);
+      auto ctx = memory::make_shared<automaton_context>(d, prefix, term);
 
       if (!validate(ctx->acceptor)) {
         return [](const sub_reader&, const term_reader&, filter_visitor&){};
       }
 
-      const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(opts.term)));
+      const uint32_t utf8_term_size = std::max(1U, uint32_t(utf8_utils::utf8_length(prefix) +
+                                                            utf8_utils::utf8_length(term)));
       const byte_type max_distance = d.max_distance() + 1;
 
       return [ctx, utf8_term_size, max_distance](
@@ -321,18 +331,30 @@ DEFINE_FACTORY_DEFAULT(by_edit_distance)
     size_t scored_terms_limit,
     byte_type max_distance,
     options_type::pdp_f provider,
-    bool with_transpositions) {
+    bool with_transpositions,
+    const bytes_ref& prefix) {
+
   return executeLevenshtein(
-    max_distance, provider, with_transpositions,
+    max_distance, provider, with_transpositions, prefix, term,
     []() -> filter::prepared::ptr {
       return prepared::empty();
     },
-    [&index, &order, boost, &field, &term]() -> filter::prepared::ptr {
-      return by_term::prepare(index, order, boost, field, term);
+    [&index, &order, boost, &field, &prefix, &term]() -> filter::prepared::ptr {
+      if (!prefix.empty() && !term.empty()) {
+        bstring target;
+        target.reserve(prefix.size() + term.size());
+        target += prefix;
+        target += term;
+        return by_term::prepare(index, order, boost, field, target);
+      }
+
+      return by_term::prepare(index, order, boost, field, prefix.empty() ? term : prefix);
     },
-    [&field, &term, scored_terms_limit, &
8000
amp;index, &order, boost](
-        const parametric_description& d) -> filter::prepared::ptr {
-      return prepare_levenshtein_filter(index, order, boost, field, term, scored_terms_limit, d);
+    [&field, scored_terms_limit, &index, &order, boost](
+        const parametric_description& d,
+        const bytes_ref prefix,
+        const bytes_ref term) -> filter::prepared::ptr {
+      return prepare_levenshtein_filter(index, order, boost, field, prefix, term, scored_terms_limit, d);
     }
   );
 }

diff --git a/3rdParty/iresearch/core/search/levenshtein_filter.hpp b/3rdParty/iresearch/core/search/levenshtein_filter.hpp
@@ -43,6 +43,12 @@ struct IRESEARCH_API by_edit_distance_filter_options {
   //////////////////////////////////////////////////////////////////////////////
   bstring term;
 
+  //////////////////////////////////////////////////////////////////////////////
+  /// @brief match this number of characters from the beginning of the
+  ///        target regardless of edit distance
+  //////////////////////////////////////////////////////////////////////////////
+  bstring prefix;
+
   //////////////////////////////////////////////////////////////////////////////
   /// @returns current parametric description provider, nullptr - use default
   /// @note since creation of parametric description is expensive operation,
@@ -68,9 +74,9 @@ struct IRESEARCH_API by_edit_distance_filter_options {
   }
 
   size_t hash() const noexcept {
-    return hash_combine(std::hash<bool>()(with_transpositions),
-                        hash_combine(std::hash<bstring>()(term),
-                                     std::hash<byte_type>()(max_distance)));
+    const auto hash0 = hash_combine(std::hash<bool>()(with_transpositions), std::hash<bstring>()(term));
+    const auto hash1 = hash_combine(std::hash<byte_type>()(max_distance), std::hash<bstring>()(prefix));
+    return hash_combine(hash0, hash1); 
   }
 };
 
@@ -115,7 +121,8 @@ class IRESEARCH_API by_edit_distance final
     size_t terms_limit,
     byte_type max_distance,
     options_type::pdp_f provider,
-    bool with_transpositions);
+    bool with_transpositions,
+    const bytes_ref& prefix);
 
   static field_visitor visitor(
     const options_type::filter_options& options);
@@ -130,7 +137,7 @@ class IRESEARCH_API by_edit_distance final
     return prepare(index, order, this->boost()*boost,
                    field(), options().term, options().max_terms,
                    options().max_distance, options().provider,
-                   options().with_transpositions);
+                   options().with_transpositions, options().prefix);
   }
 }; // by_edit_distance
 

diff --git a/3rdParty/iresearch/core/search/phrase_filter.cpp b/3rdParty/iresearch/core/search/phrase_filter.cpp
@@ -151,7 +151,7 @@ struct prepare : util::noncopyable {
       index, order, boost, field, part.term,
       0, // collect all terms
       part.max_distance, part.provider,
-      part.with_transpositions);
+      part.with_transpositions, part.prefix);
   }
 
   result_type operator()(const by_terms_options& /*part*/) const {

diff --git a/3rdParty/iresearch/core/utils/levenshtein_utils.cpp b/3rdParty/iresearch/core/utils/levenshtein_utils.cpp
@@ -582,6 +582,7 @@ parametric_description read(data_input& in) {
 
 automaton make_levenshtein_automaton(
     const parametric_description& description,
+    const bytes_ref& prefix,
     const bytes_ref& target) {
   assert(description);
 
@@ -613,18 +614,31 @@ automaton make_levenshtein_automaton(
   UNUSED(invalid_state);
 
   // initial state
-  a.SetStart(a.AddState());
+  auto start_state = a.AddState();
+  a.SetStart(start_state);
+
+  auto begin = prefix.begin();
+  auto end = prefix.end();
+  decltype(start_state) to;
+  for (; begin != end; ) {
+    const byte_type* next = utf8_utils::next(begin, end);
+    to = a.AddState();
+    auto dist = std::distance(begin, next);
+    irs::utf8_emplace_arc(a, start_state, bytes_ref(begin, dist), to);
+    start_state = to;
+    begin = next;
+  }
 
   // check if start state is final
   const auto distance = description.distance(1, utf8_size);
 
   if (distance <= description.max_distance()) {
-    a.SetFinal(a.Start(), {true, distance});
+    a.SetFinal(start_state, {true, distance});
   }
 
   // state stack
   std::vector<state> stack;
-  stack.emplace_back(0, 1, a.Start());  // 0 offset, 1st parametric state, initial automaton state
+  stack.emplace_back(0, 1, start_state);  // 0 offset, 1st parametric state, initial automaton state
 
   std::vector<std::pair<bytes_ref, automaton::StateId>> arcs;
   arcs.resize(utf8_size); // allocate space for max possible number of arcs
@@ -714,7 +728,7 @@ size_t edit_distance(const parametric_description& description,
     const auto c = utf8_utils::next(rhs);
 
     const auto begin = lhs_chars.begin() + ptrdiff_t(offset);
-    const auto end = std::min(begin + ptrdiff_t(description.chi_size()), lhs_chars.end());
+    const auto end = lhs_chars.begin() + std::min(offset + descripti
F438
on.chi_size(), static_cast<uint64_t>(lhs_chars.size()));
     const auto chi = ::chi(begin, end, c);
     const auto& transition = description.transition(state, chi);
 
@@ -753,7 +767,7 @@ bool edit_distance(
     }
 
     const auto begin = lhs_chars.begin() + ptrdiff_t(offset);
-    const auto end = std::min(begin + ptrdiff_t(description.chi_size()), lhs_chars.end());
+    const auto end = lhs_chars.begin() + std::min(offset + description.chi_size(), static_cast<uint64_t>(lhs_chars.size()));
     const auto chi = ::chi(begin, end, c);
     const auto& transition = description.transition(state, chi);
 

diff --git a/3rdParty/iresearch/core/utils/levenshtein_utils.hpp b/3rdParty/iresearch/core/utils/levenshtein_utils.hpp
@@ -244,6 +244,7 @@ IRESEARCH_API parametric_description read(data_input& in);
 ////////////////////////////////////////////////////////////////////////////////
 IRESEARCH_API automaton make_levenshtein_automaton(
   const parametric_description& description,
+  const bytes_ref& prefix,
   const bytes_ref& target);
 
 ////////////////////////////////////////////////////////////////////////////////

diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,9 @@
 v3.8.1 (XXXX-XX-XX)
 -------------------
 
+* Add prefix parameter to LEVENSHTEIN_MATCH function in ArangoSearch
+  (DEVSUPP-753).
+
 * Bug-fix: Pregel WCC algorithm could yield incorrect results if a part of the
   connected component was only attached via OUTBOUND edges.
   The underlying algorithm is now modified to properly retain INBOUND edges for

diff --git a/arangod/Aql/AqlFunctionFeature.cpp b/arangod/Aql/AqlFunctionFeature.cpp
@@ -195,7 +195,7 @@ void AqlFunctionFeature::addStringFunctions() {
   add({"ENCODE_URI_COMPONENT", ".", flags, &Functions::EncodeURIComponent});
   add({"SOUNDEX", ".", flags, &Functions::Soundex});
   add({"LEVENSHTEIN_DISTANCE", ".,.", flags, &Functions::LevenshteinDistance});
-  add({"LEVENSHTEIN_MATCH", ".,.,.|.,.", flags, &Functions::LevenshteinMatch});  // (attribute, target, max distance, [include transpositions, max terms])
+  add({"LEVENSHTEIN_MATCH", ".,.,.|.,.,.", flags, &Functions::LevenshteinMatch});  // (attribute, target, max distance, [include transpositions, max terms, prefix])
   add({"NGRAM_MATCH", ".,.|.,.", flags, &Functions::NgramMatch}); // (attribute, target, [threshold, analyzer]) OR (attribute, target, [analyzer])
   add({"NGRAM_SIMILARITY", ".,.,.", flags, &Functions::NgramSimilarity}); // (attribute, target, ngram size)
   add({"NGRAM_POSITIONAL_SIMILARITY", ".,.,.", flags, &Functions::NgramPositionalSimilarity}); // (attribute, target, ngram size)

diff --git a/arangod/IResearch/IResearchFilterFactory.cpp b/arangod/IResearch/IResearchFilterFactory.cpp
@@ -2706,7 +2706,7 @@ Result getLevenshteinArguments(char const* funcName, bool isFilter, QueryContext
   }
   auto const argc = ElementTraits::numMembers(args);
   constexpr size_t min = 3 - First;
-  constexpr size_t max = 5 - First;
+  constexpr size_t max = 6 - First;
   if (argc < min || argc > max) {
     return error::invalidArgsCount<error::Range<min, max>>(funcName).withError(
         [&](result::Error& err) { err.appendErrorMessage(errorSuffix); });
@@ -2795,16 +2795,29 @@ Result getLevenshteinArguments(char const* funcName, bool isFilter, QueryContext
     }
   }
 
+  // optional (5 - First) argument defines prefix for target
+  irs::string_ref prefix = irs::string_ref::EMPTY;
+  if (5 - First < argc) {
+    res = ElementTraits::evaluateArg(prefix, tmpValue, funcName,
+                                     args, 5 - First, isFilter, ctx);
+
+    if (res.fail()) {
+      return res.withError(
+          [&](result::Error& err) { err.appendErrorMessage(errorSuffix); });
+    }
+  }
+
   irs::assign(opts.term, irs::ref_cast<irs::byte_type>(target));
   opts.with_transpositions = withTranspositions;
   opts.max_distance = static_cast<irs::byte_type>(maxDistance);
   opts.max_terms = static_cast<size_t>(maxTerms);
   opts.provider = &arangodb::iresearch::getParametricDescription;
+  irs::assign(opts.prefix, irs::ref_cast<irs::byte_type>(prefix));
 
   return {};
 }
 
-// {<LEVENSHTEIN_MATCH>: '[' <term>, <max_distance> [, <with_transpositions> ] ']'}
+// {<LEVENSHTEIN_MATCH>: '[' <term>, <max_distance> [, <with_transpositions>, <prefix> ] ']'}
 Result fromFuncPhraseLevenshteinMatch(char const* funcName,
                                       size_t const funcArgumentPosition,
                                       char const* subFuncName,

diff --git a/tests/IResearch/IResearchFeature-test.cpp b/tests/IResearch/IResearchFeature-test.cpp
@@ -1577,7 +1577,7 @@ TEST_F(IResearchFeatureTest, test_start) {
     { "MIN_MATCH", { ".,.|.+", FunctionType::FILTER } },
     { "LIKE", { ".,.|.", FunctionType::FILTER } },
     { "NGRAM_MATCH", { ".,.|.,.", FunctionType::FILTER } },
-    { "LEVENSHTEIN_MATCH", { ".,.,.|.,.", FunctionType::FILTER } },
+    { "LEVENSHTEIN_MATCH", { ".,.,.|.,.,.", FunctionType::FILTER } },
     { "IN_RANGE", { ".,.,.,.,.", FunctionType::FILTER } },
     { "GEO_IN_RANGE", { ".,.,.,.|.,.,.", FunctionType::FILTER } },
     { "GEO_CONTAINS", { ".,.", FunctionType::FILTER } },