8000 Feature/ngram similarity function by Dronplane · Pull Request #11276 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Feature/ngram similarity function #11276

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 16, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension 8000

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Implemented similarity functions
  • Loading branch information
Dronplane committed Mar 10, 2020
commit 5f104d8505ac6f34232dbc0d4386103c40d8c8d4
138 changes: 138 additions & 0 deletions 3rdParty/iresearch/core/utils/ngram_match_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2020 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Andrei Lobov
////////////////////////////////////////////////////////////////////////////////

#ifndef IRESEARCH_NGRAM_MATCH_UTILS_H
#define IRESEARCH_NGRAM_MATCH_UTILS_H

#include "shared.hpp"
#include "utf8_utils.hpp"
#include <vector>

NS_ROOT

////////////////////////////////////////////////////////////////////////////////
/// Evaluates ngram similarity between the specified strings based on
/// "N-gram similarity and distance" by Grzegorz Kondrak
/// http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
/// Could operate in two forms depending on search_semantics.
/// If search_semantics is false then positional ngram similarity is used,
/// binary ngram similarity is used otherwise. Also setting search_semantics to
/// true disables normalizing resulting similarity by length of longest string and
/// result is evaluated based strictly on target length (search-like semantics).
/// If search_semantics is false there is no difference which string pass
/// as target. First characters affixing currently is not implemented
/// @param target string to seek similarity for
/// @param target_size length of target string
/// @param src string to seek similarity in
/// @param src_size length of source string
/// @param ngram_size ngram size
/// @returns similarity value
////////////////////////////////////////////////////////////////////////////////
template<typename T, bool search_semantics>
float_t ngram_similarity(const T* target, size_t target_size,
const T* src, size_t src_size,
size_t ngram_size) {
if (ngram_size == 0) {
return 0.f;
}

if /*consexpr*/ (!search_semantics) {
if (target_size > src_size) {
std::swap(target_size, src_size);
std::swap(target, src);
}
}

if (target_size < ngram_size || src_size < ngram_size) {
if /*constexpr*/ (!search_semantics) {
if (target_size == 0 && src_size == 0) {
return 1; // consider two empty strings as matched
}
const T* r = src;
size_t matched = 0;
for (const T* it = target; it != target + target_size; ) {
matched += size_t(*it == *r);
++r;
++it;
}
return float_t(matched) / float_t(src_size);
} else {
if (target_size == src_size) {
return memcmp(target, src, target_size * sizeof(T)) == 0 ? 1 : 0;
}
return 0;
}
}

const size_t t_ngram_count = target_size - ngram_size + 1;
const size_t s_ngram_count = src_size - ngram_size + 1;
const T* t_ngram_start = target;
const T* t_ngram_start_end = target + target_size - ngram_size + 1; // end() analog for target ngram start

float_t d = 0; // will store upper-left cell value for current cache row
std::vector<float_t> cache(s_ngram_count + 1, 0);

// here could be constructed source string with start characters affixing

size_t t_ngram_idx = 1;
for (; t_ngram_start != t_ngram_start_end; ++t_ngram_start, ++t_ngram_idx) {
const T* t_ngram_end = t_ngram_start + ngram_size;
const T* s_ngram_start = src;
size_t s_ngram_idx = 1;
const T* s_ngram_start_end = src + src_size - ngram_size + 1; // end() analog for src ngram start

for (; s_ngram_start != s_ngram_start_end; ++s_ngram_start, ++s_ngram_idx) {
const T* rhs_ngram_end = s_ngram_start + ngram_size;
float_t similarity = !search_semantics ? 0 : 1;
for (const T* l = t_ngram_start, *r = s_ngram_start; l != t_ngram_end && r != rhs_ngram_end; ++l, ++r) {
if /*constexpr*/ (search_semantics) {
if (*l != *r) {
similarity = 0;
break;
}
} else {
if (*l == *r) {
++similarity;
}
}
}
if /*constexpr*/ (!search_semantics) {
similarity = similarity / float_t(ngram_size);
}

auto tmp = cache[s_ngram_idx];
cache[s_ngram_idx] =
std::max(
std::max(cache[s_ngram_idx - 1],
cache[s_ngram_idx]),
d + similarity);
d = tmp;
}
}
return cache[s_ngram_count] /
float_t((!search_semantics) ? s_ngram_count : t_ngram_count);
}

NS_END


#endif // IRESEARCH_NGRAM_MATCH_UTILS_H
4 changes: 3 additions & 1 deletion arangod/Aql/AqlFunctionFeature.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,9 @@ void AqlFunctionFeature::addStringFunctions() {
add({"SOUNDEX", ".", flags, &Functions::Soundex});
add({"LEVENSHTEIN_DISTANCE", ".,.", flags, &Functions::LevenshteinDistance});
add({"LEVENSHTEIN_MATCH", ".,.,.|.", flags, &Functions::LevenshteinMatch}); // (attribute, target, max distance, [include transpositions])

// add({"NGRAM_MATCH", ".,.,.|.", flags, &Functions::NgramMatch}); // (attribute, target, [threshold])
add({"NGRAM_SIMILARITY", ".,.,.", flags, &Functions::NgramSimilarity}); // (attribute, target, ngram size)
add({"NGRAM_POSITIONAL_SIMILARITY", ".,.,.", flags, &Functions::NgramPositionalSimilarity}); // (attribute, target, ngram size)
// special flags:
add({"RANDOM_TOKEN", ".", Function::makeFlags(FF::CanRunOnDBServer),
&Functions::RandomToken}); // not deterministic and not cacheable
Expand Down
83 changes: 83 additions & 0 deletions arangod/Aql/Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
#include "VocBase/LogicalCollection.h"

#include "utils/levenshtein_utils.hpp"
#include "utils/ngram_match_utils.hpp"

#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
Expand Down Expand Up @@ -1578,6 +1579,88 @@ AqlValue Functions::LevenshteinDistance(ExpressionContext*, transaction::Methods
return AqlValue(AqlValueHintInt(encoded));
}


namespace {
AqlValue NgramSimilarityHelper(char const* AFN, bool search_semantics, ExpressionContext* ctx, transaction::Methods* trx,
VPackFunctionParameters const& args) {

auto const& attribute = extractFunctionParameterValue(args, 0);
if (ADB_UNLIKELY(!attribute.isString())) {
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
}
auto const attributeValue = attribute.isString() ?
arangodb::iresearch::getStringRef(attribute.slice()) : irs::string_ref::EMPTY;

auto const& target = extractFunctionParameterValue(args, 1);
if (ADB_UNLIKELY(!target.isString())) {
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
}
auto const targetValue = target.isString() ?
arangodb::iresearch::getStringRef(target.slice()) : irs::string_ref::EMPTY;

auto const& ngramSize = extractFunctionParameterValue(args, 2);
if (ADB_UNLIKELY(!ngramSize.isNumber())) {
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
}
auto const ngramSizeValue = ngramSize.toInt64();

if (ADB_UNLIKELY(ngramSizeValue < 1)) {
arangodb::aql::registerWarning(ctx, AFN,
arangodb::Result{TRI_ERROR_BAD_PARAMETER,
"Invalid ngram size. Should be 1 or greater"});
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
}

auto utf32Attribute = basics::StringUtils::characterCodes(attributeValue.c_str(), attributeValue.size());
auto utf32Target = basics::StringUtils::characterCodes(targetValue.c_str(), targetValue.size());

auto const similarity = search_semantics ?
irs::ngram_similarity<uint32_t, true>(
utf32Target.data(), utf32Target.size(),
utf32Attribute.data(), utf32Attribute.size(),
ngramSizeValue) :
irs::ngram_similarity<uint32_t, false>(
utf32Target.data(), utf32Target.size(),
utf32Attribute.data(), utf32Attribute.size(),
ngramSizeValue);
return AqlValue(AqlValueHintDouble(similarity));
}
}

/// Executes NGRAM_SIMILARITY based on binary ngram similarity
AqlValue Functions::NgramSimilarity(ExpressionContext* ctx, transaction::Methods* trx,
VPackFunctionParameters const& args) {
static char const* AFN = "NGRAM_SIMILARITY";
return NgramSimilarityHelper(AFN, true, ctx, trx, args);
//double threshold{ 0.7 };
//if (args.size() > 2) {
// auto const& userThreshold = extractFunctionParameterValue(args, 2);
// if (ADB_UNLIKELY(!userThreshold.isNumber())) {
// arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
// return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
// }
// threshold = userThreshold.toDouble();
// if (ADB_UNLIKELY(threshold <= 0 || threshold > 1)) {
// LOG_TOPIC("2d81c", WARN, Logger::AQL)
// << AFN << " AQL function: invalid threshold value. Value should be between 0 and 1";
// registerWarning(ctx, AFN, TRI_ERROR_BAD_PARAMETER);
// return AqlValue(AqlValueHintNull());
// }
//}

}

/// Executes NGRAM_POSITIONAL_SIMILARITY based on positional ngram similarity
AqlValue Functions::NgramPositionalSimilarity(ExpressionContext* ctx, transaction::Methods* trx,
VPackFunctionParameters const& args) {
static char const* AFN = "NGRAM_POSITIONAL_SIMILARITY";
return NgramSimilarityHelper(AFN, false, ctx, trx, args);
}


/// Executes LEVENSHTEIN_MATCH
AqlValue Functions::LevenshteinMatch(ExpressionContext* ctx, transaction::Methods* trx,
VPackFunctionParameters const& args) {
Expand Down
5 changes: 5 additions & 0 deletions arangod/Aql/Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ struct Functions {
static AqlValue LevenshteinMatch(arangodb::aql::ExpressionContext*,
transaction::Methods*,
VPackFunctionParameters const&);
static AqlValue NgramSimilarity(ExpressionContext*, transaction::Methods*,
VPackFunctionParameters const&);
static AqlValue NgramPositionalSimilarity(ExpressionContext* ctx,
transaction::Methods*,
VPackFunctionParameters const&);
// Date
static AqlValue DateNow(arangodb::aql::ExpressionContext*,
transaction::Methods*, VPackFunctionParameters const&);
Expand Down
0