Feature/ngram similarity function by Dronplane · Pull Request #11276 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Feature/ngram similarity function #11276

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 16, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
added NGRAM_MATCH implementation
  • Loading branch information
Dronplane committed Mar 11, 2020
commit d4e15c15d9adc41071ca767cb7ae89f151d38fcf
134 changes: 128 additions & 6 deletions arangod/Aql/Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,10 @@
#include "Pregel/Worker.h"
#include "IResearch/VelocyPackHelper.h"
#include "IResearch/IResearchPDP.h"
#include "IResearch/IResearchAnalyzerFeature.h"
#include "Random/UniformCharacter.h"
#include "Rest/Version.h"
#include "RestServer/SystemDatabaseFeature.h"
#include "Ssl/SslInterface.h"
#include "Transaction/Context.h"
#include "Transaction/Helpers.h"
Expand All @@ -73,9 +75,11 @@
#include "VocBase/KeyGenerator.h"
#include "VocBase/LogicalCollection.h"

#include "analysis/token_attributes.hpp"
#include "utils/levenshtein_utils.hpp"
#include "utils/ngram_match_utils.hpp"


#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
Expand Down Expand Up @@ -1590,16 +1594,14 @@ namespace {
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
}
auto const attributeValue = attribute.isString() ?
arangodb::iresearch::getStringRef(attribute.slice()) : irs::string_ref::EMPTY;
auto const attributeValue = arangodb::iresearch::getStringRef(attribute.slice());

auto const& target = extractFunctionParameterValue(args, 1);
if (ADB_UNLIKELY(!target.isString())) {
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
}
auto const targetValue = target.isString() ?
arangodb::iresearch::getStringRef(target.slice()) : irs::string_ref::EMPTY;
auto const targetValue = arangodb::iresearch::getStringRef(target.slice());

auto const& ngramSize = extractFunctionParameterValue(args, 2);
if (ADB_UNLIKELY(!ngramSize.isNumber())) {
Expand Down Expand Up @@ -1641,9 +1643,129 @@ AqlValue Functions::NgramPositionalSimilarity(ExpressionContext* ctx, transactio
return NgramSimilarityHelper<false>(AFN, ctx, trx, args);
}

/// Executes NGRAM_MATCH based on binary ngram similarity.
///
/// Accepted argument layouts:
///   (attribute, target, analyzer)
///   (attribute, target, threshold, analyzer)
///
/// Both `attribute` and `target` are tokenized with the requested analyzer
/// and the resulting token sequences are compared with a longest common
/// subsequence scan. The function yields `true` as soon as the number of
/// matched tokens reaches ceil(targetTokenCount * threshold), or when both
/// token sequences are empty; otherwise it yields `false`.
///
/// On any invalid input (wrong argument count or types, threshold outside
/// (0, 1], missing transaction/features, unknown analyzer) a warning is
/// registered on `ctx` and AQL `null` is returned.
AqlValue Functions::NgramMatch(ExpressionContext* ctx, transaction::Methods* trx,
                               VPackFunctionParameters const& args) {
  static char const* AFN = "NGRAM_MATCH";
  // threshold used when the caller passes only (attribute, target, analyzer)
  constexpr double kDefaultThreshold = 0.7;

  auto const argc = args.size();

  // For const evaluation the analyzer must be given explicitly (we can't
  // access the filter context here), but it can't be made mandatory in the
  // AQL function signature as that would break SEARCH.
  if (argc < 3) {
    registerWarning(
        ctx, AFN,
        arangodb::Result{ TRI_ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH,
                          "Minimum 3 arguments are expected."});
    return AqlValue(AqlValueHintNull());
  }

  auto const& attribute = extractFunctionParameterValue(args, 0);
  if (ADB_UNLIKELY(!attribute.isString())) {
    arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }
  auto const attributeValue = iresearch::getStringRef(attribute.slice());

  auto const& target = extractFunctionParameterValue(args, 1);
  if (ADB_UNLIKELY(!target.isString())) {
    arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }
  auto const targetValue = iresearch::getStringRef(target.slice());

  double threshold = kDefaultThreshold;
  size_t analyzerPosition = 2;
  if (argc > 3) {  // 4 args given: 3rd is the threshold, 4th the analyzer
    auto const& thresholdArg = extractFunctionParameterValue(args, 2);
    analyzerPosition = 3;
    if (ADB_UNLIKELY(!thresholdArg.isNumber())) {
      arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
      return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
    }
    threshold = thresholdArg.toDouble();
    if (threshold <= 0 || threshold > 1) {
      arangodb::aql::registerWarning(
          ctx, AFN,
          arangodb::Result{TRI_ERROR_BAD_PARAMETER, "Threshold must be between 0 and 1" });
      // Do not continue with an out-of-range threshold: a non-positive value
      // would make the required match count meaningless (and the conversion
      // of a negative product to size_t below is not well defined).
      return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
    }
  }

  auto const& analyzerArg = extractFunctionParameterValue(args, analyzerPosition);
  if (ADB_UNLIKELY(!analyzerArg.isString())) {
    arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }
  if (ADB_UNLIKELY(nullptr == trx)) {
    arangodb::aql::registerWarning(ctx, AFN, TRI_ERROR_INTERNAL);
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }
  auto const analyzerId = arangodb::iresearch::getStringRef(analyzerArg.slice());
  auto& server = trx->vocbase().server();
  if (!server.hasFeature<iresearch::IResearchAnalyzerFeature>()) {
    arangodb::aql::registerWarning(ctx, AFN, TRI_ERROR_INTERNAL);
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }
  auto& analyzerFeature = server.getFeature<iresearch::IResearchAnalyzerFeature>();

  auto sysVocbase = server.hasFeature<arangodb::SystemDatabaseFeature>()
                        ? server.getFeature<arangodb::SystemDatabaseFeature>().use()
                        : nullptr;
  if (ADB_UNLIKELY(nullptr == sysVocbase)) {
    arangodb::aql::registerWarning(ctx, AFN, TRI_ERROR_INTERNAL);
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }

  auto analyzer = analyzerFeature.get(analyzerId, trx->vocbase(), *sysVocbase);
  if (!analyzer) {
    arangodb::aql::registerWarning(
        ctx, AFN,
        arangodb::Result{ TRI_ERROR_BAD_PARAMETER, "Unable to load requested analyzer" });
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
  }

  auto analyzerImpl = analyzer->get();
  TRI_ASSERT(analyzerImpl);
  irs::term_attribute const& token = *analyzerImpl->attributes().get<irs::term_attribute>();

  // Runs the analyzer over `value` and copies every produced token. Copies
  // are required because the same analyzer instance is reset and reused for
  // the second input.
  auto const collectNgrams = [&analyzerImpl, &token](auto const& value) {
    std::vector<irs::bstring> ngrams;
    analyzerImpl->reset(value);
    while (analyzerImpl->next()) {
      ngrams.push_back(token.value());
    }
    return ngrams;
  };

  std::vector<irs::bstring> const attrNgrams = collectNgrams(attributeValue);
  std::vector<irs::bstring> const targetNgrams = collectNgrams(targetValue);

  if (targetNgrams.empty() && attrNgrams.empty()) {
    // two inputs without any tokens are considered matching
    return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{true} };
  }

  // number of target tokens that must be found (in order) in the attribute
  auto const thresholdMatches = static_cast<size_t>(
      std::ceil(static_cast<double>(targetNgrams.size()) * threshold));

  if (!targetNgrams.empty() && !attrNgrams.empty()) {
    // Single-row longest-common-subsequence table: before a row is processed
    // cache[i] holds the value of the previous row for the first i attribute
    // tokens; cache[0] is the always-zero boundary column.
    std::vector<size_t> cache(attrNgrams.size() + 1, 0);
    for (auto const& targetNgram : targetNgrams) {
      // upper-left (diagonal) cell for the current column; it must restart
      // from the zero boundary for every row
      size_t upperLeft = 0;
      for (size_t attrIdx = 1; attrIdx <= attrNgrams.size(); ++attrIdx) {
        // cache is 1-based (column 0 is the boundary), attrNgrams is 0-based
        size_t const curMatches =
            upperLeft + static_cast<size_t>(attrNgrams[attrIdx - 1] == targetNgram);
        if (curMatches >= thresholdMatches) {
          return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{true} };
        }
        size_t const previousRowValue = cache[attrIdx];
        cache[attrIdx] =
            std::max(std::max(cache[attrIdx - 1], cache[attrIdx]), curMatches);
        upperLeft = previousRowValue;
      }
    }
  }
  return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{false} };
}


Expand Down
4 changes: 2 additions & 2 deletions arangod/IResearch/IResearchFeature.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ arangodb::aql::AqlValue dummyFilterFunc(arangodb::aql::ExpressionContext*,
arangodb::containers::SmallVector<arangodb::aql::AqlValue> const&) {
THROW_ARANGO_EXCEPTION_MESSAGE(
TRI_ERROR_NOT_IMPLEMENTED,
"ArangoSearch filter functions EXISTS, IN_RANGE, PHRASE, NGRAM_MATCH "
"ArangoSearch filter functions EXISTS, IN_RANGE, PHRASE "
" are designed to be used only within a corresponding SEARCH statement "
"of ArangoSearch view."
" Please ensure function signature is correct.");
Expand Down Expand Up @@ -407,7 +407,7 @@ void registerFilters(arangodb::aql::AqlFunctionFeature& functions) {
addFunction(functions, { "MIN_MATCH", ".,.|.+", flags, &minMatchFunc }); // (filter expression [, filter expression, ... ], min match count)
addFunction(functions, { "BOOST", ".,.", flags, &contextFunc }); // (filter expression, boost)
addFunction(functions, { "ANALYZER", ".,.", flags, &contextFunc }); // (filter expression, analyzer)
addFunction(functions, { "NGRAM_MATCH", ".,.|.,.", flags, &dummyFilterFunc }); // (attribute, target, [threshold, analyzer]) OR (attribute, target, [analyzer])
//addFunction(functions, { "NGRAM_MATCH", ".,.|.,.", flags, &dummyFilterFunc }); // (attribute, target, [threshold, analyzer]) OR (attribute, target, [analyzer])
}

namespace {
Expand Down
0