10000 Feature/ngram similarity function (#11276) · arangodb/arangodb@2484e81 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2484e81

Browse files
authored
Feature/ngram similarity function (#11276)
* Implemented similarity functions * WIP * added NGRAM_MATCH implementation * WIP * Added tests and code cleanup * Added constants container * cleanup * Adressed review comments * more cleanup
1 parent 0a95b1e commit 2484e81

13 files changed

+1066
-13
lines changed

arangod/Aql/AqlFunctionFeature.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,9 @@ void AqlFunctionFeature::addStringFunctions() {
220220
add({"SOUNDEX", ".", flags, &Functions::Soundex});
221221
add({"LEVENSHTEIN_DISTANCE", ".,.", flags, &Functions::LevenshteinDistance});
222222
add({"LEVENSHTEIN_MATCH", ".,.,.|.", flags, &Functions::LevenshteinMatch}); // (attribute, target, max distance, [include transpositions])
223-
223+
add({"NGRAM_MATCH", ".,.|.,.", flags, &Functions::NgramMatch}); // (attribute, target, [threshold, analyzer]) OR (attribute, target, [analyzer])
224+
add({"NGRAM_SIMILARITY", ".,.,.", flags, &Functions::NgramSimilarity}); // (attribute, target, ngram size)
225+
add({"NGRAM_POSITIONAL_SIMILARITY", ".,.,.", flags, &Functions::NgramPositionalSimilarity}); // (attribute, target, ngram size)
224226
// special flags:
225227
add({"RANDOM_TOKEN", ".", Function::makeFlags(FF::CanRunOnDBServer),
226228
&Functions::RandomToken}); // not deterministic and not cacheable

arangod/Aql/Functions.cpp

Lines changed: 197 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,11 @@
6060
#include "Pregel/Worker.h"
6161
#include "IResearch/VelocyPackHelper.h"
6262
#include "IResearch/IResearchPDP.h"
63+
#include "IResearch/IResearchAnalyzerFeature.h"
64+
#include "IResearch/IResearchFilterFactory.h"
6365
#include "Random/UniformCharacter.h"
6466
#include "Rest/Version.h"
67+
#include "RestServer/SystemDatabaseFeature.h"
6568
#include "Ssl/SslInterface.h"
6669
#include "Transaction/Context.h"
6770
#include "Transaction/Helpers.h"
@@ -73,7 +76,10 @@
7376
#include "VocBase/KeyGenerator.h"
7477
#include "VocBase/LogicalCollection.h"
7578

79+
#include "analysis/token_attributes.hpp"
7680
#include "utils/levenshtein_utils.hpp"
81+
#include "utils/ngram_match_utils.hpp"
82+
7783

7884
#include <boost/uuid/uuid.hpp>
7985
#include <boost/uuid/uuid_generators.hpp>
@@ -1578,6 +1584,195 @@ AqlValue Functions::LevenshteinDistance(ExpressionContext*, transaction::Methods
15781584
return AqlValue(AqlValueHintInt(encoded));
15791585
}
15801586

1587+
1588+
namespace {
1589+
template<bool search_semantics>
1590+
AqlValue NgramSimilarityHelper(char const* AFN, ExpressionContext* ctx, transaction::Methods* trx,
1591+
VPackFunctionParameters const& args) {
1592+
if (args.size() < 3) {
1593+
registerWarning(
1594+
ctx, AFN,
1595+
arangodb::Result{ TRI_ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH,
1596+
"Minimum 3 arguments are expected." });
1597+
return AqlValue(AqlValueHintNull());
1598+
}
1599+
1600+
auto const& attribute = extractFunctionParameterValue(args, 0);
1601+
if (ADB_UNLIKELY(!attribute.isString())) {
1602+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1603+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1604+
}
1605+
auto const attributeValue = arangodb::iresearch::getStringRef(attribute.slice());
1606+
1607+
auto const& target = extractFunctionParameterValue(args, 1);
1608+
if (ADB_UNLIKELY(!target.isString())) {
1609+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1610+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1611+
}
1612+
auto const targetValue = arangodb::iresearch::getStringRef(target.slice());
1613+
1614+
auto const& ngramSize = extractFunctionParameterValue(args, 2);
1615+
if (ADB_UNLIKELY(!ngramSize.isNumber())) {
1616+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1617+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1618+
}
1619+
auto const ngramSizeValue = ngramSize.toInt64();
1620+
1621+
if (ADB_UNLIKELY(ngramSizeValue < 1)) {
1622+
arangodb::aql::registerWarning(ctx, AFN,
1623+
arangodb::Result{TRI_ERROR_BAD_PARAMETER,
1624+
"Invalid ngram size. Should be 1 or greater"});
1625+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1626+
}
1627+
1628+
auto utf32Attribute = basics::StringUtils::characterCodes(attributeValue.c_str(), attributeValue.size());
1629+
auto utf32Target = basics::StringUtils::characterCodes(targetValue.c_str(), targetValue.size());
1630+
1631+
auto const similarity =
1632+
irs::ngram_similarity<uint32_t, search_semantics>(
1633+
utf32Target.data(), utf32Target.size(),
1634+
utf32Attribute.data(), utf32Attribute.size(),
1635+
ngramSizeValue);
1636+
return AqlValue(AqlValueHintDouble(similarity));
1637+
}
1638+
}
1639+
1640+
/// Executes NGRAM_SIMILARITY based on binary ngram similarity
1641+
AqlValue Functions::NgramSimilarity(ExpressionContext* ctx, transaction::Methods* trx,
1642+
VPackFunctionParameters const& args) {
1643+
static char const* AFN = "NGRAM_SIMILARITY";
1644+
return NgramSimilarityHelper<true>(AFN, ctx, trx, args);
1645+
}
1646+
1647+
/// Executes NGRAM_POSITIONAL_SIMILARITY based on positional ngram similarity
1648+
AqlValue Functions::NgramPositionalSimilarity(ExpressionContext* ctx, transaction::Methods* trx,
1649+
VPackFunctionParameters const& args) {
1650+
static char const* AFN = "NGRAM_POSITIONAL_SIMILARITY";
1651+
return NgramSimilarityHelper<false>(AFN, ctx, trx, args);
1652+
}
1653+
1654+
/// Executes NGRAM_MATCH based on binary ngram similarity
1655+
AqlValue Functions::NgramMatch(ExpressionContext* ctx, transaction::Methods* trx,
1656+
VPackFunctionParameters const& args) {
1657+
static char const* AFN = "NGRAM_MATCH";
1658+
1659+
auto const argc = args.size();
1660+
1661+
if (argc < 3) { // for const evaluation we need analyzer to be set explicitly (we can`t access filter context)
1662+
// but we can`t set analyzer as mandatory in function AQL signature - this will break SEARCH
1663+
registerWarning(
1664+
ctx, AFN,
1665+
arangodb::Result{ TRI_ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH,
1666+
"Minimum 3 arguments are expected."});
1667+
return AqlValue(AqlValueHintNull());
1668+
}
1669+
1670+
auto const& attribute = extractFunctionParameterValue(args, 0);
1671+
if (ADB_UNLIKELY(!attribute.isString())) {
1672+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1673+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1674+
}
1675+
auto const attributeValue = iresearch::getStringRef(attribute.slice());
1676+
1677+
auto const& target = extractFunctionParameterValue(args, 1);
1678+
if (ADB_UNLIKELY(!target.isString())) {
1679+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1680+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1681+
}
1682+
auto const targetValue = iresearch::getStringRef(target.slice());
1683+
1684+
auto threshold = arangodb::iresearch::FilterConstants::DefaultNgramMatchThreshold;
1685+
size_t analyzerPosition = 2;
1686+
if (argc > 3) {// 4 args given. 3rd is threshold
1687+
auto const& thresholdArg = extractFunctionParameterValue(args, 2);
1688+
analyzerPosition = 3;
1689+
if (ADB_UNLIKELY(!thresholdArg.isNumber())) {
1690+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1691+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1692+
}
1693+
threshold = thresholdArg.toDouble();
1694+
if (threshold <= 0 || threshold > 1) {
1695+
arangodb::aql::registerWarning(
1696+
ctx, AFN,
1697+
arangodb::Result{TRI_ERROR_BAD_PARAMETER, "Threshold must be between 0 and 1" });
1698+
}
1699+
}
1700+
1701+
auto const& analyzerArg = extractFunctionParameterValue(args, analyzerPosition);
1702+
if (ADB_UNLIKELY(!analyzerArg.isString())) {
1703+
arangodb::aql::registerInvalidArgumentWarning(ctx, AFN);
1704+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1705+
}
1706+
if (ADB_UNLIKELY(nullptr == trx)) {
1707+
arangodb::aql::registerWarning(ctx, AFN, TRI_ERROR_INTERNAL);
1708+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1709+
}
1710+
auto const analyzerId = arangodb::iresearch::getStringRef(analyzerArg.slice());
1711+
auto& server = trx->vocbase().server();
1712+
if (!server.hasFeature<iresearch::IResearchAnalyzerFeature>()) {
1713+
arangodb::aql::registerWarning(ctx, AFN, TRI_ERROR_INTERNAL);
1714+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1715+
}
1716+
auto& analyzerFeature = server.getFeature<iresearch::IResearchAnalyzerFeature>();
1717+
1718+
auto sysVocbase = server.hasFeature<arangodb::SystemDatabaseFeature>()
1719+
? server.getFeature<arangodb::SystemDatabaseFeature>().use()
1720+
: nullptr;
1721+
1722+
if (ADB_UNLIKELY(nullptr == sysVocbase)) {
1723+
arangodb::aql::registerWarning(ctx, AFN, TRI_ERROR_INTERNAL);
1724+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1725+
}
1726+
auto analyzer = analyzerFeature.get(analyzerId, trx->vocbase(), *sysVocbase);
1727+
if (!analyzer) {
1728+
arangodb::aql::registerWarning(
1729+
ctx, AFN,
1730+
arangodb::Result{ TRI_ERROR_BAD_PARAMETER, "Unable to load requested analyzer" });
1731+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1732+
}
1733+
1734+
auto analyzerImpl = analyzer->get();
1735+
TRI_ASSERT(analyzerImpl);
1736+
irs::term_attribute const& token = *analyzerImpl->attributes().get<irs::term_attribute>();
1737+
1738+
std::vector<irs::bstring> attrNgrams;
1739+
analyzerImpl->reset(attributeValue);
1740+
while (analyzerImpl->next()) {
1741+
attrNgrams.push_back(token.value());
1742+
}
1743+
1744+
std::vector<irs::bstring> targetNgrams;
1745+
analyzerImpl->reset(targetValue);
1746+
while (analyzerImpl->next()) {
1747+
targetNgrams.push_back(token.value());
1748+
}
1749+
1750+
// consider only non empty ngrams sets. As no ngrams emitted - means no data in index = no match
1751+
if (!targetNgrams.empty() && !attrNgrams.empty()) {
1752+
size_t thresholdMatches = (size_t)std::ceil((float_t)targetNgrams.size() * threshold);
1753+
size_t d = 0; // will store upper-left cell value for current cache row
1754+
std::vector<size_t> cache(attrNgrams.size() + 1, 0);
1755+
for (auto const& targetNgram : targetNgrams) {
1756+
size_t s_ngram_idx = 1;
1757+
for (; s_ngram_idx <= attrNgrams.size(); ++s_ngram_idx) {
1758+
size_t curMatches = d + (size_t)(attrNgrams[s_ngram_idx - 1] == targetNgram);
1759+
if (curMatches >= thresholdMatches) {
1760+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{true} };
1761+
}
1762+
auto tmp = cache[s_ngram_idx];
1763+
cache[s_ngram_idx] =
1764+
std::max(
1765+
std::max(cache[s_ngram_idx - 1],
1766+
cache[s_ngram_idx]),
1767+
curMatches);
1768+
d = tmp;
1769+
}
1770+
}
1771+
}
1772+
return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{false} };
1773+
}
1774+
1775+
15811776
/// Executes LEVENSHTEIN_MATCH
15821777
AqlValue Functions::LevenshteinMatch(ExpressionContext* ctx, transaction::Methods* trx,
15831778
VPackFunctionParameters const& args) {
@@ -6560,7 +6755,7 @@ AqlValue Functions::ReplaceNth(ExpressionContext* expressionContext, transaction
65606755
registerInvalidArgumentWarning(expressionContext, AFN);
65616756
return AqlValue(AqlValueHintNull());
65626757
}
6563-
6758+
65646759
if (offset.isNull(true)) {
65656760
THROW_ARANGO_EXCEPTION_PARAMS(TRI_ERROR_QUERY_FUNCTION_ARGUMENT_TYPE_MISMATCH, AFN);
65666761
}
@@ -6580,7 +6775,7 @@ AqlValue Functions::ReplaceNth(ExpressionContext* expressionContext, transaction
65806775
AqlValueMaterializer materializer(trx);
65816776
VPackSlice arraySlice = materializer.slice(baseArray, false);
65826777
VPackSlice replaceValue = materializer.slice(newValue, false);
6583-
6778+
65846779
transaction::BuilderLeaser builder(trx);
65856780
builder->openArray();
65866781

arangod/Aql/Functions.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,13 @@ struct Functions {
150150
static AqlValue LevenshteinMatch(arangodb::aql::ExpressionContext*,
151151
transaction::Methods*,
152152
VPackFunctionParameters const&);
153+
static AqlValue NgramSimilarity(ExpressionContext*, transaction::Methods*,
154+
VPackFunctionParameters const&);
155+
static AqlValue NgramPositionalSimilarity(ExpressionContext* ctx,
156+
transaction::Methods*,
157+
VPackFunctionParameters const&);
158+
static AqlValue NgramMatch(ExpressionContext*, transaction::Methods*,
159+
VPackFunctionParameters const&);
153160
// Date
154161
static AqlValue DateNow(arangodb::aql::ExpressionContext*,
155162
transaction::Methods*, VPackFunctionParameters const&);

arangod/IResearch/IResearchFeature.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ arangodb::aql::AqlValue dummyFilterFunc(arangodb::aql::ExpressionContext*,
9696
arangodb::containers::SmallVector<arangodb::aql::AqlValue> const&) {
9797
THROW_ARANGO_EXCEPTION_MESSAGE(
9898
TRI_ERROR_NOT_IMPLEMENTED,
99-
"ArangoSearch filter functions EXISTS, IN_RANGE, PHRASE, NGRAM_MATCH "
99+
"ArangoSearch filter functions EXISTS, IN_RANGE, PHRASE "
100100
" are designed to be used only within a corresponding SEARCH statement "
101101
"of ArangoSearch view."
102102
" Please ensure function signature is correct.");
@@ -407,7 +407,6 @@ void registerFilters(arangodb::aql::AqlFunctionFeature& functions) {
407407
addFunction(functions, { "MIN_MATCH", ".,.|.+", flags, &minMatchFunc }); // (filter expression [, filter expression, ... ], min match count)
408408
addFunction(functions, { "BOOST", ".,.", flags, &contextFunc }); // (filter expression, boost)
409409
addFunction(functions, { "ANALYZER", ".,.", flags, &contextFunc }); // (filter expression, analyzer)
410-
addFunction(functions, { "NGRAM_MATCH", ".,.|.,.", flags, &dummyFilterFunc }); // (attribute, target, threshold, [analyzer]) OR (attribute, target, [analyzer])
411410
}
412411

413412
namespace {
@@ -584,7 +583,8 @@ bool isFilter(arangodb::aql::Function const& func) noexcept {
584583
func.implementation == &minMatchFunc ||
585584
func.implementation == &startsWithFunc ||
586585
func.implementation == &aql::Functions::LevenshteinMatch ||
587-
func.implementation == &aql::Functions::Like;
586+
func.implementation == &aql::Functions::Like ||
587+
func.implementation == &aql::Functions::NgramMatch;
588588
}
589589

590590
bool isScorer(arangodb::aql::Function const& func) noexcept {

arangod/IResearch/IResearchFilterFactory.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2237,7 +2237,7 @@ arangodb::Result getLevenshteinArguments(char const* funcName, bool isFilter,
22372237
};
22382238
}
22392239

2240-
scoringLimit = 128; // FIXME make configurable
2240+
scoringLimit = FilterConstants::DefaultScoringTermsLimit;
22412241

22422242
return {};
22432243
}
@@ -2623,7 +2623,7 @@ arangodb::Result fromFuncNgramMatch(
26232623
}
26242624
}
26252625

2626-
double_t threshold = 0.7;
2626+
auto threshold = FilterConstants::DefaultNgramMatchThreshold;
26272627
TRI_ASSERT(filterCtx.analyzer);
26282628
auto analyzerPool = filterCtx.analyzer;
26292629

@@ -2769,7 +2769,7 @@ arangodb::Result fromFuncStartsWith(
27692769
return rv;
27702770
}
27712771

2772-
size_t scoringLimit = 128; // FIXME make configurable
2772+
size_t scoringLimit = FilterConstants::DefaultScoringTermsLimit;
27732773

27742774
if (argc > 2) {
27752775
// 3rd (optional) argument defines a number of scored terms
@@ -2915,7 +2915,7 @@ arangodb::Result fromFuncLike(
29152915
return res;
29162916
}
29172917

2918-
const size_t scoringLimit = 128; // FIXME make configurable
2918+
const auto scoringLimit = FilterConstants::DefaultScoringTermsLimit;
29192919

29202920
if (filter) {
29212921
std::string name;

arangod/IResearch/IResearchFilterFactory.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ struct FilterFactory {
5454
arangodb::aql::AstNode const& node);
5555
}; // FilterFactory
5656

57+
58+
struct FilterConstants {
59+
// Defaults
60+
static constexpr size_t DefaultScoringTermsLimit { 128 };
61+
static constexpr double_t DefaultNgramMatchThreshold { 0.7 };
62+
};
63+
5764
} // namespace iresearch
5865
} // namespace arangodb
5966

0 commit comments

Comments
 (0)
0