60
60
#include " Pregel/Worker.h"
61
61
#include " IResearch/VelocyPackHelper.h"
62
62
#include " IResearch/IResearchPDP.h"
63
+ #include " IResearch/IResearchAnalyzerFeature.h"
64
+ #include " IResearch/IResearchFilterFactory.h"
63
65
#include " Random/UniformCharacter.h"
64
66
#include " Rest/Version.h"
67
+ #include " RestServer/SystemDatabaseFeature.h"
65
68
#include " Ssl/SslInterface.h"
66
69
#include " Transaction/Context.h"
67
70
#include " Transaction/Helpers.h"
73
76
#include " VocBase/KeyGenerator.h"
74
77
#include " VocBase/LogicalCollection.h"
75
78
79
+ #include " analysis/token_attributes.hpp"
76
80
#include " utils/levenshtein_utils.hpp"
81
+ #include " utils/ngram_match_utils.hpp"
82
+
77
83
78
84
#include < boost/uuid/uuid.hpp>
79
85
#include < boost/uuid/uuid_generators.hpp>
@@ -1578,6 +1584,195 @@ AqlValue Functions::LevenshteinDistance(ExpressionContext*, transaction::Methods
1578
1584
return AqlValue (AqlValueHintInt (encoded));
1579
1585
}
1580
1586
1587
+
1588
+ namespace {
1589
+ template <bool search_semantics>
1590
+ AqlValue NgramSimilarityHelper (char const * AFN, ExpressionContext* ctx, transaction::Methods* trx,
1591
+ VPackFunctionParameters const & args) {
1592
+ if (args.size () < 3 ) {
1593
+ registerWarning (
1594
+ ctx, AFN,
1595
+ arangodb::Result{ TRI_ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH,
1596
+ " Minimum 3 arguments are expected." });
1597
+ return AqlValue (AqlValueHintNull ());
1598
+ }
1599
+
1600
+ auto const & attribute = extractFunctionParameterValue (args, 0 );
1601
+ if (ADB_UNLIKELY (!attribute.isString ())) {
1602
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1603
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1604
+ }
1605
+ auto const attributeValue = arangodb::iresearch::getStringRef (attribute.slice ());
1606
+
1607
+ auto const & target = extractFunctionParameterValue (args, 1 );
1608
+ if (ADB_UNLIKELY (!target.isString ())) {
1609
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1610
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1611
+ }
1612
+ auto const targetValue = arangodb::iresearch::getStringRef (target.slice ());
1613
+
1614
+ auto const & ngramSize = extractFunctionParameterValue (args, 2 );
1615
+ if (ADB_UNLIKELY (!ngramSize.isNumber ())) {
1616
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1617
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1618
+ }
1619
+ auto const ngramSizeValue = ngramSize.toInt64 ();
1620
+
1621
+ if (ADB_UNLIKELY (ngramSizeValue < 1 )) {
1622
+ arangodb::aql::registerWarning (ctx, AFN,
1623
+ arangodb::Result{TRI_ERROR_BAD_PARAMETER,
1624
+ " Invalid ngram size. Should be 1 or greater" });
1625
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1626
+ }
1627
+
1628
+ auto utf32Attribute = basics::StringUtils::characterCodes (attributeValue.c_str (), attributeValue.size ());
1629
+ auto utf32Target = basics::StringUtils::characterCodes (targetValue.c_str (), targetValue.size ());
1630
+
1631
+ auto const similarity =
1632
+ irs::ngram_similarity<uint32_t , search_semantics>(
1633
+ utf32Target.data (), utf32Target.size (),
1634
+ utf32Attribute.data (), utf32Attribute.size (),
1635
+ ngramSizeValue);
1636
+ return AqlValue (AqlValueHintDouble (similarity));
1637
+ }
1638
+ }
1639
+
1640
+ // / Executes NGRAM_SIMILARITY based on binary ngram similarity
1641
+ AqlValue Functions::NgramSimilarity (ExpressionContext* ctx, transaction::Methods* trx,
1642
+ VPackFunctionParameters const & args) {
1643
+ static char const * AFN = " NGRAM_SIMILARITY" ;
1644
+ return NgramSimilarityHelper<true >(AFN, ctx, trx, args);
1645
+ }
1646
+
1647
+ // / Executes NGRAM_POSITIONAL_SIMILARITY based on positional ngram similarity
1648
+ AqlValue Functions::NgramPositionalSimilarity (ExpressionContext* ctx, transaction::Methods* trx,
1649
+ VPackFunctionParameters const & args) {
1650
+ static char const * AFN = " NGRAM_POSITIONAL_SIMILARITY" ;
1651
+ return NgramSimilarityHelper<false >(AFN, ctx, trx, args);
1652
+ }
1653
+
1654
+ // / Executes NGRAM_MATCH based on binary ngram similarity
1655
+ AqlValue Functions::NgramMatch (ExpressionContext* ctx, transaction::Methods* trx,
1656
+ VPackFunctionParameters const & args) {
1657
+ static char const * AFN = " NGRAM_MATCH" ;
1658
+
1659
+ auto const argc = args.size ();
1660
+
1661
+ if (argc < 3 ) { // for const evaluation we need analyzer to be set explicitly (we can`t access filter context)
1662
+ // but we can`t set analyzer as mandatory in function AQL signature - this will break SEARCH
1663
+ registerWarning (
1664
+ ctx, AFN,
1665
+ arangodb::Result{ TRI_ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH,
1666
+ " Minimum 3 arguments are expected." });
1667
+ return AqlValue (AqlValueHintNull ());
1668
+ }
1669
+
1670
+ auto const & attribute = extractFunctionParameterValue (args, 0 );
1671
+ if (ADB_UNLIKELY (!attribute.isString ())) {
1672
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1673
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1674
+ }
1675
+ auto const attributeValue = iresearch::getStringRef (attribute.slice ());
1676
+
1677
+ auto const & target = extractFunctionParameterValue (args, 1 );
1678
+ if (ADB_UNLIKELY (!target.isString ())) {
1679
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1680
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1681
+ }
1682
+ auto const targetValue = iresearch::getStringRef (target.slice ());
1683
+
1684
+ auto threshold = arangodb::iresearch::FilterConstants::DefaultNgramMatchThreshold;
1685
+ size_t analyzerPosition = 2 ;
1686
+ if (argc > 3 ) {// 4 args given. 3rd is threshold
1687
+ auto const & thresholdArg = extractFunctionParameterValue (args, 2 );
1688
+ analyzerPosition = 3 ;
1689
+ if (ADB_UNLIKELY (!thresholdArg.isNumber ())) {
1690
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1691
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1692
+ }
1693
+ threshold = thresholdArg.toDouble ();
1694
+ if (threshold <= 0 || threshold > 1 ) {
1695
+ arangodb::aql::registerWarning (
1696
+ ctx, AFN,
1697
+ arangodb::Result{TRI_ERROR_BAD_PARAMETER, " Threshold must be between 0 and 1" });
1698
+ }
1699
+ }
1700
+
1701
+ auto const & analyzerArg = extractFunctionParameterValue (args, analyzerPosition);
1702
+ if (ADB_UNLIKELY (!analyzerArg.isString ())) {
1703
+ arangodb::aql::registerInvalidArgumentWarning (ctx, AFN);
1704
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1705
+ }
1706
+ if (ADB_UNLIKELY (nullptr == trx)) {
1707
+ arangodb::aql::registerWarning (ctx, AFN, TRI_ERROR_INTERNAL);
1708
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1709
+ }
1710
+ auto const analyzerId = arangodb::iresearch::getStringRef (analyzerArg.slice ());
1711
+ auto & server = trx->vocbase ().server ();
1712
+ if (!server.hasFeature <iresearch::IResearchAnalyzerFeature>()) {
1713
+ arangodb::aql::registerWarning (ctx, AFN, TRI_ERROR_INTERNAL);
1714
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1715
+ }
1716
+ auto & analyzerFeature = server.getFeature <iresearch::IResearchAnalyzerFeature>();
1717
+
1718
+ auto sysVocbase = server.hasFeature <arangodb::SystemDatabaseFeature>()
1719
+ ? server.getFeature <arangodb::SystemDatabaseFeature>().use ()
1720
+ : nullptr ;
1721
+
1722
+ if (ADB_UNLIKELY (nullptr == sysVocbase)) {
1723
+ arangodb::aql::registerWarning (ctx, AFN, TRI_ERROR_INTERNAL);
1724
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1725
+ }
1726
+ auto analyzer = analyzerFeature.get (analyzerId, trx->vocbase (), *sysVocbase);
1727
+ if (!analyzer) {
1728
+ arangodb::aql::registerWarning (
1729
+ ctx, AFN,
1730
+ arangodb::Result{ TRI_ERROR_BAD_PARAMETER, " Unable to load requested analyzer" });
1731
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintNull{} };
1732
+ }
1733
+
1734
+ auto analyzerImpl = analyzer->get ();
1735
+ TRI_ASSERT (analyzerImpl);
1736
+ irs::term_attribute const & token = *analyzerImpl->attributes ().get <irs::term_attribute>();
1737
+
1738
+ std::vector<irs::bstring> attrNgrams;
1739
+ analyzerImpl->reset (attributeValue);
1740
+ while (analyzerImpl->next ()) {
1741
+ attrNgrams.push_back (token.value ());
1742
+ }
1743
+
1744
+ std::vector<irs::bstring> targetNgrams;
1745
+ analyzerImpl->reset (targetValue);
1746
+ while (analyzerImpl->next ()) {
1747
+ targetNgrams.push_back (token.value ());
1748
+ }
1749
+
1750
+ // consider only non empty ngrams sets. As no ngrams emitted - means no data in index = no match
1751
+ if (!targetNgrams.empty () && !attrNgrams.empty ()) {
1752
+ size_t thresholdMatches = (size_t )std::ceil ((float_t )targetNgrams.size () * threshold);
1753
+ size_t d = 0 ; // will store upper-left cell value for current cache row
1754
+ std::vector<size_t > cache (attrNgrams.size () + 1 , 0 );
1755
+ for (auto const & targetNgram : targetNgrams) {
1756
+ size_t s_ngram_idx = 1 ;
1757
+ for (; s_ngram_idx <= attrNgrams.size (); ++s_ngram_idx) {
1758
+ size_t curMatches = d + (size_t )(attrNgrams[s_ngram_idx - 1 ] == targetNgram);
1759
+ if (curMatches >= thresholdMatches) {
1760
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{true } };
1761
+ }
1762
+ auto tmp = cache[s_ngram_idx];
1763
+ cache[s_ngram_idx] =
1764
+ std::max (
1765
+ std::max (cache[s_ngram_idx - 1 ],
1766
+ cache[s_ngram_idx]),
1767
+ curMatches);
1768
+ d = tmp;
1769
+ }
1770
+ }
1771
+ }
1772
+ return arangodb::aql::AqlValue{ arangodb::aql::AqlValueHintBool{false } };
1773
+ }
1774
+
1775
+
1581
1776
// / Executes LEVENSHTEIN_MATCH
1582
1777
AqlValue Functions::LevenshteinMatch (ExpressionContext* ctx, transaction::Methods* trx,
1583
1778
VPackFunctionParameters const & args) {
@@ -6560,7 +6755,7 @@ AqlValue Functions::ReplaceNth(ExpressionContext* expressionContext, transaction
6560
6755
registerInvalidArgumentWarning (expressionContext, AFN);
6561
6756
return AqlValue (AqlValueHintNull ());
6562
6757
}
6563
-
6758
+
6564
6759
if (offset.isNull (true )) {
6565
6760
THROW_ARANGO_EXCEPTION_PARAMS (TRI_ERROR_QUERY_FUNCTION_ARGUMENT_TYPE_MISMATCH, AFN);
6566
6761
}
@@ -6580,7 +6775,7 @@ AqlValue Functions::ReplaceNth(ExpressionContext* expressionContext, transaction
6580
6775
AqlValueMaterializer materializer (trx);
6581
6776
VPackSlice arraySlice = materializer.slice (baseArray, false );
6582
6777
VPackSlice replaceValue = materializer.slice (newValue, false );
6583
-
6778
+
6584
6779
transaction::BuilderLeaser builder (trx);
6585
6780
builder->openArray ();
6586
6781
0 commit comments