8000 Feature/search 213 optimize levenshtein startswith by Dronplane · Pull Request #14744 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Feature/search 213 optimize levenshtein startswith #14744

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Sep 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
64944e5
group conjunction nodes
Dronplane Sep 3, 2021
c6d0ab3
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 6, 2021
4664cc4
wip
Dronplane Sep 6, 2021
b545d56
tests
Dronplane Sep 6, 2021
b026fa4
tests
Dronplane Sep 7, 2021
9dfe7b9
tests
Dronplane Sep 7, 2021
0abc986
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 7, 2021
5b8eda2
moar tests
Dronplane Sep 7, 2021
6f11b5d
add option to explicitly block merging
Dronplane Sep 7, 2021
337cd90
moar corner cases
Dronplane Sep 8, 2021
17ea36a
empty case optimization
Dronplane Sep 8, 2021
55c038c
optimization
Dronplane Sep 8, 2021
229c972
cleanup
Dronplane Sep 8, 2021
f5d8684
fix startsWith boost
Dronplane Sep 8, 2021
af37ee0
cleanup
Dronplane Sep 8, 2021
eae7ebb
wip
Dronplane Sep 8, 2021
26386d4
wip
Dronplane Sep 8, 2021
37be60d
comments
Dronplane Sep 8, 2021
bcd2093
add node serialization tests
Dronplane Sep 8, 2021
d649660
Update arangod/IResearch/IResearchFilterFactory.cpp
Dronplane Sep 9, 2021
4cd8331
adress review comments
Dronplane Sep 9, 2021
a969139
address review comments
Dronplane Sep 9, 2021
0e6b6cf
add js test
Dronplane Sep 9, 2021
0c6d9ca
adress review comments
Dronplane Sep 9, 2021
09c678c
Update CHANGELOG
Dronplane Sep 9, 2021
1708a0a
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 9, 2021
d234738
Merge branch 'devel' into feature/SEARCH-213-Optimize-Levenshtein-sta…
Dronplane Sep 10, 2021
f89293e
None -> NONE renaming
Dronplane Sep 10, 2021
340087d
Merge branch 'feature/SEARCH-213-Optimize-Levenshtein-startswith' of …
Dronplane Sep 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
devel
-----

* Added ArangoSearch condition optimization: STARTS_WITH is merged
with LEVENSHTEIN_MATCH if used in the same AND node and field name and prefix
matches.

* Added multidimensional indexes which can be used to efficiently intersect
multiple range queries. They are currently limited to IEEE-754 double values.
Given documents of the form {x: 12.9, y: -284.0, z: 0.02} one can define a
Expand Down
9 changes: 6 additions & 3 deletions arangod/Aql/IResearchViewExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ IResearchViewExecutorInfos::IResearchViewExecutorInfos(
IResearchViewStoredValues const& storedValues, ExecutionPlan const& plan,
Variable const& outVariable, aql::AstNode const& filterCondition,
std::pair<bool, bool> volatility, IResearchViewExecutorInfos::VarInfoMap const& varInfoMap,
int depth, IResearchViewNode::ViewValuesRegisters&& outNonMaterializedViewRegs, iresearch::CountApproximate countApproximate)
int depth, IResearchViewNode::ViewValuesRegisters&& outNonMaterializedViewRegs,
iresearch::CountApproximate countApproximate, iresearch::FilterOptimization filterOptimization)
: _scoreRegisters(std::move(scoreRegisters)),
_reader(std::move(reader)),
_query(query),
Expand All @@ -167,7 +168,8 @@ IResearchViewExecutorInfos::IResearchViewExecutorInfos(
_depth(depth),
_outNonMaterializedViewRegs(std::move(outNonMaterializedViewRegs)),
_countApproximate(countApproximate),
_filterConditionIsEmpty(::filterConditionIsEmpty(&_filterCondition)) {
_filterConditionIsEmpty(::filterConditionIsEmpty(&_filterCondition)),
_filterOptimization(filterOptimization) {
TRI_ASSERT(_reader != nullptr);
std::tie(_documentOutReg, _collectionPointerReg) = std::visit(
overload{[&](aql::IResearchViewExecutorInfos::MaterializeRegisters regs) {
Expand Down Expand Up @@ -599,7 +601,8 @@ void IResearchViewExecutorBase<Impl, Traits>::reset() {

ExecutionPlan const* plan = &infos().plan();
iresearch::QueryContext const queryCtx = { &_trx, plan, plan->getAst(),
&_ctx, _reader.get(), &infos().outVariable()};
&_ctx, _reader.get(), &infos().outVariable(),
infos().filterOptimization()};

if (infos().volatileFilter() || !_isInitialized) { // `_volatileSort` implies `_volatileFilter`
irs::Or root;
Expand Down
4 changes: 3 additions & 1 deletion arangod/Aql/IResearchViewExecutor.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class IResearchViewExecutorInfos {
Variable const& outVariable, aql::AstNode const& filterCondition,
std::pair<bool, bool> volatility, VarInfoMap const& varInfoMap, int depth,
iresearch::IResearchViewNode::ViewValuesRegisters&& outNonMaterializedViewRegs,
iresearch::CountApproximate);
iresearch::CountApproximate, iresearch::FilterOptimization);

auto getDocumentRegister() const noexcept -> RegisterId;
auto getCollectionRegister() const noexcept -> RegisterId;
Expand All @@ -111,6 +111,7 @@ class IResearchViewExecutorInfos {
bool volatileSort() const noexcept;
bool volatileFilter() const noexcept;
iresearch::CountApproximate countApproximate() const noexcept { return _countApproximate; }
iresearch::FilterOptimization filterOptimization() const noexcept { return _filterOptimization; }

// first - sort
// second - number of sort conditions to take into account
Expand All @@ -137,6 +138,7 @@ class IResearchViewExecutorInfos {
iresearch::IResearchViewNode::ViewValuesRegisters _outNonMaterializedViewRegs;
iresearch::CountApproximate _countApproximate;
bool _filterConditionIsEmpty;
iresearch::FilterOptimization _filterOptimization;
}; // IResearchViewExecutorInfos

class IResearchViewStats {
Expand Down
32 changes: 31 additions & 1 deletion arangod/Aql/IResearchViewNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ void toVelocyPack(velocypack::Builder& builder, IResearchViewNode::Options const
}
}
}

if (options.filterOptimization != arangodb::iresearch::FilterOptimization::MAX) {
builder.add("filterOptimization", VPackValue(static_cast<int64_t>(options.filterOptimization)));
}
}

bool fromVelocyPack(velocypack::Slice optionsSlice, IResearchViewNode::Options& options) {
Expand Down Expand Up @@ -284,6 +288,19 @@ bool fromVelocyPack(velocypack::Slice optionsSlice, IResearchViewNode::Options&
}
}

// filterOptimization
{
auto const optionSlice = optionsSlice.get("filterOptimization");
if (!optionSlice.isNone()) {
// 'filterOptimization' is optional. Missing means MAX
if (!optionSlice.isNumber()) {
return false;
}
options.filterOptimization =
static_cast<arangodb::iresearch::FilterOptimization>(optionSlice.getNumber<int>());
}
}

return true;
}

Expand Down Expand Up @@ -438,6 +455,18 @@ bool parseOptions(aql::QueryContext& query, LogicalView const& view, aql::AstNod
}
options.conditionOptimization = conditionTypeIt->second;
return true;
}},
// cppcheck-suppress constStatement
{"filterOptimization", [](aql::QueryContext& /*query*/, LogicalView const& /*view*/,
aql::AstNode const& value,
IResearchViewNode::Options& options, std::string& error) {
if (!value.isValueType(aql::VALUE_TYPE_INT)) {
error = "int value expected for option 'filterOptimization'";
return false;
}
options.filterOptimization =
static_cast<arangodb::iresearch::FilterOptimization>(value.getIntValue());
return true;
}}};

if (!optionsNode) {
Expand Down Expand Up @@ -1660,7 +1689,8 @@ std::unique_ptr<aql::ExecutionBlock> IResearchViewNode::createBlock(
getRegisterPlan()->varInfo, // ??? do we need this?
getDepth(),
std::move(outNonMaterializedViewRegs),
_options.countApproximate};
_options.countApproximate,
filterOptimization()};

return std::make_tuple(materializeType, std::move(executorInfos), std::move(registerInfos));
};
Expand Down
13 changes: 13 additions & 0 deletions arangod/Aql/IResearchViewNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "Aql/ExecutionNodeId.h"
#include "Aql/LateMaterializedOptimizerRulesCommon.h"
#include "Aql/types.h"
#include "IResearch/IResearchFilterOptimization.h"
#include "IResearch/IResearchOrderFactory.h"
#include "IResearch/IResearchViewSort.h"
#include "IResearch/IResearchViewStoredValues.h"
Expand Down Expand Up @@ -90,6 +91,9 @@ class IResearchViewNode final : public arangodb::aql::ExecutionNode {

/// @brief skipAll method for view
CountApproximate countApproximate{CountApproximate::Exact};

/// @brief iresearch filters optimization level
FilterOptimization filterOptimization {FilterOptimization::MAX};
}; // Options

IResearchViewNode(aql::ExecutionPlan& plan, aql::ExecutionNodeId id, TRI_vocbase_t& vocbase,
Expand Down Expand Up @@ -151,6 +155,15 @@ class IResearchViewNode final : public arangodb::aql::ExecutionNode {
/// @brief return the scorers to pass to the view
std::vector<Scorer> const& scorers() const noexcept { return _scorers; }

// we could merge if it is allowed in general and there are no scores - as changing
// filters will affect score and we will lose backward compatibility
FilterOptimization filterOptimization() const noexcept {
if (!_scorers.empty()) {
return FilterOptimization::NONE;
}
return _options.filterOptimization;
}

/// @brief set the scorers to pass to the view
void scorers(std::vector<Scorer>&& scorers) noexcept {
_scorers = std::move(scorers);
Expand Down
60 changes: 56 additions & 4 deletions arangod/Aql/IResearchViewOptimizerRules.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "IResearchViewOptimizerRules.h"

#include "Aql/AqlFunctionFeature.h"
#include "Aql/CalculationNodeVarFinder.h"
#include "Aql/ClusterNodes.h"
#include "Aql/Condition.h"
Expand Down Expand Up @@ -77,6 +78,50 @@ inline IResearchViewStoredValues const& storedValues(arangodb::LogicalView const
return viewImpl.storedValues();
}

/// @brief Moves all FCALLs in every AND node to the bottom of the AND node
/// @param condition SEARCH condition node
/// @param starts_with Function to push
void pushFuncToBack(arangodb::aql::AstNode& condition, arangodb::aql::Function const* starts_with) {
auto numMembers = condition.numMembers();
for (size_t memberIdx = 0; memberIdx < numMembers; ++memberIdx) {
auto current = condition.getMemberUnchecked(memberIdx);
TRI_ASSERT(current);
auto const numAndMembers = current->numMembers();
if (current->type == arangodb::aql::AstNodeType::NODE_TYPE_OPERATOR_NARY_AND &&
numAndMembers > 1) {
size_t movePoint = numAndMembers - 1;
do {
auto candidate = current->getMemberUnchecked(movePoint);
if (candidate->type != arangodb::aql::AstNodeType::NODE_TYPE_FCALL ||
static_cast<arangodb::aql::Function const*>(candidate->getData()) != starts_with) {
break;
}
} while ((--movePoint) != 0);
for (size_t andMemberIdx = 0; andMemberIdx < movePoint; ++andMemberIdx) {
auto andMember = current->getMemberUnchecked(andMemberIdx);
if (andMember->type == arangodb::aql::AstNodeType::NODE_TYPE_FCALL &&
static_cast<arangodb::aql::Function const*>(andMember->getData()) == starts_with) {
TEMPORARILY_UNLOCK_NODE(current);
auto tmp = current->getMemberUnchecked(movePoint);
current->changeMember(movePoint--, andMember);
current->changeMember(andMemberIdx, tmp);
while (movePoint > andMemberIdx &&
current->getMemberUnchecked(movePoint)->type ==
arangodb::aql::AstNodeType::NODE_TYPE_FCALL &&
static_cast<arangodb::aql::Function const*>(
current->getMemberUnchecked(movePoint)->getData()) == starts_with) {
--movePoint;
}
} else {
pushFuncToBack(*andMember, starts_with);
}
}
} else {
pushFuncToBack(*current, starts_with);
}
}
}

bool addView(arangodb::LogicalView const& view, arangodb::aql::QueryContext& query) {
auto& collections = query.collections();

Expand Down Expand Up @@ -132,6 +177,13 @@ bool optimizeSearchCondition(IResearchViewNode& viewNode, arangodb::aql::QueryCo

// check filter condition if present
if (searchCondition.root()) {
if (viewNode.filterOptimization() != arangodb::iresearch::FilterOptimization::NONE) {
// we could benefit from merging STARTS_WITH and LEVENSHTEIN_MATCH
auto& server = plan.getAst()->query().vocbase().server();
auto starts_with = server.getFeature<AqlFunctionFeature>().byName("STARTS_WITH");
TRI_ASSERT(starts_with);
pushFuncToBack(*searchCondition.root(), starts_with);
}
auto filterCreated = FilterFactory::filter(
nullptr,
{ &query.trxForOptimization(), nullptr, nullptr, nullptr, nullptr, &viewNode.outVariable() },
Expand Down Expand Up @@ -728,14 +780,14 @@ void handleViewsRule(Optimizer* opt,
modified |= optimizeSort(viewNode, plan.get());
}

if (!optimizeSearchCondition(viewNode, query, *plan)) {
continue;
}

// find scorers that have to be evaluated by a view
scorerReplacer.extract(viewNode.outVariable(), scorers);
viewNode.scorers(std::move(scorers));

if (!optimizeSearchCondition(viewNode, query, *plan)) {
continue;
}

modified = true;
}
keepReplacementViewVariables(calcNodes, viewNodes);
Expand Down
1 change: 1 addition & 0 deletions arangod/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ add_library(arango_iresearch
IResearch/IResearchExpressionContext.cpp IResearch/IResearchExpressionContext.h
IResearch/IResearchFeature.cpp IResearch/IResearchFeature.h
IResearch/IResearchFilterFactory.cpp IResearch/IResearchFilterFactory.h
IResearch/IResearchFilterOptimization.cpp IResearch/IResearchFilterOptimization.h
IResearch/IResearchIdentityAnalyzer.cpp IResearch/IResearchIdentityAnalyzer.h
IResearch/IResearchKludge.cpp IResearch/IResearchKludge.h
IResearch/IResearchLink.cpp IResearch/IResearchLink.h
Expand Down
3 changes: 3 additions & 0 deletions arangod/IResearch/AqlHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "Aql/AstNode.h"
#include "Aql/SortCondition.h"
#include "VelocyPackHelper.h"
#include "IResearch/IResearchFilterOptimization.h"

#include "search/sort.hpp"
#include "utils/noncopyable.hpp"
Expand Down Expand Up @@ -241,6 +242,8 @@ struct QueryContext {
aql::ExpressionContext* ctx;
irs::index_reader const* index;
aql::Variable const* ref;
/// @brief allow optimize away/modify some conditions during filter building
FilterOptimization filterOptimization {FilterOptimization::MAX};
}; // QueryContext

////////////////////////////////////////////////////////////////////////////////
Expand Down
20 changes: 16 additions & 4 deletions arangod/IResearch/IResearchFilterFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
#include "IResearch/IResearchAnalyzerFeature.h"
#include "IResearch/IResearchCommon.h"
#include "IResearch/IResearchFeature.h"
#include "IResearch/IResearchFilterOptimization.h"
#include "IResearch/IResearchKludge.h"
#include "IResearch/IResearchPDP.h"
#include "Logger/LogMacros.h"
Expand Down Expand Up @@ -241,7 +242,7 @@ Result malformedNode(aql::AstNodeType type) {
return {TRI_ERROR_BAD_PARAMETER, message};
}

} // error
} // namespace error

bool setupGeoFilter(FieldMeta::Analyzer const& a,
S2RegionTermIndexer::Options& opts) {
Expand Down Expand Up @@ -403,7 +404,7 @@ Result getAnalyzerByName(
}
auto& analyzerFeature = server.getFeature<IResearchAnalyzerFeature>();

analyzer = analyzerFeature.get(analyzerId, ctx.trx->vocbase(),
analyzer = analyzerFeature.get(analyzerId, ctx.trx->vocbase(),
ctx.trx->state()->analyzersRevision());

if (!analyzer) {
Expand Down Expand Up @@ -2084,7 +2085,6 @@ Result fromFuncAnalyzer(

shortName = arangodb::iresearch::IResearchAnalyzerFeature::normalize( // normalize
analyzerId, ctx.trx->vocbase().name(), false); // args

}

FilterContext const subFilterContext(analyzerValue, filterCtx.boost); // override analyzer
Expand Down Expand Up @@ -3570,17 +3570,29 @@ Result fromFuncStartsWith(

TRI_ASSERT(filterCtx.analyzer);
kludge::mangleField(name, filterCtx.analyzer);
filter->boost(filterCtx.boost);

// Try to optimize us away
if (!isMultiPrefix && !prefixes.empty() &&
ctx.filterOptimization != FilterOptimization::NONE &&
arangodb::iresearch::includeStartsWithInLevenshtein(filter, name,
prefixes.back().second)) {
return {};
}

if (isMultiPrefix) {
auto& minMatchFilter = filter->add<irs::Or>();
minMatchFilter.min_match_count(static_cast<size_t>(minMatchCount));
minMatchFilter.boost(filterCtx.boost);
// become a new root
filter = &minMatchFilter;
}

for (size_t i = 0, size = prefixes.size(); i < size; ++i) {
auto& prefixFilter = filter->add<irs::by_prefix>();
if (!isMultiPrefix) {
TRI_ASSERT(prefixes.size() == 1);
prefixFilter.boost(filterCtx.boost);
}
if (i + 1 < size) {
*prefixFilter.mutable_field() = name;
} else {
Expand Down
2 changes: 2 additions & 0 deletions arangod/IResearch/IResearchFilterFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

#pragma once

#include "IResearchFilterOptimization.h"

#include "VocBase/voc-types.h"

#include "search/filter.hpp"
Expand Down
Loading
0