8000 covering index wins late materialization by iurii-i-popov · Pull Request #10673 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

covering index wins late materialization #10673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 13, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
covering index wins late mater, removed redundant vars, supported sub…
…queries
  • Loading branch information
Yuriy Popov committed Dec 10, 2019
commit 018013093842691a5c39d9271c925423310b3dec
31 changes: 26 additions & 5 deletions arangod/Aql/IResearchViewNode.cpp
8000
Original file line numberDiff line number Diff line change
Expand Up @@ -1408,25 +1408,46 @@ void IResearchViewNode::OptimizationState::saveCalcNodesForViewVariables(std::ve
_nodesToChange.clear();
for (auto& node : nodesToChange) {
auto& calcNodeData = _nodesToChange[node.node];
calcNodeData.reserve(node.attrs.size());
std::transform(node.attrs.cbegin(), node.attrs.cend(), std::inserter(calcNodeData, calcNodeData.end()),
[](auto const& attrAndField) {
return attrAndField.afData;
});
}
}

IResearchViewNode::ViewVarsInfo IResearchViewNode::OptimizationState::replaceViewVariables(std::vector<aql::CalculationNode*> const& calcNodes) {
IResearchViewNode::ViewVarsInfo IResearchViewNode::OptimizationState::replaceViewVariables(std::vector<aql::CalculationNode*> const& calcNodes,
arangodb::containers::HashSet<ExecutionNode*>& toUnlink) {
TRI_ASSERT(!calcNodes.empty());
ViewVarsInfo uniqueVariables;
auto ast = calcNodes.back()->expression()->ast();
// at first use variables from simple expressions
for (auto calcNode : calcNodes) {
TRI_ASSERT(_nodesToChange.find(calcNode) != _nodesToChange.cend());
auto const& calcNodeData = _nodesToChange[calcNode];
std::transform(calcNodeData.cbegin(), calcNodeData.cend(), std::inserter(uniqueVariables, uniqueVariables.end()),
[ast](auto const& afData) {
return std::make_pair(afData.field, ViewVariable{afData.number,
TRI_ASSERT(!calcNodeData.empty());
auto const& afData = calcNodeData[0];
if (afData.parentNode == nullptr) {
TRI_ASSERT(calcNodeData.size() == 1);
// we could add one redundant variable for each field only
if (uniqueVariables.try_emplace(afData.field, ViewVariable{afData.number,
calcNode->outVariable()}).second) {
toUnlink.emplace(calcNode);
}
}
}
// create variables for complex expressions
for (auto calcNode : calcNodes) {
TRI_ASSERT(_nodesToChange.find(calcNode) != _nodesToChange.cend());
auto const& calcNodeData = _nodesToChange[calcNode];
TRI_ASSERT(!calcNodeData.empty());
for (auto const& afData : calcNodeData) {
// create a variable if necessary
if (afData.parentNode != nullptr && uniqueVariables.find(afData.field) == uniqueVariables.cend()) {
uniqueVariables.emplace(afData.field, ViewVariable{afData.number,
ast->variables()->createTemporaryVariable()});
});
}
}
}
for (auto calcNode : calcNodes) {
TRI_ASSERT(_nodesToChange.find(calcNode) != _nodesToChange.cend());
Expand Down
2 changes: 1 addition & 1 deletion arangod/Aql/IResearchViewNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ class IResearchViewNode final : public arangodb::aql::ExecutionNode {

bool canVariablesBeReplaced(aql::CalculationNode* calclulationNode) const;

IResearchViewNode::ViewVarsInfo replaceViewVariables(std::vector<aql::CalculationNode*> const& calcNodes);
IResearchViewNode::ViewVarsInfo replaceViewVariables(std::vector<aql::CalculationNode*> const& calcNodes, arangodb::containers::HashSet<ExecutionNode*>& toUnlink);

void clearViewVariables() {
_nodesToChange.clear();
Expand Down
53 changes: 43 additions & 10 deletions arangod/Aql/IResearchViewOptimizerRules.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "Aql/Function.h"
#include "Aql/IResearchViewNode.h"
#include "Aql/LateMaterializedOptimizerRulesCommon.h"
#include "Aql/NodeFinder.h"
#include "Aql/Optimizer.h"
#include "Aql/OptimizerRule.h"
#include "Aql/Query.h"
Expand Down Expand Up @@ -330,7 +331,7 @@ void lateDocumentMaterializationArangoSearchRule(Optimizer* opt,
auto addPlan = arangodb::scopeGuard([opt, &plan, &rule, &modified]() {
opt->addPlan(std::move(plan), rule, modified);
});
// arangosearch view node supports late materialization
// arangosearch view node supports late materialization
//cppcheck-suppress accessMoved
if (!plan->contains(ExecutionNode::ENUMERATE_IRESEARCH_VIEW) ||
// we need sort node to be present (without sort it will be just skip, nothing to optimize)
Expand Down Expand Up @@ -361,7 +362,7 @@ void lateDocumentMaterializationArangoSearchRule(Optimizer* opt,
auto& viewNodeState = viewNode.state();
while (current != loop) {
auto type = current->getType();
switch (current->getType()) {
switch (type) {
case ExecutionNode::SORT:
if (sortNode == nullptr) { // we need nearest to limit sort node, so keep selected if any
sortNode = current;
Expand All @@ -384,20 +385,48 @@ void lateDocumentMaterializationArangoSearchRule(Optimizer* opt,
::arangodb::containers::HashSet<Variable const*> currentUsedVars;
current->getVariablesUsedHere(currentUsedVars);
if (currentUsedVars.find(&viewNode.outVariable()) != currentUsedVars.end()) {
// Currently only calculation and subquery nodes expected to use loop variable.
// We successfully replace all references to loop variable in calculation nodes only.
// However if some other node types will begin to use loop variable
// Currently only calculation and subquery nodes expected to use a loop variable.
// We successfully replace all references to the loop variable.
// However if some other node types will begin to use the loop variable
// assertion below will be triggered and this rule should be updated.
// Subquery node is planned to be supported later.
auto invalid = true;
if (ExecutionNode::CALCULATION == type) {
switch (type) {
case ExecutionNode::CALCULATION: {
auto calcNode = ExecutionNode::castTo<CalculationNode*>(current);
if (viewNodeState.canVariablesBeReplaced(calcNode)) {
calcNodes.emplace_back(calcNode);
invalid = false;
}
} else {
TRI_ASSERT(ExecutionNode::SUBQUERY == type);
break;
}
case ExecutionNode::SUBQUERY: {
auto subqueryNode = ExecutionNode::castTo<SubqueryNode*>(current);
auto subquery = subqueryNode->getSubquery();
::arangodb::containers::SmallVector<ExecutionNode*>::allocator_type::arena_type sa;
::arangodb::containers::SmallVector<ExecutionNode*> subqueryCalcNodes{sa};
// find calculation nodes in the plan of a subquery
NodeFinder<ExecutionNode::NodeType> finder(ExecutionNode::CALCULATION, subqueryCalcNodes, true);
subquery->walk(finder);
for (auto scn : subqueryCalcNodes) {
TRI_ASSERT(scn->getType() == ExecutionNode::CALCULATION);
currentUsedVars.clear();
scn->getVariablesUsedHere(currentUsedVars);
if (currentUsedVars.find(&viewNode.outVariable()) != currentUsedVars.end()) {
auto calcNode = ExecutionNode::castTo<CalculationNode*>(scn);
if (viewNodeState.canVariablesBeReplaced(calcNode)) {
calcNodes.emplace_back(calcNode);
invalid = false;
} else {
invalid = true;
break;
}
}
}
break;
}
default:
TRI_ASSERT(false);
break;
}
if (invalid) {
if (sortNode != nullptr) {
Expand All @@ -423,8 +452,12 @@ void lateDocumentMaterializationArangoSearchRule(Optimizer* opt,
// we could apply late materialization
// 1. Replace view variables in calculation node if need
if (!calcNodes.empty()) {
auto viewVariables = viewNodeState.replaceViewVariables(calcNodes);
::arangodb::containers::HashSet<ExecutionNode*> toUnlink;
auto viewVariables = viewNodeState.replaceViewVariables(calcNodes, toUnlink);
viewNode.setViewVariables(viewVariables);
if (!toUnlink.empty()) {
plan->unlinkNodes(toUnlink);
}
}
// 2. We need to notify view - it should not materialize documents, but produce only localDocIds
// 3. We need to add materializer after limit node to do materialization
Expand Down
4 changes: 4 additions & 0 deletions arangod/Aql/IndexNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ class IndexNode : public ExecutionNode, public DocumentProducingNode, public Col
return !_outNonMaterializedIndVars.second.empty();
}

bool canApplyLateDocumentMaterializationRule() {
return (isVarUsedLater(_outVariable) || _filter != nullptr) && coveringIndexAttributePositions().empty();
}

struct IndexVariable {
size_t indexFieldNum;
Variable const* var;
Expand Down
125 changes: 86 additions & 39 deletions arangod/Aql/IndexNodeOptimizerRules.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "Aql/Expression.h"
#include "Aql/IndexNode.h"
#include "Aql/LateMaterializedOptimizerRulesCommon.h"
#include "Aql/NodeFinder.h"
#include "Aql/Optimizer.h"
#include "IndexNodeOptimizerRules.h"
#include "Basics/AttributeNameParser.h"
Expand Down Expand Up @@ -69,6 +70,34 @@ namespace {
}
return true;
}

void processCalculationNode(IndexNode const* indexNode, CalculationNode* calculationNode,
std::vector<latematerialized::NodeWithAttrs>& nodesToChange,
bool NoSortNode, TRI_idx_iid_t& commonIndexId, bool& stickToSortNode, bool& stopSearch) {
auto astNode = calculationNode->expression()->nodeForModification();
latematerialized::NodeWithAttrs node;
node.node = calculationNode;
// find attributes referenced to index node out variable
if (!latematerialized::getReferencedAttributes(astNode, indexNode->outVariable(), node)) {
// is not safe for optimization
stopSearch = true;
} else if (!node.attrs.empty()) {
if (!attributesMatch(commonIndexId, indexNode, node)) {
// the node uses attributes which is not in index
if (NoSortNode) {
// we are between limit and sort nodes.
// late materialization could still be applied but we must insert MATERIALIZE node after sort not after limit
stickToSortNode = true;
} else {
// this limit node affects only closest sort, if this sort is invalid
// we need to check other limit node
stopSearch = true;
}
} else {
nodesToChange.emplace_back(std::move(node));
}
}
}
}

void arangodb::aql::lateDocumentMaterializationRule(Optimizer* opt,
Expand All @@ -89,13 +118,12 @@ void arangodb::aql::lateDocumentMaterializationRule(Optimizer* opt,

arangodb::containers::SmallVector<ExecutionNode*>::allocator_type::arena_type a;
arangodb::containers::SmallVector<ExecutionNode*> nodes{a};

plan->findNodesOfType(nodes, ExecutionNode::LIMIT, true);
for (auto limitNode : nodes) {
auto loop = const_cast<ExecutionNode*>(limitNode->getLoop());
if (ExecutionNode::INDEX == loop->getType()) {
auto indexNode = ExecutionNode::castTo<IndexNode*>(loop);
if (indexNode->isLateMaterialized()) {
if (!indexNode->canApplyLateDocumentMaterializationRule() || indexNode->isLateMaterialized()) {
continue; // loop is already optimized
}
auto current = limitNode->getFirstDependency();
Expand All @@ -115,33 +143,10 @@ void arangodb::aql::lateDocumentMaterializationRule(Optimizer* opt,
sortNode = current;
}
break;
case ExecutionNode::CALCULATION: {
auto calculationNode = ExecutionNode::castTo<CalculationNode*>(current);
auto astNode = calculationNode->expression()->nodeForModification();
latematerialized::NodeWithAttrs node;
node.node = calculationNode;
// find attributes referenced to index node out variable
if (!latematerialized::getReferencedAttributes(astNode, indexNode->outVariable(), node)) {
// is not safe for optimization
stopSearch = true;
} else if (!node.attrs.empty()) {
if (!attributesMatch(commonIndexId, indexNode, node)) {
// the node uses attributes which is not in index
if (nullptr == sortNode) {
// we are between limit and sort nodes.
// late materialization could still be applied but we must insert MATERIALIZE node after sort not after limit
stickToSortNode = true;
} else {
// this limit node affects only closest sort, if this sort is invalid
// we need to check other limit node
stopSearch = true;
}
} else {
nodesToChange.emplace_back(std::move(node));
}
}
case ExecutionNode::CALCULATION:
processCalculationNode(indexNode, ExecutionNode::castTo<CalculationNode*>(current), nodesToChange,
nullptr == sortNode, commonIndexId, stickToSortNode, stopSearch);
break;
}
case ExecutionNode::REMOTE:
// REMOTE node is a blocker - we do not want to make materialization calls across cluster!
if (sortNode != nullptr) {
Expand All @@ -151,17 +156,38 @@ void arangodb::aql::lateDocumentMaterializationRule(Optimizer* opt,
default: // make clang happy
break;
}
// Currently only calculation and subquery nodes expected to use loop variable.
// We successfully replaced all references to loop variable in calculation nodes only.
// However if some other node types will begin to use loop variable
// Currently only calculation and subquery nodes expected to use a loop variable.
// We successfully replaced all references to the loop variable.
// However if some other node types will begin to use the loop variable
// assertion below will be triggered and this rule should be updated.
// Subquery node is planned to be supported later.
if (!stopSearch && type != ExecutionNode::CALCULATION) {
arangodb::containers::HashSet<Variable const*> currentUsedVars;
current->getVariablesUsedHere(currentUsedVars);
if (currentUsedVars.find(indexNode->outVariable()) != currentUsedVars.end()) {
TRI_ASSERT(ExecutionNode::SUBQUERY == type);
stopSearch = true;
if (ExecutionNode::SUBQUERY == type) {
auto subqueryNode = ExecutionNode::castTo<SubqueryNode*>(current);
auto subquery = subqueryNode->getSubquery();
arangodb::containers::SmallVector<ExecutionNode*>::allocator_type::arena_type sa;
arangodb::containers::SmallVector<ExecutionNode*> subqueryCalcNodes{sa};
// find calculation nodes in the plan of a subquery
NodeFinder<ExecutionNode::NodeType> finder(ExecutionNode::CALCULATION, subqueryCalcNodes, true);
subquery->walk(finder);
for (auto scn : subqueryCalcNodes) {
TRI_ASSERT(scn->getType() == ExecutionNode::CALCULATION);
currentUsedVars.clear();
scn->getVariablesUsedHere(currentUsedVars);
if (currentUsedVars.find(indexNode->outVariable()) != currentUsedVars.end()) {
processCalculationNode(indexNode, ExecutionNode::castTo<CalculationNode*>(scn), nodesToChange,
nullptr == sortNode, commonIndexId, stickToSortNode, stopSearch);
if (stopSearch) {
break;
}
}
}
} else {
TRI_ASSERT(false);
stopSearch = true;
}
}
}
if (stopSearch) {
Expand All @@ -175,12 +201,30 @@ void arangodb::aql::lateDocumentMaterializationRule(Optimizer* opt,
if (sortNode && !nodesToChange.empty()) {
auto ast = plan->getAst();
IndexNode::IndexVarsInfo uniqueVariables;
for (auto& node : nodesToChange) {
std::transform(node.attrs.cbegin(), node.attrs.cend(), std::inserter(uniqueVariables, uniqueVariables.end()),
[ast](auto const& attrAndField) {
return std::make_pair(attrAndField.afData.field, IndexNode::IndexVariable{attrAndField.afData.number,
arangodb::containers::HashSet<ExecutionNode*> toUnlink;
// at first use variables from simple expressions
for (auto const& node : nodesToChange) {
TRI_ASSERT(!node.attrs.empty());
auto const& afData = node.attrs[0].afData;
if (afData.parentNode == nullptr) {
TRI_ASSERT(node.attrs.size() == 1);
// we could add one redundant variable for each field only
if (uniqueVariables.try_emplace(afData.field, IndexNode::IndexVariable{afData.number,
node.node->outVariable()}).second) {
toUnlink.emplace(node.node);
}
}
}
// create variables for complex expressions
for (auto const& node : nodesToChange) {
TRI_ASSERT(!node.attrs.empty());
for (auto const& attrAndField : node.attrs) {
// create a variable if necessary
if (attrAndField.afData.parentNode != nullptr && uniqueVariables.find(attrAndField.afData.field) == uniqueVariables.cend()) {
uniqueVariables.emplace(attrAndField.afData.field, IndexNode::IndexVariable{attrAndField.afData.number,
ast->variables()->createTemporaryVariable()});
});
}
}
}
auto localDocIdTmp = ast->variables()->createTemporaryVariable();
for (auto& node : nodesToChange) {
Expand All @@ -197,6 +241,9 @@ void arangodb::aql::lateDocumentMaterializationRule(Optimizer* opt,
}
}
}
if (!toUnlink.empty()) {
plan->unlinkNodes(toUnlink);
}

// we could apply late materialization
// 1. We need to notify index node - it should not materialize documents, but produce only localDocIds
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,26 @@ function lateDocumentMaterializationArangoSearch2RuleTestSuite () {
let plan = AQL_EXPLAIN(query).plan;
assertEqual(-1, plan.rules.indexOf(ruleName));
},
testNotAppliedDueToSubqueryWithDocumentAccessByAttribute() { // should be supported later
testQueryResultsWithSubqueryWithDocumentAccessByAttribute() {
let query = "FOR d IN " + svn + " SEARCH d.value IN [1, 2, 11, 12] " +
"LET a = NOOPT(d.foo) " +
"LET e = SUM(FOR c IN " + vn + " LET p = CONCAT(d.foo, c.foo) RETURN p) " +
"SORT CONCAT(a, e) LIMIT 10 RETURN d";
let plan = AQL_EXPLAIN(query).plan;
assertEqual(-1, plan.rules.indexOf(ruleName));
if (!isCluster) {
assertNotEqual(-1, plan.rules.indexOf(ruleName));
let result = AQL_EXECUTE(query);
assertEqual(4, result.json.length);
let expectedKeys = new Set(['c1', 'c2', 'c_1', 'c_2']);
result.json.forEach(function(doc) {
assertTrue(expectedKeys.has(doc._key));
expectedKeys.delete(doc._key);
});
assertEqual(0, expectedKeys.size);
} else {
// on cluster this will not be applied as remote node placed before sort node
assertEqual(-1, plan.rules.indexOf(ruleName));
}
},
testQueryResultsWithSubqueryWithoutDocumentAccess() {
let query = "FOR d IN " + svn + " SEARCH d.value IN [1, 2, 11, 12] " +
Expand Down
Loading
0