From 7d270c511d4c9f93bbb7f89b6fb1827582427b0a Mon Sep 17 00:00:00 2001 From: Michael Hackstein Date: Fri, 2 May 2025 13:02:23 +0200 Subject: [PATCH] Bug fix/print on violated block ordering (#21743) * Added printing for violation of shutdown ordering * Exclude GatherNodes from order violation checks. Added more informative printing * Applied review comment. Also added ALERT. Also print if we are still starting async tasks although we already stopped them --------- Co-authored-by: Max Neunhoeffer --- arangod/Aql/ExecutionBlock.cpp | 29 ++++++++++++++++++++++++ arangod/Aql/ExecutionBlock.h | 15 ++++++++++++- arangod/Aql/ExecutionBlockImpl.tpp | 26 +++++++++++++++++++++ arangod/Aql/ExecutionEngine.cpp | 36 +++++++++++++++++++++++++++--- 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/arangod/Aql/ExecutionBlock.cpp b/arangod/Aql/ExecutionBlock.cpp index 17dd7b70cde0..59302174c1e2 100644 --- a/arangod/Aql/ExecutionBlock.cpp +++ b/arangod/Aql/ExecutionBlock.cpp @@ -210,3 +210,32 @@ auto ExecutionBlock::printBlockInfo() const -> std::string const { } auto ExecutionBlock::stopAsyncTasks() -> void {} + +auto ExecutionBlock::isDependencyInList( + std::unordered_set const& seenBlocks) const noexcept + -> ExecutionBlock* { + for (auto const& dependency : _dependencies) { + if (seenBlocks.find(dependency) != seenBlocks.end()) { + return dependency; + } + } + return nullptr; +} + +auto ExecutionBlock::printBlockAndDependenciesInfo() const noexcept + -> std::string const { + std::stringstream ss; + ss << printBlockInfo(); + ss << " async prefetching type: " + << (int)getPlanNode()->canUseAsyncPrefetching(); + ss << " calls: ["; + for (auto const& dependency : _dependencies) { + ss << " " << dependency->printBlockInfo() << ","; + } + ss << " ]"; + return ss.str(); +} + +auto ExecutionBlock::hasStoppedAsyncTasks() const noexcept -> bool { + return _stoppedAsyncTasks; +} \ No newline at end of file diff --git a/arangod/Aql/ExecutionBlock.h b/arangod/Aql/ExecutionBlock.h index 1da9be17b8d9..697534fad8c3 100644 --- a/arangod/Aql/ExecutionBlock.h +++ b/arangod/Aql/ExecutionBlock.h @@ -31,8 +31,9 @@ #include #include -#include +#include #include +#include #include namespace arangodb { @@ -131,9 +132,17 @@ class ExecutionBlock { [[nodiscard]] auto printBlockInfo() const -> std::string const; [[nodiscard]] auto printTypeInfo() const -> std::string const; + [[nodiscard]] auto printBlockAndDependenciesInfo() const noexcept + -> std::string const; virtual auto stopAsyncTasks() -> void; + [[nodiscard]] auto isDependencyInList( + std::unordered_set const& seenBlocks) const noexcept + -> ExecutionBlock*; + + [[nodiscard]] auto hasStoppedAsyncTasks() const noexcept -> bool; + protected: // Trace the start of a execute call void traceExecuteBegin(AqlCallStack const& stack, @@ -176,6 +185,10 @@ class ExecutionBlock { /// @brief if this is set, we are done, this is reset to false by execute() bool _done; + /// @brief if this is set, we have stopped async tasks, this is set to true by + /// stopAsyncTasks() + bool _stoppedAsyncTasks{false}; + #ifdef ARANGODB_ENABLE_MAINTAINER_MODE /// @brief if this is set to true, one thread is using this block, so we can /// assert that no other thread can access this block at the same time - as diff --git a/arangod/Aql/ExecutionBlockImpl.tpp b/arangod/Aql/ExecutionBlockImpl.tpp index 1c474d473792..e336576e17a4 100644 --- a/arangod/Aql/ExecutionBlockImpl.tpp +++ b/arangod/Aql/ExecutionBlockImpl.tpp @@ -1049,6 +1049,32 @@ auto ExecutionBlockImpl::executeFetcher(ExecutionContext& ctx, return; } + // We are entering debugging land here. + // Someone hasasked as to stop async tasks. + // But here we are just starting to work on one. + // This is okay and can rarely happen. But if it happens + // We want to know if we start the task and finish it quickly + + bool hasStoppedAsyncTasks = block->hasStoppedAsyncTasks(); + if (hasStoppedAsyncTasks) { + LOG_TOPIC("14d22", WARN, Logger::AQL) + << "[query#" << block->getQuery().id() << "] ALERT" + << block->printBlockInfo() + << " was asked to stop async task. We still start one. " + "This is an allowed rare race."; + } + + auto stopGuard = + ScopeGuard([block, hasStoppedAsyncTasks]() noexcept { + if (hasStoppedAsyncTasks) { + LOG_TOPIC("14d21", WARN, Logger::AQL) + << "[query#" << block->getQuery().id() + << "] CLEAR ALERT" << block->printBlockInfo() + << " We completed the task of the aforementioned " + "race. All is fine."; + } + }); + TRI_IF_FAILURE("AsyncPrefetch::blocksDestroyedOutOfOrder") { using namespace std::chrono_literals; std::this_thread::sleep_for(100ms); diff --git a/arangod/Aql/ExecutionEngine.cpp b/arangod/Aql/ExecutionEngine.cpp index 614ad25a0d48..026de7aff43c 100644 --- a/arangod/Aql/ExecutionEngine.cpp +++ b/arangod/Aql/ExecutionEngine.cpp @@ -272,10 +272,40 @@ ExecutionEngine::~ExecutionEngine() { // tasks running on any dependent which could start another on the // current block. // The blocks are pushed in a reversed topological order. - for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) { - (*it)->stopAsyncTasks(); + { + // Note: We can use raw pointers here because we are not taking + // any responsibilty of the pointer. It is still managed by the _blocks + // vector. Also we are in the destructor here, so we have guaranteed, + // that no one else is cleaning up the blocks. + std::unordered_set seenBlocks; + bool needToPrintViolation = false; + for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) { + auto block = it->get(); + if (ExecutionBlock* seenDependency = + block->isDependencyInList(seenBlocks); + seenDependency != nullptr && + block->getPlanNode()->getType() != ExecutionNode::GATHER) { + // We have a dependency that has already been seen, we need to log this + // situation in theory this could lead to deadlocks. Some Blocks are + // fine we just want to see those here. + // Gather Nodes are known to violate this, but they are safe. + LOG_TOPIC("a6c2b", WARN, Logger::AQL) + << "ALERT Stopping async tasks for " << block->printBlockInfo() + << " but have already stopped dependency " + << seenDependency->printBlockInfo(); + needToPrintViolation = true; + } + block->stopAsyncTasks(); + seenBlocks.insert(block); + } + if (needToPrintViolation) { + for (auto it2 = _blocks.rbegin(); it2 != _blocks.rend(); ++it2) { + LOG_TOPIC("a6c2d", WARN, Logger::AQL) + << (*it2)->printBlockAndDependenciesInfo(); + } + TRI_ASSERT(false) << "Triggered violation in ExecutionBlock ordering"; + } } - if (_sharedState) { // ensure no async task is working anymore _sharedState->invalidate(); }