8000 Bug fix/print on violated block ordering by mchacki · Pull Request #21743 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content
8000

Bug fix/print on violated block ordering #21743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions arangod/Aql/ExecutionBlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,32 @@ auto ExecutionBlock::printBlockInfo() const -> std::string const {
}

auto ExecutionBlock::stopAsyncTasks() -> void {}

auto ExecutionBlock::isDependencyInList(
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
-> ExecutionBlock* {
for (auto const& dependency : _dependencies) {
if (seenBlocks.find(dependency) != seenBlocks.end()) {
return dependency;
}
}
return nullptr;
}

auto ExecutionBlock::printBlockAndDependenciesInfo() const noexcept
-> std::string const {
std::stringstream ss;
ss << printBlockInfo();
ss << " async prefetching type: "
<< (int)getPlanNode()->canUseAsyncPrefetching();
ss << " calls: [";
for (auto const& dependency : _dependencies) {
ss << " " << dependency->printBlockInfo() << ",";
}
ss << " ]";
return ss.str();
}

auto ExecutionBlock::hasStoppedAsyncTasks() const noexcept -> bool {
return _stoppedAsyncTasks;
}
15 changes: 14 additions & 1 deletion arangod/Aql/ExecutionBlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@

#include <atomic>
#include <cstdint>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <string_view>
#include <vector>

namespace arangodb {
Expand Down Expand Up @@ -131,9 +132,17 @@ class ExecutionBlock {

[[nodiscard]] auto printBlockInfo() const -> std::string const;
[[nodiscard]] auto printTypeInfo() const -> std::string const;
[[nodiscard]] auto printBlockAndDependenciesInfo() const noexcept
-> std::string const;

virtual auto stopAsyncTasks() -> void;

[[nodiscard]] auto isDependencyInList(
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
-> ExecutionBlock*;

[[nodiscard]] auto hasStoppedAsyncTasks() const noexcept -> bool;

protected:
// Trace the start of a execute call
void traceExecuteBegin(AqlCallStack const& stack,
Expand Down Expand Up @@ -176,6 +185,10 @@ class ExecutionBlock {
/// @brief if this is set, we are done, this is reset to false by execute()
bool _done;

/// @brief if this is set, we have stopped async tasks, this is set to true by
/// stopAsyncTasks()
bool _stoppedAsyncTasks{false};

#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
/// @brief if this is set to true, one thread is using this block, so we can
/// assert that no other thread can access this block at the same time - as
Expand Down
27 changes: 27 additions & 0 deletions arangod/Aql/ExecutionBlockImpl.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ ExecutionBlockImpl<Executor>::~ExecutionBlockImpl() {

template<class Executor>
void ExecutionBlockImpl<Executor>::stopAsyncTasks() {
_stoppedAsyncTasks = true;
if (_prefetchTask) {
// Double use diagnostics:
uint64_t userCount = _numberOfUsers.fetch_add(1);
Expand Down Expand Up @@ -1079,6 +1080,32 @@ auto ExecutionBlockImpl<Executor>::executeFetcher(ExecutionContext& ctx,
return;
}

// We are entering debugging land here.
// Someone hasasked as to stop async tasks.
// But here we are just starting to work on one.
// This is okay and can rarely happen. But if it happens
// We want to know if we start the task and finish it quickly

bool hasStoppedAsyncTasks = block->hasStoppedAsyncTasks();
if (hasStoppedAsyncTasks) {
LOG_TOPIC("14d22", WARN, Logger::AQL)
<< "[query#" << block->getQuery().id() << "] ALERT"
<< block->printBlockInfo()
<< " was asked to stop async task. We still start one. "
"This is an allowed rare race.";
}

auto stopGuard =
ScopeGuard([block, hasStoppedAsyncTasks]() noexcept {
if (hasStoppedAsyncTasks) {
LOG_TOPIC("14d21", WARN, Logger::AQL)
<< "[query#" << block->getQuery().id()
<< "] CLEAR ALERT" << block->printBlockInfo()
<< " We completed the task of the aforementioned "
"race. All is fine.";
}
});

TRI_IF_FAILURE("AsyncPrefetch::blocksDestroyedOutOfOrder") {
using namespace std::chrono_literals;
std::this_thread::sleep_for(100ms);
Expand Down
36 changes: 33 additions & 3 deletions arangod/Aql/ExecutionEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,10 +272,40 @@ ExecutionEngine::~ExecutionEngine() {
// tasks running on any dependent which could start another on the
// current block.
// The blocks are pushed in a reversed topological order.
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
(*it)->stopAsyncTasks();
{
// Note: We can use raw pointers here because we are not taking
// any responsibilty of the pointer. It is still managed by the _blocks
// vector. Also we are in the destructor here, so we have guaranteed,
// that no one else is cleaning up the blocks.
std::unordered_set<ExecutionBlock*> seenBlocks;
bool needToPrintViolation = false;
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
auto block = it->get();
if (ExecutionBlock* seenDependency =
block->isDependencyInList(seenBlocks);
seenDependency != nullptr &&
block->getPlanNode()->getType() != ExecutionNode::GATHER) {
// We have a dependency that has already been seen, we need to log this
// situation in theory this could lead to deadlocks. Some Blocks are
// fine we just want to see those here.
// Gather Nodes are known to violate this, but they are safe.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we not do the warning for gather nodes then, to not spam the log with warnings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to point you to the following line:

seenDependency != nullptr && block->getPlanNode()->getType() != ExecutionNode::GATHER) {

;)

LOG_TOPIC("a6c2b", WARN, Logger::AQL)
<< "ALERT Stopping async tasks for " << block->printBlockInfo()
<< " but have already stopped dependency "
<< seenDependency->printBlockInfo();
needToPrintViolation = true;
}
block->stopAsyncTasks();
seenBlocks.insert(block);
}
if (needToPrintViolation) {
for (auto it2 = _blocks.rbegin(); it2 != _blocks.rend(); ++it2) {
LOG_TOPIC("a6c2d", WARN, Logger::AQL)
<< (*it2)->printBlockAndDependenciesInfo();
}
TRI_ASSERT(false) << "Triggered violation in ExecutionBlock ordering";
}
}

if (_sharedState) { // ensure no async task is working anymore
_sharedState->invalidate();
}
Expand Down
0