8000 Bug fix/print on violated block ordering by mchacki · Pull Request #21743 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Bug fix/print on violated block ordering #21743

8000
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 2, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Applied review comment. Also added ALERT. Also print if we are still …
…starting async tasks although we already stopped them
  • Loading branch information
mchacki committed Apr 30, 2025
commit ea856464d788bfe6ba09bb04c551e806a0f70c4d
15 changes: 11 additions & 4 deletions arangod/Aql/ExecutionBlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,9 @@ auto ExecutionBlock::printBlockInfo() const -> std::string const {

auto ExecutionBlock::stopAsyncTasks() -> void {}

auto ExecutionBlock::isDependencyInList(std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept -> ExecutionBlock* {
auto ExecutionBlock::isDependencyInList(
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
-> ExecutionBlock* {
for (auto const& dependency : _dependencies) {
if (seenBlocks.find(dependency) != seenBlocks.end()) {
return dependency;
Expand All @@ -220,15 +222,20 @@ auto ExecutionBlock::isDependencyInList(std::unordered_set<ExecutionBlock*> cons
return nullptr;
}

auto ExecutionBlock::printBlockAndDependenciesInfo() const noexcept -> std::string const {

auto ExecutionBlock::printBlockAndDependenciesInfo() const noexcept
-> std::string const {
std::stringstream ss;
ss << printBlockInfo();
ss << " async prefetching type: " <<(int) getPlanNode()->canUseAsyncPrefetching();
ss << " async prefetching type: "
<< (int)getPlanNode()->canUseAsyncPrefetching();
ss << " calls: [";
for (auto const& dependency : _dependencies) {
ss << " " << dependency->printBlockInfo() << ",";
}
ss << " ]";
return ss.str();
}

auto ExecutionBlock::hasStoppedAsyncTasks() const noexcept -> bool {
return _stoppedAsyncTasks;
}
6 changes: 5 additions & 1 deletion arangod/Aql/ExecutionBlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
#include <string_view>
#include <vector>


namespace arangodb {
namespace transaction {
class Methods;
Expand Down Expand Up @@ -141,6 +140,8 @@ class ExecutionBlock {
[[nodiscard]] auto isDependencyInList(
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
-> ExecutionBlock*;

[[nodiscard]] auto hasStoppedAsyncTasks() const noexcept -> bool;

protected:
// Trace the start of a execute call
Expand Down Expand Up @@ -184,6 +185,9 @@ class ExecutionBlock {
/// @brief if this is set, we are done, this is reset to false by execute()
bool _done;

/// @brief if this is set, we have stopped async tasks, this is set to true by stopAsyncTasks()
bool _stoppedAsyncTasks{false};

#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
/// @brief if this is set to true, one thread is using this block, so we can
/// assert that no other thread can access this block at the same time - as
Expand Down
25 changes: 25 additions & 0 deletions arangod/Aql/ExecutionBlockImpl.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ ExecutionBlockImpl<Executor>::~ExecutionBlockImpl() {

template<class Executor>
void ExecutionBlockImpl<Executor>::stopAsyncTasks() {
TRI_ASSERT(!_stoppedAsyncTasks) << "Someone already stopped async tasks for " << printBlockInfo();
_stoppedAsyncTasks = true;
if (_prefetchTask && !_prefetchTask->isConsumed() &&
!_prefetchTask->tryClaim()) {
// some thread is still working on our prefetch task
Expand Down Expand Up @@ -1047,6 +1049,29 @@ auto ExecutionBlockImpl<Executor>::executeFetcher(ExecutionContext& ctx,
return;
}

// We are entering debugging land here.
// Someone hasasked as to stop async tasks.
// But here we are just starting to work on one.
// This is okay and can rarely happen. But if it happens
// We want to know if we start the task and finish it quickly

bool hasStoppedAsyncTasks = block->hasStoppedAsyncTasks();
if (hasStoppedAsyncTasks) {
LOG_TOPIC("14d20", WARN, Logger::AQL)
<< "[query#" << block->getQuery().id() << "] ALERT"
<< block->printBlockInfo()
<< " was asked to stop async task. We still start one. This is an allowed rare race.";
}

auto stopGuard = ScopeGuard([block, hasStoppedAsyncTasks]() noexcept {
if (hasStoppedAsyncTasks) {
LOG_TOPIC("14d21", WARN, Logger::AQL)
<< "[query#" << block->getQuery().id() << "] CLEAR ALERT"
<< block->printBlockInfo()
<< " We completed the task of the aforementioned race. All is fine.";
}
});

TRI_IF_FAILURE("AsyncPrefetch::blocksDestroyedOutOfOrder") {
using namespace std::chrono_literals;
std::this_thread::sleep_for(100ms);
Expand Down
19 changes: 12 additions & 7 deletions arangod/Aql/ExecutionEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,28 +278,33 @@ ExecutionEngine::~ExecutionEngine() {
// vector. Also we are in the destructor here, so we have guaranteed,
// that no one else is cleaning up the blocks.
std::unordered_set<ExecutionBlock*> seenBlocks;
bool needToPrintViolation = false;
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
auto block = it->get();
if (ExecutionBlock* seenDependency =
block->isDependencyInList(seenBlocks);
seenDependency != nullptr && block->getPlanNode()->getType() != ExecutionNode::GATHER) {
seenDependency != nullptr &&
block->getPlanNode()->getType() != ExecutionNode::GATHER) {
// We have a dependency that has already been seen, we need to log this
// situation in theory this could lead to deadlocks. Some Blocks are
// fine we just want to see those here.
// Gather Nodes are known to violate this, but they are safe.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we not do the warning for gather nodes then, to not spam the log with warnings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to point you to the following line:

seenDependency != nullptr && block->getPlanNode()->getType() != ExecutionNode::GATHER) {

;)

LOG_TOPIC("a6c2b", WARN, Logger::AQL)
<< "Stopping async tasks for " << block->printBlockInfo()
<< "ALERT Stopping async tasks for " << block->printBlockInfo()
<< " but have already stopped dependency "
<< seenDependency->printBlockInfo();
LOG_TOPIC("a6c2c", WARN, Logger::AQL) << "Full list of blocks:";
for (auto it2 = _blocks.rbegin(); it2 != _blocks.rend(); ++it2) {
LOG_TOPIC("a6c2d", WARN, Logger::AQL) << (*it2)->printBlockAndDependenciesInfo();
}
TRI_ASSERT(false) << "Triggered violation in ExecutionBlock ordering";
needToPrintViolation = true;
}
block->stopAsyncTasks();
seenBlocks.insert(block);
}
if (needToPrintViolation) {
for (auto it2 = _blocks.rbegin(); it2 != _blocks.rend(); ++it2) {
LOG_TOPIC("a6c2d", WARN, Logger::AQL)
<< (*it2)->printBlockAndDependenciesInfo();
}
TRI_ASSERT(false) << "Triggered violation in ExecutionBlock ordering";
}
}
if (_sharedState) { // ensure no async task is working anymore
_sharedState->invalidate();
Expand Down
0