8000 Forward port for: print on violated block ordering (#21743) by mchacki · Pull Request #21773 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Forward port for: print on violated block ordering (#21743) #21773

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions arangod/Aql/ExecutionBlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,32 @@ auto ExecutionBlock::printBlockInfo() const -> std::string const {
}

auto ExecutionBlock::stopAsyncTasks() -> void {}

auto ExecutionBlock::isDependencyInList(
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
-> ExecutionBlock* {
for (auto const& dependency : _dependencies) {
if (seenBlocks.find(dependency) != seenBlocks.end()) {
return dependency;
}
}
return nullptr;
}

auto ExecutionBlock::printBlockAndDependenciesInfo() const noexcept
-> std::string const {
std::stringstream ss;
ss << printBlockInfo();
ss << " async prefetching type: "
<< (int)getPlanNode()->canUseAsyncPrefetching();
ss << " calls: [";
for (auto const& dependency : _dependencies) {
ss << " " << dependency->printBlockInfo() << ",";
}
ss << " ]";
return ss.str();
}

auto ExecutionBlock::hasStoppedAsyncTasks() const noexcept -> bool {
return _stoppedAsyncTasks;
}
15 changes: 14 additions & 1 deletion arangod/Aql/ExecutionBlock.h
10000
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@

#include <atomic>
#include <cstdint>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <string_view>
#include <vector>

namespace arangodb {
Expand Down Expand Up @@ -131,9 +132,17 @@ class ExecutionBlock {

[[nodiscard]] auto printBlockInfo() const -> std::string const;
[[nodiscard]] auto printTypeInfo() const -> std::string const;
[[nodiscard]] auto printBlockAndDependenciesInfo() const noexcept
-> std::string const;

virtual auto stopAsyncTasks() -> void;

[[nodiscard]] auto isDependencyInList(
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
-> ExecutionBlock*;

[[nodiscard]] auto hasStoppedAsyncTasks() const noexcept -> bool;

protected:
// Trace the start of a execute call
void traceExecuteBegin(AqlCallStack const& stack,
Expand Down Expand Up @@ -176,6 +185,10 @@ class ExecutionBlock {
/// @brief if this is set, we are done, this is reset to false by execute()
bool _done;

/// @brief if this is set, we have stopped async tasks, this is set to true by
/// stopAsyncTasks()
bool _stoppedAsyncTasks{false};

#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
/// @brief if this is set to true, one thread is using this block, so we can
/// assert that no other thread can access this block at the same time - as
Expand Down
26 changes: 26 additions & 0 deletions arangod/Aql/ExecutionBlockImpl.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -1049,6 +1049,32 @@ auto ExecutionBlockImpl<Executor>::executeFetcher(ExecutionContext& ctx,
return;
}

// We are entering debugging land here.
// Someone hasasked as to stop async tasks.
// But here we are just starting to work on one.
// This is okay and can rarely happen. But if it happens
// We want to know if we start the task and finish it quickly

bool hasStoppedAsyncTasks = block->hasStoppedAsyncTasks();
if (hasStoppedAsyncTasks) {
LOG_TOPIC("14d22", WARN, Logger::AQL)
<< "[query#" << block->getQuery().id() << "] ALERT"
<< block->printBlockInfo()
<< " was asked to stop async task. We still start one. "
"This is an allowed rare race.";
}

auto stopGuard =
ScopeGuard([block, hasStoppedAsyncTasks]() noexcept {
if (hasStoppedAsyncTasks) {
LOG_TOPIC("14d21", WARN, Logger::AQL)
<< "[query#" << block->getQuery().id()
<< "] CLEAR ALERT" << block->printBlockInfo()
<< " We completed the task of the aforementioned "
"race. All is fine.";
}
});
Comment on lines +1058 to +1076
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should not be on WARN, but probably on DEBUG.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If they are DEBUG, then we do not see them in production, or?


TRI_IF_FAILURE("AsyncPrefetch::blocksDestroyedOutOfOrder") {
using namespace std::chrono_literals;
std::this_thread::sleep_for(100ms);
Expand Down
36 changes: 33 additions & 3 deletions arangod/Aql/ExecutionEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,10 +272,40 @@ ExecutionEngine::~ExecutionEngine() {
// tasks running on any dependent which could start another on the
// current block.
// The blocks are pushed in a reversed topological order.
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
(*it)->stopAsyncTasks();
{
// Note: We can use raw pointers here because we are not taking
// any responsibilty of the pointer. It is still managed by the _blocks
// vector. Also we are in the destructor here, so we have guaranteed,
// that no one else is cleaning up the blocks.
std::unordered_set<ExecutionBlock*> seenBlocks;
bool needToPrintViolation = false;
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
auto block = it->get();
if (ExecutionBlock* seenDependency =
block->isDependencyInList(seenBlocks);
seenDependency != nullptr &&
block->getPlanNode()->getType() != ExecutionNode::GATHER) {
// We have a dependency that has already been seen, we need to log this
// situation in theory this could lead to deadlocks. Some Blocks are
// fine we just want to see those here.
// Gather Nodes are known to violate this, but they are safe.
LOG_TOPIC("a6c2b", WARN, Logger::AQL)
<< "ALERT Stopping async tasks for " << block->printBlockInfo()
<< " but have already stopped dependency "
<< seenDependency->printBlockInfo();
Comment on lines +292 to +295
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this could actually be bumped to ERROR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine with me.

needToPrintViolation = true;
}
block->stopAsyncTasks();
seenBlocks.insert(block);
}
if (needToPrintViolation) {
for (auto it2 = _blocks.rbegin(); it2 != _blocks.rend(); ++it2) {
LOG_TOPIC("a6c2d", WARN, Logger::AQL)
<< (*it2)->printBlockAndDependenciesInfo();
}
TRI_ASSERT(false) << "Triggered violation in ExecutionBlock ordering";
}
}

if (_sharedState) { // ensure no async task is working anymore
_sharedState->invalidate();
}
Expand Down
0