8000 Bug fix/print on violated block ordering (#21743) · arangodb/arangodb@7d270c5 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7d270c5

Browse files
mchackineunhoef
andcommitted
Bug fix/print on violated block ordering (#21743)
* Added printing for violation of shutdown ordering * Exclude GatherNodes from order violation checks. Added more informative printing * Applied review comment. Also added ALERT. Also print if we are still starting async tasks although we already stopped them --------- Co-authored-by: Max Neunhoeffer <max@arangodb.com>
1 parent f222093 commit 7d270c5

File tree

4 files changed

+102
-4
lines changed

4 files changed

+102
-4
lines changed

arangod/Aql/ExecutionBlock.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,3 +210,32 @@ auto ExecutionBlock::printBlockInfo() const -> std::string const {
210210
}
211211
212212
auto ExecutionBlock::stopAsyncTasks() -> void {}
213+
214+
auto ExecutionBlock::isDependencyInList(
215+
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
216+
-> ExecutionBlock* {
217+
for (auto const& dependency : _dependencies) {
218+
if (seenBlocks.find(dependency) != seenBlocks.end()) {
219+
return dependency;
220+
}
221+
}
222+
return nullptr;
223+
}
224+
225+
auto ExecutionBlock::printBlockAndDependenciesInfo() const noexcept
226+
-> std::string const {
227+
std::stringstream ss;
228+
ss << printBlockInfo();
229+
ss << " async prefetching type: "
230+
<< (int)getPlanNode()->canUseAsyncPrefetching();
231+
ss << " calls: [";
232+
for (auto const& dependency : _dependencies) {
233+
ss << " " << dependency->printBlockInfo() << ",";
234+
}
235+
ss << " ]";
236+
return ss.str();
237+
}
238+
239+
auto ExecutionBlock::hasStoppedAsyncTasks() const noexcept -> bool {
240+
return _stoppedAsyncTasks;
241+
}

arangod/Aql/ExecutionBlock.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@
3131

3232
#include <atomic>
3333
#include <cstdint>
34-
#include <string_view>
34+
#include <unordered_set>
3535
#include <utility>
36+
#include <string_view>
3637
#include <vector>
3738

3839
namespace arangodb {
@@ -131,9 +132,17 @@ class ExecutionBlock {
131132

132133
[[nodiscard]] auto printBlockInfo() const -> std::string const;
133134
[[nodiscard]] auto printTypeInfo() const -> std::string const;
135+
[[nodiscard]] auto printBlockAndDependenciesInfo() const noexcept
136+
-> std::string const;
134137

135138
virtual auto stopAsyncTasks() -> void;
136139

140+
[[nodiscard]] auto isDependencyInList(
141+
std::unordered_set<ExecutionBlock*> const& seenBlocks) const noexcept
142+
-> ExecutionBlock*;
143+
144+
[[nodiscard]] auto hasStoppedAsyncTasks() const noexcept -> bool;
145+
137146
protected:
138147
// Trace the start of a execute call
139148
void traceExecuteBegin(AqlCallStack const& stack,
@@ -176,6 +185,10 @@ class ExecutionBlock {
176185
/// @brief if this is set, we are done, this is reset to false by execute()
177186
bool _done;
178187

188+
/// @brief if this is set, we have stopped async tasks, this is set to true by
189+
/// stopAsyncTasks()
190+
bool _stoppedAsyncTasks{false};
191+
179192
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
180193
/// @brief if this is set to true, one thread is using this block, so we can
181194
/// assert that no other thread can access this block at the same time - as

arangod/Aql/ExecutionBlockImpl.tpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,32 @@ auto ExecutionBlockImpl<Executor>::executeFetcher(ExecutionContext& ctx,
10491049
return;
10501050
}
10511051

1052+
// We are entering debugging land here.
1053+
// Someone hasasked as to stop async tasks.
1054+
// But here we are just starting to work on one.
1055+
// This is okay and can rarely happen. But if it happens
1056+
// We want to know if we start the task and finish it quickly
1057+
1058+
bool hasStoppedAsyncTasks = block->hasStoppedAsyncTasks();
1059+
if (hasStoppedAsyncTasks) {
1060+
LOG_TOPIC("14d22", WARN, Logger::AQL)
1061+
<< "[query#" << block->getQuery().id() << "] ALERT"
1062+
<< block->printBlockInfo()
1063+
<< " was asked to stop async task. We still start one. "
1064+
"This is an allowed rare race.";
1065+
}
1066+
1067+
au 629A to stopGuard =
1068+
ScopeGuard([block, hasStoppedAsyncTasks]() noexcept {
1069+
if (hasStoppedAsyncTasks) {
1070+
LOG_TOPIC("14d21", WARN, Logger::AQL)
1071+
<< "[query#" << block->getQuery().id()
1072+
<< "] CLEAR ALERT" << block->printBlockInfo()
1073+
<< " We completed the task of the aforementioned "
1074+
"race. All is fine.";
1075+
}
1076+
});
1077+
10521078
TRI_IF_FAILURE("AsyncPrefetch::blocksDestroyedOutOfOrder") {
10531079
using namespace std::chrono_literals;
10541080
std::this_thread::sleep_for(100ms);

arangod/Aql/ExecutionEngine.cpp

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,40 @@ ExecutionEngine::~ExecutionEngine() {
272272
// tasks running on any dependent which could start another on the
273273
// current block.
274274
// The blocks are pushed in a reversed topological order.
275-
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
276-
(*it)->stopAsyncTasks();
275+
{
276+
// Note: We can use raw pointers here because we are not taking
277+
// any responsibilty of the pointer. It is still managed by the _blocks
278+
// vector. Also we are in the destructor here, so we have guaranteed,
279+
// that no one else is cleaning up the blocks.
280+
std::unordered_set<ExecutionBlock*> seenBlocks;
281+
bool needToPrintViolation = false;
282+
for (auto it = _blocks.rbegin(); it != _blocks.rend(); ++it) {
283+
auto block = it->get();
284+
if (ExecutionBlock* seenDependency =
285+
block->isDependencyInList(seenBlocks);
286+
seenDependency != nullptr &&
287+
block->getPlanNode()->getType() != ExecutionNode::GATHER) {
288+
// We have a dependency that has already been seen, we need to log this
289+
// situation in theory this could lead to deadlocks. Some Blocks are
290+
// fine we just want to see those here.
291+
// Gather Nodes are known to violate this, but they are safe.
292+
LOG_TOPIC("a6c2b", WARN, Logger::AQL)
293+
<< "ALERT Stopping async tasks for " << block->printBlockInfo()
294+
<< " but have already stopped dependency "
295+
<< seenDependency->printBlockInfo();
296+
needToPrintViolation = true;
297+
}
298+
block->stopAsyncTasks();
299+
seenBlocks.insert(block);
300+
}
301+
if (needToPrintViolation) {
302+
for (auto it2 = _blocks.rbegin(); it2 != _blocks.rend(); ++it2) {
303+
LOG_TOPIC("a6c2d", WARN, Logger::AQL)
304+
<< (*it2)->printBlockAndDependenciesInfo();
305+
}
306+
TRI_ASSERT(false) << "Triggered violation in ExecutionBlock ordering";
307+
}
277308
}
278-
279309
if (_sharedState) { // ensure no async task is working anymore
280310
_sharedState->invalidate();
281311
}

0 commit comments

Comments
 (0)
0