8000 Bug fix 3.8/allow for long supervision job runtimes by kvahed · Pull Request #14742 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Bug fix 3.8/allow for long supervision job runtimes #14742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Sep 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
v3.8.2 (XXXX-XX-XX)
-------------------

* No runtime limits for shard move and server cleanout jobs, instead possibility
to cancel them.

* (EE only) Bug-fix: If you created a ArangoSearch view on Satellite-
Collections only and then join with a collection only having a single shard
the cluster-one-shard-rule was falsely applied and could lead to empty view
Expand Down Expand Up @@ -48,7 +51,7 @@ v3.8.2 (XXXX-XX-XX)
several clients concurrently inserts data and use custom analyzer with
non-string return type.

* Fix a rare shutdown race in RocksDBShaCalculatorThread.
* Fix a rare shutdown race in RocksDBShaCalculatorThread.

* Reduce internal priority of AQL execution. This prevents possible deadlocks
8000 with modification operations in a cluster and replicationFactor >= 2, and can
Expand Down
18 changes: 7 additions & 11 deletions arangod/Agency/CleanOutServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,8 @@ JOB_STATUS CleanOutServer::status() {
}

if (found > 0) { // some subjob still running
// timeout here:
auto tmp_time =
_snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated");
std::string timeCreatedString = tmp_time.first;
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
Supervision::TimePoint now(std::chrono::system_clock::now());
if (now - timeCreated > std::chrono::duration<double>(86400.0)) { // 1 day
abort("job timed out");
return FAILED;
}
return PENDING;
// consider cancellation
return considerCancellation() ? FAILED : PENDING;
}

Node::Children const& failed = _snapshot.hasAsChildren(failedPrefix).first;
Expand Down Expand Up @@ -198,6 +189,11 @@ bool CleanOutServer::create(std::shared_ptr<VPackBuilder> envelope) {
}

bool CleanOutServer::start(bool& aborts) {

if (considerCancellation()) {
return false;
}

// If anything throws here, the run() method catches it and finishes
// the job.

Expand Down
11 changes: 11 additions & 0 deletions arangod/Agency/Job.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ Job::~Job() = default;
// this will be initialized in the AgencyFeature
std::string Job::agencyPrefix = "arango";

bool Job::considerCancellation() {
// Allow for cancellation of shard moves
auto [cancel,exists] =
_snapshot.hasAsBool(std::string("/Target/") + jobStatus[_status] + "/" + _jobId + "/abort");
auto cancelled = exists && cancel;
if (cancelled) {
abort("Killed via API");
}
return cancelled;
};

bool Job::finish(std::string const& server, std::string const& shard,
bool success, std::string const& reason, query_t const payload) {
try { // protect everything, just in case
Expand Down
3 changes: 3 additions & 0 deletions arangod/Agency/Job.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ namespace consensus {
class Node;

enum JOB_STATUS { TODO, PENDING, FINISHED, FAILED, NOTFOUND };
const std::vector<std::string> jobStatus {"ToDo", "Pending", "Finished", "Failed"};
const std::vector<std::string> pos({"/Target/ToDo/", "/Target/Pending/",
"/Target/Finished/", "/Target/Failed/"});
extern std::string const mapUniqueToShortID;
Expand Down Expand Up @@ -88,6 +89,8 @@ struct Job {

virtual void run(bool& aborts) = 0;

bool considerCancellation();

void runHelper(std::string const& server, std::string const& shard, bool& aborts) {
if (_status == FAILED) { // happens when the constructor did not work
return;
Expand Down
50 changes: 22 additions & 28 deletions arangod/Agency/MoveShard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
}

bool MoveShard::start(bool&) {

if (considerCancellation()) {
return false;
}

// If anything throws here, the run() method catches it and finishes
// the job.

Expand Down Expand Up @@ -477,6 +482,13 @@ bool MoveShard::start(bool&) {
}

JOB_STATUS MoveShard::status() {

if (_status == PENDING || _status == TODO) {
if (considerCancellation()) {
return FAILED;
}
}

if (_status != PENDING) {
return _status;
}
Expand All @@ -495,19 +507,6 @@ JOB_STATUS MoveShard::status() {

JOB_STATUS MoveShard::pendingLeader() {

auto considerTimeout = [&]() -> bool {
// Not yet all in sync, consider timeout:
std::string timeCreatedString =
_snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
Supervision::TimePoint now(std::chrono::system_clock::now());
if (now - timeCreated > std::chrono::duration<double>(43200.0)) { // 12h
abort("MoveShard timed out in pending leader");
return true;
}
return false;
};

// Find the other shards in the same distributeShardsLike group:
std::vector<Job::shard_t> shardsLikeMe =
clones(_snapshot, _database, _collection, _shard);
Expand Down Expand Up @@ -556,9 +555,9 @@ JOB_STATUS MoveShard::pendingLeader() {
}
});

// Consider timeout:
// Consider cancellation:
if (done < shardsLikeMe.size()) {
if (considerTimeout()) {
if (considerCancellation()) {
return FAILED;
}
return PENDING; // do not act
Expand 23D3 Down Expand Up @@ -623,9 +622,9 @@ JOB_STATUS MoveShard::pendingLeader() {
}
});

// Consider timeout:
// Consider cancellation:
if (done < shardsLikeMe.size()) {
if (considerTimeout()) {
if (considerCancellation()) {
return FAILED;
}
return PENDING; // do not act!
Expand Down Expand Up @@ -752,9 +751,9 @@ JOB_STATUS MoveShard::pendingLeader() {
}
});

// Consider timeout:
// Consider cancellation:
if (done < shardsLikeMe.size()) {
if (considerTimeout()) {
if (considerCancellation()) {
return FAILED;
}
return PENDING; // do not act!
Expand Down Expand Up @@ -841,6 +840,7 @@ JOB_STATUS MoveShard::pendingLeader() {
}

JOB_STATUS MoveShard::pendingFollower() {

// Check if any of the servers in the Plan are FAILED, if so,
// we abort:
std::string planPath =
Expand All @@ -867,14 +867,8 @@ JOB_STATUS MoveShard::pendingFollower() {
}
});

if (done < shardsLikeMe.size()) {
// Not yet all in sync, consider timeout:
std::string timeCreatedString =
_snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
Supervision::TimePoint now(std::chrono::system_clock::now());
if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
abort("MoveShard timed out in pending follower");
if (done < shardsLikeMe.size()) { // Consider cancellation
if (considerCancellation()) {
return FAILED;
}
return PENDING;
Expand Down Expand Up @@ -981,7 +975,7 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
}
}

if (finish("", "", true, "job aborted (1): " + reason, todoPrec)) {
if (finish("", "", false, "job aborted (1): " + reason, todoPrec)) {
return result;
}
_status = PENDING;
Expand Down
Loading
0