8000 Bug fix 3.8/allow for long supervision job runtimes (#14742) · arangodb/arangodb@7f716cf · GitHub
[go: up one dir, main page]

Skip to content

Commit 7f716cf

Browse files
kvahedgoedderzKVS85jsteemann
authored
Bug fix 3.8/allow for long supervision job runtimes (#14742)
* [3.8] Lower priority of AQL lanes (#14699) * Lower priority of AQL lanes * Added CHANGELOG entry * Improved comments Co-authored-by: Vadim <vadim@arangodb.com> * added a test for statistics behavior (#14703) * properly rename test file (#14705) * fix hard time limits on move shard and cleanout server * cleaner * allow for long supervision job runtimes * revert unfug in change log * fix 412 response * fix 412 response * Result constructors * integration tests * integration tests * integration tests * integration tests cleanout server * cleanoutserver integration tests * fixing test * 3.8 cleanout server abort -> moveshard failed * same error over? * Update CHANGELOG Co-authored-by: Tobias Gödderz <tobias@arangodb.com> Co-authored-by: Vadim <vadim@arangodb.com> Co-authored-by: Jan <jsteemann@users.noreply.github.com>
1 parent 59096fc commit 7f716cf

File tree

10 files changed

+442
-190
lines changed

10 files changed

+442
-190
lines changed

CHANGELOG

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
v3.8.2 (XXXX-XX-XX)
22
-------------------
33

4+
* No runtime limits for shard move and server cleanout jobs; instead, it is possible
5+
to cancel them.
6+
47
* (EE only) Bug-fix: If you created an ArangoSearch view on Satellite-
58
Collections only and then join with a collection only having a single shard
69
the cluster-one-shard-rule was falsely applied and could lead to empty view
@@ -48,7 +51,7 @@ v3.8.2 (XXXX-XX-XX)
4851
several clients concurrently insert data and use a custom analyzer with
4952
non-string return type.
5053

51-
* Fix a rare shutdown race in RocksDBShaCalculatorThread.
54+
* Fix a rare shutdown race in RocksDBShaCalculatorThread.
5255

5356
* Reduce internal priority of AQL execution. This prevents possible deadlocks
5457
with modification operations in a cluster and replicationFactor >= 2, and can

arangod/Agency/CleanOutServer.cpp

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -82,17 +82,8 @@ JOB_STATUS CleanOutServer::status() {
8282
}
8383

8484
if (found > 0) { // some subjob still running
85-
// timeout here:
86-
auto tmp_time =
87-
_snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated");
88-
std::string timeCreatedString = tmp_time.first;
89-
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
90-
Supervision::TimePoint now(std::chrono::system_clock::now());
91-
if (now - timeCreated > std::chrono::duration<double>(86400.0)) { // 1 day
92-
abort("job timed out");
93-
return FAILED;
94-
}
95-
return PENDING;
85+
// consider cancellation
86+
return considerCancellation() ? FAILED : PENDING;
9687
}
9788

9889
Node::Children const& failed = _snapshot.hasAsChildren(failedPrefix).first;
@@ -198,6 +189,11 @@ bool CleanOutServer::create(std::shared_ptr<VPackBuilder> envelope) {
198189
}
199190

200191
bool CleanOutServer::start(bool& aborts) {
192+
193+
if (considerCancellation()) {
194+
return false;
195+
}
196+
201197
// If anything throws here, the run() method catches it and finishes
202198
// the job.
203199

arangod/Agency/Job.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,17 @@ Job::~Job() = default;
8787
// this will be initialized in the AgencyFeature
8888
std::string Job::agencyPrefix = "arango";
8989

90+
bool Job::considerCancellation() {
91+
// Allow for cancellation of jobs (e.g. shard moves, server cleanouts)
92+
auto [cancel,exists] =
93+
_snapshot.hasAsBool(std::string("/Target/") + jobStatus[_status] + "/" + _jobId + "/abort");
94+
auto cancelled = exists && cancel;
95+
if (cancelled) {
96+
abort("Killed via API");
97+
}
98+
return cancelled;
99+
};
100+
90101
bool Job::finish(std::string const& server, std::string const& shard,
91102
bool success, std::string const& reason, query_t const payload) {
92103
try { // protect everything, just in case

arangod/Agency/Job.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ namespace consensus {
4949
class Node;
5050

5151
enum JOB_STATUS { TODO, PENDING, FINISHED, FAILED, NOTFOUND };
52+
const std::vector<std::string> jobStatus {"ToDo", "Pending", "Finished", "Failed"};
5253
const std::vector<std::string> pos({"/Target/ToDo/", "/Target/Pending/",
5354
"/Target/Finished/", "/Target/Failed/"});
5455
extern std::string const mapUniqueToShortID;
@@ -88,6 +89,8 @@ struct Job {
8889

8990
virtual void run(bool& aborts) = 0;
9091

92+
bool considerCancellation();
93+
9194
void runHelper(std::string const& server, std::string const& shard, bool& aborts) {
9295
if (_status == FAILED) { // happens when the constructor did not work
9396
return;

arangod/Agency/MoveShard.cpp

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,11 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
181181
}
182182

183183
bool MoveShard::start(bool&) {
184+
185+
if (considerCancellation()) {
186+
return false;
187+
}
188+
184189
// If anything throws here, the run() method catches it and finishes
185190
// the job.
186191

@@ -477,6 +482,13 @@ bool MoveShard::start(bool&) {
477482
}
478483

479484
JOB_STATUS MoveShard::status() {
485+
486+
if (_status == PENDING || _status == TODO) {
487+
if (considerCancellation()) {
488+
return FAILED;
489+
}
490+
}
491+
480492
if (_status != PENDING) {
481493
return _status;
482494
}
@@ -495,19 +507,6 @@ JOB_STATUS MoveShard::status() {
495507

496508
JOB_STATUS MoveShard::pendingLeader() {
497509

498-
auto considerTimeout = [&]() -> bool {
499-
// Not yet all in sync, consider timeout:
500-
std::string timeCreatedString =
501-
_snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
502-
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
503-
Supervision::TimePoint now(std::chrono::system_clock::now());
504-
if (now - timeCreated > std::chrono::duration<double>(43200.0)) { // 12h
505-
abort("MoveShard timed out in pending leader");
506-
return true;
507-
}
508-
return false;
509-
};
510-
511510
// Find the other shards in the same distributeShardsLike group:
512511
std::vector<Job::shard_t> shardsLikeMe =
513512
clones(_snapshot, _database, _collection, _shard);
@@ -556,9 +555,9 @@ JOB_STATUS MoveShard::pendingLeader() {
556555
}
557556
});
558557

559-
// Consider timeout:
558+
// Consider cancellation:
560559
if (done < shardsLikeMe.size()) {
561-
if (considerTimeout()) {
560+
if (considerCancellation()) {
562561
return FAILED;
563562
}
564563
return PENDING; // do not act
@@ -623,9 +622,9 @@ JOB_STATUS MoveShard::pendingLeader() {
623622
}
624623
});
625624

626-
// Consider timeout:
625+
// Consider cancellation:
627626
if (done < shardsLikeMe.size()) {
628-
if (considerTimeout()) {
627+
if (considerCancellation()) {
629628
return FAILED;
630629
}
631630
return PENDING; // do not act!
@@ -752,9 +751,9 @@ JOB_STATUS MoveShard::pendingLeader() {
752751
}
753752
});
754753

755-
// Consider timeout:
754+
// Consider cancellation:
756755
if (done < shardsLikeMe.size()) {
757-
if (considerTimeout()) {
756+
if (considerCancellation()) {
758757
return FAILED;
759758
}
760759
return PENDING; // do not act!
@@ -841,6 +840,7 @@ JOB_STATUS MoveShard::pendingLeader() {
841840
}
842841

843842
JOB_STATUS MoveShard::pendingFollower() {
843+
844844
// Check if any of the servers in the Plan are FAILED, if so,
845845
// we abort:
846846
std::string planPath =
@@ -867,14 +867,8 @@ JOB_STATUS MoveShard::pendingFollower() {
867867
}
868868
});
869869

870-
if (done < shardsLikeMe.size()) {
871-
// Not yet all in sync, consider timeout:
872-
std::string timeCreatedString =
873-
_snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
874-
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
875-
Supervision::TimePoint now(std::chrono::system_clock::now());
876-
if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
877-
abort("MoveShard timed out in pending follower");
870+
if (done < shardsLikeMe.size()) { // Consider cancellation
871+
if (considerCancellation()) {
878872
return FAILED;
879873
}
880874
return PENDING;
@@ -981,7 +975,7 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
981975
}
982976
}
983977

984-
if (finish("", "", true, "job aborted (1): " + reason, todoPrec)) {
978+
if (finish("", "", false, "job aborted (1): " + reason, todoPrec)) {
985979
return result;
986980
}
987981
_status = PENDING;

0 commit comments

Comments
 (0)
0