Handle a few forgotten cases of already resigned leaders. (#12443) · adamjm/arangodb@bfd68fd · GitHub
[go: up one dir, main page]

Skip to content

Commit bfd68fd

Browse files
authored
Handle a few forgotten cases of already resigned leaders. (arangodb#12443)
* Handle a few forgotten cases of already resigned leaders. In a failover, cleanout, or resign-leadership situation, it is possible that Current contains an already resigned leader. This needs to be taken into account when deciding which server to fail over to or to move to. This fixes bad behaviour in corner cases.
* CHANGELOG.
1 parent 2cf3536 commit bfd68fd

File tree

6 files changed

+24
-9
lines changed

6 files changed

+24
-9
lines changed

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
devel
22
-----
33

4+
* Fixed bad behaviour in agency supervision in some corner cases involving
5+
already resigned leaders in Current.
6+
47
* Fixed a problem with potentially lost updates because a failover could
58
happen at a wrong time or a restarted leader could come back at an
69
unlucky time.

arangod/Agency/CleanOutServer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ bool CleanOutServer::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
402402
if (isLeader) {
403403

404404
std::string toServer = Job::findNonblockedCommonHealthyInSyncFollower(
405-
_snapshot, database.first, collptr.first, shard.first);
405+
_snapshot, database.first, collptr.first, shard.first, _server);
406406

407407
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
408408
_jobId, database.first, collptr.first, shard.first, _server,

arangod/Agency/FailedLeader.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ bool FailedLeader::start(bool& aborts) {
190190

191191
// Get healthy in Sync follower common to all prototype + clones
192192
auto commonHealthyInSync =
193-
findNonblockedCommonHealthyInSyncFollower(_snapshot, _database, _collection, _shard);
193+
findNonblockedCommonHealthyInSyncFollower(_snapshot, _database, _collection, _shard, _from);
194194
if (commonHealthyInSync.empty()) {
195195
return false;
196196
} else {
@@ -272,9 +272,17 @@ bool FailedLeader::start(bool& aborts) {
272272
{
273273
VPackArrayBuilder servers(&ns);
274274
ns.add(VPackValue(_to));
275+
// We prefer servers in sync and want to put them early in the new Plan
276+
// (behind the leader). This helps so that RemoveFollower prefers others
277+
// to remove.
275278
for (auto const& i : VPackArrayIterator(current)) {
276279
std::string s = i.copyString();
280+
if (s.size() > 0 && s[0] == '_') {
281+
s = s.substr(1);
282+
}
277283
if (s != _from && s != _to) {
284+
TRI_ASSERT(std::find(planv.begin(), planv.end(), s) != planv.end());
285+
// A server in Current ought to be in the Plan, if not, we want to know this.
278286
ns.add(i);
279287
planv.erase(std::remove(planv.begin(), planv.end(), s), planv.end());
280288
}

arangod/Agency/Job.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,11 @@ std::vector<Job::shard_t> Job::clones(Node const& snapshot, std::string const& d
459459
}
460460

461461
std::string Job::findNonblockedCommonHealthyInSyncFollower( // Which is in "GOOD" health
462-
Node const& snap, std::string const& db, std::string const& col, std::string const& shrd) {
462+
Node const& snap, std::string const& db, std::string const& col, std::string const& shrd,
463+
std::string const& serverToAvoid) {
464+
// serverToAvoid is the leader for which we are seeking a replacement. Note that
465+
// it is not a given that this server is the first one in Current/servers or
466+
// Current/failoverCandidates.
463467
auto cs = clones(snap, db, col, shrd); // clones
464468
auto nclones = cs.size(); // #clones
465469
std::unordered_map<std::string, bool> good;
@@ -501,13 +505,12 @@ std::string Job::findNonblockedCommonHealthyInSyncFollower( // Which is in "GOO
501505
// Guaranteed by if above
502506
TRI_ASSERT(serverList.isArray());
503507

504-
size_t i = 0;
505508
for (const auto& server : VPackArrayIterator(serverList)) {
506-
if (i++ == 0) {
507-
// Skip leader
509+
auto id = server.copyString();
510+
if (id == serverToAvoid) {
511+
// Skip current leader for which we are seeking a replacement
508512
continue;
509513
}
510-
auto id = server.copyString();
511514

512515
if (!good[id]) {
513516
// Skip unhealthy servers

arangod/Agency/Job.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ struct Job {
156156
static std::string findNonblockedCommonHealthyInSyncFollower(Node const& snap,
157157
std::string const& db,
158158
std::string const& col,
159-
std::string const& shrd);
159+
std::string const& shrd,
160+
std::string const& serverToAvoid);
160161

161162
JOB_STATUS _status;
162163
Node const& _snapshot;

arangod/Agency/ResignLeadership.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ bool ResignLeadership::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
390390
if (isLeader) {
391391

392392
std::string toServer = Job::findNonblockedCommonHealthyInSyncFollower(
393-
_snapshot, database.first, collptr.first, shard.first);
393+
_snapshot, database.first, collptr.first, shard.first, _server);
394394

395395
if (toServer.empty()) {
396396
continue ; // can not resign from that shard

0 commit comments

Comments
 (0)
0