Handle a few forgotten cases of already resigned leaders. by neunhoef · Pull Request #12443 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

Handle a few forgotten cases of already resigned leaders. #12443

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 18, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading

Uh oh!

There was an error while loading. Please reload this page.

Diff view
Diff view
Next Next commit
Handle a few forgotten cases of already resigned leaders.
In a failover, cleanout, or resign-leadership situation, it is possible
that Current contains an already resigned leader. This needs to be
taken into account when deciding which server to fail over to or to
move a shard to. This fixes incorrect behaviour in corner cases.
  • Loading branch information
neunhoef committed Aug 17, 2020
commit d4433b2c4339c04100d3e45e3baaabd9fa7e0623
2 changes: 1 addition & 1 deletion arangod/Agency/CleanOutServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ bool CleanOutServer::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
if (isLeader) {

std::string toServer = Job::findNonblockedCommonHealthyInSyncFollower(
_snapshot, database.first, collptr.first, shard.first);
_snapshot, database.first, collptr.first, shard.first, _server);

MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
Expand Down
10 changes: 9 additions & 1 deletion arangod/Agency/FailedLeader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ bool FailedLeader::start(bool& aborts) {

// Get healthy in Sync follower common to all prototype + clones
auto commonHealthyInSync =
findNonblockedCommonHealthyInSyncFollower(_snapshot, _database, _collection, _shard);
findNonblockedCommonHealthyInSyncFollower(_snapshot, _database, _collection, _shard, _from);
if (commonHealthyInSync.empty()) {
return false;
} else {
Expand Down Expand Up @@ -273,9 +273,17 @@ bool FailedLeader::start(bool& aborts) {
{
VPackArrayBuilder servers(&ns);
ns.add(VPackValue(_to));
// We prefer servers that are in sync and want to put them early in the
// new Plan (behind the leader). That way, RemoveFollower prefers to
// remove other servers first.
for (auto const& i : VPackArrayIterator(current)) {
std::string s = i.copyString();
if (s.size() > 0 && s[0] == '_') {
s = s.substr(1);
}
if (s != _from && s != _to) {
TRI_ASSERT(std::find(planv.begin(), planv.end(), s) != planv.end());
// A server in Current ought to be in the Plan; if it is not, we want to know about it.
ns.add(i);
planv.erase(std::remove(planv.begin(), planv.end(), s), planv.end());
}
Expand Down
13 changes: 8 additions & 5 deletions arangod/Agency/Job.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,11 @@ std::vector<Job::shard_t> Job::clones(Node const& snapshot, std::string const& d
}

std::string Job::findNonblockedCommonHealthyInSyncFollower( // Which is in "GOOD" health
Node const& snap, std::string const& db, std::string const& col, std::string const& shrd) {
Node const& snap, std::string const& db, std::string const& col, std::string const& shrd,
std::string const& serverToAvoid) {
// serverToAvoid is the leader for which we are seeking a replacement. Note that
// it is not a given that this server is the first one in Current/servers or
// Current/failoverCandidates.
auto cs = clones(snap, db, col, shrd); // clones
auto nclones = cs.size(); // #clones
std::unordered_map<std::string, bool> good;
Expand Down Expand Up @@ -501,13 +505,12 @@ std::string Job::findNonblockedCommonHealthyInSyncFollower( // Which is in "GOO
// Guaranteed by if above
TRI_ASSERT(serverList.isArray());

size_t i = 0;
for (const auto& server : VPackArrayIterator(serverList)) {
if (i++ == 0) {
// Skip leader
auto id = server.copyString();
if (id == serverToAvoid) {
// Skip current leader for which we are seeking a replacement
continue;
}
auto id = server.copyString();

if (!good[id]) {
// Skip unhealthy servers
Expand Down
3 changes: 2 additions & 1 deletion arangod/Agency/Job.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ struct Job {
static std::string findNonblockedCommonHealthyInSyncFollower(Node const& snap,
std::string const& db,
std::string const& col,
std::string const& shrd);
std::string const& shrd,
std::string const& serverToAvoid);

JOB_STATUS _status;
Node const& _snapshot;
Expand Down
2 changes: 1 addition & 1 deletion arangod/Agency/ResignLeadership.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ bool ResignLeadership::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
if (isLeader) {

std::string toServer = Job::findNonblockedCommonHealthyInSyncFollower(
_snapshot, database.first, collptr.first, shard.first);
_snapshot, database.first, collptr.first, shard.first, _server);

if (toServer.empty()) {
continue ; // can not resign from that shard
Expand Down
0