8000 Fix an agency supervision bug. (#11356) · arangodb/arangodb@485e14a · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 485e14a

Browse files
authored
Fix an agency supervision bug. (#11356)
* Fix a bug in server state detection after leader change. * CHANGELOG. * Fix failoverCandidates bug.
1 parent a17ef48 commit 485e14a

File tree

3 files changed

+76
-27
lines changed

3 files changed

+76
-27
lines changed

CHANGELOG

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
devel
22
-----
33

4+
* Fixed a bug in the agency supervision, which ignored the `failoverCandidates`
5+
field.
6+
7+
* Fixed a bug in the agency supervision, which could declare an already FAILED
8+
dbserver temporarily as GOOD again after an agency leader change.
9+
410
* Added INTERLEAVE AQL function.
511

612
* Upgraded bundled RocksDB library to version 6.8.0.

arangod/Agency/Job.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ std::string Job::findNonblockedCommonHealthyInSyncFollower( // Which is in "GOO
471471
auto sharedPath = db + "/" + clone.collection + "/";
472472
auto currentShardPath = curColPrefix + sharedPath + clone.shard + "/servers";
473473
auto currentFailoverCandidatesPath =
474-
curColPrefix + sharedPath + clone.shard + "/servers";
474+
curColPrefix + sharedPath + clone.shard + "/failoverCandidates";
475475
auto plannedShardPath = planColPrefix + sharedPath + "shards/" + clone.shard;
476476

477477
// start up race condition ... current might not have everything in plan

arangod/Agency/Supervision.cpp

Lines changed: 69 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -588,30 +588,66 @@ std::vector<check_t> Supervision::check(std::string const& type) {
588588
HealthRecord persist(shortName, endpoint, hostId, engine, serverVersion, externalEndpoint);
589589

590590
// Get last health entries from transient and persistent key value stores
591+
bool transientHealthRecordFound = true;
591592
if (_transient.has(healthPrefix + serverID)) {
592593
transist = _transient.hasAsNode(healthPrefix + serverID).first;
594+
} else {
595+
// In this case this is the first time we look at this server during our
596+
// new leadership. So we do not touch the persisted health record and
597+
// only create a new transient health record.
598+
transientHealthRecordFound = false;
593599
}
594600
if (snapshot().has(healthPrefix + serverID)) {
595601
persist = snapshot().hasAsNode(healthPrefix + serverID).first;
596602
}
597603

604+
// Here is an important subtlety: We will derive the health status of this
605+
// server a bit further down by looking at the time when we saw the last
606+
// heartbeat. The heartbeat is stored in transient in `Sync/ServerStates`.
607+
// It has a timestamp, however, this was taken on the other server, so we
608+
// cannot trust this time stamp because of possible clock skew. Since we
609+
// do not actually know when this heartbeat came in, we have to proceed as
610+
// follows: We make a copy of the `syncTime` and store it into the health
611+
// record in transient `Supervision/Health`. If we detect a difference, it
612+
// is the first time we have seen the new heartbeat and we can then take
613+
// a reading of our local system clock and use that time. However, if we
614+
// do not yet have a previous reading in our transient health record,
615+
// we must not touch the persisted health record at all. This is what the
616+
// flag `transientHealthRecordFound` means.
617+
598618
// New health record (start with old add current information from sync)
599619
// Sync.time is copied to Health.syncTime
600620
// Sync.status is copied to Health.syncStatus
601-
std::string syncTime =
602-
_transient.has(syncPrefix + serverID)
603-
? _transient.hasAsString(syncPrefix + serverID + "/time").first
604-
: timepointToString(std::chrono::system_clock::time_point());
605-
std::string syncStatus =
606-
_transient.has(syncPrefix + serverID)
607-
? _transient.hasAsString(syncPrefix + serverID + "/status").first
608-
: "UNKNOWN";
609-
610-
// Last change registered in sync (transient != sync)
611-
// Either now or value in transient
612-
auto lastAckedTime = (syncTime != transist.syncTime)
613-
? startTimeLoop
614-
: stringToTimepoint(transist.lastAcked);
621+
std::string syncTime;
622+
std::string syncStatus;
623+
bool heartBeatVisible = _transient.has(syncPrefix + serverID);
624+
if (heartBeatVisible) {
625+
syncTime = _transient.hasAsString(syncPrefix + serverID + "/time").first;
626+
syncStatus = _transient.hasAsString(syncPrefix + serverID + "/status").first;
627+
} else {
628+
syncTime = timepointToString(std::chrono::system_clock::time_point()); // beginning of time
629+
syncStatus = "UNKNOWN";
630+
}
631+
632+
// Compute the time when we last discovered a new heartbeat from that server:
633+
std::chrono::system_clock::time_point lastAckedTime;
634+
if (heartBeatVisible) {
635+
if (transientHealthRecordFound) {
636+
lastAckedTime = (syncTime != transist.syncTime)
637+
? startTimeLoop
638+
: stringToTimepoint(transist.lastAcked);
639+
} else {
640+
// in this case we do no really know when this heartbeat came in,
641+
// however, it must have been after we became a leader, and since we
642+
// do not have a transient health record yet, we just assume that we
643+
// got it recently and set it to "now". Note that this will not make
644+
// a FAILED server "GOOD" again, since we do not touch the persisted
645+
// healt 6D4E h record if we do not have a transient health record yet.
646+
lastAckedTime = startTimeLoop;
647+
}
648+
} else {
649+
lastAckedTime = std::chrono::system_clock::time_point();
650+
}
615651
transist.lastAcked = timepointToString(lastAckedTime);
616652
transist.syncTime = syncTime;
617653
transist.syncStatus = syncStatus;
@@ -623,34 +659,41 @@ std::vector<check_t> Supervision::check(std::string const& type) {
623659
transist.hostId = hostId;
624660
transist.endpoint = endpoint;
625661

626-
// Calculate elapsed since lastAcked
627-
auto elapsed = std::chrono::duration<double>(startTimeLoop - lastAckedTime);
662+
// We have now computed a new transient health record under all circumstances.
663+
664+
bool changed;
665+
if (transientHealthRecordFound) {
666+
// Calculate elapsed since lastAcked
667+
auto elapsed = std::chrono::duration<double>(startTimeLoop - lastAckedTime);
628668

629-
if (elapsed.count() <= _okThreshold) {
630-
transist.status = Supervision::HEALTH_STATUS_GOOD;
631-
} else if (elapsed.count() <= _gracePeriod) {
632-
transist.status = Supervision::HEALTH_STATUS_BAD;
669+
if (elapsed.count() <= _okThreshold) {
670+
transist.status = Supervision::HEALTH_STATUS_GOOD;
671+
} else if (elapsed.count() <= _gracePeriod) {
672+
transist.status = Supervision::HEALTH_STATUS_BAD;
673+
} else {
674+
transist.status = Supervision::HEALTH_STATUS_FAILED;
675+
}
676+
677+
// Status changed?
678+
changed = transist.statusDiff(persist);
633679
} else {
634-
transist.status = Supervision::HEALTH_STATUS_FAILED;
680+
transist.status = persist.status;
681+
changed = false;
635682
}
636683

637-
// Status changed?
638-
bool changed = transist.statusDiff(persist);
639-
640684
// Take necessary actions if any
641685
std::shared_ptr<VPackBuilder> envelope;
642686
if (changed) {
643687
LOG_TOPIC("bbbde", DEBUG, Logger::SUPERVISION)
644688
<< "Status of server " << serverID << " has changed from "
645689
<< persist.status << " to " << transist.status;
646690
handleOnStatus(_agent, snapshot(), persist, transist, serverID, _jobId, envelope);
691+
persist = transist; // Now copy Status, SyncStatus from transient to persited
647692
} else {
648693
LOG_TOPIC("44253", TRACE, Logger::SUPERVISION)
649694
<< "Health of server " << machine.first << " remains " << transist.status;
650695
}
651696

652-
persist = transist; // Now copy Status, SyncStatus from transient to persited
653-
654697
// Transient report
655698
std::shared_ptr<Builder> tReport = std::make_shared<Builder>();
656699
{

0 commit comments

Comments
 (0)
0