Fix an agency supervision bug. (#11356) · arangodb/arangodb@485e14a · GitHub

Commit 485e14a

Fix an agency supervision bug. (#11356)
* Fix a bug in server state detection after leader change.
* CHANGELOG.
* Fix failoverCandidates bug.
1 parent a17ef48 commit 485e14a

File tree

3 files changed: +76 -27 lines changed


CHANGELOG

Lines changed: 6 additions & 0 deletions
@@ -1,6 +1,12 @@
 devel
 -----
 
+* Fixed a bug in the agency supervision, which ignored the `failoverCandidates`
+  field.
+
+* Fixed a bug in the agency supervision, which could declare an already FAILED
+  dbserver temporarily as GOOD again after an agency leader change.
+
 * Added INTERLEAVE AQL function.
 
 * Upgraded bundled RocksDB library to version 6.8.0.

arangod/Agency/Job.cpp

Lines changed: 1 addition & 1 deletion
@@ -471,7 +471,7 @@ std::string Job::findNonblockedCommonHealthyInSyncFollower( // Which is in "GOO
   auto sharedPath = db + "/" + clone.collection + "/";
   auto currentShardPath = curColPrefix + sharedPath + clone.shard + "/servers";
   auto currentFailoverCandidatesPath =
-      curColPrefix + sharedPath + clone.shard + "/servers";
+      curColPrefix + sharedPath + clone.shard + "/failoverCandidates";
   auto plannedShardPath = planColPrefix + sharedPath + "shards/" + clone.shard;
 
   // start up race condition ... current might not have everything in plan
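The change above is a one-token fix: the `failoverCandidates` path had been built with the `/servers` suffix, so it pointed at the same agency key as `currentShardPath` and the dedicated failover candidate list was never actually read. Below is a minimal, self-contained sketch of that collision; the prefix and identifier values are made up for illustration and are not taken from the ArangoDB source.

// Sketch only: shows how the pre-fix suffix made the two paths identical.
#include <cassert>
#include <iostream>
#include <string>

int main() {
  // Hypothetical values, chosen only for illustration.
  std::string const curColPrefix = "Current/Collections/";
  std::string const db = "_system";
  std::string const collection = "1010";
  std::string const shard = "s2020";
  std::string const sharedPath = db + "/" + collection + "/";

  std::string const currentShardPath = curColPrefix + sharedPath + shard + "/servers";

  // Pre-fix construction: same "/servers" suffix, so the path collides.
  std::string const beforeFix = curColPrefix + sharedPath + shard + "/servers";
  // Post-fix construction: reads the dedicated failoverCandidates key.
  std::string const afterFix = curColPrefix + sharedPath + shard + "/failoverCandidates";

  assert(beforeFix == currentShardPath);  // the bug: candidates path == servers path
  assert(afterFix != currentShardPath);   // the fix: a distinct key is consulted
  std::cout << "servers:            " << currentShardPath << "\n"
            << "failoverCandidates: " << afterFix << "\n";
  return 0;
}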

arangod/Agency/Supervision.cpp

Lines changed: 69 additions & 26 deletions
@@ -588,30 +588,66 @@ std::vector<check_t> Supervision::check(std::string const& type) {
   HealthRecord persist(shortName, endpoint, hostId, engine, serverVersion, externalEndpoint);
 
   // Get last health entries from transient and persistent key value stores
+  bool transientHealthRecordFound = true;
   if (_transient.has(healthPrefix + serverID)) {
     transist = _transient.hasAsNode(healthPrefix + serverID).first;
+  } else {
+    // In this case this is the first time we look at this server during our
+    // new leadership. So we do not touch the persisted health record and
+    // only create a new transient health record.
+    transientHealthRecordFound = false;
   }
   if (snapshot().has(healthPrefix + serverID)) {
     persist = snapshot().hasAsNode(healthPrefix + serverID).first;
   }
 
+  // Here is an important subtlety: We will derive the health status of this
+  // server a bit further down by looking at the time when we saw the last
+  // heartbeat. The heartbeat is stored in transient in `Sync/ServerStates`.
+  // It has a timestamp, however, this was taken on the other server, so we
+  // cannot trust this time stamp because of possible clock skew. Since we
+  // do not actually know when this heartbeat came in, we have to proceed as
+  // follows: We make a copy of the `syncTime` and store it into the health
+  // record in transient `Supervision/Health`. If we detect a difference, it
+  // is the first time we have seen the new heartbeat and we can then take
+  // a reading of our local system clock and use that time. However, if we
+  // do not yet have a previous reading in our transient health record,
+  // we must not touch the persisted health record at all. This is what the
+  // flag `transientHealthRecordFound` means.
+
   // New health record (start with old add current information from sync)
   // Sync.time is copied to Health.syncTime
   // Sync.status is copied to Health.syncStatus
-  std::string syncTime =
-      _transient.has(syncPrefix + serverID)
-          ? _transient.hasAsString(syncPrefix + serverID + "/time").first
-          : timepointToString(std::chrono::system_clock::time_point());
-  std::string syncStatus =
-      _transient.has(syncPrefix + serverID)
-          ? _transient.hasAsString(syncPrefix + serverID + "/status").first
-          : "UNKNOWN";
-
-  // Last change registered in sync (transient != sync)
-  // Either now or value in transient
-  auto lastAckedTime = (syncTime != transist.syncTime)
-                           ? startTimeLoop
-                           : stringToTimepoint(transist.lastAcked);
+  std::string syncTime;
+  std::string syncStatus;
+  bool heartBeatVisible = _transient.has(syncPrefix + serverID);
+  if (heartBeatVisible) {
+    syncTime = _transient.hasAsString(syncPrefix + serverID + "/time").first;
+    syncStatus = _transient.hasAsString(syncPrefix + serverID + "/status").first;
+  } else {
+    syncTime = timepointToString(std::chrono::system_clock::time_point());  // beginning of time
+    syncStatus = "UNKNOWN";
+  }
+
+  // Compute the time when we last discovered a new heartbeat from that server:
+  std::chrono::system_clock::time_point lastAckedTime;
+  if (heartBeatVisible) {
+    if (transientHealthRecordFound) {
+      lastAckedTime = (syncTime != transist.syncTime)
+                          ? startTimeLoop
+                          : stringToTimepoint(transist.lastAcked);
+    } else {
+      // in this case we do not really know when this heartbeat came in,
+      // however, it must have been after we became a leader, and since we
+      // do not have a transient health record yet, we just assume that we
+      // got it recently and set it to "now". Note that this will not make
+      // a FAILED server "GOOD" again, since we do not touch the persisted
+      // health record if we do not have a transient health record yet.
+      lastAckedTime = startTimeLoop;
+    }
+  } else {
+    lastAckedTime = std::chrono::system_clock::time_point();
+  }
   transist.lastAcked = timepointToString(lastAckedTime);
   transist.syncTime = syncTime;
   transist.syncStatus = syncStatus;
@@ -623,34 +659,41 @@ std::vector<check_t> Supervision::check(std::string const& type) {
   transist.hostId = hostId;
   transist.endpoint = endpoint;
 
-  // Calculate elapsed since lastAcked
-  auto elapsed = std::chrono::duration<double>(startTimeLoop - lastAckedTime);
+  // We have now computed a new transient health record under all circumstances.
+
+  bool changed;
+  if (transientHealthRecordFound) {
+    // Calculate elapsed since lastAcked
+    auto elapsed = std::chrono::duration<double>(startTimeLoop - lastAckedTime);
 
-  if (elapsed.count() <= _okThreshold) {
-    transist.status = Supervision::HEALTH_STATUS_GOOD;
-  } else if (elapsed.count() <= _gracePeriod) {
-    transist.status = Supervision::HEALTH_STATUS_BAD;
+    if (elapsed.count() <= _okThreshold) {
+      transist.status = Supervision::HEALTH_STATUS_GOOD;
+    } else if (elapsed.count() <= _gracePeriod) {
+      transist.status = Supervision::HEALTH_STATUS_BAD;
+    } else {
+      transist.status = Supervision::HEALTH_STATUS_FAILED;
+    }
+
+    // Status changed?
+    changed = transist.statusDiff(persist);
   } else {
-    transist.status = Supervision::HEALTH_STATUS_FAILED;
+    transist.status = persist.status;
+    changed = false;
   }
 
-  // Status changed?
-  bool changed = transist.statusDiff(persist);
-
   // Take necessary actions if any
   std::shared_ptr<VPackBuilder> envelope;
   if (changed) {
     LOG_TOPIC("bbbde", DEBUG, Logger::SUPERVISION)
         << "Status of server " << serverID << " has changed from "
         << persist.status << " to " << transist.status;
     handleOnStatus(_agent, snapshot(), persist, transist, serverID, _jobId, envelope);
+    persist = transist;  // Now copy Status, SyncStatus from transient to persisted
   } else {
     LOG_TOPIC("44253", TRACE, Logger::SUPERVISION)
         << "Health of server " << machine.first << " remains " << transist.status;
   }
 
-  persist = transist;  // Now copy Status, SyncStatus from transient to persisted
-
   // Transient report
   std::shared_ptr<Builder> tReport = std::make_shared<Builder>();
   {