8000 [CINFRA-203] CommitFailReason Leadership (#15483) · strogo/arangodb@825f8fc · GitHub
[go: up one dir, main page]

Skip to content

Commit 825f8fc

Browse files
Lars Maiermarkuspf
andauthored
[CINFRA-203] CommitFailReason Leadership (arangodb#15483)
* Added new CommitFailReason + report to agency when leadership not established. * Reformat code. * Fixing bug in serialization. * Added test to js. * Update arangod/Replication2/ReplicatedLog/Algorithms.cpp Co-authored-by: Markus Pfeiffer <markuspf@users.noreply.github.com> Co-authored-by: Markus Pfeiffer <markuspf@users.noreply.github.com>
1 parent 56bdeb2 commit 825f8fc

File tree

10 files changed

+522
-266
lines changed

10 files changed

+522
-266
lines changed

arangod/Cluster/Maintenance.cpp

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1295,24 +1295,43 @@ static auto reportCurrentReplicatedLogLeader(
12951295
TRI_ASSERT(status.role ==
12961296
replication2::replicated_log::ParticipantRole::kLeader)
12971297
<< "expected participant with leader role";
1298-
if (status.leadershipEstablished) {
1299-
// must have a value after leadership has been established
1300-
TRI_ASSERT(status.committedParticipantsConfig != nullptr);
1301-
// check if either there is no entry in current yet, the term has changed or
1302-
// the participant config generation has changed.
1303-
bool const requiresUpdate =
1304-
currentLeader == nullptr ||
1298+
1299+
bool const requiresUpdate = std::invoke([&] {
1300+
// check if either there is no entry in current yet, the term has changed
1301+
// or the participant config generation has changed or if leadership was
1302+
// established in the meantime
1303+
if (currentLeader == nullptr ||
13051304
currentLeader->term != status.getCurrentTerm() ||
1306-
status.committedParticipantsConfig->generation !=
1307-
currentLeader->committedParticipantsConfig.generation;
1308-
1309-
if (requiresUpdate) {
1310-
replication2::agency::LogCurrent::Leader leader;
1311-
leader.term = *status.getCurrentTerm();
1312-
leader.serverId = serverId;
1313-
leader.committedParticipantsConfig = *status.committedParticipantsConfig;
1314-
return leader;
1305+
currentLeader->leadershipEstablished != status.leadershipEstablished ||
1306+
currentLeader->commitStatus != status.commitFailReason) {
1307+
return true;
1308+
}
1309+
1310+
// check if the committed participants config needs an update
1311+
if (status.committedParticipantsConfig != nullptr) {
1312+
if (currentLeader->committedParticipantsConfig.has_value()) {
1313+
return currentLeader->committedParticipantsConfig->generation !=
1314+
status.committedParticipantsConfig->generation;
1315+
}
1316+
return true;
1317+
}
1318+
1319+
return false;
1320+
});
1321+
1322+
if (requiresUpdate) {
1323+
std::optional<arangodb::replication2::ParticipantsConfig>
1324+
committedParticipantsConfig;
1325+
if (status.committedParticipantsConfig != nullptr) {
1326+
committedParticipantsConfig = *status.committedParticipantsConfig;
13151327
}
1328+
replication2::agency::LogCurrent::Leader leader;
1329+
leader.term = *status.getCurrentTerm();
1330+
leader.serverId = serverId;
1331+
leader.leadershipEstablished = status.leadershipEstablished;
1332+
leader.commitStatus = status.commitFailReason;
1333+
leader.committedParticipantsConfig = std::move(committedParticipantsConfig);
1334+
return leader;
13161335
}
13171336

13181337
return std::nullopt;

arangod/Replication2/ReplicatedLog/AgencyLogSpecification.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,15 +259,31 @@ auto LogCurrent::Leader::toVelocyPack(VPackBuilder& builder) const -> void {
259259
VPackObjectBuilder ob(&builder);
260260
builder.add(StaticStrings::Term, VPackValue(term));
261261
builder.add(StaticStrings::ServerId, VPackValue(serverId));
262-
builder.add(VPackValue(StringCommittedParticipantsConfig));
263-
committedParticipantsConfig.toVelocyPack(builder);
262+
if (committedParticipantsConfig) {
263+
builder.add(VPackValue(StringCommittedParticipantsConfig));
264+
committedParticipantsConfig->toVelocyPack(builder);
265+
}
266+
builder.add("leadershipEstablished", VPackValue(leadershipEstablished));
267+
if (commitStatus) {
268+
builder.add(VPackValue("commitStatus"));
269+
commitStatus->toVelocyPack(builder);
270+
}
264271
}
265272

266273
auto LogCurrent::Leader::fromVelocyPack(VPackSlice s) -> Leader {
267274
auto leader = LogCurrent::Leader{};
268275
leader.term = s.get(StaticStrings::Term).extract<LogTerm>();
269276
leader.serverId = s.get(StaticStrings::ServerId).copyString();
270-
leader.committedParticipantsConfig = ParticipantsConfig::fromVelocyPack(
271-
s.get(StringCommittedParticipantsConfig));
277+
leader.leadershipEstablished = s.get("leadershipEstablished").isTrue();
278+
if (auto commitStatusSlice = s.get("commitStatus");
279+
!commitStatusSlice.isNone()) {
280+
leader.commitStatus =
281+
replicated_log::CommitFailReason::fromVelocyPack(commitStatusSlice);
282+
}
283+
if (auto configSlice = s.get(StringCommittedParticipantsConfig);
284+
!configSlice.isNone()) {
285+
leader.committedParticipantsConfig =
286+
ParticipantsConfig::fromVelocyPack(configSlice);
287+
}
272288
return leader;
273289
}

arangod/Replication2/ReplicatedLog/AgencyLogSpecification.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,13 +137,17 @@ struct LogCurrent {
137137
struct Leader {
138138
ParticipantId serverId;
139139
LogTerm term;
140-
ParticipantsConfig committedParticipantsConfig;
140+
// optional because the leader might not have committed anything
141+
std::optional<ParticipantsConfig> committedParticipantsConfig;
142+
bool leadershipEstablished;
143+
// will be set after 5s if leader is unable to establish leadership
144+
std::optional<replicated_log::CommitFailReason> commitStatus;
141145

142146
auto toVelocyPack(VPackBuilder&) const -> void;
143147
static auto fromVelocyPack(VPackSlice) -> Leader;
144148
};
145149

146-
// Will be nullopt until leadership has been established
150+
// Will be nullopt until a leader has been assumed leadership
147151
std::optional<Leader> leader;
148152

149153
auto toVelocyPack(VPackBuilder&) const -> void;

arangod/Replication2/ReplicatedLog/Algorithms.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -528,12 +528,25 @@ auto algorithms::calculateCommitIndex(
528528
}
529529
}
530530

531-
// This happens when all servers are either excluded or failed;
531+
// This happens when too many servers are either excluded or failed;
532532
// this certainly means we could not reach a quorum;
533533
// indexes cannot be empty because this particular case would've been handled
534534
// above by comparing actualWriteConcern to 0;
535+
CommitFailReason::NonEligibleServerRequiredForQuorum::CandidateMap candidates;
536+
for (auto const& p : indexes) {
537+
if (p.isFailed()) {
538+
candidates.emplace(
539+
p.id, CommitFailReason::NonEligibleServerRequiredForQuorum::kFailed);
540+
} else if (p.isExcluded()) {
541+
candidates.emplace(
542+
p.id,
543+
CommitFailReason::NonEligibleServerRequiredForQuorum::kExcluded);
544+
}
545+
}
546+
535547
TRI_ASSERT(!indexes.empty());
536-
auto const& who = indexes.front().id;
537-
return {
538-
currentCommitIndex, CommitFailReason::withQuorumSizeNotReached(who), {}};
548+
return {currentCommitIndex,
549+
CommitFailReason::withNonEligibleServerRequiredForQuorum(
550+
std::move(candidates)),
551+
{}};
539552
}

arangod/Replication2/ReplicatedLog/LogCommon.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,14 +381,26 @@ auto replicated_log::CommitFailReason::withForcedParticipantNotInQuorum(
381381
ForcedParticipantNotInQuorum{std::move(who)});
382382
}
383383

384+
auto replicated_log::CommitFailReason::withNonEligibleServerRequiredForQuorum(
385+
NonEligibleServerRequiredForQuorum::CandidateMap candidates) noexcept
386+
-> CommitFailReason {
387+
return CommitFailReason(
388+
std::in_place, NonEligibleServerRequiredForQuorum{std::move(candidates)});
389+
}
390+
384391
namespace {
385392
inline constexpr std::string_view ReasonFieldName = "reason";
386393
inline constexpr std::string_view NothingToCommitEnum = "NothingToCommit";
387394
inline constexpr std::string_view QuorumSizeNotReachedEnum =
388395
"QuorumSizeNotReached";
389396
inline constexpr std::string_view ForcedParticipantNotInQuorumEnum =
390397
"ForcedParticipantNotInQuorum";
398+
inline constexpr std::string_view NonEligibleServerRequiredForQuorumEnum =
399+
"NonEligibleServerRequiredForQuorum";
391400
inline constexpr std::string_view WhoFieldName = "who";
401+
inline constexpr std::string_view CandidatesFieldName = "candidates";
402+
inline constexpr std::string_view NonEligibleFailed = "failed";
403+
inline constexpr std::string_view NonEligibleExcluded = "excluded";
392404
} // namespace
393405

394406
auto replicated_log::CommitFailReason::NothingToCommit::fromVelocyPack(
@@ -449,6 +461,51 @@ void replicated_log::CommitFailReason::ForcedParticipantNotInQuorum::
449461
builder.add(std::string_view(WhoFieldName), VPackValue(who));
450462
}
451463

464+
void replicated_log::CommitFailReason::NonEligibleServerRequiredForQuorum::
465+
toVelocyPack(velocypack::Builder& builder) const {
466+
VPackObjectBuilder obj(&builder);
467+
builder.add(ReasonFieldName,
468+
VPackValue(NonEligibleServerRequiredForQuorumEnum));
469+
builder.add(VPackValue(CandidatesFieldName));
470+
VPackObjectBuilder canObject(&builder);
471+
for (auto const& [p, why] : candidates) {
472+
builder.add(p, VPackValue(to_string(why)));
473+
}
474+
}
475+
476+
auto replicated_log::CommitFailReason::NonEligibleServerRequiredForQuorum::
477+
to_string(replicated_log::CommitFailReason::
478+
NonEligibleServerRequiredForQuorum::Why why) noexcept
479+
-> std::string_view {
480+
switch (why) {
481+
case kFailed:
482+
return NonEligibleFailed;
483+
case kExcluded:
484+
return NonEligibleExcluded;
485+
default:
486+
TRI_ASSERT(false);
487+
return "(unknown)";
488+
}
489+
}
490+
491+
auto replicated_log::CommitFailReason::NonEligibleServerRequiredForQuorum::
492+
fromVelocyPack(velocypack::Slice s) -> NonEligibleServerRequiredForQuorum {
493+
TRI_ASSERT(s.get(ReasonFieldName)
494+
.isEqualString(NonEligibleServerRequiredForQuorumEnum))
495+
<< "Expected string `" << NonEligibleServerRequiredForQuorumEnum
496+
<< "`, found: " << s.stringView();
497+
CandidateMap candidates;
498+
for (auto const& [key, value] :
499+
velocypack::ObjectIterator(s.get(CandidatesFieldName))) {
500+
if (value.isEqualString(NonEligibleFailed)) {
501+
candidates[key.copyString()] = kFailed;
502+
} else if (value.isEqualString(NonEligibleExcluded)) {
503+
candidates[key.copyString()] = kExcluded;
504+
}
505+
}
506+
return NonEligibleServerRequiredForQuorum{std::move(candidates)};
507+
}
508+
452509
auto replicated_log::CommitFailReason::fromVelocyPack(velocypack::Slice s)
453510
-> CommitFailReason {
454511
auto reason = s.get(ReasonFieldName).stringView();
@@ -460,6 +517,9 @@ auto replicated_log::CommitFailReason::fromVelocyPack(velocypack::Slice s)
460517
} else if (reason == ForcedParticipantNotInQuorumEnum) {
461518
return CommitFailReason{std::in_place,
462519
ForcedParticipantNotInQuorum::fromVelocyPack(s)};
520+
} else if (reason == NonEligibleServerRequiredForQuorumEnum) {
521+
return CommitFailReason{
522+
std::in_place, NonEligibleServerRequiredForQuorum::fromVelocyPack(s)};
463523
} else {
464524
THROW_ARANGO_EXCEPTION_MESSAGE(
465525
TRI_ERROR_BAD_PARAMETER,
@@ -487,6 +547,16 @@ auto replicated_log::to_string(CommitFailReason const& r) -> std::string {
487547
-> std::string {
488548
return "Forced participant not in quorum. Participant " + reason.who;
489549
}
550+
auto operator()(
551+
CommitFailReason::NonEligibleServerRequiredForQuorum const& reason)
552+
-> std::string {
553+
auto result =
554+
std::string{"A non-eligible server is required to reach a quorum: "};
555+
for (auto const& [pid, why] : reason.candidates) {
556+
result += basics::StringUtils::concatT(" ", pid, ": ", why);
557+
}
558+
return result;
559+
}
490560
};
491561

492562
return std::visit(ToStringVisitor{}, r.value);

arangod/Replication2/ReplicatedLog/LogCommon.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,15 +380,37 @@ struct CommitFailReason {
380380
ForcedParticipantNotInQuorum const& right) noexcept
381381
-> bool = default;
382382
};
383+
struct NonEligibleServerRequiredForQuorum {
384+
enum Why {
385+
kExcluded,
386+
kFailed,
387+
};
388+
static auto to_string(Why) noexcept -> std::string_view;
389+
390+
using CandidateMap = std::unordered_map<ParticipantId, Why>;
391+
392+
CandidateMap candidates;
393+
394+
static auto fromVelocyPack(velocypack::Slice)
395+
-> NonEligibleServerRequiredForQuorum;
396+
void toVelocyPack(velocypack::Builder& builder) const;
397+
friend auto operator==(
398+
NonEligibleServerRequiredForQuorum const& left,
399+
NonEligibleServerRequiredForQuorum const& right) noexcept
400+
-> bool = default;
401+
};
383402
std::variant<NothingToCommit, QuorumSizeNotReached,
384-
ForcedParticipantNotInQuorum>
403+
ForcedParticipantNotInQuorum, NonEligibleServerRequiredForQuorum>
385404
value;
386405

387406
static auto withNothingToCommit() noexcept -> CommitFailReason;
388407
static auto withQuorumSizeNotReached(ParticipantId who) noexcept
389408
-> CommitFailReason;
390409
static auto withForcedParticipantNotInQuorum(ParticipantId who) noexcept
391410
-> CommitFailReason;
411+
static auto withNonEligibleServerRequiredForQuorum(
412+
NonEligibleServerRequiredForQuorum::CandidateMap) noexcept
413+
-> CommitFailReason;
392414

393415
static auto fromVelocyPack(velocypack::Slice) -> CommitFailReason;
394416
void toVelocyPack(velocypack::Builder& builder) const;

arangod/Replication2/ReplicatedLog/LogLeader.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,11 +509,16 @@ auto replicated_log::LogLeader::getQuickStatus() const -> QuickLogStatus {
509509
throw ParticipantResignedException(
510510
TRI_ERROR_REPLICATION_REPLICATED_LOG_LEADER_RESIGNED, ADB_HERE);
511511
}
512+
auto commitFailReason = std::optional<CommitFailReason>{};
513+
if (leaderData.calculateCommitLag() > std::chrono::seconds{20}) {
514+
commitFailReason = leaderData._lastCommitFailReason;
515+
}
512516
return QuickLogStatus{
513517
.role = ParticipantRole::kLeader,
514518
.term = term,
515519
.local = leaderData.getLocalStatistics(),
516520
.leadershipEstablished = leaderData._leadershipEstablished,
521+
.commitFailReason = commitFailReason,
517522
.activeParticipantsConfig = leaderData.activeParticipantsConfig,
518523
.committedParticipantsConfig =
519524
leaderData.committedParticipantsConfig};

arangod/Replication2/ReplicatedLog/LogStatus.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct QuickLogStatus {
4545
std::optional<LogTerm> term{};
4646
std::optional<LogStatistics> local{};
4747
bool leadershipEstablished{false};
48+
std::optional<CommitFailReason> commitFailReason{};
4849

4950
// The following make sense only for a leader.
5051
std::shared_ptr<ParticipantsConfig const> activeParticipantsConfig{};

tests/Replication2/ReplicatedLog/CalcCommitIndexTest.cpp

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,15 @@ TEST_F(CalcCommitIndexTest, includes_less_quorum_size) {
181181
participants, CalculateCommitIndexOptions{3, 3, 3}, LogIndex{1},
182182
LogIndex{50});
183183
EXPECT_EQ(index, expectedLogIndex);
184-
EXPECT_TRUE(std::holds_alternative<CommitFailReason::QuorumSizeNotReached>(
185-
reason.value));
184+
EXPECT_TRUE(
185+
std::holds_alternative<
186+
CommitFailReason::NonEligibleServerRequiredForQuorum>(reason.value));
187+
auto const& details =
188+
std::get<CommitFailReason::NonEligibleServerRequiredForQuorum>(
189+
reason.value);
190+
EXPECT_EQ(details.candidates.size(), 1);
191+
EXPECT_EQ(details.candidates.at("A"),
192+
CommitFailReason::NonEligibleServerRequiredForQuorum::kExcluded);
186193

187194
EXPECT_EQ(quorum.size(), 0);
188195
verifyQuorum(participants, quorum, expectedLogIndex);
@@ -230,8 +237,13 @@ TEST_F(CalcCommitIndexTest, all_excluded) {
230237
participants, CalculateCommitIndexOptions{3, 3, 3}, LogIndex{1},
231238
LogIndex{50});
232239
EXPECT_EQ(index, expectedLogIndex);
233-
EXPECT_TRUE(std::holds_alternative<CommitFailReason::QuorumSizeNotReached>(
234-
reason.value));
240+
EXPECT_TRUE(
241+
std::holds_alternative<
242+
CommitFailReason::NonEligibleServerRequiredForQuorum>(reason.value));
243+
auto const& details =
244+
std::get<CommitFailReason::NonEligibleServerRequiredForQuorum>(
245+
reason.value);
246+
EXPECT_EQ(details.candidates.size(), 3);
235247

236248
EXPECT_EQ(quorum.size(), 0);
237249
verifyQuorum(participants, quorum, expectedLogIndex);
@@ -521,6 +533,10 @@ TEST_F(CalcCommitIndexTest, who_all_failed_excluded) {
521533
participants, CalculateCommitIndexOptions{1, 1, 2}, LogIndex{1},
522534
LogIndex{50});
523535

524-
EXPECT_TRUE(reason == CommitFailReason::withQuorumSizeNotReached("A") ||
525-
reason == CommitFailReason::withQuorumSizeNotReached("B"));
536+
EXPECT_EQ(
537+
reason,
538+
(CommitFailReason::withNonEligibleServerRequiredForQuorum(
539+
{{"A", CommitFailReason::NonEligibleServerRequiredForQuorum::kFailed},
540+
{"B", CommitFailReason::NonEligibleServerRequiredForQuorum::
541+
kExcluded}})));
526542
}

0 commit comments

Comments
 (0)
0