8000 Bug fix 3.4/cleanout server dropping wrong follower (#6572) · arangodb/arangodb@b742224 · GitHub
[go: up one dir, main page]

Skip to content

Commit b742224

Browse files
Lars Maierneunhoef
authored andcommitted
Bug fix 3.4/cleanout server dropping wrong follower (#6572)
* Fixing bug: cleanoutServer will no longer add old leader as follower. * Fixed rollback.
1 parent c5b67d2 commit b742224

File tree

7 files changed

+89
-37
lines changed

7 files changed

+89
-37
lines changed

arangod/Agency/CleanOutServer.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,8 @@ bool CleanOutServer::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
389389
serversCopy.end());
390390
}
391391

392+
bool isLeader = (found == 0);
393+
392394
// Among those a random destination:
393395
std::string toServer;
394396
if (serversCopy.empty()) {
@@ -403,7 +405,7 @@ bool CleanOutServer::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
403405
// Schedule move into trx:
404406
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
405407
_jobId, database.first, collptr.first,
406-
shard.first, _server, toServer, found == 0)
408+
shard.first, _server, toServer, isLeader, false)
407409
.create(trx);
408410
}
409411
}

arangod/Agency/MoveShard.cpp

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,22 @@
3030
using namespace arangodb;
3131
using namespace arangodb::consensus;
3232

33+
MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
34+
std::string const& jobId, std::string const& creator,
35+
std::string const& database,
36+
std::string const& collection, std::string const& shard,
37+
std::string const& from, std::string const& to,
38+
bool isLeader, bool remainsFollower)
39+
: Job(NOTFOUND, snapshot, agent, jobId, creator),
40+
_database(database),
41+
_collection(collection),
42+
_shard(shard),
43+
_from(id(from)),
44+
_to(id(to)),
45+
_isLeader(isLeader), // will be initialized properly when information known
46+
_remainsFollower(remainsFollower)
47+
{ }
48+
3349
MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
3450
std::string const& jobId, std::string const& creator,
3551
std::string const& database,
@@ -42,7 +58,8 @@ MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
4258
_shard(shard),
4359
_from(id(from)),
4460
_to(id(to)),
45-
_isLeader(isLeader) // will be initialized properly when information known
61+
_isLeader(isLeader), // will be initialized properly when information known
62+
_remainsFollower(isLeader)
4663
{ }
4764

4865
MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
@@ -57,6 +74,7 @@ MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
5774
auto tmp_to = _snapshot.hasAsString(path + "toServer");
5875
auto tmp_shard = _snapshot.hasAsString(path + "shard");
5976
auto tmp_isLeader = _snapshot.hasAsSlice(path + "isLeader");
77+
auto tmp_remainsFollower = _snapshot.hasAsSlice(path + "remainsFollower");
6078
auto tmp_creator = _snapshot.hasAsString(path + "creator");
6179

6280
if (tmp_database.second && tmp_collection.second && tmp_from.second && tmp_to.second
@@ -67,6 +85,7 @@ MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
6785
_to = tmp_to.first;
6886
_shard = tmp_shard.first;
6987
_isLeader = tmp_isLeader.first.isTrue();
88+
_remainsFollower = tmp_remainsFollower.second ? tmp_remainsFollower.first.isTrue() : _isLeader;
7089
_creator = tmp_creator.first;
7190
} else {
7291
std::stringstream err;
@@ -130,6 +149,7 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
130149
_jb->add("fromServer", VPackValue(_from));
131150
_jb->add("toServer", VPackValue(_to));
132151
_jb->add("isLeader", VPackValue(_isLeader));
152+
_jb->add("remainsFollower", VPackValue(_remainsFollower));
133153
_jb->add("jobId", VPackValue(_jobId));
134154
_jb->add("timeCreated", VPackValue(now));
135155
}
@@ -276,6 +296,11 @@ bool MoveShard::start() {
276296
return false;
277297
}
278298

299+
if (!_isLeader && _remainsFollower) {
300+
finish("", "", false, "remainsFollower is invalid without isLeader");
301+
return false;
302+
}
303+
279304
// Compute group to move shards together:
280305
std::vector<Job::shard_t> shardsLikeMe
281306
= clones(_snapshot, _database, _collection, _shard);
@@ -354,7 +379,7 @@ bool MoveShard::start() {
354379
addPreconditionUnchanged(pending, planPath, planned);
355380
addPreconditionShardNotBlocked(pending, _shard);
356381
addPreconditionServerNotBlocked(pending, _to);
357-
addPreconditionServerHealth(pending, _to, "GOOD");
382+
addPreconditionServerHealth(pending, _to, "GOOD");
358383
addPreconditionUnchanged(pending, failedServersPrefix, failedServers);
359384
addPreconditionUnchanged(pending, cleanedPrefix, cleanedServers);
360385
} // precondition done
@@ -507,6 +532,7 @@ JOB_STATUS MoveShard::pendingLeader() {
507532
trx.add(srv);
508533
}
509534
}
535+
// add the old leader as follower in case of a rollback
510536
trx.add(VPackValue(_from));
511537
}
512538
// Precondition: Plan still as it was
@@ -545,7 +571,19 @@ JOB_STATUS MoveShard::pendingLeader() {
545571
{ VPackObjectBuilder trxObject(&trx);
546572
VPackObjectBuilder preObject(&pre);
547573
doForAllShards(_snapshot, _database, shardsLikeMe,
548-
[&pre](Slice plan, Slice current, std::string& planPath) {
574+
[&trx, &pre, this](Slice plan, Slice current, std::string& planPath) {
575+
if (!_remainsFollower) {
576+
// Remove _from from the list of follower
577+
trx.add(VPackValue(planPath));
578+
{ VPackArrayBuilder guard(&trx);
579+
for (auto const& srv : VPackArrayIterator(plan)) {
580+
if (!srv.isEqualString(_from)) {
581+
trx.add(srv);
582+
}
583+
}
584+
}
585+
}
586+
549587
// Precondition: Plan still as it was
550588
pre.add(VPackValue(planPath));
551589
{ VPackObjectBuilder guard(&pre);

arangod/Agency/MoveShard.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,17 @@ namespace arangodb {
3131
namespace consensus {
3232

3333
struct MoveShard : public Job {
34-
34+
35+
MoveShard(Node const& snapshot, AgentInterface* agent, std::string const& jobId,
36+
std::string const& creator,
37+
std::string const& database,
38+
std::string const& collection,
39+
std::string const& shard,
40+
std::string const& from,
41+
std::string const& to,
42+
bool isLeader,
43+
bool remainsFollower);
44+
3545
MoveShard(Node const& snapshot, AgentInterface* agent, std::string const& jobId,
3646
std::string const& creator,
3747
std::string const& database,
@@ -61,6 +71,7 @@ struct MoveShard : public Job {
6171
std::string _from;
6272
std::string _to;
6373
bool _isLeader;
74+
bool _remainsFollower;
6475
};
6576
}
6677
}

tests/Agency/CleanOutServerTest.cpp

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ VPackBuilder createMoveShardJob() {
6666
builder.add("fromServer", VPackValue("test"));
6767
builder.add("toServer", VPackValue("test2"));
6868
builder.add("isLeader", VPackValue(true));
69+
builder.add("remainsFollower", VPackValue(false));
6970
builder.add("collection", VPackValue("test"));
7071
builder.add("shard", VPackValue("s99"));
7172
builder.add("creator", VPackValue("unittest"));
@@ -77,7 +78,7 @@ VPackBuilder createMoveShardJob() {
7778

7879
void checkFailed(JOB_STATUS status, query_t const& q) {
7980
INFO("WRITE: " << q->toJson());
80-
81+
8182
REQUIRE(std::string(q->slice().typeName()) == "array" );
8283
REQUIRE(q->slice().length() == 1);
8384
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
@@ -100,7 +101,7 @@ Node createNodeFromBuilder(VPackBuilder const& builder) {
100101
VPackBuilder opBuilder;
101102
{ VPackObjectBuilder a(&opBuilder);
102103
opBuilder.add("new", builder.slice()); }
103-
104+
104105
Node node("");
105106
node.handle<SET>(opBuilder.slice());
106107
return node;
@@ -112,11 +113,11 @@ Builder createBuilder(char const* c) {
112113
options.checkAttributeUniqueness = true;
113114
VPackParser parser(&options);
114115
parser.parse(c);
115-
116+
116117
VPackBuilder builder;
117118
builder.add(parser.steal()->slice());
118119
return builder;
119-
120+
120121
}
121122

122123
Node createNode(char const* c) {
@@ -156,7 +157,7 @@ VPackBuilder createJob(std::string const& server) {
156157
TEST_CASE("CleanOutServer", "[agency][supervision]") {
157158
RandomGenerator::initialize(RandomGenerator::RandomType::MERSENNE);
158159
auto baseStructure = createRootNode();
159-
160+
160161
write_ret_t fakeWriteResult {true, "", std::vector<bool> {true}, std::vector<index_t> {1}};
161162
auto transBuilder = std::make_shared<Builder>();
162163
{ VPackArrayBuilder a(transBuilder.get());
@@ -338,7 +339,7 @@ SECTION("cleanout server should fail if the server is already cleaned") {
338339
}
339340
return builder;
340341
};
341-
342+
342343
Mock<AgentInterface> mockAgent;
343344
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
344345
checkFailed(JOB_STATUS::TODO, q);
@@ -385,7 +386,7 @@ SECTION("cleanout server should fail if the server is failed") {
385386
}
386387
return builder;
387388
};
388-
389+
389390
Mock<AgentInterface> mockAgent;
390391
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
391392
checkFailed(JOB_STATUS::TODO, q);
@@ -434,7 +435,7 @@ SECTION("cleanout server should fail if the replicationFactor is too big for any
434435
}
435436
return builder;
436437
};
437-
438+
438439
Mock<AgentInterface> mockAgent;
439440
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
440441
checkFailed(JOB_STATUS::TODO, q);
@@ -484,7 +485,7 @@ SECTION("cleanout server should fail if the replicationFactor is too big for any
484485
}
485486
return builder;
486487
};
487-
488+
488489
Mock<AgentInterface> mockAgent;
489490
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
490491
checkFailed(JOB_STATUS::TODO, q);
@@ -529,11 +530,11 @@ SECTION("a cleanout server job should move into pending when everything is ok")
529530
}
530531
return builder;
531532
};
532-
533+
533534
Mock<AgentInterface> mockAgent;
534535
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
535536
INFO("WRITE: " << q->toJson());
536-
537+
537538
REQUIRE(std::string(q->slice().typeName()) == "array" );
538539
REQUIRE(q->slice().length() == 1);
539540
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
@@ -609,14 +610,14 @@ SECTION("a cleanout server job should abort after a long timeout") {
609610
}
610611
return builder;
611612
};
612-
613+
613614
int qCount = 0;
614615
Mock<AgentInterface> mockAgent;
615616
When(Method(mockAgent, write)).AlwaysDo([&](query_t const& q, bool d) -> write_ret_t {
616617
if (qCount++ == 0) {
617618
// first the moveShard job should be aborted
618619
INFO("WRITE FIRST: " << q->toJson());
619-
620+
620621
REQUIRE(std::string(q->slice().typeName()) == "array" );
621622
REQUIRE(q->slice().length() == 1);
622623
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
@@ -720,7 +721,7 @@ SECTION("once all subjobs were successful then the job should be finished") {
720721
Mock<AgentInterface> mockAgent;
721722
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
722723
INFO("WRITE: " << q->toJson());
723-
724+
724725
REQUIRE(std::string(q->slice().typeName()) == "array" );
725726
REQUIRE(q->slice().length() == 1);
726727
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
@@ -827,7 +828,7 @@ SECTION("when the cleanout server job is aborted all subjobs should be aborted t
827828
if (qCount++ == 0) {
828829
// first the moveShard job should be aborted
829830
INFO("WRITE FIRST: " << q->toJson());
830-
831+
831832
REQUIRE(std::string(q->slice().typeName()) == "array" );
832833
REQUIRE(q->slice().length() == 1);
833834
REQUIRE(std::string(q->slice()[0].typeName()) == "array");

tests/Agency/FailedFollowerTest.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ Node createNodeFromBuilder(VPackBuilder const& builder) {
8787
VPackBuilder opBuilder;
8888
{ VPackObjectBuilder a(&opBuilder);
8989
opBuilder.add("new", builder.slice()); }
90-
90+
9191
Node node("");
9292
node.handle<SET>(opBuilder.slice());
9393
return node;
@@ -100,11 +100,11 @@ Builder createBuilder(char const* c) {
100100
options.checkAttributeUniqueness = true;
101101
VPackParser parser(&options);
102102
parser.parse(c);
103-
103+
104104
VPackBuilder builder;
105105
builder.add(parser.steal()->slice());
106106
return builder;
107-
107+
108108
}
109109

110110
Node createNode(char const* c) {
@@ -121,13 +121,13 @@ TEST_CASE("FailedFollower", "[agency][supervision]") {
121121
auto transBuilder = std::make_shared<Builder>();
122122
{ VPackArrayBuilder a(transBuilder.get());
123123
transBuilder->add(VPackValue((uint64_t)1)); }
124-
125-
126-
124+
125+
126+
127127
auto baseStructure = createRootNode();
128128
write_ret_t fakeWriteResult {true, "", std::vector<bool> {true}, std::vector<index_t> {1}};
129129
trans_ret_t fakeTransResult {true, "", 1, 0, transBuilder};
130-
130+
131131
SECTION("creating a job should create a job in todo") {
132132
Mock<AgentInterface> mockAgent;
133133

@@ -396,7 +396,7 @@ SECTION("if there is no healthy free server when trying to start just wait") {
396396
REQUIRE(builder);
397397
INFO("Agency: " << builder->toJson());
398398
Node agency = createNodeFromBuilder(*builder);
399-
399+
400400
// nothing should happen
401401
Mock<AgentInterface> mockAgent;
402402
AgentInterface &agent = mockAgent.get();
@@ -429,7 +429,7 @@ SECTION("abort any moveShard job blocking the shard and start") {
429429
AgentInterface &moveShardAgent = moveShardMockAgent.get();
430430
auto moveShard = MoveShard(
431431
baseStructure(PREFIX), &moveShardAgent, "2", "strunz", DATABASE,
432-
COLLECTION, SHARD, SHARD_LEADER, FREE_SERVER, true);
432+
COLLECTION, SHARD, SHARD_LEADER, FREE_SERVER, true, true);
433433
moveShard.create();
434434

435435
std::string jobId = "1";

0 commit comments

Comments
 (0)
0