Bug fix/do not reclaim shard ownership (#12461) · adamjm/arangodb@0a77e0a · GitHub
[go: up one dir, main page]

Skip to content

Commit 0a77e0a

Browse files
mchacki and neunhoef authored
Bug fix/do not reclaim shard ownership (arangodb#12461)
* Added CHANGELOG entry * Let both Maintenance tests run independently of each other * Simplify MoveShard::abort. When the new leader has been installed, always "forward-abort". Adjust unit tests. * Added some helper methods in Maintenance test for test Matrix in Leadership modification jobs * Let a restarted server resign if it is asked to in the Plan. * Remove bad hack to take over leadership after a restart when resigning. * Hand on remainsFollower for MoveShard (leader case). * Make MoveShard test more robust. * Implemented leadership change test matrix, some tests are red! * Do not include `_` in shard-leader information * Adapted Tests to latest definition of behaviour * Adapted Resign and Takeover leadership detection to latest definition * Unified reboot behaviour if another server is found as leader. * Prefer drop over TakeOverShardLeadership. Also adapt now wrong unit-test Co-authored-by: Max Neunhoeffer <max@arangodb.com>
1 parent 6ba0a6c commit 0a77e0a

File tree

9 files changed

+997
-328
lines changed

9 files changed

+997
-328
lines changed

CHANGELOG

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
devel
22
-----
33

4+
* Fixed: During a move-shard job which moves the leader there is a
5+
situation in which the old owner of a shard can reclaim ownership
6+
(after having resigned already), with a small race where it allows to
7+
write documents only locally, but then continue the move-shard to a
8+
server without those documents. An additional bug in the MoveShard
9+
Supervision job would then leave the shard in a bad configuration
10+
with a resigned leader permanently in charge.
11+
412
* Fixed issue #12304: insert in transaction causing com.arangodb.ArangoDBException:
513
Response: 500, Error: 4 - Builder value not yet sealed.
614

arangod/Agency/MoveShard.cpp

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -308,11 +308,19 @@ bool MoveShard::start(bool&) {
308308
return false;
309309
}
310310

311-
if (!_isLeader && _remainsFollower) {
312-
moveShardFinish(false, false, "remainsFollower is invalid without isLeader");
313-
return false;
311+
if (!_isLeader) {
312+
if (_remainsFollower) {
313+
moveShardFinish(false, false, "remainsFollower is invalid without isLeader");
314+
return false;
315+
}
316+
} else {
317+
if (_toServerIsFollower && !_remainsFollower) {
318+
moveShardFinish(false, false, "remainsFollower must be true if the toServer is a follower");
319+
return false;
320+
}
314321
}
315322

323+
316324
// Compute group to move shards together:
317325
std::vector<Job::shard_t> shardsLikeMe =
318326
clones(_snapshot, _database, _collection, _shard);
@@ -557,7 +565,12 @@ JOB_STATUS MoveShard::pendingLeader() {
557565

558566
// We need to switch leaders:
559567
{
560-
// Abort in to server no longer among replica for any shards like me
568+
// First make sure that the server we want to go to is still in Current
569+
// for all shards. This is important, since some transaction which the leader
570+
// has still executed before its resignation might have dropped a follower
571+
// for some shard, and this could have been our new leader. In this case we
572+
// must abort and go back to the original leader, which is still perfectly
573+
// safe.
561574
for (auto const& sh : shardsLikeMe) {
562575
auto const shardPath = curColPrefix + _database + "/" + sh.collection + "/" + sh.shard;
563576
auto const tmp = _snapshot.hasAsArray(shardPath + "/servers");
@@ -866,20 +879,20 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
866879
std::vector<Job::shard_t> shardsLikeMe =
867880
clones(_snapshot, _database, _collection, _shard);
868881

882+
// If we move the leader: Once the toServer has been put into the Plan
883+
// as leader, we always abort by moving forwards:
869884
// We can no longer abort by reverting to where we started, if any of the
870885
// shards of the distributeShardsLike group has already gone to new leader
871886
if (_isLeader) {
872-
for (auto const& i : shardsLikeMe) {
873-
auto const& cur = _snapshot.hasAsArray(curColPrefix + _database + "/" + i.collection +
874-
"/" + i.shard + "/" + "servers");
875-
if (cur.second && cur.first[0].copyString() == _to) {
876-
LOG_TOPIC("72a82", INFO, Logger::SUPERVISION)
877-
<< "MoveShard can no longer abort through reversion to where it "
878-
"started. Flight forward";
879-
moveShardFinish(true, true,
880-
"job aborted (2) - new leader already in place: " + reason);
881-
return result;
882-
}
887+
auto const& plan = _snapshot.hasAsArray(planColPrefix + _database + "/" + _collection + "/shards/" + _shard);
888+
if (plan.second && plan.first[0].copyString() == _to) {
889+
LOG_TOPIC("72a82", INFO, Logger::SUPERVISION)
890+
<< "MoveShard can no longer abort through reversion to where it "
891+
"started. Flight forward, leaving Plan as it is now.";
892+
moveShardFinish(true, false,
893+
"job aborted (2) - new leader already in place: " + reason);
894+
return result;
895+
883896
}
884897
}
885898

@@ -922,7 +935,7 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
922935
{
923936
VPackArrayBuilder guard(&trx);
924937
for (VPackSlice srv : VPackArrayIterator(plan)) {
925-
if (false == srv.isEqualString(_to)) {
938+
if (!srv.isEqualString(_to)) {
926939
trx.add(srv);
927940
}
928941
}
@@ -940,18 +953,10 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
940953
}
941954
{
942955
VPackObjectBuilder preconditionObj(&trx);
943-
if (_isLeader) { // Precondition, that current is still as in snapshot
944-
// Current preconditions for all shards
945-
doForAllShards(_snapshot, _database, shardsLikeMe,
946-
[&trx](Slice plan, Slice current, std::string& planPath,
947-
std::string& curPath) {
948-
// Current still as is
949-
trx.add(curPath, current);
950-
});
951-
addPreconditionJobStillInPending(trx, _jobId);
952-
}
953956
addMoveShardToServerCanUnLock(trx);
954957
addMoveShardFromServerCanUnLock(trx);
958+
// If the collection is gone in the meantime, we do nothing here, but the
959+
// round will move the job to Finished anyway:
955960
addPreconditionCollectionStillThere(trx, _database, _collection);
956961
}
957962
}
@@ -963,14 +968,21 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
963968
std::string("Lost leadership"));
964969
return result;
965970
} else if (res.indices[0] == 0) {
971+
// Precondition failed
966972
if (_isLeader) {
967973
// Tough luck. Things have changed. We'll move on
968974
LOG_TOPIC("513e6", INFO, Logger::SUPERVISION)
969-
<< "MoveShard can no longer abort through reversion to where it "
970-
"started. Flight forward";
971-
moveShardFinish(true, true,
972-
"job aborted (4) - new leader already in place: " + reason);
975+
<< "Precondition failed on MoveShard::abort() for shard " << _shard << " of collection " << _collection
976+
<< ", if the collection has been deleted in the meantime, the job will be finished soon, if this message repeats, tell us.";
977+
result = Result(
978+
TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
979+
std::string("Precondition failed while aborting moveShard job ") + _jobId);
973980
return result;
981+
// We intentionally do not move the job object to Failed or Finished here! The failed
982+
// precondition can either be one of the read locks, which suggests a fundamental problem,
983+
// and in which case we will log this message in every round of the supervision.
984+
// Or the collection has been dropped since we took the snapshot, in this case we
985+
// will move the job to Finished in the next round.
974986
}
975987
result = Result(
976988
TRI_ERROR_SUPERVISION_GENERAL_FAILURE,

arangod/Agency/PathComponent.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,14 @@ class Path {
5252
return stream;
5353
}
5454

55-
std::vector<std::string> vec() const {
55+
std::vector<std::string> vec(size_t skip = 0) const {
5656
std::vector<std::string> res;
57-
forEach([&res](const char* component) {
58-
res.emplace_back(std::string{component});
57+
forEach([&res, &skip](const char* component) {
58+
if (skip == 0) {
59+
res.emplace_back(std::string{component});
60+
} else {
61+
skip--;
62+
}
5963
});
6064
return res;
6165
}

arangod/Cluster/Maintenance.cpp

Lines changed: 76 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
#include "VocBase/LogicalCollection.h"
3939
#include "VocBase/Methods/Databases.h"
4040

41+
#include "Cluster/ResignShardLeadership.h"
42+
4143
#include <velocypack/Collection.h>
4244
#include <velocypack/Compare.h>
4345
#include <velocypack/Iterator.h>
@@ -184,6 +186,17 @@ static VPackBuilder compareIndexes(std::string const& dbname, std::string const&
184186
return builder;
185187
}
186188

189+
static std::string CreateLeaderString(std::string const& leaderId, bool shouldBeLeading) {
190+
if (shouldBeLeading) {
191+
return std::string();
192+
}
193+
TRI_ASSERT(!leaderId.empty());
194+
if (leaderId.front() == UNDERSCORE[0]) {
195+
return leaderId.substr(1);
196+
}
197+
return leaderId;
198+
}
199+
187200
void handlePlanShard(uint64_t planIndex, VPackSlice const& cprops, VPackSlice const& ldb,
188201
std::string const& dbname, std::string const& colname,
189202
std::string const& shname, std::string const& serverId,
@@ -192,15 +205,13 @@ void handlePlanShard(uint64_t planIndex, VPackSlice const& cprops, VPackSlice co
192205
MaintenanceFeature::errors_t& errors, MaintenanceFeature& feature,
193206
std::vector<std::shared_ptr<ActionDescription>>& actions) {
194207
bool shouldBeLeading = serverId == leaderId;
195-
bool shouldResign = UNDERSCORE + serverId == leaderId;
196208

197209
commonShrds.emplace(shname);
198210

199211
auto const lcol = ldb.get(shname);
200212
if (lcol.isObject()) { // Have local collection with that name
201213

202214
std::string_view const localLeader = lcol.get(THE_LEADER).stringView();
203-
bool const leaderTouched = localLeader != LEADER_NOT_YET_KNOWN;
204215
bool leading = localLeader.empty();
205216
auto const properties = compareRelevantProps(cprops, lcol);
206217

@@ -258,37 +269,20 @@ void handlePlanShard(uint64_t planIndex, VPackSlice const& cprops, VPackSlice co
258269
<< "for central " << dbname << "/" << colname << "- skipping";
259270
}
260271
}
261-
262-
// Handle leadership change, this is mostly about taking over leadership,
263-
// but it also handles the case that in a failover scenario we used to
264-
// be the leader and now somebody else is the leader. However, it does
265-
// not handle the case of a controlled leadership resignation, see below
266-
// in handleLocalShard for this.
267-
if (shouldResign && !leading) {
268-
// This case is a special one which is triggered if a server
269-
// restarts, has `NOT_YET_TOUCHED` in its local shard as theLeader
270-
// and finds a resignation sign. In that case, it should first officially
271-
// take over leadership. In the following round it will then resign.
272-
// This enables cleanOutServer jobs to continue to work in case of
273-
// a leader restart.
274-
shouldBeLeading = true;
275-
shouldResign = false;
276-
}
277-
if ((leading != shouldBeLeading && !shouldResign) || !leaderTouched) {
272+
if (!leading && shouldBeLeading) {
278273
LOG_TOPIC("52412", DEBUG, Logger::MAINTENANCE)
279274
<< "Triggering TakeoverShardLeadership job for shard " << dbname
280275
<< "/" << colname << "/" << shname
281276
<< ", local leader: " << lcol.get(THE_LEADER).copyString()
282277
<< ", leader id: " << leaderId << ", my id: " << serverId
283-
<< ", should be leader: " << (shouldBeLeading ? std::string() : leaderId)
284-
<< ", leaderTouched = " << (leaderTouched ? "yes" : "no");
278+
<< ", should be leader: ";
285279
actions.emplace_back(std::make_shared<ActionDescription>(
286280
std::map<std::string, std::string>{
287281
{NAME, TAKEOVER_SHARD_LEADERSHIP},
288282
{DATABASE, dbname},
289283
{COLLECTION, colname},
290284
{SHARD, shname},
291-
{THE_LEADER, shouldBeLeading ? std::string() : leaderId},
285+
{THE_LEADER, std::string()},
292286
{LOCAL_LEADER, std::string(localLeader)},
293287
{OLD_CURRENT_COUNTER, "0"}, // legacy, no longer used
294288
{PLAN_RAFT_INDEX, std::to_string(planIndex)}},
@@ -318,7 +312,7 @@ void handlePlanShard(uint64_t planIndex, VPackSlice const& cprops, VPackSlice co
318312
}
319313
}
320314
}
321-
} else { // Create the sucker, if not a previous error stops us
315+
} else { // Create the collection, if not a previous error stops us
322316
if (errors.shards.find(dbname + "/" + colname + "/" + shname) ==
323317
errors.shards.end()) {
324318
auto props = createProps(cprops); // Only once might need often!
@@ -328,7 +322,7 @@ void handlePlanShard(uint64_t planIndex, VPackSlice const& cprops, VPackSlice co
328322
{SHARD, shname},
329323
{DATABASE, dbname},
330324
{SERVER_ID, serverId},
331-
{THE_LEADER, shouldBeLeading ? std::string() : leaderId}},
325+
{THE_LEADER, CreateLeaderString(leaderId, shouldBeLeading)}},
332326
shouldBeLeading ? LEADER_PRIORITY : FOLLOWER_PRIORITY, std::move(props)));
333327
} else {
334328
LOG_TOPIC("c1d8e", DEBUG, Logger::MAINTENANCE)
@@ -343,58 +337,71 @@ void handleLocalShard(std::string const& dbname, std::string const& colname,
343337
std::unordered_set<std::string>& commonShrds,
344338
std::unordered_set<std::string>& indis, std::string const& serverId,
345339
std::vector<std::shared_ptr<ActionDescription>>& actions) {
346-
std::unordered_set<std::string>::const_iterator it;
340+
341+
std::unordered_set<std::string>::const_iterator it =
342+
std::find(commonShrds.begin(), commonShrds.end(), colname);
343+
344+
auto localLeader = cprops.get(THE_LEADER).stringRef();
345+
bool const isLeading = localLeader.empty();
346+
if (it == commonShrds.end()) {
347+
// This collection is not planned anymore, can drop it
348+
actions.emplace_back(std::make_shared<ActionDescription>(
349+
std::map<std::string, std::string>{{NAME, DROP_COLLECTION},
350+
{DATABASE, dbname},
351+
{COLLECTION, colname}},
352+
isLeading ? LEADER_PRIORITY : FOLLOWER_PRIORITY));
353+
return;
354+
}
355+
// We dropped out before
356+
TRI_ASSERT(it != commonShrds.end());
357+
// The shard exists in both Plan and Local
358+
commonShrds.erase(it); // it not a common shard?
347359

348360
std::string plannedLeader;
349361
if (shardMap.get(colname).isArray()) {
350362
plannedLeader = shardMap.get(colname)[0].copyString();
351363
}
352-
bool const localLeader = cprops.get(THE_LEADER).stringRef().empty();
353-
if (localLeader && plannedLeader == UNDERSCORE + serverId) {
364+
365+
bool const activeResign = isLeading && plannedLeader != serverId;
366+
bool const adjustResignState =
367+
(plannedLeader == UNDERSCORE + serverId &&
368+
localLeader != ResignShardLeadership::LeaderNotYetKnownString) ||
369+
(plannedLeader != serverId && localLeader == LEADER_NOT_YET_KNOWN);
370+
/*
371+
* We need to resign in the following cases:
372+
* 1) (activeResign) We think we are the leader locally, but the plan says we are not. (including, we are resigned)
373+
* 2) (adjustResignState) We are not leading, and not in resigned state, but the plan says we should be resigend.
374+
* - This triggers on rebooted servers, that were in resign process
375+
* - This triggers if the shard is moved from the server, before it actually took ownership.
376+
*/
377+
378+
if (activeResign || adjustResignState) {
354379
actions.emplace_back(std::make_shared<ActionDescription>(
355-
std::map<std::string, std::string>{{NAME, RESIGN_SHARD_LEADERSHIP}, {DATABASE, dbname}, {SHARD, colname}},
356-
RESIGN_PRIORITY));
357-
} else {
358-
bool drop = false;
359-
// check if shard is in plan, if not drop it
360-
if (commonShrds.empty()) {
361-
drop = true;
362-
} else {
363-
it = std::find(commonShrds.begin(), commonShrds.end(), colname);
364-
if (it == commonShrds.end()) {
365-
drop = true;
366-
}
367-
}
380+
std::map<std::string, std::string>{{NAME, RESIGN_SHARD_LEADERSHIP},
381+
{DATABASE, dbname},
382+
{SHARD, colname}},
383+
RESIGN_PRIORITY));
384+
}
368385

369-
if (drop) {
370-
actions.emplace_back(std::make_shared<ActionDescription>(
371-
std::map<std::string, std::string>{{NAME, DROP_COLLECTION}, {DATABASE, dbname}, {COLLECTION, colname}},
372-
localLeader ? LEADER_PRIORITY : FOLLOWER_PRIORITY));
373-
} else {
374-
// The shard exists in both Plan and Local
375-
commonShrds.erase(it); // it not a common shard?
376-
377-
// We only drop indexes, when collection is not being dropped already
378-
if (cprops.hasKey(INDEXES)) {
379-
if (cprops.get(INDEXES).isArray()) {
380-
for (auto const& index : VPackArrayIterator(cprops.get(INDEXES))) {
381-
VPackStringRef type = index.get(StaticStrings::IndexType).stringRef();
382-
if (type != PRIMARY && type != EDGE) {
383-
std::string const id = index.get(ID).copyString();
384-
385-
// check if index is in plan
386-
if (indis.find(colname + "/" + id) != indis.end() ||
387-
indis.find(id) != indis.end()) {
388-
indis.erase(id);
389-
} else {
390-
actions.emplace_back(std::make_shared<ActionDescription>(
391-
std::map<std::string, std::string>{{NAME, DROP_INDEX},
392-
{DATABASE, dbname},
393-
{COLLECTION, colname},
394-
{"index", id}},
395-
INDEX_PRIORITY));
396-
}
397-
}
386+
// We only drop indexes, when collection is not being dropped already
387+
if (cprops.hasKey(INDEXES)) {
388+
if (cprops.get(INDEXES).isArray()) {
389+
for (auto const& index : VPackArrayIterator(cprops.get(INDEXES))) {
390+
VPackStringRef type = index.get(StaticStrings::IndexType).stringRef();
391+
if (type != PRIMARY && type != EDGE) {
392+
std::string const id = index.get(ID).copyString();
393+
394+
// check if index is in plan
395+
if (indis.find(colname + "/" + id) != indis.end() ||
396+
indis.find(id) != indis.end()) {
397+
indis.erase(id);
398+
} else {
399+
actions.emplace_back(std::make_shared<ActionDescription>(
400+
std::map<std::string, std::string>{{NAME, DROP_INDEX},
401+
{DATABASE, dbname},
402+
{COLLECTION, colname},
403+
{"index", id}},
404+
INDEX_PRIORITY));
398405
}
399406
}
400407
}

0 commit comments

Comments
 (0)
0