arangodb/arangodb · Commit 0a97c82
Delay MoveShard job until old and new leader ready - BTS-1110 (#17554)
* Delay start of MoveShard if supposed leader not in Current.
* Also delay, if supposed follower is not in sync.
* CHANGELOG.
* Add unit tests for new behaviour.
* Typo fixes suggested by reviewer.
* Refactor for shorter MoveShard::start function.
1 parent 2182dd3 commit 0a97c82

File tree

4 files changed: +187 -25 lines changed

  CHANGELOG
  arangod/Agency/MoveShard.cpp
  arangod/Agency/MoveShard.h
  tests/Agency/MoveShardTest.cpp

CHANGELOG

Lines changed: 5 additions & 0 deletions

@@ -1,6 +1,11 @@
 devel
 -----
 
+* Delay a MoveShard operation for leader change, until the old leader has
+  actually assumed its leadership and until the new leader is actually in
+  sync. This fixes a bug which could block a shard under certain circumstances.
+  This fixes BTS-1110.
+
 * Fixed issue #17367: FILTER fails when using negation (!) on variable whose
   name starts with "in". Add trailing context to NOT IN token.
 

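The readiness conditions introduced here can be read directly off the Current
entry of each shard: the list under
Current/Collections/<db>/<collection>/<shard>/servers names the current leader
first, followed by the in-sync followers. Below is a minimal standalone sketch
of the two conditions (the helper name and the plain std::vector stand-in are
made up for illustration; the actual implementation in
arangod/Agency/MoveShard.cpp below works on the agency snapshot via velocypack):

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <vector>

    // Simplified, hypothetical model of the check added in this commit:
    // "from" is the old leader asked to resign, "to" is the designated new
    // leader.
    bool readyForLeaderChange(std::vector<std::string> const& currentServers,
                              std::string const& from, std::string const& to) {
      if (currentServers.empty()) {
        return false;  // no server list reported in Current yet
      }
      // 1) the old leader must actually have assumed its leadership
      if (currentServers.front() != from) {
        return false;
      }
      // 2) the designated new leader must already be an in-sync follower
      return std::find(currentServers.begin(), currentServers.end(), to) !=
             currentServers.end();
    }

    int main() {
      // old leader has not (yet) taken over -> delay the job
      assert(!readyForLeaderChange({"follower1", "leader"}, "leader", "follower1"));
      // new leader is not yet in sync -> delay the job
      assert(!readyForLeaderChange({"leader"}, "leader", "follower1"));
      // both conditions hold -> the job may start
      assert(readyForLeaderChange({"leader", "follower1"}, "leader", "follower1"));
      return 0;
    }
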
arangod/Agency/MoveShard.cpp

Lines changed: 83 additions & 25 deletions
@@ -204,6 +204,50 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
   return false;
 }
 
+bool MoveShard::checkLeaderFollowerCurrent(
+    std::vector<Job::shard_t> const& shardsLikeMe) {
+  bool ok = true;
+  for (auto const& s : shardsLikeMe) {
+    auto sharedPath = _database + "/" + s.collection + "/";
+    auto currentServersPath = curColPrefix + sharedPath + s.shard + "/servers";
+    auto serverList = _snapshot.hasAsArray(currentServersPath);
+    if (serverList && (*serverList).length() > 0) {
+      if (_from != (*serverList)[0].stringView()) {
+        LOG_TOPIC("55261", DEBUG, Logger::SUPERVISION)
+            << "MoveShard: From server " << _from
+            << " has not yet assumed leadership for collection " << s.collection
+            << " shard " << s.shard
+            << ", delaying start of MoveShard job for shard " << _shard;
+        ok = false;
+        break;
+      }
+      bool toFound = false;
+      for (auto server : VPackArrayIterator(*serverList)) {
+        if (_to == server.stringView()) {
+          toFound = true;
+          break;
+        }
+      }
+      if (!toFound) {
+        LOG_TOPIC("55262", DEBUG, Logger::SUPERVISION)
+            << "MoveShard: To server " << _to
+            << " is not in sync for collection " << s.collection << " shard "
+            << s.shard << ", delaying start of MoveShard job for shard "
+            << _shard;
+        ok = false;
+        break;
+      }
+    } else {
+      LOG_TOPIC("55263", INFO, Logger::SUPERVISION)
+          << "MoveShard: Did not find a non-empty server list in Current "
+             "for collection "
+          << s.collection << " and shard " << s.shard;
+      ok = false;  // not even a server list found
+    }
+  }
+  return ok;
+}
+
 bool MoveShard::start(bool&) {
   if (considerCancellation()) {
     return false;
@@ -391,6 +435,19 @@ bool MoveShard::start(bool&) {
     }
   }
 
+  if (_isLeader && _toServerIsFollower) {
+    // Further checks, before we can begin, we must make sure that the
+    // _fromServer has accepted its leadership already for all shards in the
+    // shard group and that the _toServer is actually in sync. Otherwise,
+    // if this job here asks the leader to resign, we would be stuck.
+    // If the _toServer is not in sync, the job would take overly long.
+    bool ok = checkLeaderFollowerCurrent(shardsLikeMe);
+    if (!ok) {
+      return false;  // Do not start job, but leave it in Todo.
+                     // Log messages already written.
+    }
+  }
+
   // Copy todo to pending
   Builder todo, pending;
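
If either check fails, start() returns false without writing anything to the
agency, so the job object stays in Target/ToDo and is re-evaluated with a fresh
snapshot in a later supervision round. The loop below is a deliberately
simplified, hypothetical model of that retry behaviour (it is not the real
Supervision code):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for a job sitting in ToDo; start() returns true
    // once the job could move to Pending.
    struct TodoJob {
      std::string id;
      std::function<bool(int)> start;  // parameter: current round number
    };

    int main() {
      std::vector<TodoJob> todo{
          {"1-moveshard", [](int round) {
             // stand-in for checkLeaderFollowerCurrent(): pretend the shard
             // group only becomes ready in round 3
             return round >= 3;
           }}};
      for (int round = 1; !todo.empty(); ++round) {
        if (todo.front().start(round)) {
          std::cout << "round " << round << ": job started, moved to Pending\n";
          todo.erase(todo.begin());
        } else {
          std::cout << "round " << round << ": job left in ToDo, retried later\n";
        }
      }
      return 0;
    }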

@@ -732,7 +789,8 @@ JOB_STATUS MoveShard::pendingLeader() {
   if (plan.isArray() &&
       Job::countGoodOrBadServersInList(_snapshot, plan) < plan.length()) {
     LOG_TOPIC("de056", DEBUG, Logger::SUPERVISION)
-        << "MoveShard (leader): found FAILED server in Plan, aborting job, db: "
+        << "MoveShard (leader): found FAILED server in Plan, aborting job, "
+           "db: "
         << _database << " coll: " << _collection << " shard: " << _shard;
     abort("failed server in Plan");
     return FAILED;
@@ -839,12 +897,12 @@ JOB_STATUS MoveShard::pendingLeader() {
 
   // We need to switch leaders:
   {
-    // First make sure that the server we want to go to is still in Current
-    // for all shards. This is important, since some transaction which the
-    // leader has still executed before its resignation might have dropped a
-    // follower for some shard, and this could have been our new leader. In
-    // this case we must abort and go back to the original leader, which is
-    // still perfectly safe.
+    // First make sure that the server we want to go to is still in
+    // Current for all shards. This is important, since some transaction
+    // which the leader has still executed before its resignation might
+    // have dropped a follower for some shard, and this could have been
+    // our new leader. In this case we must abort and go back to the
+    // original leader, which is still perfectly safe.
     for (auto const& sh : shardsLikeMe) {
       auto const shardPath =
           curColPrefix + _database + "/" + sh.collection + "/" + sh.shard;
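
The comment above describes a fallback decision: before the resigned leader
hands over, the supervision re-checks that the designated new leader is still
listed in Current for every shard of the group, and otherwise aborts back to
the old leader. A hypothetical, self-contained sketch of that decision (all
names here are invented for illustration, not taken from MoveShard.cpp):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    enum class Decision { SwitchToNewLeader, AbortBackToOldLeader };

    // For each shard of the group, "servers" is the Current server list;
    // "to" is the designated new leader.
    Decision decide(std::vector<std::vector<std::string>> const& currentPerShard,
                    std::string const& to) {
      for (auto const& servers : currentPerShard) {
        if (std::find(servers.begin(), servers.end(), to) == servers.end()) {
          return Decision::AbortBackToOldLeader;  // to-server was dropped
        }
      }
      return Decision::SwitchToNewLeader;
    }

    int main() {
      // one clone shard lost "follower1" shortly before the leader resigned:
      auto d = decide({{"leader", "follower1"}, {"leader"}}, "follower1");
      std::cout << (d == Decision::AbortBackToOldLeader ? "abort" : "switch")
                << "\n";  // prints "abort"
      return 0;
    }
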
@@ -953,8 +1011,8 @@ JOB_STATUS MoveShard::pendingLeader() {
           }
         } else {
           LOG_TOPIC("3294e", WARN, Logger::SUPERVISION)
-              << "failed to iterate through current shard servers for "
-                 "shard "
+              << "failed to iterate through current shard servers "
+                 "for shard "
               << _shard << " or one of its clones";
           TRI_ASSERT(false);
           return;  // we don't increment done and remain PENDING
@@ -1063,8 +1121,8 @@ JOB_STATUS MoveShard::pendingFollower() {
   if (plan.isArray() &&
       Job::countGoodOrBadServersInList(_snapshot, plan) < plan.length()) {
     LOG_TOPIC("f8c22", DEBUG, Logger::SUPERVISION)
-        << "MoveShard (follower): found FAILED server in Plan, aborting job, "
-           "db: "
+        << "MoveShard (follower): found FAILED server in Plan, aborting "
+           "job, db: "
         << _database << " coll: " << _collection << " shard: " << _shard;
     abort("failed server in Plan");
     return FAILED;
@@ -1242,8 +1300,8 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
           trx.add(VPackValue(_from));
           if (plan.isArray()) {
             for (VPackSlice srv : VPackArrayIterator(plan)) {
-              // from could be in plan as <from> or <_from>. Exclude to
-              // server always.
+              // from could be in plan as <from> or <_from>. Exclude
+              // to server always.
               if (srv.isEqualString(_from) ||
                   srv.isEqualString("_" + _from) ||
                   srv.isEqualString(_to)) {
@@ -1259,8 +1317,8 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
               TRI_ASSERT(false);
               return;
             }
-            // Add to server last. Will be removed by removeFollower if to
-            // much
+            // Add to server last. Will be removed by removeFollower if
+            // too many.
             trx.add(VPackValue(_to));
           }
         });
@@ -1312,8 +1370,8 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
         VPackObjectBuilder preconditionObj(&trx);
         addMoveShardToServerCanUnLock(trx);
         addMoveShardFromServerCanUnLock(trx);
-        // If the collection is gone in the meantime, we do nothing here, but the
-        // round will move the job to Finished anyway:
+        // If the collection is gone in the meantime, we do nothing here, but
+        // the round will move the job to Finished anyway:
         addPreconditionCollectionStillThere(trx, _database, _collection);
       }
     }
@@ -1331,19 +1389,19 @@ arangodb::Result MoveShard::abort(std::string const& reason) {
     LOG_TOPIC("513e6", INFO, Logger::SUPERVISION)
         << "Precondition failed on MoveShard::abort() for shard " << _shard
         << " of collection " << _collection
-        << ", if the collection has been deleted in the meantime, the job "
-           "will be finished soon, if this message repeats, tell us.";
+        << ", if the collection has been deleted in the meantime, the "
+           "job will be finished soon, if this message repeats, tell us.";
     result = Result(
         TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
         std::string("Precondition failed while aborting moveShard job ") +
             _jobId);
     return result;
-    // We intentionally do not move the job object to Failed or Finished here!
-    // The failed precondition can either be one of the read locks, which
-    // suggests a fundamental problem, and in which case we will log this
-    // message in every round of the supervision. Or the collection has been
-    // dropped since we took the snapshot, in this case we will move the job
-    // to Finished in the next round.
+    // We intentionally do not move the job object to Failed or Finished
+    // here! The failed precondition can either be one of the read locks,
+    // which suggests a fundamental problem, and in which case we will log
+    // this message in every round of the supervision. Or the collection
+    // has been dropped since we took the snapshot, in this case we will
+    // move the job to Finished in the next round.
   }
   result = Result(
       TRI_ERROR_SUPERVISION_GENERAL_FAILURE,

arangod/Agency/MoveShard.h

Lines changed: 2 additions & 0 deletions
@@ -88,5 +88,7 @@ struct MoveShard : public Job {
   void addMoveShardFromServerCanUnLock(Builder& ops) const;
 
   bool moveShardFinish(bool unlock, bool success, std::string const& msg);
+  bool checkLeaderFollowerCurrent(
+      std::vector<Job::shard_t> const& shardsLikeMe);
 };
 }  // namespace arangodb::consensus

tests/Agency/MoveShardTest.cpp

Lines changed: 97 additions & 0 deletions
@@ -621,6 +621,103 @@ TEST_F(MoveShardTest, the_job_should_wait_until_the_target_server_is_good) {
   moveShard.start(aborts);
 }
 
+TEST_F(MoveShardTest, the_job_should_wait_until_the_from_server_is_in_current) {
+  std::function<std::unique_ptr<VPackBuilder>(velocypack::Slice,
+                                              std::string const&)>
+      createTestStructure = [&](velocypack::Slice s, std::string const& path) {
+        auto builder = std::make_unique<velocypack::Builder>();
+        if (s.isObject()) {
+          builder->add(VPackValue(VPackValueType::Object));
+          for (auto it : VPackObjectIterator(s)) {
+            auto childBuilder =
+                createTestStructure(it.value, path + "/" + it.key.copyString());
+            if (childBuilder) {
+              builder->add(it.key.copyString(), childBuilder->slice());
+            }
+          }
+
+          if (path == "/arango/Target/ToDo") {
+            builder->add(
+                jobId,
+                createJob(COLLECTION, SHARD_LEADER, SHARD_FOLLOWER1).slice());
+          }
+          builder->close();
+        } else {
+          // Simulate a new leader which has not yet assumed its leadership:
+          if (path == "/arango/Current/Collections/" + DATABASE + "/" +
+                          COLLECTION + "/" + SHARD + "/servers") {
+            {
+              VPackArrayBuilder guard(builder.get());
+              builder->add(VPackValue("follower1"));
+              builder->add(VPackValue("leader"));
+            }
+          } else {
+            builder->add(s);
+          }
+        }
+        return builder;
+      };
+
+  Mock<AgentInterface> mockAgent;
+  When(Method(mockAgent, waitFor)).AlwaysReturn();
+  AgentInterface& agent = mockAgent.get();
+
+  auto builder = createTestStructure(baseStructure.toBuilder().slice(), "");
+  ASSERT_TRUE(builder);
+  Node agency = createAgencyFromBuilder(*builder);
+
+  auto moveShard = MoveShard(agency, &agent, TODO, jobId);
+  moveShard.start(aborts);
+}
+
+TEST_F(MoveShardTest, the_job_should_wait_until_the_to_server_is_in_sync) {
+  std::function<std::unique_ptr<VPackBuilder>(velocypack::Slice,
+                                              std::string const&)>
+      createTestStructure = [&](velocypack::Slice s, std::string const& path) {
+        auto builder = std::make_unique<velocypack::Builder>();
+        if (s.isObject()) {
+          builder->add(VPackValue(VPackValueType::Object));
+          for (auto it : VPackObjectIterator(s)) {
+            auto childBuilder =
+                createTestStructure(it.value, path + "/" + it.key.copyString());
+            if (childBuilder) {
+              builder->add(it.key.copyString(), childBuilder->slice());
+            }
+          }
+
+          if (path == "/arango/Target/ToDo") {
+            builder->add(
+                jobId,
+                createJob(COLLECTION, SHARD_LEADER, SHARD_FOLLOWER1).slice());
+          }
+          builder->close();
+        } else {
+          // Simulate a to server which is not yet in sync:
+          if (path == "/arango/Current/Collections/" + DATABASE + "/" +
+                          COLLECTION + "/" + SHARD + "/servers") {
+            {
+              VPackArrayBuilder guard(builder.get());
+              builder->add(VPackValue("leader"));
+            }
+          } else {
+            builder->add(s);
+          }
+        }
+        return builder;
+      };
+
+  Mock<AgentInterface> mockAgent;
+  When(Method(mockAgent, waitFor)).AlwaysReturn();
+  AgentInterface& agent = mockAgent.get();
+
+  auto builder = createTestStructure(baseStructure.toBuilder().slice(), "");
+  ASSERT_TRUE(builder);
+  Node agency = createAgencyFromBuilder(*builder);
+
+  auto moveShard = MoveShard(agency, &agent, TODO, jobId);
+  moveShard.start(aborts);
+}
+
 TEST_F(
     MoveShardTest,
     the_job_should_fail_if_the_shard_distributes_its_shards_like_some_other) {
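
The two server lists injected into Current by these tests map directly onto the
new delay conditions: ["follower1", "leader"] leaves the from server
(presumably SHARD_LEADER, i.e. "leader") out of the leading first position,
while ["leader"] omits the to server (presumably SHARD_FOLLOWER1, i.e.
"follower1") entirely. A short recap, not part of the test fixture:

    #include <cassert>
    #include <string>
    #include <vector>

    // Illustration only: the fixture constants are assumed to resolve to
    // "leader" (_from) and "follower1" (_to), matching the arrays above.
    int main() {
      std::vector<std::string> notYetLeading{"follower1", "leader"};
      std::vector<std::string> followerNotInSync{"leader"};

      // first test: "leader" is not in front, so it has not assumed leadership
      assert(notYetLeading.front() != "leader");
      // second test: "follower1" does not appear at all, so it is not in sync
      assert(followerNotInSync.size() == 1 &&
             followerNotInSync.front() != "follower1");
      return 0;
    }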
