8000 next attempt · odidev/arangodb@0e8f0d6 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0e8f0d6

Browse files
committed
next attempt
1 parent a4adcda commit 0e8f0d6

24 files changed

+608
-1166
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,7 @@ endif()
799799
## OPENSSL
800800
################################################################################
801801

802-
# Some special stuff for Mac OSX and homebrew as a preparation for the
802+
# Some special stuff for Mac OSX and homebrew as a preparation for the
803803
# generic FindOpenSSL script:
804804
if (APPLE AND BREW AND NOT OPENSSL_ROOT_DIR)
805805
message("searching openssl with brew (${BREW})")

arangod/Agency/ActiveFailoverJob.cpp

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,15 @@ ActiveFailoverJob::ActiveFailoverJob(Node const& snapshot, AgentInterface* agent
4242
JOB_STATUS status, std::string const& jobId)
4343
: Job(status, snapshot, agent, jobId) {
4444
// Get job details from agency:
45-
std::string path = pos[status] + _jobId + "/";
46-
auto tmp_server = _snapshot.hasAsString(path + "server");
47-
auto tmp_creator = _snapshot.hasAsString(path + "creator");
48-
49-
if (tmp_server.second && tmp_creator.second) {
50-
_server = tmp_server.first;
51-
_creator = tmp_creator.first;
52-
} else {
45+
try {
46+
std::string path = pos[status] + _jobId + "/";
47+
_server = _snapshot.get(path + "server").getString();
48+
_creator = _snapshot.get(path + "creator").getString();
49+
} catch(std::exception const& e) {
5350
std::stringstream err;
5451
err << "Failed to find job " << _jobId << " in agency.";
5552
LOG_TOPIC(ERR, Logger::SUPERVISION) << err.str();
56-
finish(tmp_server.first, "", false, err.str());
53+
finish(_server, "", false, err.str());
5754
_status = FAILED;
5855
}
5956
}
@@ -139,20 +136,22 @@ bool ActiveFailoverJob::start() {
139136
return finish(_server, "", true, reason); // move to /Target/Finished
140137
}
141138

142-
auto leader = _snapshot.hasAsSlice(asyncReplLeader);
143-
if (!leader.second || leader.first.compareString(_server) != 0) {
139+
Node leader = _snapshot.get(asyncReplLeader);
140+
if (!leader.isString() || leader.slice().compareString(_server) != 0) {
144141
std::string reason = "Server " + _server + " is not the current replication leader";
145142
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason;
146143
return finish(_server, "", true, reason); // move to /Target/Finished
147144
}
148145

149146
// Abort job blocking server if abortable
150-
auto jobId = _snapshot.hasAsString(blockedServersPrefix + _server);
151-
if (jobId.second && !abortable(_snapshot, jobId.first)) {
152-
return false;
153-
} else if (jobId.second) {
154-
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
155-
}
147+
try {
148+
std::string jobId = _snapshot(blockedServersPrefix + _server).getString();
149+
if (!abortable(_snapshot, jobId)) {
150+
return false;
151+
} else {
152+
JobContext(PENDING, jobId, _snapshot, _agent).abort();
153+
}
154+
} catch (...) {}
156155

157156
// Todo entry
158157
Builder todo;
@@ -195,7 +194,7 @@ bool ActiveFailoverJob::start() {
195194
// Destination server should not be blocked by another job
196195
addPreconditionServerNotBlocked(pending, newLeader);
197196
// AsyncReplication leader must be the failed server
198-
addPreconditionUnchanged(pending, asyncReplLeader, leader.first);
197+
addPreconditionUnchanged(pending, asyncReplLeader, leader.slice());
199198
} // precondition done
200199

201200
} // array for transaction done
@@ -247,20 +246,17 @@ arangodb::Result ActiveFailoverJob::abort() {
247246
typedef std::pair<std::string, TRI_voc_tick_t> ServerTick;
248247
/// Try to select the follower most in-sync with failed leader
249248
std::string ActiveFailoverJob::findBestFollower() {
250-
std::vector<std::string> healthy = healthyServers(_snapshot);
251-
// the failed leader should never appear as healthy
252-
TRI_ASSERT(std::find(healthy.begin(), healthy.end(), _server) == healthy.end());
249+
std::vector<std::string> as = healthyServers(_snapshot);
253250

254251
// blocked; (not sure if this can even happen)
255252
try {
256253
for (auto const& srv : _snapshot(blockedServersPrefix).children()) {
257-
healthy.erase(std::remove(healthy.begin(), healthy.end(), srv.first), healthy.end());
254+
as.erase(std::remove(as.begin(), as.end(), srv.first), as.end());
258255
}
259256
} catch (...) {}
260257

261258
std::vector<ServerTick> ticks;
262-
try {
263-
// collect tick values from transient state
259+
try { // collect tick values from transient state
264260
query_t trx = std::make_unique<VPackBuilder>();
265261
{
266262
VPackArrayBuilder transactions(trx.get());
@@ -277,15 +273,13 @@ std::string ActiveFailoverJob::findBestFollower() {
277273
VPackSlice obj = resp.at(0).get({ Job::agencyPrefix, "AsyncReplication"});
278274
for (VPackObjectIterator::ObjectPair pair : VPackObjectIterator(obj)) {
279275
std::string srvUUID = pair.key.copyString();
280-
bool isAvailable = std::find(healthy.begin(), healthy.end(), srvUUID) != healthy.end();
281-
if (!isAvailable) {
276+
if (std::find(as.begin(), as.end(), srvUUID) == as.end()) {
282277
continue; // skip inaccessible servers
283278
}
284-
TRI_ASSERT(srvUUID != _server);
285279

286280
VPackSlice leader = pair.value.get("leader"); // broken leader
287281
VPackSlice lastTick = pair.value.get("lastTick");
288-
if (leader.isString() && leader.isEqualString(_server) &&
282+
if (leader.isString() && leader.compareString(_server) == 0 &&
289283
lastTick.isNumber()) {
290284
ticks.emplace_back(std::move(srvUUID), lastTick.getUInt());
291285
}
@@ -298,7 +292,6 @@ std::string ActiveFailoverJob::findBestFollower() {
298292
return a.second > b.second;
299293
});
300294
if (!ticks.empty()) {
301-
TRI_ASSERT(ticks.size() == 1 || ticks[0].second >= ticks[1].second);
302295
return ticks[0].first;
303296
}
304297
return ""; // fallback to any available server

arangod/Agency/AddFollower.cpp

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
////////////////////////////////////////////////////////////////////////////////
22
/// DISCLAIMER
33
///
4-
/// Copyright 2016-2018 ArangoDB GmbH, Cologne, Germany
4+
/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
55
///
66
/// Licensed under the Apache License, Version 2.0 (the "License");
77
/// you may not use this file except in compliance with the License.
@@ -41,25 +41,18 @@ AddFollower::AddFollower(Node const& snapshot, AgentInterface* agent,
4141
AddFollower::AddFollower(Node const& snapshot, AgentInterface* agent,
4242
JOB_STATUS status, std::string const& jobId)
4343
: Job(status, snapshot, agent, jobId) {
44-
4544
// Get job details from agency:
46-
std::string path = pos[status] + _jobId + "/";
47-
auto tmp_database = _snapshot.hasAsString(path + "database");
48-
auto tmp_collection = _snapshot.hasAsString(path + "collection");
49-
auto tmp_shard = _snapshot.hasAsString(path + "shard");
50-
auto tmp_creator = _snapshot.hasAsString(path + "creator");
51-
52-
if (tmp_database.second && tmp_collection.second
53-
&& tmp_shard.second && tmp_creator.second) {
54-
_database = tmp_database.first;
55-
_collection = tmp_collection.first;
56-
_shard = tmp_shard.first;
57-
_creator = tmp_creator.first;
58-
} else {
45+
try {
46+
std::string path = pos[status] + _jobId + "/";
47+
_database = _snapshot.get(path + "database").getString();
48+
_collection = _snapshot.get(path + "collection").getString();
49+
_shard = _snapshot.get(path + "shard").getString();
50+
_creator = _snapshot.get(path + "creator").getString();
51+
} catch (std::exception const& e) {
5952
std::stringstream err;
60-
err << "Failed to find job " << _jobId << " in agency.";
53+
err << "Failed to find job " << _jobId << " in agency: " << e.what();
6154
LOG_TOPIC(ERR, Logger::SUPERVISION) << err.str();
62-
finish("", tmp_shard.first, false, err.str());
55+
finish("", _shard, false, err.str());
6356
_status = FAILED;
6457
}
6558
}
@@ -130,24 +123,24 @@ bool AddFollower::start() {
130123
finish("", "", true, "collection has been dropped in the meantime");
131124
return false;
132125
}
133-
Node collection = _snapshot.hasAsNode(planColPrefix + _database + "/" + _collection).first;
126+
Node collection = _snapshot.get(planColPrefix + _database + "/" + _collection);
134127
if (collection.has("distributeShardsLike")) {
135128
finish("", "", false,
136129
"collection must not have 'distributeShardsLike' attribute");
137130
return false;
138131
}
139-
132+
140133
// Look at Plan:
141134
std::string planPath =
142135
planColPrefix + _database + "/" + _collection + "/shards/" + _shard;
143136

144-
Slice planned = _snapshot.hasAsSlice(planPath).first;
137+
Slice planned = _snapshot.get(planPath).slice();
145138

146139
TRI_ASSERT(planned.isArray());
147140

148141
// First check that we still have too few followers for the current
149142
// `replicationFactor`:
150-
size_t desiredReplFactor = collection.hasAsUInt("replicationFactor").first;
143+
size_t desiredReplFactor = collection.get("replicationFactor").getUInt();
151144
size_t actualReplFactor = planned.length();
152145
if (actualReplFactor >= desiredReplFactor) {
153146
finish("", "", true, "job no longer necessary, have enough replicas");
@@ -181,7 +174,7 @@ bool AddFollower::start() {
181174
// Check that we have enough:
182175
if (available.size() < desiredReplFactor - actualReplFactor) {
183176
LOG_TOPIC(DEBUG, Logger::SUPERVISION) << "shard " << _shard
184-
<< " does not have enough candidates to add followers, waiting, jobId="
177+
<< " does not have enough candidates to add followers, waiting, jobId="
185178
<< _jobId;
186179
return false;
187180
}
@@ -211,8 +204,9 @@ bool AddFollower::start() {
211204
// will not be in the snapshot under ToDo, but in this case we find it
212205
// in _jb:
213206
if (_jb == nullptr) {
214-
auto tmp_todo = _snapshot.hasAsBuilder(toDoPrefix + _jobId, todo);
215-
if (!tmp_todo.second) {
207+
try {
208+
_snapshot.get(toDoPrefix + _jobId).toBuilder(todo);
209+
} catch (std::exception const&) {
216210
// Just in case, this is never going to happen, since we will only
217211
// call the start() method if the job is already in ToDo.
218212
LOG_TOPIC(INFO, Logger::SUPERVISION)
@@ -231,7 +225,7 @@ bool AddFollower::start() {
231225
}
232226
}
233227
}
234-
228+
235229
// Enter pending, remove todo, block toserver
236230
{ VPackArrayBuilder listOfTransactions(&trx);
237231

@@ -266,7 +260,7 @@ bool AddFollower::start() {
266260
}
267261
} // precondition done
268262
} // array for transaction done
269-
263+
270264
// Transact to agency
271265
write_ret_t res = singleWriteTransaction(_agent, trx);
272266

@@ -300,7 +294,7 @@ arangodb::Result AddFollower::abort() {
300294
"Failed aborting addFollower job beyond pending stage");
301295
}
302296

303-
Result result;
297+
Result result;
304298
// Can now only be TODO or PENDING
305299
if (_status == TODO) {
306300
finish("", "", false, "job aborted");
@@ -309,4 +303,5 @@ arangodb::Result AddFollower::abort() {
309303

310304
TRI_ASSERT(false); // cannot happen, since job moves directly to FINISHED
311305
return result;
306+
312307
}

0 commit comments

Comments (0)