Resilience test failure points (#6539) · CoericK/arangodb@0fa7f01 · GitHub
[go: up one dir, main page]

8000
Skip to content

Commit 0fa7f01

Browse files
graetzer authored and jsteemann committed
Resilience test failure points (arangodb#6539)
1 parent 5371241 commit 0fa7f01

27 files changed

+883
-275
lines changed

arangod/Cluster/ClusterInfo.cpp

Lines changed: 0 additions & 51 deletions
_mappingsProt.isValid = true;
Original file line numberDiff line numberDiff line change
@@ -3307,33 +3307,26 @@ void ClusterInfo::loadCurrentMappings() {
33073307

33083308
if (mappings.isObject()) {
33093309
decltype(_coordinatorIdMap) newCoordinatorIdMap;
3310-
decltype(_dbserverIdMap) newDBServerIdMap;
3311-
decltype(_nameMap) newNameMap;
33123310

33133311
for (auto const& mapping : VPackObjectIterator(mappings)) {
33143312
ServerID fullId = mapping.key.copyString();
33153313
auto mapObject = mapping.value;
33163314
if (mapObject.isObject()) {
33173315
ServerShortName shortName = mapObject.get("ShortName").copyString();
3318-
newNameMap.emplace(shortName, fullId);
33193316

33203317
ServerShortID shortId = mapObject.get("TransactionID").getNumericValue<ServerShortID>();
33213318
static std::string const expectedPrefix{"Coordinator"};
33223319
if (shortName.size() > expectedPrefix.size() &&
33233320
shortName.substr(0, expectedPrefix.size()) == expectedPrefix) {
33243321
newCoordinatorIdMap.emplace(shortId, fullId);
3325-
} else {
3326-
newDBServerIdMap.emplace(shortId, fullId);
33273322
}
33283323
}
33293324
}
33303325

33313326
// Now set the new value:
33323327
{
33333328
WRITE_LOCKER(writeLocker, _mappingsProt.lock);
3334-
_nameMap.swap(newNameMap);
33353329
_coordinatorIdMap.swap(newCoordinatorIdMap);
3336-
_dbserverIdMap.swap(newDBServerIdMap);
33373330
_mappingsProt.doneVersion = storedVersion;
33383331
33393332
}
@@ -3594,50 +3587,6 @@ ServerID ClusterInfo::getCoordinatorByShortID(ServerShortID shortId) {
35943587
return result;
35953588
}
35963589

3597-
////////////////////////////////////////////////////////////////////////////////
3598-
/// @brief lookup full dbserver ID from short ID
3599-
////////////////////////////////////////////////////////////////////////////////
3600-
3601-
ServerID ClusterInfo::getDBServerByShortID(ServerShortID shortId) {
3602-
ServerID result;
3603-
3604-
if (!_mappingsProt.isValid) {
3605-
loadCurrentMappings();
3606-
}
3607-
3608-
// return a consistent state of servers
3609-
READ_LOCKER(readLocker, _mappingsProt.lock);
3610-
3611-
auto it = _dbserverIdMap.find(shortId);
3612-
if (it != _dbserverIdMap.end()) {
3613-
result = it->second;
3614-
}
3615-
3616-
return result;
3617-
}
3618-
3619-
////////////////////////////////////////////////////////////////////////////////
3620-
/// @brief lookup full server ID from short name
3621-
////////////////////////////////////////////////////////////////////////////////
3622-
3623-
ServerID ClusterInfo::getServerByShortName(ServerShortName const& shortName) {
3624-
ServerID result;
3625-
3626-
if (!_mappingsProt.isValid) {
3627-
loadCurrentMappings();
3628-
}
3629-
3630-
// return a consistent state of servers
3631-
READ_LOCKER(readLocker, _mappingsProt.lock);
3632-
3633-
auto it = _nameMap.find(shortName);
3634-
if (it != _nameMap.end()) {
3635-
result = it->second;
3636-
}
3637-
3638-
return result;
3639-
}
3640-
36413590
//////////////////////////////////////////////////////////////////////////////
36423591
/// @brief invalidate plan
36433592
//////////////////////////////////////////////////////////////////////////////

arangod/Cluster/ClusterInfo.h

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -548,18 +548,6 @@ class ClusterInfo {
548548

549549
ServerID getCoordinatorByShortID(ServerShortID);
550550

551-
//////////////////////////////////////////////////////////////////////////////
552-
/// @brief lookup a full dbserver ID by short ID
553-
//////////////////////////////////////////////////////////////////////////////
554-
555-
ServerID getDBServerByShortID(ServerShortID);
556-
557-
//////////////////////////////////////////////////////////////////////////////
558-
/// @brief lookup a full server ID by short name
559-
//////////////////////////////////////////////////////////////////////////////
560-
561-
ServerID getServerByShortName(ServerShortName const&);
562-
563551
//////////////////////////////////////////////////////////////////////////////
564552
/// @brief invalidate planned
565553
//////////////////////////////////////////////////////////////////////////////
@@ -595,7 +583,7 @@ class ClusterInfo {
595583
//////////////////////////////////////////////////////////////////////////////
596584

597585
std::shared_ptr<VPackBuilder> getCurrent();
598-
586+
599587
std::vector<std::string> getFailedServers() { MUTEX_LOCKER(guard, _failedServersMutex); return _failedServers; }
600588
void setFailedServers(std::vector<std::string> const& failedServers) { MUTEX_LOCKER(guard, _failedServersMutex); _failedServers = failedServers; }
601589

@@ -715,8 +703,6 @@ class ClusterInfo {
715703

716704
// Mappings between short names/IDs and full server IDs
717705
std::unordered_map<ServerShortID, ServerID> _coordinatorIdMap;
718-
std::unordered_map<ServerShortID, ServerID> _dbserverIdMap;
719-
std::unordered_map<ServerShortName, ServerID> _nameMap;
720706
ProtectionData _mappingsProt;
721707

722708
std::shared_ptr<VPackBuilder> _plan;

arangod/Cluster/HeartbeatThread.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -963,21 +963,20 @@ void HeartbeatThread::runCoordinator() {
963963
{AgencyCommManager::path(), "Target", "FailedServers"}));
964964

965965
if (failedServersSlice.isObject()) {
966-
std::vector<std::string> failedServers = {};
966+
std::vector<ServerID> failedServers = {};
967967
for (auto const& server : VPackObjectIterator(failedServersSlice)) {
968968
if (server.value.isArray() && server.value.length() == 0) {
969969
failedServers.push_back(server.key.copyString());
970970
}
971971
}
972-
// calling pregel code
973972
ClusterInfo::instance()->setFailedServers(failedServers);
974973

975974
pregel::PregelFeature *prgl = pregel::PregelFeature::instance();
976975
if (prgl != nullptr && failedServers.size() > 0) {
977976
pregel::RecoveryManager* mngr = prgl->recoveryManager();
978977
if (mngr != nullptr) {
979978
try {
980-
mngr->updatedFailedServers();
979+
mngr->updatedFailedServers(failedServers);
981980
} catch (std::exception const& e) {
982981
LOG_TOPIC(ERR, Logger::HEARTBEAT)
983982
<< "Got an exception in coordinator heartbeat: " << e.what();

arangod/Cluster/SynchronizeShard.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ arangodb::Result getReadLockId (
172172
}
173173

174174

175-
arangodb::Result count(
175+
arangodb::Result collectionCount(
176176
std::shared_ptr<arangodb::LogicalCollection> const& col, uint64_t& c) {
177177

178178
std::string collectionName(col->name());
@@ -202,7 +202,6 @@ arangodb::Result count(
202202
c = s.getNumber<uint64_t>();
203203

204204
return opResult.result;
205-
206205
}
207206

208207
arangodb::Result addShardFollower (
@@ -232,16 +231,22 @@ arangodb::Result addShardFollower (
232231
return arangodb::Result(TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND, errorMsg);
233232
}
234233

235-
uint64_t c;
236-
count(collection, c);
234+
uint64_t docCount;
235+
Result res = collectionCount(collection, docCount);
236+
if (res.fail()) {
237+
return res;
238+
}
237239
VPackBuilder body;
238240
{ VPackObjectBuilder b(&body);
239241
body.add(FOLLOWER_ID, VPackValue(arangodb::ServerState::instance()->getId()));
240242
body.add(SHARD, VPackValue(shard));
241-
body.add("checksum", VPackValue(std::to_string(c)));
243+
body.add("checksum", VPackValue(std::to_string(docCount)));
242244
if (lockJobId != 0) {
243-
body.add("readLockId", VPackValue(lockJobId));
244-
}}
245+
body.add("readLockId", VPackValue(std::to_string(lockJobId)));
246+
} else {
247+
TRI_ASSERT(docCount == 0);
248+
}
249+
}
245250

246251
auto comres = cc->syncRequest(
247252
TRI_NewTickServer(), endpoint, rest::RequestType::PUT,
@@ -728,16 +733,16 @@ bool SynchronizeShard::first() {
728733
}
729734

730735
auto ep = clusterInfo->getServerEndpoint(leader);
731-
uint64_t c;
732-
if (!count(collection, c).ok()) {
736+
uint64_t docCount;
737+
if (!collectionCount(collection, docCount).ok()) {
733738
std::stringstream error;
734739
error << "failed to get a count on leader " << shard;
735740
LOG_TOPIC(ERR, Logger::MAINTENANCE) << "SynchronizeShard " << error.str();
736741
_result.reset(TRI_ERROR_INTERNAL, error.str());
737742
return false;
738743
}
739744

740-
if (c == 0) {
745+
if (docCount == 0) {
741746
// We have a short cut:
742747
LOG_TOPIC(DEBUG, Logger::MAINTENANCE) <<
743748
"synchronizeOneShard: trying short cut to synchronize local shard '" <<

arangod/ClusterEngine/ClusterCollection.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -479,9 +479,9 @@ void ClusterCollection::invokeOnAllElements(
479479
// -- SECTION DML Operations --
480480
///////////////////////////////////
481481

482-
void ClusterCollection::truncate(transaction::Methods* trx,
482+
Result ClusterCollection::truncate(transaction::Methods* trx,
483483
OperationOptions& options) {
484-
THROW_ARANGO_EXCEPTION(TRI_ERROR_NOT_IMPLEMENTED);
484+
return Result(TRI_ERROR_NOT_IMPLEMENTED);
485485
}
486486

487487
LocalDocumentId ClusterCollection::lookupKey(transaction::Methods* trx,

arangod/ClusterEngine/ClusterCollection.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ class ClusterCollection final : public PhysicalCollection {
131131
// -- SECTION DML Operations --
132132
///////////////////////////////////
133133

134-
void truncate(transaction::Methods* trx, OperationOptions& options) override;
134+
Result truncate(transaction::Methods* trx, OperationOptions&) override;
135135

136136
void deferDropCollection(
137137
std::function<bool(LogicalCollection&)> const& callback

arangod/MMFiles/MMFilesCollection.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2781,8 +2781,8 @@ int MMFilesCollection::unlockWrite(bool useDeadlockDetector, TransactionState co
27812781
return TRI_ERROR_NO_ERROR;
27822782
}
27832783

2784-
void MMFilesCollection::truncate(transaction::Methods* trx,
2785-
OperationOptions& options) {
2784+
Result MMFilesCollection::truncate(transaction::Methods* trx,
2785+
OperationOptions& options) {
27862786
auto primaryIdx = primaryIndex();
27872787

27882788
options.ignoreRevs = true;
@@ -2811,7 +2811,15 @@ void MMFilesCollection::truncate(transaction::Methods* trx,
28112811

28122812
return true;
28132813
};
2814-
primaryIdx->invokeOnAllElementsForRemoval(callback);
2814+
try {
2815+
primaryIdx->invokeOnAllElementsForRemoval(callback);
2816+
} catch(basics::Exception const& e) {
2817+
return Result(e.code(), e.message());
2818+
} catch(std::exception const& e) {
2819+
return Result(TRI_ERROR_INTERNAL, e.what());
2820+
} catch(...) {
2821+
return Result(TRI_ERROR_INTERNAL, "unknown error during truncate");
2822+
}
28152823

28162824
READ_LOCKER(guard, _indexesLock);
28172825
auto indexes = _indexes;
@@ -2822,6 +2830,8 @@ void MMFilesCollection::truncate(transaction::Methods* trx,
28222830
TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX);
28232831
idx->afterTruncate();
28242832
}
2833+
2834+
return Result();
28252835
}
28262836

28272837
LocalDocumentId MMFilesCollection::reuseOrCreateLocalDocumentId(OperationOptions const& options) const {

arangod/MMFiles/MMFilesCollection.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ class MMFilesCollection final : public PhysicalCollection {
283283
// -- SECTION DML Operations --
284284
///////////////////////////////////
285285

286-
void truncate(transaction::Methods* trx, OperationOptions& options) override;
286+
Result truncate(transaction::Methods* trx, OperationOptions&) override;
287287

288288
/// @brief Defer a callback to be executed when the collection
289289
/// can be dropped. The callback is supposed to drop

arangod/Pregel/Recovery.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,9 @@ int RecoveryManager::filterGoodServers(std::vector<ServerID> const& servers,
137137
return TRI_ERROR_NO_ERROR;
138138
}
139139

140-
void RecoveryManager::updatedFailedServers() {
140+
void RecoveryManager::updatedFailedServers(std::vector<ServerID> const& failed) {
141141
MUTEX_LOCKER(guard, _lock); // we are accessing _primaryServers
142142

143-
std::vector<std::string> const failed =
144-
ClusterInfo::instance()->getFailedServers();
145143
for (auto const& pair : _primaryServers) {
146144
auto const& it = std::find(failed.begin(), failed.end(), pair.second);
147145
if (it != failed.end()) {

arangod/Pregel/Recovery.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class RecoveryManager {
6161
void stopMonitoring(Conductor*);
6262
int filterGoodServers(std::vector<ServerID> const& servers,
6363
std::vector<ServerID>& goodServers);
64-
void updatedFailedServers();
64+
void updatedFailedServers(std::vector<ServerID> const& failedServers);
6565
// bool allServersAvailable(std::vector<ServerID> const& dbServers);
6666
};
6767

0 commit comments

Comments
 (0)
0