8000 Fixes a blockage occurring with hotbackup when writing. (#12398) · adamjm/arangodb@3e23718 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3e23718

Browse files
authored
Fixes a blockage occurring with hotbackup when writing. (arangodb#12398)
* Initial manual port of fix from 3.6. * CHANGELOG. * Hard code isFollowerTransaction to false in the ClusterEngine.
1 parent d5dc571 commit 3e23718

15 files changed

+80
-25
lines changed

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
devel
22
-----
33

4+
* Fixed a blockage on hotbackup when writes are happening concurrently, since
5+
followers could no longer replicate leader transactions.
6+
47
* Updated arangosync to 0.7.9.
58

69
* Fixed hotbackup S3 credentials validation and error reporting for upload

arangod/Cluster/ClusterMethods.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3780,8 +3780,14 @@ arangodb::Result hotBackupCoordinator(ClusterFeature& feature, VPackSlice const
37803780

37813781
std::vector<ServerID> dbServers = ci.getCurrentDBServers();
37823782
std::vector<ServerID> lockedServers;
3783-
double lockWait(1);
3784-
while (steady_clock::now() < end) {
3783+
// We try to hold all write transactions on all dbservers at the same time.
3784+
// The default timeout to get to this state is 120s. We first try for a
3785+
// certain time t, and if not everybody has stopped all transactions within
3786+
// t seconds, we release all locks and try again with t doubled, until the
3787+
// total timeout has been reached. We start with t=15, which gives us
3788+
// 15, 30 and 60 to try before the default timeout of 120s has been reached.
3789+
double lockWait(15.0);
3790+
while (steady_clock::now() < end && !feature.server().isStopping()) {
37853791
result = lockDBServerTransactions(pool, backupId, dbServers, lockWait, lockedServers);
37863792
if (!result.ok()) {
37873793
unlockDBServerTransactions(pool, backupId, lockedServers);
@@ -3796,7 +3802,7 @@ arangodb::Result hotBackupCoordinator(ClusterFeature& feature, VPackSlice const
37963802
break;
37973803
}
37983804
if (lockWait < 3600.0) {
3799-
lockWait *= 1.5;
3805+
lockWait *= 2.0;
38003806
}
38013807
std::this_thread::sleep_for(milliseconds(300));
38023808
}

arangod/Cluster/FollowerInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,8 @@ Result FollowerInfo::persistInAgency(bool isRemove) const {
371371
}
372372
} else {
373373
if (!planEntry.isArray() || planEntry.length() == 0 || !planEntry[0].isString() ||
374-
!planEntry[0].isEqualString(ServerState::instance()->getId())) {
374+
!(planEntry[0].isEqualString(ServerState::instance()->getId()) ||
375+
planEntry[0].isEqualString("_" + ServerState::instance()->getId()))) {
375376
LOG_TOPIC("42231", INFO, Logger::CLUSTER)
376377
<< reportName(isRemove)
377378
<< ", did not find myself in Plan: " << _docColl->vocbase().name()

arangod/ClusterEngine/ClusterTransactionState.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ Result ClusterTransactionState::beginTransaction(transaction::Hints hints) {
8080
updateStatus(transaction::Status::RUNNING);
8181
_vocbase.server().getFeature<MetricsFeature>().serverStatistics()._transactionsStatistics._transactionsStarted++;
8282

83-
transaction::ManagerFeature::manager()->registerTransaction(id(), isReadOnlyTransaction());
83+
transaction::ManagerFeature::manager()->registerTransaction(id(), isReadOnlyTransaction(), false /* isFollowerTransaction */);
8484
setRegistered();
8585

8686
cleanup.cancel();

arangod/RestHandler/RestDocumentHandler.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,9 @@ RestStatus RestDocumentHandler::insertDocument() {
203203
if (!isMultiple && !opOptions.isOverwriteModeUpdateReplace()) {
204204
_activeTrx->addHint(transaction::Hints::Hint::SINGLE_OPERATION);
205205
}
206+
if (!opOptions.isSynchronousReplicationFrom.empty()) {
207+
_activeTrx->addHint(transaction::Hints::Hint::IS_FOLLOWER_TRX);
208+
}
206209

207210
Result res = _activeTrx->begin();
208211
if (!res.ok()) {
@@ -486,6 +489,9 @@ RestStatus RestDocumentHandler::modifyDocument(bool isPatch) {
486489
if (!isArrayCase) {
487490
_activeTrx->addHint(transaction::Hints::Hint::SINGLE_OPERATION);
488491
}
492+
if (!opOptions.isSynchronousReplicationFrom.empty()) {
493+
_activeTrx->addHint(transaction::Hints::Hint::IS_FOLLOWER_TRX);
494+
}
489495

490496
// ...........................................................................
491497
// inside write transaction
@@ -605,6 +611,9 @@ RestStatus RestDocumentHandler::removeDocument() {
605611
if (suffixes.size() == 2 || !search.isArray()) {
606612
_activeTrx->addHint(transaction::Hints::Hint::SINGLE_OPERATION);
607613
}
614+
if (!opOptions.isSynchronousReplicationFrom.empty()) {
615+
_activeTrx->addHint(transaction::Hints::Hint::IS_FOLLOWER_TRX);
616+
}
608617

609618
Result res = _activeTrx->begin();
610619

arangod/RocksDBEngine/RocksDBTransactionState.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ Result RocksDBTransactionState::beginTransaction(transaction::Hints hints) {
9898
}
9999

100100
// register with manager
101-
transaction::ManagerFeature::manager()->registerTransaction(id(), isReadOnlyTransaction());
101+
transaction::ManagerFeature::manager()->registerTransaction(id(), isReadOnlyTransaction(), hasHint(transaction::Hints::Hint::IS_FOLLOWER_TRX));
102102
updateStatus(transaction::Status::RUNNING);
103103
_vocbase.server().getFeature<MetricsFeature>().serverStatistics()._transactionsStatistics._transactionsStarted++;
104104

arangod/StorageEngine/TransactionState.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ class TransactionState {
185185
return (_type == AccessMode::Type::READ);
186186
}
187187
188+
/// @brief whether or not a transaction is a follower transaction
189+
bool isFollowerTransaction() const {
190+
return hasHint(transaction::Hints::Hint::IS_FOLLOWER_TRX);
191+
}
192+
188193
/// @brief whether or not a transaction only has exculsive or read accesses
189194
bool isOnlyExclusiveTransaction() const;
190195

arangod/Transaction/Context.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ transaction::Context::Context(TRI_vocbase_t& vocbase)
7373
_strings{_strArena},
7474
_options(arangodb::velocypack::Options::Defaults),
7575
_dumpOptions(arangodb::velocypack::Options::Defaults),
76-
_transaction{TransactionId::none(), false},
76+
_transaction{TransactionId::none(), false, false},
7777
_ownsResolver(false) {
7878
/// dump options contain have the escapeUnicode attribute set to true
7979
/// this allows dumping of string values as plain 7-bit ASCII values.
@@ -89,7 +89,8 @@ transaction::Context::~Context() {
8989
// unregister the transaction from the logfile manager
9090
if (_transaction.id.isSet()) {
9191
transaction::ManagerFeature::manager()->unregisterTransaction(_transaction.id,
92-
_transaction.isReadOnlyTransaction);
92+
_transaction.isReadOnlyTransaction,
93+
_transaction.isFollowerTransaction);
9394
}
9495

9596
// call the actual cleanup routine which frees all
@@ -240,12 +241,14 @@ std::shared_ptr<TransactionState> transaction::Context::createState(transaction:
240241
/// @brief unregister the transaction
241242
/// this will save the transaction's id and status locally
242243
void transaction::Context::storeTransactionResult(TransactionId id, bool wasRegistered,
243-
bool isReadOnlyTransaction) noexcept {
244+
bool isReadOnlyTransaction,
245+
bool isFollowerTransaction) noexcept {
244246
TRI_ASSERT(_transaction.id.empty());
245247

246248
if (wasRegistered) {
247249
_transaction.id = id;
248250
_transaction.isReadOnlyTransaction = isReadOnlyTransaction;
251+
_transaction.isFollowerTransaction = isFollowerTransaction;
249252
}
250253
}
251254

arangod/Transaction/Context.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,8 @@ class Context {
109109
/// @brief unregister the transaction
110110
/// this will save the transaction's id and status locally
111111
void storeTransactionResult(TransactionId id, bool wasRegistered,
112-
bool isReadOnlyTransaction) noexcept;
112+
bool isReadOnlyTransaction,
113+
bool isFollowerTranaction) noexcept;
113114

114115
public:
115116
/// @brief get a custom type handler
@@ -165,6 +166,7 @@ class Context {
165166
struct {
166167
TransactionId id;
167168
bool isReadOnlyTransaction;
169+
bool isFollowerTransaction;
168170
} _transaction;
169171

170172
bool _ownsResolver;

arangod/Transaction/Hints.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class Hints {
4646
FROM_TOPLEVEL_AQL = 128, // transaction is only runnning one AQL query
4747
GLOBAL_MANAGED = 256, // transaction with externally managed lifetime
4848
INDEX_CREATION = 512, // transaction is for creating index on existing collection (many inserts, no removes, index will be deleted on any failure anyway)
49+
IS_FOLLOWER_TRX = 1024, // transaction used to replicate something on a follower
4950
};
5051

5152
Hints() : _value(0) {}

arangod/Transaction/Manager.cpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,24 +88,39 @@ struct MGMethods final : arangodb::transaction::Methods {
8888
Manager::Manager(ManagerFeature& feature)
8989
: _feature(feature),
9090
_nrRunning(0),
91+
_nrReadLocked(0),
9192
_disallowInserts(false),
9293
_writeLockHeld(false),
9394
_streamingLockTimeout(feature.streamingLockTimeout()) {}
9495

95-
void Manager::registerTransaction(TransactionId /*transactionId*/, bool isReadOnlyTransaction) {
96-
if (!isReadOnlyTransaction) {
96+
void Manager::registerTransaction(TransactionId transactionId,
97+
bool isReadOnlyTransaction,
98+
bool isFollowerTransaction) {
99+
// If isFollowerTransaction is set then either the transactionId should be
100+
// an isFollowerTransactionId or it should be a legacy transactionId:
101+
TRI_ASSERT(!isFollowerTransaction ||
102+
transactionId.isFollowerTransactionId() ||
103+
transactionId.isLegacyTransactionId());
104+
if (!isReadOnlyTransaction && !isFollowerTransaction) {
105+
LOG_TOPIC("ccdea", TRACE, Logger::TRANSACTIONS) << "Acquiring read lock for tid " << transactionId.id();
97106
_rwLock.lockRead();
107+
_nrReadLocked.fetch_add(1, std::memory_order_relaxed);
108+
LOG_TOPIC("ccdeb", TRACE, Logger::TRANSACTIONS) << "Got read lock for tid " << transactionId.id() << " nrReadLocked: " << _nrReadLocked.load(std::memory_order_relaxed);
98109
}
99110

100111
_nrRunning.fetch_add(1, std::memory_order_relaxed);
101112
}
102113

103114
// unregisters a transaction
104-
void Manager::unregisterTransaction(TransactionId transactionId, bool isReadOnlyTransaction) {
115+
void Manager::unregisterTransaction(TransactionId transactionId,
116+
bool isReadOnlyTransaction,
117+
bool isFollowerTransaction) {
105118
// always perform an unlock when we leave this function
106-
auto guard = scopeGuard([this, &isReadOnlyTransaction]() {
107-
if (!isReadOnlyTransaction) {
119+
auto guard = scopeGuard([this, transactionId, &isReadOnlyTransaction, &isFollowerTransaction]() {
120+
if (!isReadOnlyTransaction && !isFollowerTransaction) {
108121
_rwLock.unlockRead();
122+
_nrReadLocked.fetch_sub(1, std::memory_order_relaxed);
123+
LOG_TOPIC("ccded", TRACE, Logger::TRANSACTIONS) << "Released lock for tid " << transactionId.id() << " nrReadLocked: " << _nrReadLocked.load(std::memory_order_relaxed);
109124
}
110125
});
111126

@@ -287,7 +302,7 @@ Result Manager::createManagedTrx(TRI_vocbase_t& vocbase, TransactionId tid,
287302
}
288303

289304
LOG_TOPIC("7bd2d", DEBUG, Logger::TRANSACTIONS)
290-
<< "managed trx creating: '" << tid << "'";
305+
<< "managed trx creating: '" << tid.id() << "'";
291306

292307
const size_t bucket = getBucket(tid);
293308

@@ -595,7 +610,7 @@ Result Manager::updateTransaction(TransactionId tid, transaction::Status status,
595610
status == transaction::Status::ABORTED);
596611

597612
LOG_TOPIC("7bd2f", DEBUG, Logger::TRANSACTIONS)
598-
<< "managed trx '" << tid << " updating to '" << status << "'";
613+
<< "managed trx '" << tid << " updating to '" << (status == transaction::Status::COMMITTED ? "COMMITED" : "ABORTED") << "'";
599614

600615
Result res;
601616
size_t const bucket = getBucket(tid);

arangod/Transaction/Manager.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@
2424
#ifndef ARANGOD_TRANSACTION_MANAGER_H
2525
#define ARANGOD_TRANSACTION_MANAGER_H 1
2626

27+
#include "Basics/Identifier.h"
2728
#include "Basics/ReadWriteLock.h"
2829
#include "Basics/ReadWriteSpinLock.h"
2930
#include "Basics/Result.h"
31+
#include "Logger/LogMacros.h"
3032
#include "Transaction/Status.h"
3133
#include "VocBase/AccessMode.h"
3234
#include "VocBase/Identifiers/TransactionId.h"
@@ -99,10 +101,12 @@ class Manager final {
99101
explicit Manager(ManagerFeature& feature);
100102

101103
// register a transaction
102-
void registerTransaction(TransactionId transactionId, bool isReadOnlyTransaction);
104+
void registerTransaction(TransactionId transactionId, bool isReadOnlyTransaction,
105+
bool isFollowerTransaction);
103106

104107
// unregister a transaction
105-
void unregisterTransaction(TransactionId transactionId, bool isReadOnlyTransaction);
108+
void unregisterTransaction(TransactionId transactionId, bool isReadOnlyTransaction,
109+
bool isFollowerTransaction);
106110

107111
uint64_t getActiveTransactionCount();
108112

@@ -163,9 +167,13 @@ class Manager final {
163167
bool ret = false;
164168
std::unique_lock<std::mutex> guard(_mutex);
165169
if (!_writeLockHeld) {
170+
LOG_TOPIC("eedda", TRACE, Logger::TRANSACTIONS) << "Trying to get write lock to hold transactions...";
166171
ret = _rwLock.lockWrite(timeout);
167172
if (ret) {
173+
LOG_TOPIC("eeddb", TRACE, Logger::TRANSACTIONS) << "Got write lock to hold transactions.";
168174
_writeLockHeld = true;
175+
} else {
176+
LOG_TOPIC("eeddc", TRACE, Logger::TRANSACTIONS) << "Did not get write lock to hold transactions.";
169177
}
170178
}
171179
return ret;
@@ -175,6 +183,7 @@ class Manager final {
175183
void releaseTransactions() {
176184
std::unique_lock<std::mutex> guard(_mutex);
177185
if (_writeLockHeld) {
186+
LOG_TOPIC("eeddd", TRACE, Logger::TRANSACTIONS) << "Releasing write lock to hold transactions.";
178187
_rwLock.unlockWrite();
179188
_writeLockHeld = false;
180189
}
@@ -210,6 +219,7 @@ class Manager final {
210219

211220
/// Nr of running transactions
212221
std::atomic<uint64_t> _nrRunning;
222+
std::atomic<uint64_t> _nrReadLocked;
213223

214224
std::atomic<bool> _disallowInserts;
215225

arangod/Transaction/Methods.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,8 @@ transaction::Methods::~Methods() {
353353
// store result in context
354354
_transactionContext->storeTransactionResult(_state->id(),
355355
_state->wasRegistered(),
356-
_state->isReadOnlyTransaction());
356+
_state->isReadOnlyTransaction(),
357+
_state->isFollowerTransaction());
357358

358359
_state = nullptr;
359360
}

arangod/Utils/SingleCollectionTransaction.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
#include "VocBase/LogicalDataSource.h"
3232

3333
#include "Logger/Logger.h"
34-
#include "Logger/LogMacros.h"
3534

3635
namespace arangodb {
3736

tests/RocksDBEngine/TransactionManagerTest.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ TEST(RocksDBTransactionManager, test_non_overlapping) {
4848
EXPECT_TRUE(tm.holdTransactions(500) );
4949
tm.releaseTransactions();
5050

51-
tm.registerTransaction(static_cast<TransactionId>(1), false);
51+
tm.registerTransaction(static_cast<TransactionId>(1), false, false);
5252
EXPECT_EQ(tm.getActiveTransactionCount(), 1);
53-
tm.unregisterTransaction(static_cast<TransactionId>(1), false);
53+
tm.unregisterTransaction(static_cast<TransactionId>(1), false, false);
5454
EXPECT_EQ(tm.getActiveTransactionCount(), 0);
5555

5656
EXPECT_TRUE(tm.holdTransactions(500) );
@@ -78,7 +78,7 @@ TEST(RocksDBTransactionManager, test_overlapping) {
7878
cv.notify_all();
7979
}
8080

81-
tm.registerTransaction(static_cast<TransactionId>(1), false);
81+
tm.registerTransaction(static_cast<TransactionId>(1), false, false);
8282
EXPECT_EQ(tm.getActiveTransactionCount(), 1);
8383
};
8484

@@ -92,6 +92,6 @@ TEST(RocksDBTransactionManager, test_overlapping) {
9292

9393
reader.join();
9494
EXPECT_EQ(tm.getActiveTransactionCount(), 1);
95-
tm.unregisterTransaction(static_cast<TransactionId>(1), false);
95+
tm.unregisterTransaction(static_cast<TransactionId>(1), false, false);
9696
EXPECT_EQ(tm.getActiveTransactionCount(), 0);
9797
}

0 commit comments

Comments
 (0)
0