8000 make replication timeouts configurable via startup options by jsteemann · Pull Request #10473 · arangodb/arangodb · GitHub
[go: up one dir, main page]

Skip to content

make replication timeouts configurable via startup options #10473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
v3.5.3 (XXXX-XX-XX)
-------------------

* Make the timeouts for replication requests (for active failover and master-slave
replication) configurable via startup options:

--replication.connect-timeout
--replication.request-timeout

* Fixed internal issue #4647: dead Coordinators are not removed for agency.

* Fixed UPSERT matching.
Expand Down
8 changes: 7 additions & 1 deletion arangod/Cluster/HeartbeatThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,13 @@ void HeartbeatThread::runSingleServer() {
config._idleMinWaitTime = 250 * 1000; // 250ms
config._idleMaxWaitTime = 3 * 1000 * 1000; // 3s
TRI_ASSERT(!config._skipCreateDrop);
config._includeFoxxQueues = true; // sync _queues and _jobs
config._includeFoxxQueues = true; // sync _queues and _jobs

auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
config._connectTimeout = feature->checkConnectTimeout(config._connectTimeout);
config._requestTimeout = feature->checkRequestTimeout(config._requestTimeout);
}

applier->forget(); // forget about any existing configuration
applier->reconfigure(config);
Expand Down
25 changes: 22 additions & 3 deletions arangod/Replication/ReplicationApplierConfiguration.cpp
8000
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "Basics/Exceptions.h"
#include "Cluster/ClusterFeature.h"
#include "GeneralServer/AuthenticationFeature.h"
#include "Replication/ReplicationFeature.h"

#include <velocypack/Builder.h>
#include <velocypack/Iterator.h>
Expand Down Expand Up @@ -63,7 +64,13 @@ ReplicationApplierConfiguration::ReplicationApplierConfiguration()
_requireFromPresent(true),
_incremental(false),
_verbose(false),
_restrictType(RestrictType::None) {}
_restrictType(RestrictType::None) {
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
_requestTimeout = feature->requestTimeout();
_connectTimeout = feature->connectTimeout();
}
}

/// @brief reset the configuration to defaults
void ReplicationApplierConfiguration::reset() {
Expand Down Expand Up @@ -99,6 +106,12 @@ void ReplicationApplierConfiguration::reset() {
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
_force32mode = false;
#endif

auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
_requestTimeout = feature->requestTimeout();
_connectTimeout = feature->connectTimeout();
}
}

/// @brief get a VelocyPack representation
Expand Down Expand Up @@ -219,12 +232,18 @@ ReplicationApplierConfiguration ReplicationApplierConfiguration::fromVelocyPack(

value = slice.get("requestTimeout");
if (value.isNumber()) {
  // Always honor the timeout found in the slice. The ReplicationFeature is only
  // consulted to let a timeout forced via startup option take precedence; if the
  // feature cannot be looked up, the value from the slice must not be dropped.
  double requestTimeout = value.getNumber<double>();
  auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
  if (feature != nullptr) {
    requestTimeout = feature->checkRequestTimeout(requestTimeout);
  }
  configuration._requestTimeout = requestTimeout;
}

value = slice.get("connectTimeout");
if (value.isNumber()) {
  // Always honor the timeout found in the slice. The ReplicationFeature is only
  // consulted to let a timeout forced via startup option take precedence; if the
  // feature cannot be looked up, the value from the slice must not be dropped.
  double connectTimeout = value.getNumber<double>();
  auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
  if (feature != nullptr) {
    connectTimeout = feature->checkConnectTimeout(connectTimeout);
  }
  configuration._connectTimeout = connectTimeout;
}

value = slice.get("maxConnectRetries");
Expand Down
42 changes: 42 additions & 0 deletions arangod/Replication/ReplicationFeature.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ ReplicationFeature* ReplicationFeature::INSTANCE = nullptr;

ReplicationFeature::ReplicationFeature(ApplicationServer& server)
: ApplicationFeature(server, "Replication"),
_connectTimeout(10.0),
_requestTimeout(600.0),
_forceConnectTimeout(false),
_forceRequestTimeout(false),
_replicationApplierAutoStart(true),
_enableActiveFailover(false),
_parallelTailingInvocations(0),
Expand Down Expand Up @@ -74,11 +78,21 @@ void ReplicationFeature::collectOptions(std::shared_ptr<ProgramOptions> options)
options->addOption("--replication.active-failover",
"Enable active-failover during asynchronous replication",
new BooleanParameter(&_enableActiveFailover));

options->addOption("--replication.max-parallel-tailing-invocations",
"Maximum number of concurrently allowed WAL tailing invocations (0 = unlimited)",
new UInt64Parameter(&_maxParallelTailingInvocations),
arangodb::options::makeFlags(arangodb::options::Flags::Hidden))
.setIntroducedIn(30500);

options->addOption("--replication.connect-timeout",
"Default timeout value for replication connection attempts (in seconds)",
new DoubleParameter(&_connectTimeout))
.setIntroducedIn(30409).setIntroducedIn(30504);
options->addOption("--replication.request-timeout",
"Default timeout value for replication requests (in seconds)",
new DoubleParameter(&_requestTimeout))
.setIntroducedIn(30409).setIntroducedIn(30504);
}

void ReplicationFeature::validateOptions(std::shared_ptr<options::ProgramOptions> options) {
Expand All @@ -89,6 +103,20 @@ void ReplicationFeature::validateOptions(std::shared_ptr<options::ProgramOptions
"configured";
FATAL_ERROR_EXIT();
}

if (_connectTimeout < 1.0) {
_connectTimeout = 1.0;
}
if (options->processingResult().touched("--replication.connect-timeout")) {
_forceConnectTimeout = true;
}

if (_requestTimeout < 3.0) {
_requestTimeout = 3.0;
}
if (options->processingResult().touched("--replication.request-timeout")) {
_forceRequestTimeout = true;
}
}

void ReplicationFeature::prepare() {
Expand Down Expand Up @@ -165,6 +193,20 @@ void ReplicationFeature::trackTailingStart() {
/// @brief records that one WAL tailing invocation has finished by lowering
/// the counter of currently running tailing invocations by one
void ReplicationFeature::trackTailingEnd() noexcept {
  _parallelTailingInvocations -= 1;
}

/// @brief picks the connect timeout to use: the timeout stored in this feature
/// when the force flag for the connect timeout is set, otherwise the value
/// handed in by the caller
double ReplicationFeature::checkConnectTimeout(double value) const {
  return _forceConnectTimeout ? _connectTimeout : value;
}

/// @brief picks the request timeout to use: the timeout stored in this feature
/// when the force flag for the request timeout is set, otherwise the value
/// handed in by the caller
double ReplicationFeature::checkRequestTimeout(double value) const {
  return _forceRequestTimeout ? _requestTimeout : value;
}

// start the replication applier for a single database
void ReplicationFeature::startApplier(TRI_vocbase_t* vocbase) {
Expand Down
32 changes: 32 additions & 0 deletions arangod/Replication/ReplicationFeature.h
628C
Original file line number Diff line numberDiff line change
Expand Up @@ -60,6 +60,24 @@ class ReplicationFeature final : public application_features::ApplicationFeature
/// @brief stop the replication applier for a single database
void stopApplier(TRI_vocbase_t* vocbase);

/// @brief connect timeout for replication requests as currently stored in
/// this feature
double connectTimeout() const {
  return _connectTimeout;
}

/// @brief request timeout for replication requests as currently stored in
/// this feature
double requestTimeout() const {
  return _requestTimeout;
}

/// @brief decides which connect timeout a replication request should use.
/// @param value the connect timeout the caller would use by default
/// @return `value` unchanged if the connect timeout is not forced; otherwise
/// the connect timeout stored in this feature. the timeout is forced only
/// when the user set it at startup
double checkConnectTimeout(double value) const;

/// @brief decides which request timeout a replication request should use.
/// @param value the request timeout the caller would use by default
/// @return `value` unchanged if the request timeout is not forced; otherwise
/// the request timeout stored in this feature. the timeout is forced only
/// when the user set it at startup
double checkRequestTimeout(double value) const;

/// @brief whether automatic failover of replication using the agency is
/// enabled
bool isActiveFailoverEnabled() const {
  return _enableActiveFailover;
}

Expand All @@ -81,6 +99,20 @@ class ReplicationFeature final : public application_features::ApplicationFeature
static ReplicationFeature* INSTANCE;

private:
/// @brief connection timeout for replication requests
double _connectTimeout;

/// @brief request timeout for replication requests
double _requestTimeout;

/// @brief whether or not the user-defined connect timeout is forced to be used
/// this is true only if the user set the connect timeout at startup
bool _forceConnectTimeout;

/// @brief whether or not the user-defined request timeout is forced to be used
/// this is true only if the user set the request timeout at startup
bool _forceRequestTimeout;

bool _replicationApplierAutoStart;

/// Enable the active failover
Expand Down
0