[BTS-491][3.7] ArangoRestore connect Retry (#14437) · arangodb/arangodb@999044c · GitHub
[go: up one dir, main page]

Skip to content

Commit 999044c

Browse files
Lars Maier, jsteemann, and KVS85
authored
[BTS-491][3.7] ArangoRestore connect Retry (#14437)
* Added high level retry loop for the FIRST connect.
* Retry on INTERNAL_ERROR as well.
* Added option for retry count. Updated CHANGELOG.
* Added sleep before retry.
* Apply suggestions from code review

Co-authored-by: Jan <jsteemann@users.noreply.github.com>
Co-authored-by: Jan <jsteemann@users.noreply.github.com>
Co-authored-by: Vadim <vadim@arangodb.com>
1 parent bfedd78 commit 999044c

File tree

3 files changed

+28
-2
lines changed

3 files changed

+28
-2
lines changed

CHANGELOG

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
v3.7.13 (XXXX-XX-XX)
22
--------------------
33

4+
* Added a retry loop for arangorestore during the initial connection phase. The
5+
number of retries defaults to 3 and can be configured using
6+
--initial-connect-retries.
7+
48
* Fix display of running and slow queries in web UI when there are multiple
59
coordinators. Previously, the display order of queries was undefined, which
610
could lead to queries from one coordinator being displayed on top once and then

arangosh/Restore/RestoreFeature.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,6 +1347,12 @@ void RestoreFeature::collectOptions(std::shared_ptr<options::ProgramOptions> opt
13471347
new UInt32Parameter(&_options.threadCount))
13481348
.setIntroducedIn(30400);
13491349

1350+
options
1351+
->addOption("--initial-connect-retries",
1352+
"number of connect retries for initial connection",
1353+
new UInt32Parameter(&_options.initialConnectRetries))
1354+
.setIntroducedIn(30713);
1355+
13501356
options->addOption("--include-system-collections",
13511357
"include system collections",
13521358
new BooleanParameter(&_options.includeSystemCollections));
@@ -1610,9 +1616,24 @@ void RestoreFeature::start() {
16101616

16111617
std::unique_ptr<SimpleHttpClient> httpClient;
16121618

1619+
auto const connectRetry = [&](size_t numRetries) -> Result {
1620+
for (size_t i = 0; i < numRetries; i++) {
1621+
if (i > 0) {
1622+
LOG_TOPIC("5855a", WARN, Logger::RESTORE) << "Failed to connect to server, retrying...";
1623+
using namespace std::chrono_literals;
1624+
std::this_thread::sleep_for(i * 1s);
1625+
}
1626+
Result result = _clientManager.getConnectedClient(httpClient, _options.force,
1627+
true, !_options.createDatabase, false);
1628+
if (!result.is(TRI_ERROR_SIMPLE_CLIENT_COULD_NOT_CONNECT) && !result.is(TRI_ERROR_INTERNAL)) {
1629+
return result;
1630+
}
1631+
}
1632+
return {TRI_ERROR_SIMPLE_CLIENT_COULD_NOT_CONNECT};
1633+
};
1634+
16131635
// final result
1614-
Result result = _clientManager.getConnectedClient(httpClient, _options.force,
1615-
true, !_options.createDatabase, false);
1636+
Result result = connectRetry(std::max<uint32_t>(1, _options.initialConnectRetries));
16161637
if (result.is(TRI_ERROR_SIMPLE_CLIENT_COULD_NOT_CONNECT)) {
16171638
LOG_TOPIC("c23bf", FATAL, Logger::RESTORE)
16181639
<< "cannot create server connection, giving up!";

arangosh/Restore/RestoreFeature.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ class RestoreFeature final : public application_features::ApplicationFeature {
7979
std::vector<std::string> numberOfShards;
8080
std::vector<std::string> replicationFactor;
8181
uint32_t threadCount{2};
82+
uint32_t initialConnectRetries{3};
8283
bool clusterMode{false};
8384
bool createDatabase{false};
8485
bool force{false};

0 commit comments

Comments
 (0)
0