From b6eb49db35299798edca4ea28ff9b0a583abba48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20G=C3=B6dderz?= Date: Thu, 26 Aug 2021 15:44:17 +0200 Subject: [PATCH 01/19] [3.8] Lower priority of AQL lanes (#14699) * Lower priority of AQL lanes * Added CHANGELOG entry * Improved comments Co-authored-by: Vadim --- CHANGELOG | 4 ++ arangod/Aql/Query.cpp | 2 +- arangod/Aql/RestAqlHandler.cpp | 8 ++++ arangod/Aql/RestAqlHandler.h | 2 +- arangod/Aql/SharedQueryState.cpp | 62 +++++++++++++++-------------- arangod/GeneralServer/RequestLane.h | 19 ++++----- 6 files changed, 57 insertions(+), 40 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index d972f7047b87..80e0a843bd05 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,10 @@ v3.8.1 (XXXX-XX-XX) ------------------- +* Reduce internal priority of AQL execution. This prevents possible deadlocks + with modification operations in a cluster and replicationFactor >= 2, and can + also improve responsiveness under high load of AQL queries. + * Updated arangosync to 2.6.0. 
* Added protocol specific metrics: histogram about request body size, total diff --git a/arangod/Aql/Query.cpp b/arangod/Aql/Query.cpp index 8f642247dcb4..71d28bf09c69 100644 --- a/arangod/Aql/Query.cpp +++ b/arangod/Aql/Query.cpp @@ -1261,7 +1261,7 @@ futures::Future finishDBServerParts(Query& query, ErrorCode errorCode) { network::RequestOptions options; options.database = query.vocbase().name(); options.timeout = network::Timeout(60.0); // Picked arbitrarily - options.continuationLane = RequestLane::CLUSTER_AQL_CONTINUATION; + options.continuationLane = RequestLane::CLUSTER_AQL_INTERNAL_COORDINATOR; // options.skipScheduler = true; VPackBuffer body; diff --git a/arangod/Aql/RestAqlHandler.cpp b/arangod/Aql/RestAqlHandler.cpp index 38a010dbe00c..32048f0aad7d 100644 --- a/arangod/Aql/RestAqlHandler.cpp +++ b/arangod/Aql/RestAqlHandler.cpp @@ -851,3 +851,11 @@ RestStatus RestAqlHandler::handleFinishQuery(std::string const& idString) { generateResult(rest::ResponseCode::OK, std::move(buffer)); })); } + +RequestLane RestAqlHandler::lane() const { + if (ServerState::instance()->isCoordinator()) { + return RequestLane::CLUSTER_AQL_INTERNAL_COORDINATOR; + } else { + return RequestLane::CLUSTER_AQL; + } +} diff --git a/arangod/Aql/RestAqlHandler.h b/arangod/Aql/RestAqlHandler.h index 380f7b981f79..b890696a5efa 100644 --- a/arangod/Aql/RestAqlHandler.h +++ b/arangod/Aql/RestAqlHandler.h @@ -44,7 +44,7 @@ class RestAqlHandler : public RestVocbaseBaseHandler { public: char const* name() const override final { return "RestAqlHandler"; } - RequestLane lane() const override final { return RequestLane::CLUSTER_AQL; } + RequestLane lane() const override final; RestStatus execute() override; RestStatus continueExecute() override; void shutdownExecute(bool isFinalized) noexcept override; diff --git a/arangod/Aql/SharedQueryState.cpp b/arangod/Aql/SharedQueryState.cpp index 6820efa79dec..2b5d2f61e350 100644 --- a/arangod/Aql/SharedQueryState.cpp +++ 
b/arangod/Aql/SharedQueryState.cpp @@ -26,6 +26,7 @@ #include "ApplicationFeatures/ApplicationServer.h" #include "Basics/Exceptions.h" #include "Basics/ScopeGuard.h" +#include "Cluster/ServerState.h" #include "RestServer/QueryRegistryFeature.h" #include "Scheduler/Scheduler.h" #include "Scheduler/SchedulerFeature.h" @@ -129,34 +130,37 @@ void SharedQueryState::queueHandler() { return; } - bool queued = - scheduler->queue(RequestLane::CLUSTER_AQL_CONTINUATION, - [self = shared_from_this(), cb = _wakeupCb, v = _cbVersion]() { - std::unique_lock lck(self->_mutex, std::defer_lock); - - do { - bool cntn = false; - try { - cntn = cb(); - } catch (...) { - } - - lck.lock(); - if (v == self->_cbVersion) { - unsigned c = self->_numWakeups--; - TRI_ASSERT(c > 0); - if (c == 1 || !cntn || !self->_valid) { - break; - } - } else { - return; - } - lck.unlock(); - } while (true); - - TRI_ASSERT(lck); - self->queueHandler(); - }); + auto const lane = ServerState::instance()->isCoordinator() + ? RequestLane::CLUSTER_AQL_INTERNAL_COORDINATOR + : RequestLane::CLUSTER_AQL; + + bool queued = scheduler->queue(lane, [self = shared_from_this(), + cb = _wakeupCb, v = _cbVersion]() { + std::unique_lock lck(self->_mutex, std::defer_lock); + + do { + bool cntn = false; + try { + cntn = cb(); + } catch (...) 
{ + } + + lck.lock(); + if (v == self->_cbVersion) { + unsigned c = self->_numWakeups--; + TRI_ASSERT(c > 0); + if (c == 1 || !cntn || !self->_valid) { + break; + } + } else { + return; + } + lck.unlock(); + } while (true); + + TRI_ASSERT(lck); + self->queueHandler(); + }); if (!queued) { // just invalidate _wakeupCb = nullptr; @@ -168,7 +172,7 @@ void SharedQueryState::queueHandler() { bool SharedQueryState::queueAsyncTask(fu2::unique_function cb) { Scheduler* scheduler = SchedulerFeature::SCHEDULER; if (scheduler) { - return scheduler->queue(RequestLane::CLIENT_AQL, std::move(cb)); + return scheduler->queue(RequestLane::CLUSTER_AQL, std::move(cb)); } return false; } diff --git a/arangod/GeneralServer/RequestLane.h b/arangod/GeneralServer/RequestLane.h index ffe1f6223115..fdc5b673ad15 100644 --- a/arangod/GeneralServer/RequestLane.h +++ b/arangod/GeneralServer/RequestLane.h @@ -62,15 +62,16 @@ enum class RequestLane { // V8 or having high priority. CLUSTER_INTERNAL, - // For requests from the DBserver to the Coordinator or - // from the Coordinator to the DBserver. Using AQL - // these have Medium priority. + // Internal AQL requests, or continuations. Low priority. CLUSTER_AQL, - // For future continuations initiated as part of an AQL request from the - // DBserver to the Coordinator or from the Coordinator to the DBserver. - // These have High priority. - CLUSTER_AQL_CONTINUATION, + // For requests from the DBserver to the Coordinator, and continuations on the + // Coordinator. + // These have medium priority. Because client requests made against the + // RestCursorHandler (with lane CLIENT_AQL) might block and need these to + // finish. Ongoing low priority requests can also prevent low priority lanes + // from being worked on, having the same effect. + CLUSTER_AQL_INTERNAL_COORDINATOR, // For requests from the from the Coordinator to the // DBserver using V8. 
@@ -142,9 +143,9 @@ inline RequestPriority PriorityRequestLane(RequestLane lane) { case RequestLane::CLUSTER_INTERNAL: return RequestPriority::HIGH; case RequestLane::CLUSTER_AQL: + return RequestPriority::LOW; + case RequestLane::CLUSTER_AQL_INTERNAL_COORDINATOR: return RequestPriority::MED; - case RequestLane::CLUSTER_AQL_CONTINUATION: - return RequestPriority::HIGH; case RequestLane::CLUSTER_V8: return RequestPriority::LOW; case RequestLane::CLUSTER_ADMIN: From fc248f727a40584e15404c94ddd3d36f067cad01 Mon Sep 17 00:00:00 2001 From: Jan Date: Thu, 26 Aug 2021 15:45:09 +0200 Subject: [PATCH 02/19] added a test for statistics behavior (#14703) --- .../test-statistics-enabled-false.js | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 tests/js/client/server_parameters/test-statistics-enabled-false.js diff --git a/tests/js/client/server_parameters/test-statistics-enabled-false.js b/tests/js/client/server_parameters/test-statistics-enabled-false.js new file mode 100644 index 000000000000..c6e5b1967f0f --- /dev/null +++ b/tests/js/client/server_parameters/test-statistics-enabled-false.js @@ -0,0 +1,88 @@ +/*jshint globalstrict:false, strict:false */ +/* global getOptions, assertEqual, assertTrue, arango */ + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test for security-related server options +/// +/// @file +/// +/// DISCLAIMER +/// +/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB Inc, Cologne, Germany +/// +/// @author Jan Steemann +/// @author Copyright 2019, ArangoDB Inc, Cologne, Germany +//////////////////////////////////////////////////////////////////////////////// + +if (getOptions === true) { + return { + 'server.statistics': "false", + 'server.export-metrics-api': "true", + }; +} + +const jsunity = require('jsunity'); + +function testSuite() { + return { + testGetAdminStatistics : function() { + let res = arango.GET("/_admin/statistics"); + assertTrue(res.error); + assertEqual(404, res.code); + }, + + testGetAdminStatisticsDescription : function() { + let res = arango.GET("/_admin/statistics-description"); + assertTrue(res.error); + assertEqual(404, res.code); + }, + + testGetMetrics : function() { + let metrics = {}; + String(arango.GET("/_admin/metrics/v2")) + .split("\n") + .filter((line) => line.match(/^[^#]/)) + .filter((line) => line.match(/^arangodb_/)) + .forEach((line) => { + let name = line.replace(/[ \{].*$/g, ''); + let value = Number(line.replace(/^.+ (\d+)$/, '$1')); + metrics[name] = value; + }); + + const expected = [ + "arangodb_process_statistics_minor_page_faults_total", + "arangodb_process_statistics_major_page_faults_total", + "arangodb_process_statistics_user_time", + "arangodb_process_statistics_system_time", + "arangodb_process_statistics_number_of_threads", + "arangodb_process_statistics_resident_set_size", + "arangodb_process_statistics_resident_set_size_percent", + "arangodb_process_statistics_virtual_memory_size", + "arangodb_server_statistics_physical_memory", + "arangodb_server_statistics_server_uptime_total", + ]; + + expected.forEach((name) => { + assertTrue(metrics.hasOwnProperty(name), name); + assertEqual("number", typeof metrics[name], name); + }); + }, + }; +} + +jsunity.run(testSuite); +return jsunity.done(); From 
24bc4c20098ab0e5a6f53a6867048f1f81b9e507 Mon Sep 17 00:00:00 2001 From: Jan Date: Thu, 26 Aug 2021 15:45:42 +0200 Subject: [PATCH 03/19] properly rename test file (#14705) --- ...istics-spec-nocluster.js => api-statistics-spec-noncluster.js} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/js/client/shell/{api-statistics-spec-nocluster.js => api-statistics-spec-noncluster.js} (100%) diff --git a/tests/js/client/shell/api-statistics-spec-nocluster.js b/tests/js/client/shell/api-statistics-spec-noncluster.js similarity index 100% rename from tests/js/client/shell/api-statistics-spec-nocluster.js rename to tests/js/client/shell/api-statistics-spec-noncluster.js From e0f237a5d9c29b34325f78b3e75789a686272363 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 08:48:58 +0200 Subject: [PATCH 04/19] fix hard time limits on move shard and cleanout server --- CHANGELOG | 5 +- arangod/Agency/CleanOutServer.cpp | 15 +- arangod/Agency/Job.cpp | 11 ++ arangod/Agency/Job.h | 3 + arangod/Agency/MoveShard.cpp | 48 +++--- .../RestHandler/RestAdminClusterHandler.cpp | 142 +++++++++++++++++- arangod/RestHandler/RestAdminClusterHandler.h | 2 + tests/Agency/CleanOutServerTest.cpp | 73 ++++++++- tests/Agency/MoveShardTest.cpp | 133 +--------------- 9 files changed, 261 insertions(+), 171 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 88610bc6f37c..86cc8f9f09db 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,9 @@ v3.8.2 (XXXX-XX-XX) ------------------- +* No runtime limits for shard move and server cleanout jobs, instead + possibility to cancel them. + * Updated ArangoDB Starter to 0.15.2. * SEARCH-238: Improved SortNodes placement optimization in cluster so late @@ -10,7 +13,7 @@ v3.8.2 (XXXX-XX-XX) several clients concurrently inserts data and use custom analyzer with non-string return type. -* Fix a rare shutdown race in RocksDBShaCalculatorThread. +* Fix a rare shutdown race in RocksDBShaCalculatorThread. 
* Reduce internal priority of AQL execution. This prevents possible deadlocks with modification operations in a cluster and replicationFactor >= 2, and can diff --git a/arangod/Agency/CleanOutServer.cpp b/arangod/Agency/CleanOutServer.cpp index 264ff5a79a75..70f1fd6d2bf7 100644 --- a/arangod/Agency/CleanOutServer.cpp +++ b/arangod/Agency/CleanOutServer.cpp @@ -82,14 +82,8 @@ JOB_STATUS CleanOutServer::status() { } if (found > 0) { // some subjob still running - // timeout here: - auto tmp_time = - _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated"); - std::string timeCreatedString = tmp_time.first; - Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString); - Supervision::TimePoint now(std::chrono::system_clock::now()); - if (now - timeCreated > std::chrono::duration(86400.0)) { // 1 day - abort("job timed out"); + // consider cancellation + if (considerCancellation()) { return FAILED; } return PENDING; @@ -198,6 +192,11 @@ bool CleanOutServer::create(std::shared_ptr envelope) { } bool CleanOutServer::start(bool& aborts) { + + if (considerCancellation()) { + return false; + } + // If anything throws here, the run() method catches it and finishes // the job. 
diff --git a/arangod/Agency/Job.cpp b/arangod/Agency/Job.cpp index c29c156615be..81241c057ddd 100644 --- a/arangod/Agency/Job.cpp +++ b/arangod/Agency/Job.cpp @@ -87,6 +87,17 @@ Job::~Job() = default; // this will be initialized in the AgencyFeature std::string Job::agencyPrefix = "arango"; +bool Job::considerCancellation() { + // Allow for cancellation of shard moves + auto [cancel,exists] = + _snapshot.hasAsBool(std::string("/Target/") + jobStatus[_status] + "/" + _jobId + "/abort"); + auto cancelled = exists && cancel; + if (cancelled) { + abort("Killed via API"); + } + return cancelled; +}; + bool Job::finish(std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) { try { // protect everything, just in case diff --git a/arangod/Agency/Job.h b/arangod/Agency/Job.h index ff3e54ca2cec..88f47eadd4da 100644 --- a/arangod/Agency/Job.h +++ b/arangod/Agency/Job.h @@ -49,6 +49,7 @@ namespace consensus { class Node; enum JOB_STATUS { TODO, PENDING, FINISHED, FAILED, NOTFOUND }; +const std::vector jobStatus {"ToDo", "Pending", "Finished", "Failed"}; const std::vector pos({"/Target/ToDo/", "/Target/Pending/", "/Target/Finished/", "/Target/Failed/"}); extern std::string const mapUniqueToShortID; @@ -88,6 +89,8 @@ struct Job { virtual void run(bool& aborts) = 0; + bool considerCancellation(); + void runHelper(std::string const& server, std::string const& shard, bool& aborts) { if (_status == FAILED) { // happens when the constructor did not work return; diff --git a/arangod/Agency/MoveShard.cpp b/arangod/Agency/MoveShard.cpp index 519ba1a73a8a..1c3dc03768f8 100644 --- a/arangod/Agency/MoveShard.cpp +++ b/arangod/Agency/MoveShard.cpp @@ -181,6 +181,11 @@ bool MoveShard::create(std::shared_ptr envelope) { } bool MoveShard::start(bool&) { + + if (considerCancellation()) { + return false; + } + // If anything throws here, the run() method catches it and finishes // the job. 
@@ -477,6 +482,13 @@ bool MoveShard::start(bool&) { } JOB_STATUS MoveShard::status() { + + if (_status == PENDING || _status == TODO) { + if (considerCancellation()) { + return FAILED; + } + } + if (_status != PENDING) { return _status; } @@ -495,19 +507,6 @@ JOB_STATUS MoveShard::status() { JOB_STATUS MoveShard::pendingLeader() { - auto considerTimeout = [&]() -> bool { - // Not yet all in sync, consider timeout: - std::string timeCreatedString = - _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first; - Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString); - Supervision::TimePoint now(std::chrono::system_clock::now()); - if (now - timeCreated > std::chrono::duration(43200.0)) { // 12h - abort("MoveShard timed out in pending leader"); - return true; - } - return false; - }; - // Find the other shards in the same distributeShardsLike group: std::vector shardsLikeMe = clones(_snapshot, _database, _collection, _shard); @@ -556,9 +555,9 @@ JOB_STATUS MoveShard::pendingLeader() { } }); - // Consider timeout: + // Consider cancellation: if (done < shardsLikeMe.size()) { - if (considerTimeout()) { + if (considerCancellation()) { return FAILED; } return PENDING; // do not act @@ -623,9 +622,9 @@ JOB_STATUS MoveShard::pendingLeader() { } }); - // Consider timeout: + // Consider cancellation: if (done < shardsLikeMe.size()) { - if (considerTimeout()) { + if (considerCancellation()) { return FAILED; } return PENDING; // do not act! @@ -752,9 +751,9 @@ JOB_STATUS MoveShard::pendingLeader() { } }); - // Consider timeout: + // Consider cancellation: if (done < shardsLikeMe.size()) { - if (considerTimeout()) { + if (considerCancellation()) { return FAILED; } return PENDING; // do not act! 
@@ -841,6 +840,7 @@ JOB_STATUS MoveShard::pendingLeader() { } JOB_STATUS MoveShard::pendingFollower() { + // Check if any of the servers in the Plan are FAILED, if so, // we abort: std::string planPath = @@ -867,14 +867,8 @@ JOB_STATUS MoveShard::pendingFollower() { } }); - if (done < shardsLikeMe.size()) { - // Not yet all in sync, consider timeout: - std::string timeCreatedString = - _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first; - Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString); - Supervision::TimePoint now(std::chrono::system_clock::now()); - if (now - timeCreated > std::chrono::duration(10000.0)) { - abort("MoveShard timed out in pending follower"); + if (done < shardsLikeMe.size()) { // Consider cancellation + if (considerCancellation()) { return FAILED; } return PENDING; diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 71194d49c985..446ba4f8f22e 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -37,6 +37,7 @@ #include "Agency/TransactionBuilder.h" #include "ApplicationFeatures/ApplicationServer.h" #include "Basics/ResultT.h" +#include "Cluster/AgencyCache.h" #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterHelpers.h" #include "Cluster/ClusterInfo.h" @@ -270,6 +271,7 @@ std::string const RestAdminClusterHandler::ResignLeadership = "resignLeadership"; std::string const RestAdminClusterHandler::MoveShard = "moveShard"; std::string const RestAdminClusterHandler::QueryJobStatus = "queryAgencyJob"; +std::string const RestAdminClusterHandler::CancelJob = "cancelAgencyJob"; std::string const RestAdminClusterHandler::RemoveServer = "removeServer"; std::string const RestAdminClusterHandler::RebalanceShards = "rebalanceShards"; std::string const RestAdminClusterHandler::ShardStatistics = "shardStatistics"; @@ -331,6 +333,8 @@ RestStatus RestAdminClusterHandler::execute() { 
return handleResignLeadership(); } else if (command == MoveShard) { return handleMoveShard(); + } else if (command == CancelJob) { + return handleCancelJob(); } else if (command == QueryJobStatus) { return handleQueryJobStatus(); } else if (command == RemoveServer) { @@ -886,6 +890,142 @@ RestStatus RestAdminClusterHandler::handleQueryJobStatus() { })); } +RestStatus RestAdminClusterHandler::handleCancelJob() { + if (!ExecContext::current().isAdminUser()) { + generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN); + return RestStatus::DONE; + } + + if (!ServerState::instance()->isCoordinator()) { + generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN, + "only allowed on coordinators"); + return RestStatus::DONE; + } + + if (request()->requestType() != rest::RequestType::POST) { + generateError(rest::ResponseCode::METHOD_NOT_ALLOWED, TRI_ERROR_HTTP_METHOD_NOT_ALLOWED); + return RestStatus::DONE; + } + + bool parseSuccess; + VPackSlice body = parseVPackBody(parseSuccess); + if (!parseSuccess) { + return RestStatus::DONE; + } + + std::string jobId; + if (body.isObject() && body.hasKey("id") && body.get("id").isString()) { + jobId = body.get("id").copyString(); + } else { + generateError(rest::ResponseCode::BAD, TRI_ERROR_BAD_PARAMETER, "object with key `id`"); + return RestStatus::DONE; + } + + auto targetPath = arangodb::cluster::paths::root()->arango()->target(); + std::vector paths = { + targetPath->pending()->job(jobId)->str(), + targetPath->failed()->job(jobId)->str(), + targetPath->finished()->job(jobId)->str(), + targetPath->toDo()->job(jobId)->str(), + }; + + try { + auto& agencyCache = server().getFeature().agencyCache(); + auto [acb, idx] = agencyCache.read(paths); + auto res = acb->slice(); + + auto paths = std::vector{ + targetPath->pending()->job(jobId)->vec(), + targetPath->failed()->job(jobId)->vec(), + targetPath->finished()->job(jobId)->vec(), + targetPath->toDo()->job(jobId)->vec(), + }; + + for (auto const& path : 
paths) { + VPackSlice job = res.at(0).get(path); + + if (job.isObject()) { + LOG_DEVEL << job.toJson(); + LOG_TOPIC("eb139", INFO, Logger::SUPERVISION) + << "Attempting to abort supervision job " << job.toJson(); + auto type = job.get("type").copyString(); + if (type != "moveShard") { // only moveshard may be aborted + generateError(Result{400, "Only MoveShard jobs can be aborted"}); + return RestStatus::DONE; + } else if (path[2] != "Pending" && path[2] != "ToDo") { + generateError(Result{400, path[2] + " jobs can no longer be cancelled."}); + return RestStatus::DONE; + } + + auto sendTransaction = [&] { + VPackBuffer trxBody; + { VPackBuilder builder(trxBody); + { VPackArrayBuilder trxs(&builder); + if (path[2] == "ToDo") { + VPackArrayBuilder trx(&builder); + { VPackObjectBuilder op(&builder); + builder.add("arango/Target/ToDo/" + jobId + "/abort", VPackValue(true)); } + { VPackObjectBuilder pre(&builder); + builder.add(VPackValue("arango/Target/ToDo/" + jobId)); + { VPackObjectBuilder val(&builder); + builder.add("oldEmpty", VPackValue(false)); }} + } + VPackArrayBuilder trx(&builder); + { VPackObjectBuilder op(&builder); + builder.add("arango/Target/Pending/" + jobId + "/abort", VPackValue(true)); } + { VPackObjectBuilder pre(&builder); + builder.add(VPackValue("arango/Target/Pending/" + jobId)); + { VPackObjectBuilder val(&builder); + builder.add("oldEmpty", VPackValue(false)); }} + } + } + return AsyncAgencyComm().sendWriteTransaction(60s, std::move(trxBody)); + }; + + waitForFuture( + sendTransaction() + .thenValue([this](AsyncAgencyCommResult&& wr) { + if (!wr.ok()) { + if (wr.statusCode() == 412) { + generateError(Result{412, "Job is no longer running"}); + } else { + generateError(wr.asResult()); + } + } + return futures::makeFuture(); + }) + .thenError([this](VPackException const& e) { + generateError(Result{e.errorCode(), e.what()}); + }) + .thenError([this](std::exception const& e) { + generateError(rest::ResponseCode::SERVER_ERROR, + 
TRI_ERROR_HTTP_SERVER_ERROR, e.what()); + })); + VPackBuffer payload; + { + VPackBuilder builder(payload); + VPackObjectBuilder ob(&builder); + builder.add("job", VPackValue(jobId)); + builder.add("status", VPackValue("aborted")); + builder.add("error", VPackValue(false)); + } + resetResponse(rest::ResponseCode::OK); + response()->setPayload(std::move(payload)); + return RestStatus::DONE; + } + } + + generateError(rest::ResponseCode::NOT_FOUND, TRI_ERROR_HTTP_NOT_FOUND); + + } catch (VPackException const& e) { + generateError(Result{e.errorCode(), e.what()}); + } catch (std::exception const& e) { + generateError(rest::ResponseCode::SERVER_ERROR, TRI_ERROR_HTTP_SERVER_ERROR, e.what()); + } + + return RestStatus::DONE; +} + RestStatus RestAdminClusterHandler::handleSingleServerJob(std::string const& job) { if (!ExecContext::current().isAdminUser()) { generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN); @@ -1232,7 +1372,7 @@ RestStatus RestAdminClusterHandler::setMaintenance(bool wantToActivate) { auto sendTransaction = [&] { if (wantToActivate) { constexpr int timeout = 3600; // 1 hour - return AsyncAgencyComm().setValue(60s, maintenancePath, + return AsyncAgencyComm().setValue(60s, maintenancePath, VPackValue(timepointToString( std::chrono::system_clock::now() + std::chrono::seconds(timeout))), 3600); } else { diff --git a/arangod/RestHandler/RestAdminClusterHandler.h b/arangod/RestHandler/RestAdminClusterHandler.h index 58b4e9996d18..62de08f4835f 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.h +++ b/arangod/RestHandler/RestAdminClusterHandler.h @@ -47,6 +47,7 @@ class RestAdminClusterHandler : public RestVocbaseBaseHandler { RequestLane lane() const override final { return RequestLane::CLIENT_SLOW; } private: + static std::string const CancelJob; static std::string const Health; static std::string const NumberOfServers; static std::string const Maintenance; @@ -87,6 +88,7 @@ class RestAdminClusterHandler : public RestVocbaseBaseHandler 
{ RestStatus handleCleanoutServer(); RestStatus handleResignLeadership(); RestStatus handleMoveShard(); + RestStatus handleCancelJob(); RestStatus handleQueryJobStatus(); RestStatus handleRemoveServer(); diff --git a/tests/Agency/CleanOutServerTest.cpp b/tests/Agency/CleanOutServerTest.cpp index a20b13889cae..05e1a0da4f19 100644 --- a/tests/Agency/CleanOutServerTest.cpp +++ b/tests/Agency/CleanOutServerTest.cpp @@ -585,7 +585,7 @@ TEST_F(CleanOutServerTest, cleanout_server_job_should_move_into_pending_if_ok) { Verify(Method(mockAgent, write)); } -TEST_F(CleanOutServerTest, cleanout_server_job_should_abort_after_timeout) { +TEST_F(CleanOutServerTest, test_cancel_pending_job) { TestStructureType createTestStructure = [&](VPackSlice const& s, std::string const& path) { std::unique_ptr builder; builder.reset(new VPackBuilder()); @@ -604,13 +604,9 @@ TEST_F(CleanOutServerTest, cleanout_server_job_should_abort_after_timeout) { builder->add(VPackValue(JOBID)); builder->add(VPackValue(VPackValueType::Object)); for (auto const& jobIt : VPackObjectIterator(job.slice())) { - if (jobIt.key.copyString() == "timeCreated") { - builder->add(jobIt.key.copyString(), - VPackValue("2015-01-03T20:00:00Z")); - } else { builder->add(jobIt.key.copyString(), jobIt.value); - } } + builder->add("abort", VPackValue(true)); builder->close(); } else if (path == "/arango/Target/ToDo") { VPackBuilder moveBuilder = createMoveShardJob(); @@ -657,8 +653,69 @@ TEST_F(CleanOutServerTest, cleanout_server_job_should_abort_after_timeout) { Node agency = createAgency(createTestStructure); // should not throw auto cleanOutServer = CleanOutServer(agency, &agent, JOB_STATUS::PENDING, JOBID); - cleanOutServer.run(aborts); - Verify(Method(mockAgent, write)); + + Mock spy(cleanOutServer); + Fake(Method(spy, abort)); + + Job& spyCleanOutServer = spy.get(); + spyCleanOutServer.run(aborts); + Verify(Method(spy, abort)); + +} + +TEST_F(CleanOutServerTest, test_cancel_todo_job) { + TestStructureType 
createTestStructure = [&](VPackSlice const& s, std::string const& path) { + std::unique_ptr builder; + builder.reset(new VPackBuilder()); + if (s.isObject()) { + builder->add(VPackValue(VPackValueType::Object)); + for (auto it : VPackObjectIterator(s)) { + auto childBuilder = + createTestStructure(it.value, path + "/" + it.key.copyString()); + if (childBuilder) { + builder->add(it.key.copyString(), childBuilder->slice()); + } + } + + if (path == "/arango/Target/ToDo") { + auto job = createJob(SERVER); + builder->add(VPackValue(JOBID)); + builder->add(VPackValue(VPackValueType::Object)); + for (auto const& jobIt : VPackObjectIterator(job.slice())) { + builder->add(jobIt.key.copyString(), jobIt.value); + } + builder->add("abort", VPackValue(true)); + builder->close(); + VPackBuilder moveBuilder = createMoveShardJob(); + builder->add("1-0", moveBuilder.slice()); + } + builder->close(); + } else { + builder->add(s); + } + return builder; + }; + + int qCount = 0; + Mock mockAgent; + When(Method(mockAgent, write)).AlwaysDo([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t { + if (qCount++ == 0) { + checkFailed(JOB_STATUS::TODO, q); + } + return fakeWriteResult; + }); + AgentInterface& agent = mockAgent.get(); + + Node agency = createAgency(createTestStructure); + // should not throw + auto cleanOutServer = CleanOutServer(agency, &agent, JOB_STATUS::TODO, JOBID); + Mock spy(cleanOutServer); + Fake(Method(spy, abort)); + + Job& spyCleanOutServer = spy.get(); + spyCleanOutServer.run(aborts); + Verify(Method(spy, abort)); + } TEST_F(CleanOutServerTest, when_there_are_still_subjobs_it_should_wait) { diff --git a/tests/Agency/MoveShardTest.cpp b/tests/Agency/MoveShardTest.cpp index 44e50a3fc921..793fa4673775 100644 --- a/tests/Agency/MoveShardTest.cpp +++ b/tests/Agency/MoveShardTest.cpp @@ -935,65 +935,6 @@ TEST_F(MoveShardTest, when_moving_a_shard_that_is_a_distributeshardslike_leader_ Verify(Method(mockAgent, write)); } -TEST_F(MoveShardTest, 
if_the_job_is_too_old_it_should_be_aborted_to_prevent_a_deadloop) { - std::function(VPackSlice const&, std::string const&)> createTestStructure = - [&](VPackSlice const& s, std::string const& path) { - std::unique_ptr builder; - builder.reset(new VPackBuilder()); - if (s.isObject()) { - builder->add(VPackValue(VPackValueType::Object)); - for (auto it : VPackObjectIterator(s)) { - auto childBuilder = - createTestStructure(it.value, path + "/" + it.key.copyString()); - if (childBuilder) { - builder->add(it.key.copyString(), childBuilder->slice()); - } - } - - if (path == "/arango/Target/Pending") { - VPackBuilder pendingJob; - { - VPackObjectBuilder b(&pendingJob); - auto plainJob = createJob(COLLECTION, SHARD_FOLLOWER1, FREE_SERVER); - for (auto it : VPackObjectIterator(plainJob.slice())) { - pendingJob.add(it.key.copyString(), it.value); - } - pendingJob.add("timeCreated", VPackValue("2015-01-03T20:00:00Z")); - } - builder->add(jobId, pendingJob.slice()); - } - builder->close(); - } else { - if (path == "/arango/Plan/Collections/" + DATABASE + "/" + - COLLECTION + "/shards/" + SHARD) { - builder->add(VPackValue(VPackValueType::Array)); - builder->add(VPackValue(SHARD_LEADER)); - builder->add(VPackValue(SHARD_FOLLOWER1)); - builder->add(VPackValue(FREE_SERVER)); - builder->close(); - } else { - builder->add(s); - } - } - return builder; - }; - - auto builder = createTestStructure(baseStructure.toBuilder().slice(), ""); - Node agency = createAgencyFromBuilder(*builder); - - Mock mockAgent; - AgentInterface& agent = mockAgent.get(); - - auto moveShard = MoveShard(agency, &agent, PENDING, jobId); - Mock spy(moveShard); - Fake(Method(spy, abort)); - - Job& spyMoveShard = spy.get(); - spyMoveShard.run(aborts); - - Verify(Method(spy, abort)); -} - TEST_F(MoveShardTest, if_the_to_server_no_longer_replica_we_should_abort) { std::function(VPackSlice const&, std::string const&)> createTestStructure = [&](VPackSlice const& s, std::string const& path) { @@ -1059,65 +1000,6 @@ 
TEST_F(MoveShardTest, if_the_to_server_no_longer_replica_we_should_abort) { Verify(Method(spy, abort)); } -TEST_F(MoveShardTest, if_the_job_is_too_old_leader_case_it_should_be_aborted_to_prevent_deadloop) { - std::function(VPackSlice const&, std::string const&)> createTestStructure = - [&](VPackSlice const& s, std::string const& path) { - std::unique_ptr builder; - builder.reset(new VPackBuilder()); - if (s.isObject()) { - builder->add(VPackValue(VPackValueType::Object)); - for (auto it : VPackObjectIterator(s)) { - auto childBuilder = - createTestStructure(it.value, path + "/" + it.key.copyString()); - if (childBuilder) { - builder->add(it.key.copyString(), childBuilder->slice()); - } - } - - if (path == "/arango/Target/Pending") { - VPackBuilder pendingJob; - { - VPackObjectBuilder b(&pendingJob); - auto plainJob = createJob(COLLECTION, SHARD_LEADER, FREE_SERVER); - for (auto it : VPackObjectIterator(plainJob.slice())) { - pendingJob.add(it.key.copyString(), it.value); - } - pendingJob.add("timeCreated", VPackValue("2015-01-03T20:00:00Z")); - } - builder->add(jobId, pendingJob.slice()); - } - builder->close(); - } else { - if (path == "/arango/Plan/Collections/" + DATABASE + "/" + - COLLECTION + "/shards/" + SHARD) { - builder->add(VPackValue(VPackValueType::Array)); - builder->add(VPackValue(SHARD_LEADER)); - builder->add(VPackValue(SHARD_FOLLOWER1)); - builder->add(VPackValue(FREE_SERVER)); - builder->close(); - } else { - builder->add(s); - } - } - return builder; - }; - - auto builder = createTestStructure(baseStructure.toBuilder().slice(), ""); - Node agency = createAgencyFromBuilder(*builder); - - Mock mockAgent; - AgentInterface& agent = mockAgent.get(); - - auto moveShard = MoveShard(agency, &agent, PENDING, jobId); - Mock spy(moveShard); - Fake(Method(spy, abort)); - - Job& spyMoveShard = spy.get(); - spyMoveShard.run(aborts); - - Verify(Method(spy, abort)); -} - TEST_F(MoveShardTest, if_the_collection_was_dropped_while_moving_finish_the_job) { 
std::function(VPackSlice const&, std::string const&)> createTestStructure = [&](VPackSlice const& s, std::string const& path) { @@ -2834,7 +2716,7 @@ TEST_F(MoveShardTest, trying_to_abort_a_finished_should_result_in_failure) { EXPECT_EQ(result.errorNumber(), TRI_ERROR_SUPERVISION_GENERAL_FAILURE); } -TEST_F(MoveShardTest, if_the_job_fails_while_trying_to_switch_over_leadership_it_should_be_aborted) { +TEST_F(MoveShardTest, test_cancel_pending_job) { std::function(VPackSlice const&, std::string const&)> createTestStructure = [&](VPackSlice const& s, std::string const& path) { std::unique_ptr builder; @@ -2857,7 +2739,7 @@ TEST_F(MoveShardTest, if_the_job_fails_while_trying_to_switch_over_leadership_it for (auto it : VPackObjectIterator(plainJob.slice())) { pendingJob.add(it.key.copyString(), it.value); } - pendingJob.add("timeCreated", VPackValue("2015-01-03T20:00:00Z")); + pendingJob.add("abort", VPackValue(true)); } builder->add(jobId, pendingJob.slice()); } @@ -2866,9 +2748,8 @@ TEST_F(MoveShardTest, if_the_job_fails_while_trying_to_switch_over_leadership_it if (path == "/arango/Plan/Collections/" + DATABASE + "/" + COLLECTION + "/shards/" + SHARD) { builder->add(VPackValue(VPackValueType::Array)); - builder->add(VPackValue("_" + SHARD_LEADER)); - builder->add(VPackValue(SHARD_FOLLOWER1)); builder->add(VPackValue(FREE_SERVER)); + builder->add(VPackValue(SHARD_FOLLOWER1)); builder->close(); } else { builder->add(s); @@ -2893,7 +2774,7 @@ TEST_F(MoveShardTest, if_the_job_fails_while_trying_to_switch_over_leadership_it Verify(Method(spy, abort)); } -TEST_F(MoveShardTest, if_the_job_timeouts_while_the_new_leader_is_trying_to_take_over_the_job_should_be_aborted) { +TEST_F(MoveShardTest, test_cancel_todo_job) { std::function(VPackSlice const&, std::string const&)> createTestStructure = [&](VPackSlice const& s, std::string const& path) { std::unique_ptr builder; @@ -2908,7 +2789,7 @@ TEST_F(MoveShardTest, if_the_job_timeouts_while_the_new_leader_is_trying_to_take } } - 
if (path == "/arango/Target/Pending") { + if (path == "/arango/Target/ToDo") { VPackBuilder pendingJob; { VPackObjectBuilder b(&pendingJob); @@ -2916,7 +2797,7 @@ TEST_F(MoveShardTest, if_the_job_timeouts_while_the_new_leader_is_trying_to_take for (auto it : VPackObjectIterator(plainJob.slice())) { pendingJob.add(it.key.copyString(), it.value); } - pendingJob.add("timeCreated", VPackValue("2015-01-03T20:00:00Z")); + pendingJob.add("abort", VPackValue(true)); } builder->add(jobId, pendingJob.slice()); } @@ -2941,7 +2822,7 @@ TEST_F(MoveShardTest, if_the_job_timeouts_while_the_new_leader_is_trying_to_take Mock mockAgent; AgentInterface& agent = mockAgent.get(); - auto moveShard = MoveShard(agency, &agent, PENDING, jobId); + auto moveShard = MoveShard(agency, &agent, TODO, jobId); Mock spy(moveShard); Fake(Method(spy, abort)); From 38499ed4ea404403ac8435cbcc7a032f89c27356 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 10:17:52 +0200 Subject: [PATCH 05/19] cleaner --- arangod/Agency/CleanOutServer.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arangod/Agency/CleanOutServer.cpp b/arangod/Agency/CleanOutServer.cpp index 70f1fd6d2bf7..477d2ace9406 100644 --- a/arangod/Agency/CleanOutServer.cpp +++ b/arangod/Agency/CleanOutServer.cpp @@ -83,10 +83,7 @@ JOB_STATUS CleanOutServer::status() { if (found > 0) { // some subjob still running // consider cancellation - if (considerCancellation()) { - return FAILED; - } - return PENDING; + return considerCancellation() ? 
FAILED : PENDING; } Node::Children const& failed = _snapshot.hasAsChildren(failedPrefix).first; From 9deb4dab5252a9177a32e2f8e4f19454f5d1a913 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 10:27:10 +0200 Subject: [PATCH 06/19] allow for long supervision job runtimes --- CHANGELOG | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 86cc8f9f09db..a5ecceca1a87 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,10 +15,6 @@ v3.8.2 (XXXX-XX-XX) * Fix a rare shutdown race in RocksDBShaCalculatorThread. -* Reduce internal priority of AQL execution. This prevents possible deadlocks - with modification operations in a cluster and replicationFactor >= 2, and can - also improve responsiveness under high load of AQL queries. - v3.8.1 (2021-08-27) ------------------- From 8ecf5495e5df76d6d7ed52082f4b74dbfb101c8b Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 10:37:28 +0200 Subject: [PATCH 07/19] revert unfug in change log --- CHANGELOG | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a5ecceca1a87..2693b371dc53 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,14 +15,14 @@ v3.8.2 (XXXX-XX-XX) * Fix a rare shutdown race in RocksDBShaCalculatorThread. - -v3.8.1 (2021-08-27) -------------------- - * Reduce internal priority of AQL execution. This prevents possible deadlocks with modification operations in a cluster and replicationFactor >= 2, and can also improve responsiveness under high load of AQL queries. + +v3.8.1 (2021-08-27) +------------------- + * Updated arangosync to 2.6.0. 
* Added protocol specific metrics: histogram about request body size, total From cd2c8d62a5b1add3415df1ce2c7de44ea09ee36f Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 14:02:30 +0200 Subject: [PATCH 08/19] fix 412 response --- .../RestHandler/RestAdminClusterHandler.cpp | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 446ba4f8f22e..b438977c6224 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -291,7 +291,7 @@ RestStatus RestAdminClusterHandler::execute() { bool const isWriteOperation = (request()->requestType() != rest::RequestType::GET); std::string const& apiJwtPolicy = server().getFeature().apiJwtPolicy(); - if (apiJwtPolicy == "jwt-all" || + if (apiJwtPolicy == "jwt-all" || (apiJwtPolicy == "jwt-write" && isWriteOperation)) { generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN); return RestStatus::DONE; @@ -456,7 +456,7 @@ RestAdminClusterHandler::FutureVoid RestAdminClusterHandler::tryDeleteServer( return futures::makeFuture(); }); } - + TRI_IF_FAILURE("removeServer::noRetry") { generateError(Result(TRI_ERROR_HTTP_PRECONDITION_FAILED)); return futures::makeFuture(); @@ -532,18 +532,18 @@ RestStatus RestAdminClusterHandler::handleShardStatistics() { generateError(rest::ResponseCode::METHOD_NOT_ALLOWED, TRI_ERROR_HTTP_METHOD_NOT_ALLOWED); return RestStatus::DONE; } - + if (!ServerState::instance()->isCoordinator()) { generateError(rest::ResponseCode::NOT_IMPLEMENTED, TRI_ERROR_CLUSTER_ONLY_ON_COORDINATOR); return RestStatus::DONE; } - + if (!_vocbase.isSystem()) { generateError(GeneralResponse::responseCode(TRI_ERROR_ARANGO_USE_SYSTEM_DATABASE), TRI_ERROR_ARANGO_USE_SYSTEM_DATABASE); return RestStatus::DONE; } - + std::string const& restrictServer = _request->value("DBserver"); bool details = 
_request->parsedValue("details", false); @@ -949,8 +949,9 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { LOG_TOPIC("eb139", INFO, Logger::SUPERVISION) << "Attempting to abort supervision job " << job.toJson(); auto type = job.get("type").copyString(); - if (type != "moveShard") { // only moveshard may be aborted - generateError(Result{400, "Only MoveShard jobs can be aborted"}); + // only moveshard and cleanoutserver may be aborted + if (type != "moveShard" && type != "cleanOutServer") { + generateError(Result{400, "Only MoveShard and CleanOutServer jobs can be aborted"}); return RestStatus::DONE; } else if (path[2] != "Pending" && path[2] != "ToDo") { generateError(Result{400, path[2] + " jobs can no longer be cancelled."}); @@ -987,7 +988,11 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { .thenValue([this](AsyncAgencyCommResult&& wr) { if (!wr.ok()) { if (wr.statusCode() == 412) { - generateError(Result{412, "Job is no longer running"}); + try { + auto results = rw.slice().get("results"); + if (results[0] == 0 && results[1] == 0) { + generateError(Result{412, "Job is no longer pending or to do"}); + } } else { generateError(wr.asResult()); } From 8c29836bc617cf0cbc9b98d958e63c8ff848a976 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 14:06:49 +0200 Subject: [PATCH 09/19] fix 412 response --- arangod/RestHandler/RestAdminClusterHandler.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index b438977c6224..36add9a2ae43 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -988,11 +988,10 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { .thenValue([this](AsyncAgencyCommResult&& wr) { if (!wr.ok()) { if (wr.statusCode() == 412) { - try { - auto results = rw.slice().get("results"); - if (results[0] == 0 && results[1] == 0) { - 
generateError(Result{412, "Job is no longer pending or to do"}); - } + auto results = rw.slice().get("results"); + if (results[0] == 0 && results[1] == 0) { + generateError(Result{412, "Job is no longer pending or to do"}); + } } else { generateError(wr.asResult()); } From 1ce1d6945ccdc13211947d99b3e243217189ee01 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Tue, 7 Sep 2021 16:15:29 +0200 Subject: [PATCH 10/19] Result constructors --- .../RestHandler/RestAdminClusterHandler.cpp | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 36add9a2ae43..4067caa700e4 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -951,13 +951,18 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { auto type = job.get("type").copyString(); // only moveshard and cleanoutserver may be aborted if (type != "moveShard" && type != "cleanOutServer") { - generateError(Result{400, "Only MoveShard and CleanOutServer jobs can be aborted"}); + generateError(Result{TRI_ERROR_HTTP_BAD_PARAMETER, + "Only MoveShard and CleanOutServer jobs can be aborted"}); return RestStatus::DONE; } else if (path[2] != "Pending" && path[2] != "ToDo") { - generateError(Result{400, path[2] + " jobs can no longer be cancelled."}); + generateError(Result{TRI_ERROR_HTTP_BAD_PARAMETER, path[2] + " jobs can no longer be cancelled."}); return RestStatus::DONE; } + // This tranaction aims at killing a job that is todo or pending. + // A todo job could be pending in the meantime however a pending + // job can never be todo again. Response ist evaluated in 412 result + // below. 
auto sendTransaction = [&] { VPackBuffer trxBody; { VPackBuilder builder(trxBody); @@ -987,10 +992,13 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { sendTransaction() .thenValue([this](AsyncAgencyCommResult&& wr) { if (!wr.ok()) { + // Only if no longer pending or todo. if (wr.statusCode() == 412) { - auto results = rw.slice().get("results"); - if (results[0] == 0 && results[1] == 0) { - generateError(Result{412, "Job is no longer pending or to do"}); + auto results = wr.slice().get("results"); + if (results[0].getNumber() == 0 && + results[1].getNumber() == 0) { + generateError(Result{TRI_ERROR_HTTP_PRECONDITION_FAILED, + "Job is no longer pending or to do"}); } } else { generateError(wr.asResult()); @@ -999,7 +1007,7 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { return futures::makeFuture(); }) .thenError([this](VPackException const& e) { - generateError(Result{e.errorCode(), e.what()}); + generateError(Result{TRI_ERROR_HTTP_SERVER_ERROR, e.what()}); }) .thenError([this](std::exception const& e) { generateError(rest::ResponseCode::SERVER_ERROR, @@ -1022,7 +1030,7 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { generateError(rest::ResponseCode::NOT_FOUND, TRI_ERROR_HTTP_NOT_FOUND); } catch (VPackException const& e) { - generateError(Result{e.errorCode(), e.what()}); + generateError(Result{TRI_ERROR_HTTP_SERVER_ERROR, e.what()}); } catch (std::exception const& e) { generateError(rest::ResponseCode::SERVER_ERROR, TRI_ERROR_HTTP_SERVER_ERROR, e.what()); } From 967416a697a571579cd40f0b37878a774cf90f86 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Wed, 8 Sep 2021 10:30:17 +0200 Subject: [PATCH 11/19] integration tests --- .../resilience/move/moving-shards-cluster.js | 99 ++++++++++++++++++- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/tests/js/server/resilience/move/moving-shards-cluster.js b/tests/js/server/resilience/move/moving-shards-cluster.js index 6c1873c97e69..b2c84c1214ac 100644 --- 
a/tests/js/server/resilience/move/moving-shards-cluster.js +++ b/tests/js/server/resilience/move/moving-shards-cluster.js @@ -585,12 +585,36 @@ function MovingShardsSuite ({useData}) { } +//////////////////////////////////////////////////////////////////////////////// +/// @brief Abort job +//////////////////////////////////////////////////////////////////////////////// + + function cancelAgencyJob(id) { + console.log("Killing job " + id); + var coordEndpoint = + global.ArangoClusterInfo.getServerEndpoint("Coordinator0001"); + var request = require("@arangodb/request"); + var endpointToURL = require("@arangodb/cluster").endpointToURL; + var url = endpointToURL(coordEndpoint); + var req; + try { + req = request({ method: "POST", + url: url + "/_admin/cluster/cancelAgencyJob", + body: JSON.stringify({id:id}) }); + } catch (err) { + console.error( + "Exception for POS /_admin/cluster/cancelAgencyJob:" + {id:id}, err.stack); + return false; + } + return JSON.parse(req.body).error === false; + } + //////////////////////////////////////////////////////////////////////////////// /// @brief create some collections //////////////////////////////////////////////////////////////////////////////// - function createSomeCollections(n, nrShards, replFactor, useData) { + function createSomeCollections(n, nrShards, replFactor, useData, otherNumDocuments = 0) { assertEqual('boolean', typeof useData); var systemCollServers = findCollectionServers("_system", "_graphs"); console.info("System collections use servers:", systemCollServers); @@ -605,7 +629,9 @@ function MovingShardsSuite ({useData}) { if (useData) { // insert some documents - coll.insert(_.range(0, numDocuments).map(v => ({ value: v, x: "someString" + v }))); + let nd = (otherNumDocuments === 0) ? 
numDocuments : otherNumDocuments; + print(coll.name()); + coll.insert(_.range(0, nd).map(v => ({ value: v, x: "someString" + v }))); } var servers = findCollectionServers("_system", name); @@ -623,9 +649,11 @@ function MovingShardsSuite ({useData}) { } } - function checkCollectionContents() { - const numDocs = useData ? numDocuments : 0; + function checkCollectionContents(otherNumDocuments = 0) { + let nd = (otherNumDocuments === 0) ? numDocuments : otherNumDocuments; + const numDocs = useData ? nd : 0; for(const collection of c) { + print(collection.name()); assertEqual(numDocs, collection.count()); const values = db._query( 'FOR doc IN @@col SORT doc.value RETURN doc.value', @@ -1052,6 +1080,69 @@ function MovingShardsSuite ({useData}) { checkCollectionContents(); }, +//////////////////////////////////////////////////////////////////////////////// +/// @brief kill todo moveShard job +//////////////////////////////////////////////////////////////////////////////// + + testCancelToDoMoveShard : function() { + createSomeCollections(1, 1, 3, useData); + assertTrue(waitForSynchronousReplication("_system")); + var servers = findCollectionServers("_system", c[1].name()); + var fromServer = servers[0]; + var toServer = findServerNotOnList(servers); + var cinfo = global.ArangoClusterInfo.getCollectionInfo( + "_system", c[1].name()); + var shard = Object.keys(cinfo.shards)[0]; + assertTrue(maintenanceMode("on")); + let result = moveShard("_system", c[1]._id, shard, fromServer, toServer, true, "Finished"); + assertEqual(result.statusCode, 202); + let id = JSON.parse(result.body).id; + assertTrue(cancelAgencyJob(id)); + assertTrue(maintenanceMode("off")); + var job = queryAgencyJob(result.json.id); + assertEqual(job.status, "Finished"); + assertEqual(job.abort); + assertTrue(waitForSupervision()); + checkCollectionContents(); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief kill todo moveShard job 
+//////////////////////////////////////////////////////////////////////////////// + + testCancelPendingMoveShard : function() { + if (useData) { + for (var i = 0; i < c.length; ++i) { + c[i].drop(); + } + c = []; + let otherNumDocuments = 100000; + createSomeCollections(1, 1, 3, useData, otherNumDocuments); + assertTrue(waitForSynchronousReplication("_system")); + var servers = findCollectionServers("_system", c[0].name()); + var fromServer = servers[0]; + var toServer = findServerNotOnList(servers); + var cinfo = global.ArangoClusterInfo.getCollectionInfo( + "_system", c[0].name()); + var shard = Object.keys(cinfo.shards)[0]; + let result = moveShard("_system", c[0]._id, shard, fromServer, toServer, true, "Finished"); + assertEqual(result.statusCode, 202); + let id = JSON.parse(result.body).id; + while (queryAgencyJob(result.json.id).status === "ToDo") { // wait for job to start + wait(0.1); + } + assertTrue(cancelAgencyJob(id)); + while (queryAgencyJob(result.json.id).status === "Pending") { // wait for job to be killed + wait(0.1); + } + var job = queryAgencyJob(result.json.id); + assertEqual(job.status, "Failed"); + assertEqual(job.abort); + assertTrue(waitForSupervision()); + checkCollectionContents(otherNumDocuments); + } + }, + }; } From 0878397a97abddb759d5b4841bf7f937f5ab98bf Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Wed, 8 Sep 2021 10:37:54 +0200 Subject: [PATCH 12/19] integration tests --- arangod/RestHandler/RestAdminClusterHandler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 4067caa700e4..31b32c1070f7 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -945,7 +945,6 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { VPackSlice job = res.at(0).get(path); if (job.isObject()) { - LOG_DEVEL << job.toJson(); LOG_TOPIC("eb139", INFO, Logger::SUPERVISION) << 
"Attempting to abort supervision job " << job.toJson(); auto type = job.get("type").copyString(); From 4d1a1d7d4042422c1dc1af1ba45352e1cbad19d6 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Wed, 8 Sep 2021 10:39:05 +0200 Subject: [PATCH 13/19] integration tests --- tests/js/server/resilience/move/moving-shards-cluster.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/js/server/resilience/move/moving-shards-cluster.js b/tests/js/server/resilience/move/moving-shards-cluster.js index b2c84c1214ac..6b98e06580f9 100644 --- a/tests/js/server/resilience/move/moving-shards-cluster.js +++ b/tests/js/server/resilience/move/moving-shards-cluster.js @@ -630,7 +630,6 @@ function MovingShardsSuite ({useData}) { if (useData) { // insert some documents let nd = (otherNumDocuments === 0) ? numDocuments : otherNumDocuments; - print(coll.name()); coll.insert(_.range(0, nd).map(v => ({ value: v, x: "someString" + v }))); } @@ -653,7 +652,6 @@ function MovingShardsSuite ({useData}) { let nd = (otherNumDocuments === 0) ? numDocuments : otherNumDocuments; const numDocs = useData ? 
nd : 0; for(const collection of c) { - print(collection.name()); assertEqual(numDocs, collection.count()); const values = db._query( 'FOR doc IN @@col SORT doc.value RETURN doc.value', From 5d299fe95f81347efc65fa6c251e89b8170d7150 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Wed, 8 Sep 2021 12:46:05 +0200 Subject: [PATCH 14/19] integration tests cleanout server --- arangod/Agency/MoveShard.cpp | 2 +- .../RestHandler/RestAdminClusterHandler.cpp | 35 ++++----- tests/Agency/MoveShardTest.cpp | 2 +- .../resilience/move/moving-shards-cluster.js | 74 +++++++++++++++++-- 4 files changed, 87 insertions(+), 26 deletions(-) diff --git a/arangod/Agency/MoveShard.cpp b/arangod/Agency/MoveShard.cpp index 1c3dc03768f8..3f8d8554a03a 100644 --- a/arangod/Agency/MoveShard.cpp +++ b/arangod/Agency/MoveShard.cpp @@ -975,7 +975,7 @@ arangodb::Result MoveShard::abort(std::string const& reason) { } } - if (finish("", "", true, "job aborted (1): " + reason, todoPrec)) { + if (finish("", "", false, "job aborted (1): " + reason, todoPrec)) { return result; } _status = PENDING; diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 31b32c1070f7..26e0d2682cd7 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -947,7 +947,7 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { if (job.isObject()) { LOG_TOPIC("eb139", INFO, Logger::SUPERVISION) << "Attempting to abort supervision job " << job.toJson(); - auto type = job.get("type").copyString(); + auto type = job.get("type").stringView(); // only moveshard and cleanoutserver may be aborted if (type != "moveShard" && type != "cleanOutServer") { generateError(Result{TRI_ERROR_HTTP_BAD_PARAMETER, @@ -962,33 +962,30 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { // A todo job could be pending in the meantime however a pending // job can never be todo again. 
Response ist evaluated in 412 result // below. - auto sendTransaction = [&] { - VPackBuffer trxBody; - { VPackBuilder builder(trxBody); - { VPackArrayBuilder trxs(&builder); - if (path[2] == "ToDo") { - VPackArrayBuilder trx(&builder); - { VPackObjectBuilder op(&builder); - builder.add("arango/Target/ToDo/" + jobId + "/abort", VPackValue(true)); } - { VPackObjectBuilder pre(&builder); - builder.add(VPackValue("arango/Target/ToDo/" + jobId)); - { VPackObjectBuilder val(&builder); - builder.add("oldEmpty", VPackValue(false)); }} - } + VPackBuffer trxBody; + { VPackBuilder builder(trxBody); + { VPackArrayBuilder trxs(&builder); + if (path[2] == "ToDo") { VPackArrayBuilder trx(&builder); { VPackObjectBuilder op(&builder); - builder.add("arango/Target/Pending/" + jobId + "/abort", VPackValue(true)); } + builder.add("arango/Target/ToDo/" + jobId + "/abort", VPackValue(true)); } { VPackObjectBuilder pre(&builder); - builder.add(VPackValue("arango/Target/Pending/" + jobId)); + builder.add(VPackValue("arango/Target/ToDo/" + jobId)); { VPackObjectBuilder val(&builder); builder.add("oldEmpty", VPackValue(false)); }} } + VPackArrayBuilder trx(&builder); + { VPackObjectBuilder op(&builder); + builder.add("arango/Target/Pending/" + jobId + "/abort", VPackValue(true)); } + { VPackObjectBuilder pre(&builder); + builder.add(VPackValue("arango/Target/Pending/" + jobId)); + { VPackObjectBuilder val(&builder); + builder.add("oldEmpty", VPackValue(false)); }} } - return AsyncAgencyComm().sendWriteTransaction(60s, std::move(trxBody)); - }; + } waitForFuture( - sendTransaction() + AsyncAgencyComm().sendWriteTransaction(60s, std::move(trxBody)) .thenValue([this](AsyncAgencyCommResult&& wr) { if (!wr.ok()) { // Only if no longer pending or todo. 
diff --git a/tests/Agency/MoveShardTest.cpp b/tests/Agency/MoveShardTest.cpp index 793fa4673775..9f3c370d38f0 100644 --- a/tests/Agency/MoveShardTest.cpp +++ b/tests/Agency/MoveShardTest.cpp @@ -1610,7 +1610,7 @@ TEST_F(MoveShardTest, a_moveshard_job_that_just_made_it_to_todo_can_simply_be_ab auto writes = q->slice()[0][0]; EXPECT_TRUE(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete"); - EXPECT_TRUE(std::string(writes.get("/arango/Target/Finished/1").typeName()) == + EXPECT_TRUE(std::string(writes.get("/arango/Target/Failed/1").typeName()) == "object"); auto precond = q->slice()[0][1]; EXPECT_TRUE(precond.get("/arango/Target/ToDo/1").get("oldEmpty").isFalse()); diff --git a/tests/js/server/resilience/move/moving-shards-cluster.js b/tests/js/server/resilience/move/moving-shards-cluster.js index 6b98e06580f9..c57d30a4958d 100644 --- a/tests/js/server/resilience/move/moving-shards-cluster.js +++ b/tests/js/server/resilience/move/moving-shards-cluster.js @@ -366,7 +366,7 @@ function MovingShardsSuite ({useData}) { /// @brief order the cluster to clean out a server: //////////////////////////////////////////////////////////////////////////////// - function cleanOutServer(id) { + function cleanOutServer(id, dontwait = false) { var coordEndpoint = global.ArangoClusterInfo.getServerEndpoint("Coordinator0001"); var request = require("@arangodb/request"); @@ -385,6 +385,9 @@ function MovingShardsSuite ({useData}) { } console.info("cleanOutServer job:", JSON.stringify(body)); console.info("result of request:", JSON.stringify(result.json)); + if (dontwait) { + return result; + } // Now wait until the job we triggered is finished: var count = 1200; // seconds while (true) { @@ -535,11 +538,11 @@ function MovingShardsSuite ({useData}) { "Exception for PUT /_admin/cluster/moveShard:", err.stack); return false; } + console.info("moveShard job:", JSON.stringify(body)); + console.info("result of request:", JSON.stringify(result.json)); if (dontwait) { return 
result; } - console.info("moveShard job:", JSON.stringify(body)); - console.info("result of request:", JSON.stringify(result.json)); // Now wait until the job we triggered is finished: var count = 600; // seconds while (true) { @@ -1098,14 +1101,14 @@ function MovingShardsSuite ({useData}) { assertTrue(cancelAgencyJob(id)); assertTrue(maintenanceMode("off")); var job = queryAgencyJob(result.json.id); - assertEqual(job.status, "Finished"); + assertEqual(job.status, "Failed"); assertEqual(job.abort); assertTrue(waitForSupervision()); checkCollectionContents(); }, //////////////////////////////////////////////////////////////////////////////// -/// @brief kill todo moveShard job +/// @brief kill pending moveShard job //////////////////////////////////////////////////////////////////////////////// testCancelPendingMoveShard : function() { @@ -1141,6 +1144,67 @@ function MovingShardsSuite ({useData}) { } }, +//////////////////////////////////////////////////////////////////////////////// +/// @brief kill todo cleanOutServer job +//////////////////////////////////////////////////////////////////////////////// + + testCancelToDoCleanOutServer : function() { + createSomeCollections(1, 1, 3, useData); + assertTrue(waitForSynchronousReplication("_system")); + var servers = findCollectionServers("_system", c[1].name()); + var fromServer = servers[0]; + var cinfo = global.ArangoClusterInfo.getCollectionInfo( + "_system", c[1].name()); + var shard = Object.keys(cinfo.shards)[0]; + assertTrue(maintenanceMode("on")); + let result = cleanOutServer(servers[0], true); + assertEqual(result.statusCode, 202); + let id = JSON.parse(result.body).id; + assertTrue(cancelAgencyJob(id)); + assertTrue(maintenanceMode("off")); + var job = queryAgencyJob(result.json.id); + assertEqual(job.status, "Failed"); + assertEqual(job.abort); + assertTrue(waitForSupervision()); + checkCollectionContents(); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// 
@brief kill pending cleanOutServer job +//////////////////////////////////////////////////////////////////////////////// + + testCancelPendingCleanOutServer : function() { + if (useData) { + for (var i = 0; i < c.length; ++i) { + c[i].drop(); + } + c = []; + let otherNumDocuments = 100000; + createSomeCollections(1, 1, 3, useData, otherNumDocuments); + assertTrue(waitForSynchronousReplication("_system")); + var servers = findCollectionServers("_system", c[0].name()); + var fromServer = servers[0]; + var cinfo = global.ArangoClusterInfo.getCollectionInfo( + "_system", c[0].name()); + var shard = Object.keys(cinfo.shards)[0]; + let result = cleanOutServer(fromServer, true); + assertEqual(result.statusCode, 202); + let id = JSON.parse(result.body).id; + while (queryAgencyJob(result.json.id).status === "ToDo") { // wait for job to start + wait(0.1); + } + assertTrue(cancelAgencyJob(id)); + while (queryAgencyJob(result.json.id).status === "Pending") { // wait for job to be killed + wait(0.1); + } + var job = queryAgencyJob(result.json.id); + assertEqual(job.status, "Failed"); + assertEqual(job.abort); + assertTrue(waitForSupervision()); + checkCollectionContents(otherNumDocuments); + } + }, + }; } From 595cad3868280eb6c7e81c55ec5c988928a4d5f5 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Wed, 8 Sep 2021 13:24:08 +0200 Subject: [PATCH 15/19] cleanoutserver integration tests --- .../RestHandler/RestAdminClusterHandler.cpp | 73 ++++++++++--------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 26e0d2682cd7..790f8eb18f40 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -948,7 +948,7 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { LOG_TOPIC("eb139", INFO, Logger::SUPERVISION) << "Attempting to abort supervision job " << job.toJson(); auto type = job.get("type").stringView();
- // only moveshard and cleanoutserver may be aborted + // only moveshard and cleanoutserver may be aborted if (type != "moveShard" && type != "cleanOutServer") { generateError(Result{TRI_ERROR_HTTP_BAD_PARAMETER, "Only MoveShard and CleanOutServer jobs can be aborted"}); @@ -962,64 +962,69 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { // A todo job could be pending in the meantime however a pending // job can never be todo again. Response ist evaluated in 412 result // below. - VPackBuffer trxBody; - { VPackBuilder builder(trxBody); - { VPackArrayBuilder trxs(&builder); - if (path[2] == "ToDo") { + auto sendTransaction = [&] { + VPackBuffer trxBody; + { VPackBuilder builder(trxBody); + { VPackArrayBuilder trxs(&builder); + if (path[2] == "ToDo") { + VPackArrayBuilder trx(&builder); + { VPackObjectBuilder op(&builder); + builder.add("arango/Target/ToDo/" + jobId + "/abort", VPackValue(true)); } + { VPackObjectBuilder pre(&builder); + builder.add(VPackValue("arango/Target/ToDo/" + jobId)); + { VPackObjectBuilder val(&builder); + builder.add("oldEmpty", VPackValue(false)); }} + } VPackArrayBuilder trx(&builder); { VPackObjectBuilder op(&builder); - builder.add("arango/Target/ToDo/" + jobId + "/abort", VPackValue(true)); } + builder.add("arango/Target/Pending/" + jobId + "/abort", VPackValue(true)); } { VPackObjectBuilder pre(&builder); - builder.add(VPackValue("arango/Target/ToDo/" + jobId)); + builder.add(VPackValue("arango/Target/Pending/" + jobId)); { VPackObjectBuilder val(&builder); builder.add("oldEmpty", VPackValue(false)); }} } - VPackArrayBuilder trx(&builder); - { VPackObjectBuilder op(&builder); - builder.add("arango/Target/Pending/" + jobId + "/abort", VPackValue(true)); } - { VPackObjectBuilder pre(&builder); - builder.add(VPackValue("arango/Target/Pending/" + jobId)); - { VPackObjectBuilder val(&builder); - builder.add("oldEmpty", VPackValue(false)); }} } - } + return AsyncAgencyComm().sendWriteTransaction(60s, std::move(trxBody)); + }; - 
waitForFuture( - AsyncAgencyComm().sendWriteTransaction(60s, std::move(trxBody)) - .thenValue([this](AsyncAgencyCommResult&& wr) { + return waitForFuture( + sendTransaction() + .thenValue([this, &jobId](AsyncAgencyCommResult&& wr) { if (!wr.ok()) { // Only if no longer pending or todo. if (wr.statusCode() == 412) { auto results = wr.slice().get("results"); - if (results[0].getNumber() == 0 && - results[1].getNumber() == 0) { - generateError(Result{TRI_ERROR_HTTP_PRECONDITION_FAILED, - "Job is no longer pending or to do"}); + if (results[0].getNumber() == 0 && + results[1].getNumber() == 0) { + generateError( + Result{TRI_ERROR_HTTP_PRECONDITION_FAILED, "Job is no longer pending or to do"}); } } else { generateError(wr.asResult()); } } - return futures::makeFuture(); + + VPackBuffer payload; + { + VPackBuilder builder(payload); + VPackObjectBuilder ob(&builder); + builder.add("job", VPackValue(jobId)); + builder.add("status", VPackValue("aborted")); + builder.add("error", VPackValue(false)); + } + resetResponse(rest::ResponseCode::OK); + response()->setPayload(std::move(payload)); + return futures::makeFuture(RestStatus::DONE); }) .thenError([this](VPackException const& e) { generateError(Result{TRI_ERROR_HTTP_SERVER_ERROR, e.what()}); + return futures::makeFuture(RestStatus::DONE); }) .thenError([this](std::exception const& e) { generateError(rest::ResponseCode::SERVER_ERROR, TRI_ERROR_HTTP_SERVER_ERROR, e.what()); + return futures::makeFuture(RestStatus::DONE); })); - VPackBuffer payload; - { - VPackBuilder builder(payload); - VPackObjectBuilder ob(&builder); - builder.add("job", VPackValue(jobId)); - builder.add("status", VPackValue("aborted")); - builder.add("error", VPackValue(false)); - } - resetResponse(rest::ResponseCode::OK); - response()->setPayload(std::move(payload)); - return RestStatus::DONE; } } From 8bc611819ef612ff4dc8518aed63bea220191f79 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Wed, 8 Sep 2021 13:45:46 +0200 Subject: [PATCH 16/19] 
fixing test --- arangod/RestHandler/RestAdminClusterHandler.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arangod/RestHandler/RestAdminClusterHandler.cpp b/arangod/RestHandler/RestAdminClusterHandler.cpp index 790f8eb18f40..107051628cbc 100644 --- a/arangod/RestHandler/RestAdminClusterHandler.cpp +++ b/arangod/RestHandler/RestAdminClusterHandler.cpp @@ -989,7 +989,7 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { return waitForFuture( sendTransaction() - .thenValue([this, &jobId](AsyncAgencyCommResult&& wr) { + .thenValue([=](AsyncAgencyCommResult&& wr) { if (!wr.ok()) { // Only if no longer pending or todo. if (wr.statusCode() == 412) { @@ -998,9 +998,11 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { results[1].getNumber() == 0) { generateError( Result{TRI_ERROR_HTTP_PRECONDITION_FAILED, "Job is no longer pending or to do"}); + return; } } else { generateError(wr.asResult()); + return; } } @@ -1014,16 +1016,13 @@ RestStatus RestAdminClusterHandler::handleCancelJob() { } resetResponse(rest::ResponseCode::OK); response()->setPayload(std::move(payload)); - return futures::makeFuture(RestStatus::DONE); }) .thenError([this](VPackException const& e) { generateError(Result{TRI_ERROR_HTTP_SERVER_ERROR, e.what()}); - return futures::makeFuture(RestStatus::DONE); }) .thenError([this](std::exception const& e) { generateError(rest::ResponseCode::SERVER_ERROR, TRI_ERROR_HTTP_SERVER_ERROR, e.what()); - return futures::makeFuture(RestStatus::DONE); })); } } From 1643021ab7fea490e1722b67d293b9a46c76dc0c Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Thu, 9 Sep 2021 09:31:06 +0200 Subject: [PATCH 17/19] 3.8 cleanout server abort -> moveshard failed --- tests/Agency/CleanOutServerTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Agency/CleanOutServerTest.cpp b/tests/Agency/CleanOutServerTest.cpp index 05e1a0da4f19..034a94d83138 100644 --- a/tests/Agency/CleanOutServerTest.cpp +++ 
b/tests/Agency/CleanOutServerTest.cpp @@ -638,7 +638,7 @@ TEST_F(CleanOutServerTest, test_cancel_pending_job) { EXPECT_TRUE(writes.get("/arango/Target/ToDo/1-0").get("op").copyString() == "delete"); // a not yet started job will be moved to finished - EXPECT_TRUE(std::string(writes.get("/arango/Target/Finished/1-0").typeName()) == + EXPECT_TRUE(std::string(writes.get("/arango/Target/Failed/1-0").typeName()) == "object"); auto preconds = q->slice()[0][1]; EXPECT_TRUE(preconds.get("/arango/Target/ToDo/1-0").get("oldEmpty").isFalse()); From 57c3b98abb85b1a403febe86c78c4b7777845506 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Thu, 9 Sep 2021 11:58:53 +0200 Subject: [PATCH 18/19] same error over? --- tests/Agency/CleanOutServerTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Agency/CleanOutServerTest.cpp b/tests/Agency/CleanOutServerTest.cpp index 034a94d83138..13d7f61dd9c0 100644 --- a/tests/Agency/CleanOutServerTest.cpp +++ b/tests/Agency/CleanOutServerTest.cpp @@ -895,7 +895,7 @@ TEST_F(CleanOutServerTest, when_the_cleanout_server_job_aborts_abort_all_subjobs EXPECT_TRUE(writes.get("/arango/Target/ToDo/1-0").get("op").copyString() == "delete"); // a not yet started job will be moved to finished - EXPECT_TRUE(std::string(writes.get("/arango/Target/Finished/1-0").typeName()) == + EXPECT_TRUE(std::string(writes.get("/arango/Target/Failed/1-0").typeName()) == "object"); auto preconds = q->slice()[0][1]; EXPECT_TRUE(preconds.get("/arango/Target/ToDo/1-0").get("oldEmpty").isFalse()); From f78a185c6a6547d905e31af7204ad9257081ef7a Mon Sep 17 00:00:00 2001 From: Vadim Date: Sat, 11 Sep 2021 19:42:15 +0300 Subject: [PATCH 19/19] Update CHANGELOG --- CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 59b330dd4e73..5f4c110e1b55 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,8 +1,8 @@ v3.8.2 (XXXX-XX-XX) ------------------- -* No runtime limits for shard move and server cleanout jobs, 
instead - possibility to cancel them. +* No runtime limits for shard move and server cleanout jobs, instead possibility + to cancel them. * (EE only) Bug-fix: If you created a ArangoSearch view on Satellite- Collections only and then join with a collection only having a single shard