move IOHeartbeatThread into its own file (#17988) · olegrok/arangodb@3b5c099 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3b5c099

Browse files
authored
move IOHeartbeatThread into its own file (arangodb#17988)
1 parent 9391d9e commit 3b5c099

File tree

5 files changed

+253
-183
lines changed

5 files changed

+253
-183
lines changed

arangod/RestServer/DatabaseFeature.cpp

Lines changed: 3 additions & 149 deletions
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,6 @@
4444
#include "Logger/LogMacros.h"
4545
#include "Logger/Logger.h"
4646
#include "Logger/LoggerStream.h"
47-
#include "Metrics/CounterBuilder.h"
48-
#include "Metrics/HistogramBuilder.h"
4947
#include "Metrics/MetricsFeature.h"
5048
#include "ProgramOptions/ProgramOptions.h"
5149
#include "ProgramOptions/Section.h"
@@ -54,6 +52,7 @@
5452
#include "Replication2/Version.h"
5553
#include "RestServer/DatabaseFeature.h"
5654
#include "RestServer/DatabasePathFeature.h"
55+
#include "RestServer/IOHeartbeatThread.h"
5756
#include "RestServer/QueryRegistryFeature.h"
5857
#include "StorageEngine/EngineSelectorFeature.h"
5958
#include "StorageEngine/StorageEngine.h"
@@ -271,153 +270,6 @@ void DatabaseManagerThread::run() {
271270
}
272271
}
273272

274-
struct HeartbeatTimescale {
275-
static arangodb::metrics::LogScale<double> scale() {
276-
return {10.0, 0.0, 1000000.0, 8};
277-
}
278-
};
279-
280-
DECLARE_HISTOGRAM(arangodb_ioheartbeat_duration, HeartbeatTimescale,
281-
"Time to execute the io heartbeat once [us]");
282-
DECLARE_COUNTER(arangodb_ioheartbeat_failures_total,
283-
"Total number of failures in IO heartbeat");
284-
DECLARE_COUNTER(arangodb_ioheartbeat_delays_total,
285-
"Total number of delays in IO heartbeat");
286-
287-
/// IO check thread main loop
288-
/// The purpose of this thread is to try to perform a simple IO write
289-
/// operation on the database volume regularly. We need visibility in
290-
/// production if IO is slow or not possible at all.
291-
IOHeartbeatThread::IOHeartbeatThread(Server& server,
292-
metrics::MetricsFeature& metricsFeature)
293-
: ServerThread<ArangodServer>(server, "IOHeartbeat"),
294-
_exeTimeHistogram(metricsFeature.add(arangodb_ioheartbeat_duration{})),
295-
_failures(metricsFeature.add(arangodb_ioheartbeat_failures_total{})),
296-
_delays(metricsFeature.add(arangodb_ioheartbeat_delays_total{})) {}
297-
298-
IOHeartbeatThread::~IOHeartbeatThread() { shutdown(); }
299-
300-
void IOHeartbeatThread::run() {
301-
auto& databasePathFeature = server().getFeature<DatabasePathFeature>();
302-
std::string testFilePath = FileUtils::buildFilename(
303-
databasePathFeature.directory(), "TestFileIOHeartbeat");
304-
std::string testFileContent = "This is just an I/O test.\n";
305-
306-
LOG_TOPIC("66665", DEBUG, Logger::ENGINES) << "IOHeartbeatThread: running...";
307-
308-
while (true) {
309-
try { // protect thread against any exceptions
310-
if (isStopping()) {
311-
// done
312-
break;
313-
}
314-
315-
LOG_TOPIC("66659", DEBUG, Logger::ENGINES)
316-
<< "IOHeartbeat: testing to write/read/remove " << testFilePath;
317-
// We simply write a file and sync it to disk in the database
318-
// directory and then read it and then delete it again:
319-
auto start1 = std::chrono::steady_clock::now();
320-
bool trouble = false;
321-
try {
322-
FileUtils::spit(testFilePath, testFileContent, true);
323-
} catch (std::exception const& exc) {
324-
++_failures;
325-
LOG_TOPIC("66663", INFO, Logger::ENGINES)
326-
<< "IOHeartbeat: exception when writing test file: " << exc.what();
327-
trouble = true;
328-
}
329-
auto finish = std::chrono::steady_clock::now();
330-
std::chrono::duration<double> dur = finish - start1;
331-
bool delayed = dur > std::chrono::seconds(1);
332-
if (trouble || delayed) {
333-
if (delayed) {
334-
++_delays;
335-
}
336-
LOG_TOPIC("66662", INFO, Logger::ENGINES)
337-
<< "IOHeartbeat: trying to write test file took "
338-
<< std::chrono::duration_cast<std::chrono::microseconds>(dur)
339-
.count()
340-
<< " microseconds.";
341-
}
342-
343-
// Read the file if we can reasonably assume it is there:
344-
if (!trouble) {
345-
auto start = std::chrono::steady_clock::now();
346-
try {
347-
std::string content = FileUtils::slurp(testFilePath);
348-
if (content != testFileContent) {
349-
LOG_TOPIC("66660", INFO, Logger::ENGINES)
350-
<< "IOHeartbeat: read content of test file was not as "
351-
"expected, found:'"
352-
<< content << "', expected: '" << testFileContent << "'";
353-
trouble = true;
354-
++_failures;
355-
}
356-
} catch (std::exception const& exc) {
357-
++_failures;
358-
LOG_TOPIC("66661", INFO, Logger::ENGINES)
359-
<< "IOHeartbeat: exception when reading test file: "
360-
<< exc.what();
361-
trouble = true;
362-
}
363-
auto finish = std::chrono::steady_clock::now();
364-
std::chrono::duration<double> dur = finish - start;
365-
bool delayed = dur > std::chrono::seconds(1);
366-
if (trouble || delayed) {
367-
if (delayed) {
368-
++_delays;
369-
}
370-
LOG_TOPIC("66669", INFO, Logger::ENGINES)
371-
<< "IOHeartbeat: trying to read test file took "
372-
<< std::chrono::duration_cast<std::chrono::microseconds>(dur)
373-
.count()
374-
<< " microseconds.";
375-
}
376-
377-
// And remove it again:
378-
start = std::chrono::steady_clock::now();
379-
ErrorCode err = FileUtils::remove(testFilePath);
380-
if (err != TRI_ERROR_NO_ERROR) {
381-
++_failures;
382-
LOG_TOPIC("66670", INFO, Logger::ENGINES)
383-
<< "IOHeartbeat: error when removing test file: " << err;
384-
trouble = true;
385-
}
386-
finish = std::chrono::steady_clock::now();
387-
dur = finish - start;
388-
delayed = dur > std::chrono::seconds(1);
389-
if (trouble || delayed) {
390-
if (delayed) {
391-
++_delays;
392-
}
393-
LOG_TOPIC("66671", INFO, Logger::ENGINES)
394-
<< "IOHeartbeat: trying to remove test file took "
395-
<< std::chrono::duration_cast<std::chrono::microseconds>(dur)
396-
.count()
397-
<< " microseconds.";
398-
}
399-
}
400-
401-
// Total duration and update histogram:
402-
dur = finish - start1;
403-
_exeTimeHistogram.count(static_cast<double>(
404-
std::chrono::duration_cast<std::chrono::microseconds>(dur).count()));
405-
406-
std::unique_lock<std::mutex> guard(_mutex);
407-
if (trouble) {
408-
// In case of trouble, we retry more quickly, since we want to
409-
// have a record when the trouble has actually stopped!
410-
_cv.wait_for(guard, checkIntervalTrouble);
411-
} else {
412-
_cv.wait_for(guard, checkIntervalNormal);
413-
}
414-
} catch (...) {
415-
}
416-
// next iteration
417-
}
418-
LOG_TOPIC("66664", DEBUG, Logger::ENGINES) << "IOHeartbeatThread: stopped.";
419-
}
420-
421273
DatabaseFeature::DatabaseFeature(Server& server)
422274
: ArangodFeature{server, *this} {
423275
setOptional(false);
@@ -430,6 +282,8 @@ DatabaseFeature::DatabaseFeature(Server& server)
430282
startsAfter<StorageEngineFeature>();
431283
}
432284

285+
DatabaseFeature::~DatabaseFeature() = default;
286+
433287
void DatabaseFeature::collectOptions(std::shared_ptr<ProgramOptions> options) {
434288
options->addSection("database", "database options");
435289

arangod/RestServer/DatabaseFeature.h

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,13 @@
2727
#include "Basics/Thread.h"
2828
#include "Containers/FlatHashMap.h"
2929
#include "Containers/FlatHashSet.h"
30-
#include "Metrics/Counter.h"
31-
#include "Metrics/Histogram.h"
32-
#include "Metrics/LogScale.h"
3330
#include "Replication2/Version.h"
3431
#include "RestServer/arangod.h"
3532
#include "Utils/DatabaseGuard.h"
3633
#include "Utils/VersionTracker.h"
3734
#include "VocBase/voc-types.h"
3835
#include "VocBase/Methods/Databases.h"
3936

40-
#include <condition_variable>
4137
#include <mutex>
4238
#include <memory>
4339
#include <vector>
@@ -48,6 +44,7 @@ namespace arangodb {
4844
namespace application_features {
4945
class ApplicationServer;
5046
}
47+
class IOHeartbeatThread;
5148
class LogicalCollection;
5249
} // namespace arangodb
5350

@@ -75,43 +72,14 @@ class DatabaseManagerThread final : public ServerThread<ArangodServer> {
7572
}
7673
};
7774

78-
class IOHeartbeatThread final : public ServerThread<ArangodServer> {
79-
public:
80-
IOHeartbeatThread(IOHeartbeatThread const&) = delete;
81-
IOHeartbeatThread& operator=(IOHeartbeatThread const&) = delete;
82-
83-
explicit IOHeartbeatThread(Server&, metrics::MetricsFeature& metricsFeature);
84-
~IOHeartbeatThread();
85-
86-
void run() override;
87-
void wakeup() {
88-
std::lock_guard<std::mutex> guard(_mutex);
89-
_cv.notify_one();
90-
}
91-
92-
private:
93-
// how long will the thread pause between iterations, in case of trouble:
94-
static constexpr std::chrono::duration<int64_t> checkIntervalTrouble =
95-
std::chrono::seconds(1);
96-
// how long will the thread pause between iterations:
97-
static constexpr std::chrono::duration<int64_t> checkIntervalNormal =
98-
std::chrono::seconds(15);
99-
100-
std::mutex _mutex;
101-
std::condition_variable _cv; // for waiting with wakeup
102-
103-
metrics::Histogram<metrics::LogScale<double>>& _exeTimeHistogram;
104-
metrics::Counter& _failures;
105-
metrics::Counter& _delays;
106-
};
107-
10875
class DatabaseFeature : public ArangodFeature {
10976
friend class DatabaseManagerThread;
11077

11178
public:
11279
static constexpr std::string_view name() noexcept { return "Database"; }
11380

11481
explicit DatabaseFeature(Server& server);
82+
~DatabaseFeature();
11583

11684
void collectOptions(std::shared_ptr<options::ProgramOptions>) override final;
11785
void validateOptions(std::shared_ptr<options::ProgramOptions>) override final;

0 commit comments

Comments
 (0)
0