Added (gauge) metric "rocksdb_read_only" (#14470) · RtiWeb/arangodb@5e917d4 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5e917d4

Browse files
jsteemann and goedderz authored
Added (gauge) metric "rocksdb_read_only" (arangodb#14470)
Co-authored-by: Tobias Gödderz <tobias@arangodb.com>
1 parent 170b13f commit 5e917d4

File tree

5 files changed

+55
-2
lines changed

5 files changed

+55
-2
lines changed

CHANGELOG

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,13 @@ devel
7070
metrics (including the metrics starting with `arangodb_process` prefix) were
7171
not returned by agent instances.
7272

73+
* APM-107: Added metric "rocksdb_read_only" to determine whether RocksDB is
74+
currently in read-only mode due to a background error. The metric will have
75+
a value of "1" if RocksDB is in read-only mode and "0" if RocksDB is in
76+
normal operations mode. If the metric value is "1" it means all writes into
77+
RocksDB will fail, so inspecting the logfiles and acting on the actual error
78+
situation is required.
79+
7380
* Fix potential memleak in Pregel conductor garbage collection.
7481

7582
* Added a retry loop for arangorestore during the initial connection phase. The
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: rocksdb_read_only
2+
introducedIn: "3.8.1"
3+
help: |
4+
Whether RocksDB is in read-only mode due to a background error (1) or in normal operations mode (0).
5+
unit: number
6+
type: gauge
7+
category: RocksDB
8+
complexity: simple
9+
exposedBy:
10+
- dbserver
11+
- agent
12+
- single
13+
description: |
14+
This metric indicates whether RocksDB currently is in read-only
15+
mode, due to a background error. If RocksDB is in read-only mode,
16+
this metric will have a value of "1". When in read-only mode, all
17+
writes into RocksDB will fail. When RocksDB is in normal operations
18+
mode, this metric will have a value of "0".
19+
troubleshoot: |
20+
If this value is non-zero, it means that all write operations in
21+
RocksDB will fail until the RocksDB background error is resolved.
22+
The arangod server logfile should show more details about the exact
23+
errors that are happening, so logs should be inspected first.
24+
RocksDB can set a background error when some I/O operation fails.
25+
This is often due to disk space usage issues, so often either freeing
26+
disk space or increasing the disk capacity will help.
27+
Under some conditions, RocksDB can automatically resume from the
28+
background error and go back into normal operations. However, if the
29+
background error happens during certain RocksDB operations, it cannot
30+
resume operations automatically, so the instance will need a manual
31+
restart after the error condition is removed.

arangod/RocksDBEngine/Listeners/RocksDBBackgroundErrorListener.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ void RocksDBBackgroundErrorListener::OnBackgroundError(rocksdb::BackgroundErrorR
3939
}
4040

4141
if (!_called.exchange(true)) {
42-
std::string operation = "unknown";
42+
char const* operation = "unknown";
4343
switch (reason) {
4444
case rocksdb::BackgroundErrorReason::kFlush: {
4545
operation = "flush";
@@ -61,8 +61,16 @@ void RocksDBBackgroundErrorListener::OnBackgroundError(rocksdb::BackgroundErrorR
6161

6262
LOG_TOPIC("fae2c", ERR, Logger::ROCKSDB)
6363
<< "RocksDB encountered a background error during a " << operation << " operation: "
64-
<< (status != nullptr ? status->ToString() : "unknown error") << "; The database will be put in read-only mode, and subsequent write errors are likely. It is advised to shut down this instance, resolve the error offline and then restart it.";
64+
<< (status != nullptr ? status->ToString() : "unknown error")
65+
<< "; The database will be put in read-only mode, and subsequent write errors are likely. It is advised to shut down this instance, resolve the error offline and then restart it.";
6566
}
6667
}
6768

69+
/// @brief Clears the "background error seen" flag and logs that RocksDB is
/// resuming operations.
///
/// Overrides rocksdb::EventListener::OnErrorRecoveryCompleted. The previous
/// background error status passed in by RocksDB is not used here.
/// NOTE(review): presumably RocksDB invokes this once it has recovered from a
/// background error on its own - confirm against the EventListener docs.
void RocksDBBackgroundErrorListener::OnErrorRecoveryCompleted(rocksdb::Status /* old_bg_error */) {
70+
// Reset the flag so that called() returns false again; this also re-arms
// the `!_called.exchange(true)` guard in OnBackgroundError, so a later
// background error is logged again. Relaxed ordering matches the relaxed
// load done in called().
_called.store(false, std::memory_order_relaxed);
71+
72+
// Logged at WARN level so that the recovery shows up in the server log.
LOG_TOPIC("8ff56", WARN, Logger::ROCKSDB)
73+
<< "RocksDB resuming operations after background error";
74+
}
75+
6876
} // namespace arangodb

arangod/RocksDBEngine/Listeners/RocksDBBackgroundErrorListener.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class RocksDBBackgroundErrorListener : public rocksdb::EventListener {
3737

3838
void OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status* error) override;
3939

40+
void OnErrorRecoveryCompleted(rocksdb::Status /* old_bg_error */) override;
41+
4042
bool called() const { return _called.load(std::memory_order_relaxed); }
4143

4244
private:

arangod/RocksDBEngine/RocksDBEngine.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2406,6 +2406,7 @@ DECLARE_GAUGE(rocksdb_total_disk_space, uint64_t, "rocksdb_total_disk_space");
24062406
DECLARE_GAUGE(rocksdb_total_inodes, uint64_t, "rocksdb_total_inodes");
24072407
DECLARE_GAUGE(rocksdb_total_sst_files_size, uint64_t, "rocksdb_total_sst_files_size");
24082408
DECLARE_GAUGE(rocksdb_engine_throttle_bps, uint64_t, "rocksdb_engine_throttle_bps");
2409+
DECLARE_GAUGE(rocksdb_read_only, uint64_t, "rocksdb_read_only");
24092410

24102411
void RocksDBEngine::getStatistics(std::string& result, bool v2) const {
24112412
VPackBuilder stats;
@@ -2618,6 +2619,10 @@ void RocksDBEngine::getStatistics(VPackBuilder& builder, bool v2) const {
26182619
}
26192620
}
26202621

2622+
if (_errorListener) {
2623+
builder.add("rocksdb.read-only", VPackValue(_errorListener->called() ? 1 : 0));
2624+
}
2625+
26212626
builder.close();
26222627
}
26232628

0 commit comments

Comments
 (0)
0