8000 improvements for memory-mapped files (#14657) · arangodb/arangodb@972a53b · GitHub
[go: up one dir, main page]

Skip to content

Commit 972a53b

Browse files
jsteemann and KVS85 authored
improvements for memory-mapped files (#14657)
Co-authored-by: Vadim <vadim@arangodb.com>
1 parent 7df9931 commit 972a53b

File tree

8 files changed

+231
-168
lines changed

8 files changed

+231
-168
lines changed

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
v3.7.15 (XXXX-XX-XX)
22
--------------------
33

4+
* When creating Pregel memory-mapped files, create them with O_TMPFILE attribute
5+
on Linux so that files are guaranteed to vanish even if a process dies.
6+
47
* Preselect "create index in background" option when creating indexes in the web
58
UI. The "create index in background" option can be less intrusive because it
69
allows other write operations on the collection to proceed.

arangod/Pregel/Algos/PageRank.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ struct PRMasterContext : public MasterContext {
115115
}
116116

117117
void preApplication() override {
118-
LOG_TOPIC("e0598", DEBUG, Logger::PREGEL) << "Using threshold " << _threshold;
119-
};
118+
LOG_TOPIC("e0598", DEBUG, Logger::PREGEL) << "Using threshold " << _threshold << " for pagerank";
119+
}
120120

121121
bool postGlobalSuperstep() override {
122122
float const* diff = getAggregatedValue<float>(kConvergence);

arangod/Pregel/GraphStore.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -296,12 +296,15 @@ void moveAppend(std::vector<X>& src, std::vector<X>& dst) {
296296
template<typename M>
297297
std::unique_ptr<TypedBuffer<M>> createBuffer(WorkerConfig const& config, size_t cap) {
298298
if (config.useMemoryMaps()) {
299-
auto ptr = std::make_unique<MappedFileBuffer<M>>(cap);
299+
// prefix used for logging in TypedBuffer.h
300+
std::string logPrefix = "[job " + std::to_string(config.executionNumber()) + "] ";
301+
302+
auto ptr = std::make_unique<MappedFileBuffer<M>>(cap, logPrefix);
300303
ptr->sequentialAccess();
301304
return ptr;
302-
} else {
303-
return std::make_unique<VectorTypedBuffer<M>>(cap);
304305
}
306+
307+
return std::make_unique<VectorTypedBuffer<M>>(cap);
305308
}
306309
}
307310

arangod/Pregel/TypedBuffer.h

Lines changed: 215 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -23,29 +23,36 @@
2323
#ifndef ARANGODB_PREGEL_BUFFER_H
2424
#define ARANGODB_PREGEL_BUFFER_H 1
2525

26+
#include <cstddef>
27+
#include <errno.h>
28+
#include <fcntl.h>
29+
#include <sys/stat.h>
30+
#include <sys/types.h>
31+
32+
#ifdef TRI_HAVE_UNISTD_H
33+
#include <unistd.h>
34+
#endif
35+
36+
#ifdef __linux__
37+
#include <sys/mman.h>
38+
#endif
39+
2640
#include "Basics/Common.h"
2741

2842
#include "Basics/FileUtils.h"
2943
#include "Basics/PageSize.h"
3044
#include "Basics/Thread.h"
45+
#include "Basics/debugging.h"
46+
#include "Basics/error.h"
3147
#include "Basics/files.h"
3248
#include "Basics/memory-map.h"
49+
#include "Basics/operating-system.h"
3350
#include "Basics/system-functions.h"
3451
#include "Logger/LogMacros.h"
3552
#include "Logger/Logger.h"
3653
#include "Logger/LoggerStream.h"
3754
#include "Random/RandomGenerator.h"
3855

39-
#include <cstddef>
40-
41-
#ifdef TRI_HAVE_UNISTD_H
42-
#include <unistd.h>
43-
#endif
44-
45-
#ifdef __linux__
46-
#include <sys/mman.h>
47-
#endif
48-
4956
namespace arangodb {
5057
namespace pregel {
5158

@@ -152,36 +159,34 @@ class VectorTypedBuffer : public TypedBuffer<T> {
152159
/** Filesize limited by size_t, usually 2^32 or 2^64 */
153160
template <typename T>
154161
class MappedFileBuffer : public TypedBuffer<T> {
155-
std::string _filename; // underlying filename
156-
int _fd = -1; // underlying file descriptor
157-
void* _mmHandle; // underlying memory map object handle (windows only)
158-
size_t _mappedSize; // actually mapped size
162+
std::string _logPrefix; // prefix used for logging
163+
std::string _filename; // underlying filename
164+
int _fd; // underlying file descriptor
165+
bool _temporary; // O_TMPFILE used?
166+
void* _mmHandle; // underlying memory map object handle (windows only)
167+
size_t _mappedSize; // actually mapped size
159168

160169
public:
161-
explicit MappedFileBuffer(size_t capacity)
162-
: TypedBuffer<T>() {
163-
TRI_ASSERT(capacity > 0u);
164-
double tt = TRI_microtime();
165-
int64_t tt2 = arangodb::RandomGenerator::interval((int64_t)0LL, (int64_t)0x7fffffffffffffffLL);
170+
MappedFileBuffer(MappedFileBuffer const& other) = delete;
171+
MappedFileBuffer& operator=(MappedFileBuffer const& other) = delete;
172+
173+
explicit MappedFileBuffer(size_t capacity, std::string const& logPrefix)
174+
: TypedBuffer<T>(),
175+
_logPrefix(logPrefix),
176+
_fd(-1),
177+
_temporary(false),
178+
_mmHandle(nullptr),
179+
_mappedSize(sizeof(T) * capacity) {
166180

167-
std::string file = "pregel-" +
168-
std::to_string(uint64_t(Thread::currentProcessId())) + "-" +
169-
std::to_string(uint64_t(tt)) + "-" +
170-
std::to_string(tt2) +
171-
".mmap";
172-
this->_filename = basics::FileUtils::buildFilename(TRI_GetTempPath(), file);
173-
174-
_mappedSize = sizeof(T) * capacity;
175181
size_t pageSize = PageSize::getValue();
176182
TRI_ASSERT(pageSize >= 256);
177183
// use multiples of page-size
178184
_mappedSize = (size_t)(((_mappedSize + pageSize - 1) / pageSize) * pageSize);
179185

180-
LOG_TOPIC("358e3", DEBUG, Logger::PREGEL) << "creating mmap file '" << _filename << "' with capacity " << capacity << " and size " << _mappedSize;
181-
182-
_fd = TRI_CreateDatafile(_filename, _mappedSize);
186+
_fd = createFile(_mappedSize);
187+
183188
if (_fd < 0) {
184-
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_SYS_ERROR, std::string("pregel cannot create mmap file '") + _filename + "': " + TRI_last_error());
189+
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_SYS_ERROR, std::string("pregel cannot create mmap ") + label() + ": " + TRI_last_error());
185190
}
186191

187192
// memory map the data
@@ -199,16 +204,16 @@ class MappedFileBuffer : public TypedBuffer<T> {
199204
TRI_CLOSE(_fd);
200205
_fd = -1;
201206

202-
// remove empty file
203-
TRI_UnlinkFile(_filename.c_str());
204-
205-
LOG_TOPIC("54dfb", ERR, arangodb::Logger::FIXME)
206-
<< "cannot memory map file '" << _filename << "': '"
207+
LOG_TOPIC("54dfb", ERR, arangodb::Logger::PREGEL) << _logPrefix
208+
<< "cannot memory map " << label() << ": '"
207209
<< TRI_errno_string(res) << "'";
208-
LOG_TOPIC("1a034", ERR, arangodb::Logger::FIXME)
210+
LOG_TOPIC("1a034", ERR, arangodb::Logger::PREGEL) << _logPrefix
209211
<< "The database directory might reside on a shared folder "
210212
"(VirtualBox, VMWare) or an NFS-mounted volume which does not "
211213
"allow memory mapped files.";
214+
215+
removeFile();
216+
212217
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, std::string("cannot memory map file '") +
213218
_filename + "': '" + TRI_errno_string(res) + "'");
214219
}
@@ -229,14 +234,6 @@ class MappedFileBuffer : public TypedBuffer<T> {
229234
TRI_MMFileAdvise(this->_begin, _mappedSize, TRI_MADVISE_RANDOM);
230235
}
231236

232-
void willNeed() {
233-
TRI_MMFileAdvise(this->_begin, _mappedSize, TRI_MADVISE_WILLNEED);
234-
}
235-
236-
void dontNeed() {
237-
TRI_MMFileAdvise(this->_begin, _mappedSize, TRI_MADVISE_DONTNEED);
238-
}
239-
240237
/// close file
241238
// cppcheck-suppress virtualCallInConstructor
242239
void close() override {
@@ -245,7 +242,7 @@ class MappedFileBuffer : public TypedBuffer<T> {
245242
return;
246243
}
247244

248-
LOG_TOPIC("45530", DEBUG, Logger::PREGEL) << "closing mmap file '" << _filename << "'";
245+
LOG_TOPIC("45530", DEBUG, Logger::PREGEL) << _logPrefix << "closing mmap " << label();
249246

250247
// destroy all elements in the buffer
251248
for (auto* p = this->_begin; p != this->_end; ++p) {
@@ -255,18 +252,18 @@ class MappedFileBuffer : public TypedBuffer<T> {
255252
int res = TRI_UNMMFile(this->_begin, _mappedSize, _fd, &_mmHandle);
256253
if (res != TRI_ERROR_NO_ERROR) {
257254
// leave file open here as it will still be memory-mapped
258-
LOG_TOPIC("ab7be", ERR, arangodb::Logger::FIXME) << "munmap failed with: " << res;
255+
LOG_TOPIC("ab7be", ERR, arangodb::Logger::PREGEL) << _logPrefix << "munmap failed with: " << res;
259256
}
260257
if (_fd != -1) {
261258
TRI_ASSERT(_fd >= 0);
262259
res = TRI_CLOSE(_fd);
263260
if (res != TRI_ERROR_NO_ERROR) {
264-
LOG_TOPIC("00e1d", ERR, arangodb::Logger::FIXME)
265-
<< "unable to close pregel mapped file '" << _filename << "': " << res;
261+
LOG_TOPIC("00e1d", ERR, arangodb::Logger::PREGEL) << _logPrefix
262+
<< "unable to close pregel mapped " << label() << ": " << res;
266263
}
267264

268-
// remove file
269-
TRI_UnlinkFile(this->_filename.c_str());
265+
removeFile();
266+
_filename.clear();
270267
}
271268

272269
this->_begin = nullptr;
@@ -277,6 +274,173 @@ class MappedFileBuffer : public TypedBuffer<T> {
277274

278275
/// true, if file successfully opened
279276
bool isValid() const { return this->_begin != nullptr; }
277+
278+
private:
279+
std::string label() const {
280+
if (_temporary) {
281+
return "temporary file in " + _filename;
282+
}
283+
return "file " + _filename;
284+
}
285+
286+
std::string buildFilename(bool temporary) const {
287+
if (temporary) {
288+
// only need a path
289+
return TRI_GetTempPath();
290+
}
291+
292+
double tt = TRI_microtime();
293+
int64_t tt2 = arangodb::RandomGenerator::interval((int64_t)0LL, (int64_t)0x7fffffffffffffffLL);
294+
295+
std::string file = "pregel-" +
296+
std::to_string(uint64_t(Thread::currentProcessId())) + "-" +
297+
std::to_string(uint64_t(tt)) + "-" +
298+
std::to_string(tt2) +
299+
".mmap";
300+
return basics::FileUtils::buildFilename(TRI_GetTempPath(), file);
301+
}
302+
303+
void removeFile() const {
304+
if (!_temporary && !_filename.empty()) {
305+
TRI_UnlinkFile(_filename.c_str());
306+
}
307+
}
308+
309+
/// @brief creates a new datafile
310+
/// returns the file descriptor or -1 if the file cannot be created
311+
int createFile(size_t maximalSize) {
312+
TRI_ERRORBUF;
313+
314+
#ifdef _WIN32
315+
bool temporary = false;
316+
#else
317+
bool temporary = true;
318+
#endif
319+
320+
// open the file
321+
int fd = -1;
322+
if (temporary) {
323+
_temporary = true;
324+
_filename = buildFilename(_temporary);
325+
// try creating a temporary file with O_TMPFILE first.
326+
// this may be unsupported.
327+
// in that case, we will fall back to creating a regular (non-temp) file.
328+
fd = TRI_CREATE(_filename.c_str(), O_EXCL | O_RDWR | TRI_O_CLOEXEC | TRI_NOATIME | TRI_O_TMPFILE,
329+
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
330+
// if fd is < 0, we will try without O_TMPFILE below.
331+
}
332+
333+
if (fd < 0) {
334+
_temporary = false;
335+
_filename = buildFilename(_temporary);
336+
fd = TRI_CREATE(_filename.c_str(), O_CREAT | O_EXCL | O_RDWR | TRI_O_CLOEXEC | TRI_NOATIME,
337+
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
338+
}
339+
340+
LOG_TOPIC("358e3", DEBUG, Logger::PREGEL) << _logPrefix
341+
<< "creating mmap " << label() << " of " << _mappedSize << " bytes capacity";
342+
343+
TRI_IF_FAILURE("CreateDatafile1") {
344+
// intentionally fail
345+
TRI_CLOSE(fd);
346+
fd = -1;
347+
errno = ENOSPC;
348+
}
349+
350+
if (fd < 0) {
351+
if (errno == ENOSPC) {
352+
TRI_set_errno(TRI_ERROR_ARANGO_FILESYSTEM_FULL);
353+
LOG_TOPIC("f7530", ERR, arangodb::Logger::PREGEL) << _logPrefix
354+
<< "cannot create " << label() << ": " << TRI_last_error();
355+
} else {
356+
TRI_SYSTEM_ERROR();
357+
358+
TRI_set_errno(TRI_ERROR_SYS_ERROR);
359+
LOG_TOPIC("53a75", ERR, arangodb::Logger::PREGEL) << _logPrefix
360+
<< "cannot create " << label() << ": " << TRI_GET_ERRORBUF;
361+
}
362+
363+
_filename.clear();
364+
return -1;
365+
}
366+
367+
// no fallocate present, or at least pretend it's not there...
368+
int res = 1;
369+
370+
#ifdef __linux__
371+
#ifdef FALLOC_FL_ZERO_RANGE
372+
// try fallocate
373+
res = fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, maximalSize);
374+
#endif
375+
#endif
376+
377+
// cppcheck-suppress knownConditionTrueFalse
378+
if (res != 0) {
379+
// either fallocate failed or it is not there...
380+
381+
// create a buffer filled with zeros
382+
static constexpr size_t nullBufferSize = 4096;
383+
char nullBuffer[nullBufferSize];
384+
memset(&nullBuffer[0], 0, nullBufferSize);
385+
386+
// fill file with zeros from buffer
387+
size_t writeSize = nullBufferSize;
388+
size_t written = 0;
389+
while (written < maximalSize) {
390+
if (writeSize + written > maximalSize) {
391+
writeSize = maximalSize - written;
392+
}
393+
394+
ssize_t writeResult = TRI_WRITE(fd, &nullBuffer[0], static_cast<TRI_write_t>(writeSize));
395+
396+
TRI_IF_FAILURE("CreateDatafile2") {
397+
// intentionally fail
398+
writeResult = -1;
399+
errno = ENOSPC;
400+
}
401+
402+
if (writeResult < 0) {
403+
if (errno == ENOSPC) {
404+
TRI_set_errno(TRI_ERROR_ARANGO_FILESYSTEM_FULL);
405+
LOG_TOPIC("449cf", ERR, arangodb::Logger::PREGEL) << _logPrefix
406+
<< "cannot create " << label() << ": " << TRI_last_error();
407+
} else {
408+
TRI_SYSTEM_ERROR();
409+
TRI_set_errno(TRI_ERROR_SYS_ERROR);
410+
LOG_TOPIC("2c4a6", ERR, arangodb::Logger::PREGEL) << _logPrefix
411+
<< "cannot create " << label() << ": " << TRI_GET_ERRORBUF;
412+
}
413+
414+
TRI_CLOSE(fd);
415+
removeFile();
416+
417+
return -1;
418+
}
419+
420+
written += static_cast<size_t>(writeResult);
421+
}
422+
}
423+
424+
// go back to offset 0
425+
TRI_lseek_t offset = TRI_LSEEK(fd, (TRI_lseek_t)0, SEEK_SET);
426+
427+
if (offset == (TRI_lseek_t)-1) {
428+
TRI_SYSTEM_ERROR();
429+
TRI_set_errno(TRI_ERROR_SYS_ERROR);
430+
TRI_CLOSE(fd);
431+
432+
LOG_TOPIC("dfc52", ERR, arangodb::Logger::PREGEL) << _logPrefix
433+
<< "cannot seek in " << label() << ": " << TRI_GET_ERRORBUF;
434+
435+
removeFile();
436+
_filename.clear();
437+
438+
return -1;
439+
}
440+
441+
return fd;
442+
}
443+
280444
};
281445
} // namespace pregel
282446
} // namespace arangodb

0 commit comments

Comments
 (0)
0