Initial implementation of host memory stats (#147660) · pytorch/pytorch@c65ee72 · GitHub

Commit c65ee72

mradmila authored and pytorchmergebot committed
Initial implementation of host memory stats (#147660)
This is an initial attempt to provide some statistics for the pinned host memory allocations flowing through CachingHostAllocator. Many times in the past we have had inexplicable slowdowns that would be much easier to diagnose if we had some host memory characteristics.

This change tries very hard not to disrupt the initial design of the allocator, and it uses the existing locking mechanisms, whenever possible, to gather statistics "for free". The only deviation from that is on the "slow path", where we incur CUDA calls anyway, so taking a short lock is not going to hurt performance much, especially in the steady state where most allocations will come from the cache.

As mentioned before, this is the first PR, meant to introduce the concept and to see if it fits the right paradigm. We can always add more later. Metrics that would require more involved changes to the code base and its locks, such as requested memory, have been punted for now. I also tried to reuse the Stat structure used in the CUDA caching allocator, in order to maintain symmetry.

Pull Request resolved: #147660
Approved by: https://github.com/ngimel
1 parent 70c5edb commit c65ee72

18 files changed: +797 −88 lines changed
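The commit message mentions reusing the Stat structure from the CUDA caching allocator. A minimal sketch of the subset of c10::CachingAllocator::Stat that this diff relies on (the allocated/current/freed/peak fields and the increase/decrease/reset helpers), inferred from how it is used below; the actual c10 definition is authoritative and may carry more than this:

    // Sketch only: approximates the Stat behavior assumed by this diff.
    #include <algorithm>
    #include <cstdint>

    struct StatSketch {
      int64_t current = 0;   // currently outstanding amount
      int64_t peak = 0;      // high-water mark of `current`
      int64_t allocated = 0; // total ever added
      int64_t freed = 0;     // total ever removed

      void increase(int64_t amount) {
        current += amount;
        peak = std::max(peak, current);
        allocated += amount;
      }
      void decrease(int64_t amount) {
        current -= amount;
        freed += amount;
      }
      void reset_accumulated() { allocated = 0; freed = 0; }
      void reset_peak() { peak = current; }
    };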

aten/src/ATen/core/CachingHostAllocator.h

Lines changed: 233 additions & 31 deletions
@@ -10,6 +10,9 @@
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
 namespace at {

+using c10::CachingAllocator::Stat;
+using c10::CachingAllocator::DurationStat;
+
 /**
  * HostBlock is typically a fundamental memory block used in pinned memory. It
  * is likely related to Event and Stream of device runtime. It is probably a
@@ -42,6 +45,60 @@ namespace {
 constexpr size_t MAX_SIZE_INDEX = 64;
 }

+// Struct containing memory allocator summary statistics for host.
+struct HostStats {
+  // COUNT: allocations requested by client code. Note that active
+  // count can be extracted by looking at current allocations
+  Stat allocation;
+  // COUNT: number of allocated segments from host memory allocation.
+  Stat segment;
+
+  // SUM: bytes allocated by this memory allocator. Note that active bytes
+  // can be extracted by looking at current bytes allocated
+  Stat allocated_bytes;
+  // SUM: bytes reserved by this memory allocator (both free and used)
+  Stat reserved_bytes;
+
+  // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
+  DurationStat host_alloc_time;
+
+  // SUM: time spent in cudaHostFree/cudaHostUnregister in microseconds
+  DurationStat host_free_time;
+
+  // COUNT: number of times cudaHostAlloc/cudaHostRegister was called because
+  // the request could not be satisfied from existing free blocks.
+  int64_t num_host_alloc = 0; // This is derived from segment or timing
+
+  // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
+  int64_t num_host_free = 0; // This is derived from segment or timing
+};
+
+// Struct containing memory allocator summary statistics for host, as they
+// are staged for reporting. This is a temporary struct that is used to
+// avoid locking the allocator while collecting stats.
+struct alignas(64) HostStatsStaged {
+  std::mutex timing_mutex_;
+  // COUNT: allocations requested by client code resulting in a new segment/block allocation
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocation;
+  // SUM: bytes within active memory blocks, including blocks that are
+  // currently in the free list.
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocated_bytes;
+  // COUNT: number of allocations per bucket
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: bytes of allocation per bucket
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: time spent in cudaHostAlloc/cudaHostRegister
+  // LOCK: access to this stat is protected by the timing_mutex_
+  DurationStat host_alloc_time;
+  // SUM: time spent in cudaHostFree/cudaHostUnregister
+  // LOCK: access to this stat is protected by the timing_mutex_
+  DurationStat host_free_time;
+};
+
 /**
  * Note [HostAllocator design]
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -103,6 +160,13 @@ namespace {
  *
  * Note that this caching host allocator does not split larger allocations into
  * smaller blocks, unlike the caching device allocator.
+ *
+ * In order to gather statistics about caching host allocator while minimally
+ * impacting performance, we use a HostStatsStaged struct to stage the stats
+ * before reporting them. This is done to avoid adding new locks to the allocator.
+ * Collecting stats is carefully done under existing locks, and then the staged
+ * stats are converted to the final stats when getStats is called. At that time
+ * we hold the same locks as empty_cache, to ensure the fidelity of the stats.
  */

 template <
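The stats collection described above repeatedly needs to hold one per-bucket free-list mutex together with blocks_mutex_, just as empty_cache does. A minimal, self-contained sketch of the deadlock-free two-mutex idiom used in getStats() and the reset methods further down (std::lock plus adopt_lock guards), independent of the allocator types:

    // Sketch of the locking idiom: std::lock acquires both mutexes with a
    // deadlock-avoidance algorithm, and the adopt_lock guards take ownership
    // so both are released on scope exit.
    #include <mutex>

    std::mutex bucket_mutex;
    std::mutex blocks_mutex;

    void with_both_locks() {
      std::lock(bucket_mutex, blocks_mutex);
      std::lock_guard<std::mutex> g1(bucket_mutex, std::adopt_lock);
      std::lock_guard<std::mutex> g2(blocks_mutex, std::adopt_lock);
      // ... read or reset the staged stats here ...
    }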
@@ -199,6 +263,8 @@ struct CachingHostAllocatorImpl {
       auto index = size_index(block->size_);
       std::lock_guard<std::mutex> g(free_list_[index].mutex_);
       free_list_[index].list_.push_back(block);
+      stats_.allocation_bucket_stats[index].decrease(1);
+      stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
     } else {
       // restore these events that record by used streams.
       std::lock_guard<std::mutex> g(events_mutex_);
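The bucket stats above are indexed by size_index(), the allocator's power-of-two size bucketing over MAX_SIZE_INDEX (64) buckets. A purely illustrative sketch of such a mapping, not the actual size_index() defined elsewhere in this header:

    // Illustrative only: map a request size to a power-of-two bucket index
    // in [0, 64). The real size_index() is authoritative and may differ.
    #include <cstddef>

    size_t bucket_index_sketch(size_t size) {
      size_t index = 0;
      size_t bucket_size = 1;
      while (bucket_size < size && index < 63) {
        bucket_size <<= 1; // next power of two
        ++index;
      }
      return index; // e.g. size 4096 -> index 12
    }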
@@ -253,9 +319,12 @@

       std::vector<B*> blocks_to_remove(free_list_[i].list_.begin(), free_list_[i].list_.end());
       free_list_[i].list_.clear();
+
       for (auto* block : blocks_to_remove) {
         blocks_.erase(block);
         ptr_to_block_.erase(block->ptr_);
+        stats_.allocation.decrease(1);
+        stats_.allocated_bytes.decrease(block->size_);
         free_block(block);
         delete block;
       }
@@ -274,11 +343,125 @@
     TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data");
   }

+  HostStats getStats() {
+    HostStats stats;
+
+    // To keep getStats lightweight we do *not* flush any available blocks
+    // into the free_list. This may skew the stats a bit.
+
+    auto add_bucket_stats = [](Stat& accumulator, const Stat& other) {
+      accumulator.allocated += other.allocated;
+      accumulator.current += other.current;
+      accumulator.freed += other.freed;
+      // Since peaks are measured per bucket independently, we add them up
+      // to estimate the total peak. This is not strictly correct, but it is
+      // the best approximation we can get after the fact.
+      accumulator.peak += other.peak;
+    };
+
+    // Accurate reading of memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+      // We collect the slow-path stats only once, since they are not collected
+      // per bucket (we pick index 0 arbitrarily). These are also all the host
+      // allocations, not taking into account caching and free lists.
+      if (i == 0) {
+        stats.segment = stats_.allocation;
+        stats.reserved_bytes = stats_.allocated_bytes;
+        stats.num_host_alloc = stats.segment.allocated;
+        stats.num_host_free = stats.segment.freed;
+      }
+
+      // Bucket stats need to be merged with the slow-path stats. We do this in
+      // a best effort manner, since we can't really replay the cached events per bucket.
+      add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
+      add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
+    }
+
+    // Get the timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+
+      stats.host_alloc_time = stats_.host_alloc_time;
+      stats.host_free_time = stats_.host_free_time;
+    }
+
+    return stats;
+  }
+
+  void resetAccumulatedStats() {
+    // Resetting accumulated memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+      if (i == 0) {
+        stats_.allocation.reset_accumulated();
+        stats_.allocated_bytes.reset_accumulated();
+      }
+      stats_.allocation_bucket_stats[i].reset_accumulated();
+      stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
+    }
+
+    // Also reset timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats_.host_alloc_time.reset_accumulated();
+      stats_.host_free_time.reset_accumulated();
+    }
+  }
+
+  void resetPeakStats() {
+    // Resetting peak memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+      if (i == 0) {
+        stats_.allocation.reset_peak();
+        stats_.allocated_bytes.reset_peak();
+      }
+      stats_.allocation_bucket_stats[i].reset_peak();
+      stats_.allocated_bytes_bucket_stats[i].reset_peak();
+    }
+
+    // Also reset timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats_.host_alloc_time.reset_peak();
+      stats_.host_free_time.reset_peak();
+    }
+  }
+
  private:
   virtual void add_allocated_block(B* block) {
     std::lock_guard<std::mutex> g(blocks_mutex_);
     blocks_.insert(block);
+    stats_.allocation.increase(1);
+    stats_.allocated_bytes.increase(block->size_);
     ptr_to_block_.insert({block->ptr_, block});
+
+    // Unfortunately, we have to, on the slow path, quickly
+    // lock the bucket to record the allocation. This should
+    // be a rare event once the cache is warmed up.
+    auto size = block->size_;
+    auto index = size_index(size);
+    {
+      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
+      stats_.allocation_bucket_stats[index].increase(1);
+      stats_.allocated_bytes_bucket_stats[index].increase(size);
+    }
   }

   virtual B* get_free_block(size_t size) {
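As the comment inside the add_bucket_stats lambda in getStats() above notes, summing per-bucket peaks can only over-approximate the true combined peak, because buckets rarely hit their high-water marks at the same moment. A small self-contained example of the effect, using a simplified stand-in for Stat with made-up values:

    // Demonstrates the peak over-approximation when merging per-bucket stats.
    #include <cstdint>
    #include <initializer_list>
    #include <iostream>

    struct SimpleStat { int64_t allocated{0}, current{0}, freed{0}, peak{0}; };

    int main() {
      // Bucket A peaked at 10 while bucket B was low; B peaked at 8 later.
      SimpleStat bucket_a{12, 3, 9, 10};
      SimpleStat bucket_b{8, 8, 0, 8};
      SimpleStat total;
      for (const SimpleStat& b : {bucket_a, bucket_b}) {
        total.allocated += b.allocated;
        total.current += b.current;
        total.freed += b.freed;
        total.peak += b.peak; // 18, even if the true combined peak was only 13
      }
      std::cout << "estimated peak: " << total.peak << "\n";
      return 0;
    }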
@@ -288,6 +471,8 @@
       B* block = free_list_[index].list_.back();
       free_list_[index].list_.pop_back();
       block->allocated_ = true;
+      stats_.allocation_bucket_stats[index].increase(1);
+      stats_.allocated_bytes_bucket_stats[index].increase(size);
       return block;
     }
     return nullptr;
@@ -381,6 +566,8 @@
     auto index = size_index(block->size_);
     std::lock_guard<std::mutex> g(free_list_[index].mutex_);
     free_list_[index].list_.push_back(block);
+    stats_.allocation_bucket_stats[index].decrease(1);
+    stats_.allocated_bytes_bucket_stats[index].decrease(size);
     if (size != -1) {
       return;
     }
@@ -393,42 +580,45 @@
     return pool;
   }

-  /* These following functions are runtime-related. */
-
-  // Allocate page-locked memory on the host.
-  virtual void allocate_host_memory(size_t size, void** ptr) {
-    TORCH_CHECK_NOT_IMPLEMENTED(
-        false, "Not implemented for allocate_host_memory");
-  }
-
-  // Free block and release the pointer contained in block.
-  virtual void free_block(B* block) {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
-  }
+  /* These following functions are runtime-related. */

-  // Record an event on stream and store event into events.
-  virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
-  }
+  // Allocate page-locked memory on the host.
+  virtual void allocate_host_memory(size_t size, void** ptr) {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "Not implemented for allocate_host_memory");
+  }

-  // Query event if it is completed.
-  virtual bool query_event(E& event) {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
-  }
+  // Free block and release the pointer contained in block.
+  virtual void free_block(B* block) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
+  }

-  alignas(64) std::mutex blocks_mutex_;
-  ska::flat_hash_set<B*> blocks_; // block list
-  ska::flat_hash_map<void*, B*> ptr_to_block_;
+  // Record an event on stream and store event into events.
+  virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
+  }

-  // We keep free list as a vector of free lists, one for each power of two
-  // size. This allows us to quickly find a free block of the right size.
-  // We use deque to store per size free list and guard the list with its own
-  // mutex.
-  alignas(64) std::vector<FreeBlockList<B>> free_list_ = std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+  // Query event if it is completed.
+  virtual bool query_event(E& event) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
+  }

-  alignas(64) std::mutex events_mutex_;
-  std::deque<std::pair<E, B*>> events_; // event queue paired with block
-};
+  alignas(64) std::mutex blocks_mutex_;
+  ska::flat_hash_set<B*> blocks_; // block list
+  ska::flat_hash_map<void*, B*> ptr_to_block_;
+
+  // We keep free list as a vector of free lists, one for each power of two
+  // size. This allows us to quickly find a free block of the right size.
+  // We use deque to store per size free list and guard the list with its own
+  // mutex.
+  alignas(64) std::vector<FreeBlockList<B>> free_list_ =
+      std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+
+  alignas(64) std::mutex events_mutex_;
+  std::deque<std::pair<E, B*>> events_; // event queue paired with block
+ protected:
+  alignas(64) HostStatsStaged stats_;
+};

 template <typename T>
 struct CachingHostAllocatorInterface : public at::Allocator {
@@ -456,6 +646,18 @@ struct CachingHostAllocatorInterface : public at::Allocator {
     impl_->copy_data(dest, src, count);
   }

+  HostStats getStats() {
+    return impl_->getStats();
+  }
+
+  void resetAccumulatedStats() {
+    impl_->resetAccumulatedStats();
+  }
+
+  void resetPeakStats() {
+    impl_->resetPeakStats();
+  }
+
   std::unique_ptr<T> impl_;
 };
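The forwarders added above are the surface a caller would use. A hypothetical usage sketch, not part of this PR (the helper name, include path, and the selection of printed fields are assumptions), that snapshots and reports pinned-memory stats from any allocator exposing this interface:

    // Hypothetical helper: query and print a HostStats snapshot from an
    // allocator that exposes the interface methods added in this diff.
    #include <ATen/core/CachingHostAllocator.h> // assumed include path for at::HostStats
    #include <iostream>

    template <typename HostAllocator>
    void report_pinned_memory(HostAllocator& allocator) {
      at::HostStats stats = allocator.getStats();
      std::cout << "cudaHostAlloc calls:       " << stats.num_host_alloc << "\n"
                << "cudaHostFree calls:        " << stats.num_host_free << "\n"
                << "reserved bytes (cur/peak): " << stats.reserved_bytes.current
                << " / " << stats.reserved_bytes.peak << "\n"
                << "allocations (cur/peak):    " << stats.allocation.current
                << " / " << stats.allocation.peak << "\n";
      allocator.resetPeakStats(); // start a fresh peak-tracking window
    }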
