C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
namespace at {

+ using c10::CachingAllocator::Stat;
+ using c10::CachingAllocator::DurationStat;
+
/**
 * HostBlock is typically a fundamental memory block used in pinned memory. It
 * is likely related to Event and Stream of device runtime. It is probably a
@@ -42,6 +45,60 @@ namespace {
constexpr size_t MAX_SIZE_INDEX = 64;
}

+ // Struct containing memory allocator summary statistics for host.
+ struct HostStats {
+   // COUNT: allocations requested by client code. Note that the active
+   // count can be extracted by looking at current allocations.
+   Stat allocation;
+   // COUNT: number of allocated segments from host memory allocation.
+   Stat segment;
+
+   // SUM: bytes allocated by this memory allocator. Note that active bytes
+   // can be extracted by looking at current bytes allocated.
+   Stat allocated_bytes;
+   // SUM: bytes reserved by this memory allocator (both free and used)
+   Stat reserved_bytes;
+
+   // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
+   DurationStat host_alloc_time;
+
+   // SUM: time spent in cudaHostFree/cudaHostUnregister in microseconds
+   DurationStat host_free_time;
+
+   // COUNT: number of times cudaHostAlloc/cudaHostRegister was called because
+   // the request could not be satisfied from existing free blocks.
+   int64_t num_host_alloc = 0; // This is derived from segment or timing
+
+   // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
+   int64_t num_host_free = 0; // This is derived from segment or timing
+ };
+
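For orientation, the `Stat` fields read in `getStats()` further down this diff are `allocated`, `current`, `freed`, and `peak`. A hypothetical consumer of a `HostStats` snapshot might look like the sketch below (illustration only, not part of this PR):

```cpp
#include <cstdint>
#include <cstdio>

// Illustration only: interpret a HostStats snapshot returned by getStats().
void log_host_stats(const HostStats& stats) {
  // Pinned allocations handed to clients and not yet freed.
  const int64_t active_allocations = stats.allocation.current;
  // Bytes currently backed by cudaHostAlloc/cudaHostRegister segments,
  // whether in use by clients or parked on a free list.
  const int64_t reserved_now = stats.reserved_bytes.current;
  std::printf(
      "active allocs: %lld, reserved: %lld bytes (peak %lld)\n",
      static_cast<long long>(active_allocations),
      static_cast<long long>(reserved_now),
      static_cast<long long>(stats.reserved_bytes.peak));
}
```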
+ // Struct containing memory allocator summary statistics for host, as they
+ // are staged for reporting. This is a temporary struct that is used to
+ // avoid locking the allocator while collecting stats.
+ struct alignas(64) HostStatsStaged {
+   std::mutex timing_mutex_;
+   // COUNT: allocations requested by client code resulting in a new
+   // segment/block allocation
+   // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+   Stat allocation;
+   // SUM: bytes within active memory blocks, including blocks that are
+   // currently in the free list.
+   // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+   Stat allocated_bytes;
+   // COUNT: number of allocations per bucket
+   // LOCK: access to this stat is protected by the per-bucket free_list_[index].mutex_
+   std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+   // SUM: bytes of allocation per bucket
+   // LOCK: access to this stat is protected by the per-bucket free_list_[index].mutex_
+   std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+   // SUM: time spent in cudaHostAlloc/cudaHostRegister
+   // LOCK: access to this stat is protected by the timing_mutex_
+   DurationStat host_alloc_time;
+   // SUM: time spent in cudaHostFree/cudaHostUnregister
+   // LOCK: access to this stat is protected by the timing_mutex_
+   DurationStat host_free_time;
+ };
+
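The `alignas(64)` on the struct (and on the allocator members further down) plausibly serves as a false-sharing guard, giving each mutex-protected group its own cache line; that rationale is our inference, not stated in the diff. A minimal sketch of the idea, assuming a 64-byte line:

```cpp
#include <atomic>

// Two counters bumped by different threads: padding each to its own
// cache line avoids the line ping-ponging between cores.
struct alignas(64) PaddedCounter {
  std::atomic<long> value{0};
};

PaddedCounter fast_path_hits; // own cache line
PaddedCounter slow_path_hits; // own cache line
```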
/**
 * Note [HostAllocator design]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -103,6 +160,13 @@ namespace {
 *
 * Note that this caching host allocator does not split larger allocations into
 * smaller blocks, unlike the caching device allocator.
+ *
+ * In order to gather statistics about the caching host allocator while
+ * minimally impacting performance, we use a HostStatsStaged struct to stage
+ * the stats before reporting them. This is done to avoid adding new locks to
+ * the allocator. Collecting stats is carefully done under existing locks, and
+ * the staged stats are converted to the final stats when getStats is called.
+ * At that time we hold the same locks as empty_cache, to ensure the fidelity of the stats.
 */

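The `Stat` operations this file relies on (`increase`, `decrease`, `reset_accumulated`, `reset_peak`, plus the `allocated`/`current`/`freed`/`peak` fields) come from `c10::CachingAllocator::Stat`. As a rough mental model only — the real type lives in c10 and may differ in detail — the semantics are approximately:

```cpp
#include <algorithm>
#include <cstdint>

// Approximate model of c10::CachingAllocator::Stat; field and method names
// match the uses in this file, everything else is assumption.
struct StatModel {
  int64_t current = 0;   // live value right now
  int64_t peak = 0;      // high-water mark of `current`
  int64_t allocated = 0; // cumulative increases since reset_accumulated()
  int64_t freed = 0;     // cumulative decreases since reset_accumulated()

  void increase(int64_t amount) {
    current += amount;
    allocated += amount;
    peak = std::max(peak, current);
  }
  void decrease(int64_t amount) {
    current -= amount;
    freed += amount;
  }
  void reset_accumulated() { allocated = 0; freed = 0; }
  void reset_peak() { peak = current; }
};
```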
template <
@@ -199,6 +263,8 @@ struct CachingHostAllocatorImpl {
    auto index = size_index(block->size_);
    std::lock_guard<std::mutex> g(free_list_[index].mutex_);
    free_list_[index].list_.push_back(block);
+   stats_.allocation_bucket_stats[index].decrease(1);
+   stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
  } else {
    // restore the events recorded by used streams.
    std::lock_guard<std::mutex> g(events_mutex_);
@@ -253,9 +319,12 @@ struct CachingHostAllocatorImpl {

    std::vector<B*> blocks_to_remove(free_list_[i].list_.begin(), free_list_[i].list_.end());
    free_list_[i].list_.clear();
+
    for (auto* block : blocks_to_remove) {
      blocks_.erase(block);
      ptr_to_block_.erase(block->ptr_);
+     stats_.allocation.decrease(1);
+     stats_.allocated_bytes.decrease(block->size_);
      free_block(block);
      delete block;
    }
@@ -274,11 +343,125 @@ struct CachingHostAllocatorImpl {
    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data");
  }

+ HostStats getStats() {
+   HostStats stats;
+
+   // To keep getStats lightweight we do *not* flush any available blocks
+   // into the free_list. This may skew the stats a bit.
+
+   auto add_bucket_stats = [](Stat& accumulator, const Stat& other) {
+     accumulator.allocated += other.allocated;
+     accumulator.current += other.current;
+     accumulator.freed += other.freed;
+     // Since peaks are measured per bucket independently, we add them up
+     // to estimate the total peak. This is not strictly correct, but it is
+     // the best approximation we can get after the fact.
+     accumulator.peak += other.peak;
+   };
+
+   // Accurate reading of memory stats requires concurrently holding both
+   // the free-list mutexes and the blocks mutex. Previously, this was only
+   // done in the empty_cache function.
+   for (size_t i = 0; i < free_list_.size(); ++i) {
+     std::lock(free_list_[i].mutex_, blocks_mutex_);
+     std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+     std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+     // We collect the slow-path stats only once, since they are not collected
+     // per bucket (we pick index 0 arbitrarily). These are also all the host
+     // allocations, not taking into account caching and free lists.
+     if (i == 0) {
+       stats.segment = stats_.allocation;
+       stats.reserved_bytes = stats_.allocated_bytes;
+       stats.num_host_alloc = stats.segment.allocated;
+       stats.num_host_free = stats.segment.freed;
+     }
+
+     // Bucket stats need to be merged with the slow-path stats. We do this in
+     // a best-effort manner, since we can't really replay the cached events per bucket.
+     add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
+     add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
+   }
+
+   // Get the timing stats
+   {
+     std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+
+     stats.host_alloc_time = stats_.host_alloc_time;
+     stats.host_free_time = stats_.host_free_time;
+   }
+
+   return stats;
+ }
+
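The `std::lock` / `std::adopt_lock` pairing above acquires the per-bucket mutex and `blocks_mutex_` together with deadlock avoidance, mirroring what `empty_cache` does with the same two locks. Since C++17 the identical idiom can be written more compactly with `std::scoped_lock`; a minimal standalone sketch:

```cpp
#include <mutex>

std::mutex bucket_mutex;
std::mutex blocks_mutex;

void snapshot_under_both_locks() {
  // Equivalent to std::lock(a, b) followed by two adopting lock_guards:
  // scoped_lock applies the same deadlock-avoidance algorithm.
  std::scoped_lock lock(bucket_mutex, blocks_mutex);
  // ... read or reset stats while both mutexes are held ...
}
```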
+ void resetAccumulatedStats() {
+   // Resetting accumulated memory stats requires concurrently holding both
+   // the free-list mutexes and the blocks mutex. Previously, this was only
+   // done in the empty_cache function.
+   for (size_t i = 0; i < free_list_.size(); ++i) {
+     std::lock(free_list_[i].mutex_, blocks_mutex_);
+     std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+     std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+     if (i == 0) {
+       stats_.allocation.reset_accumulated();
+       stats_.allocated_bytes.reset_accumulated();
+     }
+     stats_.allocation_bucket_stats[i].reset_accumulated();
+     stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
+   }
+
+   // Also reset timing stats
+   {
+     std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+     stats_.host_alloc_time.reset_accumulated();
+     stats_.host_free_time.reset_accumulated();
+   }
+ }
+
+ void resetPeakStats() {
+   // Resetting peak memory stats requires concurrently holding both the
+   // free-list mutexes and the blocks mutex. Previously, this was only
+   // done in the empty_cache function.
+   for (size_t i = 0; i < free_list_.size(); ++i) {
+     std::lock(free_list_[i].mutex_, blocks_mutex_);
+     std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+     std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+     if (i == 0) {
+       stats_.allocation.reset_peak();
+       stats_.allocated_bytes.reset_peak();
+     }
+     stats_.allocation_bucket_stats[i].reset_peak();
+     stats_.allocated_bytes_bucket_stats[i].reset_peak();
+   }
+
+   // Also reset timing stats
+   {
+     std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+     stats_.host_alloc_time.reset_peak();
+     stats_.host_free_time.reset_peak();
+   }
+ }
+
 private:
  virtual void add_allocated_block(B* block) {
    std::lock_guard<std::mutex> g(blocks_mutex_);
    blocks_.insert(block);
+   stats_.allocation.increase(1);
+   stats_.allocated_bytes.increase(block->size_);
    ptr_to_block_.insert({block->ptr_, block});
+
+   // Unfortunately, on the slow path we have to briefly lock the
+   // bucket to record the allocation. This should be a rare event
+   // once the cache is warmed up.
+   auto size = block->size_;
+   auto index = size_index(size);
+   {
+     std::lock_guard<std::mutex> g(free_list_[index].mutex_);
+     stats_.allocation_bucket_stats[index].increase(1);
+     stats_.allocated_bytes_bucket_stats[index].increase(size);
+   }
  }
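`size_index` is defined elsewhere in this file; per the free-list comment later in this diff there is one bucket per power-of-two size class, capped by `MAX_SIZE_INDEX`. A hypothetical stand-in, for illustration only:

```cpp
#include <cstddef>

constexpr size_t kMaxSizeIndex = 64; // mirrors MAX_SIZE_INDEX above

// Hypothetical equivalent of size_index: the smallest power-of-two
// bucket that can hold `size` bytes, clamped to the last bucket.
inline size_t size_index_sketch(size_t size) {
  size_t index = 0;
  for (size_t bucket = 1; bucket < size && index < kMaxSizeIndex - 1;
       bucket <<= 1) {
    ++index;
  }
  return index;
}
```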

  virtual B* get_free_block(size_t size) {
@@ -288,6 +471,8 @@ struct CachingHostAllocatorImpl {
      B* block = free_list_[index].list_.back();
      free_list_[index].list_.pop_back();
      block->allocated_ = true;
+     stats_.allocation_bucket_stats[index].increase(1);
+     stats_.allocated_bytes_bucket_stats[index].increase(size);
      return block;
    }
    return nullptr;
@@ -381,6 +566,8 @@ struct CachingHostAllocatorImpl {
    auto index = size_index(block->size_);
    std::lock_guard<std::mutex> g(free_list_[index].mutex_);
    free_list_[index].list_.push_back(block);
+   stats_.allocation_bucket_stats[index].decrease(1);
+   stats_.allocated_bytes_bucket_stats[index].decrease(size);
    if (size != -1) {
      return;
    }
@@ -393,42 +580,45 @@ struct CachingHostAllocatorImpl {
    return pool;
  }

- /* These following functions are runtime-related. */
-
- // Allocate page-locked memory on the host.
- virtual void allocate_host_memory(size_t size, void** ptr) {
-   TORCH_CHECK_NOT_IMPLEMENTED(
-       false, "Not implemented for allocate_host_memory");
- }
-
- // Free block and release the pointer contained in block.
- virtual void free_block(B* block) {
-   TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
- }
+ /* These following functions are runtime-related. */

- // Record an event on stream and store event into events.
- virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
-   TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
- }
+ // Allocate page-locked memory on the host.
+ virtual void allocate_host_memory(size_t size, void** ptr) {
+   TORCH_CHECK_NOT_IMPLEMENTED(
+       false, "Not implemented for allocate_host_memory");
+ }

- // Query event if it is completed.
- virtual bool query_event(E& event) {
-   TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
- }
+ // Free block and release the pointer contained in block.
+ virtual void free_block(B* block) {
+   TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
+ }

- alignas(64) std::mutex blocks_mutex_;
- ska::flat_hash_set<B*> blocks_; // block list
- ska::flat_hash_map<void*, B*> ptr_to_block_;
+ // Record an event on stream and store event into events.
+ virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
+   TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
+ }

- // We keep free list as a vector of free lists, one for each power of two
- // size. This allows us to quickly find a free block of the right size.
- // We use deque to store per size free list and guard the list with its own
- // mutex.
- alignas(64) std::vector<FreeBlockList<B>> free_list_ = std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+ // Query event if it is completed.
+ virtual bool query_event(E& event) {
+   TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
+ }

- alignas(64) std::mutex events_mutex_;
- std::deque<std::pair<E, B*>> events_; // event queue paired with block
- };
+ alignas(64) std::mutex blocks_mutex_;
+ ska::flat_hash_set<B*> blocks_; // block list
+ ska::flat_hash_map<void*, B*> ptr_to_block_;
+
+ // We keep free list as a vector of free lists, one for each power of two
+ // size. This allows us to quickly find a free block of the right size.
+ // We use deque to store per size free list and guard the list with its own
+ // mutex.
+ alignas(64) std::vector<FreeBlockList<B>> free_list_ =
+     std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+
+ alignas(64) std::mutex events_mutex_;
+ std::deque<std::pair<E, B*>> events_; // event queue paired with block
+ protected:
+ alignas(64) HostStatsStaged stats_;
+ };

template <typename T>
struct CachingHostAllocatorInterface : public at::Allocator {
@@ -456,6 +646,18 @@ struct CachingHostAllocatorInterface : public at::Allocator {
    impl_->copy_data(dest, src, count);
  }

+ HostStats getStats() {
+   return impl_->getStats();
+ }
+
+ void resetAccumulatedStats() {
+   impl_->resetAccumulatedStats();
+ }
+
+ void resetPeakStats() {
+   impl_->resetPeakStats();
+ }
+
  std::unique_ptr<T> impl_;
};
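With these three pass-throughs, the host allocator interface exposes the same query/reset triple as the device caching allocator. A hypothetical caller (the accessor that yields the concrete allocator instance is outside this diff) might drive it like this:

```cpp
#include <iostream>

// Hypothetical reporting helper; `allocator` is any concrete
// CachingHostAllocatorInterface<T> instance.
template <typename Allocator>
void report_and_reset(Allocator& allocator) {
  HostStats stats = allocator.getStats();
  std::cout << "host segments: " << stats.segment.current
            << ", reserved bytes: " << stats.reserved_bytes.current
            << " (peak " << stats.reserved_bytes.peak << ")\n";
  allocator.resetAccumulatedStats(); // zero the allocated/freed counters
  allocator.resetPeakStats();        // peaks restart from current values
}
```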