Initial implementation of host memory stats (#147660) · pytorch/pytorch@c65ee72 · GitHub

Commit c65ee72

mradmila authored and pytorchmergebot committed
Initial implementation of host memory stats (#147660)
This is an initial attempt to provide some statistics for the pinned host memory allocations flowing through CachingHostAllocator. Many times in the past we have had inexplicable slowdowns that would be much easier to diagnose if we had some host memory characteristics.

This change tries very hard not to disrupt the initial design of the allocator, and it uses the existing locking mechanisms, whenever possible, to gather statistics "for free". The only deviation from that is on the "slow path", where we incur CUDA calls anyway, so taking a short lock is not going to hurt performance much, especially in the steady state where most allocations will come from the cache.

As mentioned before, this is the first PR, meant to introduce the concept and to see if it fits the right paradigm. We can always add more later. Metrics that would require more involved changes to the code base and its locks, such as requested memory, have been punted for now. I also tried to reuse the Stat structure used in the CUDA caching allocator, in order to maintain symmetry.

Pull Request resolved: #147660
Approved by: https://github.com/ngimel
1 parent 70c5edb commit c65ee72

18 files changed: +797 −88 lines changed
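The commit message mentions reusing the Stat structure from the CUDA caching allocator. A minimal sketch of the subset of c10::CachingAllocator::Stat that this diff relies on (the allocated/current/freed/peak fields and the increase/decrease/reset helpers), inferred from how it is used below; the actual c10 definition is authoritative and may carry more than this:

    // Sketch only: approximates the Stat behavior assumed by this diff.
    #include <algorithm>
    #include <cstdint>

    struct StatSketch {
      int64_t current = 0;   // currently outstanding amount
      int64_t peak = 0;      // high-water mark of `current`
      int64_t allocated = 0; // total ever added
      int64_t freed = 0;     // total ever removed

      void increase(int64_t amount) {
        current += amount;
        peak = std::max(peak, current);
        allocated += amount;
      }
      void decrease(int64_t amount) {
        current -= amount;
        freed += amount;
      }
      void reset_accumulated() { allocated = 0; freed = 0; }
      void reset_peak() { peak = current; }
    };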

aten/src/ATen/core/CachingHostAllocator.h

Lines changed: 233 additions & 31 deletions
@@ -10,6 +10,9 @@
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
 namespace at {

+using c10::CachingAllocator::Stat;
+using c10::CachingAllocator::DurationStat;
+
 /**
  * HostBlock is typically a fundamental memory block used in pinned memory. It
  * is likely related to Event and Stream of device runtime. It is probably a
@@ -42,6 +45,60 @@ namespace {
 constexpr size_t MAX_SIZE_INDEX = 64;
 }

+// Struct containing memory allocator summary statistics for host.
+struct HostStats {
+  // COUNT: allocations requested by client code. Note that active
+  // count can be extracted by looking at current allocations
+  Stat allocation;
+  // COUNT: number of allocated segments from host memory allocation.
+  Stat segment;
+
+  // SUM: bytes allocated by this memory allocator. Note that active bytes
+  // can be extracted by looking at current bytes allocated
+  Stat allocated_bytes;
+  // SUM: bytes reserved by this memory allocator (both free and used)
+  Stat reserved_bytes;
+
+  // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
+  DurationStat host_alloc_time;
+
+  // SUM: time spent in cudaHostFree/cudaHostUnregister in microseconds
+  DurationStat host_free_time;
+
+  // COUNT: number of times cudaHostAlloc/cudaHostRegister was called because
+  // the request could not be satisfied from existing free blocks.
+  int64_t num_host_alloc = 0; // This is derived from segment or timing
+
+  // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
+  int64_t num_host_free = 0; // This is derived from segment or timing
+};
+
+// Struct containing memory allocator summary statistics for host, as they
+// are staged for reporting. This is a temporary struct that is used to
+// avoid locking the allocator while collecting stats.
+struct alignas(64) HostStatsStaged {
+  std::mutex timing_mutex_;
+  // COUNT: allocations requested by client code resulting in a new segment/block allocation
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocation;
+  // SUM: bytes within active memory blocks, including blocks that are
+  // currently in the free list.
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocated_bytes;
+  // COUNT: number of allocations per bucket
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: bytes of allocation per bucket
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: time spent in cudaHostAlloc/cudaHostRegister
+  // LOCK: access to this stat is protected by the timing_mutex_
+  DurationStat host_alloc_time;
+  // SUM: time spent in cudaHostFree/cudaHostUnregister
+  // LOCK: access to this stat is protected by the timing_mutex_
+  DurationStat host_free_time;
+};
+
 /**
  * Note [HostAllocator design]
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -103,6 +160,13 @@ namespace {
  *
  * Note that this caching host allocator does not split larger allocations into
  * smaller blocks, unlike the caching device allocator.
+ *
+ * In order to gather statistics about caching host allocator while minimally
+ * impacting performance, we use a HostStatsStaged struct to stage the stats
+ * before reporting them. This is done to avoid adding new locks to the allocator.
+ * Collecting stats is carefully done under existing locks, and then the staged
+ * stats are converted to the final stats when getStats is called. At that time
+ * we hold the same locks as empty_cache, to ensure the fidelity of the stats.
  */

 template <
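The stats collection described above repeatedly needs to hold one per-bucket free-list mutex together with blocks_mutex_, just as empty_cache does. A minimal, self-contained sketch of the deadlock-free two-mutex idiom used in getStats() and the reset methods further down (std::lock plus adopt_lock guards), independent of the allocator types:

    // Sketch of the locking idiom: std::lock acquires both mutexes with a
    // deadlock-avoidance algorithm, and the adopt_lock guards take ownership
    // so both are released on scope exit.
    #include <mutex>

    std::mutex bucket_mutex;
    std::mutex blocks_mutex;

    void with_both_locks() {
      std::lock(bucket_mutex, blocks_mutex);
      std::lock_guard<std::mutex> g1(bucket_mutex, std::adopt_lock);
      std::lock_guard<std::mutex> g2(blocks_mutex, std::adopt_lock);
      // ... read or reset the staged stats here ...
    }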
@@ -199,6 +263,8 @@ struct CachingHostAllocatorImpl {
       auto index = size_index(block->size_);
       std::lock_guard<std::mutex> g(free_list_[index].mutex_);
       free_list_[index].list_.push_back(block);
+      stats_.allocation_bucket_stats[index].decrease(1);
+      stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
     } else {
       // restore these events that record by used streams.
       std::lock_guard<std::mutex> g(events_mutex_);
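The bucket stats above are indexed by size_index(), the allocator's power-of-two size bucketing over MAX_SIZE_INDEX (64) buckets. A purely illustrative sketch of such a mapping, not the actual size_index() defined elsewhere in this header:

    // Illustrative only: map a request size to a power-of-two bucket index
    // in [0, 64). The real size_index() is authoritative and may differ.
    #include <cstddef>

    size_t bucket_index_sketch(size_t size) {
      size_t index = 0;
      size_t bucket_size = 1;
      while (bucket_size < size && index < 63) {
        bucket_size <<= 1; // next power of two
        ++index;
      }
      return index; // e.g. size 4096 -> index 12
    }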
@@ -253,9 +319,12 @@

       std::vector<B*> blocks_to_remove(free_list_[i].list_.begin(), free_list_[i].list_.end());
       free_list_[i].list_.clear();
+
       for (auto* block : blocks_to_remove) {
         blocks_.erase(block);
         ptr_to_block_.erase(block->ptr_);
+        stats_.allocation.decrease(1);
+        stats_.allocated_bytes.decrease(block->size_);
         free_block(block);
         delete block;
       }
@@ -274,11 +343,125 @@
     TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data");
   }

+  HostStats getStats() {
+    HostStats stats;
+
+    // To keep getStats lightweight we do *not* flush any available blocks
+    // into the free_list. This may skew the stats a bit.
+
+    auto add_bucket_stats = [](Stat& accumulator, const Stat& other) {
+      accumulator.allocated += other.allocated;
+      accumulator.current += other.current;
+      accumulator.freed += other.freed;
+      // Since peaks are measured per bucket independently, we add them up
+      // to estimate the total peak. This is not strictly correct, but it is
+      // the best approximation we can get after the fact.
+      accumulator.peak += other.peak;
+    };
+
+    // Accurate reading of memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+      // We collect the slow-path stats only once, since they are not collected
+      // per bucket (we pick index 0 arbitrarily). These are also all the host
+      // allocations, not taking into account caching and free lists.
+      if (i == 0) {
+        stats.segment = stats_.allocation;
+        stats.reserved_bytes = stats_.allocated_bytes;
+        stats.num_host_alloc = stats.segment.allocated;
+        stats.num_host_free = stats.segment.freed;
+      }
+
+      // Bucket stats need to be merged with the slow-path stats. We do this in
+      // a best effort manner, since we can't really replay the cached events per bucket.
+      add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
+      add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
+    }
+
+    // Get the timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+
+      stats.host_alloc_time = stats_.host_alloc_time;
+      stats.host_free_time = stats_.host_free_time;
+    }
+
+    return stats;
+  }
+
+  void resetAccumulatedStats() {
+    // Resetting accumulated memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+      if (i == 0) {
+        stats_.allocation.reset_accumulated();
+        stats_.allocated_bytes.reset_accumulated();
+      }
+      stats_.allocation_bucket_stats[i].reset_accumulated();
+      stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
+    }
+
+    // Also reset timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats_.host_alloc_time.reset_accumulated();
+      stats_.host_free_time.reset_accumulated();
+    }
+  }
+
+  void resetPeakStats() {
+    // Resetting peak memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+
+      if (i == 0) {
+        stats_.allocation.reset_peak();
+        stats_.allocated_bytes.reset_peak();
+      }
+      stats_.allocation_bucket_stats[i].reset_peak();
+      stats_.allocated_bytes_bucket_stats[i].reset_peak();
+    }
+
+    // Also reset timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats_.host_alloc_time.reset_peak();
+      stats_.host_free_time.reset_peak();
+    }
+  }
+
  private:
   virtual void add_allocated_block(B* block) {
     std::lock_guard<std::mutex> g(blocks_mutex_);
     blocks_.insert(block);
+    stats_.allocation.increase(1);
+    stats_.allocated_bytes.increase(block->size_);
     ptr_to_block_.insert({block->ptr_, block});
+
+    // Unfortunately, we have to, on the slow path, quickly
+    // lock the bucket to record the allocation. This should
+    // be a rare event once the cache is warmed up.
+    auto size = block->size_;
+    auto index = size_index(size);
+    {
+      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
+      stats_.allocation_bucket_stats[index].increase(1);
+      stats_.allocated_bytes_bucket_stats[index].increase(size);
+    }
   }

   virtual B* get_free_block(size_t size) {
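As the comment inside the add_bucket_stats lambda in getStats() above notes, summing per-bucket peaks can only over-approximate the true combined peak, because buckets rarely hit their high-water marks at the same moment. A small self-contained example of the effect, using a simplified stand-in for Stat with made-up values:

    // Demonstrates the peak over-approximation when merging per-bucket stats.
    #include <cstdint>
    #include <initializer_list>
    #include <iostream>

    struct SimpleStat { int64_t allocated{0}, current{0}, freed{0}, peak{0}; };

    int main() {
      // Bucket A peaked at 10 while bucket B was low; B peaked at 8 later.
      SimpleStat bucket_a{12, 3, 9, 10};
      SimpleStat bucket_b{8, 8, 0, 8};
      SimpleStat total;
      for (const SimpleStat& b : {bucket_a, bucket_b}) {
        total.allocated += b.allocated;
        total.current += b.current;
        total.freed += b.freed;
        total.peak += b.peak; // 18, even if the true combined peak was only 13
      }
      std::cout << "estimated peak: " << total.peak << "\n";
      return 0;
    }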
@@ -288,6 +471,8 @@
       B* block = free_list_[index].list_.back();
       free_list_[index].list_.pop_back();
       block->allocated_ = true;
+      stats_.allocation_bucket_stats[index].increase(1);
+      stats_.allocated_bytes_bucket_stats[index].increase(size);
       return block;
     }
     return nullptr;
@@ -381,6 +566,8 @@
     auto index = size_index(block->size_);
     std::lock_guard<std::mutex> g(free_list_[index].mutex_);
     free_list_[index].list_.push_back(block);
+    stats_.allocation_bucket_stats[index].decrease(1);
+    stats_.allocated_bytes_bucket_stats[index].decrease(size);
     if (size != -1) {
       return;
     }
@@ -393,42 +580,45 @@
     return pool;
   }

-  /* These following functions are runtime-related. */
-
-  // Allocate page-locked memory on the host.
-  virtual void allocate_host_memory(size_t size, void** ptr) {
-    TORCH_CHECK_NOT_IMPLEMENTED(
-        false, "Not implemented for allocate_host_memory");
-  }
-
-  // Free block and release the pointer contained in block.
-  virtual void free_block(B* block) {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
-  }
+  /* These following functions are runtime-related. */

-  // Record an event on stream and store event into events.
-  virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
-  }
+  // Allocate page-locked memory on the host.
+  virtual void allocate_host_memory(size_t size, void** ptr) {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "Not implemented for allocate_host_memory");
+  }

-  // Query event if it is completed.
-  virtual bool query_event(E& event) {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
-  }
+  // Free block and release the pointer contained in block.
+  virtual void free_block(B* block) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
+  }

-  alignas(64) std::mutex blocks_mutex_;
-  ska::flat_hash_set<B*> blocks_; // block list
-  ska::flat_hash_map<void*, B*> ptr_to_block_;
+  // Record an event on stream and store event into events.
+  virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
+  }

-  // We keep free list as a vector of free lists, one for each power of two
-  // size. This allows us to quickly find a free block of the right size.
-  // We use deque to store per size free list and guard the list with its own
-  // mutex.
-  alignas(64) std::vector<FreeBlockList<B>> free_list_ = std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+  // Query event if it is completed.
+  virtual bool query_event(E& event) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
+  }

-  alignas(64) std::mutex events_mutex_;
-  std::deque<std::pair<E, B*>> events_; // event queue paired with block
-};
+  alignas(64) std::mutex blocks_mutex_;
+  ska::flat_hash_set<B*> blocks_; // block list
+  ska::flat_hash_map<void*, B*> ptr_to_block_;
+
+  // We keep free list as a vector of free lists, one for each power of two
+  // size. This allows us to quickly find a free block of the right size.
+  // We use deque to store per size free list and guard the list with its own
+  // mutex.
+  alignas(64) std::vector<FreeBlockList<B>> free_list_ =
+      std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+
+  alignas(64) std::mutex events_mutex_;
+  std::deque<std::pair<E, B*>> events_; // event queue paired with block
+ protected:
+  alignas(64) HostStatsStaged stats_;
+};

 template <typename T>
 struct CachingHostAllocatorInterface : public at::Allocator {
@@ -456,6 +646,18 @@ struct CachingHostAllocatorInterface : public at::Allocator {
     impl_->copy_data(dest, src, count);
   }

+  HostStats getStats() {
+    return impl_->getStats();
+  }
+
+  void resetAccumulatedStats() {
+    impl_->resetAccumulatedStats();
+  }
+
+  void resetPeakStats() {
+    impl_->resetPeakStats();
+  }
+
   std::unique_ptr<T> impl_;
 };
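The forwarders added above are the surface a caller would use. A hypothetical usage sketch, not part of this PR (the helper name, include path, and the selection of printed fields are assumptions), that snapshots and reports pinned-memory stats from any allocator exposing this interface:

    // Hypothetical helper: query and print a HostStats snapshot from an
    // allocator that exposes the interface methods added in this diff.
    #include <ATen/core/CachingHostAllocator.h> // assumed include path for at::HostStats
    #include <iostream>

    template <typename HostAllocator>
    void report_pinned_memory(HostAllocator& allocator) {
      at::HostStats stats = allocator.getStats();
      std::cout << "cudaHostAlloc calls:       " << stats.num_host_alloc << "\n"
                << "cudaHostFree calls:        " << stats.num_host_free << "\n"
                << "reserved bytes (cur/peak): " << stats.reserved_bytes.current
                << " / " << stats.reserved_bytes.peak << "\n"
                << "allocations (cur/peak):    " << stats.allocation.current
                << " / " << stats.allocation.peak << "\n";
      allocator.resetPeakStats(); // start a fresh peak-tracking window
    }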
