pytorch/pytorch · commit a006a53
switches to flat_hash_set for recorded streams, restores original max_split_size() retrieval to avoid int64<->size_t comparison warnings
1 parent 1cc5d02 commit a006a53

3 files changed: +77 −69 lines

c10/cuda/CUDACachingAllocator.cpp (23 additions, 24 deletions)
@@ -310,11 +310,13 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
 #endif
 }
 
-} // namespace
+} // anonymous namespace
+} // namespace Native
 
 // Environment config parser
 // Defined here, rather than its own .cpp file,
 // because parseArgs needs to know kLargeBuffer.
+// Defined outside namespace Native because it's not Native-specific.
 class CachingAllocatorConfig {
  public:
   static AllocatorBackend allocator_backend() {
@@ -379,11 +381,11 @@ class CachingAllocatorConfig {
       if (kv[0].compare("max_split_size_mb") == 0) {
         size_t val2 = stoi(kv[1]);
         TORCH_CHECK(
-            val2 > kLargeBuffer / (1024 * 1024),
+            val2 > Native::kLargeBuffer / (1024 * 1024),
             "CachingAllocator option max_split_size_mb too small, must be > ",
-            kLargeBuffer / (1024 * 1024),
+            Native::kLargeBuffer / (1024 * 1024),
             "");
-        val2 = std::max(val2, kLargeBuffer / (1024 * 1024));
+        val2 = std::max(val2, Native::kLargeBuffer / (1024 * 1024));
         val2 = std::min(
             val2, (std::numeric_limits<size_t>::max() / (1024 * 1024)));
         m_max_split_size = val2 * 1024 * 1024;
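Aside: the parser clamps the user-supplied megabyte count into [kLargeBuffer/1MiB, SIZE_MAX/1MiB] before converting to bytes. A minimal standalone sketch of that arithmetic, assuming kLargeBuffer is 20 MiB as in this file (names are local to the sketch, not the allocator's):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <limits>

// Hypothetical stand-in for Native::kLargeBuffer (20 MiB).
constexpr size_t kLargeBuffer = 20971520;

size_t parse_max_split_size_mb(size_t mb) {
  // Raise values below the large-buffer threshold...
  mb = std::max(mb, kLargeBuffer / (1024 * 1024));
  // ...and cap so the later * 1024 * 1024 cannot overflow size_t.
  mb = std::min(mb, std::numeric_limits<size_t>::max() / (1024 * 1024));
  return mb * 1024 * 1024; // stored in bytes
}

int main() {
  std::printf("%zu\n", parse_max_split_size_mb(512)); // 536870912
}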
@@ -452,6 +454,8 @@ class CachingAllocatorConfig {
   }
 };
 
+namespace Native {
+
 class DeviceCachingAllocator {
  private:
   // lock around all operations
@@ -509,7 +513,7 @@ class DeviceCachingAllocator {
   DeviceCachingAllocator()
       : large_blocks(BlockComparator, /*is_small=*/false),
         small_blocks(BlockComparator, /*is_small=*/true) {
-    stats.max_split_size = CUDACachingAllocator::maxSplitSize();
+    stats.max_split_size = CachingAllocatorConfig::max_split_size();
  }
 
  // All public methods (except the above) acquire the allocator mutex.
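Aside: the int64<->size_t note in the commit message refers to mixing a signed cached stat (the stats structs here hold max_split_size as int64_t) with the unsigned size_t getter; reading CachingAllocatorConfig::max_split_size() at each comparison site keeps both operands unsigned. A hypothetical reduction of the warning, with stand-in types rather than the allocator's real ones:

#include <cstddef>
#include <cstdint>

struct Stats {
  int64_t max_split_size = 0; // signed, as in the DeviceStats-style structs
};

// size_t vs. int64_t: the signed operand converts to unsigned and
// -Wsign-compare fires on this comparison.
bool oversize_warns(const Stats& stats, size_t block_size) {
  return block_size >= stats.max_split_size;
}

// Stand-in for CachingAllocatorConfig::max_split_size().
size_t max_split_size() {
  return 1024 * 1024;
}

// Both operands are size_t, so the comparison is warning-free.
bool oversize_clean(size_t block_size) {
  return block_size >= max_split_size();
}

int main() {
  Stats s;
  (void)oversize_warns(s, 1024);
  (void)oversize_clean(1024);
}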
@@ -676,7 +680,7 @@ class DeviceCachingAllocator {
       update_stat(stats.active[stat_type], 1);
       update_stat(stats.active_bytes[stat_type], block->size);
     });
-    if (block->size >= stats.max_split_size)
+    if (block->size >= CachingAllocatorConfig::max_split_size())
       update_stat(stats.oversize_allocations, 1);
 
     c10::reportMemoryUsageToProfiler(
@@ -707,7 +711,7 @@ class DeviceCachingAllocator {
       update_stat(stats.allocation[stat_type], -1);
       update_stat(stats.allocated_bytes[stat_type], -block->size);
     });
-    if (block->size >= stats.max_split_size)
+    if (block->size >= CachingAllocatorConfig::max_split_size())
       update_stat(stats.oversize_allocations, -1);
 
     if (!block->stream_uses.empty()) {
@@ -1133,7 +1137,7 @@ class DeviceCachingAllocator {
     if (block->pool->is_small) {
       return remaining >= kMinBlockSize;
     } else {
-      return (size < stats.max_split_size) &&
+      return (size < CachingAllocatorConfig::max_split_size()) &&
           (remaining > kSmallSize);
     }
   }
@@ -1162,11 +1166,11 @@ class DeviceCachingAllocator {
     if (it == pool.blocks.end() || (*it)->stream != p.stream())
       return false;
     // Do not return an oversized block for a large request
-    if ((p.size() < stats.max_split_size) &&
-        ((*it)->size >= stats.max_split_size))
+    if ((p.size() < CachingAllocatorConfig::max_split_size()) &&
+        ((*it)->size >= CachingAllocatorConfig::max_split_size()))
       return false;
     // Allow oversized block size to be rounded up but within a limit
-    if ((p.size() >= stats.max_split_size) &&
+    if ((p.size() >= CachingAllocatorConfig::max_split_size()) &&
         ((*it)->size >= p.size() + kLargeBuffer))
       return false;
     p.block = *it;
@@ -1288,7 +1292,7 @@ class DeviceCachingAllocator {
       update_stat(stats.segment[stat_type], 1);
       update_stat(stats.reserved_bytes[stat_type], size);
     });
-    if (size >= stats.max_split_size)
+    if (size >= CachingAllocatorConfig::max_split_size())
       update_stat(stats.oversize_segments, 1);
 
     // p.block came from new, not cudaMalloc. It should not be nullptr here.
@@ -1300,13 +1304,13 @@ class DeviceCachingAllocator {
    * **/
   /** to satisfy the target size **/
   bool release_available_cached_blocks(const AllocParams& p) {
-    if (stats.max_split_size ==
+    if (CachingAllocatorConfig::max_split_size() ==
         std::numeric_limits<size_t>::max())
       return false;
     BlockPool& pool = *p.pool;
     Block key = p.search_key;
-    key.size = (key.size < stats.max_split_size)
-        ? stats.max_split_size
+    key.size = (key.size < CachingAllocatorConfig::max_split_size())
+        ? CachingAllocatorConfig::max_split_size()
         : key.size;
     auto it = pool.blocks.lower_bound(&key);
     if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
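Aside: this routine looks up the smallest cached block at or above max_split_size in the size-ordered pool, then walks the oversize blocks. A standalone sketch of that lower_bound pattern over a sorted set of pointers (toy types, not the allocator's real Block/BlockPool):

#include <cstddef>
#include <cstdio>
#include <set>

struct Block { size_t size; };

struct BlockSizeLess {
  bool operator()(const Block* a, const Block* b) const {
    return a->size < b->size;
  }
};

int main() {
  std::set<Block*, BlockSizeLess> pool;
  Block a{512}, b{4096}, c{1 << 20};
  pool.insert(&a);
  pool.insert(&b);
  pool.insert(&c);

  const size_t max_split_size = 2048;
  Block key{max_split_size};
  // First cached block whose size is >= max_split_size, i.e. the smallest
  // "oversize" block eligible for release.
  auto it = pool.lower_bound(&key);
  if (it != pool.end())
    std::printf("smallest oversize block: %zu bytes\n", (*it)->size); // 4096
}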
@@ -1318,7 +1322,7 @@ class DeviceCachingAllocator {
     --it; // Back up one item. Now on the largest block for the correct
           // stream
     while ((totalReleased < key.size) &&
-           ((*it)->size >= stats.max_split_size) &&
+           ((*it)->size >= CachingAllocatorConfig::max_split_size()) &&
            ((*it)->stream == p.stream())) {
       auto cur = it;
       totalReleased += (*it)->size;
@@ -1383,7 +1387,7 @@ class DeviceCachingAllocator {
       update_stat(stats.segment[stat_type], -1);
       update_stat(stats.reserved_bytes[stat_type], -block->size);
     });
-    if (block->size >= stats.max_split_size)
+    if (block->size >= CachingAllocatorConfig::max_split_size())
       update_stat(stats.oversize_segments, -1);
 
     pool->blocks.erase(block);
@@ -1870,18 +1874,13 @@ std::shared_ptr<void> getIpcDevPtr(std::string handle) {
 // General caching allocator utilities
 
 // External config interface (declared in CUDACachingAllocator.h)
-// Should we bother having these two functions?
-// They are basically useless layers of indirection, but a minor
-// code-cleanliness benefit is they alleviate the need to define
+// This is a useless layer of indirection with a minor
+// code-cleanliness benefit: it alleviates the need to define
 // CachingAllocatorConfig itself in CUDACachingAllocator.h.
 AllocatorBackend allocatorBackend() {
   return CachingAllocatorConfig::allocator_backend();
 }
 
-size_t maxSplitSize() {
-  return CachingAllocatorConfig::max_split_size();
-}
-
 // Size pretty-printer
 inline std::string format_size(uint64_t size) {
   std::ostringstream os;

c10/cuda/CUDACachingAllocator.h (1 addition, 2 deletions)
@@ -125,10 +125,9 @@ enum struct AllocatorBackend : uint8_t {
 };
 
 C10_CUDA_API AllocatorBackend allocatorBackend();
-C10_CUDA_API size_t maxSplitSize();
 
 // Size pretty-printer
-inline std::string format_size(uint64_t size);
+std::string format_size(uint64_t size);
 
 #define CUDA_ALLOCATOR_BACKEND_INTERFACE \
   C10_CUDA_API void* raw_alloc(size_t nbytes); \
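Aside: dropping inline from the declaration likely matters because an inline function must be defined in every translation unit that odr-uses it; an inline declaration in a header whose definition lives in a single .cpp leaves every other TU without one (typically surfacing as an undefined reference at link time). A hypothetical reduction:

#include <cstdint>
#include <string>

// If a header said:
//
//   inline std::string format_size(uint64_t size); // defined only in one .cpp
//
// then any other TU calling format_size would odr-use an inline function it
// never sees defined. An ordinary declaration gives external linkage instead:
std::string format_size(uint64_t size); // defined once, in the .cpp

int main() {} // compiles standalone; calling format_size needs the defining TU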

c10/cuda/CUDAMallocAsyncAllocator.cpp (53 additions, 43 deletions)
@@ -45,8 +45,21 @@ struct UsageStream {
   }
 };
 
+bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
+  return (lhs.stream == rhs.stream) && (lhs.device == rhs.device);
+}
+
+struct UsageStreamHash {
+  size_t operator()(const UsageStream& us) const noexcept {
+    return std::hash<void*>{}(us.stream) + size_t(us.device);
+  }
+};
+
 struct PtrUsage {
-  std::vector<UsageStream> usage_streams;
+  // recorded_streams holds side usage streams added by record_stream calls.
+  // In other words, it does NOT include the original creation stream.
+  ska::flat_hash_set<UsageStream, UsageStreamHash> recorded_streams;
+  UsageStream creation_stream;
   uint64_t size;
   bool captured;
   PtrUsage(uint64_t s, bool c) : size(s), captured(c) {}
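Aside: ska::flat_hash_set comes from the vendored flat_hash_map library and is API-compatible with std::unordered_set, so the same custom-hash pattern can be sketched with the standard container (toy types; the real stream member is a cudaStream_t). Unlike the old vector, the set also deduplicates repeated record_stream calls for free:

#include <cstddef>
#include <cstdio>
#include <functional>
#include <unordered_set>

struct UsageStream {
  void* stream; // stand-in for cudaStream_t
  int device;
};

bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
  return (lhs.stream == rhs.stream) && (lhs.device == rhs.device);
}

struct UsageStreamHash {
  size_t operator()(const UsageStream& us) const noexcept {
    return std::hash<void*>{}(us.stream) + size_t(us.device);
  }
};

int main() {
  // std::unordered_set used as a drop-in stand-in for ska::flat_hash_set.
  std::unordered_set<UsageStream, UsageStreamHash> recorded_streams;
  UsageStream s{nullptr, 1};
  recorded_streams.insert(s);
  recorded_streams.insert(s); // duplicate record: deduplicated by the set
  std::printf("%zu\n", recorded_streams.size()); // 1
}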
@@ -128,16 +141,6 @@ std::vector<size_t> pytorch_memory_limits;
  * carefully about the CPU overhead of remembering and rejoining
  * all free streams during capture. Maybe it's not a big deal.
  */
-bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
-  return (lhs.stream == rhs.stream) && (lhs.device == rhs.device);
-}
-
-struct UsageStreamHash {
-  size_t operator()(const UsageStream& us) const noexcept {
-    return std::hash<void*>{}(us.stream) + size_t(us.device);
-  }
-};
-
 std::unordered_set<UsageStream, UsageStreamHash> capture_free_streams;
 bool capture_underway = false;
 
@@ -180,26 +183,36 @@ inline void lazy_init_device(int device) {
   }
 }
 
+inline void sync_raw(cudaStream_t dependency, cudaStream_t dependent) {
+  // CUDACachingAllocator.cpp uses raw cuda events, as do we.
+  cudaEvent_t event;
+  C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+  C10_CUDA_CHECK(cudaEventRecord(event, dependency));
+  C10_CUDA_CHECK(cudaStreamWaitEvent(dependent, event));
+  C10_CUDA_CHECK(cudaEventDestroy(event));
+}
+
 // Assumes the caller holds general_mutex
 inline void free_impl(PtrInfo::iterator& it) {
   // Possible micro-optimization: If we did a value-copy here, we could move
   // ptr_info.erase(it) up here and drop the lock immediately.
-  const auto& usage_streams = it->second.usage_streams;
+  const auto& recorded_streams = it->second.recorded_streams;
+  const auto& creation_stream = it->second.creation_stream;
 
   // If the usage stream is a null (default) stream,
   // cudaFreeAsync infers the device from the ambient context,
   // so we need to set the right ambient context.
-  CUDAGuard g(usage_streams[0].device);
+  CUDAGuard g(creation_stream.device);
 
-  if (usage_streams.size() == 1) {
+  if (recorded_streams.size() == 0) {
     // ptr was only used on one stream, which must have been
     // the original allocation stream.
     // Frees ptr in the original allocation stream.
-    C10_CUDA_CHECK(cudaFreeAsync(it->first, usage_streams[0].stream));
+    C10_CUDA_CHECK(cudaFreeAsync(it->first, creation_stream.stream));
 
     if (C10_UNLIKELY(capture_underway)) {
       // See Note [Avoid dangling free streams during CUDA graph capture]
-      capture_free_streams.insert(usage_streams[0]);
+      capture_free_streams.insert(creation_stream);
     }
   } else {
     // ptr was used on many streams. We don't know which was the most recent.
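Aside: sync_raw is the stock event-based stream-ordering pattern that the hunks below reuse. A self-contained sketch with plain CUDA runtime calls, dropping the C10_CUDA_CHECK wrappers (assumes a CUDA-capable device):

#include <cuda_runtime.h>

// Make `dependent` wait for all work currently enqueued on `dependency`,
// the same pattern the new sync_raw helper wraps.
void sync_raw(cudaStream_t dependency, cudaStream_t dependent) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event, dependency);
  cudaStreamWaitEvent(dependent, event);
  cudaEventDestroy(event);
}

int main() {
  cudaStream_t a, b;
  cudaStreamCreate(&a);
  cudaStreamCreate(&b);
  // ... enqueue work on a ...
  sync_raw(a, b); // anything later enqueued on b now runs after a's work
  cudaStreamDestroy(a);
  cudaStreamDestroy(b);
}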
@@ -212,23 +225,21 @@ inline void free_impl(PtrInfo::iterator& it) {
 
     // Retrieves the dummy "unifier" stream from the device
     // on which the pointer was originally allocated.
-    auto dummy_unifying_free_stream = dummy_unifying_free_streams[usage_streams[0].device];
-    TORCH_INTERNAL_ASSERT(dummy_unifying_free_stream.device == usage_streams[0].device);
+    auto dummy_unifying_free_stream = dummy_unifying_free_streams[creation_stream.device];
+    TORCH_INTERNAL_ASSERT(dummy_unifying_free_stream.device == creation_stream.device);
+
+    // we're already on creation_stream.device, no need to re-guard
+    sync_raw(creation_stream.stream, dummy_unifying_free_stream.stream);
 
     // The number of usage streams is typically small (low single digits)
-    for (const auto& usage_stream : usage_streams) {
+    for (const auto& recorded_stream : recorded_streams) {
       // Logic here accommodates the chance some of the usage streams were on other devices,
       // which is possible if some usage kernels accessed the memory via p2p.
 
       // cudaEventRecord requires that the input event and stream are on the same device.
-      CUDAGuard g_usage(usage_stream.device);
-
-      // CUDACachingAllocator.cpp uses raw cuda events, as do we.
-      cudaEvent_t event;
-      C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-      C10_CUDA_CHECK(cudaEventRecord(event, usage_stream.stream));
-      C10_CUDA_CHECK(cudaStreamWaitEvent(dummy_unifying_free_stream.stream, event));
-      C10_CUDA_CHECK(cudaEventDestroy(event));
+      CUDAGuard g_usage(recorded_stream.device);
+
+      sync_raw(recorded_stream.stream, dummy_unifying_free_stream.stream);
     }
 
     // Frees ptr in the dummy "unifier" stream.
@@ -240,10 +251,10 @@ inline void free_impl(PtrInfo::iterator& it) {
     // In theory, we could remove the need for the driver to do this tracking by e.g. replacing
     // cudaStreamWaitEvent(dummy_unifying_free_stream.stream, event);
     // with
-    // cudaStreamWaitEvent(usage_streams[0].stream, event);
-    // then cudaFreeAsyncing straight back into usage_streams[0];
-    // but this forces a potentially false dependency of usage_streams[0]
-    // on all the other usage_streams.
+    // cudaStreamWaitEvent(creation_stream.stream, event);
+    // then cudaFreeAsyncing straight back into creation_stream.stream,
+    // but this forces a potentially false dependency of creation_stream.stream
+    // on all the recorded_streams.
 
     if (C10_UNLIKELY(capture_underway)) {
       // See Note [Avoid dangling free streams during CUDA graph capture]
@@ -252,7 +263,7 @@ inline void free_impl(PtrInfo::iterator& it) {
     }
   }
 
-  pytorch_used_bytes[usage_streams[0].device] -= it->second.size;
+  pytorch_used_bytes[creation_stream.device] -= it->second.size;
 
   ptr_info.erase(it);
 }
@@ -263,8 +274,6 @@ void free(void* ptr) {
   auto it = ptr_info.find(ptr);
   TORCH_INTERNAL_ASSERT(it != ptr_info.end(),
                         "ptr not found in ptr_info");
-  TORCH_INTERNAL_ASSERT(it->second.usage_streams.size() != 0,
-                        "ptr's stream uses vector is empty");
 
   if (C10_UNLIKELY(capture_underway)) {
     if (!it->second.captured) {
@@ -354,7 +363,7 @@ void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) {
                         "address returned by cudaMallocAsync already exists "
                         "in ptr_info");
 
-  inserted.first->second.usage_streams.emplace_back(stream, device);
+  inserted.first->second.creation_stream = {stream, device};
 
   pytorch_used_bytes[device] += size;
 }
@@ -394,7 +403,7 @@ Allocator* get(void) {
 // just set up for later calls to init per-device pools based
 // on the current device each later call sees.
 void init(int dev_count) {
-  static bool called = [] {;
+  static bool called = [](int dev_count) {;
     // Are there external guarantees init will be called before
     // any of the allocator's other functions?
     // std::lock_guard<std::mutex> lk(general_mutex);
@@ -404,7 +413,7 @@ void init(int dev_count) {
     pytorch_used_bytes.resize(dev_count);
     pytorch_memory_limits.resize(dev_count);
     return true;
-  }();
+  }(dev_count);
 }
 
 static inline void assertValidDevice(int device) {
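Aside: the init change is the call-once idiom. A captureless lambda cannot name the enclosing function's dev_count, so the value is passed as an argument; because the result initializes a function-local static, the body still runs exactly once. A minimal sketch, simplified from this file (hypothetical reduction, not the real init):

#include <cstddef>
#include <cstdio>
#include <vector>

std::vector<std::size_t> pytorch_used_bytes;

void init(int dev_count) {
  // Runs once, on the first call; later calls skip the lambda entirely.
  // The parameter forwards dev_count into the captureless lambda.
  static bool called = [](int n) {
    pytorch_used_bytes.resize(n);
    return true;
  }(dev_count);
  (void)called;
}

int main() {
  init(4);
  init(8); // ignored: the static initializer already ran
  std::printf("%zu\n", pytorch_used_bytes.size()); // 4
}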
@@ -532,11 +541,14 @@ void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) {
   auto it = ptr_info.find(ptr.get());
   TORCH_INTERNAL_ASSERT(it != ptr_info.end(),
                         "ptr not found in ptr_info");
-  TORCH_INTERNAL_ASSERT(it->second.usage_streams.size() != 0,
-                        "ptr's stream uses vector is empty");
 
-  it->second.usage_streams.emplace_back(stream.stream(),
-                                        stream.device_index());
+  UsageStream to_record{stream.stream(), stream.device_index()};
+  if (to_record == it->second.creation_stream) {
+    TORCH_WARN("Called record_stream on tensor whose original creation stream "
+               "matches the recorded stream. This is unnecessary and has no effect.");
+  } else {
+    it->second.recorded_streams.insert(to_record);
+  }
 }
 
 std::mutex* getFreeMutex() {
@@ -700,8 +712,6 @@ void notifyCaptureEnded(int device, CaptureId_t graph_id) {
     auto it = ptr_info.find(ptr);
     TORCH_INTERNAL_ASSERT(it != ptr_info.end(),
                           "ptr not found in ptr_info");
-    TORCH_INTERNAL_ASSERT(it->second.usage_streams.size() != 0,
-                          "ptr's stream uses vector is empty");
     free_impl(it);
   }
 
0 commit comments