Update on "[async-tp] fix a race condition that can cause silent correctness issue" · pytorch/pytorch@4f2957e · GitHub

Commit 4f2957e

Authored and committed by Yifu Wang
Update on "[async-tp] fix a race condition that can cause silent correctness issue"
Details described in #137171: ![image](https://github.com/user-attachments/assets/8247b4f1-7805-4585-9d72-05e9475f218b)

Fix: we introduce the following invariants in `_pipelined_all_gather_and_consume` and `_pipelined_produce_and_all2all`:

- Before any stream writes to/reads from p2p buffers, perform a barrier on channel 0 on the launch stream.
- After all streams have completed writing to/reading from p2p buffers, perform a barrier on channel 0 on the launch stream.

NOTE: This fix only focuses on addressing the race condition. Some barriers are exposed (they can be hidden by computation), and we'll optimize them in subsequent PRs.

cc XilunWu H-Huang awgu kwen2501 wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o

[ghstack-poisoned]
2 parents 9537bc7 + 8472482 commit 4f2957e
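To make the two invariants above concrete, here is a minimal Python sketch of the barrier pattern. It is not the actual `_pipelined_all_gather_and_consume` implementation: the `symm_mem` handle with a collective `barrier(channel=...)` method, the `streams` list, and the `consume` callback are assumptions for illustration only.

```python
import torch

def pipelined_consume_sketch(symm_mem, p2p_bufs, chunks, consume, streams):
    # Hypothetical sketch: `symm_mem` is assumed to expose a collective
    # barrier(channel=...) across ranks; `streams` are the side streams that
    # copy into / read from the peer-to-peer (p2p) buffers.
    launch_stream = torch.cuda.current_stream()

    # Invariant 1: before ANY stream writes to or reads from the p2p buffers,
    # barrier on channel 0 from the launch stream, so no rank starts
    # overwriting a buffer that a peer is still using from a previous call.
    symm_mem.barrier(channel=0)

    for i, stream in enumerate(streams):
        stream.wait_stream(launch_stream)  # side streams start after the barrier
        with torch.cuda.stream(stream):
            p2p_bufs[i].copy_(chunks[i])   # write into the shared p2p buffer
            consume(p2p_bufs[i])           # read it back / feed the consumer

    # Join all side streams back onto the launch stream.
    for stream in streams:
        launch_stream.wait_stream(stream)

    # Invariant 2: after ALL streams are done with the p2p buffers, barrier on
    # channel 0 again so no rank races ahead into the next use of the buffers.
    symm_mem.barrier(channel=0)
```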


42 files changed: +910 −690 lines

aten/src/ATen/native/cuda/ForeachBinaryOpList.cu

Lines changed: 39 additions & 63 deletions
@@ -285,64 +285,44 @@ struct Copy<dst_t, c10::complex<float>> {
 }
 };
 
-#define AT_DISPATCH_SOURCE_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH( \
-      TYPE, \
-      NAME, \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Byte, \
-          src_t, \
-          __VA_ARGS__) AT_PRIVATE_CASE_TYPE_USING_HINT(at::ScalarType::Char, src_t, __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Long, src_t, __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Short, src_t, __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Int, src_t, __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Double, src_t, __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Float, src_t, __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::ComplexDouble, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::ComplexFloat, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Half, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::BFloat16, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType::Bool, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType:: \
-              Float8_e4m3fn, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType:: \
-              Float8_e4m3fnuz, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType:: \
-              Float8_e5m2, \
-          src_t, \
-          __VA_ARGS__) \
-      AT_PRIVATE_CASE_TYPE_USING_HINT( \
-          at::ScalarType:: \
-              Float8_e5m2fnuz, \
-          src_t, \
-          __VA_ARGS__))
+#define AT_DISPATCH_SOURCE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH( \
+      TYPE, \
+      NAME, \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Byte, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Char, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Long, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Short, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Int, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Double, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Float, src_t, __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::ComplexDouble, \
+          src_t, \
+          __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::ComplexFloat, \
+          src_t, \
+          __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Half, \
+          src_t, \
+          __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::BFloat16, \
+          src_t, \
+          __VA_ARGS__) \
+      AT_PRIVATE_CASE_TYPE_USING_HINT( \
+          at::ScalarType::Bool, \
+          src_t, \
+          __VA_ARGS__))
 
 namespace {
 
@@ -430,14 +410,10 @@ void foreach_tensor_copy_list_kernel_cuda_(
 
   std::vector<std::vector<at::Tensor>> tensor_lists{src.vec(), self.vec()};
 
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND7(
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
       ScalarType::Half,
       ScalarType::BFloat16,
       ScalarType::Bool,
-      ScalarType::Float8_e4m3fn,
-      ScalarType::Float8_e4m3fnuz,
-      ScalarType::Float8_e5m2,
-      ScalarType::Float8_e5m2fnuz,
      self[0].scalar_type(),
      "foreach_tensor_copy",
      [&]() {
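As background for the dispatch change above, here is a minimal sketch of the Python-level op these kernels back, `torch._foreach_copy_`. The shapes and dtypes are illustrative only; this snippet is not part of the commit.

```python
import torch

# torch._foreach_copy_(self, src) copies each src[i] into self[i] using fused
# CUDA kernels; the AT_DISPATCH_* switch above selects the instantiation for the
# destination dtype. After this change the Float8 variants are no longer part of
# that dispatch, while the standard types plus Half/BFloat16/Bool (the AND3 set)
# remain covered.
if torch.cuda.is_available():
    dtypes = (torch.float16, torch.bfloat16)
    dst = [torch.zeros(2, 2, device="cuda", dtype=d) for d in dtypes]
    src = [torch.rand(2, 2, device="cuda").to(d) for d in dtypes]
    torch._foreach_copy_(dst, src)
    assert all(torch.equal(a, b) for a, b in zip(dst, src))
```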

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 21 additions & 52 deletions
@@ -3127,28 +3127,12 @@ class DeviceCachingAllocator {
 // Returns whether to force all allocations to bypass the caching allocator and
 // go straight to cudaMalloc. This setting is useful when debugging GPU memory
 // errors, since the caching allocator foils cuda-memcheck.
-static bool forceUncachedAllocator() {
-  // Allow either CUDA or HIP name for env var for maximum user comfort
-  // the CUDA env var avoids being hipified in cuda_to_hip_mappings.py
-  static const char* cuda_env = getenv("PYTORCH_NO_CUDA_MEMORY_CACHING");
-  static const char* rocm_env = getenv("PYTORCH_NO_HIP_MEMORY_CACHING");
-  static bool force_uncached = (cuda_env != nullptr) || (rocm_env != nullptr);
+bool forceUncachedAllocator() {
+  static bool force_uncached =
+      getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr;
   return force_uncached;
 }
 
-static void* uncached_allocate(size_t size) {
-  void* devPtr = nullptr;
-  // Deliberately don't use cudaMallocMaybeCapturing here, to force an error
-  // if someone tries to use forceUncachedAllocator while capturing.
-  C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
-  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
-  if (C10_UNLIKELY(interp)) {
-    (*interp)->trace_gpu_memory_allocation(
-        c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
-  }
-  return devPtr;
-}
-
 static void uncached_delete(void* ptr) {
   if (TORCH_SDT_IS_ENABLED(free)) {
     TORCH_SDT_WITH_SEMAPHORE(free, ptr);
@@ -3166,9 +3150,6 @@ void local_raw_delete(void* ptr);
 
 class NativeCachingAllocator : public CUDAAllocator {
  private:
-  // allows this allocator to be turned on and off programmatically
-  bool enable_ = true;
-
  // Shard allocation region to have independent mutexes to reduce contention.
  static constexpr size_t kNumMutexShard = 67;
 
@@ -3343,14 +3324,6 @@ class NativeCachingAllocator : public CUDAAllocator {
    da->emptyCache();
  }
 
-  void enable(bool value) override {
-    enable_ = value;
-  }
-
-  bool isEnabled() const override {
-    return enable_;
-  }
-
  void* getBaseAllocation(void* ptr, size_t* outSize) override {
    Block* block = get_allocated_block(ptr);
    if (!block) {
@@ -3485,9 +3458,17 @@ class NativeCachingAllocator : public CUDAAllocator {
    void (*deleteFunc)(void*) = &local_raw_delete;
    CUDAStream stream = cuda::getCurrentCUDAStream(device);
 
-    if (forceUncachedAllocator() || !isEnabled()) {
+    if (forceUncachedAllocator()) {
      deleteFunc = &uncached_delete;
-      devPtr = uncached_allocate(size);
+
+      // Deliberately don't use cudaMallocMaybeCapturing here, to force an error
+      // if someone tries to use forceUncachedAllocator while capturing.
+      C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_memory_allocation(
+            c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
+      }
    } else {
      if (size != 0) {
        this->malloc(&devPtr, device, size, stream);
@@ -3501,7 +3482,7 @@ class NativeCachingAllocator : public CUDAAllocator {
    return {devPtr, devPtr, deleteFunc, Device(DeviceType::CUDA, device)};
  }
  DeleterFnPtr raw_deleter() const override {
-    if (forceUncachedAllocator() || !isEnabled()) {
+    if (forceUncachedAllocator()) {
      return &uncached_delete;
    } else {
      return &local_raw_delete;
@@ -3558,29 +3539,21 @@ class NativeCachingAllocator : public CUDAAllocator {
    if (nbytes == 0) {
      return nullptr;
    }
+    c10::DeviceIndex device = 0;
+    C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
    void* r = nullptr;
-    if (forceUncachedAllocator() || !isEnabled()) {
-      r = uncached_allocate(nbytes);
-    } else {
-      c10::DeviceIndex device = 0;
-      C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-      malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
-    }
+    malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
    return r;
  }
 
  void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override {
    if (nbytes == 0) {
      return nullptr;
    }
+    c10::DeviceIndex device = 0;
+    C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
    void* r = nullptr;
-    if (forceUncachedAllocator() || !isEnabled()) {
-      r = uncached_allocate(nbytes);
-    } else {
-      c10::DeviceIndex device = 0;
-      C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-      malloc(&r, device, nbytes, stream);
-    }
+    malloc(&r, device, nbytes, stream);
    return r;
  }
 
@@ -3625,11 +3598,7 @@ class NativeCachingAllocator : public CUDAAllocator {
  }
 
  void raw_delete(void* ptr) override {
-    if (forceUncachedAllocator() || !isEnabled()) {
-      uncached_delete(ptr);
-    } else {
-      this->free(ptr);
-    }
+    this->free(ptr);
  }
 
  // In CUDA IPC, sender sends a tensor to receiver via shareIPCHandle,
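For reference, a minimal sketch of how the uncached path guarded by `forceUncachedAllocator()` is typically exercised from Python. This assumes a CUDA build; the `memory_reserved()` expectation is illustrative and not something this diff asserts.

```python
import os

# Must be set before the allocator is first used: forceUncachedAllocator() caches
# the getenv() result in a function-local static on its first call.
os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"

import torch

x = torch.empty(1024, device="cuda")   # served directly by cudaMalloc (uncached path)
del x                                  # released right away via uncached_delete -> cudaFree
print(torch.cuda.memory_reserved())    # expected to remain 0, since nothing is cached
```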

c10/cuda/CUDACachingAllocator.h

Lines changed: 0 additions & 10 deletions
@@ -206,8 +206,6 @@ class CUDAAllocator : public Allocator {
  virtual bool initialized() = 0;
  virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
  virtual void emptyCache() = 0;
-  virtual void enable(bool value) = 0;
-  virtual bool isEnabled() const = 0;
  virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
@@ -329,14 +327,6 @@ inline void emptyCache() {
  return get()->emptyCache();
 }
 
-inline void enable(bool value) {
-  return get()->enable(value);
-}
-
-inline bool isEnabled() {
-  return get()->isEnabled();
-}
-
 inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
  return get()->cacheInfo(device, largestBlock);
 }

c10/cuda/CUDAMallocAsyncAllocator.cpp

Lines changed: 0 additions & 8 deletions
@@ -496,14 +496,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
    }
  }
 
-  void enable(bool) override {
-    // cannot disable
-  }
-
-  bool isEnabled() const override {
-    return true;
-  }
-
  void cacheInfo(c10::DeviceIndex device, size_t* maxWorkspaceGuess) override {
    // The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp.
    // Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable

docs/source/cuda.rst

Lines changed: 0 additions & 9 deletions
@@ -123,15 +123,6 @@ Memory management
     MemPool
     MemPoolContext
 
-.. currentmodule:: torch.cuda.memory
-
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-
-    caching_allocator_enable
-
-.. currentmodule:: torch.cuda
 .. autoclass:: torch.cuda.use_mem_pool
 
 .. FIXME The following doesn't seem to exist. Is it supposed to?

test/distributed/_composable/fully_shard/test_fully_shard_util.py

Lines changed: 1 addition & 37 deletions
@@ -2,8 +2,6 @@
 
 import sys
 
-import pytest
-
 import torch
 import torch.distributed as dist
 from torch.distributed._composable import fully_shard
@@ -14,14 +12,7 @@
 from torch.testing._internal.common_dist_composable import CompositeModel, UnitModule
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
-from torch.testing._internal.common_utils import (
-    run_tests,
-    TEST_WITH_DEV_DBG_ASAN,
-    TestCase,
-)
-
-
-is_cuda_8_9 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 
 if not dist.is_available():
@@ -121,32 +112,5 @@ def test_get_sharded_module_tree_with_module_name_to_fqns(self):
         )
 
 
-class TestUtilsSingleDevice(TestCase):
-    @pytest.mark.skipif(not is_cuda_8_9, reason="requires SM89 compatible machine")
-    def test_foreach_copy_float8(self):
-        for dtype in [
-            torch.float8_e4m3fn,
-            torch.float8_e4m3fnuz,
-            torch.float8_e5m2,
-            torch.float8_e5m2fnuz,
-        ]:
-            src = [torch.rand(2, 2, device="cuda").to(dtype)] * 2
-            dst = [torch.zeros(2, 2, device="cuda").to(dtype)] * 2
-            # needed by fully_shard(Float8Linear)
-            torch._foreach_copy_(src, dst)
-            for s, d in zip(src, dst):
-                self.assertEqual(s, d)
-                torch.equal(src[0], dst[0])
-
-            src = [torch.rand(2, 2, device="cpu").to(dtype)] * 2
-            dst = [torch.zeros(2, 2, device="cpu").to(dtype)] * 2
-            # needed by fully_shard(Float8Linear)
-            torch._foreach_copy_(src, dst)
-            for s, d in zip(src, dst):
-                # did not use torch.equal because
-                # "equal_cpu" not implemented
-                assert torch.all(s == d).item()
-
-
 if __name__ == "__main__":
     run_tests()

test/distributed/_tensor/test_dtensor_ops.py

Lines changed: 0 additions & 5 deletions
@@ -314,12 +314,8 @@ def wrapped(fn):
     xfail("nn.functional.huber_loss"),
     xfail("nn.functional.instance_norm"),
     xfail("nn.functional.interpolate", "area"),
-    xfail("nn.functional.interpolate", "bicubic"),
-    xfail("nn.functional.interpolate", "bilinear"),
-    xfail("nn.functional.interpolate", "linear"),
     xfail("nn.functional.interpolate", "nearest"),
     xfail("nn.functional.interpolate", "nearest-exact"),
-    xfail("nn.functional.interpolate", "trilinear"),
     xfail("nn.functional.leaky_relu"),
     xfail("nn.functional.linear"),
     xfail("nn.functional.local_response_norm"),
@@ -361,7 +357,6 @@ def wrapped(fn):
     xfail("nn.functional.triplet_margin_loss"),
     xfail("nn.functional.triplet_margin_with_distance_loss"),
     xfail("nn.functional.unfold"),
-    xfail("nn.functional.upsample_bilinear"),
     xfail("nn.functional.upsample_nearest"),
     xfail("nonzero"),
     xfail("normal"),

0 commit comments