#if CUDA_VERSION >= 11040 · pytorch/pytorch@4cbde6f · GitHub
[go: up one dir, main page]

Skip to content

Commit 4cbde6f

Browse files
committed
#if CUDA_VERSION >= 11040
1 parent 09709d5 commit 4cbde6f

File tree

4 files changed

+7
-7
lines changed

4 files changed

+7
-7
lines changed

aten/src/ATen/cuda/PeerToPeerAccess.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ bool get_p2p_access(int dev, int dev_to_access) {
3939
dev_to_access, " is not a device");
4040
TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized");
4141

42-
#if CUDA_VERSION > 11040
42+
#if CUDA_VERSION >= 11040
4343
static bool using_cudaMallocAsync = std::strcmp(CUDACachingAllocator::allocatorBackend(),
4444
"cudaMallocAsync") == 0;
4545
#endif
@@ -55,7 +55,7 @@ bool get_p2p_access(int dev, int dev_to_access) {
5555
int access = 0;
5656
C10_CUDA_CHECK(cudaDeviceCanAccessPeer(&access, dev, dev_to_access));
5757
if (access) {
58-
#if CUDA_VERSION > 11040
58+
#if CUDA_VERSION >= 11040
5959
if (using_cudaMallocAsync) {
6060
// cudaMallocAsync pools are unaffected by cudaDeviceEnablePeerAccess.
6161
// We need pool-specific enablement. See
@@ -78,7 +78,7 @@ bool get_p2p_access(int dev, int dev_to_access) {
7878
} else {
7979
C10_CUDA_CHECK(err);
8080
}
81-
#if CUDA_VERSION > 11040
81+
#if CUDA_VERSION >= 11040
8282
}
8383
#endif
8484
cache = 1;

aten/src/ATen/native/cuda/Copy.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ void copy_device_to_device(TensorIterator& iter,
9393
void *src = iter.data_ptr(1);
9494
size_t size = numel * iter.element_size(0);
9595
if (src != dst || src_device != dst_device) {
96-
#if CUDA_VERSION > 11040
96+
#if CUDA_VERSION >= 11040
9797
// Due to bizarre cuda driver intricacies, copies of
9898
// cudaMallocAsynced memory between devices that aren't
9999
// peer-to-peer-capable need "cudaMemcpyPeerAsync".
@@ -113,7 +113,7 @@ void copy_device_to_device(TensorIterator& iter,
113113
dst, src, size,
114114
cudaMemcpyDeviceToDevice,
115115
copy_stream));
116-
#if CUDA_VERSION > 11040
116+
#if CUDA_VERSION >= 11040
117117
}
118118
#endif
119119
}

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1660,7 +1660,7 @@ void parseArgs() {
16601660
m_allocator_backend = kv[1];
16611661
used_cudaMallocAsync = (kv[1].compare("cudaMallocAsync") == 0);
16621662
if (used_cudaMallocAsync) {
1663-
#if CUDA_VERSION > 11040
1663+
#if CUDA_VERSION >= 11040
16641664
int version;
16651665
C10_CUDA_CHECK(cudaDriverGetVersion(&version));
16661666
TORCH_CHECK(version >= 11040,

c10/cuda/CUDAMallocAsyncAllocator.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace cuda {
1515
namespace CUDACachingAllocator {
1616
namespace CudaMallocAsync {
1717

18-
#if CUDA_VERSION > 11040
18+
#if CUDA_VERSION >= 11040
1919
// CUDA device allocator that uses cudaMallocAsync to implement
2020
// the same interface as CUDACachingAllocator.cpp.
2121

0 commit comments

Comments (0)