@@ -5,11 +5,13 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
 #include <ATen/cuda/PeerToPeerAccess.h>
-#include <c10/cuda/CUDAStream.h>
 #include <ATen/native/Copy.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAStream.h>
+
 namespace at {
 namespace native {
 
@@ -41,7 +43,9 @@ void neg_conj_kernel_cuda(TensorIteratorBase &iter) {
 using namespace at::cuda;
 
 // device-to-device copy, does type conversion
-void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
+void copy_device_to_device(TensorIterator& iter,
+                           bool non_blocking,
+                           bool p2p_enabled) {
   int64_t numel = iter.numel();
 
   // We can memcpy the memory if both tensors have the same type AND both
@@ -82,11 +86,29 @@ void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
     void *src = iter.data_ptr(1);
     size_t size = numel * iter.element_size(0);
     if (src != dst || src_device != dst_device) {
-      // Perform the copy
-      AT_CUDA_CHECK(cudaMemcpyAsync(
-          dst, src, size,
-          cudaMemcpyDeviceToDevice,
-          copy_stream));
+#if CUDA_VERSION > 11040
+      // Due to bizarre cuda driver intricacies, copies of
+      // cudaMallocAsynced memory between devices that aren't
+      // peer-to-peer-capable need "cudaMemcpyPeerAsync".
+      static bool using_cudaMallocAsync = std::strcmp(CUDACachingAllocator::allocatorBackend(),
+                                                      "cudaMallocAsync") == 0;
+      bool needs_MemcpyPeer = (src_device != dst_device &&
+                               using_cudaMallocAsync &&
+                               !p2p_enabled);
+      if (needs_MemcpyPeer) {
+        AT_CUDA_CHECK(cudaMemcpyPeerAsync(
+            dst, dst_device.index(),
+            src, src_device.index(),
+            size, copy_stream));
+      } else {
+#endif
+        AT_CUDA_CHECK(cudaMemcpyAsync(
+            dst, src, size,
+            cudaMemcpyDeviceToDevice,
+            copy_stream));
+#if CUDA_VERSION > 11040
+      }
+#endif
     }
   } else {
     if (same_neg) {
@@ -199,7 +221,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
 
   // Copy on GPU (or between GPUs)
   if (dst_device.is_cuda() && src_device.is_cuda()) {
-    copy_device_to_device(iter, non_blocking);
+    copy_device_to_device(iter, non_blocking, p2p_enabled);
    return;
   }
 