pytorch/pytorch · commit 3d23053 (1 parent: a006a53)

Add Python-facing torch.cuda.get_allocator_backend()
5 files changed (+34, -3 lines)

docs/source/cuda.rst (1 addition, 0 deletions)

@@ -111,6 +111,7 @@ Memory management
     reset_peak_memory_stats
     caching_allocator_alloc
     caching_allocator_delete
+    get_allocator_backend
 .. FIXME The following doesn't seem to exist. Is it supposed to?
    https://github.com/pytorch/pytorch/issues/27785
 .. autofunction:: reset_max_memory_reserved

docs/source/notes/cuda.rst (2 additions, 0 deletions)

@@ -365,6 +365,8 @@ Available options:
   implementation, and ``cudaMallocAsync``, which uses
   `CUDA's built-in asynchronous allocator`_.
   ``cudaMallocAsync`` requires CUDA 11.4 or newer. The default is ``native``.
+  ``backend`` applies to all devices used by the process, and can't be
+  specified on a per-device basis.
 * ``max_split_size_mb`` prevents the native allocator
   from splitting blocks larger than this size (in MB). This can reduce
   fragmentation and may allow some borderline workloads to complete without
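The note added above means the backend choice is made once for the whole process. A minimal sketch of how that looks in practice, assuming the environment variable is set before the first CUDA allocation (the tensor and the print are placeholders):

    import os

    # Process-wide backend selection; it cannot differ per device and must be
    # in place before the CUDA caching allocator is initialized.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

    import torch  # imported after the env var so the allocator sees it

    if torch.cuda.is_available():
        torch.empty(8, device="cuda")              # first allocation initializes the allocator
        print(torch.cuda.get_allocator_backend())  # expected: "cudaMallocAsync" (CUDA 11.4+)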

test/test_cuda.py (1 addition, 3 deletions)

@@ -44,9 +44,7 @@
     print('CUDA not available, skipping tests', file=sys.stderr)
     TestCase = object  # noqa: F811

-TEST_CUDAMALLOCASYNC = ((os.getenv("PYTORCH_CUDA_ALLOC_CONF") is not None) and
-                        ("backend:cudaMallocAsync" in os.getenv("PYTORCH_CUDA_ALLOC_CONF")))
-
+TEST_CUDAMALLOCASYNC = (torch.cuda.get_allocator_backend() == "cudaMallocAsync")
 TEST_LARGE_TENSOR = TEST_CUDA
 TEST_MEDIUM_TENSOR = TEST_CUDA
 TEST_CUDNN = TEST_CUDA
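For context, a flag like TEST_CUDAMALLOCASYNC is typically consumed with unittest.skipIf. The test below is a hypothetical illustration of that pattern, not a test from the suite:

    import unittest
    import torch

    TEST_CUDA = torch.cuda.is_available()
    TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == "cudaMallocAsync")

    class TestAllocatorSpecific(unittest.TestCase):
        @unittest.skipIf(not TEST_CUDA, "requires CUDA")
        @unittest.skipIf(TEST_CUDAMALLOCASYNC, "relies on native-allocator behavior")
        def test_native_allocator_stats(self):
            # Hypothetical check that only applies to the native caching allocator.
            torch.empty(1024, device="cuda")
            stats = torch.cuda.memory_stats()
            self.assertIn("active_bytes.all.current", stats)

    if __name__ == "__main__":
        unittest.main()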

torch/csrc/cuda/Module.cpp (19 additions, 0 deletions)

@@ -225,6 +225,24 @@ PyObject * THCPModule_cudaCachingAllocator_raw_delete(PyObject *_unused, PyObjec
   END_HANDLE_TH_ERRORS
 }

+PyObject * THCPModule_getAllocatorBackend(PyObject *_unused, PyObject *noargs)
+{
+  HANDLE_TH_ERRORS
+  using c10::cuda::CUDACachingAllocator::AllocatorBackend;
+  AllocatorBackend backend = c10::cuda::CUDACachingAllocator::allocatorBackend();
+  // this call should be uncommon, don't bother interning strings
+  switch (backend) {
+    case AllocatorBackend::NATIVE:
+      return THPUtils_packString("native");
+    case AllocatorBackend::CUDAMALLOCASYNC:
+      return THPUtils_packString("cudaMallocAsync");
+    default:
+      THPUtils_assert(false, "Unexpected value for backend");
+      return nullptr;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject * THCPModule_cudaSynchronize(PyObject *_unused, PyObject *noargs)
 {
   HANDLE_TH_ERRORS

@@ -590,6 +608,7 @@ static struct PyMethodDef _THCPModule_methods[] = {
   {"_cuda_cudaHostAllocator", THCPModule_cudaHostAllocator, METH_NOARGS, nullptr},
   {"_cuda_cudaCachingAllocator_raw_alloc", THCPModule_cudaCachingAllocator_raw_alloc, METH_VARARGS, nullptr},
   {"_cuda_cudaCachingAllocator_raw_delete", THCPModule_cudaCachingAllocator_raw_delete, METH_O, nullptr},
+  {"_cuda_getAllocatorBackend", THCPModule_getAllocatorBackend, METH_NOARGS, nullptr},
   {"_cuda_synchronize", THCPModule_cudaSynchronize, METH_NOARGS, nullptr},
   {"_cuda_ipc_collect", THCPModule_cudaIPCCollect, METH_NOARGS, nullptr},
   {"_cuda_sleep", THCPModule_cudaSleep, METH_O, nullptr},

torch/cuda/memory.py (11 additions, 0 deletions)

@@ -587,3 +587,14 @@ def mem_get_info(device: Union[Device, int] = None) -> int:
         device = torch.cuda.current_device()
     device = _get_device_index(device)
     return torch.cuda.cudart().cudaMemGetInfo(device)
+
+def get_allocator_backend() -> str:
+    r"""Returns a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
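A short usage sketch of the new public API; the branches are placeholders for whatever backend-specific behavior a caller might want:

    import torch

    if torch.cuda.is_available():
        backend = torch.cuda.get_allocator_backend()
        if backend == "native":
            # e.g. options such as max_split_size_mb only apply to the native allocator
            print("using PyTorch's native caching allocator")
        else:  # "cudaMallocAsync"
            print("using CUDA's built-in asynchronous allocator")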
