Add beginnings of torch::stable::accelerator (#159679) · pytorch/pytorch@e4e4dbd · GitHub
[go: up one dir, main page]

Skip to content

Commit e4e4dbd

Browse files
mikaylagawareckipytorchmergebot
authored andcommitted
Add beginnings of torch::stable::accelerator (#159679)
Adds - `torch::stable::accelerator::DeviceGuard`: `std::unique_ptr` to `DeviceGuardOpaque` mostly copied from the below (but made generic) https://github.com/pytorch/pytorch/blob/50eac811a68e63e96ad56c11c983bfe298a0bb8a/torch/csrc/inductor/aoti_runtime/utils_cuda.h#L30-L46 - constructor `DeviceGuard(DeviceIndex)` (**this matches aoti but differs from the actual c10 DeviceGuard constructor that takes in device**) - `set_index(DeviceIndex)` - `torch::stable::accelerator::Stream`: `std::shared_ptr` to `StreamOpaque` - constructor `Stream(StreamHandle stream)` (similar to torch::stable::Tensor) - `id() -> StreamId` - `getCurrentStream(DeviceIndex device_index) -> stable::accelerator::Stream` Pull Request resolved: #159679 Approved by: https://github.com/guangyey, https://github.com/janeyx99
1 parent d670304 commit e4e4dbd

File tree

8 files changed

+317
-5
lines changed

8 files changed

+317
-5
lines changed

test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
2+
#include <torch/csrc/stable/accelerator.h>
23
#include <torch/csrc/stable/library.h>
34
#include <torch/csrc/stable/tensor.h>
45
#include <torch/csrc/stable/ops.h>
56
#include <torch/headeronly/util/Exception.h>
67

8+
#ifdef LAE_USE_CUDA
9+
#include <cuda_runtime.h>
10+
#endif
11+
712
712 #include <optional>
813

914
void inline sgd_math(
@@ -397,3 +402,78 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
397402
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
398403
m.impl("test_default_constructor", &boxed_test_default_constructor);
399404
}
405+
406+
// Test functions for torch::stable::accelerator APIs
407+
408+
#ifdef LAE_USE_CUDA
409+
int64_t test_device_guard(int64_t device_index) {
410+
using torch::stable::accelerator::DeviceGuard;
411+
412+
STD_TORCH_CHECK(
413+
device_index >= std::numeric_limits<int32_t>::min() &&
414+
device_index <= std::numeric_limits<int32_t>::max(),
415+
"Device index is out of range of DeviceIndex (int32_t).");
416+
417+
DeviceGuard guard(device_index);
418+
int currentDevice;
419+
cudaError_t err = cudaGetDevice(&currentDevice);
420+
STD_TORCH_CHECK(err == cudaSuccess);
421+
return currentDevice;
422+
}
423+
424+
// Boxed wrapper: unpacks the device index from the stack, runs
// test_device_guard, and writes the result back into stack[0].
void boxed_test_device_guard(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
  // Keep the full 64-bit result; the previous `int` local silently truncated
  // it. (to<int64_t> already yields int64_t, so no extra cast is needed.)
  int64_t res = test_device_guard(to<int64_t>(stack[0]));
  stack[0] = from(res);
}
431+
432+
int64_t test_device_guard_set_index() {
433+
using torch::stable::accelerator::DeviceGuard;
434+
435+
DeviceGuard guard(1);
436+
guard.set_index(0);
437+
int currentDevice;
438+
cudaError_t err = cudaGetDevice(&currentDevice);
439+
STD_TORCH_CHECK(err == cudaSuccess);
440+
return currentDevice;
441+
}
442+
443+
// Boxed wrapper for test_device_guard_set_index (no inputs; one int output).
void boxed_test_device_guard_set_index(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
  stack[0] = from(test_device_guard_set_index());
}
450+
451+
int64_t test_stream(int32_t device_index) {
452+
STD_TORCH_CHECK(
453+
device_index >= std::numeric_limits<int32_t>::min() &&
454+
device_index <= std::numeric_limits<int32_t>::max(),
455+
"Device index is out of range of DeviceIndex (int32_t).");
456+
457+
return torch::stable::accelerator::getCurrentStream(device_index).id();
458+
}
459+
460+
// Boxed wrapper: unpacks the device index, runs test_stream, and writes the
// stream id back into stack[0].
void boxed_test_stream(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
  // to<int64_t> already yields int64_t — the previous static_cast<int64_t>
  // around it was redundant.
  int64_t res = test_stream(to<int64_t>(stack[0]));
  stack[0] = from(res);
}
467+
468+
// Schema registrations for the torch::stable::accelerator test ops.
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("test_device_guard(int device_index) -> int");
  m.def("test_device_guard_set_index() -> int");
  m.def("test_stream(int device_index) -> int");
}
473+
474+
// Kernel registrations binding the schemas above to their boxed wrappers.
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("test_device_guard", &boxed_test_device_guard);
  m.impl("test_device_guard_set_index", &boxed_test_device_guard_set_index);
  m.impl("test_stream", &boxed_test_stream);
}
#endif // LAE_USE_CUDA

test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,37 @@ def my_narrow(t, dim, start, length) -> Tensor:
203203
Returns: Narrowed tensor
204204
"""
205205
return torch.ops.libtorch_agnostic.my_narrow.default(t, dim, start, length)
206+
207+
208+
def test_device_guard(device_index) -> int:
    """
    Tests DeviceGuard by constructing a guard for the given device index and
    querying which device is current while the guard is in effect.

    (The previous docstring claimed this "returns an empty tensor"; the op
    actually returns the current device index as an integer.)

    Args:
        device_index: Device index to set the guard to

    Returns: result of cudaGetDevice() as an integer after using the guard
    """
    return torch.ops.libtorch_agnostic.test_device_guard.default(device_index)
218+
219+
220+
def test_device_guard_set_index() -> int:
    """
    Exercises DeviceGuard.set_index: the kernel creates a guard on device
    index 1, re-points it at index 0, and reports the then-current device.

    Returns: result of cudaGetDevice() as an integer after using set_index
    """
    return torch.ops.libtorch_agnostic.test_device_guard_set_index.default()
228+
229+
230+
def test_stream(device_index) -> int:
    """
    Queries the current accelerator stream for ``device_index`` and reports
    its stream id.

    Args:
        device_index: Device index to get the stream for

    Returns: Stream ID as an integer
    """
    return torch.ops.libtorch_agnostic.test_stream.default(device_index)

test/cpp_extensions/libtorch_agnostic_extension/setup.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
from setuptools import find_packages, setup
66

7-
from torch.utils.cpp_extension import BuildExtension, CppExtension
7+
import torch
8+
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension
89

910

1011
ROOT_DIR = Path(__file__).parent
@@ -35,10 +36,16 @@ def get_extension():
3536
"cxx": ["-fdiagnostics-color=always"],
3637
}
3738

39+
extension = CppExtension
40+
# allow including <cuda_runtime.h>
41+
if torch.cuda.is_available():
42+
extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
43+
extension = CUDAExtension
44+
3845
sources = list(CSRC_DIR.glob("**/*.cpp"))
3946

4047
return [
41-
CppExtension(
48+
extension(
4249
"libtorch_agnostic._C",
4350
sources=sorted(str(s) for s in sources),
4451
py_limited_api=True,

test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import torch
77
from torch.testing._internal.common_device_type import (
8+
deviceCountAtLeast,
89
instantiate_device_type_tests,
910
onlyCPU,
1011
onlyCUDA,
@@ -252,6 +253,38 @@ def test_my_narrow(self, device):
252253
expected0 = torch.narrow(t, dim0, start0, length0)
253254
self.assertEqual(out0, expected0)
254255

256+
@onlyCUDA
257+
@deviceCountAtLeast(2)
258+
def test_device_guard(self, device):
259+
import libtorch_agnostic
260+
261+
device_index = 1
262+
out = libtorch_agnostic.ops.test_device_guard(device_index)
263+
self.assertEqual(out, device_index)
264+
265+
@onlyCUDA
266+
@deviceCountAtLeast(2)
267+
def test_device_guard_set_index(self, device):
268+
import libtorch_agnostic
269+
270+
# This test creates a DeviceGuard with index 1, then sets it to index 0
271+
# and returns the current device (should be 0)
272+
out = libtorch_agnostic.ops.test_device_guard_set_index()
273+
self.assertEqual(out, 0)
274+
275+
@onlyCUDA
276+
def test_stream(self, device):
277+
import libtorch_agnostic
278+
279+
stream = torch.cuda.Stream()
280+
device = torch.cuda.current_device()
281+
282+
with stream:
283+
expected_stream_id = torch.cuda.current_stream(0).stream_id
284+
stream_id = libtorch_agnostic.ops.test_stream(device)
285+
286+
self.assertEqual(stream_id, expected_stream_id)
287+
255288
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
256289

257290
if __name__ == "__main__":

torch/csrc/inductor/aoti_torch/c/shim.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,36 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_call_dispatcher(
496496
const char* overloadName,
497497
StableIValue* stack);
498498

499+
// Device-generic guard for managing device context
500+
struct DeviceGuardOpaque;
501+
using DeviceGuardHandle = DeviceGuardOpaque*;
502+
503+
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_device_guard(
504+
int32_t device_index,
505+
DeviceGuardHandle* ret_guard // returns new reference
506+
);
507+
508+
AOTI_TORCH_EXPORT AOTITorchError
509+
aoti_torch_delete_device_guard(DeviceGuardHandle guard);
510+
511+
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_device_guard_set_index(
512+
DeviceGuardHandle guard,
513+
int32_t device_index);
514+
515+
// Device-generic stream for managing stream objects
516+
struct StreamOpaque;
517+
using StreamHandle = StreamOpaque*;
518+
519+
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_delete_stream(StreamHandle stream);
520+
521+
AOTI_TORCH_EXPORT AOTITorchError
522+
aoti_torch_stream_id(StreamHandle stream, int64_t* ret_stream_id);
523+
524+
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_current_stream(
525+
int32_t device_index,
526+
StreamHandle* ret_stream // returns new reference
527+
);
528+
499529
#ifdef USE_CUDA
500530

501531
struct CUDAGuardOpaque;

torch/csrc/inductor/aoti_torch/shim_common.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
#include <iostream>
2525
#include <vector>
2626

27+
#include <c10/core/Device.h>
28+
#include <c10/core/DeviceGuard.h>
29+
#include <c10/core/Stream.h>
30+
2731
#ifndef AT_PER_OPERATOR_HEADERS
2832
#include <ATen/Functions.h>
2933
#else
@@ -1620,3 +1624,55 @@ AOTITorchError aoti_torch_call_dispatcher(
16201624
}
16211625
});
16221626
}
1627+
1628+
// Allocates a c10::DeviceGuard for `device_index` on the active accelerator
// backend and hands it back through *ret_guard as an opaque handle. The
// caller owns the guard and releases it via aoti_torch_delete_device_guard.
AOTITorchError aoti_torch_create_device_guard(
    int32_t device_index,
    DeviceGuardHandle* ret_guard // returns new reference
) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    // checked=true will fail if no accelerator is available
    const auto device_type =
        at::accelerator::getAccelerator(/*checked=*/true).value();
    auto* guard = new c10::DeviceGuard(c10::Device(device_type, device_index));
    *ret_guard = reinterpret_cast<DeviceGuardHandle>(guard);
  });
}
1641+
1642+
// Destroys a guard previously created by aoti_torch_create_device_guard.
AOTITorchError aoti_torch_delete_device_guard(DeviceGuardHandle guard) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* typed_guard = reinterpret_cast<c10::DeviceGuard*>(guard);
    delete typed_guard;
  });
}
1646+
1647+
// Re-targets the guard at `device_index` via c10::DeviceGuard::set_index.
AOTITorchError aoti_torch_device_guard_set_index(
    DeviceGuardHandle guard,
    int32_t device_index) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* typed_guard = reinterpret_cast<c10::DeviceGuard*>(guard);
    typed_guard->set_index(device_index);
  });
}
1653+
1654+
// Releases a stream handle previously handed out by this shim.
AOTITorchError aoti_torch_delete_stream(StreamHandle stream) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* typed_stream = reinterpret_cast<c10::Stream*>(stream);
    delete typed_stream;
  });
}
1658+
1659+
// Writes the id of the given stream into *ret_stream_id.
AOTITorchError aoti_torch_stream_id(
    StreamHandle stream,
    int64_t* ret_stream_id) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *ret_stream_id = reinterpret_cast<c10::Stream*>(stream)->id(); });
}
1667+
1668+
// This function creates a new Stream object and makes StreamHandle point to it.
1669+
// The caller is responsible for managing the object's lifecycle.
1670+
AOTITorchError aoti_torch_get_current_stream(
1671+
int32_t device_index,
1672+
StreamHandle* ret_stream) {
1673+
AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
1674+
c10::Stream stream = at::accelerator::getCurrentStream(device_index);
1675+
c10::Stream* stream_ptr = new c10::Stream(stream);
1676+
*ret_stream = reinterpret_cast<StreamHandle>(stream_ptr);
1677+
});
1678+
}

torch/csrc/stable/accelerator.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#pragma once

#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/headeronly/util/shim_utils.h>

#include <memory>

using DeleterFnPtr = void (*)(void*);

namespace torch::stable::accelerator {

// Deleter used by DeviceGuard's unique_ptr below. NOTE: this was previously
// wrapped in an anonymous namespace; in a header that gives the function
// internal linkage in every including TU, which is an ODR hazard once an
// external-linkage entity (DeviceGuard) references it. A plain `inline`
// function is the header-safe spelling.
inline void delete_device_guard(void* ptr) {
  TORCH_ERROR_CODE_CHECK(
      aoti_torch_delete_device_guard(reinterpret_cast<DeviceGuardHandle>(ptr)));
}

// this is bigger than DeviceIndex in c10/core/Device.h but it is the type we
// can converge on in this world as DeviceIndex in libtorch is not stable.
using DeviceIndex = int32_t;
using StreamId = int64_t; // this is from c10/core/Stream.h

// RAII wrapper over the shim's opaque device guard: construction switches to
// `device_index`, destruction releases the underlying c10 guard.
class DeviceGuard {
 public:
  explicit DeviceGuard() = delete;

  // Takes only an index — this matches the AOTI guard constructor but
  // differs from c10::DeviceGuard, whose constructor takes a Device.
  explicit DeviceGuard(DeviceIndex device_index)
      : guard_(nullptr, delete_device_guard) {
    DeviceGuardHandle ptr = nullptr;
    TORCH_ERROR_CODE_CHECK(aoti_torch_create_device_guard(device_index, &ptr));
    guard_.reset(ptr);
  }

  // Re-target the guard at a different device index.
  void set_index(DeviceIndex device_index) {
    TORCH_ERROR_CODE_CHECK(
        aoti_torch_device_guard_set_index(guard_.get(), device_index));
  }

 private:
  std::unique_ptr<DeviceGuardOpaque, DeleterFnPtr> guard_;
};

// Shared-ownership wrapper over the shim's opaque stream handle.
class Stream {
 public:
  explicit Stream() = delete;

  // Construct a stable::Stream from a StreamHandle
  // Steals ownership from the StreamHandle
  explicit Stream(StreamHandle stream)
      : stream_(stream, [](StreamHandle stream) {
          TORCH_ERROR_CODE_CHECK(aoti_torch_delete_stream(stream));
        }) {}

  // Id of the underlying stream (forwarded from the shim).
  StreamId id() const {
    StreamId stream_id;
    TORCH_ERROR_CODE_CHECK(aoti_torch_stream_id(stream_.get(), &stream_id));
    return stream_id;
  }

 private:
  std::shared_ptr<StreamOpaque> stream_;
};

// Returns a Stream owning a new reference to the current stream of
// `device_index`.
inline Stream getCurrentStream(DeviceIndex device_index) {
  StreamHandle stream = nullptr;
  TORCH_ERROR_CODE_CHECK(aoti_torch_get_current_stream(device_index, &stream));
  return Stream(stream);
}

} // namespace torch::stable::accelerator

torch/csrc/stable/tensor.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
#include <torch/headeronly/util/shim_utils.h>
66
#include <climits>
77
#include <memory>
8+
9+
#include <torch/csrc/stable/accelerator.h>
10+
811
namespace torch::stable {
912

10-
// this is bigger than DeviceIndex in c10/core/Device.h but it is the type we
11-
// can converge on in this world as DeviceIndex in libtorch is not stable.
12-
using DeviceIndex = int32_t;
13+
using DeviceIndex = torch::stable::accelerator::DeviceIndex;
1314

1415
// The torch::stable::Tensor class is a highlevel C++ wrapper around
1516
// the C shim Tensor APIs. We've modeled this class after TensorBase, as custom

0 commit comments

Comments
 (0)
0