11#include < torch/csrc/inductor/aoti_torch/c/shim.h>
2+ #include < torch/csrc/stable/accelerator.h>
23#include < torch/csrc/stable/library.h>
3- #include < torch/csrc/stable/tensor.h>
44#include < torch/csrc/stable/ops.h>
5+ #include < torch/csrc/stable/tensor.h>
56#include < torch/headeronly/util/Exception.h>
67
8+ #ifdef USE_CUDA
9+ #include < cuda_runtime.h>
10+ #endif
11+
712#include < optional>
813
// Computes one out-of-place SGD step over contiguous float buffers:
//   out[d] = param[d] - lr * g[d], where g[d] is grad[d] optionally negated
//   (maximize) and optionally augmented with weight decay.
//
// param_ptr/grad_ptr/out_ptr: float buffers of length `size` (may not alias
// out with the inputs for correctness of the read-then-write per element).
// weight_decay: if nonzero, adds param[d] * weight_decay to the gradient.
// lr: learning rate (narrowed to float at the multiply, matching the data).
// maximize: negate the gradient (gradient ascent).
// size: number of elements to process.
void inline sgd_math(
    float* param_ptr,
    float* grad_ptr,
    float* out_ptr,
    const float weight_decay,
    const double lr,
    const bool maximize,
    int64_t size) {
  int64_t d = 0;
  for (; d < size; d++) {
    float grad_val = grad_ptr[d];
    if (maximize)
      grad_val = -grad_val;
    if (weight_decay != 0.0) {
      grad_val += param_ptr[d] * weight_decay;
    }
    out_ptr[d] = param_ptr[d] - grad_val * float(lr);
  }
}
@@ -36,8 +41,8 @@ Tensor sgd_out_of_place(
3641 const bool maximize) {
3742 STD_TORCH_CHECK (param.dim () == 1 , " param must be 1D" );
3843
39- int64_t * param_sizes;
40- int64_t * param_strides;
44+ int64_t * param_sizes;
45+ int64_t * param_strides;
4146 aoti_torch_get_sizes (param.get (), ¶m_sizes);
4247 aoti_torch_get_strides (param.get (), ¶m_strides);
4348
@@ -48,35 +53,45 @@ Tensor sgd_out_of_place(
4853 aoti_torch_get_device_type (param.get (), ¶m_device_type);
4954
5055 AtenTensorHandle out_ath;
51- aoti_torch_empty_strided (param.dim (), param_sizes, param_strides, param_dtype, param_device_type, param.get_device (), &out_ath);
56+ aoti_torch_empty_strided (
57+ param.dim (),
58+ param_sizes,
59+ param_strides,
60+ param_dtype,
61+ param_device_type,
62+ param.get_device (),
63+ &out_ath);
5264 auto out = Tensor (out_ath);
5365
5466 sgd_math (
55- reinterpret_cast <float *>(param.data_ptr ()),
56- reinterpret_cast <float *>(grad.data_ptr ()),
57- reinterpret_cast <float *>(out.data_ptr ()),
58- weight_decay,
59- lr,
60- maximize,
61- param.numel ()
62- );
67+ reinterpret_cast <float *>(param.data_ptr ()),
68+ reinterpret_cast <float *>(grad.data_ptr ()),
69+ reinterpret_cast <float *>(out.data_ptr ()),
70+ weight_decay,
71+ lr,
72+ maximize,
73+ param.numel ());
6374
6475 return out;
6576}
6677
67- void boxed_sgd_out_of_place (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
78+ void boxed_sgd_out_of_place (
79+ StableIValue* stack,
80+ uint64_t num_args,
81+ uint64_t num_outputs) {
6882 Tensor res = sgd_out_of_place (
69- to<Tensor>(stack[0 ]),
70- to<Tensor>(stack[1 ]),
71- float (to<double >(stack[2 ])),
72- to<double >(stack[3 ]),
73- to<bool >(stack[4 ]));
83+ to<Tensor>(stack[0 ]),
84+ to<Tensor>(stack[1 ]),
85+ float (to<double >(stack[2 ])),
86+ to<double >(stack[3 ]),
87+ to<bool >(stack[4 ]));
7488
7589 stack[0 ] = from (res);
7690}
7791
7892STABLE_TORCH_LIBRARY (libtorch_agnostic, m) {
79- m.def (" sgd_out_of_place(Tensor param, Tensor grad, float weight_decay, float lr, bool maximize) -> Tensor" );
AA39
93+ m.def (
94+ " sgd_out_of_place(Tensor param, Tensor grad, float weight_decay, float lr, bool maximize) -> Tensor" );
8095}
8196
8297STABLE_TORCH_LIBRARY_IMPL (libtorch_agnostic, CPU, m) {
@@ -87,7 +102,10 @@ Tensor identity(Tensor t) {
87102 return t;
88103}
89104
90- void boxed_identity (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
105+ void boxed_identity (
106+ StableIValue* stack,
107+ uint64_t num_args,
108+ uint64_t num_outputs) {
91109 Tensor res = identity (to<Tensor>(stack[0 ]));
92110 stack[0 ] = from (res);
93111}
@@ -112,7 +130,10 @@ Tensor my_abs(Tensor t) {
112130 return to<Tensor>(stack[0 ]);
113131}
114132
115- void boxed_my_abs (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
133+ void boxed_my_abs (
134+ StableIValue* stack,
135+ uint64_t num_args,
136+ uint64_t num_outputs) {
116137 Tensor tensor_res = my_abs (to<Tensor>(stack[0 ]));
117138 stack[0 ] = from (tensor_res);
118139}
@@ -134,18 +155,21 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
134155 auto mf = aoti_torch_memory_format_contiguous_format ();
135156
136157 stack[0 ] = from (t);
137- stack[1 ] = from (std::optional (t_dtype)); // dtype
138- stack[2 ] = from (std::nullopt ); // layout
139- stack[3 ] = from (std::optional (device)); // device
140- stack[4 ] = from (std::optional (false )); // pin_memory
141- stack[5 ] = from (std::optional (mf)); // memory_format
158+ stack[1 ] = from (std::optional (t_dtype)); // dtype
159+ stack[2 ] = from (std::nullopt ); // layout
160+ stack[3 ] = from (std::optional (device)); // device
161+ stack[4 ] = from (std::optional (false )); // pin_memory
162+ stack[5 ] = from (std::optional (mf)); // memory_format
142163
143164 aoti_torch_call_dispatcher (" aten::ones_like" , " " , stack);
144165
145166 return to<Tensor>(stack[0 ]);
146167}
147168
148- void boxed_my_ones_like (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
169+ void boxed_my_ones_like (
170+ StableIValue* stack,
171+ uint64_t num_args,
172+ uint64_t num_outputs) {
149173 Tensor res = my_ones_like (to<Tensor>(stack[0 ]), stack[1 ]);
150174 stack[0 ] = from (res);
151175}
@@ -158,7 +182,10 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
158182 m.impl (" my_ones_like" , &boxed_my_ones_like);
159183}
160184
161- std::tuple<Tensor, Tensor, bool > exp_neg_is_leaf (Tensor t1, Tensor t2, Tensor t3) {
185+ std::tuple<Tensor, Tensor, bool > exp_neg_is_leaf (
186+ Tensor t1,
187+ Tensor t2,
188+ Tensor t3) {
162189 StableIValue stack_exp[1 ];
163190 stack_exp[0 ] = from (t1);
164191 aoti_torch_call_dispatcher (" aten::exp" , " " , stack_exp);
@@ -172,20 +199,25 @@ std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3
172199 aoti_torch_call_dispatcher (" aten::is_leaf" , " " , stack_is_leaf);
173200
174201 return std::make_tuple (
175- to<Tensor>(stack_exp[0 ]),
176- to<Tensor>(stack_neg[0 ]),
177- to<bool >(stack_is_leaf[0 ]));
202+ to<Tensor>(stack_exp[0 ]),
203+ to<Tensor>(stack_neg[0 ]),
204+ to<bool >(stack_is_leaf[0 ]));
178205}
179206
180- void boxed_exp_neg_is_leaf (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
181- auto tuple = exp_neg_is_leaf (to<Tensor>(stack[0 ]), to<Tensor>(stack[1 ]), to<Tensor>(stack[2 ]));
207+ void boxed_exp_neg_is_leaf (
208+ StableIValue* stack,
209+ uint64_t num_args,
210+ uint64_t num_outputs) {
211+ auto tuple = exp_neg_is_leaf (
212+ to<Tensor>(stack[0 ]), to<Tensor>(stack[1 ]), to<Tensor>(stack[2 ]));
182213 stack[0 ] = from (std::get<0 >(tuple));
183214 stack[1 ] = from (std::get<1 >(tuple));
184215 stack[2 ] = from (std::get<2 >(tuple));
185216}
186217
187218STABLE_TORCH_LIBRARY_FRAGMENT (libtorch_agnostic, m) {
188- m.def (" exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) -> (Tensor, Tensor, bool)" );
219+ m.def (
220+ " exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) -> (Tensor, Tensor, bool)" );
189221}
190222
191223STABLE_TORCH_LIBRARY_IMPL (libtorch_agnostic, CompositeExplicitAutograd, m) {
@@ -200,7 +232,10 @@ Tensor neg_exp(Tensor t) {
200232 return to<Tensor>(stack[0 ]);
201233}
202234
203- void boxed_neg_exp (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
235+ void boxed_neg_exp (
236+ StableIValue* stack,
237+ uint64_t num_args,
238+ uint64_t num_outputs) {
204239 Tensor res = neg_exp (to<Tensor>(stack[0 ]));
205240 stack[0 ] = from (res);
206241}
@@ -229,7 +264,10 @@ Tensor divide_neg_exp(Tensor t) {
229264 return to<Tensor>(stack_div[0 ]);
230265}
231266
232- void boxed_divide_neg_exp (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
267+ void boxed_divide_neg_exp (
268+ StableIValue* stack,
269+ uint64_t num_args,
270+ uint64_t num_outputs) {
233271 Tensor res = divide_neg_exp (to<Tensor>(stack[0 ]));
234272 stack[0 ] = from (res);
235273}
@@ -246,7 +284,10 @@ bool is_contiguous(Tensor t) {
246284 return t.is_contiguous ();
247285}
248286
249- void boxed_is_contiguous (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
287+ void boxed_is_contiguous (
288+ StableIValue* stack,
289+ uint64_t num_args,
290+ uint64_t num_outputs) {
250291 bool res = is_contiguous (to<Tensor>(stack[0 ]));
251292 stack[0 ] = from (res);
252293}
@@ -263,8 +304,12 @@ Tensor my_transpose(Tensor t, int64_t dim0, int64_t dim1) {
263304 return transpose (t, dim0, dim1);
264305}
265306
266- void boxed_my_transpose (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
267- auto res = my_transpose (to<Tensor>(stack[0 ]), to<int64_t >(stack[1 ]), to<int64_t >(stack[2 ]));
307+ void boxed_my_transpose (
308+ StableIValue* stack,
309+ uint64_t num_args,
310+ uint64_t num_outputs) {
311+ auto res = my_transpose (
312+ to<Tensor>(stack[0 ]), to<int64_t >(stack[1 ]), to<int64_t >(stack[2 ]));
268313
269314 stack[0 ] = from (res);
270315}
@@ -273,7 +318,10 @@ Tensor my_empty_like(Tensor t) {
273318 return empty_like (t);
274319}
275320
276- void boxed_empty_like (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
321+ void boxed_empty_like (
322+ StableIValue* stack,
323+ uint64_t num_args,
324+ uint64_t num_outputs) {
277325 auto res = my_empty_like (to<Tensor>(stack[0 ]));
278326 stack[0 ] = from (res);
279327}
@@ -303,12 +351,14 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
303351 m.impl (" fill_infinity" , &boxed_fill_infinity);
304352}
305353
306-
307354Tensor my_zero_ (Tensor t) {
308355 return zero_ (t);
309356}
310357
311- void boxed_my_zero_ (StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
358+ void boxed_my_zero_ (
359+ StableIValue* stack,
360+ uint64_t num_args,
361+ uint64_t num_outputs) {
312362 auto res = my_zero_ (to<Tensor>(stack[0 ]));
313363 stack[0 ] = from (res);
314364}
@@ -320,3 +370,48 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
320370STABLE_TORCH_LIBRARY_IMPL (libtorch_agnostic, CPU, m) {
321371 m.impl (" my_zero_" , &boxed_my_zero_);
322372}
373+
374+ // Test functions for torch::stable::accelerator APIs
375+
376+ #ifdef USE_CUDA
377+ int test_device_guard (int8_t device_index) {
378+ using torch::stable::accelerator::DeviceGuard;
379+
380+ DeviceGuard guard (device_index);
381+ int currentDevice;
382+ cudaError_t err = cudaGetDevice (¤tDevice);
383+ STD_TORCH_CHECK (err == cudaSuccess);
384+ return currentDevice;
385+ }
386+
387+ void boxed_test_device_guard (
388+ StableIValue* stack,
389+ uint64_t num_args,
390+ uint64_t num_outputs) {
391+ int res = test_device_guard (static_cast <int8_t >(to<int64_t >(stack[0 ])));
392+ stack[0 ] = from (res);
393+ }
394+
395+ int64_t test_stream (int8_t device_index) {
396+ auto id = torch::stable::accelerator::getCurrentStream (device_index).id ();
397+ return id;
398+ }
399+
400+ void boxed_test_stream (
401+ StableIValue* stack,
402+ uint64_t num_args,
403+ uint64_t num_outputs) {
404+ int64_t res = test_stream (static_cast <int8_t >(to<int64_t >(stack[0 ])));
405+ stack[0 ] = from (res);
406+ }
407+
408+ STABLE_TORCH_LIBRARY_FRAGMENT (libtorch_agnostic, m) {
409+ m.def (" test_device_guard(int device_index) -> int" );
410+ m.def (" test_stream(int device_index) -> int&q
C755
uot; );
411+ }
412+
413+ STABLE_TORCH_LIBRARY_IMPL (libtorch_agnostic, CompositeExplicitAutograd, m) {
414+ m.impl (" test_device_guard" , &boxed_test_device_guard);
415+ m.impl (" test_stream" , &boxed_test_stream);
416+ }
417+ #endif // USE_CUDA
0 commit comments