pytorch
diff --git a/‎test/cpp/aoti_inference/test.cpp
Lines changed: 44 additions & 0 deletions b/‎test/cpp/aoti_inference/test.cpp
Lines changed: 44 additions & 0 deletions
diff --git a/‎test/inductor/test_aot_inductor.py
Lines changed: 35 additions & 0 deletions b/‎test/inductor/test_aot_inductor.py
Lines changed: 35 additions & 0 deletions
diff --git a/‎torch/_inductor/__init__.py
Lines changed: 10 additions & 2 deletions b/‎torch/_inductor/__init__.py
Lines changed: 10 additions & 2 deletions
diff --git a/‎torch/_inductor/package/package.py
Lines changed: 3 additions & 2 deletions b/‎torch/_inductor/package/package.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎torch/csrc/inductor/aoti_package/model_package_loader.cpp
Lines changed: 11 additions & 7 deletions b/‎torch/csrc/inductor/aoti_package/model_package_loader.cpp
Lines changed: 11 additions & 7 deletions
diff --git a/‎torch/csrc/inductor/aoti_package/model_package_loader.h
Lines changed: 3 additions & 1 deletion b/‎torch/csrc/inductor/aoti_package/model_package_loader.h
Lines changed: 3 additions & 1 deletion
diff --git a/‎torch/csrc/inductor/aoti_package/pybind.cpp
Lines changed: 7 additions & 3 deletions b/‎torch/csrc/inductor/aoti_package/pybind.cpp
Lines changed: 7 additions & 3 deletions
@@ -14,6 +14,7 @@
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
 #if defined(USE_CUDA)
 #include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <cuda_runtime.h>
 #endif
 #if defined(USE_CUDA) || defined(USE_ROCM)
@@ -139,6 +140,45 @@ void test_aoti_package_loader(
   ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
 }
 
+void test_aoti_package_loader_multi_gpu(
+    const std::string& device,
+    bool use_runtime_constant_folding) {
+  torch::NoGradGuard no_grad;
+
+  std::string data_path =
+      (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) / "data.pt")
+           .string();
+  torch::jit::script::Module data_loader = torch::jit::load(data_path);
+  std::string suffix = use_runtime_constant_folding
+      ? device + "_use_runtime_constant_folding"
+      : device;
+  std::string path_attr = "pt2_package_path_" + suffix;
+  std::string inputs_attr = "inputs_" + suffix;
+  std::string outputs_attr = "outputs_" + suffix;
+  const auto& pt2_package_path =
+      data_loader.attr(path_attr.c_str()).toStringRef();
+  const auto& ref_output_tensors =
+      data_loader.attr(outputs_attr.c_str()).toTensorList().vec();
+
+  // For all available CUDA devices: Load PT2 package on this device, run
+  // inference, and validate results
+  auto input_tensors =
+      data_loader.attr(inputs_attr.c_str()).toTensorList().vec();
+  for (int i = 0; i < torch::cuda::device_count(); i++) {
+    auto options = torch::TensorOptions().device(torch::kCUDA, i);
+    torch::inductor::AOTIModelPackageLoader runner(
+        pt2_package_path, "model", false, 1, i);
+    std::vector<torch::Tensor> input_tensors_on_device;
+    for (auto input_tensor : input_tensors) {
+      input_tensors_on_device.push_back(input_tensor.clone().to(options));
+    }
+    // Run loaded PT2 package on device
+    auto actual_output_tensors = runner.run(input_tensors_on_device);
+    ASSERT_TRUE(torch::allclose(
+        ref_output_tensors[0].cpu(), actual_output_tensors[0].cpu()));
+  }
+}
+
 void test_aoti_constants_update(
     const std::string& device,
     bool use_runtime_constant_folding) {
@@ -988,6 +1028,10 @@ TEST(AotInductorTest, BasicPackageLoaderTestCuda) {
   test_aoti_package_loader("cuda", false);
 }
 
+TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
+  test_aoti_package_loader_multi_gpu("cuda", false);
+}
+
 TEST(AotInductorTest, UpdateUserManagedConstantsCuda) {
   test_aoti_user_managed_buffer();
 }
 
@@ -2155,6 +2155,41 @@ def forward(self, x, y):
         self.assertTrue(same(result_cpu, result_gpu_0.cpu()))
         self.assertTrue(same(result_cpu, result_gpu_1.cpu()))
 
+    @requires_multigpu()
+    def test_load_package_multiple_gpus(self):
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("requires GPU")
+
+        class Model(torch.nn.Module):
+            def __init__(self, weight):
+                super().__init__()
+                self.weight = weight
+
+            def forward(self, x, y):
+                return x + torch.nn.functional.linear(y, self.weight)
+
+        weight = torch.randn(10, 10, device=self.device)
+        inputs = (
+            torch.randn(10, 10, device=self.device),
+            torch.randn(10, 10, device=self.device),
+        )
+        model = Model(weight).to(device=self.device)
+        result_ref = model(*inputs)
+
+        package_path = AOTIRunnerUtil.compile(model, inputs)
+
+        # Load AOT package on gpu:N
+        device_interface = get_interface_for_device(GPU_TYPE)
+        for i in range(device_interface.device_count()):
+            device = torch.device(GPU_TYPE, i)
+            with device_interface.device(i), torch.no_grad():
+                model_package = torch._inductor.aoti_load_package(
+                    package_path, device_index=i
+                )
+                inputs_on_device = [input.to(device=device) for input in inputs]
+                result_package = model_package(*inputs_on_device)
+            self.assertTrue(same(result_ref.cpu(), result_package.cpu()))
+
     def test_reuse_kernel(self):
         class Model(torch.nn.Module):
             def __init__(self) -> None:
 
@@ -235,7 +235,9 @@ def _aoti_compile_and_package_inner(
     return package_path
 
 
-def aoti_load_package(path: FileLike, run_single_threaded: bool = False) -> Any:  # type: ignore[type-arg]
+def aoti_load_package(
+    path: FileLike, run_single_threaded: bool = False, device_index: int = -1
+) -> Any:  # type: ignore[type-arg]
     """
     Loads the model from the PT2 package.
 
@@ -254,10 +256,16 @@ def aoti_load_package(path: FileLike, run_single_threaded: bool = False) -> Any:
         run_single_threaded (bool): Whether the model should be run without
             thread synchronization logic. This is useful to avoid conflicts with
             CUDAGraphs.
+        device_index (int): The index of the device to which the PT2 package is
+            to be loaded. By default, `device_index=-1` is used, which corresponds
+            to the device `cuda` when using CUDA. Passing `device_index=1` would
+            load the package to `cuda:1`, for example.
     """
     from torch._inductor.package import load_package
 
-    return load_package(path, run_single_threaded=run_single_threaded)
+    return load_package(
+        path, run_single_threaded=run_single_threaded, device_index=device_index
+    )
 
 
 def aot_compile(
 
@@ -290,6 +290,7 @@ def load_package(
     model_name: str = "model",
     run_single_threaded: bool = False,
     num_runners: int = 1,
+    device_index: int = -1,
 ) -> AOTICompiledModel:  # type: ignore[type-arg]
     assert (
         isinstance(path, (io.IOBase, IO)) and path.readable() and path.seekable()
@@ -305,12 +306,12 @@ def load_package(
             path.seek(0)
             log.debug("Writing buffer to tmp file located at %s.", f.name)
             loader = torch._C._aoti.AOTIModelPackageLoader(
-                f.name, model_name, run_single_threaded, num_runners
+                f.name, model_name, run_single_threaded, num_runners, device_index
             )  # type: ignore[call-arg]
             return AOTICompiledModel(loader)
 
     path = os.fspath(path)  # AOTIModelPackageLoader expects (str, str)
     loader = torch._C._aoti.AOTIModelPackageLoader(
-        path, model_name, run_single_threaded, num_runners
+        path, model_name, run_single_threaded, num_runners, device_index
     )  # type: ignore[call-arg]
     return AOTICompiledModel(loader)
@@ -342,7 +342,8 @@ AOTIModelPackageLoader::AOTIModelPackageLoader(
     const std::string& model_package_path,
     const std::string& model_name,
     const bool run_single_threaded,
-    const size_t num_runners) {
+    const size_t num_runners,
+    const c10::DeviceIndex device_index) {
   if (run_single_threaded) {
     if (num_runners != 1) {
       throw std::runtime_error(
@@ -470,22 +471,25 @@ AOTIModelPackageLoader::AOTIModelPackageLoader(
   load_metadata(cpp_filename);
 
   // Construct the runner depending on the device information
-  std::string device = metadata_["AOTI_DEVICE_KEY"];
+  std::string device_key = metadata_["AOTI_DEVICE_KEY"];
 
-  if (device.empty()) {
+  if (device_key.empty()) {
     throw std::runtime_error("No device information found.");
   }
 
   std::unordered_map<std::string, CreateAOTIModelRunnerFunc>
       registered_aoti_runner = getAOTIModelRunnerRegistry();
 
-  if (registered_aoti_runner.find(device) == registered_aoti_runner.end()) {
-    throw std::runtime_error("Unsupported device found: " + device);
+  if (registered_aoti_runner.find(device_key) == registered_aoti_runner.end()) {
+    throw std::runtime_error("Unsupported device key found: " + device_key);
   }
 
+  c10::Device device = c10::Device(device_key);
+  device.set_index(device_index);
+
   std::string cubin_dir = temp_dir_ + k_separator + model_directory;
-  runner_ = registered_aoti_runner[device](
-      so_path, num_runners, device, cubin_dir, run_single_threaded);
+  runner_ = registered_aoti_runner[device_key](
+      so_path, num_runners, device.str(), cubin_dir, run_single_threaded);
 }
 
 AOTIModelPackageLoader::~AOTIModelPackageLoader() {
 
@@ -2,6 +2,7 @@
 #pragma once
 
 #include <ATen/Tensor.h>
+#include <c10/core/Device.h>
 #include <torch/csrc/inductor/aoti_runner/model_container_runner.h>
 
 namespace torch::inductor {
@@ -11,7 +12,8 @@ class TORCH_API AOTIModelPackageLoader {
       const std::string& model_package_path,
       const std::string& model_name = "model",
       const bool run_single_threaded = false,
-      const size_t num_runners = 1);
+      const size_t num_runners = 1,
+      const c10::DeviceIndex device_index = -1);
   ~AOTIModelPackageLoader();
 
   AOTIModelContainerRunner* get_runner();
 
@@ -6,6 +6,7 @@
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
 #endif
 
+#include <c10/core/Device.h>
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/inductor/aoti_runner/pybind.h>
 #include <torch/csrc/utils/pybind.h>
@@ -18,12 +19,14 @@ class AOTIModelPackageLoaderPybind : public AOTIModelPackageLoader {
       const std::string& model_package_path,
       const std::string& model_name,
       const bool run_single_threaded,
-      const size_t num_runners)
+      const size_t num_runners,
+      const c10::DeviceIndex device_index)
       : AOTIModelPackageLoader(
             model_package_path,
             model_name,
             run_single_threaded,
-            num_runners) {}
+            num_runners,
+            device_index) {}
 
   py::list boxed_run(py::list& inputs, void* stream_handle = nullptr) {
     std::vector<at::Tensor> input_tensors;
@@ -54,7 +57,8 @@ void initAOTIPackageBindings(PyObject* module) {
            const std::string&,
            const std::string&,
            const bool,
-           const size_t>())
+           const size_t,
+           const c10::DeviceIndex>())
       .def("get_metadata", &AOTIModelPackageLoaderPybind::get_metadata)
       .def(
           "run",