|
23 | 23 | from ..virtualized import V
|
24 | 24 | from .aoti_hipify_utils import maybe_hipify_code_wrapper
|
25 | 25 | from .common import get_device_op_overrides, IndentedBuffer, Kernel
|
26 |
| -from .cpp_utils import cexpr, DEVICE_TO_ATEN, DTYPE_TO_ATEN, DTYPE_TO_CPP |
| 26 | +from .cpp_utils import cexpr, DEVICE_TO_ATEN, DEVICE_TO_INT, DTYPE_TO_ATEN, DTYPE_TO_CPP |
27 | 27 | from .wrapper import (
|
28 | 28 | EnterSubgraphLine,
|
29 | 29 | ExitSubgraphLine,
|
@@ -322,9 +322,12 @@ def codegen_symbol(
|
322 | 322 | raise AssertionError(f"Unknown value type: {type(value)}")
|
323 | 323 |
|
324 | 324 | def generate_input_output_runtime_checks(self):
|
325 |
| - # In debug_compile mode, we generate checks to ensure the dtype/shape/stride of each |
326 |
| - # real input/output tensor match ones provided at compile time via sample |
327 |
| - # input/output. |
| 325 | + """ |
| 326 | + In debug_compile mode, we generate checks to ensure the dtype/shape/stride/device of each |
| 327 | + real input/output tensor match ones provided at compile time via sample |
| 328 | + input/output. |
| 329 | + """ |
| 330 | + |
328 | 331 | def gen_check(handle_kind, idx, name, tensor):
|
329 | 332 | # Wrap AtenTensorHandle with ConstantHandle for cleaner utility function access
|
330 | 333 | self.prefix.writeline(
|
@@ -404,6 +407,27 @@ def gen_check(handle_kind, idx, name, tensor):
|
404 | 407 | """
|
405 | 408 | )
|
406 | 409 |
|
| 410 | + # check input device type |
| 411 | + if isinstance(tensor, ir.TensorBox): |
| 412 | + tensor_device = tensor.get_device() |
| 413 | + if tensor_device is not None: |
| 414 | + expected_device_type = DEVICE_TO_INT.get(tensor_device.type) |
| 415 | + if expected_device_type is not None: |
| 416 | + self.codegen_input_device_type_var_decl(self.prefix, name) |
| 417 | + device_type_str = str(tensor_device.type) |
| 418 | + self.prefix.splice( |
| 419 | + f""" |
| 420 | + int32_t {name}_expected_device_type = {expected_device_type}; |
| 421 | + if ({name}_expected_device_type != {name}_device_type) {{ |
| 422 | + std::stringstream ss; |
| 423 | + ss << "{handle_kind}[{idx}]: unmatched device type, " |
| 424 | + << "expected: " << {name}_expected_device_type << "{expected_device_type}({device_type_str}), " |
| 425 | + << "but got: " << {name}_device_type << "\\n"; |
| 426 | + throw std::runtime_error(ss.str()); |
| 427 | + }} |
| 428 | + """ |
| 429 | + ) |
| 430 | + |
407 | 431 | # Create a separate function for each input check to avoid "too big to optimize" error
|
408 | 432 | for idx, (name, tensor) in enumerate(V.graph.graph_inputs.items()):
|
409 | 433 | self.prefix.splice(
|
@@ -593,6 +617,12 @@ def codegen_input_size_var_decl(self, code: IndentedBuffer, name):
|
def codegen_input_stride_var_decl(self, code: IndentedBuffer, name):
    """Emit a C++ declaration caching the strides of tensor `name`.

    Writes `auto <name>_stride = <name>.strides();` into `code`, giving the
    generated checker a local alias for the tensor's stride view.
    """
    stride_decl = f"auto {name}_stride = {name}.strides();"
    code.writeline(stride_decl)
|
595 | 619 |
|
def codegen_input_device_type_var_decl(self, code: IndentedBuffer, name):
    """Emit C++ that queries the runtime device type of tensor `name`.

    Declares an `int32_t <name>_device_type` local and populates it via
    `aoti_torch_get_device_type`, with the call wrapped in
    AOTI_TORCH_ERROR_CODE_CHECK so a failed query aborts the check.
    """
    device_var = f"{name}_device_type"
    emitted = (
        f"int32_t {device_var};",
        f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type({name}, &{device_var}));",
    )
    for line in emitted:
        code.writeline(line)
| 625 | + |
596 | 626 | def codegen_model_kernels(self):
|
597 | 627 | self.prefix.writeline("namespace {")
|
598 | 628 |
|
|
0 commit comments