Add remaining method and tests · pytorch/pytorch@7ab6014 · GitHub

Commit 7ab6014

Add remaining method and tests

ghstack-source-id: 2457960
Pull Request resolved: #140057

1 parent 713255d · commit 7ab6014

File tree

6 files changed: +277 -41 lines changed

test/inductor/test_op_dtype_prop.py (new file)
Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
# Owner(s): ["module: inductor"]
import importlib
import os
import sys

import torch
from torch._dynamo.utils import disable_cache_limit
from torch._inductor import config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_methods_invocations import op_db


# Make the helper files in test/ importable
pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(pytorch_test_dir)


importlib.import_module("functorch")
importlib.import_module("filelock")


from torch._inductor.lowering import lowerings
from torch.testing._internal.common_device_type import ops
from torch.testing._internal.inductor_utils import HAS_GPU


unique_pointwise_op_names = set()

for op in lowerings:
    if not isinstance(op, torch._ops.OpOverload):
        continue

    if torch.Tag.pointwise not in op.tags:
        continue

    if op._schema.is_mutable:
        continue

    op_name = (op.name().split("::")[-1]).split(".")[0]
    unique_pointwise_op_names.add(op_name)

pointwise_ops = [
    op
    for op in op_db
    if op.name in unique_pointwise_op_names and "reduction" not in op.variant_test_name
]


class TestCase(InductorTestCase):
    @ops(
        pointwise_ops,
        allowed_dtypes=(
            torch.float32,
            torch.float64,
            torch.int32,
            torch.int64,
            torch.bool,
        ),
    )
    # @config.patch("triton.codegen_upcast_to_fp32", False)  # TODO enable
    @config.patch("test_configs.runtime_triton_dtype_assert", True)
    @disable_cache_limit()
    def test_op_dtype_propagation(self, op, dtype):
        def run(op, args, kwargs):
            return op(*args, **kwargs)

        sample_inputs_itr = op.sample_inputs("cuda", dtype, requires_grad=False)
        for sample_input in sample_inputs_itr:
            args = (sample_input.input,) + sample_input.args
            kwargs = sample_input.kwargs
            out = run(op.get_op(), args, kwargs)
            out_c = torch.compile(run)(op.get_op(), args, kwargs)
            self.assertEqual(out, out_c)


instantiate_device_type_tests(TestCase, globals(), only_for=("cuda",))

if __name__ == "__main__":
    from torch._inductor.test_case import run_tests

    if HAS_GPU:
        run_tests(needs="filelock")
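The new test sweeps every non-mutating pointwise lowering that also appears in op_db, across five dtypes, and compares eager against compiled output while the runtime dtype asserts are active. A minimal standalone sketch of the same check, assuming a CUDA device and using torch.sigmoid as a stand-in op (this repro is illustrative, not part of the commit):

import torch
from torch._inductor import config

def run(fn, *args):
    return fn(*args)

# config.patch works as a context manager as well as a decorator.
with config.patch("test_configs.runtime_triton_dtype_assert", True):
    x = torch.randn(8, device="cuda")
    eager = run(torch.sigmoid, x)
    compiled = torch.compile(run)(torch.sigmoid, x)
    # A dtype-propagation mismatch would fail the generated
    # tl.static_assert during Triton compilation, before this check runs.
    torch.testing.assert_close(eager, compiled)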

torch/_inductor/codegen/common.py
Lines changed: 16 additions & 2 deletions

@@ -23,10 +23,10 @@
 )

 import sympy
-from sympy.printing.printer import Printer

 import torch
 import torch.fx
+from sympy.printing.printer import Printer
 from torch._inductor.dtype_propagation import DtypePropagationOpsHandler
 from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND
 from torch.utils import _pytree as pytree
@@ -2010,12 +2010,13 @@ def inner(*args, **kwargs):
     value = getattr(parent_handler, name)(*args, **kwargs)  # type: ignore[has-type]
     dtype_handler = DtypePropagationOpsHandler()

+    idx = 0
+
     def do_cse(v):
         # TODO - throw on default
         output_dtype = getattr(
             dtype_handler,
             name,
-            dtype_handler.default_handler,
         )(*args)

         csevar = V.kernel.cse.generate(
@@ -2024,7 +2025,20 @@ def do_cse(v):
             bounds=bounds,
             dtype=output_dtype,
         )
+
+        nonlocal idx
+        if config.test_configs.runtime_triton_dtype_assert:
+            from torch._inductor.codegen.triton import triton_type
+
+            if isinstance(output_dtype, (list, tuple)):
+                output_dtype = output_dtype[idx]
+            V.kernel.compute.writeline(
+                f"tl.static_assert({csevar}.dtype == {triton_type(output_dtype)})"
+            )
+            idx += 1
+
         csevar.update_on_args(name, args, kwargs)
+
         return csevar

     return pytree.tree_map(do_cse, value)
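With the flag enabled, every CSE'd op result in the generated Triton kernel is followed by a static assert that pins its dtype to what DtypePropagationOpsHandler predicted. Because tl.static_assert is evaluated when Triton compiles the kernel, a mispredicted dtype surfaces as a compile-time error rather than silently wrong codegen. An illustrative excerpt of what a generated kernel body might then contain (variable names are made up, not taken from the commit):

tmp0 = tl.load(in_ptr0 + (x0), xmask)
tmp1 = tl.sigmoid(tmp0)
tl.static_assert(tmp1.dtype == tl.float32)
tmp2 = tmp1 + 1.0
tl.static_assert(tmp2.dtype == tl.float32)
tl.store(out_ptr0 + (x0), tmp2, xmask)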

torch/_inductor/codegen/triton.py
Lines changed: 3 additions & 1 deletion

@@ -1087,7 +1087,9 @@ def sigmoid(x):
     @staticmethod
     def signbit(x):
         # XX: This is wrong for the value -0.0 in floating point
-        return f"libdevice.signbit({x}) if ({x}).dtype is tl.float32 else {x} < 0"
+        return (
+            f"(libdevice.signbit({x}) != 0) if ({x}).dtype is tl.float32 else {x} < 0"
+        )

     @staticmethod
     def fmod(a, b):
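The added != 0 is what makes signbit survive the new asserts: libdevice.signbit returns an integer value, not a boolean, so the two branches of the ternary previously produced different dtypes (an integer on the float32 path, tl.int1 from x < 0 on the other). A small sketch of the normalized branch, assuming libdevice is importable from triton.language.extra (the import path varies across Triton versions):

import triton
import triton.language as tl
from triton.language.extra import libdevice  # import path varies by Triton version

@triton.jit
def signbit_kernel(in_ptr, out_ptr, N: tl.constexpr):
    offs = tl.arange(0, N)
    x = tl.load(in_ptr + offs)
    # Comparing against 0 turns libdevice.signbit's integer result into
    # tl.int1, matching what (x < 0) produces on the non-float32 branch.
    sb = libdevice.signbit(x) != 0
    tl.static_assert(sb.dtype == tl.int1)
    tl.store(out_ptr + offs, sb)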

torch/_inductor/config.py
Lines changed: 2 additions & 0 deletions

@@ -1314,6 +1314,8 @@ class trace:
 class test_configs:
     force_extern_kernel_in_multi_template = False

+    runtime_triton_dtype_assert = False
+

 if TYPE_CHECKING:
     from torch.utils._config_typing import *  # noqa: F401, F403
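The flag defaults to False, so the asserts cost nothing unless opted into. The test enables it with the @config.patch decorator shown above; for ad-hoc debugging it can also be set directly (a sketch; the scoped patch form is preferable in tests):

from torch._inductor import config

# Enable the compile-time dtype asserts for all subsequent compilations.
config.test_configs.runtime_triton_dtype_assert = True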

0 commit comments