[Inductor] Expand dtype aware codegen for libdevice and tl.math ops (… · pytorch/pytorch@80ca6dd · GitHub

Commit 80ca6dd

blaine-rister and eellison authored and committed
[Inductor] Expand dtype aware codegen for libdevice and tl.math ops (#140864)
# Feature

Previously, only the codegen for `torch.sqrt` was dtype aware. This PR updates most of the `libdevice`/`tl.math` ops to support dtype-aware codegen as well. This is often necessary to get correct code when `config.triton.codegen_upcast_to_fp32=False`, as most Triton math ops do not support float16/bfloat16.

This PR enables dtype-aware codegen via the `maybe_upcast_float32` decorator. This wraps `TritonOverrides` macros to upcast arguments to float32 and downcast the result back to the original dtype. The exception is ops that return booleans, in which case we set `convert_output=False` and skip the output cast.

# Test Plan

Added CI tests for all the new ops. The list of ops to test is automatically generated based on uses of the `maybe_upcast_float32` decorator, and stored in the new `OpDtypeSupport` class. In each new test, we search the generated code for upcasts/downcasts using a regex. Also added a unit test for `OpDtypeSupport` which checks that we have correct dtype info for ops that require upcasts.

This PR also moves some existing tests around, to collect all the dtype-aware codegen tests in one file.

Pull Request resolved: #140864
Approved by: https://github.com/eellison, https://github.com/arui-meta

Co-authored-by: eellison <elias.ellison@gmail.com>
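As a rough illustration of the decorator pattern described above: the following is a minimal, string-based sketch only, not the actual implementation in `torch/_inductor/codegen/triton.py` (the real decorator operates on Inductor's codegen/CSE variables and dtype propagation, and registers each op in `OpDtypeSupport`).

```python
from functools import wraps


def maybe_upcast_float32(convert_output: bool = True):
    """Sketch of the idea: upcast fp16/bf16 operands to fp32 and cast the
    result back, since most tl.math/libdevice ops lack fp16/bf16 support."""

    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, dtype="tl.float32"):
            # Hypothetical string-based codegen, for illustration only.
            if dtype in ("tl.float16", "tl.bfloat16"):
                args = [f"{a}.to(tl.float32)" for a in args]
                expr = fn(*args)
                # Boolean-valued ops (convert_output=False) skip the downcast.
                return f"{expr}.to({dtype})" if convert_output else expr
            return fn(*args)

        return wrapper

    return decorator


@maybe_upcast_float32()
def floor(x):
    return f"libdevice.floor({x})"


@maybe_upcast_float32(convert_output=False)
def isnan(x):
    return f"libdevice.isnan({x})"


print(floor("tmp3", dtype="tl.float16"))
# libdevice.floor(tmp3.to(tl.float32)).to(tl.float16)
print(isnan("tmp3", dtype="tl.float16"))
# libdevice.isnan(tmp3.to(tl.float32))
```

With `convert_output=False` (e.g. for boolean-valued ops such as `isnan`), the trailing cast back to the original dtype is skipped.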
1 parent 0602676 commit 80ca6dd

File tree

3 files changed, +263 -40 lines changed

test/inductor/test_op_dtype_prop.py

Lines changed: 137 additions & 1 deletion
@@ -1,16 +1,21 @@
 # Owner(s): ["module: inductor"]
 import importlib
 import os
+import re
 import sys

 import torch
 from torch._dynamo.utils import disable_cache_limit
 from torch._inductor import config
+from torch._inductor.codegen.triton import OpDtypeSupport
 from torch._inductor.test_case import TestCase as InductorTestCase
-from torch._inductor.utils import run_and_get_code
+from torch._inductor.utils import run_and_get_code, run_and_get_triton_code
+from torch.fx.operator_schemas import get_signature_for_torch_op
 from torch.testing import FileCheck
 from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_methods_invocations import op_db
+from torch.testing._internal.common_utils import parametrize
+from torch.testing._internal.inductor_utils import GPU_TYPE, requires_gpu


 # Make the helper files in test/ importable
@@ -75,6 +80,137 @@ def run(op, args, kwargs):
         out_c = torch.compile(run)(op.get_op(), args, kwargs)
         self.assertEqual(out, out_c)

+    @requires_gpu()
+    @parametrize("upcast_to_fp32", [False, True])
+    @config.patch("triton.use_block_ptr", True)
+    def test_codegen_upcast_to_fp32(self, upcast_to_fp32):
+        @torch.compile
+        def func(a, b, c, d):
+            return a * b * c * d
+
+        inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=torch.float16),) * 4
+        with config.patch("triton.codegen_upcast_to_fp32", upcast_to_fp32):
+            func_opt = torch._dynamo.optimize("inductor")(func)
+            code = run_and_get_triton_code(func_opt, *inps)
+            fp32_cast_in_code = "to(tl.float32)" in code
+            self.assertEqual(fp32_cast_in_code, upcast_to_fp32)
+
+    def test_op_dtype_support(self):
+        """
+        Triton codegen upcasts values to float32 for certain ops.
+        Check that those ops have accurate dtype information.
+        """
+
+        for op_name in [
+            "rsqrt",
+            "sqrt",
+            "isnan",
+            "floor",
+            "ceil",
+            "tan",
+            "atan",
+            "atanh",
+            "sigmoid",
+            "log2",
+            "log10",
+            "cosh",
+            "sinh",
+            "acosh",
+            "asinh",
+            "asin",
+            "acos",
+            "asinh",
+            "erf",
+            "lgamma",
+            "sin",
+            "cos",
+            "exp",
+            "expm1",
+            "exp2",
+            "abs",
+            "hypot",
+            "nextafter",
+        ]:
+            # These ops do not support float16 and bfloat16.
+            supported_dtypes = OpDtypeSupport.supported_dtypes[op_name]
+            self.assertNotIn(torch.float16, supported_dtypes)
+            self.assertNotIn(torch.bfloat16, supported_dtypes)
+
+            # These ops should support float32 and float64.
+            self.assertIn(torch.float32, supported_dtypes)
+            self.assertIn(torch.float64, supported_dtypes)
+
+    @requires_gpu()
+    @parametrize("op_name", OpDtypeSupport.supported_dtypes)
+    @parametrize("load_upcast_to_fp32", [False, True])
+    @parametrize("input_dtype", [torch.float16, torch.bfloat16])
+    @config.patch("triton.use_block_ptr", True)
+    def test_dtype_aware_codegen(self, op_name: str, load_upcast_to_fp32, input_dtype):
+        """
+        Test dtype aware codegen for some tl.math/libdevice calls.
+        Operands should be upcast to float32, and the output should be downcast to float16.
+        """
+
+        # Check if the op's output should be upcasted/downcasted.
+        supported_dtypes = OpDtypeSupport.supported_dtypes[op_name]
+        convert_output = OpDtypeSupport.convert_outputs[op_name]
+        self.assertNotIn(input_dtype, supported_dtypes)
+
+        # Retrieve the corresponding torch op.
+        torch_op_name = op_name.removeprefix("libdevice_")
+        op = getattr(torch, torch_op_name)
+
+        # Edge case: torch.round maps to libdevice.nearbyint.
+        triton_op_name_overrides = {
+            "round": "nearbyint",
+        }
+        override = triton_op_name_overrides.get(op_name)
+        triton_op_name = override if override is not None else torch_op_name
+
+        # Get the number of args for the op.
+        signatures = get_signature_for_torch_op(op)
+        num_args = len(signatures[0].parameters)
+
+        # Test codegen and check for casts.
+        inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=input_dtype),) * num_args
+        tl_dtype_str = str(input_dtype).replace("torch", "tl")
+        with config.patch("triton.codegen_upcast_to_fp32", load_upcast_to_fp32):
+            compiled = torch._dynamo.optimize("inductor")(op)
+            code = run_and_get_triton_code(compiled, *inps)
+
+        # Search the code with a regex.
+        # Example code: libdevice.floor(tmp3.to(tl.float32)).to(tl.float16)
+        output_cast = rf"\.to\({tl_dtype_str}\)" if convert_output else ""
+        pattern = rf"{triton_op_name}\(.*\.to\(tl\.float32\)\){output_cast}"
+        cast_in_code = re.search(pattern, code, re.MULTILINE) is not None
+        self.assertNotEqual(cast_in_code, load_upcast_to_fp32)
+
+    @config.patch("triton.codegen_upcast_to_fp32", False)
+    def test_binary_math_mixed_precision(self):
+        """
+        Test a binary math operator where only one input needs to be upcast.
+        """
+        # Create inputs of different dtypes.
+        inputs = [
+            torch.randn(8, device=GPU_TYPE, dtype=dtype)
+            for dtype in (torch.float16, torch.float32)
+        ]
+
+        func = torch.hypot
+        compiled = torch.compile(backend="inductor")(func)
+        result, (code,) = run_and_get_code(compiled, *inputs)
+
+        # Check accuracy.
+        ref = func(*inputs)
+        self.assertTrue(torch.allclose(ref, result))
+
+        # Check for exactly one upcast.
+        num_upcasts = code.count(".to(tl.float32)")
+        self.assertEqual(num_upcasts, 1)
+
+        # There should be no downcast, since the input is promoted to float32.
+        self.assertNotIn(".to(tl.float16)", code)
+
     @config.patch("test_configs.runtime_triton_dtype_assert", True)
     def test_constant(self):
         def fn():
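For reference, a minimal sketch of how to observe the new behavior outside the test suite, mirroring `test_dtype_aware_codegen` above. This assumes a CUDA-capable device ("cuda" stands in for `GPU_TYPE`), and the exact temporary names in the generated kernel may differ.

```python
import torch
from torch._inductor import config
from torch._inductor.utils import run_and_get_triton_code

# Sketch: compile a float16 op with upcast-on-load disabled and inspect
# the generated Triton kernel for the per-op upcast/downcast pattern.
x = torch.rand((32, 32), device="cuda", dtype=torch.float16)

with config.patch("triton.codegen_upcast_to_fp32", False):
    compiled = torch.compile(torch.floor, backend="inductor")
    code = run_and_get_triton_code(compiled, x)

# Expected pattern (per the example comment in the test above), e.g.:
#   libdevice.floor(tmp3.to(tl.float32)).to(tl.float16)
print(".to(tl.float32)" in code and ".to(tl.float16)" in code)
```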

test/inductor/test_torchinductor.py

Lines changed: 0 additions & 31 deletions
@@ -12491,37 +12491,6 @@ def f(x, mask):
         # it does not move the tensor constructor to cuda and keeps it on CPU.
         self.assertFalse("empty_strided_cuda(()" in code)

-    @requires_gpu()
-    @parametrize("upcast_to_fp32", [False, True])
-    @config.patch("triton.use_block_ptr", True)
-    def test_codegen_upcast_to_fp32(self, upcast_to_fp32):
-        @torch.compile
-        def func(a, b, c, d):
-            return a * b * c * d
-
-        inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=torch.float16),) * 4
-        with config.patch("triton.codegen_upcast_to_fp32", upcast_to_fp32):
-            func_opt = torch._dynamo.optimize("inductor")(func)
-            code = run_and_get_triton_code(func_opt, *inps)
-            fp32_cast_in_code = "to(tl.float32)" in code
-            self.assertEqual(fp32_cast_in_code, upcast_to_fp32)
-
-    @requires_gpu()
-    @parametrize("load_upcast_to_fp32", [False, True])
-    @parametrize("input_dtype", [torch.float16, torch.bfloat16])
-    @config.patch("triton.use_block_ptr", True)
-    def test_dtype_aware_codegen(self, load_upcast_to_fp32, input_dtype):
-        @torch.compile
-        def func(a, b, c, d):
-            return torch.sqrt(a * b * c * d)
-
-        inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=input_dtype),) * 4
-        with config.patch("triton.codegen_upcast_to_fp32", load_upcast_to_fp32):
-            func_opt = torch._dynamo.optimize("inductor")(func)
-            code = run_and_get_triton_code(func_opt, *inps)
-            libdevice_cast_in_code = "libdevice.sqrt(tmp3.to(tl.float32))" in code
-            self.assertNotEqual(libdevice_cast_in_code, load_upcast_to_fp32)
-
     @config.patch("triton.use_block_ptr", False)
     def test_evict_last_non_coalesced_loads(self):
         @torch.compile

0 commit comments
