[MPSInductor] Cast halfs to floats (#151246) · pytorch/pytorch@46ce8f7
Commit 46ce8f7

malfet authored and pytorchmergebot committed
[MPSInductor] Cast halfs to floats (#151246)
To avoid accuracy issues when small reductions are unrolled, cast half to float during the `load` op, as `op_math_t<half>` is indeed float. This fixes `test_unroll_small_reduction` for reduced-precision types.

Pull Request resolved: #151246
Approved by: https://github.com/dcci
ghstack dependencies: #151224
1 parent 0a6e1d6 commit 46ce8f7
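
For context, a minimal sketch of the accuracy issue the cast avoids: when every partial sum of a reduction is rounded back to half precision, the result drifts from what a float32 accumulator produces. The shapes and values below are illustrative, not taken from the PR.

import torch

# 0.1 rounds to 0.0999755859375 in float16; 2048 of them sum to exactly 204.75.
vals = torch.full((2048,), 0.1, dtype=torch.float16)

acc16 = torch.tensor(0.0, dtype=torch.float16)
for v in vals:
    acc16 = acc16 + v  # every partial sum is rounded back to float16

acc32 = vals.float().sum()  # upcast once, accumulate in float32 (exact here)

# The float16 accumulator typically drifts visibly from 204.75.
print(acc16.item(), acc32.item())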

File tree: 2 files changed, +9 -1 lines changed


test/inductor/test_mps_basic.py

Lines changed: 1 addition & 0 deletions
@@ -234,6 +234,7 @@ def fn(a):
     "test_sum_int",
     "test_sum_keepdims",
     "test_tanh",
+    "test_unroll_small_reduction",
     "test_vectorized_ops_masked",
     "test_var_mean_tile_reduction_True",
     "test_view_as_complex",

torch/_inductor/codegen/mps.py

Lines changed: 8 additions & 1 deletion
@@ -487,8 +487,15 @@ def load(self, name: str, index: sympy.Expr) -> CSEVariable:
         """Codegen a load from an InputBuffer"""
         var = self.args.input(name)
         index = self.prepare_indexing(index)
+        dtype = V.graph.get_dtype(name)
         line = f"{var}[{self.index_to_str(index)}]"
-        return self.cse.generate(self.loads, line, dtype=V.graph.get_dtype(name))
+        if dtype in [torch.float16, torch.bfloat16]:
+            # TODO(NS): Figure out the right balance between optype casts
+            # op_math_t for half-precision floats should be float32
+            # Otherwise it can lead to correctness issues with eager
+            line = f"static_cast<float>({line})"
+            dtype = torch.float32
+        return self.cse.generate(self.loads, line, dtype=dtype)

     def store(
         self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None
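
The effect on the emitted Metal source can be mirrored in isolation. A minimal sketch, assuming a float16 input buffer named in_ptr0 and index x0 (both names illustrative, and emit_load is a stand-in, not the actual MPSInductor method):

import torch

def emit_load(var: str, index: str, dtype: torch.dtype) -> tuple[str, torch.dtype]:
    # Mirror of the codegen decision above: wrap half-precision loads in a
    # cast so downstream arithmetic happens in float32, matching op_math_t.
    line = f"{var}[{index}]"
    if dtype in [torch.float16, torch.bfloat16]:
        line = f"static_cast<float>({line})"
        dtype = torch.float32
    return line, dtype

print(emit_load("in_ptr0", "x0", torch.float16))
# -> ('static_cast<float>(in_ptr0[x0])', torch.float32)
print(emit_load("in_ptr0", "x0", torch.float32))
# -> ('in_ptr0[x0]', torch.float32)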
