pytorch
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
Lines changed: 1 addition & 1 deletion
diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
Lines changed: 11 additions & 0 deletions
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
Lines changed: 11 additions & 1 deletion
@@ -286,7 +286,7 @@ resnext50_32x4d,pass,0
 
 
 
-sam,fail_to_run,0
+sam,pass,0
 
 
 
 
@@ -573,6 +573,17 @@ def forward(self, y):
                 model = LinearModel(device=self.device)
                 self.check_model(model, example_inputs)
 
+    def test_empty_cat_dtype_promotion(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x, y):
+                z = torch.cat([x, y], dim=1)
+                z = z.to(dtype=torch.bfloat16)
+                return z * 2
+
+        model = Foo()
+        inps = (torch.randn(4, 10, dtype=torch.bfloat16), torch.randn(4, 0))
+        self.check_model(model, inps)
+
     @unittest.skipIf(
         not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
     )
 
@@ -390,7 +390,17 @@ def non_empty_tensor(x: torch.Tensor) -> bool:
     filtered_tensors = list(filter(non_empty_tensor, tensors))
 
     if len(filtered_tensors) == 1:
-        return filtered_tensors[0].clone()
+        # check dtype promotion
+        promoted_dtype = elementwise_dtypes(
+            *tensors,
+            type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+        )[1]
+        filtered_t = filtered_tensors[0]
+        return (
+            filtered_t.clone()
+            if promoted_dtype == filtered_t.dtype
+            else filtered_t.to(dtype=promoted_dtype)
+        )
     elif 1 < len(filtered_tensors) < len(tensors):
         # on the first call, when we remove empty tensors, we redispatch recursively
         return aten.cat.default(filtered_tensors, dim)
Original file line number	Diff line number	Diff line change
`@@ -286,7 +286,7 @@ resnext50_32x4d,pass,0`
`286`	`286`
`287`	`287`
`288`	`288`
`289`		`-sam,fail_to_run,0`
	`289`	`+sam,pass,0`
`290`	`290`
`291`	`291`
`292`	`292`