Inductor to fail gracefully on Voltas for bf16 tensors · pytorch/pytorch@ddf4bb2 · GitHub
Commit ddf4bb2

Inductor to fail gracefully on Voltas for bf16 tensors
Voltas do not have hardware support for the bfloat16 datatype, but the type is emulated in software, so PyTorch eager mode can use bfloat16 tensors while Triton cannot. If a graph has either CUDA bf16 input or output tensors, raise a warning and skip the frame. Fixes #118122 and #118581
1 parent 1c75ddf commit ddf4bb2
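
As a hedged illustration of the behavior described above (not part of the commit), compiling a function that takes CUDA bf16 tensors on a Volta-class GPU should now warn and fall back to eager execution instead of failing inside Triton. The toy function, shapes, and the device name in the comment are assumptions for the example:

# Illustrative sketch: assumes a Volta-class GPU (e.g. V100), where eager-mode
# bfloat16 works via software emulation but Triton has no native bf16 support.
import torch

def f(x):
    return torch.nn.functional.gelu(x) * 2

x = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
compiled = torch.compile(f)

# Expected on Volta after this commit: a warning along the lines of
# "Tesla V100-SXM2-16GB does not support bfloat16 compilation natively, skipping",
# after which the frame is skipped and runs in eager mode.
out = compiled(x)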

File tree

2 files changed: +33 -1 lines changed


torch/_inductor/compile_fx.py

Lines changed: 29 additions & 0 deletions
@@ -858,6 +858,7 @@ def fx_codegen_and_compile(
                     else:
                         output_strides.append(None)
 
+            _check_triton_bf16_support(graph)
             compiled_fn = graph.compile_to_fn()
             num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
             metrics.num_bytes_accessed += num_bytes
@@ -1628,3 +1629,31 @@ def wrapper(*args):
         return codegen.process_outputs(compiled_fn(*codegen.process_inputs(*args)))
 
     return wrapper
+
+
+def _check_triton_bf16_support(graph: GraphLowering) -> None:
+    def warn_and_skip(device) -> None:
+        from torch._dynamo.exc import SkipFrame
+        device_props = torch.cuda.get_device_properties(device)
+        warnings.warn(f"{device_props.name} does not support bfloat16 compilation natively, skipping")
+        raise SkipFrame("BF16 is not supported")
+
+    for inp in graph.graph_inputs.values():
+        device = inp.get_device()
+        if device.type != "cuda" or inp.get_dtype() != torch.bfloat16:
+            continue
+        # Print warning and skip frame if attempting to compile for bfloat16
+        # on device without hardware support for dtype
+        if torch.cuda.is_bf16_supported(including_emulation=False):
+            return
+        warn_and_skip(device)
+
+    for out in graph.graph_outputs:
+        device = out.get_device()
+        if device.type != "cuda" or out.get_dtype() != torch.bfloat16:
+            continue
+        # Print warning and skip frame if attempting to compile for bfloat16
+        # on device without hardware support for dtype
+        if torch.cuda.is_bf16_supported(including_emulation=False):
+            return
+        warn_and_skip(device)
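
_check_triton_bf16_support raises torch._dynamo.exc.SkipFrame, which tells Dynamo to leave the frame uncompiled so it keeps running in eager mode, where emulated bf16 works. For context, here is a minimal standalone sketch of the same guard written against plain tensors rather than Inductor's GraphLowering; the helper name and tensor-based signature are illustrative assumptions, not part of this commit:

import warnings
import torch

def tensors_need_bf16_skip(tensors) -> bool:
    """Return True if any CUDA bf16 tensor sits on a device without native
    bf16 support, mirroring the check added in this commit."""
    for t in tensors:
        if t.device.type != "cuda" or t.dtype != torch.bfloat16:
            continue
        # Strict check: ignore the software-emulation path that eager mode uses.
        if torch.cuda.is_bf16_supported(including_emulation=False):
            return False
        props = torch.cuda.get_device_properties(t.device)
        warnings.warn(f"{props.name} does not support bfloat16 compilation natively, skipping")
        return True
    return False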

torch/cuda/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -128,7 +128,7 @@ def is_available() -> bool:
     return torch._C._cuda_getDeviceCount() > 0
 
 
-def is_bf16_supported():
+def is_bf16_supported(including_emulation: bool = True):
     r"""Return a bool indicating if the current CUDA/ROCm device supports dtype bfloat16."""
     # Check for ROCm, if true return true, no ROCM_VERSION check required,
     # since it is supported on AMD GPU archs.
@@ -147,6 +147,9 @@ def is_bf16_supported():
     ):
         return True
 
+    if not including_emulation:
+        return False
+
     # Finally try to create a bfloat16 device.
     return _check_bf16_tensor_supported(device)
 
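
A short usage sketch of the extended check (illustrative; the commented results assume a Volta-class GPU such as a V100, where bf16 is emulated in software):

import torch

if torch.cuda.is_available():
    # Default behavior is unchanged: emulation counts, so eager-mode bf16 keeps working.
    print(torch.cuda.is_bf16_supported())                           # True on Volta (emulated)
    # Strict check used by Inductor: only native hardware bf16 counts.
    print(torch.cuda.is_bf16_supported(including_emulation=False))  # False on Volta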

0 commit comments