[Inductor] add support for disabling atomic adds (#151033) · pytorch/pytorch@fe96167 · GitHub

Commit fe96167

mlazos authored and pytorchmergebot committed

[Inductor] add support for disabling atomic adds (#151033)

As title

Pull Request resolved: #151033
Approved by: https://github.com/eellison, https://github.com/shunting314

1 parent 67d3053 commit fe96167

File tree

3 files changed: +30 −0 lines changed

test/inductor/test_cuda_repro.py
torch/_inductor/config.py
torch/_inductor/utils.py

test/inductor/test_cuda_repro.py

Lines changed: 26 additions & 0 deletions

@@ -1926,6 +1926,32 @@ def f(x, y):
 
         self.assertEqual(f(x_ref, y_ref), out)
 
+    @unittest.skipIf(
+        not config.is_fbcode(),
+        "bfloat16 atomic add is only supported in fbcode today #97016",
+    )
+    @skipCUDAIf(
+        not SM90OrLater, "uses bfloat16 atomic add instrs which requires SM >= 90"
+    )
+    @config.patch({"bfloat16_atomic_adds_enabled": False})
+    def test_atomic_add_bfloat16_config(self):
+        def f(x, y):
+            return torch.index_select(x, 0, y)
+
+        x = torch.randn(
+            2000, 384, dtype=torch.bfloat16, device="cuda", requires_grad=True
+        )
+        y = torch.ones(713268, dtype=torch.int64, device="cuda")
+        x_ref = x.clone().detach().requires_grad_(True)
+        y_ref = y.clone().detach()
+
+        out, (_, bw_code) = run_fw_bw_and_get_code(lambda: torch.compile(f)(x, y))
+        fc = FileCheck()
+        fc.check_not("tl.atomic_add")
+        fc.run(bw_code)
+
+        self.assertEqual(f(x_ref, y_ref), out)
+
     @skipCUDAIf(
         not SM90OrLater, "uses bfloat16 atomic add instrs which requires SM >= 90"
     )
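
The new test patches the flag off and then asserts that the compiled backward kernel contains no tl.atomic_add call. The same check can be reproduced outside the test harness; a minimal sketch, with the import paths for run_fw_bw_and_get_code and FileCheck assumed to match what the test file uses (torch._inductor.utils and torch.testing, respectively):

# Sketch: confirm the backward kernel avoids bfloat16 atomic adds when the
# flag is disabled. Assumes a CUDA build of PyTorch; import paths are assumptions.
import torch
from torch._inductor import config
from torch._inductor.utils import run_fw_bw_and_get_code
from torch.testing import FileCheck


def f(x, y):
    return torch.index_select(x, 0, y)


x = torch.randn(2000, 384, dtype=torch.bfloat16, device="cuda", requires_grad=True)
y = torch.ones(713268, dtype=torch.int64, device="cuda")

with config.patch({"bfloat16_atomic_adds_enabled": False}):
    # Run forward + backward under torch.compile and capture the generated code.
    out, (_, bw_code) = run_fw_bw_and_get_code(lambda: torch.compile(f)(x, y))

# With the flag off, Inductor should fall back instead of emitting tl.atomic_add.
FileCheck().check_not("tl.atomic_add").run(bw_code)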

torch/_inductor/config.py

Lines changed: 3 additions & 0 deletions

@@ -173,6 +173,9 @@ def prologue_fusion_enabled() -> bool:
 # Enable to allow using ftz variant of exponenet instruction in triton codegen.
 use_fast_math = os.environ.get("TORCHINDUCTOR_USE_FAST_MATH") == "1"
 
+# Enable bfloat16 atomic adds (fbcode only until upstreamed to triton)
+bfloat16_atomic_adds_enabled = True
+
 # How to organize memory under memory_planning=True:
 # - "none": do not try to pool storage, just reuse
 # - "intermediates": all non-outputs share storage, outputs each get unique storage
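
The flag defaults to True and, per the comment, bfloat16 atomic adds remain fbcode-only until the support is upstreamed to Triton. A minimal sketch of toggling it globally before compiling (the index_select workload is illustrative, mirroring the test above; assumes a CUDA build):

# Sketch: disable bfloat16 atomic adds process-wide so Inductor falls back
# for the scatter/index-add style backward of index_select on bfloat16.
import torch
import torch._inductor.config as inductor_config

inductor_config.bfloat16_atomic_adds_enabled = False  # default is True


def f(x, y):
    return torch.index_select(x, 0, y)


x = torch.randn(64, 8, dtype=torch.bfloat16, device="cuda", requires_grad=True)
y = torch.randint(0, 64, (256,), device="cuda")

out = torch.compile(f)(x, y)
out.sum().backward()  # compiled backward no longer uses bfloat16 atomic adds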

torch/_inductor/utils.py

Lines changed: 1 addition & 0 deletions

@@ -2268,6 +2268,7 @@ def needs_fallback_due_to_atomic_add_limitations(dtype: torch.dtype) -> bool:
         and dtype == torch.bfloat16
         and torch.cuda.is_available()
         and torch.cuda.get_device_capability() >= (9, 0)
+        and config.bfloat16_atomic_adds_enabled
     ):
         return False
     else:
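
The added condition gates the existing hardware check: bfloat16 atomic adds are only considered when the device reports compute capability 9.0 or newer (the same SM >= 90 requirement the test skips on) and the new flag is left enabled. A standalone sketch of that capability check, with a hypothetical helper name that is not part of the commit:

# Sketch: hardware bfloat16 atomic adds require compute capability >= 9.0
# (e.g. SM90/H100-class GPUs), matching the check in the hunk above.
# The helper name below is hypothetical.
import torch


def bf16_atomic_add_hw_supported() -> bool:
    return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)


if __name__ == "__main__":
    print("bf16 atomic add hardware support:", bf16_atomic_add_hw_supported())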

0 commit comments