[Inductor] Skip triton templates for mixedmm on SM70- by malfet · Pull Request #118591 · pytorch/pytorch
Status: Closed · malfet wants to merge 2 commits.

Summary: skip Inductor's Triton mixed-mm templates on GPUs with CUDA compute capability 7.x or older (e.g., V100), where they hit numerical issues, and autotune over the aten mixed-mm fallback alone.
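For context, the "mixed mm" being tuned here is a matmul whose second operand is stored in a different dtype and cast at use time (hence the mat2_dtype argument visible in the diff below). A minimal sketch of the pattern, assuming a CUDA device and a PyTorch 2.x build; the function name f, the shapes, and the dtypes are illustrative and not from the PR:

    import torch

    @torch.compile
    def f(a, b):
        # The in-graph upcast of the int8 operand to the activation dtype
        # is the shape of computation Inductor can lower to mixed_mm.
        return torch.mm(a, b.to(a.dtype))

    if torch.cuda.is_available():
        a = torch.randn(64, 128, device="cuda", dtype=torch.float16)
        b = torch.randint(-8, 8, (128, 32), device="cuda", dtype=torch.int8)
        print(f(a, b).shape)  # torch.Size([64, 32])

Whether Inductor tries Triton template choices for this pattern also depends on configuration (note the inductor_config.force_mixed_mm check in the diff); on SM 7.x and older GPUs this PR skips the Triton choices regardless and keeps only the aten fallback.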
torch/_inductor/kernel/mm.py (15 changes: 12 additions & 3 deletions)
@@ -1,5 +1,6 @@
 import functools
 import logging
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import torch
+from torch._inductor.virtualized import V
@@ -259,11 +260,19 @@ def fallback_mixed_mm(mat1, mat2, *, out):
 aten_fallback_mixed_mm = ExternKernelChoice(fallback_mixed_mm, None)
 
 
+@functools.lru_cache(None)
+def _is_sm7x_or_older_gpu(index: Optional[int]) -> bool:
+    props = torch.cuda.get_device_properties(index or 0)
+    return props.major <= 7
+
+
 def tuned_mixed_mm(mat1, mat2, mat2_dtype):
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None)
     choices = [aten_fallback_mixed_mm.bind((mat1, mat2), layout)]
-    if mat1.layout.dtype != torch.float32 and not mat2.layout.is_contiguous():
-        # can't use triton kernel unless one of these is true
+    if (
+        mat1.layout.dtype != torch.float32 and not mat2.layout.is_contiguous()
+    ) or _is_sm7x_or_older_gpu(layout.device.index):
+        # can't use triton kernel unless one of these is true or if running on v100 (numerical issues)
         return autotune_select_algorithm("mixed_mm", choices, [mat1, mat2], layout)
     if inductor_config.force_mixed_mm:
         choices = []
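For readers without the surrounding file, here is a standalone sketch of the new guard, assuming a CUDA build of PyTorch; the probe at the bottom is illustrative and not part of the PR:

    import functools
    from typing import Optional

    import torch


    @functools.lru_cache(None)
    def _is_sm7x_or_older_gpu(index: Optional[int]) -> bool:
        # Compute capability is exposed as (major, minor): (7, 0) on V100,
        # (7, 5) on Turing, (8, 0) on A100. "index or 0" falls back to
        # device 0 when no explicit device index is available.
        props = torch.cuda.get_device_properties(index or 0)
        return props.major <= 7


    if torch.cuda.is_available():
        # True on SM 7.x and older; tuned_mixed_mm then autotunes over the
        # aten fallback alone instead of any Triton mixed-mm templates.
        print(_is_sm7x_or_older_gpu(torch.cuda.current_device()))

The functools.lru_cache(None) wrapper memoizes the result per device index, presumably so the device-properties query runs once per process rather than once per lowered matmul; note that "index or 0" also makes an explicit index of 0 and None behave identically.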