 assert torch.get_default_dtype() is torch.float32


-@contextlib.contextmanager
-def blas_library_context(backend):
-    prev_backend = torch.backends.cuda.preferred_blas_library()
-    torch.backends.cuda.preferred_blas_library(backend)
-    try:
-        yield
-    finally:
-        torch.backends.cuda.preferred_blas_library(prev_backend)
-
 class TestMatmulCuda(TestCase):
     def setUp(self):
         super().setUp()
@@ -150,10 +141,8 @@ def cublas_addmm(self, size: int, dtype: torch.dtype, reduced_precision: bool =
                         torch.float32: xtol(atol=1e-1, rtol=1e-1)})
     @dtypes(torch.float16, torch.bfloat16, torch.float32)
     @parametrize("size", [100, 1000, 10000])
-    @parametrize("backend", ["cublas", "cublaslt"])
-    def test_cublas_addmm(self, size: int, dtype: torch.dtype, backend):
-        with blas_library_context(backend):
-            self.cublas_addmm(size, dtype, False)
+    def test_cublas_addmm(self, size: int, dtype: torch.dtype):
+        self.cublas_addmm(size, dtype, False)

     @onlyCUDA
     @skipIfRocmVersionLessThan((5, 2))
@@ -162,10 +151,8 @@ def test_cublas_addmm(self, size: int, dtype: torch.dtype, backend):
                         torch.bfloat16: xtol(atol=1e1, rtol=2e-1)})
     @dtypes(torch.float16, torch.bfloat16)
     @parametrize("size", [100, 1000, 10000])
-    @parametrize("backend", ["cublas", "cublaslt"])
-    def test_cublas_addmm_reduced_precision(self, size: int, dtype: torch.dtype, backend):
-        with blas_library_context(backend):
-            self.cublas_addmm(size, dtype, True)
+    def test_cublas_addmm_reduced_precision(self, size: int, dtype: torch.dtype):
+        self.cublas_addmm(size, dtype, True)

     @onlyCUDA
     @skipIfRocmVersionLessThan((5, 2))
@@ -174,10 +161,8 @@ def test_cublas_addmm_reduced_precision(self, size: int, dtype: torch.dtype, bac
                         torch.bfloat16: xtol(atol=1e1, rtol=2e-1)})
     @dtypes(torch.float16, torch.bfloat16)
     @parametrize("size", [100, 1000, 10000])
-    @parametrize("backend", ["cublas", "cublaslt"])
-    def test_cublas_addmm_reduced_precision_fp16_accumulate(self, size: int, dtype: torch.dtype, backend):
-        with blas_library_context(backend):
-            self.cublas_addmm(size, dtype, False, True)
+    def test_cublas_addmm_reduced_precision_fp16_accumulate(self, size: int, dtype: torch.dtype):
+        self.cublas_addmm(size, dtype, False, True)

     @onlyCUDA
     @skipIfRocm
@@ -471,48 +456,49 @@ def test_grouped_gemm_3d_2d(self, strided, a_row_major, b_row_major):
     def test_mm_bmm_dtype_overload(self, input_dtype, M, N, K, batch_size, backend):
         device = "cuda"
         dtype = input_dtype
-        with blas_library_context(backend):
-            def create_inputs(B=None):
-                if B is None:
-                    a = torch.randn(M, K, device=device, dtype=dtype)
-                    b = torch.randn(K, N, device=device, dtype=dtype)
-                else:
-                    a = torch.randn(B, M, K, device=device, dtype=dtype)
-                    b = torch.randn(B, K, N, device=device, dtype=dtype)
-                return a, b
+        torch.backends.cuda.preferred_blas_library(backend)

-            a, b = create_inputs(batch_size)
+        def create_inputs(B=None):
+            if B is None:
+                a = torch.randn(M, K, device=device, dtype=dtype)
+                b = torch.randn(K, N, device=device, dtype=dtype)
+            else:
+                a = torch.randn(B, M, K, device=device, dtype=dtype)
+                b = torch.randn(B, K, N, device=device, dtype=dtype)
+            return a, b
+
+        a, b = create_inputs(batch_size)

-            a_fp32, b_fp32 = a.to(torch.float32), b.to(torch.float32)
+        a_fp32, b_fp32 = a.to(torch.float32), b.to(torch.float32)

-            output_dtypes = [torch.float32]
+        output_dtypes = [torch.float32]

-            if input_dtype != torch.float32:
-                output_dtypes.append(input_dtype)
+        if input_dtype != torch.float32:
+            output_dtypes.append(input_dtype)

-            for output_dtype in output_dtypes:
-                # Catch edge case of incompat with bfloat16 and major version < 8
-                if input_dtype == torch.bfloat16 and not PLATFORM_SUPPORTS_BF16:
-                    if output_dtype == torch.bfloat16:
-                        continue
+        for output_dtype in output_dtypes:
+            # Catch edge case of incompat with bfloat16 and major version < 8
+            if input_dtype == torch.bfloat16 and not PLATFORM_SUPPORTS_BF16:
+                if output_dtype == torch.bfloat16:
+                    continue

-                    if batch_size:
-                        with self.assertRaises(RuntimeError):
-                            torch.bmm(a, b, out_dtype=output_dtype)
-                    else:
-                        with self.assertRaises(RuntimeError):
-                            torch.mm(a, b, out_dtype=output_dtype)
+                if batch_size:
+                    with self.assertRaises(RuntimeError):
+                        torch.bmm(a, b, out_dtype=output_dtype)
                 else:
-                    if batch_size:
-                        out = torch.bmm(a, b, out_dtype=output_dtype)
-                        baseline = torch.bmm(a_fp32, b_fp32) if output_dtype == torch.float32 else torch.bmm(a, b)
-                    else:
-                        out = torch.mm(a, b, out_dtype=output_dtype)
-                        baseline = torch.mm(a_fp32, b_fp32) if output_dtype == torch.float32 else torch.mm(a, b)
+                    with self.assertRaises(RuntimeError):
+                        torch.mm(a, b, out_dtype=output_dtype)
+            else:
+                if batch_size:
+                    out = torch.bmm(a, b, out_dtype=output_dtype)
+                    baseline = torch.bmm(a_fp32, b_fp32) if output_dtype == torch.float32 else torch.bmm(a, b)
+                else:
+                    out = torch.mm(a, b, out_dtype=output_dtype)
+                    baseline = torch.mm(a_fp32, b_fp32) if output_dtype == torch.float32 else torch.mm(a, b)

-                    self.assertEqual(out.dtype, output_dtype)
+                self.assertEqual(out.dtype, output_dtype)

-                    torch.testing.assert_close(out, baseline, atol=1e-3, rtol=1e-3)
+                torch.testing.assert_close(out, baseline, atol=1e-3, rtol=1e-3)


     @onlyCUDA
@@ -526,56 +512,51 @@ def create_inputs(B=None):
     def test_addmm_baddmm_dtype_overload(self, input_dtype, M, N, K, batch_size, backend):
         device = "cuda"
         dtype = input_dtype
-        with blas_library_context(backend):
-            def create_inputs(B=None):
-                if B is None:
-                    a = torch.randn(M, K, device=device, dtype=dtype)
-                    b = torch.randn(K, N, device=device, dtype=dtype)
-                    c = torch.randn(M, N, device=device, dtype=dtype)
-                else:
-                    a = torch.randn(B, M, K, device=device, dtype=dtype)
-                    b = torch.randn(B, K, N, device=device, dtype=dtype)
-                    c = torch.randn(B, M, N, device=device, dtype=dtype)
+        torch.backends.cuda.preferred_blas_library(backend)

-                return a, b, c
+        def create_inputs(B=None):
+            if B is None:
+                a = torch.randn(M, K, device=device, dtype=dtype)
+                b = torch.randn(K, N, device=device, dtype=dtype)
+                c = torch.randn(M, N, device=device, dtype=dtype)
+            else:
+                a = torch.randn(B, M, K, device=device, dtype=dtype)
+                b = torch.randn(B, K, N, device=device, dtype=dtype)
+                c = torch.randn(B, M, N, device=device, dtype=dtype)

-            a, b, c = create_inputs(batch_size)
+            return a, b, c

-            a_fp32, b_fp32, c_fp32 = a.to(torch.float32), b.to(torch.float32), c.to(torch.float32)
+        a, b, c = create_inputs(batch_size)

-            output_dtypes = [torch.float32]
+        a_fp32, b_fp32, c_fp32 = a.to(torch.float32), b.to(torch.float32), c.to(torch.float32)

-            if input_dtype != torch.float32:
-                output_dtypes.append(input_dtype)
+        output_dtypes = [torch.float32]

-            for output_dtype in output_dtypes:
-                # Catch edge case of incompat with bfloat16 and major version < 8
-                if input_dtype == torch.bfloat16 and not PLATFORM_SUPPORTS_BF16:
-                    if output_dtype == torch.bfloat16:
-                        continue
+        if input_dtype != torch.float32:
+            output_dtypes.append(input_dtype)

-                    if batch_size:
-                        with self.assertRaises(RuntimeError):
-                            torch.baddbmm(c, a, b, out_dtype=output_dtype)
-                    else:
-                        with self.assertRaises(RuntimeError):
-                            torch.addmm(c, a, b, out_dtype=output_dtype)
+        for output_dtype in output_dtypes:
+            # Catch edge case of incompat with bfloat16 and major version < 8
+            if input_dtype == torch.bfloat16 and not PLATFORM_SUPPORTS_BF16:
+                if output_dtype == torch.bfloat16:
+                    continue
+
+                if batch_size:
+                    with self.assertRaises(RuntimeError):
+                        torch.baddbmm(c, a, b, out_dtype=output_dtype)
+                else:
+                    with self.assertRaises(RuntimeError):
+                        torch.addmm(c, a, b, out_dtype=output_dtype)
+            else:
+                if batch_size:
+                    out = torch.baddbmm(c, a, b, out_dtype=output_dtype)
+                    baseline = torch.baddbmm(c_fp32, a_fp32, b_fp32) if output_dtype == torch.float32 else torch.baddbmm(c, a, b)
                 else:
-                    if batch_size:
-                        out = torch.baddbmm(c, a, b, out_dtype=output_dtype)
-                        if output_dtype == torch.float32:
-                            baseline = torch.baddbmm(c_fp32, a_fp32, b_fp32)
-                        else:
-                            baseline = torch.baddbmm(c, a, b)
-                    else:
-                        out = torch.addmm(c, a, b, out_dtype=output_dtype)
-                        if output_dtype == torch.float32:
-                            baseline = torch.addmm(c_fp32, a_fp32, b_fp32)
-                        else:
-                            baseline = torch.addmm(c, a, b)
-
-                    self.assertEqual(out.dtype, output_dtype)
-                    torch.testing.assert_close(out, baseline, atol=1e-3, rtol=1e-3)
+                    out = torch.addmm(c, a, b, out_dtype=output_dtype)
+                    baseline = torch.addmm(c_fp32, a_fp32, b_fp32) if output_dtype == torch.float32 else torch.addmm(c, a, b)
+
+                self.assertEqual(out.dtype, output_dtype)
+                torch.testing.assert_close(out, baseline, atol=1e-3, rtol=1e-3)


     @onlyCUDA
@@ -586,36 +567,35 @@ def test_fp16_accum_and_fp32_out_failure(self, batch_size, backend):
         M, N, K = 32, 32, 32
         device = "cuda"
         dtype = torch.float16
-        with blas_library_context(backend):
-            torch.backends.cuda.preferred_blas_library(backend)
+        torch.backends.cuda.preferred_blas_library(backend)

-            orig_fp16_accum = torch.backends.cuda.matmul.allow_fp16_accumulation
-            torch.backends.cuda.matmul.allow_fp16_accumulation = True
+        orig_fp16_accum = torch.backends.cuda.matmul.allow_fp16_accumulation
+        torch.backends.cuda.matmul.allow_fp16_accumulation = True

-            def create_inputs():
-                a = torch.randn(M, K, device=device, dtype=dtype)
-                b = torch.randn(K, N, device=device, dtype=dtype)
-                c = torch.randn(M, N, device=device, dtype=dtype)
-                return a, b, c
+        def create_inputs():
+            a = torch.randn(M, K, device=device, dtype=dtype)
+            b = torch.randn(K, N, device=device, dtype=dtype)
+            c = torch.randn(M, N, device=device, dtype=dtype)
+            return a, b, c

-            def expand(tensor):
-                return tensor.unsqueeze(0).expand(batch_size, *tensor.shape)
+        def expand(tensor):
+            return tensor.unsqueeze(0).expand(batch_size, *tensor.shape)

-            a, b, c = create_inputs()
+        a, b, c = create_inputs()

-            with self.assertRaises(Exception):
-                torch.baddbmm(expand(c), expand(a), expand(b), out_dtype=torch.float32)
+        with self.assertRaises(Exception):
+            torch.baddbmm(expand(c), expand(a), expand(b), out_dtype=torch.float32)

-            with self.assertRaises(Exception):
-                torch.addmm(c, a, b, out_dtype=torch.float32)
+        with self.assertRaises(Exception):
+            torch.addmm(c, a, b, out_dtype=torch.float32)

-            with self.assertRaises(Exception):
-                torch.bmm(expand(a,), expand(b), out_dtype=torch.float32)
+        with self.assertRaises(Exception):
+            torch.bmm(expand(a,), expand(b), out_dtype=torch.float32)

-            with self.assertRaises(Exception):
-                torch.mm(a, b, out_dtype=torch.float32)
+        with self.assertRaises(Exception):
+            torch.mm(a, b, out_dtype=torch.float32)

-            torch.backends.cuda.matmul.allow_fp16_accumulation = orig_fp16_accum
+        torch.backends.cuda.matmul.allow_fp16_accumulation = orig_fp16_accum

 f8_msg = "FP8 is only supported on H100+, SM 8.9 and MI300+ devices"
 mx_skip_msg = "MX gemm is only supported on CUDA capability 10.0+"
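For reference, below is a minimal sketch of the helper this diff removes, reconstructed from the deleted lines above. It saved the current cuBLAS/cuBLASLt preference, switched to the requested backend, and restored the saved value on exit. After this change the tests instead call torch.backends.cuda.preferred_blas_library(backend) directly and do not restore the previous preference.

    # Sketch reconstructed from the removed lines above; not part of the new file.
    import contextlib

    import torch


    @contextlib.contextmanager
    def blas_library_context(backend):
        # Save the current cuBLAS/cuBLASLt preference, switch to `backend`,
        # and restore the saved preference when the block exits.
        prev_backend = torch.backends.cuda.preferred_blas_library()
        torch.backends.cuda.preferred_blas_library(backend)
        try:
            yield
        finally:
            torch.backends.cuda.preferred_blas_library(prev_backend)


    # After this commit the tests set the preference directly instead, e.g.:
    #     torch.backends.cuda.preferred_blas_library("cublaslt")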