Add unit tests for dispatching the SDPA kernel to sbgemv in CPUBlas.cpp

taoye9 · pytorchmergebot · commit c52849fe2795 · 2025-05-14T07:16:41.000Z
diff --git a/test/test_linalg.py b/test/test_linalg.py
@@ -7770,6 +7770,27 @@ def test_fp16_mv_transposed_first_argument_arm_cpu(self, device, m, k):
         finally:
             torch._C._set_cpu_allow_fp16_reduced_precision_reduction(prev)
 
+    @onlyCPU
+    @dtypes(torch.bfloat16)
+    @parametrize("m", [32, 35, 36, 40, 64, 128])
+    @parametrize("k", [32, 35, 36, 40, 64, 128])
+    # NOTE: bf16 matrix @ (k, 1) vector, plain and transposed; intended to cover sbgemv_ in CPUBlas.cpp.
+    def test_lowprecision_gemv_cpu(self, device, dtype, m, k):
+        torch.manual_seed(1)
+        a = torch.rand((m, k), dtype=dtype, device=device)
+        b = torch.rand((k, 1), dtype=dtype, device=device)
+
+        ref = torch.mm(a.to(torch.float32), b.to(torch.float32))
+        res = torch.mm(a, b).to(torch.float32)
+        torch.testing.assert_close(res, ref, atol=1e-2, rtol=1e-2)
+
+        a = torch.rand((k, m), dtype=dtype, device=device)
+        b = torch.rand((k, 1), dtype=dtype, device=device)
+
+        ref = torch.mm(a.t().to(torch.float32), b.to(torch.float32))
+        res = torch.mm(a.t(), b).to(torch.float32)
+        torch.testing.assert_close(res, ref, atol=1e-2, rtol=1e-2)
+
     @slowTest
     @onlyNativeDeviceTypes
     # bfloat16 doesn't have sufficient precision to pass this test
diff --git a/test/test_transformers.py b/test/test_transformers.py
@@ -2009,7 +2009,7 @@ def test_fused_sdp_choice_cpu(self, device, type: str, dropout: float, dtype: to
     @parametrize("fused_kernel", [SDPBackend.FLASH_ATTENTION])
     @parametrize("dtype", [torch.float64, torch.float32, torch.bfloat16, torch.float16])
     @parametrize("batch_size", [2, 12])
-    @parametrize("q_seq_len", [11, 514, 1030])
+    @parametrize("q_seq_len", [1, 11, 514, 1030])
     @parametrize("kv_seq_len", [17, 514])
     @parametrize("n_head", [1, 3])
     @parametrize("head_dim", [8])