[Inductor][CPP] Fix Codegen Issue when Parallel Reduction under the vectorization · pytorch/pytorch@89a621d · GitHub
[go: up one dir, main page]

Skip to content

Commit 89a621d

Browse files
[Inductor][CPP] Fix Codegen Issue when Parallel Reduction under the vectorization
ghstack-source-id: 036aa97 Pull Request resolved: #151887
1 parent 9680016 commit 89a621d

File tree

3 files changed

+42
-3
lines changed

3 files changed

+42
-3
lines changed

test/inductor/test_cpu_repro.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,33 @@ def fn(x):
987987
# aten parallel.
988988
self.common(fn, (v,), atol=5e-1, rtol=5e-1)
989989

990+
def test_parallel_reduction_vectorization(self):
991+
# Fix issue: https://github.com/pytorch/pytorch/issues/151523
992+
class Model(torch.nn.Module):
993+
def __init__(self):
994+
super().__init__()
995+
self.conv = torch.nn.Conv2d(
996+
in_channels=3,
997+
out_channels=16,
998+
kernel_size=(1, 7),
999+
stride=(2, 1),
1000+
padding=0,
1001+
)
1002+
1003+
def forward(self, x, weight):
1004+
x = self.conv(x)
1005+
x = F.hardshrink(x, lambd=0)
1006+
x = x.view(x.size(0), -1)
1007+
x = torch.mv(weight, x[0])
1008+
return x
1009+
1010+
mod = Model().eval()
1011+
x = torch.randn(2, 3, 127, 255)
1012+
weight = torch.randn(10, 254976)
1013+
# Use same criterion as test_inplace_squeeze_needed
1014+
# for parallel reduction.
1015+
self.common(mod, (x, weight), atol=5e-1, rtol=5e-1)
1016+
9901017
def test_cat_mul(self):
9911018
# https://github.com/pytorch/pytorch/issues/93365
9921019
def fn(p0, p1):

test/inductor/test_flex_attention.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2026,9 +2026,6 @@ def func(qk, b, h, q, kv):
20262026
self.assertTrue((ref - out).abs().mean() < 1e-2)
20272027

20282028
@supported_platform
2029-
@unittest.skipIf(
2030-
SKIP_UT_ON_CPU, "TODO: fix https://github.com/pytorch/pytorch/issues/151290"
2031-
)
20322029
def test_make_block_mask(self, device):
20332030
def causal_mask(b, h, q_idx, kv_idx):
20342031
return q_idx >= kv_idx

torch/_inductor/codegen/cpp.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5464,6 +5464,15 @@ def max_parallel_depth(self):
54645464
num_steps = num_steps * FloorDiv(loop.size, loop.steps)
54655465
max_depth += 1
54665466

5467+
def get_simd_vec_depth(loops):
5468+
# Return the first loop level which is simd_vec
5469+
for i, loop in enumerate(loops):
5470+
if loop.simd_vec:
5471+
return i
5472+
return None
5473+
5474+
simd_vec_depth = get_simd_vec_depth(self.loops)
5475+
54675476
# When the number of steps of the first inner loop is much larger than the number of steps of
54685477
# all outer loops, change `start_depth` to the first inner loop and recalculate `max_depth`.
54695478
if (
@@ -5472,6 +5481,12 @@ def max_parallel_depth(self):
54725481
and isinstance(self.loops[max_depth].size, sympy.Integer)
54735482
and num_steps * 300
54745483
< FloorDiv(self.loops[max_depth].size, self.loops[max_depth].steps)
5484+
and not (
5485+
# Disable parallel reduction under the vec loop
5486+
simd_vec_depth is not None
5487+
and max_depth > simd_vec_depth
5488+
and self.loops[max_depth].is_reduction
5489+
)
54755490
):
54765491
start_depth = max_depth
54775492
max_depth = 0

0 commit comments

Comments (0)