[inductor][cpp][gemm] support k slicing for static shapes (#130821) · pytorch/pytorch@316c0d3

Commit 316c0d3

Jiong Gong authored and pytorchmergebot committed
[inductor][cpp][gemm] support k slicing for static shapes (#130821)
This PR provides initial support for k-slicing (i.e., parallel reduction along the k-dim) in the CPP GEMM template. Only static shapes are supported for now.

When k-slicing is enabled, extra temporary buffers are allocated to hold intermediate results, and an extra barrier follows the initial GEMM compute of each thread: each thread first stores its GEMM result to a temporary accumulation buffer (pointed to by `local_buf_ptrs`, an array of pointers to accumulation buffers), followed by a reduction along the k-slices, the epilogue computes, and the store to the final output `Y`. Within each k-slicing thread group, the reduction along k-slices and the epilogue computes are parallelized along the M-dim. The algorithm is designed to keep synchronization overhead as low as possible.

K-slicing is enabled only when blocking on M and N cannot occupy all threads. Since k-slicing doesn't always bring a benefit, an extra configuration option gates it (disabled by default). We still need to identify a good heuristic to enable k-slicing by default.

Performance numbers for 64x4096x64, 64x10000x64, and 64x20000x64 on a 60-core SPR. As the numbers show, k-slicing only beats non-k-slicing when K is large enough.

Without k-slicing:

AUTOTUNE linear_unary(64x4096, 64x4096, 64)
  cpp_packed_gemm_0  0.0108 ms  100.0%
  _linear_pointwise  0.0431 ms   25.1%
AUTOTUNE linear_unary(64x10000, 64x10000, 64)
  cpp_packed_gemm_0  0.0272 ms  100.0%
  _linear_pointwise  0.0892 ms   30.5%
AUTOTUNE linear_unary(64x20000, 64x20000, 64)
  cpp_packed_gemm_0  0.0781 ms  100.0%
  _linear_pointwise  0.1693 ms   46.1%

With k-slicing:

AUTOTUNE linear_unary(64x4096, 64x4096, 64)
  cpp_packed_gemm_0  0.0260 ms  100.0%
  _linear_pointwise  0.0444 ms   58.5%
AUTOTUNE linear_unary(64x10000, 64x10000, 64)
  cpp_packed_gemm_0  0.0275 ms  100.0%
  _linear_pointwise  0.0893 ms   30.8%
AUTOTUNE linear_unary(64x20000, 64x20000, 64)
  cpp_packed_gemm_0  0.0284 ms  100.0%
  _linear_pointwise  0.1686 ms   16.8%

Pull Request resolved: #130821
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
ghstack dependencies: #131024
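To make the two phases above concrete, here is a minimal Python/NumPy sketch of the scheme the commit message describes: partial GEMMs into per-slice temporary buffers, a barrier, then a reduction over slices that is itself parallelized along M. All names and sizes are illustrative (a single k-slicing group with two slices); this is not the template code itself.

# Minimal sketch of the k-slicing scheme (illustrative names and sizes;
# one k-slicing group with two slices, not the actual template code).
import numpy as np

num_k_slices = 2
M, N, K = 8, 8, 16
X = np.random.rand(M, K).astype(np.float32)
W = np.random.rand(K, N).astype(np.float32)

# Phase 1: each thread computes a partial GEMM over its own k-slice into
# a temporary accumulation buffer (cf. local_buf_ptrs in the template).
k_step = K // num_k_slices
local_bufs = [
    X[:, s * k_step : (s + 1) * k_step] @ W[s * k_step : (s + 1) * k_step, :]
    for s in range(num_k_slices)
]

# (barrier: all partial results of the group must be ready)

# Phase 2: the reduction over k-slices is parallelized along M; the thread
# with a given k_slice_id reduces, applies the epilogue to, and stores its
# own M slice.
Y = np.empty((M, N), dtype=np.float32)
m_slice_size = (M + num_k_slices - 1) // num_k_slices
for k_slice_id in range(num_k_slices):
    m0 = min(m_slice_size * k_slice_id, M)
    m1 = min(m_slice_size * (k_slice_id + 1), M)
    acc = local_bufs[0][m0:m1].copy()
    for other_slice in range(1, num_k_slices):
        acc += local_bufs[other_slice][m0:m1]
    Y[m0:m1] = acc  # epilogue + store to the final output Y

assert np.allclose(Y, X @ W, atol=1e-4)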
1 parent d962dba commit 316c0d3

File tree

4 files changed: +207 -41 lines changed

test/inductor/test_cpu_select_algorithm.py

Lines changed: 37 additions & 0 deletions

@@ -585,6 +585,43 @@ def forward(self, x):
         vec_amx = VecAMX()
         self._check_amx_counter(vec_amx)

+    @inductor_config.patch({"freezing": True})
+    @inductor_config.patch({"cpp.gemm_max_k_slices": 0})
+    @patches
+    @torch.no_grad
+    @unittest.skipIf(not TEST_MKL, "Test requires MKL")
+    @parametrize("batch_size", (2,))
+    @parametrize("in_features", (1000,))
+    @parametrize("out_features", (2,))
+    @parametrize("bias", (True, False))
+    @parametrize(
+        "epilogue",
+        (
+            "none",
+            "relu",
+        ),
+    )
+    @dtypes(torch.float, torch.bfloat16, torch.half)
+    def test_linear_k_slicing(
+        self, batch_size, in_features, out_features, bias, epilogue, dtype
+    ):
+        class M(torch.nn.Module):
+            def __init__(self, bias, epilogue, other):
+                super().__init__()
+                self.linear = torch.nn.Linear(in_features, out_features, bias)
+                self.epilogue = _get_epilogue(epilogue, other)
+
+            def forward(self, x):
+                return self.epilogue(self.linear(x))
+
+        counters.clear()
+        v = torch.randn(batch_size, in_features).to(dtype=dtype)
+        u = torch.randn(batch_size, out_features).to(dtype=dtype)
+        mod = M(bias=bias, epilogue=epilogue, other=u).to(dtype=dtype).eval()
+        with verify(dtype) as (atol, rtol):
+            self.common(mod, (v,), atol=atol, rtol=rtol)
+        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
+

 @dynamo_config.patch({"dynamic_shapes": True, "assume_static_by_default": False})
 class _DynamicShapesTestBase(BaseTestSelectAlgorithm):
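The test above exercises the new knob via `inductor_config.patch`. Outside the test harness, the same configuration could plausibly be set as in the sketch below; the model, shapes, and the `max_autotune` setting are illustrative assumptions, and only `cpp.gemm_max_k_slices` (where 0 means no cap on the number of k-slices) comes from this PR.

# Hypothetical end-to-end usage; only cpp.gemm_max_k_slices comes from
# this PR (0 = no cap on k-slices). Model, shapes, and other settings
# are illustrative.
import torch
import torch._inductor.config as inductor_config

inductor_config.freezing = True
inductor_config.max_autotune = True
inductor_config.cpp.gemm_max_k_slices = 0

mod = torch.nn.Linear(20000, 64).eval()
x = torch.randn(64, 20000)

with torch.no_grad():
    y = torch.compile(mod)(x)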

torch/_inductor/codegen/cpp_gemm_template.py

Lines changed: 149 additions & 40 deletions

@@ -2,22 +2,27 @@
 import contextlib
 import logging
 import math
+from functools import lru_cache
 from typing import Any, Callable, cast, List, Optional, Set, Union
 from unittest.mock import patch

 import torch
 import torch.utils

 from ..._dynamo.utils import counters
-from .. import ir, lowering as L
+from .. import config, ir, lowering as L
 from ..kernel.mm_common import mm_args
 from ..select_algorithm import DataProcessorTemplateWrapper
 from ..utils import cache_on_self, has_free_symbols, parallel_num_threads
 from ..virtualized import ops, V
 from .cpp_micro_gemm import CppMicroGemmAMX, create_micro_gemm, LayoutType
 from .cpp_template import CppTemplate
 from .cpp_template_kernel import CppTemplateKernel
-from .cpp_utils import GemmBlocking, get_gemm_template_output_and_compute_dtype
+from .cpp_utils import (
+    DTYPE_TO_CPP,
+    GemmBlocking,
+    get_gemm_template_output_and_compute_dtype,
+)

 log = logging.getLogger(__name__)

@@ -58,6 +63,9 @@
 {%- endif %}
     const int64_t Mc_blocks = Mt_blocks;
     const int64_t Kc_blocks = Kt_blocks;
+    const int64_t num_Mc_blocks = (M0_blocks + Mc_blocks - 1) / Mc_blocks;
+    const int64_t num_Nc_blocks = N0_blocks;
+    const int64_t num_k_slices = (K0_blocks + Kt_blocks - 1) / Kt_blocks;
 {%- else %}
     constexpr int64_t M = {{kernel.size(GemmOut, 0)}};
     constexpr int64_t M0_blocks = (M + M0 - 1) / M0;
@@ -66,52 +74,68 @@
     constexpr int64_t Kt_blocks = {{template.thread_blocking().block_k}};
     constexpr int64_t Mc_blocks = {{template.cache_blocking().block_m}};
    constexpr int64_t Kc_blocks = {{template.cache_blocking().block_k}};
+    constexpr int64_t num_Mc_blocks = (M0_blocks + Mc_blocks - 1) / Mc_blocks;
+    constexpr int64_t num_Nc_blocks = N0_blocks;
+    constexpr int64_t num_k_slices = (K0_blocks + Kt_blocks - 1) / Kt_blocks;
 {%- endif %}

-    // TODO(jgong5): support k-slicing
-    {{kernel.assert_function}}(Kt_blocks == K0_blocks, "Do not support k slicing yet.");
     // make sure all partitions are assigned
     {{kernel.assert_function}}(
         Mt_blocks * Nt_blocks * Kt_blocks * {{num_threads}} >= M0_blocks * N0_blocks * K0_blocks,
         "Not all partitions are assigned."
     );

+{%- if maybe_k_slicing %}
+    std::unique_ptr<std::unique_ptr<{{DTYPE_TO_CPP[acc_buf_dtype]}}[]>[]> local_buf_ptrs;
+    if (num_k_slices > 1) {
+        local_buf_ptrs.reset(new std::unique_ptr<{{DTYPE_TO_CPP[acc_buf_dtype]}}[]>[num_Mc_blocks * num_Nc_blocks * num_k_slices]);
+    }
+{%- endif %}
+
 {%- if num_threads > 1 %}
     #pragma omp parallel num_threads({{num_threads}})
     {
-        int tid = omp_get_thread_num();
+        const int tid = omp_get_thread_num();
         int64_t m_block_start, m_block_end, n_block_start, n_block_end, k_block_start, k_block_end;
         mm_get_thread_blocks(
             tid, M0_blocks, N0_blocks, K0_blocks, Mt_blocks, Nt_blocks, Kt_blocks,
             m_block_start, m_block_end, n_block_start, n_block_end, k_block_start, k_block_end);
+    {%- if maybe_k_slicing %}
+        const int64_t k_group_id = tid / num_k_slices;
+        const int64_t k_slice_id = tid % num_k_slices;
+    {%- endif %}
 {%- else %}
     {
-        int64_t m_block_start = 0;
-        int64_t m_block_end = M0_blocks;
-        int64_t n_block_start = 0;
-        int64_t n_block_end = N0_blocks;
-        int64_t k_block_start = 0;
-        int64_t k_block_end = K0_blocks;
+        const int tid = 0;
+        const int64_t m_block_start = 0;
+        const int64_t m_block_end = M0_blocks;
+        const int64_t n_block_start = 0;
+        const int64_t n_block_end = N0_blocks;
+        const int64_t k_block_start = 0;
+        const int64_t k_block_end = K0_blocks;
 {%- endif %}
         {{ micro_gemm.codegen_init(kernel) }}
         for (int64_t mc = m_block_start; mc < m_block_end; mc += Mc_blocks) {
             const int64_t m_start = mc * M0;
             const int64_t m_end = std::min(std::min(mc + Mc_blocks, m_block_end) * M0, M);
             const int64_t m_size = m_end - m_start;
 {%- if use_local_acc %}
+    {%- set acc_buf_name = "local_acc_buf" %}
             {{ kernel.define_buffer(acc_buf_name, ["m_end - m_start", "N0"], acc_buf_dtype) }}
 {%- endif %}
             for (int64_t nc = n_block_start; nc < n_block_end; ++nc) {
                 const int64_t n_start = nc * N0;
                 const int64_t n_end = std::min((nc + 1) * N0, N);
+                const int64_t n_size = n_end - n_start;
 {%- if use_local_acc %}
     {%- set acc = kernel.local_buffers[acc_buf_name] %}
+                {{ kernel.reinit_buffer_if_null(acc_buf_name) }}
 {%- else %}
     {%- set acc = kernel.slice_nd(GemmOut, [("m_start", "m_end"), ("n_start", "n_start + N0")]) %}
 {%- endif %}
                 for (int64_t kc = k_block_start; kc < k_block_end; kc += Kc_blocks) {
                     int64_t k_start = kc * K0;
-                    int64_t k_end = std::min((kc + Kc_blocks) * K0, K);
+                    int64_t k_end = std::min(std::min(kc + Kc_blocks, k_block_end) * K0, K);
     {%- set tile_X = kernel.slice_nd(X, [("m_start", "m_end"), ("k_start", "k_end")]) %}
     {%- set tile_W_3d = kernel.slice_nd(W, [("nc", "nc + 1"), ("k_start", "k_end"), ()]) %}
     {%- set tile_W = kernel.view(tile_W_3d, ["k_end - k_start", micro_gemm.register_blocking.block_n]) %}
@@ -121,19 +145,64 @@
                     {{ micro_gemm.codegen_call(kernel, tile_X, tile_W, acc, accum=True)|indent(24, false) }}
                 }
             }
+{%- if maybe_k_slicing %}
+                if (num_k_slices > 1) {
+                    const int64_t mxn_cache_block_id = mc * num_Nc_blocks + nc;
+                    local_buf_ptrs[mxn_cache_block_id * num_k_slices + k_slice_id].reset({{ kernel.release_buffer(acc_buf_name) }});
+                } else
+{%- endif %}
+                {
 {%- if N == PADDED_N %}
     {%- set tile_Y = kernel.slice_nd(Y_2d, [("m_start", "m_end"), ("n_start", "n_start + N0")]) %}
     {%- set tile_acc = acc %}
 {%- else %}
     {%- set tile_Y = kernel.slice_nd(Y_2d, [("m_start", "m_end"), ("n_start", "n_end")]) %}
     {%- set tile_acc = kernel.slice_nd(acc, [(), ("0", "n_end - n_start")]) %}
 {%- endif %}
-                {{ kernel.store_output(
-                    tile_Y, tile_acc, GemmOut, epilogue_nodes, offsets=("m_start", "n_start"), reindexers=reindexers
-                )|indent(16, false)
-                }}
+                    {{ kernel.store_output(
+                        tile_Y, tile_acc, GemmOut, epilogue_nodes, offsets=("m_start", "n_start"), reindexers=reindexers
+                    )|indent(20, false)
+                    }}
+                }
+            }
+        }
+{%- if maybe_k_slicing %}
+        if (num_k_slices > 1) {
+            #pragma omp barrier
+            for (int64_t mc = m_block_start; mc < m_block_end; mc += Mc_blocks) {
+                // We slice M-dim and each thread in the k-slicing group works on a slice
+                const int64_t m_start_unsliced = mc * M0;
+                const int64_t m_end_unsliced = std::min(std::min(mc + Mc_blocks, m_block_end) * M0, M);
+                const int64_t m_size_unsliced = m_end_unsliced - m_start_unsliced;
+                const int64_t m_slice_size = (m_size_unsliced + num_k_slices - 1) / num_k_slices;
+                const int64_t m_start = std::min(m_start_unsliced + m_slice_size * k_slice_id, m_end_unsliced);
+                const int64_t m_end = std::min(m_start_unsliced + m_slice_size * (k_slice_id + 1), m_end_unsliced);
+                const int64_t m_size = m_end - m_start;
+                const int64_t m_offset = m_start - m_start_unsliced;
+                for (int64_t nc = n_block_start; nc < n_block_end; ++nc) {
+                    const int64_t n_start = nc * N0;
+                    const int64_t n_end = std::min((nc + 1) * N0, N);
+                    const int64_t n_size = n_end - n_start;
+                    const int64_t mxn_cache_block_id = mc * num_Nc_blocks + nc;
+                    auto {{acc_buf_name}} = local_buf_ptrs[mxn_cache_block_id * num_k_slices].get();
+                    for (int64_t other_slice = 1; other_slice < num_k_slices; other_slice++) {
+                        auto other_acc = local_buf_ptrs[mxn_cache_block_id * num_k_slices + other_slice].get();
+                        for (int64_t m = m_offset; m < m_offset + m_size; m++) {
+                            #pragma omp simd
+                            for (int64_t n = 0; n < n_size; n++) {
+                                {{acc_buf_name}}[m*N0 + n] += other_acc[m*N0 + n];
+                            }
+                        }
+                    }
+    {%- set tile_acc_m_slice = kernel.slice_nd(tile_acc, [("m_offset", "m_offset + m_end - m_start"), ()]) %}
+                    {{ kernel.store_output(
+                        tile_Y, tile_acc_m_slice, GemmOut, epilogue_nodes, offsets=("m_start", "n_start"), reindexers=reindexers
+                    )|indent(20, false)
+                    }}
+                }
             }
         }
+{%- endif %}
         {{ micro_gemm.codegen_finalize(kernel) }}
     }
 }
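The M-slicing arithmetic in the reduction loop above (m_slice_size, m_start, m_end with min-clamping) must tile [m_start_unsliced, m_end_unsliced) exactly, leaving trailing slices empty when the range is smaller than num_k_slices * m_slice_size. A standalone Python check of the same formulas (names mirror the template; the test ranges are arbitrary):

# Standalone check of the M-slicing formulas used in the reduction loop.
def m_slices(m_start_unsliced, m_end_unsliced, num_k_slices):
    m_size_unsliced = m_end_unsliced - m_start_unsliced
    m_slice_size = (m_size_unsliced + num_k_slices - 1) // num_k_slices
    for k_slice_id in range(num_k_slices):
        m_start = min(m_start_unsliced + m_slice_size * k_slice_id, m_end_unsliced)
        m_end = min(m_start_unsliced + m_slice_size * (k_slice_id + 1), m_end_unsliced)
        yield m_start, m_end  # empty (m_start == m_end) for overhanging slices

# The slices tile [64, 117) exactly: no gaps, no overlap, in order.
for num_k_slices in (2, 3, 7):
    spans = list(m_slices(64, 117, num_k_slices))
    assert spans[0][0] == 64 and spans[-1][1] == 117
    assert all(a[1] == b[0] for a, b in zip(spans, spans[1:]))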
@@ -180,12 +249,12 @@ def thread_blocking(self) -> GemmBlocking:
         NOTE [Thread blocking in Cpp GEMM]
         We use simple heuristics to decide the thread blocking:
         1. Make sure all threads are occupied as much as possible.
-        2. Favor more square-sized thread blocks for better data reuse.
-        TODO(jgong5): we only do blocking on on M and N now, add blocking on K
-        after supporting k-slicing.
+        2. For (m, n) blocks, favor more square-sized thread blocks for better data reuse.
+        3. If (m, n) blocks cannot occupy all the threads, we consider k-slicing.
         TODO(jgong5): allow tuning various blocking options
         """

+        @lru_cache(maxsize=100)
         def get_factors(number):
             factors = []
             for i in range(int(number**0.5), 0, -1):
@@ -194,19 +263,19 @@ def get_factors(number):
                     factors.append(i)
             return factors

-        def get_blocking(num_threads, factor, m_blocks, n_blocks, k_blocks):
-            thread_block_n = (n_blocks + factor - 1) // factor
-            cofactor = num_threads // factor
-            thread_block_m = (m_blocks + cofactor - 1) // cofactor
-            return GemmBlocking(thread_block_m, thread_block_n, k_blocks)
+        def get_blocking(m_factor, n_factor, k_factor, m_blocks, n_blocks, k_blocks):
+            thread_block_k = math.ceil(k_blocks / k_factor)
+            thread_block_n = math.ceil(n_blocks / n_factor)
+            thread_block_m = math.ceil(m_blocks / m_factor)
+            return GemmBlocking(thread_block_m, thread_block_n, thread_block_k)

         assert (
             not self.is_dynamic_M
         ), "Unable to determine thread blocking for dynamic M."
         register_blocking = self.register_blocking
-        m_blocks = (self.m + register_blocking.block_m - 1) // register_blocking.block_m
-        n_blocks = (self.n + register_blocking.block_n - 1) // register_blocking.block_n
-        k_blocks = (self.k + register_blocking.block_k - 1) // register_blocking.block_k
+        m_blocks = math.ceil(self.m / register_blocking.block_m)
+        n_blocks = math.ceil(self.n / register_blocking.block_n)
+        k_blocks = math.ceil(self.k / register_blocking.block_k)
         factors = get_factors(self.num_threads)
         assert len(factors) > 0

@@ -219,26 +288,52 @@ def get_better_blocking(blocking, best_blocking):
                 block_n_size = blocking.block_n * register_blocking.block_n
                 best_block_m_size = best_blocking.block_m * register_blocking.block_m
                 best_block_n_size = best_blocking.block_n * register_blocking.block_n
-                if block_m_size + block_n_size < best_block_m_size + best_block_n_size:
+                if blocking.block_k > best_blocking.block_k:
+                    best_blocking = blocking
+                elif (
+                    blocking.block_k == best_blocking.block_k
+                    and block_m_size + block_n_size
+                    < best_block_m_size + best_block_n_size
+                ):
                     best_blocking = blocking
             return best_blocking

         best_blocking = None
-        # check if we can have a thread-blocking to occupy all threads
-        for factor in factors:
-            cofactor = self.num_threads // factor
-            if n_blocks >= factor and m_blocks >= cofactor:
+        # check if we can have a thread-blocking to occupy all threads without k-slicing
+        for n_factor in factors:
+            m_factor = self.num_threads // n_factor
+            if n_blocks >= n_factor and m_blocks >= m_factor:
                 blocking = get_blocking(
-                    self.num_threads, factor, m_blocks, n_blocks, k_blocks
+                    m_factor, n_factor, 1, m_blocks, n_blocks, k_blocks
                 )
                 best_blocking = get_better_blocking(blocking, best_blocking)

         if best_blocking is None:
-            for factor in factors:
-                cofactor = self.num_threads // factor
-                if n_blocks >= factor or m_blocks >= cofactor:
+            for k_factor in factors:
+                if k_blocks >= k_factor and (
+                    config.cpp.gemm_max_k_slices == 0
+                    or k_factor <= config.cpp.gemm_max_k_slices
+                ):
+                    n_factors = get_factors(self.num_threads // k_factor)
+                    for n_factor in n_factors:
+                        m_factor = (self.num_threads // k_factor) // n_factor
+                        if n_blocks >= n_factor and m_blocks >= m_factor:
+                            blocking = get_blocking(
+                                m_factor,
+                                n_factor,
+                                k_factor,
+                                m_blocks,
+                                n_blocks,
+                                k_blocks,
+                            )
+                            best_blocking = get_better_blocking(blocking, best_blocking)
+
+        if best_blocking is None:
+            for n_factor in factors:
+                m_factor = self.num_threads // n_factor
+                if n_blocks >= n_factor or m_blocks >= m_factor:
                     blocking = get_blocking(
-                        self.num_threads, factor, m_blocks, n_blocks, k_blocks
+                        m_factor, n_factor, 1, m_blocks, n_blocks, k_blocks
                     )
                     best_blocking = get_better_blocking(blocking, best_blocking)
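As a hand-run of the new search order (a standalone sketch mirroring, but not calling, the code above): take num_threads = 60 and assume a 2 x 2 x 625 grid of register blocks, roughly the 64x20000x64 case from the commit message under an assumed 32x32x32 register blocking. The (m, n)-only pass finds nothing, and the k-slicing pass settles on 15 k-slices times a 2 x 2 (m, n) partition.

# Standalone hand-run of the search order above (mirrors the logic, does
# not call it). Assumed problem: num_threads = 60 with a 2 x 2 x 625 grid
# of register blocks (~64x20000x64 GEMM under 32x32x32 register blocking).
num_threads, m_blocks, n_blocks, k_blocks = 60, 2, 2, 625
factors = [f for f in range(num_threads, 0, -1) if num_threads % f == 0]

# Pass 1 (no k-slicing): needs n_factor * m_factor == 60 with
# n_blocks >= n_factor and m_blocks >= m_factor -- impossible for 2 x 2.
assert not [n for n in factors if n_blocks >= n and m_blocks >= num_threads // n]

# Pass 2 (k-slicing, gemm_max_k_slices == 0 so no cap): split the threads
# into k_factor slices times an (m_factor, n_factor) partition.
feasible = []
for k_factor in factors:
    if k_blocks >= k_factor:
        rest = num_threads // k_factor
        for n_factor in [f for f in range(1, rest + 1) if rest % f == 0]:
            m_factor = rest // n_factor
            if n_blocks >= n_factor and m_blocks >= m_factor:
                feasible.append((k_factor, m_factor, n_factor))

# get_better_blocking prefers the largest block_k, i.e. the fewest
# k-slices that still occupy all threads: 15 slices x (2 x 2) partitions.
assert min(k for k, _, _ in feasible) == 15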

@@ -327,6 +422,17 @@ def get_occupancy():
                 f"Number of threads: {self.num_threads}, occupancy: {get_occupancy()}"  # noqa: G004
             )

+    def maybe_k_slicing(self):
+        if self.num_threads == 1:
+            return False
+        if self.is_dynamic_M:
+            # TODO(jgong5): perhaps use size hint to decide?
+            return True
+        register_blocking = self.register_blocking
+        k_blocks = math.ceil(self.k / register_blocking.block_k)
+        thread_blocking = self.thread_blocking()
+        return k_blocks > thread_blocking.block_k
+
     @staticmethod
     def add_choices(
         choices,
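Reading `maybe_k_slicing` above together with `thread_blocking()`: for static shapes it is true exactly when the chosen thread blocking splits K into more than one chunk per k-slicing group. A toy restatement with assumed numbers continuing the 60-thread example:

# Toy restatement of maybe_k_slicing() for the static-shape case, using
# assumed numbers that continue the 60-thread example above.
import math

k, block_k = 20000, 32                     # K dim, register block_k (assumed)
k_blocks = math.ceil(k / block_k)          # 625 k-blocks in total
thread_block_k = math.ceil(k_blocks / 15)  # 42 k-blocks per k-slice
assert k_blocks > thread_block_k           # -> maybe_k_slicing() is True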
@@ -645,9 +751,11 @@ def bias_add_inner(index):

         Y_2d: Union[ir.Buffer, ir.ReinterpretView] = Y
         use_local_acc = (
-            self.layout.dtype != torch.float or int8_gemm or self.padded_n != self.n
+            self.layout.dtype != torch.float
+            or int8_gemm
+            or self.padded_n != self.n
+            or self.maybe_k_slicing()
         )
-        acc_buf_name = "local_acc_buf"
         if epilogue_nodes:
             epilogues.extend(epilogue_nodes)
             assert Y.get_numel() == epilogues[-1].get_numel()
@@ -719,12 +827,13 @@ def bias_add_inner(index):
             reindexers=reindexers,
             Y_2d=Y_2d,
             use_local_acc=use_local_acc,
-            acc_buf_name=acc_buf_name,
+            maybe_k_slicing=self.maybe_k_slicing(),
             x_scale=x_scale,
             x_zp=x_zp,
             w_scale=w_scale,
             w_zp=w_zp,
             acc_buf_dtype=torch.int32 if int8_gemm else torch.float,
+            DTYPE_TO_CPP=DTYPE_TO_CPP,
         )
         with contextlib.ExitStack() as stack:
             for buf in fake_buffers:
