Update · pytorch/pytorch@0a437b7 · GitHub

Commit 0a437b7

Update
[ghstack-poisoned]
2 parents 71e0a33 + f342255 commit 0a437b7

File tree

6 files changed: +135 -18 lines


.ci/manywheel/build_common.sh

Lines changed: 2 additions & 2 deletions
@@ -321,8 +321,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
         # ROCm workaround for roctracer dlopens
         if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
             patchedpath=$(fname_without_so_number $destpath)
-        # Keep the so number for XPU dependencies
-        elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
+        # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
+        elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
             patchedpath=$destpath
         else
             patchedpath=$(fname_with_sha256 $destpath)
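
Why keeping the original file name matters here: dlopen reuses a library that is already loaded under the same name, while a sha256-renamed copy is treated as a new library, so a second OpenMP runtime would be initialized in the same process. A minimal sketch of that behavior; the system path and the renamed file name below are illustrative, not the actual wheel contents:

    import ctypes
    import os
    import shutil
    import tempfile

    SYSTEM_GOMP = "/usr/lib64/libgomp.so.1"  # assumed path, as in check_gomp.py below

    # Loading the same file name twice reuses the already-loaded object ...
    h1 = ctypes.CDLL(SYSTEM_GOMP)
    h2 = ctypes.CDLL(SYSTEM_GOMP)
    print(h1._handle == h2._handle)  # True: a single libgomp instance

    # ... but a renamed copy (in the spirit of what fname_with_sha256 produces;
    # the exact name here is made up) is a different library, so a second,
    # independent libgomp would come up next to the first one.
    renamed = os.path.join(tempfile.mkdtemp(), "libgomp-deadbeef.so.1")
    shutil.copy(SYSTEM_GOMP, renamed)
    h3 = ctypes.CDLL(renamed)
    print(h1._handle == h3._handle)  # False: two libgomp copies in one process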

.ci/pytorch/smoke_test/check_gomp.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import ctypes
+import os
+import sys
+from pathlib import Path
+
+
+def get_gomp_thread():
+    """
+    Retrieves the maximum number of OpenMP threads after loading the `libgomp.so.1` library
+    and the `libtorch_cpu.so` library. It then queries the
+    maximum number of threads available for OpenMP parallel regions using the
+    `omp_get_max_threads` function.
+
+    Returns:
+        int: The maximum number of OpenMP threads available.
+
+    Notes:
+        - The function assumes the default path for `libgomp.so.1` on AlmaLinux OS.
+        - The path to `libtorch_cpu.so` is constructed based on the Python executable's
+          installation directory.
+        - This function is specific to environments where PyTorch and OpenMP are used
+          together and may require adjustments for other setups.
+    """
+    python_path = Path(sys.executable).resolve()
+    python_prefix = (
+        python_path.parent.parent
+    )  # Typically goes to the Python installation root
+
+    # Get the additional ABI flags (if any); it may be an empty string.
+    abiflags = getattr(sys, "abiflags", "")
+
+    # Construct the Python directory name correctly (e.g., "python3.13t").
+    python_version = (
+        f"python{sys.version_info.major}.{sys.version_info.minor}{abiflags}"
+    )
+
+    libtorch_cpu_path = (
+        python_prefix
+        / "lib"
+        / python_version
+        / "site-packages"
+        / "torch"
+        / "lib"
+        / "libtorch_cpu.so"
+    )
+
+    # use the default gomp path of AlmaLinux OS
+    libgomp_path = "/usr/lib64/libgomp.so.1"
+
+    os.environ["GOMP_CPU_AFFINITY"] = "0-3"
+
+    libgomp = ctypes.CDLL(libgomp_path)
+    libgomp = ctypes.CDLL(libtorch_cpu_path)
+
+    libgomp.omp_get_max_threads.restype = ctypes.c_int
+    libgomp.omp_get_max_threads.argtypes = []
+
+    omp_max_threads = libgomp.omp_get_max_threads()
+    return omp_max_threads
+
+
+def main():
+    omp_max_threads = get_gomp_thread()
+    print(
+        f"omp_max_threads after loading libgomp.so and libtorch_cpu.so: {omp_max_threads}"
+    )
+    if omp_max_threads == 1:
+        raise RuntimeError(
+            "omp_max_threads is 1. Check whether libgomp.so is loaded twice."
+        )
+
+
+if __name__ == "__main__":
+    main()
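
The script sets GOMP_CPU_AFFINITY, loads the system libgomp.so.1 and then libtorch_cpu.so, and fails if omp_get_max_threads() comes back as 1, the symptom flagged in its error message when libgomp ends up loaded twice. A complementary, Linux-only diagnostic sketch (it assumes torch is importable in the current environment): list every libgomp mapped into the process after importing torch.

    import torch  # noqa: F401  -- pulls in libtorch_cpu.so and its dependencies

    with open("/proc/self/maps") as maps:
        gomp_paths = sorted({line.split()[-1] for line in maps if "libgomp" in line})
    # Expect a single path here; two distinct entries would indicate a
    # duplicated OpenMP runtime in the process.
    print(gomp_paths)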

.circleci/scripts/binary_linux_test.sh

Lines changed: 5 additions & 0 deletions
@@ -101,6 +101,11 @@ if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_
   else
     python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled $extra_parameters
   fi
+
+  if [[ "\$GPU_ARCH_TYPE" != *cpu-aarch64* ]]; then
+    # https://github.com/pytorch/pytorch/issues/149422
+    python /pytorch/.ci/pytorch/smoke_test/check_gomp.py
+  fi
 fi

 # Clean temp files

torch/csrc/distributed/c10d/nvshmem_extension.cu

Lines changed: 32 additions & 9 deletions
@@ -202,22 +202,30 @@ __global__ void allToAllV(void *send_data, void *recv_data, int64_t* in_out_spli
   auto source_offsets = in_out_splits + npes * 2;
   int bid = blockIdx.x;
   int tid = threadIdx.x;
+  int blocks_per_peer = max(gridDim.x / npes, 1);

   // Calculate the output offsets
   __shared__ int64_t peer_offsets[THREADS_PER_BLOCK];
   prefixSum(peer_offsets, output_splits, npes);
   __syncthreads();

-  // Each block targets a different peer
-  for (int i = bid; i < npes; i += gridDim.x) {
+  // Target a different peer based on bid
+  for (int i = bid / blocks_per_peer; i < npes; i += gridDim.x / blocks_per_peer) {
     int peer = (mype + i) % npes;
-    auto size = output_splits[peer] * stride;
-    auto source_offset = source_offsets[peer] * stride;
-    auto write_offset = peer_offsets[peer] * stride;
+    // Total amount from `peer`
+    auto peer_size = output_splits[peer] * stride;
+    // Amount to get from `peer` in this block
+    auto block_size = peer_size / blocks_per_peer;
+    // Being lazy here, we should handle the residual if the division is not exact
+    CUDA_KERNEL_ASSERT(block_size * blocks_per_peer == peer_size);
+    // This block's offset in the data from `peer`
+    auto block_offset = block_size * (bid % blocks_per_peer);
+    auto source_offset = source_offsets[peer] * stride + block_offset;
+    auto write_offset = peer_offsets[peer] * stride + block_offset;
     nvshmemx_getmem_block(
         (char*)recv_data + write_offset,
         (char*)send_data + source_offset,
-        size,
+        block_size,
         peer);
   }
   // Write out the output offsets (to the scratchpad line)
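
The new indexing assigns blocks_per_peer consecutive blocks to each peer instead of one block per peer. A small host-side sketch of the same arithmetic; the grid size, peer count, and rank are made-up values, and the kernel performs this per thread block:

    # Host-side sketch of the block-to-peer mapping above (illustrative values).
    grid_dim_x, npes, mype = 16, 4, 0            # 16 CTAs, 4 peers, this rank is 0
    blocks_per_peer = max(grid_dim_x // npes, 1)

    for bid in range(grid_dim_x):
        i = bid // blocks_per_peer               # peer index this block starts from
        peer = (mype + i) % npes                 # rotate peers so ranks spread load
        chunk = bid % blocks_per_peer            # which slice of that peer's data
        print(f"block {bid:2d} -> peer {peer}, slice {chunk} of {blocks_per_peer}")

    # Each peer's data is split into blocks_per_peer equal slices, and each block
    # copies its slice at offset block_size * chunk (the kernel asserts the split
    # is exact).
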
@@ -266,11 +274,26 @@ at::Tensor nvshmem_all_to_all_vdev(
       0,
       stream);

-  // All to all data exchange
-  // Limit the number of blocks to 16
-  int num_blocks = std::min(world_size, 16);
+  // CTA Tuning
+  // Intra-node: use multiple blocks per peer to increase data parallelism, up to 8.
+  // Up to 1 MB -> 1 block
+  // Up to 2 MB -> 2 blocks
+  // Up to 4 MB -> 4 blocks
+  // More -> 8 blocks
+  auto input_size = input.numel() * input.element_size();
+  const int max_blocks_per_peer = input_size < 1024 * 1024 ? 1 :
+      (input_size < 2 * 1024 * 1024 ? 2 :
+      (input_size < 4 * 1024 * 1024 ? 4 : 8));
+
+  // Inter-node: limit the total the number of blocks to 8 which is able to
+  // drive 57 GB/s bandwidth in test, enough to drive a 400 Gb/s NIC.
+  // TODO: better intra vs inter detection, currently it is based on world_size
+  int num_blocks = world_size > 8 ? 8 : max_blocks_per_peer * world_size;
+
   // Stride at dim 0 (assuming input is contiguous, TODO)
   size_t stride_bytes = input.stride(0) * input.element_size();
+
+  // All to all data exchange
   void* args1[] = {
     &input_ptr,
     &output_ptr,
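
To sanity-check the resulting launch sizes, the helper below just re-implements the thresholds from the hunk above for illustration; the values in the example calls are arbitrary:

    # Re-implementation of the CTA-count rule above, for illustration only.
    def num_blocks(input_size_bytes: int, world_size: int) -> int:
        mb = 1024 * 1024
        if input_size_bytes < mb:
            max_blocks_per_peer = 1
        elif input_size_bytes < 2 * mb:
            max_blocks_per_peer = 2
        elif input_size_bytes < 4 * mb:
            max_blocks_per_peer = 4
        else:
            max_blocks_per_peer = 8
        return 8 if world_size > 8 else max_blocks_per_peer * world_size

    print(num_blocks(512 * 1024, 8))        # 8: intra-node, small input -> 1 block per peer
    print(num_blocks(8 * 1024 * 1024, 8))   # 64: intra-node, large input -> 8 blocks per peer
    print(num_blocks(8 * 1024 * 1024, 16))  # 8: world_size > 8 is treated as inter-node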

torch/utils/viz/MemoryViz.js

Lines changed: 4 additions & 1 deletion
@@ -1228,6 +1228,7 @@ function create_trace_view(
   dst.selectAll('svg').remove();
   dst.selectAll('div').remove();

+  max_entries = Math.min(max_entries, data.elements_length);
   const d = dst.append('div');
   d.append('input')
     .attr('type', 'range')
@@ -1237,7 +1238,9 @@
     .on('change', function () {
       create_trace_view(dst, snapshot, device, plot_segments, this.value);
     });
-  d.append('label').text('Detail');
+  d.append('label').text(
+    `Detail: ${max_entries} of ${data.elements_length} entries`,
+  );

   const grid_container = dst
     .append('div')

torchgen/utils.py

Lines changed: 18 additions & 6 deletions
@@ -10,8 +10,8 @@
 from dataclasses import fields, is_dataclass
 from enum import auto, Enum
 from pathlib import Path
-from typing import Any, Callable, Generic, Literal, TYPE_CHECKING, TypeVar
-from typing_extensions import assert_never, Self
+from typing import Any, Callable, Generic, Literal, NoReturn, TYPE_CHECKING, TypeVar
+from typing_extensions import assert_never, deprecated, Self

 from torchgen.code_template import CodeTemplate

@@ -97,6 +97,15 @@ def context(msg_fn: Callable[[], str]) -> Iterator[None]:
         raise


+if TYPE_CHECKING:
+    # A little trick from https://github.com/python/mypy/issues/6366
+    # for getting mypy to do exhaustiveness checking
+    # TODO: put this somewhere else, maybe
+    @deprecated("Use typing_extensions.assert_never instead")
+    def assert_never(x: NoReturn) -> NoReturn:  # type: ignore[misc] # noqa: F811
+        raise AssertionError(f"Unhandled type: {type(x).__name__}")
+
+
 @functools.cache
 def _read_template(template_fn: str) -> CodeTemplate:
     return CodeTemplate.from_file(template_fn)
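
For context, the trick being kept here (now marked deprecated in favor of typing_extensions.assert_never) is the usual exhaustiveness-checking pattern. A generic illustration, not code from this repository:

    from enum import auto, Enum
    from typing_extensions import assert_never

    class Target(Enum):        # made-up enum for illustration
        declaration = auto()
        definition = auto()

    def render(t: Target) -> str:
        if t is Target.declaration:
            return "decl"
        elif t is Target.definition:
            return "defn"
        else:
            # If a new member is added and not handled above, the argument here
            # is no longer of type NoReturn and mypy reports an error.
            assert_never(t)
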
@@ -173,14 +182,17 @@ def substitute_with_template(
     }
     template = _read_template(template_path)
     substitute_out = template.substitute(env)
-    # Ensure an extra blank line before the class/function definition
-    # if it is followed by a docstring
+    # Ensure an extra blank line between the class/function definition
+    # and the docstring of the previous class/function definition.
+    # NB: It is generally not recommended to have docstrings in pyi stub
+    # files. But if there are any, we need to ensure that the file
+    # is properly formatted.
     return re.sub(
         r'''
         (""")\n+            # match triple quotes
         (
-            ([ ]*@.+\n)*    # match decorators if any
-            [ ]*(class|def) # match class/function definition
+            (\s*@.+\n)*     # match decorators if any
+            \s*(class|def)  # match class/function definition
         )
         ''',
         r"\g<1>\n\n\g<2>",

0 commit comments
