Commit fd596f4 · pytorch/pytorch

Update
[ghstack-poisoned]
1 parent 741b5a8 commit fd596f4

4 files changed: +14 −8 lines changed


test/distributed/test_nvshmem.py

Lines changed: 6 additions & 4 deletions
@@ -13,6 +13,8 @@
 import torch.distributed._symmetric_memory as symm_mem
 from torch.testing._internal.common_distributed import MultiProcContinousTest
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
     skip_but_pass_in_sandcastle_if,
     skipIfRocm,
@@ -42,6 +44,7 @@ def requires_nvshmem():
 device_module = torch.get_device_module(device_type)
 
 
+@instantiate_parametrized_tests
 @requires_nvshmem()
 class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
     def _init_device(self) -> None:
@@ -129,7 +132,8 @@ def test_nvshmem_all_to_all_vdev(self) -> None:
         torch.testing.assert_close(out[:out_numel], expected)
 
     @skipIfRocm
-    def test_nvshmem_all_to_all_vdev_2d(self) -> None:
+    @parametrize("align", [1, 8, 16])  # `major_align` of output
+    def test_nvshmem_all_to_all_vdev_2d(self, align: int) -> None:
         torch.manual_seed(42 + self.rank)
         self._init_device()
 
@@ -142,8 +146,6 @@ def test_nvshmem_all_to_all_vdev_2d(self) -> None:
         nsplits = ne * self.world_size
         # Number of elements for an expert is random between [0, k)
         k = 3
-        # Align
-        align = 16
         inp_splits = torch.randint(k, (nsplits,), device=self.device)
         inp_numel = inp_splits.sum().item()
         # Exchange input splits to get output splits
@@ -168,7 +170,7 @@ def test_nvshmem_all_to_all_vdev_2d(self) -> None:
         in_out_splits[0].copy_(inp_splits)
 
         torch.ops.symm_mem.nvshmem_all_to_all_vdev_2d(
-            inp, out, in_out_splits, group_name, align
+            inp, out, in_out_splits, group_name, major_align=align
        )
         received_out_splits = in_out_splits[1]
         received_out_offsets = in_out_splits[2]
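Side note on the test change: `@parametrize` plus `@instantiate_parametrized_tests` expands the single test body into one concrete test per `align` value. A minimal standalone sketch of that mechanism (the `AlignDemoTest` class and its assertion are illustrative, not part of this commit):

# Illustrative sketch (not part of this commit): how the parametrized
# test expands. Each value in the list becomes its own test case,
# named roughly test_align_align_1, test_align_align_8, ...
from torch.testing._internal.common_utils import (
    TestCase,
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
)


@instantiate_parametrized_tests
class AlignDemoTest(TestCase):
    @parametrize("align", [1, 8, 16])
    def test_align(self, align: int) -> None:
        self.assertGreater(align, 0)


if __name__ == "__main__":
    run_tests()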

torch/csrc/distributed/c10d/SymmetricMemory.cpp

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ TORCH_LIBRARY_FRAGMENT(symm_mem, m) {
   m.def(
       "nvshmem_all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name) -> Tensor(a!)");
   m.def(
-      "nvshmem_all_to_all_vdev_2d(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name, int major_align) -> Tensor(a!)");
+      "nvshmem_all_to_all_vdev_2d(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name, int? major_align=None) -> Tensor(a!)");
 }
 
 TORCH_LIBRARY_IMPL(symm_mem, Meta, m) {
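With the schema now declaring `int? major_align=None`, callers may omit the argument entirely; per the default applied in nvshmem_extension.cu below, omitting it behaves as an alignment of 1. A hedged usage sketch, assuming `inp`, `out`, `in_out_splits`, and `group_name` are set up as in the test above:

# Usage sketch; `inp`, `out`, `in_out_splits`, and `group_name` are
# assumed to be set up as in test_nvshmem_all_to_all_vdev_2d above.

# Previously `major_align` was a required argument; now it can be
# omitted (equivalent to major_align=1):
torch.ops.symm_mem.nvshmem_all_to_all_vdev_2d(
    inp, out, in_out_splits, group_name
)

# Or passed explicitly as a keyword:
torch.ops.symm_mem.nvshmem_all_to_all_vdev_2d(
    inp, out, in_out_splits, group_name, major_align=16
)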

torch/csrc/distributed/c10d/nvshmem_extension.cu

Lines changed: 6 additions & 2 deletions
@@ -469,7 +469,7 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
     at::Tensor& out,
     at::Tensor& in_out_splits,
     std::string group_name,
-    int64_t major_align) {
+    std::optional<int64_t> major_align) {
   /* Perform a 2D AllToAllv shuffle operation using NVSHMEM, with split information provided on device.
    * Arguments:
    *  - `input` is the input tensor
@@ -514,6 +514,10 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
   // TODO: world_size is currently limited by the number of elements in a WarpScan.
   TORCH_CHECK(world_size <= A2AV_TILE_SIZE, "world_size must be smaller than A2AV_TILE_SIZE", A2AV_TILE_SIZE);
 
+  // If `major_align` is not provided, use 1 as the default value.
+  int64_t major_align_val = major_align.value_or(1);
+  TORCH_CHECK(major_align_val > 0, "major_align must be positive");
+
   void* input_ptr = input_hdl->get_buffer_ptrs()[rank];
   void* output_ptr = out_hdl->get_buffer_ptrs()[rank];
   int64_t* splits_ptr = (int64_t*)(splits_hdl->get_buffer_ptrs()[rank]);
@@ -579,7 +583,7 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
       &rank,
       &world_size,
       &ne,
-      &major_align};
+      &major_align_val};
   nvshmemx_collective_launch(
       (const void*)allToAllV_2d,
       dim3(num_blocks),
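The host wrapper unwraps the optional with `value_or(1)` and hands the kernel a plain `int64_t`. For intuition, here is a rough host-side sketch of what aligning each chunk's output offset to a multiple of `major_align` could look like; this only illustrates the test's "`major_align` of output" comment, and the actual layout is computed on device inside `allToAllV_2d`:

# Illustrative sketch only: rounding each chunk's output offset up to a
# multiple of `major_align`. The real computation happens on device in
# the allToAllV_2d kernel; splits below are made-up example values.
def align_up(n: int, align: int) -> int:
    """Round `n` up to the nearest multiple of `align` (align >= 1)."""
    return (n + align - 1) // align * align

splits = [3, 5, 2]  # example per-chunk element counts
align = 8
offsets, cursor = [], 0
for s in splits:
    offsets.append(cursor)
    cursor = align_up(cursor + s, align)

print(offsets)  # [0, 8, 16] -- each chunk starts at an 8-aligned offset

With `align = 1` the loop degenerates to ordinary prefix sums of the splits, which matches the default behavior when `major_align` is omitted.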

torch/csrc/distributed/c10d/nvshmem_extension.cuh

Lines changed: 1 addition & 1 deletion
@@ -33,6 +33,6 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
     at::Tensor& out,
     at::Tensor& in_out_splits,
     std::string group_name,
-    int64_t major_align = 1);
+    std::optional<int64_t> major_align = std::nullopt);
 
 } // namespace c10d::nvshmem_extension

0 commit comments