Zmz/prefill without permute by dlblas by hellozmz · Pull Request #3430 · InternLM/lmdeploy · GitHub


Zmz/prefill without permute by dlblas #3430


Closed
Changes from 1 commit
add args prefill_without_permute
hellozmz committed Apr 14, 2025
commit 47d9980ff9a5105cfc9168d5da95df09ba6c052d
11 changes: 11 additions & 0 deletions lmdeploy/cli/utils.py
@@ -512,6 +512,17 @@ def eager_mode(parser):
                                   help='Whether to enable eager mode. '
                                   'If True, cuda graph would be disabled')

    @staticmethod
    def prefill_without_permute(parser):
        """Add argument prefill_without_permute to parser."""
Collaborator comment: Regarding dlblas' option, I recommend using env variables.


        return parser.add_argument('--prefill-without-permute',
                                   action='store_true',
                                   default=False,
                                   help='Whether to enable prefill_without_permute. '
                                   'If True, the moe layer would not permute the input, '
                                   'and would not unpermute the output')
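A hedged sketch of the reviewer's env-variable suggestion, not part of this PR: the same switch could be read from the environment instead of a CLI flag. The variable name LMDEPLOY_PREFILL_WITHOUT_PERMUTE and the helper below are hypothetical.

import os

def _env_to_bool(name: str, default: bool = False) -> bool:
    # Hypothetical helper: parse a boolean switch from an environment variable.
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in ('1', 'true', 'yes', 'on')

# Hypothetical alternative to --prefill-without-permute:
prefill_without_permute = _env_to_bool('LMDEPLOY_PREFILL_WITHOUT_PERMUTE')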

    @staticmethod
    def communicator(parser):
        return parser.add_argument('--communicator',
3 changes: 3 additions & 0 deletions lmdeploy/messages.py
@@ -297,6 +297,8 @@ class PytorchEngineConfig:
            bit, set it to 4 or 8, respectively
        distributed_executor_backend (str): backend of distributed backend,
            options: ['uni', 'mp', 'ray']
        prefill_without_permute(bool): whether to use moe without permute.
            Default to False.
"""
dtype: str = 'auto'
tp: int = 1
Expand All @@ -321,6 +323,7 @@ class PytorchEngineConfig:
revision: str = None
quant_policy: Literal[0, 4, 8] = 0
distributed_executor_backend: str = None
prefill_without_permute: bool = False

    def __post_init__(self):
        """Check input validation."""
2 changes: 2 additions & 0 deletions lmdeploy/pytorch/backends/cuda/moe.py
@@ -6,6 +6,7 @@
import torch.distributed as dist

from lmdeploy.pytorch.backends.cuda.token_dispatcher import DeepEPTokenDispatcherLowLatency, TokenDispatcherBuilder
from lmdeploy.pytorch.distributed import prefill_without_permute
from lmdeploy.pytorch.kernels.cuda import fused_moe, fused_moe_w8a8
from lmdeploy.pytorch.kernels.cuda.blocked_fp8_fused_moe import fused_moe_blocked_fp8
from lmdeploy.pytorch.kernels.cuda.blocked_gemm_fp8 import quant_fp8
@@ -20,6 +21,7 @@
from ..moe import (FusedMoEBlockedF8Builder, FusedMoEBlockedF8Impl, FusedMoEBuilder, FusedMoEImpl, FusedMoEW8A8Builder,
                   FusedMoEW8A8Impl)

is_prefill_without_permute = prefill_without_permute()
logger = get_logger('lmdeploy')


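For context, a minimal standalone sketch (not lmdeploy code) of the permute/unpermute steps that the new option is meant to skip during prefill, following the help text of --prefill-without-permute:

import torch

tokens = torch.randn(6, 4)                   # [num_tokens, hidden]
topk_ids = torch.tensor([2, 0, 1, 0, 2, 1])  # routed expert per token (top-1 for brevity)

# Usual MoE path: permute, i.e. group the tokens by expert, before the per-expert GEMMs.
sort_idx = torch.argsort(topk_ids)
permuted = tokens[sort_idx]

expert_out = permuted * 2.0                  # stand-in for the per-expert computation

# ...then unpermute the outputs back to the original token order.
out = torch.empty_like(expert_out)
out[sort_idx] = expert_out

# With prefill_without_permute enabled, the dlblas prefill kernels are expected to
# consume the tokens in their original order, skipping both reorderings.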
1 change: 1 addition & 0 deletions lmdeploy/pytorch/config.py
@@ -97,6 +97,7 @@ class DistConfig:
    dp_rank: int = 0
    world_size: int = None
    attn_config: 'DistConfig' = None
    prefill_without_permute: bool = False

    def __post_init__(self):
        """post init."""
8 changes: 8 additions & 0 deletions lmdeploy/pytorch/distributed.py
@@ -29,6 +29,7 @@ class DistContext:
    ep_gpu_group: dist.ProcessGroup = None
    ep_gpu_groups: List[dist.ProcessGroup] = None
    dist_config: DistConfig = None
    prefill_without_permute: bool = False

    @classmethod
    def build(cls, rank: int = 0, dist_config: DistConfig = None, ccl_backend: str = 'nccl'):
@@ -44,6 +45,7 @@ def build(cls, rank: int = 0, dist_config: DistConfig = None, ccl_backend: str =
        ep = dist_config.ep
        world_size = dist_config.world_size
        dp_rank = dist_config.dp_rank
        prefill_without_permute = dist_config.prefill_without_permute

        if world_size == 1:
            return DistContext(dist_config=dist_config)
@@ -104,6 +106,7 @@ def build(cls, rank: int = 0, dist_config: DistConfig = None, ccl_backend: str =
            ep_gpu_group=ep_gpu_group,
            ep_gpu_groups=ep_gpu_groups,
            dist_config=dist_config,
            prefill_without_permute=prefill_without_permute,
        )
        return context

@@ -181,6 +184,11 @@ def get_ep_world_rank():
    return ctx.ep, ctx.ep_rank


def prefill_without_permute():
    ctx = get_dist_manager().current_context()
    return ctx.prefill_without_permute


def _check_group_device(device: str):
    """check group device."""
    assert (device in ['cpu', 'gpu']), ('Expect process group device in ("cpu", "gpu"), '
1 change: 1 addition & 0 deletions lmdeploy/pytorch/engine/engine.py
@@ -89,6 +89,7 @@ def _build_dist_config(engine_config: PytorchEngineConfig):
        tp=engine_config.tp,
        ep=engine_config.ep,
        dp_rank=engine_config.dp_rank,
        prefill_without_permute=engine_config.prefill_without_permute,
    )
    return dist_config
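A minimal usage sketch of the new option, assuming this branch is installed; the model path and ep value are placeholders.

from lmdeploy import pipeline, PytorchEngineConfig

# Enable the dlblas prefill-without-permute MoE path added in this PR.
engine_config = PytorchEngineConfig(
    ep=8,                         # expert parallel size; placeholder
    prefill_without_permute=True,
)
pipe = pipeline('/path/to/moe-model', backend_config=engine_config)
print(pipe('Hello'))

On the command line, the equivalent switch would be the new --prefill-without-permute flag, assuming the PR wires ArgumentHelper.prefill_without_permute into the relevant subcommands.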