random pad input ids (#3530) · InternLM/lmdeploy@20a4272 · GitHub

Commit 20a4272
random pad input ids (#3530)
1 parent 50b0ef7 commit 20a4272

File tree

6 files changed: +20 -6 lines changed

lmdeploy/pytorch/backends/cuda/graph_runner.py

Lines changed: 4 additions & 0 deletions
@@ -43,10 +43,12 @@ def __init__(
         num_blocks: int,
         is_decoding: bool,
         pool: Tuple[int, int],
+        model_config: ModelConfig,
         device: torch.device,
     ):
         self.model = model
         self.ctx_mgr = model.ctx_mgr
+        self.model_config = model_config
 
         self.meta = CudaGraphMeta(
             max_batchs=max_batches,
@@ -56,6 +58,7 @@ def __init__(
             device=device,
             input_buffers=dict(),
             output_buffers=dict(),
+            vocab_size=self.model_config.vocab_size,
         )
         self.device = device
         self.max_batches = max_batches
@@ -171,6 +174,7 @@ def __call__(self, **kwargs):
                 num_blocks=self.num_blocks,
                 is_decoding=is_decoding,
                 pool=self.graph_pool_handle,
+                model_config=self.model_config,
                 device=self.device)
             runner.capture(**kwargs)
             self._runner_map[graph_key] = runner
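
For orientation, a minimal standalone sketch of what this plumbing accomplishes; the classes below are hypothetical stand-ins for ModelConfig, CudaGraphMeta and the graph runner, not the lmdeploy implementations. The runner keeps the model config it receives and copies its vocab_size into the CUDA-graph metadata so buffer code can later draw valid token ids.

# Standalone sketch only: DummyModelConfig / DummyGraphMeta / DummyRunner are
# hypothetical stand-ins, not the lmdeploy classes touched by this commit.
from dataclasses import dataclass, field
from typing import Dict

import torch


@dataclass
class DummyModelConfig:
    vocab_size: int = 32000  # assumed value for illustration


@dataclass
class DummyGraphMeta:
    max_tokens: int
    vocab_size: int = 1
    input_buffers: Dict[str, torch.Tensor] = field(default_factory=dict)


class DummyRunner:

    def __init__(self, model_config: DummyModelConfig, max_tokens: int):
        # keep the config so capture-time code can read the vocab size
        self.model_config = model_config
        self.meta = DummyGraphMeta(max_tokens=max_tokens,
                                   vocab_size=model_config.vocab_size)


runner = DummyRunner(DummyModelConfig(), max_tokens=256)
print(runner.meta.vocab_size)  # 32000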

lmdeploy/pytorch/engine/engine.py

Lines changed: 1 addition & 1 deletion
@@ -835,7 +835,7 @@ def __make_dummy_inputs():
             num_loops = 1 if prefill else prefill_interval
             return dict(
                 running=[],
-                inputs=ModelInputs.make_dummy(1, is_decoding=not prefill),
+                inputs=ModelInputs.make_dummy(1, is_decoding=not prefill, vocab_size=self.model_config.vocab_size),
                 swap_in_map=dict(),
                 swap_out_map=dict(),
                 loop_count=num_loops,

lmdeploy/pytorch/engine/model_agent.py

Lines changed: 1 addition & 1 deletion
@@ -301,7 +301,7 @@ async def __long_context_single_forward(new_inputs, max_seqlen: int):
 
         # compute dummy loop
         if dummy_loop > 0:
-            dummy_inputs = ModelInputs.make_dummy(1, False, 'cuda')
+            dummy_inputs = ModelInputs.make_dummy(1, False, 'cuda', vocab_size=self.model_config.vocab_size)
         for _ in range(dummy_loop):
             await __forward(dummy_inputs)
 

lmdeploy/pytorch/model_inputs.py

Lines changed: 7 additions & 2 deletions
@@ -285,9 +285,14 @@ def build_dp_meta(self):
         self.dp_meta = DPMeta.build(self.input_ids.numel())
 
     @classmethod
-    def make_dummy(cls, batch_size: int, is_decoding: bool, device: str = 'cpu', dummy_block_id: int = 0):
+    def make_dummy(cls,
+                   batch_size: int,
+                   is_decoding: bool,
+                   device: str = 'cpu',
+                   dummy_block_id: int = 0,
+                   vocab_size: int = 1):
         """make dummy inputs."""
-        input_ids = torch.zeros((
+        input_ids = torch.randint(0, vocab_size, (
             1,
             batch_size,
         ), dtype=torch.long, device=device)
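
The behaviour change in make_dummy can be reproduced in isolation with plain torch calls; the snippet below is a sketch that assumes vocab_size comes from the model config, not the lmdeploy method itself. Dummy input ids are now sampled uniformly from [0, vocab_size) instead of being all zeros, and the default vocab_size=1 preserves the old all-zero output for callers that do not pass the new argument.

import torch

batch_size = 4

# old behaviour: every dummy token id is 0
old_ids = torch.zeros((1, batch_size), dtype=torch.long)

# new behaviour: dummy token ids are random but valid vocabulary entries
vocab_size = 32000  # would come from the model config in lmdeploy
new_ids = torch.randint(0, vocab_size, (1, batch_size), dtype=torch.long)

# with the default vocab_size=1, randint(0, 1, ...) still yields all zeros,
# so callers that omit the argument see the previous behaviour
default_ids = torch.randint(0, 1, (1, batch_size), dtype=torch.long)

print(old_ids, new_ids, default_ids, sep='\n')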

lmdeploy/pytorch/models/utils/cudagraph.py

Lines changed: 6 additions & 1 deletion
@@ -33,6 +33,7 @@ class CudaGraphMeta:
     device: torch.device
     input_buffers: BuffType = None
     output_buffers: BuffType = None
+    vocab_size: int = 1
 
 
 class CudaGraphMixin:
@@ -58,7 +59,10 @@ def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, **kwargs) ->
         device = graph_meta.device
 
         input_buffers: BuffType = dict()
-        input_buffers['input_ids'] = torch.zeros(1, max_tokens, dtype=torch.int64, device=device)
+        input_buffers['input_ids'] = torch.randint(0,
+                                                   graph_meta.vocab_size, (1, max_tokens),
+                                                   dtype=torch.int64,
+                                                   device=device)
         input_buffers['position_ids'] = torch.zeros((1, max_tokens), dtype=torch.int64, device=device)
         if getattr(self.config, 'use_flash_mla', False) is True:
             import flash_mla_cuda
@@ -96,6 +100,7 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_ids: Tensor, p
         num_tokens = input_ids.size(-1)
 
         # fill buffer
+        input_buffers['input_ids'].random_(0, graph_meta.vocab_size)
         input_buffers['input_ids'][:, :num_tokens] = input_ids
         input_buffers['position_ids'][:, :num_tokens] = position_ids
         input_buffers['block_offsets'][:batch_size, :num_blocks] = block_offsets
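
A small sketch of the resulting fill pattern, using plain torch ops and illustrative values rather than the real graph_meta buffers: the persistent capture buffer is re-randomized within the vocabulary on every fill, and the genuine token ids then overwrite the leading positions, so only the padded tail carries random (but in-vocabulary) ids.

import torch

max_tokens = 8
vocab_size = 32000  # would be graph_meta.vocab_size in lmdeploy

# persistent capture buffer, pre-filled with random valid token ids
input_ids_buf = torch.randint(0, vocab_size, (1, max_tokens), dtype=torch.int64)

# at fill time: re-randomize the padding, then copy the real tokens in front
real_ids = torch.tensor([[101, 2023, 2003, 1037]], dtype=torch.int64)  # made-up ids
num_tokens = real_ids.size(-1)
input_ids_buf.random_(0, vocab_size)
input_ids_buf[:, :num_tokens] = real_ids
print(input_ids_buf)  # real ids first, random in-vocab padding after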

lmdeploy/pytorch/nn/linear.py

Lines changed: 1 addition & 1 deletion
@@ -1675,7 +1675,7 @@ def build_qkv_proj(in_features: int,
                    device: Optional[torch.device] = None,
                    is_tp: bool = True,
                    num_replicate_kv_heads: int = 1,
-                   dp_disable_tp: bool = False,
+                   dp_disable_tp: bool = True,
                    all_reduce: bool = False,
                    dp_gather: bool = False):
     """build qkv proj."""
