support List[dict] prompt input without do_preprocess (#3385) · InternLM/lmdeploy@9b55d0b · GitHub

Commit 9b55d0b

support List[dict] prompt input without do_preprocess (#3385)
* support List[dict] prompt input without do_preprocess

* set cmake policy minimum version as 3.5 (#3376)
1 parent c02dd78 commit 9b55d0b
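
For reference, a minimal usage sketch of what this change enables, assuming the `pipeline` entry point and its `do_preprocess` argument behave as in current lmdeploy releases (the model name and generation settings below are placeholders, not part of this commit): an OpenAI-style `List[dict]` prompt can now be passed even when prompt preprocessing is turned off, in which case a plain `BaseChatTemplate` is used to turn the messages into a prompt string.

from lmdeploy import GenerationConfig, pipeline

# Placeholder model; any chat model served by lmdeploy should work the same way.
pipe = pipeline('internlm/internlm2_5-7b-chat')

messages = [
    {'role': 'user', 'content': 'Write a haiku about mountains.'},
]

# Before this commit, a List[dict] prompt combined with do_preprocess=False had
# no chat template to fall back on; now BaseChatTemplate flattens the messages.
response = pipe(messages,
                gen_config=GenerationConfig(max_new_tokens=64),
                do_preprocess=False)
print(response.text)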

File tree

.github/workflows/unit-test.yml
generate.sh
lmdeploy/model.py
lmdeploy/serve/async_engine.py
lmdeploy/serve/vl_async_engine.py

5 files changed: +11 -7 lines changed

.github/workflows/unit-test.yml

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ jobs:
           -DUSE_NVTX=ON \
           -DSM=80 \
           -DCMAKE_CUDA_ARCHITECTURES=80 \
+          -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
           -DBUILD_TEST=OFF
         make -j$(nproc) && make install
       - name: Install lmdeploy

generate.sh

Lines changed: 1 addition & 0 deletions
@@ -14,4 +14,5 @@ cmake ${builder} .. \
     -DBUILD_PY_FFI=ON \
     -DBUILD_MULTI_GPU=ON \
     -DCMAKE_CUDA_FLAGS="-lineinfo" \
+    -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
     -DUSE_NVTX=ON

lmdeploy/model.py

Lines changed: 1 addition & 1 deletion
@@ -228,7 +228,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
             role = message['role']
             content = get_text(message['content'])
             ret += f'{box_map[role]}{content}{eox_map[role]}'
-        if len(messages) and messages[-1]['role'] == 'assistant':
+        if len(messages) and messages[-1]['role'] == 'assistant' and len(eox_map['assistant']) > 0:
             return ret[:-len(eox_map['assistant'])]  # prefix of response
         ret += f'{self.assistant}'
         return ret

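Why the extra `len(eox_map['assistant']) > 0` check matters: with a template whose assistant end-of-turn token is an empty string (as appears to be the case for the plain `BaseChatTemplate` this commit falls back to), `ret[:-len('')]` evaluates to `ret[:0]`, i.e. an empty prompt. A standalone sketch of that pitfall, with made-up strings:

# Illustrative values only; not taken from any real chat template.
ret = '<|user|>hi<|eoh|><|assistant|>partial answer'
eox = ''  # assistant end-of-turn token is empty

# Without the guard, stripping a zero-length suffix wipes the whole prompt:
assert ret[:-len(eox)] == ''  # ret[:-0] == ret[:0] == ''

# With the guard, the prompt is left intact when the token is empty:
if len(eox) > 0:
    ret = ret[:-len(eox)]
print(ret)  # '<|user|>hi<|eoh|><|assistant|>partial answer'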
lmdeploy/serve/async_engine.py

Lines changed: 4 additions & 2 deletions
@@ -21,7 +21,7 @@
 from lmdeploy.archs import get_model_arch
 from lmdeploy.logger import RequestLogger
 from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig
-from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model
+from lmdeploy.model import MODELS, BaseChatTemplate, ChatTemplateConfig, best_match_model
 from lmdeploy.serve.utils import LogitsMixin
 from lmdeploy.tokenizer import DetokenizeState
 from lmdeploy.utils import _get_and_verify_max_len, _stop_words, get_hf_gen_cfg, get_logger
@@ -560,7 +560,9 @@ async def _get_prompt_input(self,
             chat_template = self.chat_template
             if adapter_name in MODELS.module_dict:
                 chat_template = MODELS.module_dict[adapter_name]()
-            prompt = chat_template.messages2prompt(prompt, sequence_start, tools=tools)
+        else:
+            chat_template = BaseChatTemplate()
+        prompt = chat_template.messages2prompt(prompt, sequence_start, tools=tools)
         if prompt is None:
             raise ValueError(
                 f'You are using base template to handle chat task. Please specify a `--chat-template` name chosen from `lmdeploy list` if you want to use OpenAI messages input.'  # noqa

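To see what the new fallback actually produces, here is a quick check of `BaseChatTemplate.messages2prompt` on OpenAI-style messages; a sketch only, so verify against your installed lmdeploy:

from lmdeploy.model import BaseChatTemplate

messages = [
    {'role': 'system', 'content': 'You are a terse assistant.'},
    {'role': 'user', 'content': 'Name one prime number.'},
]

# BaseChatTemplate is what _get_prompt_input now uses when do_preprocess is
# False; with its default (empty) role tokens the result is essentially the
# concatenated message contents rather than a model-specific chat format.
prompt = BaseChatTemplate().messages2prompt(messages, sequence_start=True)
print(repr(prompt))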
lmdeploy/serve/vl_async_engine.py

Lines changed: 4 additions & 4 deletions
@@ -6,6 +6,7 @@
 import PIL
 
 from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig, VisionConfig
+from lmdeploy.model import BaseChatTemplate
 from lmdeploy.serve.async_engine import AsyncEngine
 from lmdeploy.utils import get_logger, try_import_deeplink
 from lmdeploy.vl.engine import ImageEncoder
@@ -71,6 +72,7 @@ async def _get_prompt_input(self,
         else:
             raise RuntimeError(f'unsupported messages {messages}')
 
+        chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_convert_to_pil_images(messages)
         results = await self.vl_encoder.preprocess(messages)
         if self.backend == 'turbomind':
@@ -80,13 +82,11 @@ async def _get_prompt_input(self,
             # embedding_ranges and so on. All the returned values are passed
             # to tm engine for token generation
             results = await self.vl_encoder.async_infer(results)
-            results = await self.vl_encoder.wrap_for_turbomind(results, self.chat_template, self.tokenizer,
-                                                               sequence_start)
+            results = await self.vl_encoder.wrap_for_turbomind(results, chat_template, self.tokenizer, sequence_start)
         elif self.backend == 'pytorch':
             # for pt engine, this module only conduct the image preprocessing
             # It leaves the vision embedding to the pt engine
-            results = await self.vl_encoder.wrap_for_pytorch(results, self.chat_template, self.tokenizer,
-                                                             sequence_start)
+            results = await self.vl_encoder.wrap_for_pytorch(results, chat_template, self.tokenizer, sequence_start)
         return results
 
     @classmethod

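And a hedged sketch of the vision path this file touches: with this commit the chat template used by `wrap_for_turbomind` / `wrap_for_pytorch` follows `do_preprocess` instead of always being the model's own template, so a GPT-4V style `List[dict]` prompt can also be sent with preprocessing disabled. The model name, image URL, and message layout below are assumptions, not part of the diff.

from lmdeploy import pipeline

pipe = pipeline('OpenGVLab/InternVL2-2B')  # placeholder VLM

messages = [{
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image.'},
        {'type': 'image_url', 'image_url': {'url': 'https://example.com/cat.jpg'}},
    ],
}]

# do_preprocess=False now routes the wrapping step through the BaseChatTemplate
# fallback instead of always using the model's own template.
response = pipe(messages, do_preprocess=False)
print(response.text)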
0 commit comments

Comments
 (0)
0