support List[dict] prompt input without do_preprocess (#3385) · InternLM/lmdeploy@9b55d0b · GitHub

Commit 9b55d0b

support List[dict] prompt input without do_preprocess (#3385)
* support List[dict] prompt input without do_preprocess

* set cmake policy minimum version as 3.5 (#3376)
1 parent c02dd78 commit 9b55d0b
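
For reference, a minimal usage sketch of what this change enables, assuming the `pipeline` entry point and its `do_preprocess` argument behave as in current lmdeploy releases (the model name and generation settings below are placeholders, not part of this commit): an OpenAI-style `List[dict]` prompt can now be passed even when prompt preprocessing is turned off, in which case a plain `BaseChatTemplate` is used to turn the messages into a prompt string.

from lmdeploy import GenerationConfig, pipeline

# Placeholder model; any chat model served by lmdeploy should work the same way.
pipe = pipeline('internlm/internlm2_5-7b-chat')

messages = [
    {'role': 'user', 'content': 'Write a haiku about mountains.'},
]

# Before this commit, a List[dict] prompt combined with do_preprocess=False had
# no chat template to fall back on; now BaseChatTemplate flattens the messages.
response = pipe(messages,
                gen_config=GenerationConfig(max_new_tokens=64),
                do_preprocess=False)
print(response.text)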

File tree

.github/workflows/unit-test.yml
generate.sh
lmdeploy/model.py
lmdeploy/serve/async_engine.py
lmdeploy/serve/vl_async_engine.py

5 files changed: +11 -7 lines changed

.github/workflows/unit-test.yml

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ jobs:
           -DUSE_NVTX=ON \
           -DSM=80 \
           -DCMAKE_CUDA_ARCHITECTURES=80 \
+          -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
           -DBUILD_TEST=OFF
         make -j$(nproc) && make install
       - name: Install lmdeploy

generate.sh

Lines changed: 1 addition & 0 deletions
@@ -14,4 +14,5 @@ cmake ${builder} .. \
     -DBUILD_PY_FFI=ON \
     -DBUILD_MULTI_GPU=ON \
     -DCMAKE_CUDA_FLAGS="-lineinfo" \
+    -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
     -DUSE_NVTX=ON

lmdeploy/model.py

Lines changed: 1 addition & 1 deletion
@@ -228,7 +228,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
             role = message['role']
             content = get_text(message['content'])
             ret += f'{box_map[role]}{content}{eox_map[role]}'
-        if len(messages) and messages[-1]['role'] == 'assistant':
+        if len(messages) and messages[-1]['role'] == 'assistant' and len(eox_map['assistant']) > 0:
             return ret[:-len(eox_map['assistant'])]  # prefix of response
         ret += f'{self.assistant}'
         return ret

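Why the extra `len(eox_map['assistant']) > 0` check matters: with a template whose assistant end-of-turn token is an empty string (as appears to be the case for the plain `BaseChatTemplate` this commit falls back to), `ret[:-len('')]` evaluates to `ret[:0]`, i.e. an empty prompt. A standalone sketch of that pitfall, with made-up strings:

# Illustrative values only; not taken from any real chat template.
ret = '<|user|>hi<|eoh|><|assistant|>partial answer'
eox = ''  # assistant end-of-turn token is empty

# Without the guard, stripping a zero-length suffix wipes the whole prompt:
assert ret[:-len(eox)] == ''  # ret[:-0] == ret[:0] == ''

# With the guard, the prompt is left intact when the token is empty:
if len(eox) > 0:
    ret = ret[:-len(eox)]
print(ret)  # '<|user|>hi<|eoh|><|assistant|>partial answer'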
lmdeploy/serve/async_engine.py

Lines changed: 4 additions & 2 deletions
@@ -21,7 +21,7 @@
 from lmdeploy.archs import get_model_arch
 from lmdeploy.logger import RequestLogger
 from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig
-from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model
+from lmdeploy.model import MODELS, BaseChatTemplate, ChatTemplateConfig, best_match_model
 from lmdeploy.serve.utils import LogitsMixin
 from lmdeploy.tokenizer import DetokenizeState
 from lmdeploy.utils import _get_and_verify_max_len, _stop_words, get_hf_gen_cfg, get_logger
@@ -560,7 +560,9 @@ async def _get_prompt_input(self,
             chat_template = self.chat_template
             if adapter_name in MODELS.module_dict:
                 chat_template = MODELS.module_dict[adapter_name]()
-            prompt = chat_template.messages2prompt(prompt, sequence_start, tools=tools)
+        else:
+            chat_template = BaseChatTemplate()
+        prompt = chat_template.messages2prompt(prompt, sequence_start, tools=tools)
         if prompt is None:
             raise ValueError(
                 f'You are using base template to handle chat task. Please specify a `--chat-template` name chosen from `lmdeploy list` if you want to use OpenAI messages input.'  # noqa

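To see what the new fallback actually produces, here is a quick check of `BaseChatTemplate.messages2prompt` on OpenAI-style messages; a sketch only, so verify against your installed lmdeploy:

from lmdeploy.model import BaseChatTemplate

messages = [
    {'role': 'system', 'content': 'You are a terse assistant.'},
    {'role': 'user', 'content': 'Name one prime number.'},
]

# BaseChatTemplate is what _get_prompt_input now uses when do_preprocess is
# False; with its default (empty) role tokens the result is essentially the
# concatenated message contents rather than a model-specific chat format.
prompt = BaseChatTemplate().messages2prompt(messages, sequence_start=True)
print(repr(prompt))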
lmdeploy/serve/vl_async_engine.py

Lines changed: 4 additions & 4 deletions
@@ -6,6 +6,7 @@
 import PIL
 
 from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig, VisionConfig
+from lmdeploy.model import BaseChatTemplate
 from lmdeploy.serve.async_engine import AsyncEngine
 from lmdeploy.utils import get_logger, try_import_deeplink
 from lmdeploy.vl.engine import ImageEncoder
@@ -71,6 +72,7 @@ async def _get_prompt_input(self,
         else:
             raise RuntimeError(f'unsupported messages {messages}')
 
+        chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_convert_to_pil_images(messages)
         results = await self.vl_encoder.preprocess(messages)
         if self.backend == 'turbomind':
@@ -80,13 +82,11 @@ async def _get_prompt_input(self,
             # embedding_ranges and so on. All the returned values are passed
             # to tm engine for token generation
             results = await self.vl_encoder.async_infer(results)
-            results = await self.vl_encoder.wrap_for_turbomind(results, self.chat_template, self.tokenizer,
-                                                               sequence_start)
+            results = await self.vl_encoder.wrap_for_turbomind(results, chat_template, self.tokenizer, sequence_start)
         elif self.backend == 'pytorch':
             # for pt engine, this module only conduct the image preprocessing
             # It leaves the vision embedding to the pt engine
-            results = await self.vl_encoder.wrap_for_pytorch(results, self.chat_template, self.tokenizer,
-                                                             sequence_start)
+            results = await self.vl_encoder.wrap_for_pytorch(results, chat_template, self.tokenizer, sequence_start)
         return results
 
     @classmethod

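And a hedged sketch of the vision path this file touches: with this commit the chat template used by `wrap_for_turbomind` / `wrap_for_pytorch` follows `do_preprocess` instead of always being the model's own template, so a GPT-4V style `List[dict]` prompt can also be sent with preprocessing disabled. The model name, image URL, and message layout below are assumptions, not part of the diff.

from lmdeploy import pipeline

pipe = pipeline('OpenGVLab/InternVL2-2B')  # placeholder VLM

messages = [{
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image.'},
        {'type': 'image_url', 'image_url': {'url': 'https://example.com/cat.jpg'}},
    ],
}]

# do_preprocess=False now routes the wrapping step through the BaseChatTemplate
# fallback instead of always using the model's own template.
response = pipe(messages, do_preprocess=False)
print(response.text)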
0 commit comments

Comments
 (0)
0