adapt the api · ggml-org/llama.cpp@7033aa1 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7033aa1

Browse files
committed
adapt the api
1 parent 23d0d7f commit 7033aa1

File tree

4 files changed

+55
-29
lines changed

4 files changed

+55
-29
lines changed

tools/mtmd/mtmd-cli.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,8 +283,8 @@ int main(int argc, char ** argv) {
283283

284284
if (is_single_turn) {
285285
g_is_generating = true;
286-
if (params.prompt.find("<__image__>") == std::string::npos) {
287-
params.prompt += " <__image__>";
286+
if (params.prompt.find(MTMD_DEFAULT_MEDIA_MARKER) == std::string::npos) {
287+
params.prompt += MTMD_DEFAULT_MEDIA_MARKER;
288288
}
289289
common_chat_msg msg;
290290
msg.role = "user";
@@ -341,7 +341,7 @@ int main(int argc, char ** argv) {
341341
std::string image = line.substr(7);
342342
if (ctx.load_image(image)) {
343343
LOG("Image %s loaded\n", image.c_str());
344-
content += "<__image__>";
344+
content += MTMD_DEFAULT_MEDIA_MARKER;
345345
}
346346
// else, error is already printed by libmtmd
347347
continue;

tools/mtmd/mtmd.cpp

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ mtmd_context_params mtmd_context_params_default() {
8686
params.n_threads = 4;
8787
params.verbosity = GGML_LOG_LEVEL_INFO;
8888
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
89+
params.media_marker = MTMD_DEFAULT_MEDIA_MARKER;
8990
return params;
9091
}
9192

@@ -96,7 +97,7 @@ struct mtmd_context {
9697

9798
bool print_timings;
9899
int n_threads;
99-
std::string image_marker;
100+
std::string media_marker;
100101
bool has_vision;
101102
bool has_audio;
102103

@@ -127,8 +128,12 @@ struct mtmd_context {
127128
text_model (text_model),
128129
print_timings(ctx_params.print_timings),
129130
n_threads (ctx_params.n_threads),
130-
image_marker (ctx_params.image_marker)
131+
media_marker (ctx_params.media_marker)
131132
{
133+
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
134+
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
135+
}
136+
132137
clip_context_params ctx_clip_params;
133138
ctx_clip_params.use_gpu = ctx_params.use_gpu;
134139
ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -269,48 +274,51 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
269274
auto vocab = llama_model_get_vocab(ctx->text_model);
270275

271276
std::string prompt_modified(text->text);
272-
std::string marker_modified(ctx->image_marker);
277+
std::string marker_modified(ctx->media_marker);
273278
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
274279

280+
// for compatibility, we convert image marker to media marker
281+
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
282+
275283
// a bit hacky here, but works for now
276284
// for some models, we need to add prefix and suffix to the image embeddings
277285
if (clip_is_gemma3(ctx->ctx_clip)) {
278286
// gemma 3
279287
// <start_of_image> ... (image embeddings) ... <end_of_image>
280-
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
281-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
288+
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
289+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
282290

283291
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
284292
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
285-
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
286-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
293+
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
294+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
287295

288296
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
289297
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
290-
marker_modified = ctx->image_marker + "[IMG_END]";
291-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
298+
marker_modified = ctx->media_marker + "[IMG_END]";
299+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
292300

293301
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
294302
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
295-
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
296-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
303+
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
304+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
297305

298306
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
299307
// (more details in mtmd_context constructor)
300-
marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
301-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
308+
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
309+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
302310

303311
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
304312
// <img> ... (image embeddings) ... </img>
305-
marker_modified = "<img>" + ctx->image_marker + "</img>";
306-
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
313+
marker_modified = "<img>" + ctx->media_marker + "</img>";
314+
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
307315

308316
}
309317

310318
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
311319
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
312320

313-
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
321+
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
314322
output->entries.clear();
315323
output->entries.reserve(parts.size());
316324

@@ -820,6 +828,15 @@ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
820828
}
821829
}
822830

831+
const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
832+
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
833+
return chunk->tokens_image->id.c_str();
834+
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
835+
return chunk->tokens_audio->id.c_str();
836+
}
837+
return nullptr;
838+
}
839+
823840
mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
824841
mtmd_input_chunk * copy = new mtmd_input_chunk{
825842
chunk->type,

tools/mtmd/mtmd.h

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
# define MTMD_API
4040
#endif
4141

42+
#define MTMD_DEFAULT_MEDIA_MARKER "<__media__>"
43+
44+
// deprecated marker, use MTMD_DEFAULT_MEDIA_MARKER instead
4245
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
4346

4447
#ifdef __cplusplus
@@ -80,7 +83,8 @@ struct mtmd_context_params {
8083
bool print_timings;
8184
int n_threads;
8285
enum ggml_log_level verbosity;
83-
const char * image_marker;
86+
const char * image_marker; // deprecated, use media_marker instead
87+
const char * media_marker;
8488
};
8589

8690
MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
@@ -112,6 +116,7 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
112116
// the data is in RGBRGBRGB... format
113117
// if bitmap is audio:
114118
// length of data must be n_samples * sizeof(float)
119+
// the data is in float format (PCM F32)
115120
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
116121
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
117122
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
@@ -141,8 +146,11 @@ MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chu
141146
MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
142147
MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
143148
MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
144-
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
145149
MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
150+
// returns nullptr for ID on text chunk
151+
MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
152+
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
153+
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
146154

147155
// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
148156
// you can move the chunk ownership to your own code by copying it
@@ -162,20 +170,21 @@ MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * i
162170
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
163171
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
164172

165-
// tokenize an input text prompt and an image
166-
// the prompt must have the input image marker (default: "<__image__>") in it
167-
// the marker will be replaced with the image tokens
173+
// tokenize an input text prompt and a list of bitmaps (images/audio)
174+
// the prompt must have the input image marker (default: "<__media__>") in it
175+
// the default marker is defined by MTMD_DEFAULT_MEDIA_MARKER
176+
// the marker will be replaced with the image/audio chunk
168177
// for example:
169-
// "here is an image: <__image__>\ndescribe it in detail."
178+
// "here is an image: <__media__>\ndescribe it in detail."
170179
// this will gives 3 chunks:
171180
// 1. "here is an image: <start_of_image>"
172-
// 2. (image tokens)
181+
// 2. (image/audio tokens)
173182
// 3. "<end_of_image>\ndescribe it in detail."
174-
// number of bitmaps must be equal to the number of image markers in the prompt
183+
// number of bitmaps must be equal to the number of markers in the prompt
175184
// this function is thread-safe (shared ctx)
176185
// return values:
177186
// 0 on success
178-
// 1 on number of images not matching the number of markers
187+
// 1 on number of bitmaps not matching the number of markers
179188
// 2 on image preprocessing error
180189
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
181190
mtmd_input_chunks * output,

tools/server/utils.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,7 @@ static json oaicompat_completion_params_parse(
710710

711711
// replace this chunk with a marker
712712
p["type"] = "text";
713-
p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
713+
p["text"] = MTMD_DEFAULT_MEDIA_MARKER;
714714
p.erase("image_url");
715715
}
716716
}

0 commit comments

Comments
 (0)
0