adapt the api

ggml-org · ngxson · May 22, 2025 · May 4, 2025 · May 4, 2025 · May 4, 2025
commit 7033aa16a78c894b061efaf9a7b6bccb5e55f26d
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
@@ -283,8 +283,8 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        if (params.prompt.find("<__image__>") == std::string::npos) {
-            params.prompt += " <__image__>";
+        if (params.prompt.find(MTMD_DEFAULT_MEDIA_MARKER) == std::string::npos) {
+            params.prompt += MTMD_DEFAULT_MEDIA_MARKER;
         }
         common_chat_msg msg;
         msg.role = "user";
@@ -341,7 +341,7 @@ int main(int argc, char ** argv) {
                 std::string image = line.substr(7);
                 if (ctx.load_image(image)) {
                     LOG("Image %s loaded\n", image.c_str());
-                    content += "<__image__>";
+                    content += MTMD_DEFAULT_MEDIA_MARKER;
                 }
                 // else, error is already printed by libmtmd
                 continue;

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -86,6 +86,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.n_threads = 4;
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
+    params.media_marker = MTMD_DEFAULT_MEDIA_MARKER;
     return params;
 }
 
@@ -96,7 +97,7 @@ struct mtmd_context {
 
     bool print_timings;
     int n_threads;
-    std::string image_marker;
+    std::string media_marker;
     bool has_vision;
     bool has_audio;
 
@@ -127,8 +128,12 @@ struct mtmd_context {
         text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
-        image_marker (ctx_params.image_marker)
+        media_marker (ctx_params.media_marker)
     {
+        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
+            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
+        }
+
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -269,48 +274,51 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text->text);
-    std::string marker_modified(ctx->image_marker);
+    std::string marker_modified(ctx->media_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
 
+    // for compatibility, we convert image marker to media marker
+    string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+
     // a bit hacky here, but works for now
     // for some models, we need to add prefix and suffix to the image embeddings
     if (clip_is_gemma3(ctx->ctx_clip)) {
         // gemma 3
         // <start_of_image> ... (image embeddings) ... <end_of_image>
-        marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
-        marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
-        marker_modified = ctx->image_marker + "[IMG_END]";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = ctx->media_marker + "[IMG_END]";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
-        marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
         // (more details in mtmd_context constructor)
-        marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
         // <img> ... (image embeddings) ... </img>
-        marker_modified = "<img>" + ctx->image_marker + "</img>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<img>" + ctx->media_marker + "</img>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     }
 
     // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
     // for glm-edge, BOI and EOI token's embeddings are not present in the text model
 
-    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
     output->entries.clear();
     output->entries.reserve(parts.size());
 
@@ -820,6 +828,15 @@ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
     }
 }
 
+const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return chunk->tokens_image->id.c_str();
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        return chunk->tokens_audio->id.c_str();
+    }
+    return nullptr;
+}
+
 mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
     mtmd_input_chunk * copy = new mtmd_input_chunk{
         chunk->type,

diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
@@ -39,6 +39,9 @@
 #    define MTMD_API
 #endif
 
+#define MTMD_DEFAULT_MEDIA_MARKER "<__media__>"
+
+// deprecated marker, use MTMD_DEFAULT_MEDIA_MARKER instead
 #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
 
 #ifdef __cplusplus
@@ -80,7 +83,8 @@ struct mtmd_context_params {
     bool print_timings;
     int n_threads;
     enum ggml_log_level verbosity;
-    const char * image_marker;
+    const char * image_marker; // deprecated, use media_marker instead
+    const char * media_marker;
 };
 
 MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
@@ -112,6 +116,7 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 //     the data is in RGBRGBRGB... format
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
+//     the data is in float format (PCM F32)
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
 MTMD_API uint32_t              mtmd_bitmap_get_nx  (const mtmd_bitmap * bitmap);
@@ -141,8 +146,11 @@ MTMD_API void                     mtmd_input_chunks_free(mtmd_input_chunks * chu
 MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
 MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
 MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
-MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);
 MTMD_API size_t                     mtmd_input_chunk_get_n_tokens    (const mtmd_input_chunk * chunk);
+// returns nullptr for ID on text chunk
+MTMD_API const char *               mtmd_input_chunk_get_id          (const mtmd_input_chunk * chunk);
+// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);
 
 // in case you want to use custom logic to handle the chunk (i.e. KV cache management)
 // you can move the chunk ownership to your own code by copying it
@@ -162,20 +170,21 @@ MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * i
 // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
 MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 
-// tokenize an input text prompt and an image
-// the prompt must have the input image marker (default: "<__image__>") in it
-// the marker will be replaced with the image tokens
+// tokenize an input text prompt and a list of bitmaps (images/audio)
+// the prompt must have the input image marker (default: "<__media__>") in it
+// the default marker is defined by MTMD_DEFAULT_MEDIA_MARKER
+// the marker will be replaced with the image/audio chunk
 // for example:
-//   "here is an image: <__image__>\ndescribe it in detail."
+//   "here is an image: <__media__>\ndescribe it in detail."
 //   this will gives 3 chunks:
 //   1. "here is an image: <start_of_image>"
-//   2. (image tokens)
+//   2. (image/audio tokens)
 //   3. "<end_of_image>\ndescribe it in detail."
-// number of bitmaps must be equal to the number of image markers in the prompt
+// number of bitmaps must be equal to the number of markers in the prompt
 // this function is thread-safe (shared ctx)
 // return values:
 //   0 on success
-//   1 on number of images not matching the number of markers
+//   1 on number of bitmaps not matching the number of markers
 //   2 on image preprocessing error
 MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                                mtmd_input_chunks * output,

@@ -710,7 +710,7 @@ static json oaicompat_completion_params_parse(
 
                 // replace this chunk with a marker
                 p["type"] = "text";
-                p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                p["text"] = MTMD_DEFAULT_MEDIA_MARKER;
                 p.erase("image_url");
             }
         }