8000 llava : introduce libmtmd (#12849) · colout/llama.cpp@c98da8e · GitHub
[go: up one dir, main page]

Skip to content

Commit c98da8e

Browse files
ngxson, ggerganov
authored and committed
llava : introduce libmtmd (ggml-org#12849)
* wip llava2
* migrated gemma3 to llava2
* add timings
* correct pre/postfix
* fix missing include
* fix compilation unused var warn
* update llava2_tokenize
* change name llava2 --> mtmd
* improve api
* refine helpers
* Update examples/llava/mtmd.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 8476c6d commit c98da8e

File tree

7 files changed

+687
-128
lines changed

7 files changed

+687
-128
lines changed

examples/llava/CMakeLists.txt

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# llava (legacy)
2+
13
add_library(llava OBJECT
24
llava.cpp
35
llava.h
@@ -22,12 +24,41 @@ if (BUILD_SHARED_LIBS)
2224
install(TARGETS llava_shared LIBRARY)
2325
endif()
2426

27+
# mtmd
28+
29+
# Object library holding the compiled mtmd + clip sources; the mtmd_static and
# mtmd_shared targets below are both assembled from these same objects.
add_library(mtmd OBJECT
30+
mtmd.cpp
31+
mtmd.h
32+
clip.cpp
33+
clip.h
34+
clip-impl.h
35+
)
36+
37+
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
38+
39+
# this directory is PUBLIC so that targets linking mtmd inherit it as an include path
target_include_directories(mtmd PUBLIC .)
40+
target_include_directories(mtmd PRIVATE ../..)
41+
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
42+
43+
target_compile_features(mtmd PRIVATE cxx_std_17)
44+
45+
# static library built from the mtmd objects (always created)
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
46+
# shared library built from the same objects, only when BUILD_SHARED_LIBS is set;
# the objects are compiled as position independent code in that case
if (BUILD_SHARED_LIBS)
47+
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
48+
# NOTE(review): LLAMA_SHARED / LLAMA_BUILD presumably select the symbol export macros - confirm
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
49+
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
50+
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
51+
install(TARGETS mtmd_shared LIBRARY)
52+
endif()
53+
2554
if (NOT MSVC)
2655
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
56+
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
2757
endif()
2858

2959
if(TARGET BUILD_INFO)
3060
add_dependencies(llava BUILD_INFO)
61+
add_dependencies(mtmd BUILD_INFO)
3162
endif()
3263

3364
set(TARGET llama-llava-cli)
@@ -55,7 +86,7 @@ set(TARGET llama-gemma3-cli)
5586
add_executable(${TARGET} gemma3-cli.cpp)
5687
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
5788
install(TARGETS ${TARGET} RUNTIME)
58-
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
89+
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
5990
target_compile_features(${TARGET} PRIVATE cxx_std_17)
6091

6192
set(TARGET llama-llava-clip-quantize-cli)

examples/llava/clip-impl.h

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
#include "ggml.h"
22
#include "gguf.h"
33

4+
#include "clip.h"
5+
46
#include <climits>
57
#include <cstdarg>
68
#include <string>
79
#include <map>
810
#include <sstream>
911
#include <vector>
12+
#include <memory>
1013

1114
// Internal header for clip.cpp
1215

@@ -120,6 +123,23 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
120123
return PROJECTOR_TYPE_UNKNOWN;
121124
}
122125

126+
// RGB uint8 image
127+
struct clip_image_u8 {
128+
int nx;
129+
int ny;
130+
131+
std::vector<uint8_t> buf;
132+
};
133+
134+
// RGB float32 image (NHWC)
135+
// Memory layout: RGBRGBRGB...
136+
struct clip_image_f32 {
137+
int nx;
138+
int ny;
139+
140+
std::vector<float> buf;
141+
};
142+
123143
//
124144
// logging
125145
//
@@ -178,6 +198,28 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
178198
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
179199
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
180200

201+
//
202+
// cpp wrappers
203+
//
204+
205+
struct clip_image_u8_deleter {
206+
void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
207+
};
208+
209+
struct clip_image_f32_deleter {
210+
void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
211+
};
212+
213+
struct clip_image_f32_batch_deleter {
214+
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
215+
};
216+
217+
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
218+
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
219+
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
220+
221+
// TODO @ngxson : we're currently having a naming clash between struct clip_image_size and function clip_image_size()
222+
181223
//
182224
// common utils
183225
//
@@ -214,6 +256,20 @@ static void string_replace_all(std::string & s, const std::string & search, cons
214256
s = std::move(builder);
215257
}
216258

259+
// split string by a `std::string delim` instead of `char delim`
//
// Returns the pieces of `s` in order. Empty pieces are kept, so the result
// always has (number of delimiter occurrences + 1) elements; an input that
// does not contain the delimiter yields a single element equal to `s`.
//
// An empty `delimiter` never matches: `s` is returned as the only element.
// (std::string::find("") succeeds at every position, so without this guard
// the loop below would never terminate.)
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
    std::vector<std::string> tokens;
    if (delimiter.empty()) {
        tokens.push_back(std::move(s));
        return tokens;
    }

    size_t start = 0;
    size_t pos   = 0;
    // search forward from `start` rather than erasing the consumed prefix of
    // `s` on every match, which would copy the remaining tail each time
    while ((pos = s.find(delimiter, start)) != std::string::npos) {
        tokens.push_back(s.substr(start, pos - start));
        start = pos + delimiter.length();
    }
    // whatever follows the last delimiter (possibly empty) is the final token
    tokens.push_back(s.substr(start));
    return tokens;
}
272+
217273
//
218274
// gguf utils
219275
//
@@ -271,3 +327,9 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
271327
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
272328
}
273329
}
330+
331+
//
332+
// API used internally with mtmd
333+
//
334+
335+
projector_type clip_get_projector_type(const struct clip_ctx * ctx);

examples/llava/clip.cpp

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,23 +32,6 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
3232

3333
//#define CLIP_DEBUG_FUNCTIONS
3434

35-
// RGB uint8 image
36-
struct clip_image_u8 {
37-
int nx;
38-
int ny;
39-
40-
std::vector<uint8_t> buf;
41-
};
42-
43-
// RGB float32 image (NHWC)
44-
// Memory layout: RGBRGBRGB...
45-
struct clip_image_f32 {
46-
int nx;
47-
int ny;
48-
49-
std::vector<float> buf;
50-
};
51-
5235
#ifdef CLIP_DEBUG_FUNCTIONS
5336
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
5437
std::ofstream file(filename, std::ios::binary);
@@ -1614,6 +1597,12 @@ struct clip_image_f32 * clip_image_f32_init() {
16141597
return new clip_image_f32();
16151598
}
16161599

1600+
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
1601+
if (nx) *nx = img->nx;
1602+
if (ny) *ny = img->ny;
1603+
return img->buf.data();
1604+
}
1605+
16171606
void clip_image_size_free(struct clip_image_size * load_image_size) {
16181607
if (load_image_size == nullptr) {
16191608
return;
@@ -2346,6 +2335,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
23462335
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
23472336
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
23482337
n_patches = x_patch * y_patch;
2338+
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2339+
n_patches = 256;
23492340
}
23502341

23512342
return n_patches;
@@ -2893,3 +2884,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
28932884
clip_image_encode(ctx, n_threads, &clip_img, vec);
28942885
return true;
28952886
}
2887+
2888+
//
2889+
// API used internally with mtmd
2890+
//
2891+
2892+
// Returns the proj_type field of the given CLIP context.
// ctx is dereferenced without a null check, so it must not be null.
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
2893+
return ctx->proj_type;
2894+
}

examples/llava/clip.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ CLIP_API struct clip_image_size * clip_image_size_init();
7777
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
7878
CLIP_API struct clip_image_f32 * clip_image_f32_init();
7979

80+
// nx, ny are the output image dimensions
81+
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
82+
8083
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
8184
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
8285
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);

0 commit comments

Comments
 (0)
0