remove dangling image · ngxson/llama.cpp@eeda075 · GitHub
[go: up one dir, main page]

Skip to content

Commit eeda075

Browse files
committed
remove dangling image
1 parent aaebc33 commit eeda075

File tree

2 files changed

+32
-20
lines changed

2 files changed

+32
-20
lines changed

tools/server/tests/unit/test_vision_api.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,19 @@ def create_server():
2020

2121

2222
@pytest.mark.parametrize(
23-
"image_url, success, re_content",
23+
"prompt, image_url, success, re_content",
2424
[
2525
# test model is trained on CIFAR-10, but it's quite dumb due to small size
26-
(IMG_URL_0, True, "(cat)+"),
27-
("IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log
28-
(IMG_URL_1, True, "(frog)+"),
29-
("malformed", False, None),
30-
("https://google.com/404", False, None), # non-existent image
31-
("https://ggml.ai", False, None), # non-image data
26+
("What is this:\n", IMG_URL_0, True, "(cat)+"),
27+
("What is this:\n", "IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log
28+
("What is this:\n", IMG_URL_1, True, "(frog)+"),
29+
("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache
30+
("What is this:\n", "malformed", False, None),
31+
("What is this:\n", "https://google.com/404", False, None), # non-existent image
32+
("What is this:\n", "https://ggml.ai", False, None), # non-image data
3233
]
3334
)
34-
def test_vision_chat_completion(image_url, success, re_content):
35+
def test_vision_chat_completion(prompt, image_url, success, re_content):
3536
global server
3637
server.start(timeout_seconds=60) # vision model may take longer to load due to download size
3738
if image_url == "IMG_BASE64_0":
@@ -41,7 +42,7 @@ def test_vision_chat_completion(image_url, success, re_content):
4142
"top_k": 1,
4243
"messages": [
4344
{"role": "user", "content": [
44-
{"type": "text", "text": "What is this:\n"},
45+
{"type": "text", "text": prompt},
4546
{"type": "image_url", "image_url": {
4647
"url": image_url,
4748
}},

tools/server/utils.hpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,17 +1155,28 @@ struct server_tokens {
11551155

11561156
void resize(size_t n) {
11571157
GGML_ASSERT(n <= tokens.size());
1158-
// we throw an error if we try to remove a token in the middle of an image
1159-
// for ex. with input of 5 text tokens and 2 images:
1160-
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
1161-
// n 1 2 3 4 5 6 7 8 9 10
1162-
// allowed to resize ^ ^
1163-
// disallowed to resize ^ ^ ^
1164-
if (n > 0) {
1165-
llama_token last_token = tokens[n - 1];
1166-
// make sure we never remove tokens in the middle of an image
1167-
if (last_token == LLAMA_TOKEN_NULL) {
1168-
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
1158+
if (has_mtmd) {
1159+
// we throw an error if we try to remove a token in the middle of an image
1160+
// for ex. with input of 5 text tokens and 2 images:
1161+
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
1162+
// n 1 2 3 4 5 6 7 8 9 10
1163+
// allowed to resize ^ ^
1164+
// disallowed to resize ^ ^ ^
1165+
if (n > 0) {
1166+
llama_token last_token = tokens[n - 1];
1167+
// make sure we never remove tokens in the middle of an image
1168+
if (last_token == LLAMA_TOKEN_NULL) {
1169+
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
1170+
}
1171+
}
1172+
// remove all image chunks that are not used anymore
1173+
for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
1174+
llama_pos pos = it->first;
1175+
if (pos >= (llama_pos)n) {
1176+
it = map_pos_to_image.erase(it);
1177+
} else {
1178+
++it;
1179+
}
11691180
}
11701181
}
11711182
tokens.resize(n);

0 commit comments

Comments (0)