kv-cache : refactor + add llama_memory_state_i #13746
Merged
Commits (14):
- 773b6e3 kv-cache : simplify the "struct llama_kv_cache" interface (ggerganov)
- 9fc50dc kv-cache : revert the (n_swa + n_ubatch) change (for next PR) (ggerganov)
- c2c3591 kv-cache : some comments (ggerganov)
- 8856782 context : fix graph reserve for multiple sequences (ggerganov)
- bffb9d4 kv-cache : fix typo [no ci] (ggerganov)
- 32cc9ea kv-cache : fix find_slot() logic for free slots (ggerganov)
- f97de9b llama : add TODO for deprecating the defrag API in the future (ggerganov)
- 7764d91 kv-cache : improve find_slot() using min/max seq pos info (ggerganov)
- 780bba9 llama : handle aborts and compute errors (ggerganov)
- dbcfa5f memory : extract state into llama_memory_state (ggerganov)
- f2ded9d kv-cache : add comments (ggerganov)
- e230e51 server : update batching logic to reset n_batch on successful decode (ggerganov)
- 3cf5186 server : upon full re-processing, remove the sequence from the cache (ggerganov)
- 71619f2 kv-cache : add TODO for doing split_equal when split_simple fails (ggerganov)
Viewing commit 7764d91497d853f2c6c255e3ae0daa39e94ab2df: kv-cache : improve find_slot() using min/max seq pos info (ggml-ci)
In local testing, trying to use split_equal with the unified portion of the hybrid cache, this line is causing problems. Specifically, I think it conflicts with the logic in llama_sbatch::add_seq_to_ubatch (here), where ubatch.seq_id is only populated with a non-null value at position ubatch.n_seqs. I'll keep digging to see if there's a simple solution, but wanted to flag this in case it's an easy fix on your end.
This is probably not the right solution, but this "fixed" the issue on my Granite 4 branch.
The intended way to traverse ubatch.seq_id since the splits were introduced in #8526 is by using ubatch.n_seqs, not ubatch.n_tokens. In simple splits, ubatch.n_seqs is equal to ubatch.n_tokens. Fixing this loop (and also the one in apply_ubatch) should make it work properly with equal splits too.

llama.cpp/src/llama-batch.cpp, line 141 in a592c13

See also the comments explaining the sizes of the arrays in ubatch:

llama.cpp/src/llama-batch.h, lines 10 to 24 in a592c13
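A minimal sketch of the traversal rule described above, assuming the array shapes documented in llama-batch.h (seq_id and n_seq_id sized per sequence, not per token). The ubatch_sketch struct and count_seq_ids helper are hypothetical stand-ins for illustration, not the real llama_ubatch:

```cpp
#include <cstdint>
#include <vector>

using llama_seq_id = int32_t;

// Hypothetical stand-in for llama_ubatch, keeping only the fields relevant
// to the traversal discussed here.
struct ubatch_sketch {
    uint32_t n_tokens;     // total tokens in the ubatch
    uint32_t n_seq_tokens; // tokens per sequence
    uint32_t n_seqs;       // number of sequences (== n_tokens for simple splits)
    std::vector<int32_t> n_seq_id;                 // [n_seqs]
    std::vector<std::vector<llama_seq_id>> seq_id; // [n_seqs][n_seq_id[s]]
};

// Iterate sequences with n_seqs, not n_tokens: with equal splits the two
// differ, and indexing seq_id by token would read past the populated entries.
int count_seq_ids(const ubatch_sketch & ub) {
    int count = 0;
    for (uint32_t s = 0; s < ub.n_seqs; ++s) {
        for (int32_t i = 0; i < ub.n_seq_id[s]; ++i) {
            (void) ub.seq_id[s][i]; // real code would use this sequence id
            ++count;
        }
    }
    return count;
}
```

With a simple split, n_seqs equals n_tokens and the outer loop visits one entry per token, which is why the token-indexed loop only misbehaves once split_equal is used.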
Correction: using ubatch.n_seqs for traversal only applies to ubatch.n_seq_id and ubatch.seq_id (in case anyone here relies on the notifications and missed the edit in my previous comment).
Ok, that's really helpful. Since this loop is indexing into both pos and seq_id, which in this case have different lengths, I'm not quite following the relationship that should be used to extract the seq_id for a given pos element. I think if ubatch.n_seqs > 1 here, that would automatically disqualify all of the other logic around reusing full cells?
Usually, this is relatively simple, since the number of tokens per sequence is known in ubatch.n_seq_tokens (for simple splits, this is always 1). In fact, here it could probably be possible to use …, although there is another approach without divisions, but with nested loops, which would change the indexing from ubatch.pos[i] to ubatch.pos[s * ubatch.n_seq_tokens + j], where s is in [0, ubatch.n_seqs) and j is in [0, ubatch.n_seq_tokens).
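The division-free variant described above can be sketched as follows. ubatch_pos_sketch and seq_positions are hypothetical names for illustration, assuming pos is laid out as contiguous per-sequence blocks for equal splits:

```cpp
#include <cstdint>
#include <vector>

using llama_pos = int32_t;

// Hypothetical stand-in carrying only the fields needed for the indexing.
struct ubatch_pos_sketch {
    uint32_t n_seq_tokens;      // tokens per sequence
    uint32_t n_seqs;            // number of sequences
    std::vector<llama_pos> pos; // [n_seqs * n_seq_tokens], per-sequence blocks
};

// Token j of sequence s lives at pos[s * n_seq_tokens + j], so nested loops
// over s and j reach every position without any division.
std::vector<llama_pos> seq_positions(const ubatch_pos_sketch & ub, uint32_t s) {
    std::vector<llama_pos> out;
    for (uint32_t j = 0; j < ub.n_seq_tokens; ++j) {
        out.push_back(ub.pos[s * ub.n_seq_tokens + j]);
    }
    return out;
}
```

The nested-loop form also gives the seq_id for free: while iterating tokens of sequence s, the matching entry is simply ubatch.seq_id[s].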
I can confirm that with this fix, the model safely produces output on my Granite 4 branch (no comment on cache correctness though!)
I had completely forgotten about this detail. We'll fix this, but I think the initial merge will be as it is, to keep this PR a bit simpler.