Commit 852c7c5
1 parent 1885859 commit 852c7c5
File tree
15 files changed
+1342
-74
lines changed
- common
- batched-bench
- batched
- parallel
- perplexity
- server
gguf-py/gguf
15 files changed
+1342
-74
lines changed
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
10 | 10 |
| |
11 | 11 |
| |
12 | 12 |
| |
| 13 | + | |
13 | 14 |
| |
14 | 15 |
| |
15 | 16 |
| |
| |||
110 | 111 |
| |
111 | 112 |
| |
112 | 113 |
| |
| 114 | + | |
113 | 115 |
| |
114 | 116 |
| |
115 | 117 |
| |
|
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
1288 | 1288 |
| |
1289 | 1289 |
| |
1290 | 1290 |
| |
| 1291 | + | |
1291 | 1292 |
| |
1292 | 1293 |
| |
1293 | 1294 |
| |
|
Lines changed: 118 additions & 0 deletions
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
1847 | 1847 |
| |
1848 | 1848 |
| |
1849 | 1849 |
| |
| 1850 | + | |
| 1851 | + | |
| 1852 | + | |
| 1853 | + | |
| 1854 | + | |
| 1855 | + | |
| 1856 | + | |
| 1857 | + | |
| 1858 | + | |
| 1859 | + | |
| 1860 | + | |
| 1861 | + | |
| 1862 | + | |
| 1863 | + | |
| 1864 | + | |
| 1865 | + | |
| 1866 | + | |
| 1867 | + | |
| 1868 | + | |
| 1869 | + | |
| 1870 | + | |
| 1871 | + | |
| 1872 | + | |
| 1873 | + | |
| 1874 | + | |
| 1875 | + | |
| 1876 | + | |
| 1877 | + | |
| 1878 | + | |
| 1879 | + | |
| 1880 | + | |
| 1881 | + | |
| 1882 | + | |
| 1883 | + | |
| 1884 | + | |
| 1885 | + | |
| 1886 | + | |
| 1887 | + | |
| 1888 | + | |
| 1889 | + | |
| 1890 | + | |
| 1891 | + | |
| 1892 | + | |
| 1893 | + | |
| 1894 | + | |
| 1895 | + | |
| 1896 | + | |
| 1897 | + | |
| 1898 | + | |
| 1899 | + | |
| 1900 | + | |
| 1901 | + | |
| 1902 | + | |
| 1903 | + | |
| 1904 | + | |
| 1905 | + | |
| 1906 | + | |
| 1907 | + | |
| 1908 | + | |
| 1909 | + | |
| 1910 | + | |
| 1911 | + | |
| 1912 | + | |
| 1913 | + | |
| 1914 | + | |
| 1915 | + | |
| 1916 | + | |
| 1917 | + | |
| 1918 | + | |
| 1919 | + | |
| 1920 | + | |
| 1921 | + | |
| 1922 | + | |
| 1923 | + | |
| 1924 | + | |
| 1925 | + | |
| 1926 | + | |
| 1927 | + | |
| 1928 | + | |
| 1929 | + | |
| 1930 | + | |
| 1931 | + | |
| 1932 | + | |
| 1933 | + | |
| 1934 | + | |
| 1935 | + | |
| 1936 | + | |
| 1937 | + | |
| 1938 | + | |
| 1939 | + | |
| 1940 | + | |
| 1941 | + | |
| 1942 | + | |
| 1943 | + | |
| 1944 | + | |
| 1945 | + | |
| 1946 | + | |
| 1947 | + | |
F987
| 1948 | + | |
| 1949 | + | |
| 1950 | + | |
| 1951 | + | |
| 1952 | + | |
| 1953 | + | |
| 1954 | + | |
| 1955 | + | |
| 1956 | + | |
| 1957 | + | |
| 1958 | + | |
| 1959 | + | |
| 1960 | + | |
| 1961 | + | |
| 1962 | + | |
| 1963 | + | |
| 1964 | + | |
| 1965 | + | |
| 1966 | + | |
| 1967 | + | |
1850 | 1968 |
| |
1851 | 1969 |
| |
1852 | 1970 |
| |
|
Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
105 | 105 |
| |
106 | 106 |
| |
107 | 107 |
| |
| 108 | + | |
| 109 | + | |
| 110 | + | |
108 | 111 |
| |
109 | 112 |
| |
110 | 113 |
| |
| |||
174 | 177 |
| |
175 | 178 |
| |
176 | 179 |
| |
177 |
| - | |
178 |
| - | |
179 |
| - | |
180 |
| - | |
| 180 | + | |
| 181 | + | |
| 182 | + | |
| 183 | + | |
181 | 184 |
| |
182 | 185 |
| |
183 | 186 |
| |
| |||
192 | 195 |
| |
193 | 196 |
| |
194 | 197 |
| |
195 |
| - | |
| 198 | + | |
196 | 199 |
| |
197 | 200 |
| |
198 | 201 |
| |
|
Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
80 | 80 |
| |
81 | 81 |
| |
82 | 82 |
| |
| 83 | + | |
83 | 84 |
| |
84 | 85 |
| |
85 | 86 |
| |
| |||
132 | 133 |
| |
133 | 134 |
| |
134 | 135 |
| |
135 |
| - | |
| 136 | + | |
136 | 137 |
| |
137 | 138 |
| |
138 | 139 |
| |
|
Lines changed: 13 additions & 7 deletions
@@ -107,6 +107,9 @@ int main(int argc, char ** argv) {
107
107
// number of simultaneous "clients" to simulate
108
108
const int32_t n_clients = params.n_parallel;
109
109
110
+ // dedicate one sequence to the system prompt
111
+ params.n_parallel += 1;
112
+
110
113
// requests to simulate
111
114
const int32_t n_seq = params.n_sequences;
112
115
@@ -196,8 +199,8 @@ int main(int argc, char ** argv) {
196
199
}
197
200
198
201
// assign the system KV cache to all parallel sequences
199
- for (int32_t i = 1; i < n_clients; ++i) {
200
- llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
202
+ for (int32_t i = 1; i <= n_clients; ++i) {
203
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
201
204
}
202
205
203
206
LOG_TEE("\n");
@@ -221,15 +224,17 @@ int main(int argc, char ** argv) {
221
224
222
225
client.i_batch = batch.n_tokens;
223
226
224
- llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
227
+ llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
225
228
226
229
client.n_decoded += 1;
227
230
}
228
231
229
232
if (batch.n_tokens == 0) {
230
233
// all sequences have ended - clear the entire KV cache
231
- for (int i = 0; i < n_clients; ++i) {
232
- llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
234
+ for (int i = 1; i <= n_clients; ++i) {
235
+ llama_kv_cache_seq_rm(ctx, i, -1, -1);
236
+ // but keep the system prompt
237
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
233
238
}
234
239
235
240
LOG_TEE("%s: clearing the KV cache\n", __func__);
@@ -255,7 +260,7 @@ int main(int argc, char ** argv) {
255
260
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
256
261
257
262
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
258
- llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
263
+ llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
259
264
}
260
265
261
266
// extract the logits only for the last token
@@ -366,7 +371,8 @@ int main(int argc, char ** argv) {
366
371
}
367
372
368
373
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
369
- llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
374
+ llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
375
+ llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
370
376
371
377
const auto t_main_end = ggml_time_us();
372
378
Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change | |
---|---|---|---|
| |||
809 | 809 |
| |
810 | 810 |
| |
811 | 811 |
| |
812 |
| - | |
| 812 | + | |
813 | 813 |
| |
814 | 814 |
| |
815 | 815 |
| |
| |||
1086 | 1086 |
| |
1087 | 1087 |
| |
1088 | 1088 |
| |
1089 |
| - | |
| 1089 | + | |
1090 | 1090 |
| |
1091 | 1091 |
| |
1092 | 1092 |
| |
| |||
1438 | 1438 |
| |
1439 | 1439 |
| |
1440 | 1440 |
| |
1441 |
| - | |
| 1441 | + | |
1442 | 1442 |
| |
1443 | 1443 |
| |
1444 | 1444 |
| |
| |||
1815 | 1815 |
| |
1816 | 1816 |
| |
1817 | 1817 |
| |
| 1818 | + | |
| 1819 | + | |
| 1820 | + | |
1818 | 1821 |
| |
1819 | 1822 |
| |
1820 | 1823 |
| |
|
0 commit comments