diff --git a/common/chat.cpp b/common/chat.cpp
index f138c7bcafcfa..8af41a8d454a6 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -26,13 +26,13 @@ struct common_chat_templates {
struct templates_params {
json messages;
json tools;
- common_chat_tool_choice tool_choice;
json json_schema;
+ common_chat_tool_choice tool_choice;
bool parallel_tool_calls;
bool stream;
- std::string grammar;
bool add_generation_prompt = true;
bool extract_reasoning = true;
+ std::string grammar;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
@@ -815,7 +815,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
}
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
});
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+ data.grammar_triggers.push_back({"[TOOL_CALLS]", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.preserved_tokens = {
"[TOOL_CALLS]",
};
@@ -862,8 +862,8 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
});
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
"<|START_ACTION|>",
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
});
data.preserved_tokens = {
"<|START_ACTION|>",
@@ -1004,11 +1004,11 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
});
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
});
if (!builtin_tools.empty()) {
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+ data.grammar_triggers.push_back({"<|python_tag|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.preserved_tokens.push_back("<|python_tag|>");
}
// Allow a few empty lines on top of the usual constrained json schema space rule.
@@ -1085,10 +1085,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
"\"<|tool▁calls▁end|>\""
" space");
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"});
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"});
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"});
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"});
+ data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+ data.grammar_triggers.push_back({"<|tool_calls_begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+ data.grammar_triggers.push_back({"<|tool calls begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+ data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.preserved_tokens = {
"<think>",
"</think>",
@@ -1196,7 +1196,7 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
}
builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
});
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["});
+ data.grammar_triggers.push_back({" functools[", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.preserved_tokens = {
" functools[",
};
@@ -1230,20 +1230,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule));
subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
regex_escape(name + "\n"),
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
});
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
regex_escape("assistant<|end_header_id|>\n" + name + "\n"),
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
});
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
regex_escape(">>>" + name + "\n"),
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
});
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
">>>assistant<|end_header_id|>\n" + name,
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
});
});
data.preserved_tokens = {
@@ -1339,12 +1339,12 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
});
if (has_raw_python) {
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+ data.grammar_triggers.push_back({"<|python_tag|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.preserved_tokens.push_back("<|python_tag|>");
}
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
+ data.grammar_triggers.push_back({"<function=", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
"<function=" + name + ">",
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
});
auto escaped_name = regex_escape(name);
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
});
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"});
+ data.grammar_triggers.push_back({"<tool_call>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
"(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
});
data.preserved_tokens = {
"",
diff --git a/common/chat.h b/common/chat.h
index d26a09c2f7c4f..01ce0a4741e94 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -68,18 +68,18 @@ struct common_chat_templates_inputs {
bool add_generation_prompt = true;
bool use_jinja = true;
// Parameters below only supported when use_jinja is true
- std::vector<common_chat_tool> tools;
- common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
bool extract_reasoning = true;
+ common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+ std::vector<common_chat_tool> tools;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+ bool grammar_lazy = false;
std::string prompt;
std::string grammar;
- bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
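
A related, hypothetical caveat for these `common/chat.h` reorders: if any consumer were to fill these structs with C++20 designated initializers, the designators would also have to follow the new declaration order. The snippet below is illustrative only and is not taken from the repository:

```cpp
#include <string>

// Illustrative stand-in, not the real common_chat_params.
struct chat_params_like {
    int         format       = 0;
    bool        grammar_lazy = false; // moved ahead of the strings, as in this patch
    std::string prompt;
    std::string grammar;
};

chat_params_like make_params() {
    // Valid in C++20 because the designators appear in declaration order;
    // writing .grammar_lazy after .grammar would be ill-formed under the new layout.
    return { .format = 0, .grammar_lazy = true, .prompt = "...", .grammar = "root ::= ..." };
}
```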
diff --git a/common/common.h b/common/common.h
index 556ff5be40798..26c12997bd6b0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -51,8 +51,8 @@ struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
- enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
+ enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
@@ -119,8 +119,8 @@ enum common_grammar_trigger_type {
};
struct common_grammar_trigger {
- common_grammar_trigger_type type;
std::string value;
+ common_grammar_trigger_type type;
llama_token token = LLAMA_TOKEN_NULL;
};
@@ -156,6 +156,11 @@ struct common_params_sampling {
bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
+ bool grammar_lazy = false;
+ std::string grammar; // optional BNF-like grammar to constrain sampling
+ std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
+ std::set<llama_token> preserved_tokens;
+
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
@@ -171,11 +176,6 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TEMPERATURE,
};
- std::string grammar; // optional BNF-like grammar to constrain sampling
- bool grammar_lazy = false;
- std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
- std::set<llama_token> preserved_tokens;
-
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
@@ -240,14 +240,14 @@ struct common_params {
float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
@@ -283,11 +283,10 @@ struct common_params {
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
- std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
-
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
+
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
@@ -296,13 +295,15 @@ struct common_params {
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
- size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL divergence
@@ -337,23 +338,28 @@ struct common_params {
bool single_turn = false; // single turn chat conversation
- ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
- ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+ // options for multimodal models (see tools/mtmd)
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
+ bool no_mmproj = false; // explicitly disable multimodal model
- common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+ // options for embedding
+ bool embedding = false; // get only sentence embedding
+ bool reranking = false; // enable reranking support on server
// multimodal models (see tools/mtmd)
struct common_params_model mmproj;
- bool mmproj_use_gpu = true; // use GPU for multimodal model
- bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding
- bool embedding = false; // get only sentence embedding
- int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embeddings
- bool reranking = false; // enable reranking support on server
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
// server params
int32_t port = 8080; // server listens on this network port
@@ -362,19 +368,14 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
- std::vector<std::string> api_keys;
-
- std::string ssl_file_key = ""; // NOLINT
- std::string ssl_file_cert = ""; // NOLINT
-
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
@@ -383,22 +384,20 @@ struct common_params {
bool log_json = false;
- std::string slot_save_path;
+ std::vector<std::string> api_keys;
- float slot_prompt_similarity = 0.5f;
+ std::string ssl_file_key = ""; // NOLINT
+ std::string ssl_file_cert = ""; // NOLINT
- // batched-bench params
- bool is_pp_shared = false;
+ std::string slot_save_path;
- std::vector<int32_t> n_pp;
- std::vector<int32_t> n_tg;
- std::vector<int32_t> n_pl;
+ float slot_prompt_similarity = 0.5f;
// retrieval params
- std::vector<std::string> context_files; // context files to embed
-
int32_t chunk_size = 64; // chunk size for context embedding
+ std::vector<std::string> context_files; // context files to embed
+
std::string chunk_separator = "\n"; // chunk separator for context embedding
// passkey params
@@ -414,12 +413,19 @@ struct common_params {
bool compute_ppl = true; // whether to compute perplexity
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
+ // batched-bench params
+ bool is_pp_shared = false;
+
+ std::vector<int32_t> n_pp;
+ std::vector<int32_t> n_tg;
+ std::vector<int32_t> n_pl;
+
// cvector-generator params
int n_pca_batch = 100;
int n_pca_iterations = 1000;
- dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
bool spm_infill = false; // suffix/prefix/middle pattern for infill
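
Much of the `common_params` shuffle groups members of the same kind together (bools with bools, sizes with sizes, strings with strings). Beyond readability, one plausible motivation is padding: a struct's size depends on member order because each field must sit at its natural alignment. A self-contained illustration, not the actual `common_params` layout:

```cpp
#include <cstdint>

struct interleaved {
    bool    a;  // 1 byte + 7 bytes padding before b
    int64_t b;
    bool    c;  // 1 byte + 7 bytes padding before d
    int64_t d;
};              // typically 32 bytes on a 64-bit ABI

struct grouped {
    int64_t b;
    int64_t d;
    bool    a;
    bool    c;  // both bools share the final 8-byte slot
};              // typically 24 bytes on a 64-bit ABI

static_assert(sizeof(grouped) <= sizeof(interleaved), "grouping same-sized members does not increase the size");
```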
diff --git a/common/log.cpp b/common/log.cpp
index 52b31470c46bd..48111a047ece4 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -46,13 +46,13 @@ static std::vector g_col = {
};
struct common_log_entry {
- enum ggml_log_level level;
-
- bool prefix;
+ std::vector<char> msg;
int64_t timestamp;
- std::vector<char> msg;
+ enum ggml_log_level level;
+
+ bool prefix;
// signals the worker thread to stop
bool is_end;
diff --git a/include/llama.h b/include/llama.h
index 52cd7a5a037ef..c324486ea9f39 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -302,6 +302,12 @@ extern "C" {
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
int32_t main_gpu;
+ // Keep the booleans together to avoid misalignment during copy-by-value.
+ bool vocab_only; // only load the vocabulary, no weights
+ bool use_mmap; // use mmap if possible
+ bool use_mlock; // force system to keep model in RAM
+ bool check_tensors; // validate model tensor data
+
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
@@ -315,12 +321,6 @@ extern "C" {
// override key-value pairs of the model meta data
const struct llama_model_kv_override * kv_overrides;
-
- // Keep the booleans together to avoid misalignment during copy-by-value.
- bool vocab_only; // only load the vocabulary, no weights
- bool use_mmap; // use mmap if possible
- bool use_mlock; // force system to keep model in RAM
- bool check_tensors; // validate model tensor data
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
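
`llama_model_params` is part of the public C API and is passed by value, hence the comment about keeping the booleans together. Any binding that re-declares this struct (ctypes, FFI wrappers) has to mirror the new field order exactly or it will read garbage after copy-by-value. A hedged sketch of a compile-time guard such a consumer could add; the members checked are just an example of the new ordering, not an official invariant:

```cpp
#include <cstddef>

#include "llama.h"

// Holds with the patched layout (booleans moved ahead of tensor_split); against the old
// layout it fails, giving an FFI consumer an early, loud signal that offsets changed.
static_assert(offsetof(struct llama_model_params, vocab_only) < offsetof(struct llama_model_params, tensor_split),
              "llama_model_params layout differs from what this binding expects");
```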
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 85b4324b699e6..fdc66cf9ee269 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1246,7 +1246,6 @@ llm_graph_result_ptr llama_context::graph_build(
return model.build_graph(
{
/*.ctx =*/ ctx,
- /*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
@@ -1256,6 +1255,7 @@ llm_graph_result_ptr llama_context::graph_build(
/*.loras =*/ &loras,
/*.memory =*/ memory.get(),
/*.cross =*/ &cross,
+ /*.arch =*/ model.arch,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
}, gf, gtype);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 13e36d161c614..29b1c6f6c49c2 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -414,7 +414,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
//
llm_graph_context::llm_graph_context(const llm_graph_params & params) :
- arch (params.arch),
hparams (params.hparams),
cparams (params.cparams),
ubatch (params.ubatch),
@@ -430,6 +429,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
+ arch (params.arch),
freq_base (cparams.rope_freq_base),
freq_scale (cparams.rope_freq_scale),
ext_factor (cparams.yarn_ext_factor),
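
The initializer-list move in `llm_graph_context`'s constructor mirrors the member move in `llama-graph.h`: members are always initialized in their declaration order, regardless of how the constructor lists them, and compilers warn with `-Wreorder` when the two disagree. A minimal sketch with a simplified stand-in:

```cpp
// Simplified stand-in for llm_graph_context: arch is now declared after n_expert_used,
// so the constructor lists it after n_expert_used as well. Listing arch first would still
// initialize n_expert_used first and would trigger -Wreorder.
struct graph_ctx_like {
    int n_expert_used;
    int arch;

    graph_ctx_like(int a, int n)
        : n_expert_used(n),
          arch(a) {}
};
```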
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 2b85bb25befba..55af082d449e4 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -374,8 +374,6 @@ using llm_graph_cb = std::function
struct llm_graph_params {
ggml_context * ctx;

- const llm_arch arch;
-
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_ubatch & ubatch;
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ ... @@ struct llama_sampler_chain {
- llama_sampler_chain_params params;
-
std::vector<struct llama_sampler *> samplers;
- // timing
+ llama_sampler_chain_params params;
- mutable int64_t t_sample_us;
+ // timing
mutable int32_t n_sample;
+
+ mutable int64_t t_sample_us;
};
struct llama_sampler * llama_sampler_init_dry_testing(
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 087665e41411b..98a9a5430a6b2 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -393,7 +393,7 @@ struct server_task {
params.sampling.grammar_triggers.push_back(std::move(trigger));
} else {
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
- params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+ params.sampling.grammar_triggers.push_back({word, COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
}
} else {
params.sampling.grammar_triggers.push_back(std::move(ct.value));