diff --git a/common/chat.cpp b/common/chat.cpp
index f138c7bcafcfa..8af41a8d454a6 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -26,13 +26,13 @@ struct common_chat_templates {
 struct templates_params {
     json messages;
     json tools;
-    common_chat_tool_choice tool_choice;
     json json_schema;
+    common_chat_tool_choice tool_choice;
     bool parallel_tool_calls;
     bool stream;
-    std::string grammar;
     bool add_generation_prompt = true;
     bool extract_reasoning = true;
+    std::string grammar;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -815,7 +815,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
         }
         builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
     });
-    data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+    data.grammar_triggers.push_back({"[TOOL_CALLS]", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
@@ -862,8 +862,8 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
         builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
     });
     data.grammar_triggers.push_back({
-        COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
         "<|START_ACTION|>",
+        COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
     });
     data.preserved_tokens = {
         "<|START_ACTION|>",
@@ -1004,11 +1004,11 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
     data.grammar_triggers.push_back({
-        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
         "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
+        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
     });
     if (!builtin_tools.empty()) {
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+        data.grammar_triggers.push_back({"<|python_tag|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
         data.preserved_tokens.push_back("<|python_tag|>");
     }
     // Allow a few empty lines on top of the usual constrained json schema space rule.
@@ -1085,10 +1085,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
             "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
             "\"<|tool▁calls▁end|>\""
             " space");
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"});
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"});
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"});
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"});
+        data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+        data.grammar_triggers.push_back({"<|tool_calls_begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+        data.grammar_triggers.push_back({"<|tool calls begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+        data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
         data.preserved_tokens = {
             "<think>",
             "</think>",
@@ -1196,7 +1196,7 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
         }
         builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
" + builder.add_schema("tool_calls", schema)); }); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["}); + data.grammar_triggers.push_back({" functools[", COMMON_GRAMMAR_TRIGGER_TYPE_WORD}); data.preserved_tokens = { " functools[", }; @@ -1230,20 +1230,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule)); subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, regex_escape(name + "\n"), + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, regex_escape("assistant<|end_header_id|>\n" + name + "\n"), + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, regex_escape(">>>" + name + "\n"), + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ">>>assistant<|end_header_id|>\n" + name, + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, }); }); data.preserved_tokens = { @@ -1339,12 +1339,12 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con }); if (has_raw_python) { tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*")); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.grammar_triggers.push_back({"<|python_tag|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD}); data.preserved_tokens.push_back("<|python_tag|>"); } auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space"; builder.add_rule("root", inputs.parallel_tool_calls ? 
"(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "\" space")); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "", + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, }); auto escaped_name = regex_escape(name); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, ""}); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "", COMMON_GRAMMAR_TRIGGER_TYPE_WORD}); + data.grammar_triggers.push_back({"|||)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, }); data.preserved_tokens = { "", diff --git a/common/chat.h b/common/chat.h index d26a09c2f7c4f..01ce0a4741e94 100644 --- a/common/chat.h +++ b/common/chat.h @@ -68,18 +68,18 @@ struct common_chat_templates_inputs { bool add_generation_prompt = true; bool use_jinja = true; // Parameters below only supported when use_jinja is true - std::vector tools; - common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; bool extract_reasoning = true; + common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; + std::vector tools; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; struct common_chat_params { common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + bool grammar_lazy = false; std::string prompt; std::string grammar; - bool grammar_lazy = false; std::vector grammar_triggers; std::vector preserved_tokens; std::vector additional_stops; diff --git a/common/common.h b/common/common.h index 556ff5be40798..26c12997bd6b0 100644 --- a/common/common.h +++ b/common/common.h @@ -51,8 +51,8 @@ struct cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
     bool     mask_valid = false; // Default: any CPU
-    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     bool     strict_cpu = false; // Use strict CPU placement
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
 
@@ -119,8 +119,8 @@ enum common_grammar_trigger_type {
 };
 
 struct common_grammar_trigger {
-    common_grammar_trigger_type type;
     std::string value;
+    common_grammar_trigger_type type;
     llama_token token = LLAMA_TOKEN_NULL;
 };
 
@@ -156,6 +156,11 @@ struct common_params_sampling {
     bool no_perf = false; // disable performance metrics
     bool timing_per_token = false;
 
+    bool grammar_lazy = false;
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
+    std::set<llama_token> preserved_tokens;
+
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
@@ -171,11 +176,6 @@ struct common_params_sampling {
         COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-    bool grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
-    std::set<llama_token> preserved_tokens;
-
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
 
     // print the parameters into a string
@@ -240,14 +240,14 @@ struct common_params {
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -283,11 +283,10 @@ struct common_params {
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
-    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
-    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
+    int32_t verbosity = 0;
 
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
 
@@ -296,13 +295,15 @@ struct common_params {
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  // (which is more convenient to use for plotting)
     //
-    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
-    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+
+    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
     bool kl_divergence = false; // compute KL divergence
@@ -337,23 +338,28 @@ struct common_params {
     bool single_turn = false; // single turn chat conversation
 
-    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
-    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+    // options multimodal models (see tools/mtmd)
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
 
-    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+    // options embedding
+    bool embedding = false; // get only sentence embedding
+    bool reranking = false; // enable reranking support on server
 
     // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
-    bool mmproj_use_gpu = true; // use GPU for multimodal model
-    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
-    bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
+    // server params
     int32_t port = 8080; // server listens on this network port
 
@@ -362,19 +368,14 @@ struct common_params {
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
-    std::vector<std::string> api_keys;
-
-    std::string ssl_file_key = ""; // NOLINT
-    std::string ssl_file_cert = ""; // NOLINT
-
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
     bool endpoint_slots = false;
@@ -383,22 +384,20 @@ struct common_params {
     bool log_json = false;
 
-    std::string slot_save_path;
+    std::vector<std::string> api_keys;
 
-    float slot_prompt_similarity = 0.5f;
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
-    // batched-bench params
-    bool is_pp_shared = false;
+    std::string slot_save_path;
 
-    std::vector<int32_t> n_pp;
-    std::vector<int32_t> n_tg;
-    std::vector<int32_t> n_pl;
+    float slot_prompt_similarity = 0.5f;
 
     // retrieval params
-    std::vector<std::string> context_files; // context files to embed
-    int32_t chunk_size = 64; // chunk size for context embedding
+    std::vector<std::string> context_files; // context files to embed
+
     std::string chunk_separator = "\n"; // chunk separator for context embedding
 
     // passkey params
@@ -414,12 +413,19 @@ struct common_params {
     bool compute_ppl = true; // whether to compute perplexity
     bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
-    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
     std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
     std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
diff --git a/common/log.cpp b/common/log.cpp
index 52b31470c46bd..48111a047ece4 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -46,13 +46,13 @@ static std::vector<const char *> g_col = {
 };
 
 struct common_log_entry {
-    enum ggml_log_level level;
-
-    bool prefix;
+    std::vector<char> msg;
 
     int64_t timestamp;
 
-    std::vector<char> msg;
+    enum ggml_log_level level;
+
+    bool prefix;
 
     // signals the worker thread to stop
     bool is_end;
diff --git a/include/llama.h b/include/llama.h
index 52cd7a5a037ef..c324486ea9f39 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -302,6 +302,12 @@ extern "C" {
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
@@ -315,12 +321,6 @@ extern "C" {
 
         // override key-value pairs of the model meta data
        const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 85b4324b699e6..fdc66cf9ee269 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1246,7 +1246,6 @@ llm_graph_result_ptr llama_context::graph_build(
     return model.build_graph(
         {
             /*.ctx =*/ ctx,
-            /*.arch =*/ model.arch,
             /*.hparams =*/ model.hparams,
             /*.cparams =*/ cparams,
             /*.ubatch =*/ ubatch,
@@ -1256,6 +1255,7 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.loras =*/ &loras,
             /*.memory =*/ memory.get(),
             /*.cross =*/ &cross,
+            /*.arch =*/ model.arch,
             /*.n_outputs =*/ n_outputs,
             /*.cb =*/ graph_get_cb(),
         }, gf, gtype);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 13e36d161c614..29b1c6f6c49c2 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -414,7 +414,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 //
 
 llm_graph_context::llm_graph_context(const llm_graph_params & params) :
-    arch (params.arch),
     hparams (params.hparams),
     cparams (params.cparams),
     ubatch (params.ubatch),
@@ -430,6 +429,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_embd_v_gqa (hparams.n_embd_v_gqa()),
     n_expert (hparams.n_expert),
     n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
+    arch (params.arch),
     freq_base (cparams.rope_freq_base),
     freq_scale (cparams.rope_freq_scale),
     ext_factor (cparams.yarn_ext_factor),
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 2b85bb25befba..55af082d449e4 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -374,8 +374,6 @@ using llm_graph_cb = std::function samplers;
-    // timing
+    llama_sampler_chain_params params;
-    mutable int64_t t_sample_us;
+    // timing
 
     mutable int32_t n_sample;
+
+    mutable int64_t t_sample_us;
 };
 
 struct llama_sampler * llama_sampler_init_dry_testing(
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 087665e41411b..98a9a5430a6b2 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -393,7 +393,7 @@ struct server_task {
                     params.sampling.grammar_triggers.push_back(std::move(trigger));
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
-                    params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+                    params.sampling.grammar_triggers.push_back({word, COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
                 }
             } else {
                 params.sampling.grammar_triggers.push_back(std::move(ct.value));
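Note on the mechanical call-site changes above (reviewer note, not part of the patch): common_grammar_trigger now declares value before type, and llm_graph_params now declares arch after cross, so every brace-initialized trigger and the /*.field =*/ initializer list in llama_context::graph_build have to be reordered to match, because C++ aggregate initialization assigns elements strictly in declaration order. A minimal, self-contained sketch of the same pitfall, using simplified stand-in names rather than the real llama.cpp types:

// sketch.cpp - simplified stand-ins for common_grammar_trigger; illustrative only
#include <string>
#include <vector>

enum trigger_type { TRIGGER_WORD, TRIGGER_PATTERN };

struct trigger {
    std::string  value;      // declared first after the reorder
    trigger_type type;       // declared second after the reorder
    int          token = -1; // trailing member keeps its default when omitted
};

int main() {
    std::vector<trigger> triggers;
    // Aggregate initialization fills members in declaration order, so the
    // string must now come first; the old {TRIGGER_WORD, "[TOOL_CALLS]"}
    // order would no longer compile against this layout.
    triggers.push_back({"[TOOL_CALLS]", TRIGGER_WORD});
    return (int) triggers.size();
}

The llama_model_params hunk follows the same reasoning in the other direction: rather than being reordered freely, its booleans are kept grouped, as its comment says, to avoid misalignment during copy-by-value across the C API.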