llama: Add configuration presets for chat and reranking servers by heyyymonth · Pull Request #13462 · ggml-org/llama.cpp · GitHub

llama: Add configuration presets for chat and reranking servers #13462

Open · heyyymonth wants to merge 2 commits into master

Changes from 1 commit
common: add configuration presets for chat and reranking servers
Added two new configuration presets to simplify command-line usage:

1. --chat-llama3-8b-default: run a chat server with the Llama 3 8B model.
2. --rerank-bge-default: run a reranking server with the BGE reranker model.

These presets configure appropriate model paths, server ports, GPU settings, and other parameters.

Refs: #10932
heyyymonth committed May 12, 2025
commit 8d23bfc10f0783551994a43918386e114a6b3ade
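For context on the mechanism: each preset in the diff below is registered as an ordinary common_arg whose handler mutates the shared common_params struct, so a preset is just a bundle of defaults applied when its flag is parsed. Here is a minimal standalone sketch of that pattern; the server_params struct and hand-rolled loop are hypothetical simplifications, not llama.cpp's actual parser, and it assumes (as with the repo's existing presets) that handlers fire in command-line order, so a flag placed after the preset can still override an individual field.

    // Minimal sketch of the preset-as-handler pattern (hypothetical types,
    // not llama.cpp's real parser): a preset flag fills in a bundle of
    // fields; a later flag can still override any one of them.
    #include <cstdio>
    #include <functional>
    #include <map>
    #include <string>
    #include <vector>

    struct server_params {                 // trimmed-down stand-in for common_params
        std::string hf_repo;
        int         port         = 8080;
        int         n_gpu_layers = 0;
    };

    int main() {
        server_params params;

        // each preset flag maps to a handler that mutates the shared params
        std::map<std::string, std::function<void(server_params &)>> handlers = {
            { "--rerank-bge-default", [](server_params & p) {
                p.hf_repo      = "ggml-org/bge-reranker-base-Q8_0-GGUF";
                p.port         = 8090;
                p.n_gpu_layers = 99;
            } },
        };

        // simulated command line: preset first, then an explicit override
        std::vector<std::string> args = { "--rerank-bge-default", "--port", "9000" };
        for (size_t i = 0; i < args.size(); i++) {
            if (args[i] == "--port") {
                params.port = std::stoi(args[++i]);   // runs after the preset, so it wins
            } else {
                handlers.at(args[i])(params);
            }
        }

        // prints: repo=ggml-org/bge-reranker-base-Q8_0-GGUF port=9000 ngl=99
        std::printf("repo=%s port=%d ngl=%d\n",
                    params.hf_repo.c_str(), params.port, params.n_gpu_layers);
        return 0;
    }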
30 changes: 30 additions & 0 deletions common/arg.cpp
@@ -3325,5 +3325,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--chat-llama3-8b-default"},
+        string_format("use default Llama3 8B model for chat server (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Llama-3-8B-Q8_0-GGUF";
+            params.model.hf_file = "llama-3-8b-q8_0.gguf";
+            params.port = 8080;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 512;
+            params.n_batch = 512;
+            params.n_ctx = 4096;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--rerank-bge-default"},
+        string_format("use default BGE reranker model for reranking server (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/bge-reranker-base-Q8_0-GGUF";
+            params.model.hf_file = "bge-reranker-base-q8_0.gguf";
+            params.port = 8090;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ctx = 512;
+            params.reranking = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
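If merged, spinning up either server reduces to a single flag, e.g. llama-server --rerank-bge-default (binary name assumed from the repo's existing *-default presets); on first use the weights are fetched from the Hugging Face repos configured above, as the help text notes.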