Commit c504a54 · TabbyML/llama.cpp

Merge branch 'master' of https://github.com/ggerganov/llama.cpp into b2297

2 parents e841b7a + c29af7e

File tree: 19 files changed, +661 -651 lines changed


.github/workflows/build.yml
Lines changed: 22 additions & 0 deletions

@@ -145,6 +145,28 @@ jobs:
           cd build
           ctest -L main --verbose
 
+  ubuntu-22-cmake-vulkan:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libvulkan-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_VULKAN=ON ..
+          cmake --build . --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04

README.md
Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Hot topics
 
+- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
 - Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590

common/common.cpp
Lines changed: 1 addition & 3 deletions

@@ -1015,7 +1015,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf(" --no-penalize-nl do not penalize newline token\n");
     printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
-    printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
+    printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
     printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
@@ -1281,7 +1281,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_batch = params.n_batch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.mul_mat_q = params.mul_mat_q;
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embedding = params.embedding;
@@ -1725,7 +1724,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
     fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

common/common.h
Lines changed: 0 additions & 1 deletion

@@ -115,7 +115,6 @@ struct gpt_params {
 
     bool kl_divergence = false; // compute KL-divergence
 
-    bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode

convert-hf-to-gguf.py
Lines changed: 10 additions & 5 deletions

@@ -96,9 +96,11 @@ def set_gguf_parameters(self):
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
 
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
@@ -220,6 +222,8 @@ def from_model_architecture(model_architecture):
            return NomicBertModel
        if model_architecture == "GemmaForCausalLM":
            return GemmaModel
+        if model_architecture == "Starcoder2ForCausalLM":
+            return Model
        return Model
 
    def _is_model_safetensors(self) -> bool:
@@ -281,6 +285,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
            return gguf.MODEL_ARCH.NOMIC_BERT
        if arch == "GemmaForCausalLM":
            return gguf.MODEL_ARCH.GEMMA
+        if arch == "Starcoder2ForCausalLM":
+            return gguf.MODEL_ARCH.STARCODER2
 
        raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
@@ -1811,16 +1817,15 @@ def write_tensors(self):
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
 
        for name, data_torch in self.get_tensors():
-            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
-            if name.endswith("norm.weight"):
-                data_torch = data_torch + 1
-
            old_dtype = data_torch.dtype
 
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
 
+            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+            if name.endswith("norm.weight"):
+                data_torch = data_torch + 1
            data = data_torch.squeeze().numpy()
 
            # map tensor names
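
The converter changes above read `rope_theta` and the additional `norm_epsilon` key directly from a checkpoint's `config.json`, and route `Starcoder2ForCausalLM` through the generic `Model` path. As a quick pre-conversion sanity check you can dump the relevant keys yourself; the snippet below is a minimal, hypothetical helper (only the key names mirror this diff — everything else is an assumption, not part of `convert-hf-to-gguf.py`).

```python
# Hypothetical pre-conversion check: list the hparams that the updated
# converter looks for in a Hugging Face model directory's config.json.
import json
import sys
from pathlib import Path

def inspect_hparams(model_dir: str) -> None:
    config = json.loads((Path(model_dir) / "config.json").read_text())
    print("architectures:", config.get("architectures"))
    # keys touched by this change in set_gguf_parameters()
    for key in ("rope_theta", "rms_norm_eps", "layer_norm_eps",
                "layer_norm_epsilon", "norm_epsilon", "num_key_value_heads"):
        print(f"{key}: {config.get(key, '<not set>')}")

if __name__ == "__main__":
    inspect_hparams(sys.argv[1] if len(sys.argv) > 1 else ".")
```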

examples/batched-bench/batched-bench.cpp
Lines changed: 6 additions & 12 deletions

@@ -32,16 +32,15 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
     }
 
     int n_kv_max = 2048;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
-    int mmq = 0;
 
     std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
     std::vector<int> n_tg = { 128, 256, };
@@ -65,19 +64,15 @@
     }
 
     if (argc >= 6) {
-        mmq = std::atoi(argv[5]);
+        n_pp = parse_list(argv[5]);
     }
 
     if (argc >= 7) {
-        n_pp = parse_list(argv[6]);
+        n_tg = parse_list(argv[6]);
     }
 
     if (argc >= 8) {
-        n_tg = parse_list(argv[7]);
-    }
-
-    if (argc >= 9) {
-        n_pl = parse_list(argv[8]);
+        n_pl = parse_list(argv[7]);
     }
 
     // init LLM
@@ -106,7 +101,6 @@
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_max;
     ctx_params.n_batch = 512;
-    ctx_params.mul_mat_q = mmq;
 
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -159,7 +153,7 @@
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

examples/llama-bench/README.md
Lines changed: 0 additions & 1 deletion

@@ -35,7 +35,6 @@ options:
   -mg, --main-gpu <i> (default: 0)
   -nkvo, --no-kv-offload <0|1> (default: 0)
   -mmp, --mmap <0|1> (default: 1)
-  -mmq, --mul-mat-q <0|1> (default: 1)
   -ts, --tensor_split <ts0/ts1/..> (default: 0)
   -r, --repetitions <n> (default: 5)
   -o, --output <csv|json|md|sql> (default: md)

examples/llama-bench/llama-bench.cpp
Lines changed: 3 additions & 27 deletions

@@ -176,7 +176,6 @@ struct cmd_params {
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
-    std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     int reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu */ {0},
     /* no_kv_offload */ {false},
-    /* mul_mat_q */ {true},
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap */ {true},
     /* reps */ 5,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
     printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
-    bool mul_mat_q;
    std::vector<float> tensor_split;
    bool use_mmap;
 
@@ -518,7 +506,6 @@ struct cmd_params_instance {
        cparams.n_batch = n_batch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
-        cparams.mul_mat_q = mul_mat_q;
        cparams.offload_kqv = !no_kv_offload;
 
        return cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & nb : params.n_batch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
-    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode = */ sm,
                /* .main_gpu = */ mg,
                /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q = */ mmq,
                /* .tensor_split = */ ts,
                /* .use_mmap = */ mmp,
            };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode = */ sm,
                /* .main_gpu = */ mg,
                /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q = */ mmq,
                /* .tensor_split = */ ts,
                /* .use_mmap = */ mmp,
            };
@@ -616,7 +600,6 @@ struct test {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
-    bool mul_mat_q;
    std::vector<float> tensor_split;
    bool use_mmap;
    int n_prompt;
@@ -639,7 +622,6 @@
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
-        mul_mat_q = inst.mul_mat_q;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        n_prompt = inst.n_prompt;
@@ -713,7 +695,7 @@
            "n_batch", "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split", "use_mmap",
+            "tensor_split", "use_mmap",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@@ -733,7 +715,7 @@
        }
        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "mul_mat_q" || field == "use_mmap") {
+            field == "use_mmap") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -767,7 +749,7 @@
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
+            tensor_split_str, std::to_string(use_mmap),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -931,9 +913,6 @@ struct markdown_printer : public printer {
        if (field == "n_threads") {
            return "threads";
        }
-        if (field == "mul_mat_q") {
-            return "mmq";
-        }
        if (field == "no_kv_offload") {
            return "nkvo";
        }
@@ -974,9 +953,6 @@ struct markdown_printer : public printer {
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
-        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
-            fields.emplace_back("mul_mat_q");
-        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }

examples/server/README.md
Lines changed: 3 additions & 15 deletions

@@ -18,6 +18,7 @@ The project is under active development, and we are [looking for feedback and co
 
 - `--threads N`, `-t N`: Set the number of threads to use during generation.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `--threads-http N`: number of threads in the http server pool to process requests (default: `std::thread::hardware_concurrency()`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
@@ -325,7 +326,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 
-- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint.
 
     *Options:*
 
@@ -527,20 +528,7 @@ bash chat.sh
 
 ### API like OAI
 
-API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
-This example must be used with server.cpp
-
-```sh
-python api_like_OAI.py
-```
-
-After running the API server, you can use it in Python by setting the API base URL.
-
-```python
-openai.api_base = "http://<Your api-server IP>:port"
-```
-
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+The HTTP server supports OAI-like API
 
 ### Extending or building alternative Web Front End
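
With `api_like_OAI.py` removed, the OAI-style workflow documented in this README now talks to the server's built-in `/v1/chat/completions` endpoint directly. The snippet below is a minimal sketch of such a client, assuming a `server` instance already running on the default `http://localhost:8080`; the model name and prompt are placeholders.

```python
# Minimal sketch: call the server's OpenAI-compatible chat endpoint directly,
# with no wrapper script. Host, port and model name are assumptions.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "local-model",  # the server answers with whatever model it loaded
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a haiku about llamas."},
        ],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```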
