Name and Version
llama.cpp commit: 5dd5d1a
Operating systems
Linux
Which llama.cpp modules do you know to be affected?
libllama (core library)
Problem description & steps to reproduce
Steps to replicate: build and run the code below. It outputs control tokens, even though it shouldn't.
#include"llama.h"
#include<cstdio>
#include<cstring>
#include<string>
#include<vector>staticvoidprint_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
printf("\n");
}
staticvoiddiscard_log(ggml_log_level level, constchar *text, void *user_data)
{}
intmain(int argc, char ** argv) {
llama_log_set(discard_log, nullptr);
// path to the model gguf file
std::string model_path;
// prompt to generate text from
std::string prompt = "<|im_start|>system\nAct as a professional programmer, and answer questions based on the given prompt and data. Here are some rules you must always follow in your responses:\n- Generate only the requested output. Don't include any other language before or after the requested output. Answer in markdown language.\n- Only answer in plain English in markdown. Never respond in code. Never give code snippets.\n<|im_end|>\n<|im_start|>user\nProvide a VERY SHORT overview of the following local object called \"cp2\". Do not state the obvious. Do not respond with a title/header.\n```c\n register char *cp1, *cp2;\n\n```\n<|im_end|>\n<|im_start|>assistant\n";
// number of layers to offload to the GPUint ngl = 99;
// number of tokens to predictint n_predict = 1024;
    // parse command line arguments
    {
        int i = 1;
        for (; i < argc; i++) {
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_path = argv[++i];
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-n") == 0) {
                if (i + 1 < argc) {
                    try {
                        n_predict = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-ngl") == 0) {
                if (i + 1 < argc) {
                    try {
                        ngl = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else {
                // prompt starts here
                break;
            }
        }
        if (model_path.empty()) {
            print_usage(argc, argv);
            return 1;
        }
        if (i < argc) {
            prompt = argv[i++];
            for (; i < argc; i++) {
                prompt += " ";
                prompt += argv[i];
            }
        }
    }
    // load dynamic backends
    ggml_backend_load_all();

    // initialize the model
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // tokenize the prompt

    // find the number of tokens in the prompt
    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, false, true);

    // allocate space for the tokens and tokenize the prompt
    std::vector<llama_token> prompt_tokens(n_prompt);
    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), false, true) < 0) {
        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
        return 1;
    }

    // initialize the context
    llama_context_params ctx_params = llama_context_default_params();
    // n_ctx is the context size
    ctx_params.n_ctx = n_prompt + n_predict - 1;
    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
    ctx_params.n_batch = n_prompt;
    // enable performance counters
    ctx_params.no_perf = false;

    llama_context * ctx = llama_init_from_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        return 1;
    }

    // initialize the sampler
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;
    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // prepare a batch for the prompt
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
    // main loop
    const auto t_main_start = ggml_time_us();
    int n_decode = 0;
    llama_token new_token_id;

    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }

        n_pos += batch.n_tokens;

        // sample the next token
        {
            new_token_id = llama_sampler_sample(smpl, ctx, -1);

            // is it an end of generation?
            if (llama_vocab_is_eog(vocab, new_token_id)) {
                break;
            }

            char buf[128];
            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
            if (n < 0) {
                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
                return 1;
            }
            std::string s(buf, n);
            printf("%s", s.c_str());
            fflush(stdout);

            // prepare the next batch with the sampled token
            batch = llama_batch_get_one(&new_token_id, 1);
            n_decode += 1;
        }
    }

    printf("\n");

    const auto t_main_end = ggml_time_us();

    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);

    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_model_free(model);

    return 0;
}
First Bad Commit
No response
Relevant log output
The output has control tokens like the following.
<|im_end|>
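A possible workaround, in case it helps narrow this down: the detokenization step in the loop above could skip pieces for tokens the vocabulary flags as control tokens (or, alternatively, pass special = false to llama_token_to_piece so special tokens are not rendered). Below is a minimal sketch of that change, not a definitive fix; it assumes the vocab, new_token_id, and batch variables from the repro code above, and assumes llama_vocab_is_control actually flags <|im_end|> for this model. The expectation remains that <|im_end|> should be caught by llama_vocab_is_eog instead, so this only hides the symptom.

            // workaround sketch: only print the piece if the vocab does not
            // flag this token as a control token (e.g. <|im_end|>)
            if (!llama_vocab_is_control(vocab, new_token_id)) {
                char buf[128];
                const int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
                if (n < 0) {
                    fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
                    return 1;
                }
                printf("%.*s", n, buf);
                fflush(stdout);
            }

            // the next batch is still prepared with the sampled token, as before
            batch = llama_batch_get_one(&new_token_id, 1);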