server : refactor slot input data, move tokenizer to HTTP thread by ngxson · Pull Request #10023 · ggml-org/llama.cpp

server : refactor slot input data, move tokenizer to HTTP thread #10023


Merged
13 commits merged on Oct 24, 2024
use llama_tokens everywhere
ngxson committed Oct 24, 2024
commit 7f7acdbec56a23930a80c2ccb414728f815651cc
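
This commit replaces the spelled-out std::vector<llama_token> with the llama_tokens alias throughout server.cpp. For context, a minimal sketch of what the alias is assumed to look like (the actual definition lives in the common headers, which are not part of this diff):

    #include <cstdint>
    #include <vector>

    typedef std::int32_t llama_token;               // token id type, as in llama.h
    using llama_tokens = std::vector<llama_token>;  // assumed convenience alias

Because a using declaration introduces an alias rather than a distinct type, the change is purely cosmetic: overload resolution and the behavior of existing call sites such as common_tokenize(...) are unchanged.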
examples/server/server.cpp: 16 changes (8 additions, 8 deletions)
@@ -86,7 +86,7 @@ struct server_task {
     int id = -1; // to be filled by server_queue
     int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
 
-    std::vector<llama_token> prompt_tokens;
+    llama_tokens prompt_tokens;
     server_task_type type;
     json data;
 
@@ -153,12 +153,12 @@ struct server_slot {
     int32_t n_prompt_tokens_processed = 0;
 
     // input prompt tokens
-    std::vector<llama_token> prompt_tokens;
+    llama_tokens prompt_tokens;
 
     size_t last_nl_pos = 0;
 
     std::string generated_text;
-    std::vector<llama_token> cache_tokens;
+    llama_tokens cache_tokens;
     std::vector<completion_token_output> generated_token_probs;
 
     server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
@@ -1184,7 +1184,7 @@ struct server_context {
         };
 
         if (slot.sparams.n_probs > 0) {
-            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+            const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
 
@@ -1235,7 +1235,7 @@ struct server_context {
         if (slot.sparams.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+                const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
 
                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(
@@ -1911,7 +1911,7 @@ struct server_context {
                 const int n_block_size = n_left / 2;
                 const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                std::vector<llama_token> new_tokens(
+                llama_tokens new_tokens(
                         prompt_tokens.begin(),
                         prompt_tokens.begin() + slot.params.n_keep);
 
@@ -2869,7 +2869,7 @@ int main(int argc, char ** argv) {
         const bool add_special = json_value(body, "add_special", false);
         const bool with_pieces = json_value(body, "with_pieces", false);
 
-        std::vector<llama_token> tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
+        llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
 
         if (with_pieces) {
             for (const auto& token : tokens) {
@@ -2906,7 +2906,7 @@ int main(int argc, char ** argv) {
 
         std::string content;
         if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body.at("tokens");
+            const llama_tokens tokens = body.at("tokens");
             content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
         }
 
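One detail worth noting in the /detokenize handler above is the line const llama_tokens tokens = body.at("tokens");, which relies on nlohmann::json's implicit conversion to a std::vector of integers. Since llama_tokens is presumably just an alias for std::vector<llama_token>, that conversion still compiles unchanged. A self-contained sketch (the alias and token type are assumptions, as noted earlier):

    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;
    typedef std::int32_t llama_token;                 // as in llama.h
    using llama_tokens = std::vector<llama_token>;    // assumed alias

    int main() {
        const json body = json::parse(R"({"tokens": [1, 15043, 3186]})");
        // json -> std::vector<int32_t> conversion works the same through the alias
        const llama_tokens tokens = body.at("tokens");
        return tokens.size() == 3 ? 0 : 1;
    }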