server : refactor slot input data, move tokenizer to HTTP thread by ngxson · Pull Request #10023 · ggml-org/llama.cpp · GitHub

server : refactor slot input data, move tokenizer to HTTP thread #10023


Merged · 13 commits merged on Oct 24, 2024
remove redundant code
ngxson committed Oct 24, 2024
commit 575b1332ab28c1892f726a6e92a815461bbc4240
examples/server/server.cpp: 10 changes (1 addition & 9 deletions)
@@ -715,10 +715,6 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
-        return tokenize_mixed(ctx, json_prompt, add_special, parse_special);
-    }
-
     server_slot * get_slot_by_id(int id) {
         for (server_slot & slot : slots) {
             if (slot.id == id) {
@@ -1352,10 +1348,6 @@ struct server_context {
     std::vector<server_task> create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) {
         std::vector<server_task> tasks;
         auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
-            if (prompt_tokens.empty()) {
-                // TODO @ngxson : should not throw an error
-                throw std::runtime_error("prompt must not be empty");
-            }
             SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
             server_task task;
             task.id = queue_tasks.get_new_id();
@@ -2877,7 +2869,7 @@ int main(int argc, char ** argv) {
        const bool add_special = json_value(body, "add_special", false);
        const bool with_pieces = json_value(body, "with_pieces", false);
 
-        std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
+        std::vector<llama_token> tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
 
        if (with_pieces) {
            for (const auto& token : tokens) {
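
For context, a minimal sketch of the `/tokenize` handler path after this commit: the `server_context::tokenize()` member wrapper only forwarded to the free function `tokenize_mixed()`, so callers now invoke `tokenize_mixed()` directly and pass the server's `llama_context`. The sketch below is not compilable on its own; it assumes the surrounding `server.cpp` context shown in the diff (`ctx_server`, `json_value()`, `tokenize_mixed()`, the JSON `body`), and the empty-prompt handling is an assumption based on the TODO in the removed check, not code from this PR.

```cpp
// Sketch only: handler-side flow after removing the tokenize() wrapper.
const bool add_special = json_value(body, "add_special", false);
const bool with_pieces = json_value(body, "with_pieces", false);

// Call the free function directly, passing the llama_context explicitly,
// instead of going through the removed ctx_server.tokenize(...) wrapper.
std::vector<llama_token> tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);

// The "prompt must not be empty" throw was removed from create_tasks_cmpl();
// per its TODO, validation like this is assumed to move to the HTTP thread,
// where it can become a proper error response rather than an exception.
if (tokens.empty()) {
    // e.g. reply with HTTP 400 before any task is created (assumption)
}
```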