8000 server : (refactor) no more json in server_task input by ngxson · Pull Request #10691 · ggml-org/llama.cpp · GitHub
Merged · 8 commits · Dec 7, 2024
Changes from 1 commit
add test for slots endpoint
ngxson committed Dec 7, 2024
commit 9bb1ae6beacf652cd6a10189571bfc21b8380310
1 change: 1 addition & 0 deletions examples/server/server.cpp
@@ -2184,6 +2184,7 @@ struct server_context {
 
             auto res = std::make_unique<server_task_result_metrics>();
             res->id = task.id;
+            res->slots_data = slots_data;
             res->n_idle_slots = n_idle_slots;
             res->n_processing_slots = n_processing_slots;
             res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
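The new slots_data field carries the per-slot state on the typed result struct (server_task_result_metrics) rather than as pre-built JSON, in line with the PR's goal of keeping JSON out of the server_task plumbing; the /slots handler serializes it into the response that the new test below checks.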
11 changes: 11 additions & 0 deletions examples/server/tests/unit/test_basic.py
@@ -33,6 +33,17 @@ def test_server_models():
     assert len(res.body["data"]) == 1
     assert res.body["data"][0]["id"] == server.model_alias
 
+
+def test_server_slots():
+    global server
+    server.server_slots = True
+    server.start()
+    res = server.make_request("GET", "/slots")
+    assert res.status_code == 200
+    assert len(res.body) == server.n_slots
+    assert res.body[0]["n_ctx"] > 0
+
+
 def test_load_split_model():
     global server
     server.model_hf_repo = "ggml-org/models"
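The test pins down the response shape: /slots returns a JSON array with one object per slot, each carrying at least an n_ctx field. A minimal standalone sketch of the same check (host, port, and the use of requests are assumptions for illustration; the harness wraps this in make_request):

    import requests

    # Query /slots on a running llama-server started with --slots.
    # Host and port are placeholders; adjust to your setup.
    res = requests.get("http://localhost:8080/slots")
    res.raise_for_status()

    slots = res.json()            # one entry per slot, as asserted above
    for slot in slots:
        assert slot["n_ctx"] > 0  # every slot reports a positive context size
    print(f"server reports {len(slots)} slot(s)")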
4 changes: 2 additions & 2 deletions examples/server/tests/unit/test_chat_completion.py
@@ -30,7 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content
         ],
     })
     assert res.status_code == 200
-    assert "cmpl" in res.body["id"]
+    assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -66,7 +66,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content
         assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
-        assert last_cmpl_id == data["id"]
+        assert last_cmpl_id == data["id"]  # make sure the completion id is the same for all events in the stream
         if choice["finish_reason"] in ["stop", "length"]:
             assert data["usage"]["prompt_tokens"] == n_prompt
             assert data["usage"]["completion_tokens"] == n_predicted
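The comments added here document two invariants: the completion id follows the "cmpl" naming convention, and every event in a stream reuses the id of the first event. A standalone sketch of the streaming check against an OAI-compatible endpoint (URL and prompt are placeholders for illustration):

    import json
    import requests

    # Stream a chat completion and check that all SSE events share one id.
    with requests.post(
        "http://localhost:8080/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": "Hello"}],
            "stream": True,
        },
        stream=True,
    ) as res:
        last_cmpl_id = None
        for raw in res.iter_lines():
            if not raw.startswith(b"data: ") or raw == b"data: [DONE]":
                continue
            data = json.loads(raw[len(b"data: "):])
            assert "cmpl" in data["id"]        # expected id format
            if last_cmpl_id is None:
                last_cmpl_id = data["id"]
            assert last_cmpl_id == data["id"]  # stable across all events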
3 changes: 3 additions & 0 deletions examples/server/tests/utils.py
@@ -64,6 +64,7 @@ class ServerProcess:
     server_embeddings: bool | None = False
     server_reranking: bool | None = False
     server_metrics: bool | None = False
+    server_slots: bool | None = False
     draft: int | None = None
     api_key: str | None = None
     response_format: str | None = None
@@ -129,6 +130,8 @@ def start(self, timeout_seconds: int = 10) -> None:
             server_args.append("--reranking")
         if self.server_metrics:
             server_args.append("--metrics")
+        if self.server_slots:
+            server_args.append("--slots")
         if self.model_alias:
             server_args.extend(["--alias", self.model_alias])
         if self.n_ctx:
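With the new attribute, a test opts into the endpoint by flipping one flag before start(); the harness then appends --slots to the launch command, following the same boolean-attribute-to-CLI-flag convention as --reranking and --metrics. A minimal usage sketch (instantiation is simplified; the suite normally configures ServerProcess through its fixtures):

    from utils import ServerProcess

    server = ServerProcess()
    server.server_slots = True     # start() appends "--slots" (see diff above)
    server.start()

    res = server.make_request("GET", "/slots")
    assert res.status_code == 200  # endpoint is enabled and reachable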