examples high_level_api: add openapi client example · abetlen/llama-cpp-python@4f89e2f · GitHub

Commit 4f89e2f

examples high_level_api: add openapi client example
Includes examples that:
- Synchronously use the openapi client to make requests
- Asynchronously use the openapi client to make requests
- Make a streaming request (just uses the autogenerated models for now)
1 parent de73460 commit 4f89e2f

File tree

1 file changed: +112 −0 lines changed

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
import json
from pprint import pprint
import asyncio

import aiohttp
import llama_cpp

import llama_cpp.client as llama_cpp_client
import llama_cpp.client.aio as llama_cpp_client_aio
import llama_cpp.client.models as llama_cpp_models

LLAMA_SERVER_BASE_URL = "http://localhost:8000"


async def main():
    # Create client
    client = llama_cpp_client.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    )

    # List models
    models = client.get.models_v1_models_get()
    print("Models:")
    pprint(models.as_dict())

    # Completion (async)
    async with llama_cpp_client_aio.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    ) as aio_client:
        completion = await aio_client.create.completion_v1_completions_post(
            body={
                "model": "example",
                "prompt": "The quick brown fox jumps over the lazy dog.",
                "max_tokens": 50,
            }
        )
        print("Completion:")
        pprint(completion.as_dict())

        # Embedding (async)
        # This time we'll use a model for the request instead of an untyped dictionary
        embedding = await aio_client.create.embedding_v1_embeddings_post(
            body=llama_cpp_models.CreateEmbeddingRequest(
                model="example",
                input="The quick brown fox jumps over the lazy dog.",
            )
        )
        print("Embedding:")
        pprint(embedding.as_dict())

        # Chat completion (async)
        chat_completion = (
            await aio_client.create.chat_completion_v1_chat_completions_post(
                body=llama_cpp_models.CreateChatCompletionRequest(
                    model="example",
                    messages=[
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ),
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ),
                    ],
                    temperature=0.5,
                )
            )
        )

        print("Chat completion:")
        pprint(chat_completion.as_dict())

    # Chat completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateChatCompletionRequest(
            model="example",
            messages=[
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="system", content="You are a helpful assistant."
                ),
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="user", content="Tell me the story of the three little pigs."
                ),
            ],
            max_tokens=200,
            temperature=2,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/chat/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                chunk_str = line.decode("utf-8")[len("data: ") :].strip()
                if chunk_str == "":
                    continue
                elif chunk_str == "[DONE]":
                    print("")
                    break
                else:
                    chunk_json = json.loads(chunk_str)
                    chunk = llama_cpp.ChatCompletionChunk(**chunk_json)
                    delta = chunk["choices"][0]["delta"]
                    if "content" in delta:
                        print(delta["content"], end="")

    print("done!")


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
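
The hand-rolled parsing in the streaming section (flagged by its own "This sure seems like the wrong way to do this..." comment) could be factored into a small helper. Below is a minimal sketch using only the standard library and the same aiohttp response object as above; iter_sse_json is a hypothetical name introduced here, not part of llama_cpp or the generated client:

import json

async def iter_sse_json(response):
    """Yield parsed JSON payloads from an aiohttp response that streams
    server-sent events of the form "data: {...}", stopping at "[DONE]"."""
    async for line in response.content:
        text = line.decode("utf-8").strip()
        if not text.startswith("data: "):
            continue  # skip blank keep-alive lines and anything that isn't a data event
        payload = text[len("data: "):]
        if payload == "[DONE]":
            break
        yield json.loads(payload)

With such a helper, the streaming loop above would reduce to iterating async for chunk_json in iter_sse_json(response) and printing chunk_json["choices"][0]["delta"].get("content", "").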

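The commit message also mentions synchronous requests, while the example itself only lists models synchronously. A rough sketch of a synchronous completion call follows, under the assumption (not verified against the generated code) that the sync client in llama_cpp.client exposes the same create.completion_v1_completions_post operation as the aio client used above:

import llama_cpp.client as llama_cpp_client

LLAMA_SERVER_BASE_URL = "http://localhost:8000"

client = llama_cpp_client.LlamaCppPythonAPI(endpoint=LLAMA_SERVER_BASE_URL)

# Assumes the sync client mirrors the aio client's operation names, so the
# same untyped dict body as in the async example should be accepted here.
completion = client.create.completion_v1_completions_post(
    body={
        "model": "example",
        "prompt": "The quick brown fox jumps over the lazy dog.",
        "max_tokens": 50,
    }
)
print(completion.as_dict())

Either way, a llama-cpp-python server has to be listening at LLAMA_SERVER_BASE_URL before any of these requests will succeed.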
0 commit comments
