examples high_level_api: add openapi client example · abetlen/llama-cpp-python@68f1b57 · GitHub
Commit 68f1b57

examples high_level_api: add openapi client example
Includes examples that:
- Synchronously use the openapi client to make requests
- Asynchronously use the openapi client to make requests
- Make a streaming asynchronous request (just uses the autogenerated models for now)
1 parent ed68e44 commit 68f1b57
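
For orientation, the committed example targets a llama-cpp-python server reachable at http://localhost:8000. A minimal sketch of the synchronous path, distilled from the file added below (not additional committed code), is just constructing the generated client against the server and calling one of its operation methods:

    import llama_cpp.client as llama_cpp_client

    # Point the generated OpenAPI client at a running llama-cpp-python server;
    # the committed example assumes http://localhost:8000.
    client = llama_cpp_client.LlamaCppPythonAPI(endpoint="http://localhost:8000")

    # The same call the example file uses to list the models the server exposes.
    models = client.get.models_v1_models_get()
    print(models.as_dict())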

1 file changed: 145 additions, 0 deletions
@@ -0,0 +1,145 @@
import json
from pprint import pprint
import asyncio

import aiohttp
import llama_cpp

import llama_cpp.client as llama_cpp_client
import llama_cpp.client.aio as llama_cpp_client_aio
import llama_cpp.client.models as llama_cpp_models

LLAMA_SERVER_BASE_URL = "http://localhost:8000"


async def main():
    # Create client
    client = llama_cpp_client.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    )

    # List models
    models = client.get.models_v1_models_get()
    print("Models:")
    pprint(models.as_dict())

    # Completion (async)
    async with llama_cpp_client_aio.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    ) as aio_client:
        completion = await aio_client.create.completion_v1_completions_post(
            body={
                "model": "example",
                "prompt": "The quick brown fox jumps over the lazy dog.",
                "max_tokens": 50,
            }
        )
        print("Completion:")
        pprint(completion.as_dict())

        # Embedding (async)
        # This time we'll use a model for the request instead of an untyped dictionary
        embedding = await aio_client.create.embedding_v1_embeddings_post(
            body=llama_cpp_models.CreateEmbeddingRequest(
                model="example",
                input="The quick brown fox jumps over the lazy dog.",
            )
        )
        print("Embedding:")
        pprint(embedding.as_dict())

        # Chat completion (async)
        chat_completion = (
            await aio_client.create.chat_completion_v1_chat_completions_post(
                body=llama_cpp_models.CreateChatCompletionRequest(
                    model="example",
                    messages=[
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ),
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ),
                    ],
                    temperature=0.5,
                )
            )
        )
        print("Chat completion:")
        pprint(chat_completion.as_dict())

    # Chat completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateChatCompletionRequest(
            model="example",
            messages=[
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="system", content="You are a helpful assistant."
                ),
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="user",
                    content="Tell me the story of the three little pigs in the style of a pirate.",
                ),
            ],
            max_tokens=200,
            temperature=2,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/chat/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                line = line.decode("utf-8")
                if line.startswith("data: "):
                    chunk_str = line[len("data: ") :].strip()
                    if chunk_str == "":
                        continue
                    elif chunk_str == "[DONE]":
                        print("")
                        break
                    else:
                        chunk_json = json.loads(chunk_str)
                        chunk = llama_cpp.ChatCompletionChunk(**chunk_json)
                        delta = chunk["choices"][0]["delta"]
                        if "content" in delta:
                            print(delta["content"], end="")

    # Completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateCompletionRequest(
            model="example",
            prompt="""### Human: recipe for stuffed mushrooms.
### Assistant:""",
            max_tokens=1500,
            temperature=1,
            top_p=0.55,
            top_k=33,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                line = line.decode("utf-8")
                if line.startswith("data: {"):
                    chunk_str = line[len("data: ") :].strip()

                    if chunk_str == "":
                        continue
                    elif chunk_str == "[DONE]":
                        print("")
                        break
                    else:
                        chunk_json = json.loads(chunk_str)
                        chunk = llama_cpp.CompletionChunk(**chunk_json)
                        text = chunk["choices"][0]["text"]
                        print(text, end="")
    print("done!")


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
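
The two streaming sections parse the server's "data: ..." Server-Sent Events lines by hand because the generated client cannot yet consume streaming responses. As a minimal sketch of how that repeated parsing could be factored out, assuming the same aiohttp session and endpoints used above (iter_sse_json is a hypothetical helper name, not part of this commit):

    import json

    import aiohttp


    async def iter_sse_json(session: aiohttp.ClientSession, url: str, payload: dict):
        # Yield each decoded JSON chunk from a "data: ..." Server-Sent Events stream,
        # stopping at the "[DONE]" sentinel the server sends when generation finishes.
        async with session.post(url, json=payload) as response:
            async for raw_line in response.content:
                line = raw_line.decode("utf-8")
                if not line.startswith("data: "):
                    continue
                chunk_str = line[len("data: ") :].strip()
                if chunk_str == "":
                    continue
                if chunk_str == "[DONE]":
                    break
                yield json.loads(chunk_str)

Each streaming loop above would then reduce to iterating this generator and reading the chunk's delta or text field.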
