(WIP) Openapi client gen by Stonelinks · Pull Request #144 · abetlen/llama-cpp-python

(WIP) Openapi client gen #144

Open · Stonelinks wants to merge 11 commits into main

4 changes: 3 additions & 1 deletion .gitignore
@@ -2,7 +2,7 @@ _skbuild/
 
 .envrc
 
-models/
+./models/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -164,3 +164,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+.tmp
11 changes: 11 additions & 0 deletions README.md
@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode

```bash
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git

# Install with pip
pip install -e .

# if you want to use the fastapi / openapi server
pip install -e .[server]

# If you're a poetry user, installing will also include a virtual environment
poetry install --all-extras
. .venv/bin/activate

# Will need to be re-run any time vendor/llama.cpp is updated
python3 setup.py develop
```
66 changes: 66 additions & 0 deletions bin/generate-api-client
@@ -0,0 +1,66 @@
#!/bin/bash
set -e

cd "$(dirname "$0")/.."

CLIENT_PACKAGE_NAME=llama_cpp.client
CLIENT_TMP_DIR=.tmp/llama-cpp-openapi-client
CLIENT_FINAL_DIR=llama_cpp/client

# Ensure we're starting from a clean slate
rm -rf .tmp $CLIENT_TMP_DIR $CLIENT_FINAL_DIR

# Pull the latest version of the API's OpenAPI spec
./bin/save-openapi-json
OPENAPI_JSON_PATH=.tmp/llama_server_openapi.json

# Generate the client

# Version tolerant: this is apparently their newer, more pythonic generator that is more tolerant of changes in the API
VERSION_TOLERANT=true

# Python3 only: we don't need to support python2
PYTHON3_ONLY=true

# Low level client: I think this is yet another client generator (???) - turn it off for now
LOW_LEVEL_CLIENT=false

# Models: IMO the client is totally pointless without generating models (which is disabled by default) - so enable it
# The only models that are supported are for msrest
MODELS_MODE=msrest

# Client-side validation: also disabled by default - I'm turning this off for now because I think there might be some problems with return types in the API
CLIENT_SIDE_VALIDATION=false

# Setup.py: the client lives in llama_cpp/client, don't generate a setup.py or anything else to support a standalone package
BASIC_SETUP_PY=false

# Run autorest
docker run --rm \
-v $PWD:/local \
-w /local \
azsdkengsys.azurecr.io/azuresdk/autorest-python \
autorest \
--python \
--namespace=$CLIENT_PACKAGE_NAME \
--package-name=llama-cpp-api-client \
--package-version=0.0.1 \
--basic-setup-py=$BASIC_SETUP_PY \
--version-tolerant=$VERSION_TOLERANT \
--python3-only=$PYTHON3_ONLY \
--low-level-client=$LOW_LEVEL_CLIENT \
--models-mode=$MODELS_MODE \
--client-side-validation=$CLIENT_SIDE_VALIDATION \
--combine-operation-files=false \
--clear-output-folder=true \
--no-namespace-folders=true \
--input-file=/local/$OPENAPI_JSON_PATH \
--output-folder=/local/$CLIENT_TMP_DIR

# Move the client to the final location
cp -r $CLIENT_TMP_DIR $CLIENT_FINAL_DIR

# Do some cleanup with black, isort and autoflake
black $CLIENT_FINAL_DIR
isort $CLIENT_FINAL_DIR
autoflake -r --in-place --remove-all-unused-imports --remove-unused-variables $CLIENT_FINAL_DIR
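Once the script completes, the regenerated package can be smoke-tested in place. A minimal sketch (not part of this PR; it assumes the llama_cpp server is already running on localhost:8000, as in the example further down):

```python
# Sketch only: smoke test for the regenerated client package.
# Assumes `python3 -m llama_cpp.server` is already serving on port 8000.
import llama_cpp.client as llama_cpp_client

client = llama_cpp_client.LlamaCppPythonAPI(endpoint="http://localhost:8000")
print(client.get.models_v1_models_get().as_dict())
```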
69 changes: 69 additions & 0 deletions bin/save-openapi-json
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

import subprocess
import os
import time
import urllib.request

LLAMA_SERVER_PORT = 8000
LLAMA_SERVER_BASE_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin"


def start_llama_server() -> subprocess.Popen:
    print("Starting llama_cpp server")

    env = os.environ.copy()
    env["PORT"] = str(LLAMA_SERVER_PORT)
    env["MODEL"] = MODEL
    env["VOCAB_ONLY"] = "true"
    server_process = subprocess.Popen(
        ["python3", "-m", "llama_cpp.server"],
        env=env,
    )

    # Wait for the server to start
    while not is_llama_server_running():
        if server_process.poll() is not None:
            raise RuntimeError("llama_cpp server failed to start")
        else:
            time.sleep(0.1)
            print("Waiting for llama_cpp server to start...")

    return server_process


def stop_llama_server(server_process: subprocess.Popen):
    print("Stopping llama_cpp server")
    server_process.kill()


def is_llama_server_running():
    try:
        with urllib.request.urlopen(f"{LLAMA_SERVER_BASE_URL}/v1/models"):
            return True
    except Exception:
        return False


def scrape(url, path):
    with urllib.request.urlopen(url) as response:
        data = response.read()
        data_str = data.decode("utf-8")

    out_dir = os.path.dirname(path)
    os.makedirs(out_dir, exist_ok=True)
    with open(path, "w") as f:
        f.write(data_str)


def scrape_llama_server_openapi_json():
    llama_server_process = start_llama_server()

    try:
        scrape(f"{LLAMA_SERVER_BASE_URL}/openapi.json", ".tmp/llama_server_openapi.json")
    finally:
        stop_llama_server(llama_server_process)


if __name__ == "__main__":
    scrape_llama_server_openapi_json()
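For a quick sanity check of what was scraped, the spec can be loaded and its endpoints listed; a sketch, assuming ./bin/save-openapi-json has already been run:

```python
# Sketch only: inspect the spec written by ./bin/save-openapi-json.
import json

with open(".tmp/llama_server_openapi.json") as f:
    spec = json.load(f)

# An OpenAPI document lists its endpoints under "paths"; these are what
# the generated client will expose.
print(sorted(spec["paths"]))
```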
145 changes: 145 additions & 0 deletions examples/high_level_api/openapi_client.py
@@ -0,0 +1,145 @@
import json
from pprint import pprint
import asyncio

import aiohttp
import llama_cpp

import llama_cpp.client as llama_cpp_client
import llama_cpp.client.aio as llama_cpp_client_aio
import llama_cpp.client.models as llama_cpp_models

LLAMA_SERVER_BASE_URL = "http://localhost:8000"


async def main():
    # Create client
    client = llama_cpp_client.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    )

    # List models
    models = client.get.models_v1_models_get()
    print("Models:")
    pprint(models.as_dict())

    # Completion (async)
    async with llama_cpp_client_aio.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    ) as aio_client:
        completion = await aio_client.create.completion_v1_completions_post(
            body={
                "model": "example",
                "prompt": "The quick brown fox jumps over the lazy dog.",
                "max_tokens": 50,
            }
        )
        print("Completion:")
        pprint(completion.as_dict())

        # Embedding (async)
        # This time we'll use a model for the request instead of an untyped dictionary
        embedding = await aio_client.create.embedding_v1_embeddings_post(
            body=llama_cpp_models.CreateEmbeddingRequest(
                model="example",
                input="The quick brown fox jumps over the lazy dog.",
            )
        )
        print("Embedding:")
        pprint(embedding.as_dict())

        # Chat completion (async)
        chat_completion = (
            await aio_client.create.chat_completion_v1_chat_completions_post(
                body=llama_cpp_models.CreateChatCompletionRequest(
                    model="example",
                    messages=[
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ),
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ),
                    ],
                    temperature=0.5,
                )
            )
        )

        print("Chat completion:")
        pprint(chat_completion.as_dict())

    # Chat completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateChatCompletionRequest(
            model="example",
            messages=[
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="system", content="You are a helpful assistant."
                ),
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="user",
                    content="Tell me the story of the three little pigs in the style of a pirate.",
                ),
            ],
            max_tokens=200,
            temperature=2,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/chat/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                line = line.decode("utf-8")
                if line.startswith("data: "):
                    chunk_str = line[len("data: ") :].strip()
                    if chunk_str == "":
                        continue
                    elif chunk_str == "[DONE]":
                        print("")
                        break
                    else:
                        chunk_json = json.loads(chunk_str)
                        chunk = llama_cpp.ChatCompletionChunk(**chunk_json)
                        delta = chunk["choices"][0]["delta"]
                        if "content" in delta:
                            print(delta["content"], end="")

    # Completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateCompletionRequest(
            model="example",
            prompt="""### Human: recipe for stuffed mushrooms.
### Assistant:""",
            max_tokens=1500,
            temperature=1,
            top_p=0.55,
            top_k=33,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                line = line.decode("utf-8")
                if line.startswith("data: {"):
                    chunk_str = line[len("data: ") :].strip()

                    if chunk_str == "":
                        continue
                    elif chunk_str == "[DONE]":
                        print("")
                        break
                    else:
                        chunk_json = json.loads(chunk_str)
                        chunk = llama_cpp.CompletionChunk(**chunk_json)
                        text = chunk["choices"][0]["text"]
                        print(text, end="")
    print("done!")


if __name__ == "__main__":
    asyncio.run(main())
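The two streaming loops above duplicate the same "data: ..." / "[DONE]" handling. Until the generated client can stream, that parsing could be factored into a small helper; a sketch (the iter_sse_json name and shape are hypothetical, not part of this PR):

```python
# Sketch only: shared SSE parsing for the two streaming examples above.
import json
from typing import AsyncIterator

import aiohttp


async def iter_sse_json(content: aiohttp.StreamReader) -> AsyncIterator[dict]:
    """Yield each JSON payload from a "data: ..." server-sent-events stream."""
    async for raw_line in content:
        line = raw_line.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue
        payload = line[len("data: "):].strip()
        if not payload:
            continue
        if payload == "[DONE]":
            break
        yield json.loads(payload)
```

Each streaming block could then reduce to `async for chunk_json in iter_sse_json(response.content): ...` with the chunk handling unchanged.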
24 changes: 24 additions & 0 deletions llama_cpp/client/__init__.py
@@ -0,0 +1,24 @@
# coding=utf-8
# --------------------------------------------------------------------------
# Code generated by Microsoft (R) AutoRest Code Generator (autorest: 3.9.5, generator: @autorest/python@6.4.15)
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
# --------------------------------------------------------------------------

from ._client import LlamaCppPythonAPI
from ._version import VERSION

__version__ = VERSION

try:
    from ._patch import *  # pylint: disable=unused-wildcard-import
    from ._patch import __all__ as _patch_all
except ImportError:
    _patch_all = []
from ._patch import patch_sdk as _patch_sdk

__all__ = [
    "LlamaCppPythonAPI",
]
__all__.extend([p for p in _patch_all if p not in __all__])

_patch_sdk()