(WIP) Openapi client gen by Stonelinks · Pull Request #144 · abetlen/llama-cpp-python

(WIP) Openapi client gen #144

Open · Stonelinks wants to merge 11 commits into main

4 changes: 3 additions & 1 deletion .gitignore
@@ -2,7 +2,7 @@ _skbuild/
 
 .envrc
 
-models/
+./models/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -164,3 +164,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+.tmp
11 changes: 11 additions & 0 deletions README.md
@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode

```bash
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git

# Install with pip
pip install -e .

# if you want to use the fastapi / openapi server
pip install -e .[server]

# If you're a poetry user, installing will also include a virtual environment
poetry install --all-extras
. .venv/bin/activate

# Will need to be re-run any time vendor/llama.cpp is updated
python3 setup.py develop
```
66 changes: 66 additions & 0 deletions bin/generate-api-client
@@ -0,0 +1,66 @@
#!/bin/bash
set -e

cd "$(dirname "$0")/.."

CLIENT_PACKAGE_NAME=llama_cpp.client
CLIENT_TMP_DIR=.tmp/llama-cpp-openapi-client
CLIENT_FINAL_DIR=llama_cpp/client

# Ensure we're starting from a clean slate
rm -rf .tmp $CLIENT_TMP_DIR $CLIENT_FINAL_DIR

# Pull the latest version of the API's OpenAPI spec
./bin/save-openapi-json
OPENAPI_JSON_PATH=.tmp/llama_server_openapi.json

# Generate the client

# Version tolerant: this is apparently their newer, more pythonic generator that is more tolerant of changes in the API
VERSION_TOLERANT=true

# Python3 only: we don't need to support python2
PYTHON3_ONLY=true

# Low level client: I think this is yet another client generator (???) - turn it off for now
LOW_LEVEL_CLIENT=false

# Models: IMO the client is totally pointless without generating models (which is disabled by default) - so enable it
# The only models that are supported are for msrest
MODELS_MODE=msrest

# Client-side validation: also disabled by default - I'm turning this off for now because I think there might be some problems with return types in the API
CLIENT_SIDE_VALIDATION=false

# Setup.py: the client lives in llama_cpp/client, don't generate a setup.py or anything else to support a standalone package
BASIC_SETUP_PY=false

# Run autorest
docker run --rm \
-v $PWD:/local \
-w /local \
azsdkengsys.azurecr.io/azuresdk/autorest-python \
autorest \
--python \
--namespace=$CLIENT_PACKAGE_NAME \
--package-name=llama-cpp-api-client \
--package-version=0.0.1 \
--basic-setup-py=$BASIC_SETUP_PY \
--version-tolerant=$VERSION_TOLERANT \
--python3-only=$PYTHON3_ONLY \
--low-level-client=$LOW_LEVEL_CLIENT \
--models-mode=$MODELS_MODE \
--client-side-validation=$CLIENT_SIDE_VALIDATION \
--combine-operation-files=false \
--clear-output-folder=true \
--no-namespace-folders=true \
--input-file=/local/$OPENAPI_JSON_PATH \
--output-folder=/local/$CLIENT_TMP_DIR

# Move the client to the final location
cp -r $CLIENT_TMP_DIR $CLIENT_FINAL_DIR

# Do some cleanup with black, isort and autoflake
black $CLIENT_FINAL_DIR
isort $CLIENT_FINAL_DIR
autoflake -r --in-place --remove-all-unused-imports --remove-unused-variables $CLIENT_FINAL_DIR
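Once the script completes, the regenerated package can be smoke-tested in place. A minimal sketch (not part of this PR; it assumes the llama_cpp server is already running on localhost:8000, as in the example further down):

```python
# Sketch only: smoke test for the regenerated client package.
# Assumes `python3 -m llama_cpp.server` is already serving on port 8000.
import llama_cpp.client as llama_cpp_client

client = llama_cpp_client.LlamaCppPythonAPI(endpoint="http://localhost:8000")
print(client.get.models_v1_models_get().as_dict())
```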
69 changes: 69 additions & 0 deletions bin/save-openapi-json
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

import subprocess
import os
import time
import urllib.request

LLAMA_SERVER_PORT = 8000
LLAMA_SERVER_BASE_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin"


def start_llama_server() -> subprocess.Popen:
    print("Starting llama_cpp server")

    env = os.environ.copy()
    env["PORT"] = str(LLAMA_SERVER_PORT)
    env["MODEL"] = MODEL
    env["VOCAB_ONLY"] = "true"
    server_process = subprocess.Popen(
        ["python3", "-m", "llama_cpp.server"],
        env=env,
    )

    # Wait for the server to start
    while not is_llama_server_running():
        if server_process.poll() is not None:
            raise RuntimeError("llama_cpp server failed to start")
        else:
            time.sleep(0.1)
            print("Waiting for llama_cpp server to start...")

    return server_process


def stop_llama_server(server_process: subprocess.Popen):
    print("Stopping llama_cpp server")
    server_process.kill()


def is_llama_server_running():
    try:
        with urllib.request.urlopen(f"{LLAMA_SERVER_BASE_URL}/v1/models"):
            return True
    except Exception:
        return False


def scrape(url, path):
    with urllib.request.urlopen(url) as response:
        data = response.read()
        data_str = data.decode("utf-8")

    out_dir = os.path.dirname(path)
    os.makedirs(out_dir, exist_ok=True)
    with open(path, "w") as f:
        f.write(data_str)


def scrape_llama_server_openapi_json():
    llama_server_process = start_llama_server()

    try:
        scrape(f"{LLAMA_SERVER_BASE_URL}/openapi.json", ".tmp/llama_server_openapi.json")
    finally:
        stop_llama_server(llama_server_process)


if __name__ == "__main__":
    scrape_llama_server_openapi_json()
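For a quick sanity check of what was scraped, the spec can be loaded and its endpoints listed; a sketch, assuming ./bin/save-openapi-json has already been run:

```python
# Sketch only: inspect the spec written by ./bin/save-openapi-json.
import json

with open(".tmp/llama_server_openapi.json") as f:
    spec = json.load(f)

# An OpenAPI document lists its endpoints under "paths"; these are what
# the generated client will expose.
print(sorted(spec["paths"]))
```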
145 changes: 145 additions & 0 deletions examples/high_level_api/openapi_client.py
@@ -0,0 +1,145 @@
import json
from pprint import pprint
import asyncio

import aiohttp
import llama_cpp

import llama_cpp.client as llama_cpp_client
import llama_cpp.client.aio as llama_cpp_client_aio
import llama_cpp.client.models as llama_cpp_models

LLAMA_SERVER_BASE_URL = "http://localhost:8000"


async def main():
    # Create client
    client = llama_cpp_client.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    )

    # List models
    models = client.get.models_v1_models_get()
    print("Models:")
    pprint(models.as_dict())

    # Completion (async)
    async with llama_cpp_client_aio.LlamaCppPythonAPI(
        endpoint=LLAMA_SERVER_BASE_URL,
    ) as aio_client:
        completion = await aio_client.create.completion_v1_completions_post(
            body={
                "model": "example",
                "prompt": "The quick brown fox jumps over the lazy dog.",
                "max_tokens": 50,
            }
        )
        print("Completion:")
        pprint(completion.as_dict())

        # Embedding (async)
        # This time we'll use a model for the request instead of an untyped dictionary
        embedding = await aio_client.create.embedding_v1_embeddings_post(
            body=llama_cpp_models.CreateEmbeddingRequest(
                model="example",
                input="The quick brown fox jumps over the lazy dog.",
            )
        )
        print("Embedding:")
        pprint(embedding.as_dict())

        # Chat completion (async)
        chat_completion = (
            await aio_client.create.chat_completion_v1_chat_completions_post(
                body=llama_cpp_models.CreateChatCompletionRequest(
                    model="example",
                    messages=[
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ),
                        llama_cpp_models.ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ),
                    ],
                    temperature=0.5,
                )
            )
        )

        print("Chat completion:")
        pprint(chat_completion.as_dict())

    # Chat completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateChatCompletionRequest(
            model="example",
            messages=[
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="system", content="You are a helpful assistant."
                ),
                llama_cpp_models.ChatCompletionRequestMessage(
                    role="user",
                    content="Tell me the story of the three little pigs in the style of a pirate.",
                ),
            ],
            max_tokens=200,
            temperature=2,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/chat/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                line = line.decode("utf-8")
                if line.startswith("data: "):
                    chunk_str = line[len("data: ") :].strip()
                    if chunk_str == "":
                        continue
                    elif chunk_str == "[DONE]":
                        print("")
                        break
                    else:
                        chunk_json = json.loads(chunk_str)
                        chunk = llama_cpp.ChatCompletionChunk(**chunk_json)
                        delta = chunk["choices"][0]["delta"]
                        if "content" in delta:
                            print(delta["content"], end="")

    # Completion (streaming, currently can't use client)
    async with aiohttp.ClientSession() as session:
        body = llama_cpp_models.CreateCompletionRequest(
            model="example",
            prompt="""### Human: recipe for stuffed mushrooms.
### Assistant:""",
            max_tokens=1500,
            temperature=1,
            top_p=0.55,
            top_k=33,
            stream=True,
        )
        async with session.post(
            f"{LLAMA_SERVER_BASE_URL}/v1/completions", json=body.serialize()
        ) as response:
            async for line in response.content:
                # This sure seems like the wrong way to do this...
                line = line.decode("utf-8")
                if line.startswith("data: {"):
                    chunk_str = line[len("data: ") :].strip()

                    if chunk_str == "":
                        continue
                    elif chunk_str == "[DONE]":
                        print("")
                        break
                    else:
                        chunk_json = json.loads(chunk_str)
                        chunk = llama_cpp.CompletionChunk(**chunk_json)
                        text = chunk["choices"][0]["text"]
                        print(text, end="")
    print("done!")


if __name__ == "__main__":
    asyncio.run(main())
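The two streaming loops above duplicate the same "data: ..." / "[DONE]" handling. Until the generated client can stream, that parsing could be factored into a small helper; a sketch (the iter_sse_json name and shape are hypothetical, not part of this PR):

```python
# Sketch only: shared SSE parsing for the two streaming examples above.
import json
from typing import AsyncIterator

import aiohttp


async def iter_sse_json(content: aiohttp.StreamReader) -> AsyncIterator[dict]:
    """Yield each JSON payload from a "data: ..." server-sent-events stream."""
    async for raw_line in content:
        line = raw_line.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue
        payload = line[len("data: "):].strip()
        if not payload:
            continue
        if payload == "[DONE]":
            break
        yield json.loads(payload)
```

Each streaming block could then reduce to `async for chunk_json in iter_sse_json(response.content): ...` with the chunk handling unchanged.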
24 changes: 24 additions & 0 deletions llama_cpp/client/__init__.py
@@ -0,0 +1,24 @@
# coding=utf-8
# --------------------------------------------------------------------------
# Code generated by Microsoft (R) AutoRest Code Generator (autorest: 3.9.5, generator: @autorest/python@6.4.15)
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
# --------------------------------------------------------------------------

from ._client import LlamaCppPythonAPI
from ._version import VERSION

__version__ = VERSION

try:
    from ._patch import *  # pylint: disable=unused-wildcard-import
    from ._patch import __all__ as _patch_all
except ImportError:
    _patch_all = []
from ._patch import patch_sdk as _patch_sdk

__all__ = [
    "LlamaCppPythonAPI",
]
__all__.extend([p for p in _patch_all if p not in __all__])

_patch_sdk()