Merge branch 'main' of https://github.com/abetlen/llama-cpp-python · coderonion/llama-cpp-python@e4c6f34

Commit e4c6f34

Author: Mug · committed · 2 parents c862e8b + 6de2f24 · commit e4c6f34

19 files changed · +6,212 −123 lines changed

.github/workflows/test.yaml

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+name: Tests
+
+on:
+  push:
+    branches:
+    - main
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        submodules: "true"
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip pytest cmake scikit-build
+        python3 setup.py develop
+    - name: Test with pytest
+      run: |
+        pytest
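
For context, a minimal test that this workflow would collect might look like the sketch below; the file name and assertion are illustrative assumptions, not part of this commit.

```python
# tests/test_import.py — hypothetical smoke test, not part of this commit.
import llama_cpp


def test_package_exposes_high_level_api():
    # The package should import cleanly and expose the high-level Llama class.
    assert hasattr(llama_cpp, "Llama")
```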

README.md

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 # 🦙 Python Bindings for `llama.cpp`
 
 [![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python)
+[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
 [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
@@ -70,7 +71,7 @@ python3 setup.py develop
 
 # How does this compare to other Python bindings of `llama.cpp`?
 
-I wrote this package for my own use, I had two goals in mind:
+I originally wrote this package for my own use with two goals in mind:
 
 - Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python
 - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp`

docs/index.md

Lines changed: 2 additions & 0 deletions
@@ -71,8 +71,10 @@ python3 setup.py develop
 - sample
 - generate
 - create_embedding
+- embed
 - create_completion
 - __call__
+- create_chat_completion
 - token_bos
 - token_eos
 show_root_heading: true
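
Both newly documented methods belong to the high-level `Llama` class. A hedged usage sketch (the model path is a placeholder, not part of this commit):

```python
# Illustrative only — the model path is a placeholder.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model.bin", embedding=True)

# embed: turn a string into a list of floats (requires embedding=True).
vector = llm.embed("Hello, world!")

# create_chat_completion: OpenAI-style chat API on top of the local model.
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
)
print(chat["choices"][0]["message"]["content"])
```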

examples/fastapi_server.py

Lines changed: 0 additions & 97 deletions — this file was deleted.
The replacement FastAPI server example below adds 181 lines (181 additions & 0 deletions):
@@ -0,0 +1,181 @@
+"""Example FastAPI server for llama.cpp.
+
+To run this example:
+
+```bash
+pip install fastapi uvicorn sse-starlette
+export MODEL=../models/7B/...
+uvicorn fastapi_server_chat:app --reload
+```
+
+Then visit http://localhost:8000/docs to see the interactive API docs.
+
+"""
+import os
+import json
+from typing import List, Optional, Literal, Union, Iterator
+
+import llama_cpp
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
+from sse_starlette.sse import EventSourceResponse
+
+
+class Settings(BaseSettings):
+    model: str
+    n_ctx: int = 2048
+    n_batch: int = 2048
+    n_threads: int = os.cpu_count() or 1
+    f16_kv: bool = True
+    use_mlock: bool = True
+    embedding: bool = True
+    last_n_tokens_size: int = 64
+
+
+app = FastAPI(
+    title="🦙 llama.cpp Python API",
+    version="0.0.1",
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+settings = Settings()
+llama = llama_cpp.Llama(
+    settings.model,
+    f16_kv=settings.f16_kv,
+    use_mlock=settings.use_mlock,
+    embedding=settings.embedding,
+    n_threads=settings.n_threads,
+    n_batch=settings.n_batch,
+    n_ctx=settings.n_ctx,
+    last_n_tokens_size=settings.last_n_tokens_size,
+)
+
+
+class CreateCompletionRequest(BaseModel):
+    prompt: str
+    suffix: Optional[str] = Field(None)
+    max_tokens: int = 16
+    temperature: float = 0.8
+    top_p: float = 0.95
+    logprobs: Optional[int] = Field(None)
+    echo: bool = False
+    stop: List[str] = []
+    repeat_penalty: float = 1.1
+    top_k: int = 40
+    stream: bool = False
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                "stop": ["\n", "###"],
+            }
+        }
+
+
+CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
+
+
+@app.post(
+    "/v1/completions",
+    response_model=CreateCompletionResponse,
+)
+def create_completion(request: CreateCompletionRequest):
+    if request.stream:
+        chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore
+        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
+    return llama(**request.dict())
+
+
+class CreateEmbeddingRequest(BaseModel):
+    model: Optional[str]
+    input: str
+    user: Optional[str]
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "input": "The food was delicious and the waiter...",
+            }
+        }
+
+
+CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
+
+
+@app.post(
+    "/v1/embeddings",
+    response_model=CreateEmbeddingResponse,
+)
+def create_embedding(request: CreateEmbeddingRequest):
+    return llama.create_embedding(**request.dict(exclude={"model", "user"}))
+
+
+class ChatCompletionRequestMessage(BaseModel):
+    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
+    content: str
+    user: Optional[str] = None
+
+
+class CreateChatCompletionRequest(BaseModel):
+    model: Optional[str]
+    messages: List[ChatCompletionRequestMessage]
+    temperature: float = 0.8
+    top_p: float = 0.95
+    stream: bool = False
+    stop: List[str] = []
+    max_tokens: int = 128
+    repeat_penalty: float = 1.1
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "messages": [
+                    ChatCompletionRequestMessage(
+                        role="system", content="You are a helpful assistant."
+                    ),
+                    ChatCompletionRequestMessage(
+                        role="user", content="What is the capital of France?"
+                    ),
+                ]
+            }
+        }
+
+
+CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)
+
+
+@app.post(
+    "/v1/chat/completions",
+    response_model=CreateChatCompletionResponse,
+)
+async def create_chat_completion(
+    request: CreateChatCompletionRequest,
+) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
+    completion_or_chunks = llama.create_chat_completion(
+        **request.dict(exclude={"model"}),
+    )
+
+    if request.stream:
+
+        async def server_sent_events(
+            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
+        ):
+            for chat_chunk in chat_chunks:
+                yield dict(data=json.dumps(chat_chunk))
+            yield dict(data="[DONE]")
+
+        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore
+
+        return EventSourceResponse(
+            server_sent_events(chunks),
+        )
+    completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore
+    return completion
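
For reference, a minimal client for the `/v1/completions` route above might look like this sketch; it assumes the server is running locally on port 8000 and that the `requests` package is installed (neither is part of this commit):

```python
# Hypothetical client for the example server above.
import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
        "stop": ["\n", "###"],
        "max_tokens": 32,
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```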

examples/high_level_api_inference.py renamed to examples/high_level_api/high_level_api_inference.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 
 output = llm(
     "Question: What are the names of the planets in the solar system? Answer: ",
-    max_tokens=1,
+    max_tokens=48,
     stop=["Q:", "\n"],
     echo=True,
 )
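
The call above returns an OpenAI-style completion object; a hedged sketch of reading it back (field names as exposed by the high-level API, shown for illustration only):

```python
# Illustrative: pull the generated text and token accounting out of the result.
print(output["choices"][0]["text"])  # the generated answer
print(output["usage"])               # prompt/completion/total token counts
```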

examples/high_level_api_streaming.py renamed to examples/high_level_api/high_level_api_streaming.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from llama_cpp import Llama
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-m", "--model", type=str, default=".//models/...")
+parser.add_argument("-m", "--model", type=str, default="./models/...")
 args = parser.parse_args()
 
 llm = Llama(model_path=args.model)
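
Streaming with the high-level API follows the same call shape with `stream=True`; a hedged sketch of the consumption loop (the prompt and limits are placeholders):

```python
# Illustrative streaming usage; values are placeholders.
stream = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
)
for chunk in stream:
    # Each chunk is a partial completion dict; print new text as it arrives.
    print(chunk["choices"][0]["text"], end="", flush=True)
```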

examples/low_level_api/quantize.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+import os
+import argparse
+
+import llama_cpp
+
+
+def main(args):
+    # Validate the paths before handing them to the C API.
+    if not os.path.exists(args.fname_inp):
+        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
+    if os.path.exists(args.fname_out):
+        raise RuntimeError(f"Output file already exists ({args.fname_out})")
+    # The low-level binding expects byte strings.
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
+    itype = args.type
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
+    if return_code != 0:
+        raise RuntimeError("Failed to quantize model")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("fname_inp", type=str, help="Path to input model")
+    parser.add_argument("fname_out", type=str, help="Path to output model")
+    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
+    args = parser.parse_args()
+    main(args)
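
The script wraps a single low-level binding call; a sketch of driving it directly from Python (the model paths below are placeholders, not part of this commit):

```python
# Hypothetical direct use of the binding the script wraps; paths are placeholders.
import llama_cpp

return_code = llama_cpp.llama_model_quantize(
    b"./models/7B/ggml-model-f16.bin",   # input model
    b"./models/7B/ggml-model-q4_0.bin",  # output model (quantized)
    2,                                   # itype — 2: q4_0, 3: q4_1
)
if return_code != 0:
    raise RuntimeError("Failed to quantize model")
```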

0 commit comments
