Merge branch 'main' of https://github.com/abetlen/llama-cpp-python · coderonion/llama-cpp-python@e4c6f34

Commit e4c6f34

Author: Mug · committed · 2 parents c862e8b + 6de2f24 · commit e4c6f34

19 files changed · +6,212 −123 lines changed

.github/workflows/test.yaml

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+name: Tests
+
+on:
+  push:
+    branches:
+    - main
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        submodules: "true"
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip pytest cmake scikit-build
+        python3 setup.py develop
+    - name: Test with pytest
+      run: |
+        pytest
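
For context, a minimal test that this workflow would collect might look like the sketch below; the file name and assertion are illustrative assumptions, not part of this commit.

```python
# tests/test_import.py — hypothetical smoke test, not part of this commit.
import llama_cpp


def test_package_exposes_high_level_api():
    # The package should import cleanly and expose the high-level Llama class.
    assert hasattr(llama_cpp, "Llama")
```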

README.md

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 # 🦙 Python Bindings for `llama.cpp`
 
 [![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python)
+[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
 [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
@@ -70,7 +71,7 @@ python3 setup.py develop
 
 # How does this compare to other Python bindings of `llama.cpp`?
 
-I wrote this package for my own use, I had two goals in mind:
+I originally wrote this package for my own use with two goals in mind:
 
 - Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python
 - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp`

docs/index.md

Lines changed: 2 additions & 0 deletions
@@ -71,8 +71,10 @@ python3 setup.py develop
 - sample
 - generate
 - create_embedding
+- embed
 - create_completion
 - __call__
+- create_chat_completion
 - token_bos
 - token_eos
 show_root_heading: true
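
Both newly documented methods belong to the high-level `Llama` class. A hedged usage sketch (the model path is a placeholder, not part of this commit):

```python
# Illustrative only — the model path is a placeholder.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model.bin", embedding=True)

# embed: turn a string into a list of floats (requires embedding=True).
vector = llm.embed("Hello, world!")

# create_chat_completion: OpenAI-style chat API on top of the local model.
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
)
print(chat["choices"][0]["message"]["content"])
```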

examples/fastapi_server.py

Lines changed: 0 additions & 97 deletions — this file was deleted.
The replacement FastAPI server example below adds 181 lines (181 additions & 0 deletions):
@@ -0,0 +1,181 @@
+"""Example FastAPI server for llama.cpp.
+
+To run this example:
+
+```bash
+pip install fastapi uvicorn sse-starlette
+export MODEL=../models/7B/...
+uvicorn fastapi_server_chat:app --reload
+```
+
+Then visit http://localhost:8000/docs to see the interactive API docs.
+
+"""
+import os
+import json
+from typing import List, Optional, Literal, Union, Iterator
+
+import llama_cpp
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
+from sse_starlette.sse import EventSourceResponse
+
+
+class Settings(BaseSettings):
+    model: str
+    n_ctx: int = 2048
+    n_batch: int = 2048
+    n_threads: int = os.cpu_count() or 1
+    f16_kv: bool = True
+    use_mlock: bool = True
+    embedding: bool = True
+    last_n_tokens_size: int = 64
+
+
+app = FastAPI(
+    title="🦙 llama.cpp Python API",
+    version="0.0.1",
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+settings = Settings()
+llama = llama_cpp.Llama(
+    settings.model,
+    f16_kv=settings.f16_kv,
+    use_mlock=settings.use_mlock,
+    embedding=settings.embedding,
+    n_threads=settings.n_threads,
+    n_batch=settings.n_batch,
+    n_ctx=settings.n_ctx,
+    last_n_tokens_size=settings.last_n_tokens_size,
+)
+
+
+class CreateCompletionRequest(BaseModel):
+    prompt: str
+    suffix: Optional[str] = Field(None)
+    max_tokens: int = 16
+    temperature: float = 0.8
+    top_p: float = 0.95
+    logprobs: Optional[int] = Field(None)
+    echo: bool = False
+    stop: List[str] = []
+    repeat_penalty: float = 1.1
+    top_k: int = 40
+    stream: bool = False
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                "stop": ["\n", "###"],
+            }
+        }
+
+
+CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
+
+
+@app.post(
+    "/v1/completions",
+    response_model=CreateCompletionResponse,
+)
+def create_completion(request: CreateCompletionRequest):
+    if request.stream:
+        chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore
+        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
+    return llama(**request.dict())
+
+
+class CreateEmbeddingRequest(BaseModel):
+    model: Optional[str]
+    input: str
+    user: Optional[str]
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "input": "The food was delicious and the waiter...",
+            }
+        }
+
+
+CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
+
+
+@app.post(
+    "/v1/embeddings",
+    response_model=CreateEmbeddingResponse,
+)
+def create_embedding(request: CreateEmbeddingRequest):
+    return llama.create_embedding(**request.dict(exclude={"model", "user"}))
+
+
+class ChatCompletionRequestMessage(BaseModel):
+    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
+    content: str
+    user: Optional[str] = None
+
+
+class CreateChatCompletionRequest(BaseModel):
+    model: Optional[str]
+    messages: List[ChatCompletionRequestMessage]
+    temperature: float = 0.8
+    top_p: float = 0.95
+    stream: bool = False
+    stop: List[str] = []
+    max_tokens: int = 128
+    repeat_penalty: float = 1.1
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "messages": [
+                    ChatCompletionRequestMessage(
+                        role="system", content="You are a helpful assistant."
+                    ),
+                    ChatCompletionRequestMessage(
+                        role="user", content="What is the capital of France?"
+                    ),
+                ]
+            }
+        }
+
+
+CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)
+
+
+@app.post(
+    "/v1/chat/completions",
+    response_model=CreateChatCompletionResponse,
+)
+async def create_chat_completion(
+    request: CreateChatCompletionRequest,
+) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
+    completion_or_chunks = llama.create_chat_completion(
+        **request.dict(exclude={"model"}),
+    )
+
+    if request.stream:
+
+        async def server_sent_events(
+            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
+        ):
+            for chat_chunk in chat_chunks:
+                yield dict(data=json.dumps(chat_chunk))
+            yield dict(data="[DONE]")
+
+        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore
+
+        return EventSourceResponse(
+            server_sent_events(chunks),
+        )
+    completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore
+    return completion
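
For reference, a minimal client for the `/v1/completions` route above might look like this sketch; it assumes the server is running locally on port 8000 and that the `requests` package is installed (neither is part of this commit):

```python
# Hypothetical client for the example server above.
import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
        "stop": ["\n", "###"],
        "max_tokens": 32,
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```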

examples/high_level_api_inference.py renamed to examples/high_level_api/high_level_api_inference.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 
 output = llm(
     "Question: What are the names of the planets in the solar system? Answer: ",
-    max_tokens=1,
+    max_tokens=48,
     stop=["Q:", "\n"],
     echo=True,
 )
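
The call above returns an OpenAI-style completion object; a hedged sketch of reading it back (field names as exposed by the high-level API, shown for illustration only):

```python
# Illustrative: pull the generated text and token accounting out of the result.
print(output["choices"][0]["text"])  # the generated answer
print(output["usage"])               # prompt/completion/total token counts
```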

examples/high_level_api_streaming.py renamed to examples/high_level_api/high_level_api_streaming.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from llama_cpp import Llama
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-m", "--model", type=str, default=".//models/...")
+parser.add_argument("-m", "--model", type=str, default="./models/...")
 args = parser.parse_args()
 
 llm = Llama(model_path=args.model)
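
Streaming with the high-level API follows the same call shape with `stream=True`; a hedged sketch of the consumption loop (the prompt and limits are placeholders):

```python
# Illustrative streaming usage; values are placeholders.
stream = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=48,
    stop=["Q:", "\n"],
    stream=True,
)
for chunk in stream:
    # Each chunk is a partial completion dict; print new text as it arrives.
    print(chunk["choices"][0]["text"], end="", flush=True)
```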

examples/low_level_api/quantize.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+import os
+import argparse
+
+import llama_cpp
+
+
+def main(args):
+    # Validate the paths before handing them to the C API.
+    if not os.path.exists(args.fname_inp):
+        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
+    if os.path.exists(args.fname_out):
+        raise RuntimeError(f"Output file already exists ({args.fname_out})")
+    # The low-level binding expects byte strings.
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
+    itype = args.type
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
+    if return_code != 0:
+        raise RuntimeError("Failed to quantize model")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("fname_inp", type=str, help="Path to input model")
+    parser.add_argument("fname_out", type=str, help="Path to output model")
+    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
+    args = parser.parse_args()
+    main(args)
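
The script wraps a single low-level binding call; a sketch of driving it directly from Python (the model paths below are placeholders, not part of this commit):

```python
# Hypothetical direct use of the binding the script wraps; paths are placeholders.
import llama_cpp

return_code = llama_cpp.llama_model_quantize(
    b"./models/7B/ggml-model-f16.bin",   # input model
    b"./models/7B/ggml-model-q4_0.bin",  # output model (quantized)
    2,                                   # itype — 2: q4_0, 3: q4_1
)
if return_code != 0:
    raise RuntimeError("Failed to quantize model")
```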

0 commit comments
