diff --git a/.gitignore b/.gitignore
index 36ed7f7fd..2e701b11d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,9 @@
 .vscode/
-
+*.swp
 _skbuild/
+nohup.out
+
 
 .envrc
 
 models/
diff --git a/build.bat b/build.bat
new file mode 100644
index 000000000..5a04d4d98
--- /dev/null
+++ b/build.bat
@@ -0,0 +1,5 @@
+
+set CMAKE_ARGS=-DLLAMA_CUBLAS=on -DLLAMA_CUDA_DMMV_F16=on -DLLAMA_NATIVE=on -DLLAMA_AVX512=on -DLLAMA_AVX512_VBMI=on -DLLAMA_AVX512_VNNI=on
+set FORCE_CMAKE=1
+python setup.py bdist_wheel
+
diff --git a/build.sh b/build.sh
new file mode 100755
index 000000000..697243be1
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+rm -rf dist _skbuild
+
+CMAKE_ARGS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_DMMV_F16=on -DLLAMA_NATIVE=on -DLLAMA_LTO=on -DLLAMA_AVX512=on -DLLAMA_AVX512_VBMI=on -DLLAMA_AVX512_VNNI=on" FORCE_CMAKE=1 python setup.py bdist_wheel
+
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index ef319c7e0..ee5202273 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -1,5 +1,6 @@
 import json
 import multiprocessing
+import redis
 from threading import Lock
 from functools import partial
 from typing import Iterator, List, Optional, Union, Dict
@@ -85,13 +86,22 @@ class Settings(BaseSettings):
     port: int = Field(
         default=8000, description="Listen port"
     )
+    redishost: str = Field(
+        default="None", description="Redis server address"
+    )
+    redisport: int = Field(
+        default=6379, description="Redis server port"
+    )
+    redisdb: int = Field(
+        default=0, description="Redis server db"
+    )
 
 
 router = APIRouter()
 
 settings: Optional[Settings] = None
 llama: Optional[llama_cpp.Llama] = None
-
+rediscon: Optional[redis.StrictRedis] = None
 
 def create_app(settings: Optional[Settings] = None):
     if settings is None:
@@ -108,6 +118,15 @@ def create_app(settings: Optional[Settings] = None):
         allow_headers=["*"],
     )
     app.include_router(router)
+
+    if settings.redishost != 'None':
+        global rediscon
+        try:
+            rediscon = redis.StrictRedis(host=settings.redishost, port=settings.redisport, db=settings.redisdb)
+            print(rediscon)
+        except Exception as e:
+            print(e)
+
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
@@ -478,6 +497,8 @@ async def create_chat_completion(
         }
 
     kwargs = body.dict(exclude=exclude)
+    log = kwargs.copy()
+
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
             make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
@@ -490,11 +511,26 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
             async with inner_send_chan:
                 try:
                     iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs)  # type: ignore
+                    streamRole = ''
+                    streamContent = ''
                     async for chat_chunk in iterate_in_threadpool(iterator):
                         await inner_send_chan.send(dict(data=json.dumps(chat_chunk)))
+
+                        if 'role' in chat_chunk['choices'][0]['delta']:
+                            streamRole = chat_chunk['choices'][0]['delta']['role']
+                        if 'content' in chat_chunk['choices'][0]['delta']:
+                            streamContent += chat_chunk['choices'][0]['delta']['content']
+
                         if await request.is_disconnected():
                             raise anyio.get_cancelled_exc_class()()
                     await inner_send_chan.send(dict(data="[DONE]"))
+                    log['messages'].append({'role':streamRole, 'content':streamContent})
+
+                    #print(json.dumps(log,indent=4))
+                    if rediscon is not None:
+                        logstr = json.dumps(log)
+                        rediscon.rpush('llama.cpp', logstr)
+
                 except anyio.get_cancelled_exc_class() as e:
                     print("disconnected")
                     with anyio.move_on_after(1, shield=True):
@@ -512,6 +548,20 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
     completion: llama_cpp.ChatCompletion = await run_in_threadpool(
         llama.create_chat_completion, **kwargs  # type: ignore
     )
+    #print(json.dumps(completion,indent=4))
+
+    messageRole = ''
+    messageContent = ''
+    if 'role' in completion['choices'][0]['message']:
+        messageRole = completion['choices'][0]['message']['role']
+    if 'content' in completion['choices'][0]['message']:
+        messageContent = completion['choices'][0]['message']['content']
+    log['messages'].append({'role':messageRole, 'content':messageContent})
+
+    #print(json.dumps(log,indent=4))
+    if rediscon is not None:
+        logstr = json.dumps(log)
+        rediscon.rpush('llama.cpp', logstr)
 
     return completion
 
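The app.py hunks above push one JSON record per chat completion onto the Redis list 'llama.cpp', with the assistant reply appended to the request's 'messages'. A minimal sketch, not part of the patch, for peeking at the queued records; it assumes Redis is reachable at the server defaults (127.0.0.1:6379, db 0) and uses lrange so nothing is consumed:

    #!/usr/bin/env python3
    # Sketch only: inspect records queued by llama_cpp/server/app.py.
    import json

    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)

    # lrange peeks at the whole 'llama.cpp' list without popping anything.
    for raw in r.lrange('llama.cpp', 0, -1):
        record = json.loads(raw)
        # app.py appends the assistant reply as the final message.
        last = record['messages'][-1]
        print(f"{last['role']}: {last['content'][:80]}")

The log/llama.log.py script added below is the durable counterpart: a blocking blpop loop that drains the same list into a timestamped log file.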
diff --git a/log/llama.log.py b/log/llama.log.py
new file mode 100755
index 000000000..71342dd75
--- /dev/null
+++ b/log/llama.log.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+import redis
+import json
+import os
+from datetime import datetime
+
+logDir = '/var/log/llama.cpp'
+logJsonName = 'llama.log'
+logJsonFile = os.path.join(logDir, logJsonName)
+
+if not os.path.exists(logDir):
+    os.makedirs(logDir)
+
+rediscon = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
+print(rediscon)
+
+with open(logJsonFile, 'a') as fJson:
+    while True:
+        logDataStr = rediscon.blpop('llama.cpp')[1].decode('utf8')
+        ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
+        logDict = {'time':ts, 'log':json.loads(logDataStr)}
+        logJsonStr = json.dumps(logDict, indent=4, ensure_ascii=False)
+        fJson.write(logJsonStr + '\n')
+        fJson.write('='*50 + '\n')
+        fJson.flush()
+
diff --git a/setup.py b/setup.py
index 32101eb07..be10005ec 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
     packages=["llama_cpp", "llama_cpp.server"],
     install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
     extras_require={
-        "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"],
+        "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3", "redis[hiredis]>=4.1.0"],
     },
     python_requires=">=3.7",
     classifiers=[
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
deleted file mode 160000
index 7f0e9a775..000000000
--- a/vendor/llama.cpp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
new file mode 120000
index 000000000..af152f6d7
--- /dev/null
+++ b/vendor/llama.cpp
@@ -0,0 +1 @@
+../../llama.cpp/
\ No newline at end of file
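Taken together, the pieces can be exercised end to end once the server is running with redishost set. A sketch under stated assumptions: the server listens on the default 127.0.0.1:8000, the chat route is the usual /v1/chat/completions exposed by llama_cpp.server (not shown in this diff), the log consumer is stopped so the record stays queued, and the requests package is installed (it is not a dependency of this project):

    #!/usr/bin/env python3
    # Sketch only: one request through the server, then pop its log record.
    import json

    import redis
    import requests  # assumed available; not a dependency of this repo

    resp = requests.post(
        'http://127.0.0.1:8000/v1/chat/completions',
        json={'messages': [{'role': 'user', 'content': 'Say hi.'}]},
    )
    resp.raise_for_status()

    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    _, raw = r.blpop('llama.cpp')  # blocks until app.py pushes the record
    print(json.dumps(json.loads(raw), indent=4, ensure_ascii=False))

Since Settings is a pydantic BaseSettings, the new redishost, redisport, and redisdb fields should also be settable from the environment, the same way the existing host and port fields are.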