From 6dfca53adb3af4ea35d2f5f86945e6f3729d7974 Mon Sep 17 00:00:00 2001 From: carmonajca Date: Tue, 9 May 2023 18:13:44 +0200 Subject: [PATCH 01/35] Fixed errors and changed prompt --- .../low_level_api/low_level_api_chat_cpp.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 8773cb1e3..1efd68b6e 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -16,8 +16,8 @@ from os import cpu_count, path import llama_cpp -from common import GptParams, gpt_params_parse, gpt_random_prompt -import util +from llama_cpp_python.examples.low_level_api.common import GptParams, gpt_params_parse, gpt_random_prompt +import llama_cpp_python.examples.low_level_api.util as util # A LLaMA interactive session class LLaMAInteract: @@ -368,10 +368,10 @@ def generate(self): id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, 1) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), 1) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), 1) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), 1) llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) @@ -444,7 +444,7 @@ def generate(self): if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): # If we arent in instruction mode, fix the current generation by appending the antiprompt. # Makes it so if chat ends prematurely you dont append the AI's text etc. 
- if not self.params.instruct: + if not self.params.instruct and self.first_antiprompt: self.embd_inp += self.first_antiprompt[0] self.n_remain = self.params.n_predict break @@ -480,6 +480,10 @@ def output(self): for id in self.generate(): cur_char = llama_cpp.llama_token_to_str(self.ctx, id) + # Stop the inference after a new line character + if cur_char == b"\n": + break + # Add remainder of missing bytes if None in self.multibyte_fix: self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char @@ -493,7 +497,7 @@ def output(self): # Contains multi-byte UTF8 for num, pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check - if pattern & int.from_bytes(cur_char) == pattern: + if pattern & int.from_bytes(cur_char, byteorder='big') == pattern: self.multibyte_fix = [cur_char] + ([None] * (num-1)) # Stop incomplete bytes from passing @@ -518,7 +522,7 @@ def interact(self): while self.params.interactive: self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): - print('\n> ', end="") + print('\n$ ', end="") self.input(self.read_input()) else: print(self.params.input_prefix, end="") From 5e5dad9e111a8115638a2a584e696d5f77dae5c9 Mon Sep 17 00:00:00 2001 From: carmonajca Date: Wed, 10 May 2023 10:22:26 +0200 Subject: [PATCH 02/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1b0fd4546..e6a46b0ed 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 +Subproject commit e6a46b0ed1884c77267dc70693183e3b7164e0e0 From 18cf97e0a4f413d37d635ae62684dadc23ffb9cf Mon Sep 17 00:00:00 2001 From: carmonajca Date: Wed, 10 May 2023 17:48:23 +0200 Subject: [PATCH 03/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e6a46b0ed..cf348a60e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e6a46b0ed1884c77267dc70693183e3b7164e0e0 +Subproject commit cf348a60e0af3905acd1d297cb064b918265b7ac From 3f8c5d55fe96c2d0c6f80d0151a34abb56ee6738 Mon Sep 17 00:00:00 2001 From: carmona Date: Thu, 11 May 2023 11:32:27 +0000 Subject: [PATCH 04/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1b0fd4546..cf348a60e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 +Subproject commit cf348a60e0af3905acd1d297cb064b918265b7ac From 2e68bbd245c911f1894553c7ff7e4712ae580c3a Mon Sep 17 00:00:00 2001 From: carmona Date: Thu, 11 May 2023 12:57:56 +0000 Subject: [PATCH 05/35] Updated low_level_api_chat for double \n --- examples/low_level_api/low_level_api_chat_cpp.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 1efd68b6e..28e6886ee 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -477,12 +477,17 @@ def input(self, prompt: str): # write output def output(self): self.remaining_tokens = self.params.n_predict + flag = False for id in self.generate(): cur_char = llama_cpp.llama_token_to_str(self.ctx, id) - # Stop the inference after a new line character + # Stop the inference after 2 consecutive new line characters if cur_char == b"\n": - break 
+ if flag: + break + flag = True + else: + flag = False # Add remainder of missing bytes if None in self.multibyte_fix: @@ -522,7 +527,7 @@ def interact(self): while self.params.interactive: self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): - print('\n$ ', end="") + print('\n$ ', end="",flush=True) self.input(self.read_input()) else: print(self.params.input_prefix, end="") From a6b1012e411f23fccb45e2e4fbbadb20e50f81ea Mon Sep 17 00:00:00 2001 From: carmona Date: Fri, 12 May 2023 10:27:54 +0000 Subject: [PATCH 06/35] Removed on-line checking of "\n\n" in llama_cpp --- examples/low_level_api/low_level_api_chat_cpp.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 28e6886ee..be2cf989e 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -477,18 +477,9 @@ def input(self, prompt: str): # write output def output(self): self.remaining_tokens = self.params.n_predict - flag = False for id in self.generate(): cur_char = llama_cpp.llama_token_to_str(self.ctx, id) - # Stop the inference after 2 consecutive new line characters - if cur_char == b"\n": - if flag: - break - flag = True - else: - flag = False - # Add remainder of missing bytes if None in self.multibyte_fix: self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char From f2c4e24ea68d8bb8b1af46e9f88b8e877ab9efa9 Mon Sep 17 00:00:00 2001 From: carmona Date: Fri, 12 May 2023 10:28:23 +0000 Subject: [PATCH 07/35] Updated llama_cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index cf348a60e..089b1c93b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit cf348a60e0af3905acd1d297cb064b918265b7ac +Subproject commit 089b1c93ba2b93bc9a605af293730a028fad2c4e From b4654cc4c40483c7b2bde66270e92fe59bef2e24 Mon Sep 17 00:00:00 2001 From: carmona Date: Fri, 12 May 2023 15:59:41 +0000 Subject: [PATCH 08/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 089b1c93b..773ee249f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 089b1c93ba2b93bc9a605af293730a028fad2c4e +Subproject commit 773ee249fb6c14f791ac39f6ec05536f40dedc54 From 1dee6c314e88abc380ad653395a6b1d21e7470ef Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 15 May 2023 15:01:13 +0000 Subject: [PATCH 09/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 773ee249f..53ea6ccb8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 773ee249fb6c14f791ac39f6ec05536f40dedc54 +Subproject commit 53ea6ccb8dc3996a1c1d1dac547bf46f66999422 From c0c57c62bb346d65a182304b6bb3ae71fa9d5379 Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 15 May 2023 15:06:01 +0000 Subject: [PATCH 10/35] Fixed llama.cpp commit --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 53ea6ccb8..63d20469b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 53ea6ccb8dc3996a1c1d1dac547bf46f66999422 +Subproject commit 63d20469b85467c5729cc9a97bd44cc3da63423f From 2001df0890f7f923366c734dec990db570de6a2d Mon Sep 17 00:00:00 2001 From: carmona Date: Tue, 16 May 2023 
16:30:39 +0000 Subject: [PATCH 11/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 63d20469b..956065540 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 63d20469b85467c5729cc9a97bd44cc3da63423f +Subproject commit 9560655409dc80771a9b19e838ff47c5c1df6483 From 6a3e546c228f81b06d4e3bf2d4eb7f94fb8eb903 Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 17 May 2023 08:38:28 +0000 Subject: [PATCH 12/35] Added quantize.sh to automate quantization --- quantize.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 quantize.sh diff --git a/quantize.sh b/quantize.sh new file mode 100644 index 000000000..ee4d89e07 --- /dev/null +++ b/quantize.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +MAIN_DIR="/mnt/data/LLM_models/llama-cpp/" + +cd $(pwd)"/vendor/llama.cpp" +echo $(pwd) +make quantize +for dir in $(find $MAIN_DIR/* -type d); do + echo $dir + if [ -f "${dir}/ggml-model-f16.bin" ]; then + rm -f ${dir}/*q4_0.bin + ./quantize ${dir}/ggml-model-f16.bin q4_0 30 + else + echo "The file 'ggml-model-f16.bin' does not exist in ${dir}" + fi +done From 9a924940d71e664f84e9c272fbee19d8764194da Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 17 May 2023 08:39:18 +0000 Subject: [PATCH 13/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 956065540..2b2646931 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9560655409dc80771a9b19e838ff47c5c1df6483 +Subproject commit 2b2646931bd2a2eb3e21c6f3733cc0e090b2e24b From 15c32ff2af676e116992018da8ea5927665f3915 Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 17 May 2023 08:48:06 +0000 Subject: [PATCH 14/35] Assess the path to the llama.cpp folder --- quantize.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/quantize.sh b/quantize.sh index ee4d89e07..02aae9cf9 100644 --- a/quantize.sh +++ b/quantize.sh @@ -2,13 +2,15 @@ MAIN_DIR="/mnt/data/LLM_models/llama-cpp/" -cd $(pwd)"/vendor/llama.cpp" +SCRIPT_DIR=$(dirname "$0") + +cd "${SCRIPT_DIR}/vendor/llama.cpp" echo $(pwd) make quantize for dir in $(find $MAIN_DIR/* -type d); do echo $dir if [ -f "${dir}/ggml-model-f16.bin" ]; then - rm -f ${dir}/*q4_0.bin + rm -f ${dir}/*q4_0.bin.* ./quantize ${dir}/ggml-model-f16.bin q4_0 30 else echo "The file 'ggml-model-f16.bin' does not exist in ${dir}" From 54a012b5af8568d93943486d1d32803068ca68c5 Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 17 May 2023 10:51:18 +0000 Subject: [PATCH 15/35] Updated the script for quantization --- quantize.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/quantize.sh b/quantize.sh index 02aae9cf9..336d725ff 100644 --- a/quantize.sh +++ b/quantize.sh @@ -1,12 +1,18 @@ #!/bin/bash -MAIN_DIR="/mnt/data/LLM_models/llama-cpp/" +if [ $# -eq 0 ]; then + echo "Please, provide the main path to the folder where the models (ckpt_dirs) are." + exit 1 +fi +MAIN_DIR="$1" SCRIPT_DIR=$(dirname "$0") cd "${SCRIPT_DIR}/vendor/llama.cpp" echo $(pwd) -make quantize +if [ ! 
-f "quantize" ]; then + make quantize +fi for dir in $(find $MAIN_DIR/* -type d); do echo $dir if [ -f "${dir}/ggml-model-f16.bin" ]; then From 0df288e1b2524a9816f53a1fb87b6f8e7ea39732 Mon Sep 17 00:00:00 2001 From: Carmona <100353437+carmonajca@users.noreply.github.com> Date: Wed, 17 May 2023 22:01:44 +0200 Subject: [PATCH 16/35] Update to current status (#2) * Bugfix: Ensure logs are printed when streaming * Update llama.cpp * Update llama.cpp * Add missing tfs_z paramter * Bump version * Fix docker command * Revert "llama_cpp server: prompt is a string". Closes #187 This reverts commit b9098b0ef7309b63ebff99cdfadf641223c15025. * Only support generating one prompt at a time. * Allow model to tokenize strings longer than context length and set add_bos. Closes #92 * Update llama.cpp * Bump version * Update llama.cpp * Fix obscure Wndows DLL issue. Closes #208 * chore: add note for Mac m1 installation * Add winmode arg only on windows if python version supports it * Bump mkdocs-material from 9.1.11 to 9.1.12 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.11 to 9.1.12. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.11...9.1.12) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] * Update README.md Fix typo. * Fix CMakeLists.txt * Add sampling defaults for generate * Update llama.cpp * Add model_alias option to override model_path in completions. Closes #39 * Update variable name * Update llama.cpp * Fix top_k value. Closes #220 * Fix last_n_tokens_size * Implement penalize_nl * Format * Update token checks * Move docs link up * Fixd CUBLAS dll load issue in Windows * Check for CUDA_PATH before adding --------- Signed-off-by: dependabot[bot] Co-authored-by: Andrei Betlen Co-authored-by: Anchen Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Xiyou Zhou Co-authored-by: Aneesh Joy --- CMakeLists.txt | 2 +- README.md | 12 +++++- llama_cpp/llama.py | 87 +++++++++++++++++++++++++++++++++-------- llama_cpp/llama_cpp.py | 18 ++++++--- llama_cpp/server/app.py | 37 +++++++++++++++--- poetry.lock | 8 ++-- pyproject.toml | 4 +- setup.py | 2 +- 8 files changed, 133 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bda238801..16932b1dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,4 +28,4 @@ else() LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp ) -endif(UNIX) +endif() diff --git a/README.md b/README.md index 9f494f923..17cc28ca9 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ This package provides: - OpenAI-like API - LangChain compatibility +Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,6 +28,12 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. +Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. 
For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. ### Installation with OpenBLAS / cuBLAS / CLBlast @@ -102,7 +110,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: ```bash -docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` ## Low-level API @@ -120,7 +128,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize >>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) >>> llama_cpp.llama_free(ctx) ``` diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7be51e15c..f47f4a4db 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -83,6 +83,7 @@ def __init__( # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, n_parts: int = -1, + n_gpu_layers: int = 0, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -129,6 +130,7 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_parts = n_parts + self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -174,7 +176,9 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: + def tokenize( + self, text: bytes, add_bos: bool = True + ) -> List[llama_cpp.llama_token]: """Tokenize a string. 
Args: @@ -194,10 +198,22 @@ def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: text, tokens, n_ctx, - llama_cpp.c_bool(True), + llama_cpp.c_bool(add_bos), ) if int(n_tokens) < 0: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * int(n_tokens))() + n_tokens = llama_cpp.llama_tokenize( + self.ctx, + text, + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) return list(tokens[:n_tokens]) def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: @@ -275,11 +291,20 @@ def _sample( mirostat_mode: llama_cpp.c_int, mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, ): assert self.ctx is not None assert len(self.eval_logits) > 0 n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + last_n_tokens_size = ( + llama_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) logits = self.eval_logits[-1] + nl_logit = logits[int(Llama.token_nl())] data = (llama_cpp.llama_token_data * n_vocab)( *[ llama_cpp.llama_token_data( @@ -312,6 +337,8 @@ def _sample( alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) + if not penalize_nl: + candidates.data[int(Llama.token_nl())].logit = nl_logit if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -394,6 +421,7 @@ def sample( mirostat_mode: int = 0, mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, + penalize_nl: bool = True, ): """Sample a token from the model. @@ -425,18 +453,20 @@ def sample( mirostat_mode=llama_cpp.c_int(mirostat_mode), mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, ) def generate( self, tokens: Sequence[llama_cpp.llama_token], - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -495,6 +525,7 @@ def generate( repeat_penalty=repeat_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -504,7 +535,7 @@ def generate( if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str) -> Embedding: + def create_embedding(self, input: str, model: Optional[str] = None) -> Embedding: """Embed a string. Args: @@ -514,6 +545,7 @@ def create_embedding(self, input: str) -> Embedding: An embedding object. 
""" assert self.ctx is not None + model_name: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -543,7 +575,7 @@ def create_embedding(self, input: str) -> Embedding: "index": 0, } ], - "model": self.model_path, + "model": model_name, "usage": { "prompt_tokens": n_tokens, "total_tokens": n_tokens, @@ -576,9 +608,11 @@ def _create_completion( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -591,6 +625,7 @@ def _create_completion( text: bytes = b"" returned_characters: int = 0 stop = stop if stop is not None else [] + model_name: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -634,6 +669,7 @@ def _create_completion( top_k=top_k, top_p=top_p, temp=temperature, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -641,7 +677,7 @@ def _create_completion( presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): - if token == llama_cpp.llama_token_eos(): + if token == Llama.token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" break @@ -688,7 +724,7 @@ def _create_completion( "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text[start:].decode("utf-8", errors="ignore"), @@ -709,12 +745,15 @@ def _create_completion( print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + if stream: yield { "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text[returned_characters:].decode( @@ -780,14 +819,11 @@ def _create_completion( "top_logprobs": top_logprobs, } - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) - yield { "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text_str, @@ -818,9 +854,11 @@ def create_completion( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -858,9 +896,11 @@ def create_completion( repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -883,9 +923,11 @@ def __call__( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
@@ -923,9 +965,11 @@ def __call__( repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) def _convert_text_completion_to_chat( @@ -998,9 +1042,11 @@ def create_chat_completion( presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1035,9 +1081,11 @@ def create_chat_completion( repeat_penalty=repeat_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore @@ -1057,6 +1105,7 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_parts=self.params.n_parts, + n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1076,6 +1125,7 @@ def __setstate__(self, state): model_path=state["model_path"], n_ctx=state["n_ctx"], n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -1130,6 +1180,11 @@ def token_bos() -> llama_cpp.llama_token: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + @staticmethod + def token_nl() -> llama_cpp.llama_token: + """Return the newline token.""" + return llama_cpp.llama_token_nl() + @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e60558cff..24ab40a12 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -44,15 +44,20 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"lib")) + cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") @@ -68,7 +73,7 @@ def _load_shared_library(lib_base_name: str): _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = c_int(1) +LLAMA_FILE_VERSION = c_int(2) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" @@ -109,6 +114,7 @@ class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context ("n_parts", c_int), # -1 for default + ("n_gpu_layers", c_int), # number of layers to store in VRAM ("seed", c_int), # RNG seed, 0 for random ("f16_kv", c_bool), # use fp16 for KV cache ( @@ -135,7 +141,7 @@ class llama_context_params(Structure): LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # 
tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors @@ -259,9 +265,9 @@ def llama_get_state_size(ctx: llama_context_p) -> c_size_t: # Destination needs to have allocated enough memory. # Returns the number of bytes copied def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] @@ -350,7 +356,7 @@ def llama_tokenize( tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index be6bac81d..e8f62e8bc 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -16,7 +16,16 @@ class Settings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions." ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." ) @@ -59,6 +68,7 @@ class Settings(BaseSettings): router = APIRouter() +settings: Optional[Settings] = None llama: Optional[llama_cpp.Llama] = None @@ -80,6 +90,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, @@ -95,6 +106,12 @@ def create_app(settings: Optional[Settings] = None): if settings.cache: cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) return app @@ -106,6 +123,10 @@ def get_llama(): yield llama +def get_settings(): + yield settings + + model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( @@ -166,8 +187,9 @@ def get_llama(): description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", ) + class CreateCompletionRequest(BaseModel): - prompt: Optional[str] = Field( + prompt: Union[str, List[str]] = Field( default="", description="The prompt to generate completions for." 
) suffix: Optional[str] = Field( @@ -222,10 +244,13 @@ class Config: def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): + if isinstance(request.prompt, list): + assert len(request.prompt) <= 1 + request.prompt = request.prompt[0] if len(request.prompt) > 0 else "" + completion_or_chunks = llama( **request.dict( exclude={ - "model", "n", "best_of", "logit_bias", @@ -263,7 +288,7 @@ class Config: def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"user"})) class ChatCompletionRequestMessage(BaseModel): @@ -324,7 +349,6 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", "n", "logit_bias", "user", @@ -367,13 +391,16 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) def get_models( + settings: Settings = Depends(get_settings), llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", "data": [ { - "id": llama.model_path, + "id": settings.model_alias + if settings.model_alias is not None + else llama.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/poetry.lock b/poetry.lock index 5474bf4f5..5289b2962 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.11" +version = "9.1.12" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, - {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, + {file = "mkdocs_material-9.1.12-py3-none-any.whl", hash = "sha256:68c57d95d10104179c8c3ce9a88ee9d2322a5145b3d0f1f38ff686253fb5ec98"}, + {file = "mkdocs_material-9.1.12.tar.gz", hash = "sha256:d4ebe9b5031ce63a265c19fb5eab4d27ea4edadb05de206372e831b2b7570fb5"}, ] [package.dependencies] @@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" +content-hash = "d188fc14200f7ee348bef821265d676d584762983bcaf10f90c14221b4ed26a9" diff --git a/pyproject.toml b/pyproject.toml index 9e633722f..6613ee011 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.48" +version = "0.1.50" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.11" +mkdocs-material = "^9.1.12" pytest = "^7.3.1" httpx = "^0.24.0" diff --git a/setup.py b/setup.py index f4cbb60b0..b056ce4cd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.48", + version="0.1.50", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From d175d15be349cedc7bff20b8975979fcf32a55fc Mon Sep 17 00:00:00 2001 From: carmona 
Date: Thu, 18 May 2023 14:19:16 +0000 Subject: [PATCH 17/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2b2646931..dc271c52e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2b2646931bd2a2eb3e21c6f3733cc0e090b2e24b +Subproject commit dc271c52ed65e7c8dfcbaaf84dabb1f788e4f3d0 From 0cfe61ead4e9103af525f6ac497c675d5da4eac9 Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 22 May 2023 08:48:47 +0000 Subject: [PATCH 18/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index dc271c52e..7e4ea5bef 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit dc271c52ed65e7c8dfcbaaf84dabb1f788e4f3d0 +Subproject commit 7e4ea5beff567f53be92f75f9089e6f11fa5dabd From 53f7e126dd543d548432372b672586d00c898277 Mon Sep 17 00:00:00 2001 From: carmona Date: Tue, 23 May 2023 15:52:27 +0000 Subject: [PATCH 19/35] updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7e4ea5bef..2e6cd4b02 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7e4ea5beff567f53be92f75f9089e6f11fa5dabd +Subproject commit 2e6cd4b02549e343bef3768e6b946f999c82e823 From b1015442f5ce7952508314577370694b9f0ba6cf Mon Sep 17 00:00:00 2001 From: carmona Date: Tue, 23 May 2023 16:05:46 +0000 Subject: [PATCH 20/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2e6cd4b02..7d873811f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2e6cd4b02549e343bef3768e6b946f999c82e823 +Subproject commit 7d873811f31d4d8c909015c946a862c0089cda7d From 22b968f2788848967290d83d70fb0174400e7857 Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 24 May 2023 13:29:10 +0000 Subject: [PATCH 21/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7d873811f..ac7876ac2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7d873811f31d4d8c909015c946a862c0089cda7d +Subproject commit ac7876ac20124a15a44fd6317721ff1aa2538806 From 7219d5ce09b136f80425b888d14a74b1a9d5011e Mon Sep 17 00:00:00 2001 From: carmona Date: Thu, 1 Jun 2023 15:07:47 +0000 Subject: [PATCH 22/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ac7876ac2..ffb06a345 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ac7876ac20124a15a44fd6317721ff1aa2538806 +Subproject commit ffb06a345e3a9e30d39aaa5b46a23201a74be6de From f181ea6edf3b2bb023b12da08830eba3e9731a76 Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 12 Jun 2023 10:25:25 +0000 Subject: [PATCH 23/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ffb06a345..fa84c4b3e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ffb06a345e3a9e30d39aaa5b46a23201a74be6de +Subproject commit fa84c4b3e80199a5683438f062009c031a06c4fa From e38ee1a282efa41575943fbb080c806c481f3246 Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 12 Jun 2023 13:18:43 +0000 Subject: [PATCH 24/35] 
Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index fa84c4b3e..58970a4c3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit fa84c4b3e80199a5683438f062009c031a06c4fa +Subproject commit 58970a4c39124a647ac2a640d9e178ea6c961e65 From 742c82a16c1e092c696b0800af5265bb50b3ef00 Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 19 Jun 2023 14:39:18 +0000 Subject: [PATCH 25/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 58970a4c3..16b9cd193 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 58970a4c39124a647ac2a640d9e178ea6c961e65 +Subproject commit 16b9cd193965769089881bb8ec012fccca7b37b6 From ae8cbad13574037bd6b4583f4db8a8411c083757 Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 21 Jun 2023 13:55:33 +0000 Subject: [PATCH 26/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 16b9cd193..049aa16b8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 16b9cd193965769089881bb8ec012fccca7b37b6 +Subproject commit 049aa16b8c5c6d086246e4e6b9feb18de4fbd663 From bf528434b5f4914c37b13005af6308132501497b Mon Sep 17 00:00:00 2001 From: carmonajca Date: Fri, 23 Jun 2023 09:58:55 +0200 Subject: [PATCH 27/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 049aa16b8..748713722 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 049aa16b8c5c6d086246e4e6b9feb18de4fbd663 +Subproject commit 7487137227eb32ed9b12156338b865cb29b2dfd1 From 236c02d380174d251d1b001e36e4961f195ed734 Mon Sep 17 00:00:00 2001 From: carmona Date: Tue, 27 Jun 2023 14:06:48 +0000 Subject: [PATCH 28/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 748713722..0be54f75a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7487137227eb32ed9b12156338b865cb29b2dfd1 +Subproject commit 0be54f75a6c3e9a09ea71bdfcdabf9a996a0549b From d83749346c244e1c0cb0606bb04f6f06ff2df1f2 Mon Sep 17 00:00:00 2001 From: carmonajca Date: Wed, 28 Jun 2023 14:57:16 +0200 Subject: [PATCH 29/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 0be54f75a..9d23589d6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 0be54f75a6c3e9a09ea71bdfcdabf9a996a0549b +Subproject commit 9d23589d638dc74577d5ff880e6d4248b795f12e From e3cf794d07a7d65f67cdc6f1ed7b93ccf6034cea Mon Sep 17 00:00:00 2001 From: carmonajca Date: Tue, 4 Jul 2023 16:00:02 +0200 Subject: [PATCH 30/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9d23589d6..acc111caf 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9d23589d638dc74577d5ff880e6d4248b795f12e +Subproject commit acc111caf93fc6681450924df9f99679c384c59e From ad7ec369cc80dbeda0a4027123ab1eba2c7c0f77 Mon Sep 17 00:00:00 2001 From: carmona Date: Thu, 13 Jul 2023 10:41:07 +0000 Subject: [PATCH 31/35] Updated llama_cpp --- vendor/llama.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index acc111caf..1cbf56146 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit acc111caf93fc6681450924df9f99679c384c59e +Subproject commit 1cbf561466e957b25f0e8163c2386683f8674369 From e1294e901afeddf03111a35a7c963c0c6be02740 Mon Sep 17 00:00:00 2001 From: carmona Date: Fri, 14 Jul 2023 18:43:52 +0800 Subject: [PATCH 32/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1cbf56146..27ad57a69 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1cbf561466e957b25f0e8163c2386683f8674369 +Subproject commit 27ad57a69b85bf12420a27e9945e580cc280be57 From 22d6849a21a849e169ce6f6c31129f6a7dfc075a Mon Sep 17 00:00:00 2001 From: carmona Date: Wed, 26 Jul 2023 10:14:44 +0000 Subject: [PATCH 33/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 27ad57a69..eb542d393 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 27ad57a69b85bf12420a27e9945e580cc280be57 +Subproject commit eb542d39324574a6778fad9ba9e34ba7a14a82a3 From 2579b16bb41bb6416333357c00fe8e16838e5b01 Mon Sep 17 00:00:00 2001 From: carmona Date: Fri, 28 Jul 2023 09:52:37 +0000 Subject: [PATCH 34/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index eb542d393..d91f3f0c5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit eb542d39324574a6778fad9ba9e34ba7a14a82a3 +Subproject commit d91f3f0c55663719ea03b76311e8c36ed55eb0e2 From b12aea96ebbe63ae35a1b1d0a77773c398275e11 Mon Sep 17 00:00:00 2001 From: carmona Date: Mon, 31 Jul 2023 09:29:07 +0000 Subject: [PATCH 35/35] Updated llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d91f3f0c5..9d2382b3e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d91f3f0c55663719ea03b76311e8c36ed55eb0e2 +Subproject commit 9d2382b3e45b5815fc6a054045a2f2c2b18c22a2
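Most of the series above is routine llama.cpp submodule bumps, but PATCH 16 also threads several new options through the high-level `llama_cpp.Llama` API: `n_gpu_layers` on the constructor, a `model` alias on completion calls, and the `tfs_z` / mirostat sampling parameters. The following is a minimal usage sketch, not part of the patches themselves; it assumes a build that includes those changes, and the model path is a placeholder.

```python
# Minimal sketch (illustrative only): exercising the options added in PATCH 16.
import llama_cpp

llm = llama_cpp.Llama(
    model_path="./models/7B/ggml-model-q4_0.bin",  # placeholder path
    n_ctx=2048,
    n_gpu_layers=32,   # new in this series: number of layers offloaded to VRAM
)

out = llm.create_completion(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=64,
    temperature=0.8,
    top_p=0.95,
    tfs_z=1.0,          # tail-free sampling parameter added alongside the mirostat options
    mirostat_mode=0,    # 0 disables mirostat; 1/2 select the mirostat variants
    model="my-alias",   # reported back in the response instead of the on-disk model path
)
print(out["choices"][0]["text"])
```

Note that `n_gpu_layers` only has an effect when the vendored llama.cpp was built with GPU support (e.g. the cuBLAS or CLBlast options mentioned in the README changes); on a CPU-only build it is ignored.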