diff --git a/CMakeLists.txt b/CMakeLists.txt index bda238801..16932b1dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,4 +28,4 @@ else() LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp ) -endif(UNIX) +endif() diff --git a/README.md b/README.md index 9f494f923..17cc28ca9 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ This package provides: - OpenAI-like API - LangChain compatibility +Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,6 +28,12 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. +Note: If you are using an Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports the arm64 architecture. For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, the installation will build the x86 version of llama.cpp, which will be 10x slower on an Apple Silicon (M1) Mac. ### Installation with OpenBLAS / cuBLAS / CLBlast @@ -102,7 +110,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: ```bash -docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` ## Low-level API @@ -120,7 +128,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize >>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? 
A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) >>> llama_cpp.llama_free(ctx) ``` diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 8773cb1e3..be2cf989e 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -16,8 +16,8 @@ from os import cpu_count, path import llama_cpp -from common import GptParams, gpt_params_parse, gpt_random_prompt -import util +from llama_cpp_python.examples.low_level_api.common import GptParams, gpt_params_parse, gpt_random_prompt +import llama_cpp_python.examples.low_level_api.util as util # A LLaMA interactive session class LLaMAInteract: @@ -368,10 +368,10 @@ def generate(self): id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, 1) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), 1) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), 1) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), 1) llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) @@ -444,7 +444,7 @@ def generate(self): if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): # If we arent in instruction mode, fix the current generation by appending the antiprompt. # Makes it so if chat ends prematurely you dont append the AI's text etc. - if not self.params.instruct: + if not self.params.instruct and self.first_antiprompt: self.embd_inp += self.first_antiprompt[0] self.n_remain = self.params.n_predict break @@ -493,7 +493,7 @@ def output(self): # Contains multi-byte UTF8 for num, pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check - if pattern & int.from_bytes(cur_char) == pattern: + if pattern & int.from_bytes(cur_char, byteorder='big') == pattern: self.multibyte_fix = [cur_char] + ([None] * (num-1)) # Stop incomplete bytes from passing @@ -518,7 +518,7 @@ def interact(self): while self.params.interactive: self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): - print('\n> ', end="") + print('\n$ ', end="",flush=True) self.input(self.read_input()) else: print(self.params.input_prefix, end="") diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7be51e15c..f47f4a4db 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -83,6 +83,7 @@ def __init__( # NOTE: These parameters are likely to change in the future. 
n_ctx: int = 512, n_parts: int = -1, + n_gpu_layers: int = 0, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -129,6 +130,7 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_parts = n_parts + self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -174,7 +176,9 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: + def tokenize( + self, text: bytes, add_bos: bool = True + ) -> List[llama_cpp.llama_token]: """Tokenize a string. Args: @@ -194,10 +198,22 @@ def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: text, tokens, n_ctx, - llama_cpp.c_bool(True), + llama_cpp.c_bool(add_bos), ) if int(n_tokens) < 0: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * int(n_tokens))() + n_tokens = llama_cpp.llama_tokenize( + self.ctx, + text, + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) return list(tokens[:n_tokens]) def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: @@ -275,11 +291,20 @@ def _sample( mirostat_mode: llama_cpp.c_int, mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, ): assert self.ctx is not None assert len(self.eval_logits) > 0 n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + last_n_tokens_size = ( + llama_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) logits = self.eval_logits[-1] + nl_logit = logits[int(Llama.token_nl())] data = (llama_cpp.llama_token_data * n_vocab)( *[ llama_cpp.llama_token_data( @@ -312,6 +337,8 @@ def _sample( alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) + if not penalize_nl: + candidates.data[int(Llama.token_nl())].logit = nl_logit if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -394,6 +421,7 @@ def sample( mirostat_mode: int = 0, mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, + penalize_nl: bool = True, ): """Sample a token from the model. 
@@ -425,18 +453,20 @@ def sample( mirostat_mode=llama_cpp.c_int(mirostat_mode), mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, ) def generate( self, tokens: Sequence[llama_cpp.llama_token], - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -495,6 +525,7 @@ def generate( repeat_penalty=repeat_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -504,7 +535,7 @@ def generate( if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str) -> Embedding: + def create_embedding(self, input: str, model: Optional[str] = None) -> Embedding: """Embed a string. Args: @@ -514,6 +545,7 @@ def create_embedding(self, input: str) -> Embedding: An embedding object. """ assert self.ctx is not None + model_name: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -543,7 +575,7 @@ def create_embedding(self, input: str) -> Embedding: "index": 0, } ], - "model": self.model_path, + "model": model_name, "usage": { "prompt_tokens": n_tokens, "total_tokens": n_tokens, @@ -576,9 +608,11 @@ def _create_completion( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -591,6 +625,7 @@ def _create_completion( text: bytes = b"" returned_characters: int = 0 stop = stop if stop is not None else [] + model_name: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -634,6 +669,7 @@ def _create_completion( top_k=top_k, top_p=top_p, temp=temperature, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -641,7 +677,7 @@ def _create_completion( presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): - if token == llama_cpp.llama_token_eos(): + if token == Llama.token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" break @@ -688,7 +724,7 @@ def _create_completion( "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text[start:].decode("utf-8", errors="ignore"), @@ -709,12 +745,15 @@ def _create_completion( print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + if stream: yield { "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text[returned_characters:].decode( @@ -780,14 +819,11 @@ def _create_completion( "top_logprobs": top_logprobs, } - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) - yield { "id": completion_id, "object": "text_completion", "created": created, - "model": 
self.model_path, + "model": model_name, "choices": [ { "text": text_str, @@ -818,9 +854,11 @@ def create_completion( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -858,9 +896,11 @@ def create_completion( repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -883,9 +923,11 @@ def __call__( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -923,9 +965,11 @@ def __call__( repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) def _convert_text_completion_to_chat( @@ -998,9 +1042,11 @@ def create_chat_completion( presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1035,9 +1081,11 @@ def create_chat_completion( repeat_penalty=repeat_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore @@ -1057,6 +1105,7 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_parts=self.params.n_parts, + n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1076,6 +1125,7 @@ def __setstate__(self, state): model_path=state["model_path"], n_ctx=state["n_ctx"], n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -1130,6 +1180,11 @@ def token_bos() -> llama_cpp.llama_token: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + @staticmethod + def token_nl() -> llama_cpp.llama_token: + """Return the newline token.""" + return llama_cpp.llama_token_nl() + @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e60558cff..24ab40a12 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -44,15 +44,20 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"lib")) + 
cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") @@ -68,7 +73,7 @@ def _load_shared_library(lib_base_name: str): _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = c_int(1) +LLAMA_FILE_VERSION = c_int(2) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" @@ -109,6 +114,7 @@ class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context ("n_parts", c_int), # -1 for default + ("n_gpu_layers", c_int), # number of layers to store in VRAM ("seed", c_int), # RNG seed, 0 for random ("f16_kv", c_bool), # use fp16 for KV cache ( @@ -135,7 +141,7 @@ class llama_context_params(Structure): LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors @@ -259,9 +265,9 @@ def llama_get_state_size(ctx: llama_context_p) -> c_size_t: # Destination needs to have allocated enough memory. # Returns the number of bytes copied def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] @@ -350,7 +356,7 @@ def llama_tokenize( tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index be6bac81d..e8f62e8bc 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -16,7 +16,16 @@ class Settings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions." ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -59,6 +68,7 @@ class Settings(BaseSettings): router = APIRouter() +settings: Optional[Settings] = None llama: Optional[llama_cpp.Llama] = None @@ -80,6 +90,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, @@ -95,6 +106,12 @@ def create_app(settings: Optional[Settings] = None): if settings.cache: cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) return app @@ -106,6 +123,10 @@ def get_llama(): yield llama +def get_settings(): + yield settings + + model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( @@ -166,8 +187,9 @@ def get_llama(): description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", ) + class CreateCompletionRequest(BaseModel): - prompt: Optional[str] = Field( + prompt: Union[str, List[str]] = Field( default="", description="The prompt to generate completions for." ) suffix: Optional[str] = Field( @@ -222,10 +244,13 @@ class Config: def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): + if isinstance(request.prompt, list): + assert len(request.prompt) <= 1 + request.prompt = request.prompt[0] if len(request.prompt) > 0 else "" + completion_or_chunks = llama( **request.dict( exclude={ - "model", "n", "best_of", "logit_bias", @@ -263,7 +288,7 @@ class Config: def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"user"})) class ChatCompletionRequestMessage(BaseModel): @@ -324,7 +349,6 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", "n", "logit_bias", "user", @@ -367,13 +391,16 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) def get_models( + settings: Settings = Depends(get_settings), llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", "data": [ { - "id": llama.model_path, + "id": settings.model_alias + if settings.model_alias is not None + else llama.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/poetry.lock b/poetry.lock index 5474bf4f5..5289b2962 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.11" +version = "9.1.12" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, - {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, + {file = "mkdocs_material-9.1.12-py3-none-any.whl", hash = "sha256:68c57d95d10104179c8c3ce9a88ee9d2322a5145b3d0f1f38ff686253fb5ec98"}, + {file = "mkdocs_material-9.1.12.tar.gz", hash = "sha256:d4ebe9b5031ce63a265c19fb5eab4d27ea4edadb05de206372e831b2b7570fb5"}, ] [package.dependencies] @@ -1439,4 
+1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" +content-hash = "d188fc14200f7ee348bef821265d676d584762983bcaf10f90c14221b4ed26a9" diff --git a/pyproject.toml b/pyproject.toml index 9e633722f..6613ee011 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.48" +version = "0.1.50" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.11" +mkdocs-material = "^9.1.12" pytest = "^7.3.1" httpx = "^0.24.0" diff --git a/quantize.sh b/quantize.sh new file mode 100644 index 000000000..336d725ff --- /dev/null +++ b/quantize.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +if [ $# -eq 0 ]; then + echo "Please, provide the main path to the folder where the models (ckpt_dirs) are." + exit 1 +fi +MAIN_DIR="$1" + +SCRIPT_DIR=$(dirname "$0") + +cd "${SCRIPT_DIR}/vendor/llama.cpp" +echo $(pwd) +if [ ! -f "quantize" ]; then + make quantize +fi +for dir in $(find $MAIN_DIR/* -type d); do + echo $dir + if [ -f "${dir}/ggml-model-f16.bin" ]; then + rm -f ${dir}/*q4_0.bin.* + ./quantize ${dir}/ggml-model-f16.bin q4_0 30 + else + echo "The file 'ggml-model-f16.bin' does not exist in ${dir}" + fi +done diff --git a/setup.py b/setup.py index f4cbb60b0..b056ce4cd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.48", + version="0.1.50", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1b0fd4546..9d2382b3e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 +Subproject commit 9d2382b3e45b5815fc6a054045a2f2c2b18c22a2
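
The diff above is easier to evaluate with a usage sketch. Below is a minimal, hypothetical example of the headline API additions: the `n_gpu_layers` constructor argument, the `tfs_z` sampling parameter, and the optional `model` label on completions. The model path, layer count, and alias are placeholders, and GPU offloading only has an effect when the vendored `llama.cpp` was built with cuBLAS or CLBlast support.

```python
from llama_cpp import Llama

# Placeholder GGML model path; n_gpu_layers=0 (the default) keeps every layer on the CPU.
llm = Llama(model_path="./models/7b/ggml-model-q4_0.bin", n_gpu_layers=32)

output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=64,
    stop=["Q:", "\n"],
    tfs_z=0.97,             # tail-free sampling; 1.0 (the default) disables it
    model="llama-7b-q4_0",  # only relabels the "model" field echoed in the response
)
print(output["model"])
print(output["choices"][0]["text"])
```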
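
The reworked `Llama.tokenize` accepts `add_bos` and, when `llama_tokenize` reports a negative token count (buffer too small), retries once with a buffer of the required size instead of raising immediately. A small sketch, continuing with the `llm` instance created above:

```python
text = b"Hello, world!"

with_bos = llm.tokenize(text)                   # BOS token prepended (default)
without_bos = llm.tokenize(text, add_bos=False)

print(len(with_bos), len(without_bos))          # the first list is one token longer
print(llm.detokenize(without_bos))              # bytes approximately round-tripping the input
```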
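
`Llama.generate` now ships with default sampling arguments and forwards the new `tfs_z` value to `Llama.sample` (which also gained `penalize_nl`). A rough sketch of driving generation token by token, again reusing `llm`; the stopping conditions are illustrative only:

```python
prompt_tokens = llm.tokenize(b"Q: Name the planets in the solar system? A: ")

generated = []
for token in llm.generate(prompt_tokens, top_k=40, top_p=0.95, temp=0.8, tfs_z=0.97):
    if token == llm.token_eos():   # stop at end-of-sequence
        break
    generated.append(token)
    if len(generated) >= 48:       # hard cap for the sketch
        break

print(llm.detokenize(generated).decode("utf-8", errors="ignore"))
```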
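
On the server side, the new `model_alias` and `n_gpu_layers` settings can be supplied through `Settings` and `create_app`; `GET /v1/models` then reports the alias instead of the model path. A sketch assuming the server dependencies (FastAPI, uvicorn, sse-starlette) are installed and using placeholder values; since `Settings` is a pydantic `BaseSettings`, the same options should also be settable as `MODEL_ALIAS` and `N_GPU_LAYERS` environment variables when launching `python3 -m llama_cpp.server`.

```python
import uvicorn
from llama_cpp.server.app import create_app, Settings

settings = Settings(
    model="/models/7b/ggml-model-q4_0.bin",  # placeholder path (required)
    model_alias="llama-7b",                  # reported by GET /v1/models
    n_gpu_layers=32,                         # 0 keeps all layers on the CPU
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```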
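
Finally, `CreateCompletionRequest.prompt` now also accepts a list, although the handler uses only the first element (at most one is allowed). A hypothetical client-side call against a locally running server, assuming the `requests` package is available:

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": ["Q: Name the planets in the solar system? A: "],  # a plain string also works
        "max_tokens": 48,
        "stop": ["Q:"],
    },
)
print(resp.json()["choices"][0]["text"])
```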