diff --git a/CMakeLists.txt b/CMakeLists.txt index bda238801..16932b1dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,4 +28,4 @@ else() LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp ) -endif(UNIX) +endif() diff --git a/README.md b/README.md index 9f494f923..17cc28ca9 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ This package provides: - OpenAI-like API - LangChain compatibility +Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,6 +28,12 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. +Note: If you are using an Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports the arm64 architecture. For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, the installation will build the x86 version of llama.cpp, which will be 10x slower on an Apple Silicon (M1) Mac. ### Installation with OpenBLAS / cuBLAS / CLBlast @@ -102,7 +110,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: ```bash -docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` ## Low-level API @@ -120,7 +128,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize >>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? 
A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) >>> llama_cpp.llama_free(ctx) ``` diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 8773cb1e3..be2cf989e 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -16,8 +16,8 @@ from os import cpu_count, path import llama_cpp -from common import GptParams, gpt_params_parse, gpt_random_prompt -import util +from llama_cpp_python.examples.low_level_api.common import GptParams, gpt_params_parse, gpt_random_prompt +import llama_cpp_python.examples.low_level_api.util as util # A LLaMA interactive session class LLaMAInteract: @@ -368,10 +368,10 @@ def generate(self): id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, 1) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), 1) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), 1) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), 1) llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) @@ -444,7 +444,7 @@ def generate(self): if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): # If we arent in instruction mode, fix the current generation by appending the antiprompt. # Makes it so if chat ends prematurely you dont append the AI's text etc. - if not self.params.instruct: + if not self.params.instruct and self.first_antiprompt: self.embd_inp += self.first_antiprompt[0] self.n_remain = self.params.n_predict break @@ -493,7 +493,7 @@ def output(self): # Contains multi-byte UTF8 for num, pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check - if pattern & int.from_bytes(cur_char) == pattern: + if pattern & int.from_bytes(cur_char, byteorder='big') == pattern: self.multibyte_fix = [cur_char] + ([None] * (num-1)) # Stop incomplete bytes from passing @@ -518,7 +518,7 @@ def interact(self): while self.params.interactive: self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): - print('\n> ', end="") + print('\n$ ', end="",flush=True) self.input(self.read_input()) else: print(self.params.input_prefix, end="") diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7be51e15c..f47f4a4db 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -83,6 +83,7 @@ def __init__( # NOTE: These parameters are likely to change in the future. 
n_ctx: int = 512, n_parts: int = -1, + n_gpu_layers: int = 0, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -129,6 +130,7 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_parts = n_parts + self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -174,7 +176,9 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: + def tokenize( + self, text: bytes, add_bos: bool = True + ) -> List[llama_cpp.llama_token]: """Tokenize a string. Args: @@ -194,10 +198,22 @@ def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: text, tokens, n_ctx, - llama_cpp.c_bool(True), + llama_cpp.c_bool(add_bos), ) if int(n_tokens) < 0: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * int(n_tokens))() + n_tokens = llama_cpp.llama_tokenize( + self.ctx, + text, + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) return list(tokens[:n_tokens]) def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: @@ -275,11 +291,20 @@ def _sample( mirostat_mode: llama_cpp.c_int, mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, ): assert self.ctx is not None assert len(self.eval_logits) > 0 n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + last_n_tokens_size = ( + llama_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) logits = self.eval_logits[-1] + nl_logit = logits[int(Llama.token_nl())] data = (llama_cpp.llama_token_data * n_vocab)( *[ llama_cpp.llama_token_data( @@ -312,6 +337,8 @@ def _sample( alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) + if not penalize_nl: + candidates.data[int(Llama.token_nl())].logit = nl_logit if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -394,6 +421,7 @@ def sample( mirostat_mode: int = 0, mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, + penalize_nl: bool = True, ): """Sample a token from the model. 
@@ -425,18 +453,20 @@ def sample( mirostat_mode=llama_cpp.c_int(mirostat_mode), mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, ) def generate( self, tokens: Sequence[llama_cpp.llama_token], - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -495,6 +525,7 @@ def generate( repeat_penalty=repeat_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -504,7 +535,7 @@ def generate( if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str) -> Embedding: + def create_embedding(self, input: str, model: Optional[str] = None) -> Embedding: """Embed a string. Args: @@ -514,6 +545,7 @@ def create_embedding(self, input: str) -> Embedding: An embedding object. """ assert self.ctx is not None + model_name: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -543,7 +575,7 @@ def create_embedding(self, input: str) -> Embedding: "index": 0, } ], - "model": self.model_path, + "model": model_name, "usage": { "prompt_tokens": n_tokens, "total_tokens": n_tokens, @@ -576,9 +608,11 @@ def _create_completion( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -591,6 +625,7 @@ def _create_completion( text: bytes = b"" returned_characters: int = 0 stop = stop if stop is not None else [] + model_name: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -634,6 +669,7 @@ def _create_completion( top_k=top_k, top_p=top_p, temp=temperature, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -641,7 +677,7 @@ def _create_completion( presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): - if token == llama_cpp.llama_token_eos(): + if token == Llama.token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" break @@ -688,7 +724,7 @@ def _create_completion( "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text[start:].decode("utf-8", errors="ignore"), @@ -709,12 +745,15 @@ def _create_completion( print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + if stream: yield { "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text[returned_characters:].decode( @@ -780,14 +819,11 @@ def _create_completion( "top_logprobs": top_logprobs, } - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) - yield { "id": completion_id, "object": "text_completion", "created": created, - "model": 
self.model_path, + "model": model_name, "choices": [ { "text": text_str, @@ -818,9 +854,11 @@ def create_completion( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -858,9 +896,11 @@ def create_completion( repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -883,9 +923,11 @@ def __call__( repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -923,9 +965,11 @@ def __call__( repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) def _convert_text_completion_to_chat( @@ -998,9 +1042,11 @@ def create_chat_completion( presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1035,9 +1081,11 @@ def create_chat_completion( repeat_penalty=repeat_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore @@ -1057,6 +1105,7 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_parts=self.params.n_parts, + n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1076,6 +1125,7 @@ def __setstate__(self, state): model_path=state["model_path"], n_ctx=state["n_ctx"], n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -1130,6 +1180,11 @@ def token_bos() -> llama_cpp.llama_token: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + @staticmethod + def token_nl() -> llama_cpp.llama_token: + """Return the newline token.""" + return llama_cpp.llama_token_nl() + @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e60558cff..24ab40a12 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -44,15 +44,20 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"lib")) + 
cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") @@ -68,7 +73,7 @@ def _load_shared_library(lib_base_name: str): _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = c_int(1) +LLAMA_FILE_VERSION = c_int(2) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" @@ -109,6 +114,7 @@ class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context ("n_parts", c_int), # -1 for default + ("n_gpu_layers", c_int), # number of layers to store in VRAM ("seed", c_int), # RNG seed, 0 for random ("f16_kv", c_bool), # use fp16 for KV cache ( @@ -135,7 +141,7 @@ class llama_context_params(Structure): LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors @@ -259,9 +265,9 @@ def llama_get_state_size(ctx: llama_context_p) -> c_size_t: # Destination needs to have allocated enough memory. # Returns the number of bytes copied def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] @@ -350,7 +356,7 @@ def llama_tokenize( tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index be6bac81d..e8f62e8bc 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -16,7 +16,16 @@ class Settings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions." ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -59,6 +68,7 @@ class Settings(BaseSettings): router = APIRouter() +settings: Optional[Settings] = None llama: Optional[llama_cpp.Llama] = None @@ -80,6 +90,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, @@ -95,6 +106,12 @@ def create_app(settings: Optional[Settings] = None): if settings.cache: cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) return app @@ -106,6 +123,10 @@ def get_llama(): yield llama +def get_settings(): + yield settings + + model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( @@ -166,8 +187,9 @@ def get_llama(): description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", ) + class CreateCompletionRequest(BaseModel): - prompt: Optional[str] = Field( + prompt: Union[str, List[str]] = Field( default="", description="The prompt to generate completions for." ) suffix: Optional[str] = Field( @@ -222,10 +244,13 @@ class Config: def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): + if isinstance(request.prompt, list): + assert len(request.prompt) <= 1 + request.prompt = request.prompt[0] if len(request.prompt) > 0 else "" + completion_or_chunks = llama( **request.dict( exclude={ - "model", "n", "best_of", "logit_bias", @@ -263,7 +288,7 @@ class Config: def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"user"})) class ChatCompletionRequestMessage(BaseModel): @@ -324,7 +349,6 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", "n", "logit_bias", "user", @@ -367,13 +391,16 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) def get_models( + settings: Settings = Depends(get_settings), llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", "data": [ { - "id": llama.model_path, + "id": settings.model_alias + if settings.model_alias is not None + else llama.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/poetry.lock b/poetry.lock index 5474bf4f5..5289b2962 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.11" +version = "9.1.12" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, - {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, + {file = "mkdocs_material-9.1.12-py3-none-any.whl", hash = "sha256:68c57d95d10104179c8c3ce9a88ee9d2322a5145b3d0f1f38ff686253fb5ec98"}, + {file = "mkdocs_material-9.1.12.tar.gz", hash = "sha256:d4ebe9b5031ce63a265c19fb5eab4d27ea4edadb05de206372e831b2b7570fb5"}, ] [package.dependencies] @@ -1439,4 
+1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" +content-hash = "d188fc14200f7ee348bef821265d676d584762983bcaf10f90c14221b4ed26a9" diff --git a/pyproject.toml b/pyproject.toml index 9e633722f..6613ee011 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.48" +version = "0.1.50" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.11" +mkdocs-material = "^9.1.12" pytest = "^7.3.1" httpx = "^0.24.0" diff --git a/quantize.sh b/quantize.sh new file mode 100644 index 000000000..336d725ff --- /dev/null +++ b/quantize.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +if [ $# -eq 0 ]; then + echo "Please, provide the main path to the folder where the models (ckpt_dirs) are." + exit 1 +fi +MAIN_DIR="$1" + +SCRIPT_DIR=$(dirname "$0") + +cd "${SCRIPT_DIR}/vendor/llama.cpp" +echo $(pwd) +if [ ! -f "quantize" ]; then + make quantize +fi +for dir in $(find $MAIN_DIR/* -type d); do + echo $dir + if [ -f "${dir}/ggml-model-f16.bin" ]; then + rm -f ${dir}/*q4_0.bin.* + ./quantize ${dir}/ggml-model-f16.bin q4_0 30 + else + echo "The file 'ggml-model-f16.bin' does not exist in ${dir}" + fi +done diff --git a/setup.py b/setup.py index f4cbb60b0..b056ce4cd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.48", + version="0.1.50", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1b0fd4546..9d2382b3e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 +Subproject commit 9d2382b3e45b5815fc6a054045a2f2c2b18c22a2
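
The diff above is easier to evaluate with a usage sketch. Below is a minimal, hypothetical example of the headline API additions: the `n_gpu_layers` constructor argument, the `tfs_z` sampling parameter, and the optional `model` label on completions. The model path, layer count, and alias are placeholders, and GPU offloading only has an effect when the vendored `llama.cpp` was built with cuBLAS or CLBlast support.

```python
from llama_cpp import Llama

# Placeholder GGML model path; n_gpu_layers=0 (the default) keeps every layer on the CPU.
llm = Llama(model_path="./models/7b/ggml-model-q4_0.bin", n_gpu_layers=32)

output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=64,
    stop=["Q:", "\n"],
    tfs_z=0.97,             # tail-free sampling; 1.0 (the default) disables it
    model="llama-7b-q4_0",  # only relabels the "model" field echoed in the response
)
print(output["model"])
print(output["choices"][0]["text"])
```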
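
The reworked `Llama.tokenize` accepts `add_bos` and, when `llama_tokenize` reports a negative token count (buffer too small), retries once with a buffer of the required size instead of raising immediately. A small sketch, continuing with the `llm` instance created above:

```python
text = b"Hello, world!"

with_bos = llm.tokenize(text)                   # BOS token prepended (default)
without_bos = llm.tokenize(text, add_bos=False)

print(len(with_bos), len(without_bos))          # the first list is one token longer
print(llm.detokenize(without_bos))              # bytes approximately round-tripping the input
```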
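
`Llama.generate` now ships with default sampling arguments and forwards the new `tfs_z` value to `Llama.sample` (which also gained `penalize_nl`). A rough sketch of driving generation token by token, again reusing `llm`; the stopping conditions are illustrative only:

```python
prompt_tokens = llm.tokenize(b"Q: Name the planets in the solar system? A: ")

generated = []
for token in llm.generate(prompt_tokens, top_k=40, top_p=0.95, temp=0.8, tfs_z=0.97):
    if token == llm.token_eos():   # stop at end-of-sequence
        break
    generated.append(token)
    if len(generated) >= 48:       # hard cap for the sketch
        break

print(llm.detokenize(generated).decode("utf-8", errors="ignore"))
```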
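
On the server side, the new `model_alias` and `n_gpu_layers` settings can be supplied through `Settings` and `create_app`; `GET /v1/models` then reports the alias instead of the model path. A sketch assuming the server dependencies (FastAPI, uvicorn, sse-starlette) are installed and using placeholder values; since `Settings` is a pydantic `BaseSettings`, the same options should also be settable as `MODEL_ALIAS` and `N_GPU_LAYERS` environment variables when launching `python3 -m llama_cpp.server`.

```python
import uvicorn
from llama_cpp.server.app import create_app, Settings

settings = Settings(
    model="/models/7b/ggml-model-q4_0.bin",  # placeholder path (required)
    model_alias="llama-7b",                  # reported by GET /v1/models
    n_gpu_layers=32,                         # 0 keeps all layers on the CPU
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```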
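
Finally, `CreateCompletionRequest.prompt` now also accepts a list, although the handler uses only the first element (at most one is allowed). A hypothetical client-side call against a locally running server, assuming the `requests` package is available:

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": ["Q: Name the planets in the solar system? A: "],  # a plain string also works
        "max_tokens": 48,
        "stop": ["Q:"],
    },
)
print(resp.json()["choices"][0]["text"])
```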