new version · abetlen/llama-cpp-python@0d34e84 · GitHub

Commit 0d34e84

committed
new version
1 parent 5a57d7f commit 0d34e84

File tree

4 files changed: +20 −16 lines changed

llama_cpp/llama.py
llama_cpp/managers/cache.py
llama_cpp/server/model.py
scripts/start.sh

llama_cpp/llama.py

Lines changed: 10 additions & 11 deletions
@@ -355,16 +355,15 @@ def __init__(
             verbose=self.verbose,
         )
 
-        if self.lora_path:
-            if self._model.apply_lora_from_file(
-                self.lora_path,
-                self.lora_scale,
-                self.lora_base,
-                self.n_threads,
-            ):
-                raise RuntimeError(
-                    f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}"
-                )
+        if self.lora_path and self._model.apply_lora_from_file(
+            self.lora_path,
+            self.lora_scale,
+            self.lora_base,
+            self.n_threads,
+        ):
+            raise RuntimeError(
+                f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}"
+            )
 
         if self.verbose:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
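The refactor above only flattens the nested check: apply_lora_from_file is still called only when lora_path is set, and a non-zero return still raises. For context, a minimal sketch of how these constructor arguments reach that code path (the model and adapter paths below are placeholders, not files from this commit):

    from llama_cpp import Llama

    # Placeholder paths; when lora_path is set, __init__ calls apply_lora_from_file
    # with lora_path, lora_scale, lora_base and n_threads, as in the diff above.
    llm = Llama(
        model_path="./base-model.gguf",
        lora_path="./adapter.bin",
        lora_base="./base-model-f16.gguf",  # optional base model used while applying the LoRA
        lora_scale=1.0,
    )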
@@ -450,7 +449,7 @@ def __init__(
         if self.verbose:
             print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
 
-    def _load_control_vector(self, filepath: str, strength: float = 1.9):
+    def _load_control_vector(self, filepath: str, strength: float = 1.7):
         if not os.path.exists(filepath):
             raise ValueError(f"Control vector file does not exist: {filepath}")
         if not filepath.endswith(".json"):
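The only change here is the default strength dropping from 1.9 to 1.7; the loader's body beyond the validation shown is fork-specific and not part of this diff. A rough sketch of what such a JSON-based loader might look like, where everything past the two checks is an assumption rather than the fork's actual implementation:

    import json
    import os

    def _load_control_vector(self, filepath: str, strength: float = 1.7):
        # The two validation checks mirror the diff above.
        if not os.path.exists(filepath):
            raise ValueError(f"Control vector file does not exist: {filepath}")
        if not filepath.endswith(".json"):
            raise ValueError(f"Control vector file must be a .json file: {filepath}")
        # Assumed completion: read per-layer directions and scale them by `strength`.
        with open(filepath) as f:
            vector = json.load(f)
        return {layer: [v * strength for v in values] for layer, values in vector.items()}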

llama_cpp/managers/cache.py

Lines changed: 4 additions & 4 deletions
@@ -77,13 +77,13 @@ def create_completion_with_cache(
         **kwargs,
     ):
         """Predict the given prompt with the given max tokens and cache the result."""
-
+
         if not stop_tokens:
             stop_tokens = ["</s>"]
 
         print(prompt)
         prompt = prompt.strip()
-
+
         if self.current_state == prompt:
             print("Prompt is the same as previous. Assuming new turn")
             # self.reset()
@@ -96,7 +96,7 @@ def create_completion_with_cache(
             print("Not using cached state")
             partial_prompt = prompt
             self.reset()
-
+
         if to_eval:
             print("Evaluating partial prompt")
             prompt_tokens = self.get_prompt_tokens(partial_prompt)
@@ -119,7 +119,7 @@ def create_completion_with_cache(
         for output in outputs:
             yield output
             results += output["choices"][0]["text"]
-
+
         self.current_state = prompt + results
 
         if self.flush_cache:
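The edits in this file are whitespace-only (blank lines with trailing whitespace stripped), but the surrounding context shows the cache manager's flow: compare the incoming prompt with self.current_state, evaluate only what has not been seen, stream the outputs, then record prompt + results as the new state. A self-contained toy sketch of that prefix-reuse idea, not the manager's exact branching (which also special-cases an identical repeated prompt), since LlamaCacheManager's full implementation is not part of this diff:

    from typing import Callable

    class PrefixReuse:
        """Toy model of the state-reuse pattern used by the cache manager."""

        def __init__(self) -> None:
            self.current_state = ""

        def complete(self, prompt: str, generate: Callable[[str], str]) -> str:
            prompt = prompt.strip()
            if self.current_state and prompt.startswith(self.current_state):
                partial_prompt = prompt[len(self.current_state):]  # only the unseen suffix
            else:
                partial_prompt = prompt  # cache miss: start from scratch
                self.current_state = ""
            results = generate(partial_prompt)  # stand-in for the actual model call
            self.current_state = prompt + results  # the next call can resume from here
            return results

    # Usage with a dummy generator:
    cache = PrefixReuse()
    print(cache.complete("Hello", lambda s: " world"))
    print(cache.complete("Hello world", lambda s: "!"))  # reuses the cached prefix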

llama_cpp/server/model.py

Lines changed: 5 additions & 0 deletions
@@ -11,6 +11,7 @@
 from llama_cpp.server.settings import ModelSettings
 from llama_cpp.managers.cache import LlamaCacheManager
 
+
 class LlamaProxy:
     def __init__(self, models: List[ModelSettings]) -> None:
         assert models, "No models provided!"
@@ -134,6 +135,10 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
     else:
         create_fn = LlamaCacheManager
     kwargs["model_path"] = settings.model
+    # kwargs["control_vectors"] = [
+    #     "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/hearts_system/hearts_system.json",
+    #     "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/hearts_system/hearts_system_critical.json"
+    # ]
 
     _model = create_fn(
         **kwargs,
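The new lines only add a commented-out control_vectors kwarg, so nothing changes at runtime; the context shows the loader building a kwargs dict and dispatching to either llama_cpp.Llama or the fork's LlamaCacheManager. A condensed sketch of that dispatch shape (the use_cache_manager selector is invented for illustration; the real condition sits above the hunk and is not shown in this diff):

    import llama_cpp
    from llama_cpp.managers.cache import LlamaCacheManager  # fork-specific manager
    from llama_cpp.server.settings import ModelSettings

    def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
        kwargs = {}  # populated from settings fields before the hunk shown above
        create_fn = llama_cpp.Llama
        if getattr(settings, "use_cache_manager", False):  # hypothetical selector
            create_fn = LlamaCacheManager
        kwargs["model_path"] = settings.model
        # control_vectors stays disabled in this commit; if re-enabled it would be
        # forwarded here like any other keyword argument.
        return create_fn(**kwargs)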

scripts/start.sh

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 python llama_cpp/server --model "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/data/llms/mistral-fwd-john-doe-ckpt-158-200.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
 
-python llama_cpp/server --model "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/data/llms/mistral-fwd-instruct-v0.2-v0.0.1.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
+python llama_cpp/server --model "/Users/sengwee.ngui/Library/CloudStorage/OneDrive-TemusPte.Ltd/Documents/projects/SuperAdapters/data/llms/mistral-fwd-instruct-v0.2-v0.0.1.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000 --host 0.0.0.0
 
 python3 llama_cpp/server --model "../data/mistral-fwd-john-doe-ckpt-158-200.gguf" --n_gpu_layers 64 --n_ctx 8192 --n_batch 2048 --last_n_tokens_size 4000
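The only functional change here is --host 0.0.0.0, which binds the server to all interfaces instead of localhost only. Once it is running, the OpenAI-compatible completions endpoint can be reached from other machines; a minimal sketch, assuming the default port 8000 and a placeholder address:

    import requests

    server = "http://192.0.2.10:8000"  # placeholder address; 8000 is llama_cpp.server's default port
    resp = requests.post(
        f"{server}/v1/completions",
        json={"prompt": "Hello", "max_tokens": 32},
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["text"])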
