From 47ca05ac79fec8ed0ac3856dc4261d2be249eb50 Mon Sep 17 00:00:00 2001
From: Antoine Lizee <antoine.lizee@gmail.com>
Date: Thu, 2 Nov 2023 01:29:06 +0000
Subject: [PATCH 1/5] fix: tokenization of special characters (#850)

Tokenization should behave like llama.cpp, where most out-of-the-box usages
parse special tokens in the input (e.g. `</s>`) as special tokens rather
than as plain text.
---
 llama_cpp/llama.py      | 6 +++---
 llama_cpp/server/app.py | 2 +-
 test.py                 | 0
 tests/test_llama.py     | 9 +++++++++
 4 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 test.py
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index bc747cffb..c9ea90fb4 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -856,7 +856,7 @@ def create_embedding(
         data: List[EmbeddingData] = []
         total_tokens = 0
         for index, input in enumerate(inputs):
-            tokens = self.tokenize(input.encode("utf-8"))
+            tokens = self.tokenize(input.encode("utf-8"), special=True)
             self.reset()
             self.eval(tokens)
             n_tokens = len(tokens)
@@ -928,7 +928,7 @@ def _create_completion(
         completion_tokens: List[int] = []
         # Add blank space to start of prompt to match OG llama tokenizer
         prompt_tokens: List[int] = (
-            self.tokenize(prompt.encode("utf-8"))
+            self.tokenize(prompt.encode("utf-8"), special=True)
             if prompt != ""
             else [self.token_bos()]
         )
@@ -1826,7 +1826,7 @@ def __init__(self, llama: Llama):
 
     def encode(self, text: str, add_bos: bool = True) -> List[int]:
         return self.llama.tokenize(
-            text.encode("utf-8", errors="ignore"), add_bos=add_bos
+            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True
         )
 
     def decode(self, tokens: List[int]) -> str:
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 930ad5df8..f8d8c7658 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -594,7 +594,7 @@ def make_logit_bias_processor(
     elif logit_bias_type == "tokens":
         for token, score in logit_bias.items():
             token = token.encode("utf-8")
-            for input_id in llama.tokenize(token, add_bos=False):
+            for input_id in llama.tokenize(token, add_bos=False, special=True):
                 to_bias[input_id] = score
 
     def logit_bias_processor(
diff --git a/test.py b/test.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 76291fbca..330b69b9c 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -25,6 +25,15 @@ def test_llama_cpp_tokenization():
     detokenized = llama.detokenize(tokens)
     assert detokenized != text
 
+    text = b"Hello World</s>"
+    tokens = llama.tokenize(text)
+    assert tokens[-1] != llama.token_eos()
+    assert tokens == [1, 15043, 2787, 829, 29879, 29958]
+
+    tokens = llama.tokenize(text, special=True)
+    assert tokens[-1] == llama.token_eos()
+    assert tokens == [1, 10994, 2787, 2]
+
 
 def test_llama_patch(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)

From addc2f6077c12da39db5a4d01cbde5982d51c553 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 1 Nov 2023 21:31:54 -0400
Subject: [PATCH 2/5] Update CHANGELOG

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f587225e6..b1591b1d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- Fix tokenization of special characters by @antoine-lizee in #850
+
 ## [0.2.12]
 
 - Update llama.cpp to ggerganov/llama.cpp@50337961a678fce4081554b24e56e86b67660163

From 3e180d77f1320b07bd279c9e2beffb1ec7722587 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 1 Nov 2023 21:37:53 -0400
Subject: [PATCH 3/5] Cleanup

---
 test.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index e69de29bb..000000000

From f0d1a1b255e926f8ed488531a487dfab85b3760e Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 1 Nov 2023 22:17:53 -0400
Subject: [PATCH 4/5] Fix runner label

---
 .github/workflows/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 269016b2e..ed548eb54 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -80,7 +80,7 @@ jobs:
 
   build-linux-opencl:
 
-    runs-on: linux-latest
+    runs-on: ubuntu-latest
 
     steps:
       - uses: actions/checkout@v3

From d7ae8b5817aa8594d792d62f2c18dc87208f4306 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 2 Nov 2023 01:17:46 -0400
Subject: [PATCH 5/5] Fix clblast test

---
 .github/workflows/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index ed548eb54..24448ec9f 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -99,7 +99,7 @@ jobs:
       - name: Install dependencies
         run: |
           python3 -m pip install --upgrade pip
-          CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
+          CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose
       - name: Test with pytest
         run: |
           python3 -m pytest