fix text_offset of multi-token characters (#1037) · qeleb/llama-cpp-python@2f03fb0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2f03fb0

Browse files
authored
fix text_offset of multi-token characters (abetlen#1037)
* fix text_offsets for bytes tokens * fix
1 parent 33cc623 commit 2f03fb0

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

llama_cpp/llama.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,7 +1551,9 @@ def logit_bias_processor(
15511551
"utf-8", errors="ignore"
15521552
)
15531553
text_offset = len(prompt) + len(
1554-
self.detokenize(completion_tokens[:returned_tokens])
1554+
self.detokenize(completion_tokens[:returned_tokens]).decode(
1555+
"utf-8", errors="ignore"
1556+
)
15551557
)
15561558
token_offset = len(prompt_tokens) + returned_tokens
15571559
logits = self._scores[token_offset - 1, :]
@@ -1789,13 +1791,19 @@ def logit_bias_processor(
17891791
]
17901792
all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
17911793
# TODO: may be able to change this loop to use np.take_along_dim
1792-
for token, token_str, logprobs_token in zip(
1793-
all_tokens, all_token_strs, all_logprobs
1794+
for idx, (token, token_str, logprobs_token) in enumerate(
1795+
zip(all_tokens, all_token_strs, all_logprobs)
17941796
):
17951797
if token == self.token_bos():
17961798
continue
1797-
text_offsets.append(text_offset)
1798-
text_offset += len(token_str)
1799+
text_offsets.append(
1800+
text_offset
1801+
+ len(
1802+
self.detokenize(all_tokens[:idx]).decode(
1803+
"utf-8", errors="ignore"
1804+
)
1805+
)
1806+
)
17991807
tokens.append(token_str)
18001808
sorted_logprobs = list(
18011809
sorted(

0 commit comments

Comments (0)