Add vocab utils · qeleb/llama-cpp-python@7ae9a3e · GitHub

Commit 7ae9a3e

Add vocab utils
1 parent 85caba5 commit 7ae9a3e

1 file changed: +75 -0 lines changed

llama_cpp/_internals.py

Lines changed: 75 additions & 0 deletions
@@ -536,6 +536,81 @@ def copy_logits(self, logits: npt.NDArray[np.single]):
         self.candidates.size = llama_cpp.c_size_t(self.n_vocab)
 
 
+# Python wrappers over common/common
+def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> list[int]:
+    n_tokens = len(text) + 1 if add_bos else len(text)
+    result = (llama_cpp.llama_token * n_tokens)()
+    n_tokens = llama_cpp.llama_tokenize(
+        model.model,
+        text.encode("utf-8"),
+        len(text),
+        result,
+        n_tokens,
+        add_bos,
+        special,
+    )
+    if n_tokens < 0:
+        result = (llama_cpp.llama_token * -n_tokens)()
+        check = llama_cpp.llama_tokenize(
+            model.model,
+            text.encode("utf-8"),
+            len(text),
+            result,
+            len(result),
+            add_bos,
+            special,
+        )
+        if check != -n_tokens:
+            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
+    else:
+        result = result[:n_tokens]
+    return list(result)
+
+
+def _token_to_piece(model: _LlamaModel, token: int) -> str:
+    assert model.model is not None
+    result = (ctypes.c_char * 8)(0)
+    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+    if n_tokens < 0:
+        result = (ctypes.c_char * -n_tokens)(0)
+        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+        if check != -n_tokens:
+            raise RuntimeError(f"Failed to get piece: token={token}")
+    else:
+        result = result[:n_tokens]
+    return bytes(result).decode("utf-8")
+
+
+def _detokenize_spm(model: _LlamaModel, tokens: List[int]) -> str:
+    bos_id = model.token_bos()
+    result = ""
+    for i, token in enumerate(tokens):
+        piece = _token_to_piece(model, token)
+        if (
+            (tokens[0] == bos_id and i == 1) or (tokens[0] != bos_id and i == 0)
+        ) and piece[0] == " ":
+            piece = piece[1:]
+        result += piece
+    return result
+
+
+def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str:
+    result = ""
+    for token in tokens:
+        piece = _token_to_piece(model, token)
+        result += piece
+    return result
+
+
+def _should_add_bos(model: _LlamaModel) -> bool:
+    assert model.model is not None
+    add_bos = llama_cpp.llama_add_bos_token(model.model)
+    if add_bos != -1:
+        return add_bos != 0
+    else:
+        return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM
+
+
 # Python wrappers over common/sampling structs
 
 
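A rough usage sketch for the new helpers (not part of the commit): the hypothetical _roundtrip_example below assumes a _LlamaModel instance has already been constructed elsewhere in llama_cpp/_internals.py, and it picks the detokenizer based on the vocab type, mirroring the fallback used by _should_add_bos.

def _roundtrip_example(model: _LlamaModel, text: str) -> str:
    # Add BOS only if the model's vocab asks for it (or is SPM, per the fallback).
    add_bos = _should_add_bos(model)
    tokens = _tokenize(model, text, add_bos, special=False)
    # The SPM variant strips the leading space of the first real piece;
    # the BPE variant concatenates pieces as-is.
    if llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM:
        return _detokenize_spm(model, tokens)
    return _detokenize_bpe(model, tokens)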
0 commit comments