llama : fix bpe tokenize from byte (#2889) · ggml-org/llama.cpp@3730134 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3730134

Browse files
authored
llama : fix bpe tokenize from byte (#2889)
1 parent d9151e6 commit 3730134

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

llama.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3366,9 +3366,15 @@ struct llm_tokenizer_bpe {
33663366
std::string byte_str(1, *j);
33673367
auto token_multibyte = vocab.token_to_id.find(byte_str);
33683368
if (token_multibyte == vocab.token_to_id.end()) {
3369-
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
3369+
try {
3370+
llama_token token_byte = llama_byte_to_token(vocab, *j);
3371+
output.push_back(token_byte);
3372+
} catch (const std::out_of_range & err) {
3373+
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
3374+
}
3375+
} else {
3376+
output.push_back((*token_multibyte).second);
33703377
}
3371-
output.push_back((*token_multibyte).second);
33723378
}
33733379
} else {
33743380
output.push_back((*token).second);

0 commit comments

Comments (0)
0