vocab : use string_view::find() to avoid unnecessary looking up beyond the fragment range (#12706) · ggml-org/llama.cpp@5dd5d1a · GitHub
[go: up one dir, main page]

Skip to content

Commit 5dd5d1a

Browse files
authored
vocab : use string_view::find() to avoid unnecessary looking up beyond the fragment range (#12706)
1 parent 1c05999 commit 5dd5d1a

File tree

1 file changed

+2
-4
lines changed

1 file changed

+2
-4
lines changed

src/llama-vocab.cpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2221,14 +2221,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
22212221
// find the first occurrence of a given special token in this fragment
22222222
// passing offset argument only limit the "search area" but match coordinates
22232223
// are still relative to the source full raw_text
2224-
auto match = raw_text.find(text, raw_text_base_offset);
2224+
// string_view begins at pos 0 for the same reason
2225+
auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
22252226

22262227
// no occurrences found, stop processing this fragment for a given special token
22272228
if (match == std::string::npos) break;
22282229

2229-
// check if match is within bounds of offset <-> length
2230-
if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
2231-
22322230
#ifdef PRETOKENIZERDEBUG
22332231
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
22342232
#endif

0 commit comments

Comments
 (0)
0