Misc. bug: llama_tokenize parse_special is ignored in newer versions · Issue #12743 · ggml-org/llama.cpp · GitHub

Misc. bug: llama_tokenize parse_special is ignored in newer versions #12743


Closed

RobbyCBennett opened this issue Apr 3, 2025 · 1 comment

RobbyCBennett commented Apr 3, 2025

Name and Version

llama.cpp commit: 5dd5d1a

Operating systems

Linux

Which llama.cpp modules do you know to be affected?

libllama (core library)

Command line

build/bin/llama-simple -m Llama-3.2-1B-Instruct-Q8_0.gguf

Problem description & steps to reproduce

Steps to reproduce:

  1. Build and run the code below, which prints control tokens in its output even though it shouldn't (a shorter check that isolates the parse_special flag is sketched after the listing).
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
    printf("\n");
}

static void discard_log(ggml_log_level level, const char *text, void *user_data)
{}

int main(int argc, char ** argv) {
    llama_log_set(discard_log, nullptr);

    // path to the model gguf file
    std::string model_path;
    // prompt to generate text from
    std::string prompt = "<|im_start|>system\nAct as a professional programmer, and answer questions based on the given prompt and data. Here are some rules you must always follow in your responses:\n- Generate only the requested output. Don't include any other language before or after the requested output. Answer in markdown language.\n- Only answer in plain English in markdown. Never respond in code. Never give code snippets.\n<|im_end|>\n<|im_start|>user\nProvide a VERY SHORT overview of the following local object called \"cp2\". Do not state the obvious. Do not respond with a title/header.\n```c\n    register char *cp1, *cp2;\n\n```\n<|im_end|>\n<|im_start|>assistant\n";
    // number of layers to offload to the GPU
    int ngl = 99;
    // number of tokens to predict
    int n_predict = 1024;

    // parse command line arguments

    {
        int i = 1;
        for (; i < argc; i++) {
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_path = argv[++i];
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-n") == 0) {
                if (i + 1 < argc) {
                    try {
                        n_predict = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-ngl") == 0) {
                if (i + 1 < argc) {
                    try {
                        ngl = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else {
                // prompt starts here
                break;
            }
        }
        if (model_path.empty()) {
            print_usage(argc, argv);
            return 1;
        }
        if (i < argc) {
            prompt = argv[i++];
            for (; i < argc; i++) {
                prompt += " ";
                prompt += argv[i];
            }
        }
    }

    // load dynamic backends

    ggml_backend_load_all();

    // initialize the model

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // tokenize the prompt

    // find the number of tokens in the prompt (add_special = false, parse_special = true)
    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, false, true);

    // allocate space for the tokens and tokenize the prompt
    std::vector<llama_token> prompt_tokens(n_prompt);
    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), false, true) < 0) {
        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
        return 1;
    }

    // initialize the context

    llama_context_params ctx_params = llama_context_default_params();
    // n_ctx is the context size
    ctx_params.n_ctx = n_prompt + n_predict - 1;
    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
    ctx_params.n_batch = n_prompt;
    // enable performance counters
    ctx_params.no_perf = false;

    llama_context * ctx = llama_init_from_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    // initialize the sampler

    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;
    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // prepare a batch for the prompt

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    // main loop

    const auto t_main_start = ggml_time_us();
    int n_decode = 0;
    llama_token new_token_id;

    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }

        n_pos += batch.n_tokens;

        // sample the next token
        {
            new_token_id = llama_sampler_sample(smpl, ctx, -1);

            // is it an end of generation?
            if (llama_vocab_is_eog(vocab, new_token_id)) {
                break;
            }

            char buf[128];
            // lstrip = 0, special = true (control tokens are rendered as text)
            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
            if (n < 0) {
                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
                return 1;
            }
            std::string s(buf, n);
            printf("%s", s.c_str());
            fflush(stdout);

            // prepare the next batch with the sampled token
            batch = llama_batch_get_one(&new_token_id, 1);

            n_decode += 1;
        }
    }

    printf("\n");

    const auto t_main_end = ggml_time_us();

    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);

    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_model_free(model);

    return 0;
}
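
Not part of the original report: below is a minimal sketch, assuming the current llama.h API and the same GGUF model, that isolates the parse_special flag. It tokenizes a short string containing a special marker once with parse_special = false and once with true, and uses llama_vocab_is_control to report which of the resulting tokens are control tokens. With parse_special = true the marker should come back as a single token flagged (ctrl); if both runs print the same plain-text token sequence, the flag is being ignored as described above. The <|begin_of_text|> marker is an assumption and should be replaced with a special token that actually exists in the model's vocabulary.

// Sketch only, not from the issue: assumes the current llama.h API and that the
// "<|begin_of_text|>" marker is present in the model's vocabulary.
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    ggml_backend_load_all();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // CPU is enough, this only tokenizes

    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const char * text = "<|begin_of_text|>hello"; // assumed special marker + plain text
    const int text_len = (int) strlen(text);

    for (int parse_special = 0; parse_special <= 1; ++parse_special) {
        std::vector<llama_token> tokens(64);
        const int n = llama_tokenize(vocab, text, text_len, tokens.data(), (int) tokens.size(),
                                     /*add_special*/ false, /*parse_special*/ parse_special != 0);
        printf("parse_special=%d -> %d tokens:", parse_special, n);
        for (int i = 0; i < n; ++i) {
            // mark token ids that the vocab classifies as control tokens
            printf(" %d%s", tokens[i], llama_vocab_is_control(vocab, tokens[i]) ? "(ctrl)" : "");
        }
        printf("\n");
    }

    llama_model_free(model);
    return 0;
}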

First Bad Commit

No response

Relevant log output

The output contains control tokens such as the following.

<|im_end|>
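
Not part of the original report: a hypothetical helper, assuming the current llama.h API, that can be called from the sampling loop above (for example inspect_token(vocab, new_token_id) right after sampling) to distinguish a generated control token from literal "<|im_end|>" text. It renders the token both with and without the special flag of llama_token_to_piece and reports whether the vocab classifies it as a control token.

// Sketch only, not from the issue: hypothetical helper for the sampling loop above.
#include "llama.h"
#include <cstdio>
#include <string>

static void inspect_token(const llama_vocab * vocab, llama_token token) {
    char with_special[128];
    char without_special[128];
    const int n1 = llama_token_to_piece(vocab, token, with_special,    sizeof(with_special),    0, true);
    const int n2 = llama_token_to_piece(vocab, token, without_special, sizeof(without_special), 0, false);
    const std::string s1 = n1 >= 0 ? std::string(with_special,    n1) : "<error>";
    const std::string s2 = n2 >= 0 ? std::string(without_special, n2) : "<error>";
    // llama_vocab_is_control reports whether the vocab marks this token id as a control token
    fprintf(stderr, "token %d: control=%d piece(special=true)=\"%s\" piece(special=false)=\"%s\"\n",
            token, llama_vocab_is_control(vocab, token) ? 1 : 0, s1.c_str(), s2.c_str());
}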

This issue was closed because it has been inactive for 14 days since being marked as stale.
