`server`: add `--reasoning-budget 0` to disable thinking (incl. qwen3 w/ enable_thinking:false) by ochafik · Pull Request #13771 · ggml-org/llama.cpp


Merged · 13 commits · May 25, 2025
Changes from 1 commit
server: add --reasoning-format=disabled to disable thinking (incl. qwen3 w/ enable_thinking:false)

ochafik committed May 25, 2025
commit b457f89e72035a7967e8feda2e2f914cde014527
1 change: 1 addition & 0 deletions common/arg.cpp
@@ -2854,6 +2854,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+           else if (value == "disabled") { params.reasoning_format = COMMON_REASONING_FORMAT_DISABLED; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
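With this commit, `--reasoning-format` accepts a third value, `disabled`, alongside `deepseek` and `none`; per the hunk above it is also reachable through the `LLAMA_ARG_THINK` environment variable. A minimal launch sketch (the binary and model paths are illustrative assumptions, not taken from this PR):

```python
import subprocess

# Sketch: start llama-server with template-level thinking disabled.
# Paths are placeholders; --jinja enables chat-template rendering, which is
# where enable_thinking takes effect.
subprocess.run([
    "./build/bin/llama-server",
    "-m", "models/Qwen3-0.6B.gguf",
    "--jinja",
    "--reasoning-format", "disabled",
])
```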
36 changes: 32 additions & 4 deletions common/chat.cpp
@@ -133,6 +133,7 @@ struct templates_params {
    bool stream;
    std::string grammar;
    bool add_generation_prompt = true;
+   bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};

@@ -591,6 +592,16 @@ std::string common_chat_format_name(common_chat_format format) {
    }
}

+std::string common_reasoning_format_name(common_reasoning_format format) {
+    switch (format) {
+        case COMMON_REASONING_FORMAT_NONE:     return "none";
+        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+        case COMMON_REASONING_FORMAT_DISABLED: return "disabled";
+        default:
+            throw std::runtime_error("Unknown reasoning format");
+    }
+}
+
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
    std::string arguments;
    if (builder.is_partial()) {
@@ -918,7 +929,11 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
    data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
    if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
-       data.thinking_forced_open = true;
+       if (!inputs.enable_thinking) {
+           data.prompt += "<|END_THINKING|>";
+       } else {
+           data.thinking_forced_open = true;
+       }
    }

    data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1186,7 +1201,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
    if (string_ends_with(data.prompt, "<think>\n")) {
-       data.thinking_forced_open = true;
+       if (!inputs.enable_thinking) {
+           data.prompt += "</think>";
+       } else {
+           data.thinking_forced_open = true;
+       }
    }

    if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1460,10 +1479,18 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

-   data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+   json additional_context = {
+       {"enable_thinking", inputs.enable_thinking},
+   };
+
+   data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
    data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
    if (string_ends_with(data.prompt, "<think>\n")) {
-       data.thinking_forced_open = true;
+       if (!inputs.enable_thinking) {
+           data.prompt += "</think>";
+       } else {
+           data.thinking_forced_open = true;
+       }
    }

    if (!inputs.tools.is_null()) {
@@ -1671,6 +1698,7 @@ static common_chat_params common_chat_templates_apply_jinja(
    params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
    params.add_generation_prompt = inputs.add_generation_prompt;
    params.tool_choice = inputs.tool_choice;
+   params.enable_thinking = inputs.reasoning_format != COMMON_REASONING_FORMAT_DISABLED;
    params.grammar = inputs.grammar;
    params.now = inputs.now;
    if (!inputs.json_schema.empty()) {
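The Command R7B, DeepSeek R1, and Hermes 2 Pro handlers above all apply the same pattern: if the rendered prompt ends with a template-forced thinking opener and thinking is disabled, the opener is closed immediately; otherwise the tag is flagged as forced open for the reasoning parser. A Python paraphrase of that C++ logic, included here only as a sketch:

```python
def handle_forced_thinking(prompt: str, enable_thinking: bool,
                           open_tag: str, close_tag: str) -> tuple[str, bool]:
    """Paraphrase of the C++ pattern above: close a template-forced thinking
    tag when thinking is disabled, otherwise record that a think block is
    already open so the parser treats the output accordingly."""
    thinking_forced_open = False
    if prompt.endswith(open_tag):
        if not enable_thinking:
            prompt += close_tag  # the model sees an empty, closed think block
        else:
            thinking_forced_open = True
    return prompt, thinking_forced_open

# DeepSeek R1:  handle_forced_thinking(p, False, "<think>\n", "</think>")
# Command R7B:  handle_forced_thinking(p, False, "<|START_THINKING|>", "<|END_THINKING|>")
```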
2 changes: 2 additions & 0 deletions common/chat.h
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
    bool parallel_tool_calls = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+   bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};

@@ -182,6 +183,7 @@ std::string common_chat_format_example(
    bool use_jinja);

std::string common_chat_format_name(common_chat_format format);
+std::string common_reasoning_format_name(common_reasoning_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
1 change: 1 addition & 0 deletions common/common.h
@@ -216,6 +216,7 @@ struct common_params_vocoder {
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+   COMMON_REASONING_FORMAT_DISABLED, // Disable thinking (causes any thinking tag to be closed, or empty thinking tags to be inserted, depending on the model)
};

struct common_params {
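Note the distinction between the two non-default values: `none` leaves thinking enabled and merely skips extraction of `reasoning_content`, while the new `disabled` value is the only one that maps to `enable_thinking = false` in `common_chat_templates_apply_jinja` above. A one-line restatement of that mapping, using the CLI spellings:

```python
# Restatement of `params.enable_thinking = inputs.reasoning_format !=
# COMMON_REASONING_FORMAT_DISABLED` from chat.cpp.
def enable_thinking(reasoning_format: str) -> bool:
    return reasoning_format != "disabled"

assert enable_thinking("deepseek") is True
assert enable_thinking("none") is True
assert enable_thinking("disabled") is False
```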
2 changes: 1 addition & 1 deletion tools/server/server.cpp
@@ -178,7 +178,7 @@ struct slot_params {
            {"grammar_triggers", grammar_triggers},
            {"preserved_tokens", sampling.preserved_tokens},
            {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
-           {"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")},
+           {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
            {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
            {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
            {"samplers", samplers},
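With `common_reasoning_format_name` in place, the slot-params JSON reported by the server can round-trip all three values instead of collapsing everything outside `deepseek` to `none`. An illustrative fragment of that JSON (field values are examples, not captured server output):

```python
# Illustrative shape of the relevant slot-params fields after this change.
slot_params = {
    "chat_format": "DeepSeek R1",
    "reasoning_format": "disabled",  # previously only "deepseek" or "none"
    "reasoning_in_content": False,
    "thinking_forced_open": False,
}
```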
32 changes: 32 additions & 0 deletions tools/server/tests/unit/test_template.py
@@ -25,6 +25,38 @@ def create_server():
    server.n_slots = 1


+@pytest.mark.parametrize("template_name,enable_thinking,expected_end", [
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", True,  "<think>\n"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", False, "<think>\n</think>"),
+
+    ("Qwen-Qwen3-0.6B", True,  "<|im_start|>assistant\n"),
+    ("Qwen-Qwen3-0.6B", False, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
+
+    ("Qwen-QwQ-32B", True,  "<|im_start|>assistant\n<think>\n"),
+    ("Qwen-QwQ-32B", False, "<|im_start|>assistant\n<think>\n</think>"),
+
+    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", True,  "<|START_THINKING|>"),
+    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use-think", False, "<|START_THINKING|><|END_THINKING|>"),
+])
+def test_enable_thinking(template_name: str, enable_thinking: bool, expected_end: str):
+    global server
+    server.jinja = True
+    server.reasoning_format = 'deepseek' if enable_thinking else 'disabled'
+    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "user", "content": "What is today?"},
+        ],
+        "tools": [TEST_TOOL],
+    })
+    assert res.status_code == 200
+    prompt = res.body["prompt"]
+
+    assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'"
+
+
@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
@pytest.mark.parametrize("template_name,format", [
    ("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"),
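The Qwen3 rows above exercise the second mechanism: instead of prompt surgery, the template itself reads `enable_thinking` from the render context (passed via `additional_context` in the Hermes 2 Pro handler) and emits an empty, pre-closed think block. A toy jinja2 sketch of that behavior; the template text is a stand-in, not the real Qwen3 template:

```python
from jinja2 import Template

# Toy stand-in for the tail of a Qwen3-style chat template: when
# enable_thinking is false, the template emits an empty, closed think block.
tmpl = Template(
    "<|im_start|>assistant\n"
    "{% if not enable_thinking %}<think>\n\n</think>\n\n{% endif %}"
)

assert tmpl.render(enable_thinking=False) == "<|im_start|>assistant\n<think>\n\n</think>\n\n"
assert tmpl.render(enable_thinking=True) == "<|im_start|>assistant\n"
```

The expected suffixes in the parametrize table match this rendering, which is presumably why the test only needs to compare prompt endings per template.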
2 changes: 1 addition & 1 deletion tools/server/tests/utils.py
@@ -84,7 +84,7 @@ class ServerProcess:
    draft_max: int | None = None
    no_webui: bool | None = None
    jinja: bool | None = None
-   reasoning_format: Literal['deepseek', 'none'] | None = None
+   reasoning_format: Literal['deepseek', 'none', 'disabled'] | None = None
    chat_template: str | None = None
    chat_template_file: str | None = None
    server_path: str | None = None