`server`: add `--reasoning-budget 0` to disable thinking (incl. qwen3… · ggml-org/llama.cpp@e121edc · GitHub
[go: up one dir, main page]

Skip to content

Commit e121edc

Browse files
ochafik, ochafik, ngxson
authored
server: add --reasoning-budget 0 to disable thinking (incl. qwen3 w/ enable_thinking:false) (#13771)
---------
Co-authored-by: ochafik <ochafik@google.com>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
1 parent 2f099b5 commit e121edc

File tree

12 files changed

+277
-107
lines changed

12 files changed

+277
-107
lines changed

common/arg.cpp

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2848,15 +2848,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
28482848
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
28492849
add_opt(common_arg(
28502850
{"--reasoning-format"}, "FORMAT",
2851-
"reasoning format (default: deepseek; allowed values: deepseek, none)\n"
2852-
"controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
2853-
"only supported for non-streamed responses",
2851+
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2852+
"- none: leaves thoughts unparsed in `message.content`\n"
2853+
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
2854+
"(default: deepseek)",
28542855
[](common_params & params, const std::string & value) {
28552856
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
28562857
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2857-
else { std::invalid_argument("invalid value"); }
2858+
else { throw std::invalid_argument("invalid value"); }
28582859
}
28592860
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
2861+
add_opt(common_arg(
2862+
{"--reasoning-budget"}, "N",
2863+
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
2864+
[](common_params & params, int value) {
2865+
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
2866+
params.reasoning_budget = value;
2867+
}
2868+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
28602869
add_opt(common_arg(
28612870
{"--chat-template"}, "JINJA_TEMPLATE",
28622871
string_format(
@@ -2955,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
29552964
[](common_params & params, const std::string & value) {
29562965
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
29572966
else if (value == "md") { params.batched_bench_output_jsonl = false; }
2958-
else { std::invalid_argument("invalid value"); }
2967+
else { throw std::invalid_argument("invalid value"); }
29592968
}
29602969
).set_examples({LLAMA_EXAMPLE_BENCH}));
29612970
add_opt(common_arg(

common/chat.cpp

Lines changed: 126 additions & 95 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ struct templates_params {
133133
bool stream;
134134
std::string grammar;
135135
bool add_generation_prompt = true;
136+
bool enable_thinking = true;
136137
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
137138
};
138139

@@ -573,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init(
573574
return tmpls;
574575
}
575576

576-
std::string common_chat_format_name(common_chat_format format) {
577+
const char * common_chat_format_name(common_chat_format format) {
577578
switch (format) {
578579
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
579580
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -591,6 +592,15 @@ std::string common_chat_format_name(common_chat_format format) {
591592
}
592593
}
593594

595+
const char * common_reasoning_format_name(common_reasoning_format format) {
596+
switch (format) {
597+
case COMMON_REASONING_FORMAT_NONE: return "none";
598+
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
599+
default:
600+
throw std::runtime_error("Unknown reasoning format");
601+
}
602+
}
603+
594604
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
595605
std::string arguments;
596606
if (builder.is_partial()) {
@@ -918,7 +928,13 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
918928
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
919929
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
920930
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
921-
data.thinking_forced_open = true;
931+
if (!inputs.enable_thinking) {
932+
data.prompt += "<|END_THINKING|>";
933+
} else {
934+
data.thinking_forced_open = true;
935+
}
936+
} else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
937+
data.prompt += "<|START_THINKING|><|END_THINKING|>";
922938
}
923939

924940
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1186,7 +1202,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
11861202
data.prompt = prompt;
11871203
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
11881204
if (string_ends_with(data.prompt, "<think>\n")) {
1189-
data.thinking_forced_open = true;
1205+
if (!inputs.enable_thinking) {
1206+
data.prompt += "</think>";
1207+
} else {
1208+
data.thinking_forced_open = true;
1209+
}
11901210
}
11911211

11921212
if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1460,104 +1480,114 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
14601480
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
14611481
common_chat_params data;
14621482

1463-
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1483+
json additional_context = {
1484+
{"enable_thinking", inputs.enable_thinking},
1485+
};
1486+
1487+
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
14641488
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
14651489
if (string_ends_with(data.prompt, "<think>\n")) {
1466-
data.thinking_forced_open = true;
1490+
if (!inputs.enable_thinking) {
1491+
data.prompt += "</think>";
1492+
} else {
1493+
data.thinking_forced_open = true;
1494+
}
14671495
}
14681496

1469-
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1470-
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1471-
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1472-
std::vector<std::string> tool_rules;
1473-
std::vector<std::string> tool_call_alts;
1474-
std::vector<std::string> escaped_names;
1475-
foreach_function(inputs.tools, [&](const json & tool) {
1476-
const auto & function = tool.at("function");
1477-
std::string name = function.at("name");
1478-
auto parameters = function.at("parameters");
1479-
builder.resolve_refs(parameters);
1480-
tool_rules.push_back(builder.add_schema(name + "-call", {
1481-
{"type", "object"},
1482-
{"properties", json {
1483-
{"name", json {{"const", name}}},
1484-
{"arguments", parameters},
1485-
}},
1486-
{"required", json::array({"name", "arguments"})},
1487-
}));
1488-
tool_call_alts.push_back(builder.add_rule(
1489-
name + "-function-tag",
1490-
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1491-
builder.add_schema(name + "-args", parameters) + " "
1492-
"\"</function>\" space"));
1497+
if (!inputs.tools.is_null()) {
1498+
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1499+
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1500+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1501+
std::vector<std::string> tool_rules;
1502+
std::vector<std::string> tool_call_alts;
1503+
std::vector<std::string> escaped_names;
1504+
foreach_function(inputs.tools, [&](const json & tool) {
1505+
const auto & function = tool.at("function");
1506+
std::string name = function.at("name");
1507+
auto parameters = function.at("parameters");
1508+
builder.resolve_refs(parameters);
1509+
tool_rules.push_back(builder.add_schema(name + "-call", {
1510+
{"type", "object"},
1511+
{"properties", json {
1512+
{"name", json {{"const", name}}},
1513+
{"arguments", parameters},
1514+
}},
1515+
{"required", json::array({"name", "arguments"})},
1516+
}));
1517+
tool_call_alts.push_back(builder.add_rule(
1518+
name + "-function-tag",
1519+
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1520+
builder.add_schema(name + "-args", parameters) + " "
1521+
"\"</function>\" space"));
14931522

1494-
data.grammar_triggers.push_back({
1495-
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1496-
"<function=" + name + ">",
1523+
data.grammar_triggers.push_back({
1524+
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1525+
"<function=" + name + ">",
1526+
});
1527+
auto escaped_name = regex_escape(name);
1528+
data.grammar_triggers.push_back({
1529+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1530+
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1531+
});
1532+
escaped_names.push_back(escaped_name);
14971533
});
1498-
auto escaped_name = regex_escape(name);
1534+
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1535+
std::vector<std::string> alt_tags {
1536+
any_tool_call,
1537+
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1538+
// The rest is just to accommodate common "good bad" outputs.
1539+
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1540+
"\"<response>\" space " + any_tool_call + " \"</response>\"",
1541+
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1542+
"\"<json>\" space " + any_tool_call + " \"</json>\"",
1543+
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1544+
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1545+
};
1546+
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1547+
tool_call_alts.push_back(wrappable_tool_call);
1548+
tool_call_alts.push_back(
1549+
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1550+
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1551+
builder.add_rule("root",
1552+
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1553+
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1554+
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
14991555
data.grammar_triggers.push_back({
1500-
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1501-
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1556+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1557+
// If thinking_forced_open, then we capture the </think> tag in the grammar,
1558+
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1559+
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1560+
"(\\s*"
1561+
"(?:<tool_call>"
1562+
"|<function"
1563+
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1564+
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1565+
")"
1566+
")[\\s\\S]*"
1567+
),
15021568
});
1503-
escaped_names.push_back(escaped_name);
1504-
});
1505-
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1506-
std::vector<std::string> alt_tags {
1507-
any_tool_call,
1508-
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1509-
// The rest is just to accommodate common "good bad" outputs.
1510-
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1511-
"\"<response>\" space " + any_tool_call + " \"</response>\"",
1512-
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1513-
"\"<json>\" space " + any_tool_call + " \"</json>\"",
1514-
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1515-
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1516-
};
1517-
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1518-
tool_call_alts.push_back(wrappable_tool_call);
1519-
tool_call_alts.push_back(
1520-
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1521-
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1522-
builder.add_rule("root",
1523-
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1524-
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1525-
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
1526-
data.grammar_triggers.push_back({
1527-
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1528-
// If thinking_forced_open, then we capture the </think> tag in the grammar,
1529-
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1530-
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1531-
"(\\s*"
1532-
"(?:<tool_call>"
1533-
"|<function"
1534-
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1535-
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1536-
")"
1537-
")[\\s\\S]*"
1538-
),
1569+
data.preserved_tokens = {
1570+
"<think>",
1571+
"</think>",
1572+
"<tool_call>",
1573+
"</tool_call>",
1574+
"<function",
1575+
"<tools>",
1576+
"</tools>",
1577+
"<response>",
1578+
"</response>",
1579+
"<function_call>",
1580+
"</function_call>",
1581+
"<json>",
1582+
"</json>",
1583+
"<JSON>",
1584+
"</JSON>",
1585+
"```",
1586+
"```json",
1587+
"```xml",
1588+
};
15391589
});
1540-
data.preserved_tokens = {
1541-
"<think>",
1542-
"</think>",
1543-
"<tool_call>",
1544-
"</tool_call>",
1545-
"<function",
1546-
"<tools>",
1547-
"</tools>",
1548-
"<response>",
1549-
"</response>",
1550-
"<function_call>",
1551-
"</function_call>",
1552-
"<json>",
1553-
"</json>",
1554-
"<JSON>",
1555-
"</JSON>",
1556-
"```",
1557-
"```json",
1558-
"```xml",
1559-
};
1560-
});
1590+
}
15611591

15621592
return data;
15631593
}
@@ -1669,6 +1699,7 @@ static common_chat_params common_chat_templates_apply_jinja(
16691699
params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
16701700
params.add_generation_prompt = inputs.add_generation_prompt;
16711701
params.tool_choice = inputs.tool_choice;
1702+
params.enable_thinking = inputs.enable_thinking;
16721703
params.grammar = inputs.grammar;
16731704
params.now = inputs.now;
16741705
if (!inputs.json_schema.empty()) {
@@ -1702,7 +1733,7 @@ static common_chat_params common_chat_templates_apply_jinja(
17021733
}
17031734

17041735
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
1705-
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
1736+
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
17061737
return common_chat_params_init_hermes_2_pro(tmpl, params);
17071738
}
17081739

@@ -1821,7 +1852,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
18211852
}
18221853

18231854
static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) {
1824-
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str());
1855+
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str());
18251856

18261857
switch (format) {
18271858
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
@@ -1858,7 +1889,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
18581889
common_chat_parse_command_r7b(builder);
18591890
break;
18601891
default:
1861-
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
1892+
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format));
18621893
}
18631894
builder.finish();
18641895
}

common/chat.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
123123
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
124124
bool parallel_tool_calls = false;
125125
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126+
bool enable_thinking = true;
126127
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
127128
};
128129

@@ -181,7 +182,8 @@ std::string common_chat_format_example(
181182
const struct common_chat_templates * tmpls,
182183
bool use_jinja);
183184

184-
std::string common_chat_format_name(common_chat_format format);
185+
const char* common_chat_format_name(common_chat_format format);
186+
const char* common_reasoning_format_name(common_reasoning_format format);
185187
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
186188

187189
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -368,6 +368,7 @@ struct common_params {
368368
bool use_jinja = false; // NOLINT
369369
bool enable_chat_template = true;
370370
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
371+
int reasoning_budget = -1;
371372
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
372373

373374
std::vector<std::string> api_keys;

0 commit comments

Comments
 (0)
0