Merge remote-tracking branch 'origin/master' into fix-chat-parsers · ggml-org/llama.cpp@e064360 · GitHub
[go: up one dir, main page]

Skip to content

Commit e064360

Browse files
committed
Merge remote-tracking branch 'origin/master' into fix-chat-parsers
2 parents 6c3f404 + 79c137f commit e064360

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+1973
-714
lines changed

common/arg.cpp

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2848,15 +2848,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
28482848
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
28492849
add_opt(common_arg(
28502850
{"--reasoning-format"}, "FORMAT",
2851-
"reasoning format (default: deepseek; allowed values: deepseek, none)\n"
2852-
"controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
2853-
"only supported for non-streamed responses",
2851+
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2852+
"- none: leaves thoughts unparsed in `message.content`\n"
2853+
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
2854+
"(default: deepseek)",
28542855
[](common_params & params, const std::string & value) {
28552856
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
28562857
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2857-
else { std::invalid_argument("invalid value"); }
2858+
else { throw std::invalid_argument("invalid value"); }
28582859
}
28592860
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
2861+
add_opt(common_arg(
2862+
{"--reasoning-budget"}, "N",
2863+
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
2864+
[](common_params & params, int value) {
2865+
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
2866+
params.reasoning_budget = value;
2867+
}
2868+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
28602869
add_opt(common_arg(
28612870
{"--chat-template"}, "JINJA_TEMPLATE",
28622871
string_format(
@@ -2955,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
29552964
[](common_params & params, const std::string & value) {
29562965
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
29572966
else if (value == "md") { params.batched_bench_output_jsonl = false; }
2958-
else { std::invalid_argument("invalid value"); }
2967+
else { throw std::invalid_argument("invalid value"); }
29592968
}
29602969
).set_examples({LLAMA_EXAMPLE_BENCH}));
29612970
add_opt(common_arg(

common/chat.cpp

Lines changed: 125 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ struct templates_params {
133133
bool stream;
134134
std::string grammar;
135135
bool add_generation_prompt = true;
136+
bool enable_thinking = true;
136137
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
137138
};
138139

@@ -573,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init(
573574
return tmpls;
574575
}
575576

576-
std::string common_chat_format_name(common_chat_format format) {
577+
const char * common_chat_format_name(common_chat_format format) {
577578
switch (format) {
578579
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
579580
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -591,6 +592,15 @@ std::string common_chat_format_name(common_chat_format format) {
591592
}
592593
}
593594

595+
const char * common_reasoning_format_name(common_reasoning_format format) {
596+
switch (format) {
597+
case COMMON_REASONING_FORMAT_NONE: return "none";
598+
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
599+
default:
600+
throw std::runtime_error("Unknown reasoning format");
601+
}
602+
}
603+
594604
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
595605
std::string arguments;
596606
if (builder.is_partial()) {
@@ -924,7 +934,13 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
924934
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
925935
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
926936
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
927-
data.thinking_forced_open = true;
937+
if (!inputs.enable_thinking) {
938+
data.prompt += "<|END_THINKING|>";
939+
} else {
940+
data.thinking_forced_open = true;
941+
}
942+
} else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
943+
data.prompt += "<|START_THINKING|><|END_THINKING|>";
928944
}
929945

930946
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1192,7 +1208,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
11921208
data.prompt = prompt;
11931209
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
11941210
if (string_ends_with(data.prompt, "<think>\n")) {
1195-
data.thinking_forced_open = true;
1211+
if (!inputs.enable_thinking) {
1212+
data.prompt += "</think>";
1213+
} else {
1214+
data.thinking_forced_open = true;
1215+
}
11961216
}
11971217

11981218
if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1477,104 +1497,114 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
14771497
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
14781498
common_chat_params data;
14791499

1480-
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1500+
json additional_context = {
1501+
{"enable_thinking", inputs.enable_thinking},
1502+
};
1503+
1504+
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
14811505
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
14821506
if (string_ends_with(data.prompt, "<think>\n")) {
1483-
data.thinking_forced_open = true;
1507+
if (!inputs.enable_thinking) {
1508+
data.prompt += "</think>";
1509+
} else {
1510+
data.thinking_forced_open = true;
1511+
}
14841512
}
14851513

1486-
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1487-
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1488-
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1489-
std::vector<std::string> tool_rules;
1490-
std::vector<std::string> tool_call_alts;
1491-
std::vector<std::string> escaped_names;
1492-
foreach_function(inputs.tools, [&](const json & tool) {
1493-
const auto & function = tool.at("function");
1494-
std::string name = function.at("name");
1495-
auto parameters = function.at("parameters");
1496-
builder.resolve_refs(parameters);
1497-
tool_rules.push_back(builder.add_schema(name + "-call", {
1498-
{"type", "object"},
1499-
{"properties", json {
1500-
{"name", json {{"const", name}}},
1501-
{"arguments", parameters},
1502-
}},
1503-
{"required", json::array({"name", "arguments"})},
1504-
}));
1505-
tool_call_alts.push_back(builder.add_rule(
1506-
name + "-function-tag",
1507-
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1508-
builder.add_schema(name + "-args", parameters) + " "
1509-
"\"</function>\" space"));
1514+
if (!inputs.tools.is_null()) {
1515+
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1516+
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1517+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1518+
std::vector<std::string> tool_rules;
1519+
std::vector<std::string> tool_call_alts;
1520+
std::vector<std::string> escaped_names;
1521+
foreach_function(inputs.tools, [&](const json & tool) {
1522+
const auto & function = tool.at("function");
1523+
std::string name = function.at("name");
1524+
auto parameters = function.at("parameters");
1525+
builder.resolve_refs(parameters);
1526+
tool_rules.push_back(builder.add_schema(name + "-call", {
1527+
{"type", "object"},
1528+
{"properties", json {
1529+
{"name", json {{"const", name}}},
1530+
{"arguments", parameters},
1531+
}},
1532+
{"required", json::array({"name", "arguments"})},
1533+
}));
1534+
tool_call_alts.push_back(builder.add_rule(
1535+
name + "-function-tag",
1536+
"\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1537+
builder.add_schema(name + "-args", parameters) + " "
1538+
"\"</function>\" space"));
15101539

1511-
data.grammar_triggers.push_back({
1512-
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1513-
"<function=" + name + ">",
1540+
data.grammar_triggers.push_back({
1541+
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1542+
"<function=" + name + ">",
1543+
});
1544+
auto escaped_name = regex_escape(name);
1545+
data.grammar_triggers.push_back({
1546+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1547+
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1548+
});
1549+
escaped_names.push_back(escaped_name);
15141550
});
1515-
auto escaped_name = regex_escape(name);
1551+
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1552+
std::vector<std::string> alt_tags {
1553+
any_tool_call,
1554+
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1555+
// The rest is just to accommodate common "good bad" outputs.
1556+
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1557+
"\"<response>\" space " + any_tool_call + " \"</response>\"",
1558+
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1559+
"\"<json>\" space " + any_tool_call + " \"</json>\"",
1560+
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1561+
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1562+
};
1563+
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1564+
tool_call_alts.push_back(wrappable_tool_call);
1565+
tool_call_alts.push_back(
1566+
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1567+
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1568+
builder.add_rule("root",
1569+
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1570+
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1571+
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
15161572
data.grammar_triggers.push_back({
1517-
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1518-
"<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1573+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1574+
// If thinking_forced_open, then we capture the </think> tag in the grammar,
1575+
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1576+
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1577+
"(\\s*"
1578+
"(?:<tool_call>"
1579+
"|<function"
1580+
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1581+
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1582+
")"
1583+
")[\\s\\S]*"
1584+
),
15191585
});
1520-
escaped_names.push_back(escaped_name);
1521-
});
1522-
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1523-
std::vector<std::string> alt_tags {
1524-
any_tool_call,
1525-
"\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1526-
// The rest is just to accommodate common "good bad" outputs.
1527-
"\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1528-
"\"<response>\" space " + any_tool_call + " \"</response>\"",
1529-
"\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1530-
"\"<json>\" space " + any_tool_call + " \"</json>\"",
1531-
"\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1532-
"\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1533-
};
1534-
auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1535-
tool_call_alts.push_back(wrappable_tool_call);
1536-
tool_call_alts.push_back(
1537-
"( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1538-
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1539-
builder.add_rule("root",
1540-
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1541-
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1542-
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
1543-
data.grammar_triggers.push_back({
1544-
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1545-
// If thinking_forced_open, then we capture the </think> tag in the grammar,
1546-
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1547-
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1548-
"(\\s*"
1549-
"(?:<tool_call>"
1550-
"|<function"
1551-
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1552-
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1553-
")"
1554-
")[\\s\\S]*"
1555-
),
1586+
data.preserved_tokens = {
1587+
"<think>",
1588+
"</think>",
1589+
"<tool_call>",
1590+
"</tool_call>",
1591+
"<function",
1592+
"<tools>",
1593+
"</tools>",
1594+
"<response>",
1595+
"</response>",
1596+
"<function_call>",
1597+
"</function_call>",
1598+
"<json>",
1599+
"</json>",
1600+
"<JSON>",
1601+
"</JSON>",
1602+
"```",
1603+
"```json",
1604+
"```xml",
1605+
};
15561606
});
1557-
data.preserved_tokens = {
1558-
"<think>",
1559-
"</think>",
1560-
"<tool_call>",
1561-
"</tool_call>",
1562-
"<function",
1563-
"<tools>",
1564-
"</tools>",
1565-
"<response>",
1566-
"</response>",
1567-
"<function_call>",
1568-
"</function_call>",
1569-
"<json>",
1570-
"</json>",
1571-
"<JSON>",
1572-
"</JSON>",
1573-
"```",
1574-
"```json",
1575-
"```xml",
1576-
};
1577-
});
1607+
}
15781608

15791609
return data;
15801610
}
@@ -1688,6 +1718,7 @@ static common_chat_params common_chat_templates_apply_jinja(
16881718
params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
16891719
params.add_generation_prompt = inputs.add_generation_prompt;
16901720
params.tool_choice = inputs.tool_choice;
1721+
params.enable_thinking = inputs.enable_thinking;
16911722
params.grammar = inputs.grammar;
16921723
params.now = inputs.now;
16931724
if (!inputs.json_schema.empty()) {
@@ -1721,7 +1752,7 @@ static common_chat_params common_chat_templates_apply_jinja(
17211752
}
17221753

17231754
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
1724-
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
1755+
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
17251756
return common_chat_params_init_hermes_2_pro(tmpl, params);
17261757
}
17271758

@@ -1840,7 +1871,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
18401871
}
18411872

18421873
static void common_chat_parse(common_chat_msg_parser & builder) {
1843-
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format).c_str(), builder.input().c_str());
1874+
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str());
18441875

18451876
switch (builder.syntax().format) {
18461877
case COMMON_CHAT_FORMAT_CONTENT_ONLY:

common/chat.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
123123
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
124124
bool parallel_tool_calls = false;
125125
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126+
bool enable_thinking = true;
126127
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
127128
};
128129

@@ -182,7 +183,8 @@ std::string common_chat_format_example(
182183
const struct common_chat_templates * tmpls,
183184
bool use_jinja);
184185

185-
std::string common_chat_format_name(common_chat_format format);
186+
const char* common_chat_format_name(common_chat_format format);
187+
const char* common_reasoning_format_name(common_reasoning_format format);
186188
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
187189

188190
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

common/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -849,7 +849,7 @@ std::string fs_get_cache_directory() {
849849
if (getenv("LLAMA_CACHE")) {
850850
cache_directory = std::getenv("LLAMA_CACHE");
851851
} else {
852-
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
852+
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
853853
if (std::getenv("XDG_CACHE_HOME")) {
854854
cache_directory = std::getenv("XDG_CACHE_HOME");
855855
} else {

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -368,6 +368,7 @@ struct common_params {
368368
bool use_jinja = false; // NOLINT
369369
bool enable_chat_template = true;
370370
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
371+
int reasoning_budget = -1;
371372
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
372373

373374
std::vector<std::string> api_keys;

0 commit comments

Comments
 (0)
0