8000 server: fix streaming crashes by ochafik · Pull Request #13786 · ggml-org/llama.cpp · GitHub
[go: up one dir, main page]

Skip to content

server: fix streaming crashes #13786

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 26, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
allow all parsers to parse non-tool-call content.
  • Loading branch information
ochafik committed May 26, 2025
commit 6f8c7aa0368367bd504b9538f24934f995a0fef7
1 change: 1 addition & 0 deletions common/chat-parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class common_chat_msg_parser {
const std::string & healing_marker() const { return healing_marker_; }
const bool & is_partial() const { return is_partial_; }
const common_chat_msg & result() const { return result_; }
const common_chat_syntax & syntax() const { return syntax_; }

void move_to(size_t pos) {
if (pos > input_.size()) {
Expand Down
40 changes: 35 additions & 5 deletions common/chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,10 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
return data;
}
static void common_chat_parse_generic(common_chat_msg_parser & builder) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
static const std::vector<std::vector<std::string>> content_paths = {
{"response"},
};
Expand Down Expand Up @@ -892,6 +896,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
return data;
}
// Parse Mistral-Nemo output: tool calls are emitted as a JSON array
// prefixed by the literal "[TOOL_CALLS]" marker.
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    if (builder.syntax().parse_tool_calls) {
        // Marker regex is built once; regex_escape guards the [ ] metacharacters.
        static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
        parse_prefixed_json_tool_call_array(builder, prefix);
        return;
    }
    // Tool-call parsing disabled: treat the whole remainder as plain content.
    builder.add_content(builder.consume_rest());
}
Expand Down Expand Up @@ -1104,6 +1113,11 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}

static const common_regex function_regex(
"\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
static const common_regex close_regex("\\}\\s*");
Expand Down Expand Up @@ -1225,6 +1239,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
}
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}

static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
static const common_regex tool_calls_end("<|tool▁calls▁end|>");
Expand Down Expand Up @@ -1286,6 +1304,10 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
return data;
}
// Parse FireFunction-v2 output: tool calls are a JSON array prefixed by
// " functools[" (the trailing '[' is stripped so the array itself parses).
static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) {
    if (builder.syntax().parse_tool_calls) {
        // Built once on first use; regex_escape neutralizes the '[' metacharacter.
        static const common_regex prefix(regex_escape(" functools["));
        parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1);
        return;
    }
    // Tool-call parsing disabled: everything left is plain content.
    builder.add_content(builder.consume_rest());
}
Expand Down Expand Up @@ -1427,6 +1449,10 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
return data;
}
static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
// This version of Functionary still supports the llama 3.1 tool call format for the python tool.
static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));

Expand Down Expand Up @@ -1554,6 +1580,10 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
}
static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}

static const common_regex open_regex(
"(?:"
Expand Down Expand Up @@ -1809,10 +1839,10 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.add_content(builder.consume_rest());
}

static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) {
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str());
static void common_chat_parse(common_chat_msg_parser & builder) {
LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format).c_str(), builder.input().c_str());

switch (format) {
switch (builder.syntax().format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
common_chat_parse_content_only(builder);
break;
Expand Down Expand Up @@ -1847,15 +1877,15 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
common_chat_parse_command_r7b(builder);
break;
default:
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
builder.finish();
}

common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
common_chat_msg_parser builder(input, is_partial, syntax);
try {
common_chat_parse(builder, syntax.format);
common_chat_parse(builder);
} catch (const common_chat_msg_partial_exception & ex) {
LOG_DBG("Partial parse: %s\n", ex.what());
if (!is_partial) {
Expand Down
1 change: 1 addition & 0 deletions common/chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ struct common_chat_syntax {
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
};

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
Expand Down
72 changes: 46 additions & 26 deletions tests/test-chat.cpp
(table cell markup removed — broken HTML residue from the diff view for tests/test-chat.cpp, line 625)
Original file line number Diff line number Diff line change
Expand Up @@ -401,9 +401,12 @@ static common_chat_msg simple_assist_msg(const std::string & content, const std:
}
return msg;
}
const common_chat_msg message_assist = simple_assist_msg("Hello, world!\nWhat's up?");
const common_chat_msg message_assist_empty = simple_assist_msg("");
const common_chat_msg message_assist_thoughts_unparsed_deepseek = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?");
const common_chat_msg message_assist = simple_assist_msg("Hello, world!\nWhat's up?");
const common_chat_msg message_assist_empty = simple_assist_msg("");
const common_chat_msg message_assist_thoughts_unparsed_deepseek = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?");
const common_chat_msg message_assist_thoughts_unparsed_md = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```");
const common_chat_msg message_assist_thoughts_unparsed_md_partial = simple_assist_msg("<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}");

const common_chat_msg message_assist_thoughts_unparsed_r7b = simple_assist_msg("<|START_THINKING|>I'm\nthinking<|END_THINKING|>Hello, world!\nWhat's up?");
const common_chat_msg message_assist_thoughts = simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking");
const common_chat_msg message_assist_thoughts_unopened_unparsed = simple_assist_msg("I'm\nthinking</think>Hello, world!\nWhat's up?");
Expand Down Expand Up @@ -591,8 +594,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
common_chat_parse(
Expand All @@ -619,8 +620,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_call_idx,
common_chat_parse(
Expand All @@ -632,8 +631,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_no_content,
common_chat_parse(
Expand All @@ -644,8 +641,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));

test_templates(tmpls.get(), end_tokens, message_assist_call_idx, tools,
Expand Down Expand Up @@ -675,6 +670,18 @@ static void test_template_output_parsers() {

// Generic tool calls doesn't generate / parse content-only messages symmetrically.

assert_equals(
simple_assist_msg("{ \"tool_call\" : { \"name\" : \"t"),
common_chat_parse(
"{ \"tool_call\" : { \"name\" : \"t",
/* is_partial= */ true,
{
/* .format = */ COMMON_CHAT_FORMAT_GENERIC,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
/* .parse_tool_calls = */ false,
}));
assert_equals(
message_assist_empty,
common_chat_parse(
Expand Down Expand Up @@ -776,8 +783,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(
simple_assist_msg("Let's call something\n"),
Expand All @@ -788,8 +793,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_call_thoughts,
common_chat_parse(
Expand Down Expand Up @@ -979,7 +982,34 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
/* is_partial= */ true,
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
}));
assert_msg_equals(message_assist_thoughts_unparsed_md,
common_chat_parse(
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```",
/* is_partial= */ false,
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ true,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
}));
assert_msg_equals(message_assist_thoughts_unparsed_md_partial,
common_chat_parse(
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```",
/* is_partial= */ true,
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ true,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_unopened_unparsed,
Expand All @@ -989,8 +1019,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
Expand Down Expand Up @@ -1187,8 +1215,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts_unopened_unparsed,
common_chat_parse(
Expand All @@ -1197,8 +1223,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
Expand Down Expand Up @@ -1252,8 +1276,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
assert_msg_equals(message_assist_thoughts,
common_chat_parse(
Expand Down Expand Up @@ -1295,8 +1317,6 @@ static void test_template_output_parsers() {
{
/* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
}));
test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
"<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
Expand Down
1 change: 1 addition & 0 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ struct server_task {
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream;
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}

{
Expand Down
7 changes: 5 additions & 2 deletions tools/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,8 +732,11 @@ static json oaicompat_chat_params_parse(
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.reasoning_format = opt.reasoning_format;
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
if (body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
llama_params["parse_tool_calls"] = true;
}

// if the assistant message appears at the end of list, we do not add end-of-turn token
Expand Down
163B
0