diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 737bae27c7206..560df0cefb73d 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -11,6 +11,9 @@ using json = nlohmann::ordered_json; +const char * DOTALL = "[\\U00000000-\\U0010FFFF]"; +const char * DOT = "[^\\x0A\\x0D]"; + template static std::string join(Iterator begin, Iterator end, const std::string & separator); @@ -160,6 +163,29 @@ static std::string format_literal(const std::string & literal) { return "\"" + escaped + "\""; } +/* + not_literal('a') -> '[^a]' + not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' +*/ +// static std::string not_literal(const std::string & literal, bool dotall = true) { +// assert(literal.size() > 0); +// std::stringstream out; +// std::function recurse = [&](size_t i) { +// const char & c = literal[i]; +// out << "[^" << c << "]"; +// out << " " << (dotall ? DOTALL : DOT) << "*"; +// if (i < literal.size() - 1) { +// out << " | " << format_literal(literal.substr(i, 1)) << " ("; +// recurse(i + 1); +// out << ")?"; +// } +// }; +// out << "("; +// recurse(0); +// out << ")"; +// return out.str(); +// } + class SchemaConverter { private: @@ -171,22 +197,6 @@ class SchemaConverter { std::vector _errors; std::vector _warnings; - std::string _add_rule(const std::string & name, const std::string & rule) { - std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-"); - if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) { - _rules[esc_name] = rule; - return esc_name; - } else { - int i = 0; - while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) { - i++; - } - std::string key = esc_name + std::to_string(i); - _rules[key] = rule; - return key; - } - } - std::string _generate_union_rule(const std::string & name, const std::vector & alt_schemas) { std::vector rules; for (size_t i = 0; i < alt_schemas.size(); i++) { @@ -219,11 +229,11 @@ class SchemaConverter { auto get_dot = [&]() { std::string rule; if (_dotall) { - rule = "[\\U00000000-\\U0010FFFF]"; + rule = DOTALL; } else { - rule = "[^\\x0A\\x0D]"; + rule = DOT; } - return _add_rule("dot", rule); + return add_rule("dot", rule); }; // Joins the sequence, merging consecutive literals together. @@ -340,7 +350,7 @@ class SchemaConverter { if (!sub_is_literal) { std::string & sub_id = sub_rule_ids[sub]; if (sub_id.empty()) { - sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); + sub_id = add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); } sub = sub_id; } @@ -385,7 +395,7 @@ class SchemaConverter { } return join_seq(); }; - return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); + return add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); } std::string _resolve_ref(const std::string & ref) { @@ -413,7 +423,7 @@ class SchemaConverter { const auto &prop_schema = kv.second; std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name); - prop_kv_rule_names[prop_name] = _add_rule( + prop_kv_rule_names[prop_name] = add_rule( name + (name.empty() ? "" : "-") + prop_name + "-kv", format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name ); @@ -426,7 +436,7 @@ class SchemaConverter { if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { std::string sub_name = name + (name.empty() ? 
"" : "-") + "additional"; std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); - std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); + std::string kv_rule = add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); prop_kv_rule_names["*"] = kv_rule; optional_props.push_back("*"); } @@ -453,7 +463,7 @@ class SchemaConverter { std::string k = ks[0]; std::string kv_rule_name = prop_kv_rule_names[k]; if (k == "*") { - res = _add_rule( + res = add_rule( name + (name.empty() ? "" : "-") + "additional-kvs", kv_rule_name + " ( \",\" space " + kv_rule_name + " )*" ); @@ -463,7 +473,7 @@ class SchemaConverter { res = kv_rule_name; } if (ks.size() > 1) { - res += " " + _add_rule( + res += " " + add_rule( name + (name.empty() ? "" : "-") + k + "-rest", get_recursive_refs(std::vector(ks.begin() + 1, ks.end()), true) ); @@ -489,7 +499,7 @@ class SchemaConverter { } std::string _add_primitive(const std::string & name, const BuiltinRule & rule) { - auto n = _add_rule(name, rule.content); + auto n = add_rule(name, rule.content); for (const auto & dep : rule.deps) { BuiltinRule dep_rule; auto it = PRIMITIVE_RULES.find(dep); @@ -577,6 +587,62 @@ class SchemaConverter { visit_refs(schema); } +/* + reply ::= prefix tool-call* + + prefix ::= [^<] prefix + | "<" [^t] prefix + | "] prefix + | + +*/ + + std::string not_literal(const std::string & literal) { + auto rule_name = _find_rule_name("not" + literal, "!!!"); + std::stringstream out; + for (size_t i = 0, n = literal.size(); i < n; i++) { + out << " | "; + if (i > 0) { + out << format_literal(literal.substr(0, i)) << " "; + } + out << "[^" << literal[i] << "] " << rule_name.c_str(); + } + _rules[rule_name] = out.str(); + return rule_name; + } + + std::string _escape_name(const std::string & name) { + return regex_replace(name, INVALID_RULE_CHARS_RE, "-"); + } + std::string _find_rule_name(const std::string & name, const std::string & rule) { + auto esc_name = _escape_name(name); + int i = 0; + while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) { + i++; + } + return esc_name + std::to_string(i); + } + std::string add_rule(const std::string & name, const std::string & rule) { + auto esc_name = _escape_name(name); + if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) { + _rules[esc_name] = rule; + return esc_name; + } else { + auto key = _find_rule_name(esc_name, rule); + _rules[key] = rule; + return key; + } + } + std::string _generate_constant_rule(const json & value) { return format_literal(value.dump()); } @@ -587,24 +653,24 @@ class SchemaConverter { std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name; if (schema.contains("$ref")) { - return _add_rule(rule_name, _resolve_ref(schema["$ref"])); + return add_rule(rule_name, _resolve_ref(schema["$ref"])); } else if (schema.contains("oneOf") || schema.contains("anyOf")) { std::vector alt_schemas = schema.contains("oneOf") ? 
schema["oneOf"].get>() : schema["anyOf"].get>(); - return _add_rule(rule_name, _generate_union_rule(name, alt_schemas)); + return add_rule(rule_name, _generate_union_rule(name, alt_schemas)); } else if (schema_type.is_array()) { std::vector schema_types; for (const auto & t : schema_type) { schema_types.push_back({{"type", t}}); } - return _add_rule(rule_name, _generate_union_rule(name, schema_types)); + return add_rule(rule_name, _generate_union_rule(name, schema_types)); } else if (schema.contains("const")) { - return _add_rule(rule_name, _generate_constant_rule(schema["const"])); + return add_rule(rule_name, _generate_constant_rule(schema["const"])); } else if (schema.contains("enum")) { std::vector enum_values; for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); + return add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); } else if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { @@ -622,7 +688,7 @@ class SchemaConverter { properties.emplace_back(prop.key(), prop.value()); } } - return _add_rule(rule_name, + return add_rule(rule_name, _build_object_rule( properties, required, name, schema.contains("additionalProperties") ? schema["additionalProperties"] : json())); @@ -653,7 +719,7 @@ class SchemaConverter { add_component(t, true); } } - return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); + return add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) { json items = schema.contains("items") ? schema["items"] : schema["prefixItems"]; if (items.is_array()) { @@ -665,14 +731,14 @@ class SchemaConverter { rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i)); } rule += " \"]\" space"; - return _add_rule(rule_name, rule); + return add_rule(rule_name, rule); } else { std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); int min_items = schema.contains("minItems") ? schema["minItems"].get() : 0; json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); int max_items = max_items_json.is_number_integer() ? max_items_json.get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); + return add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); } } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { return _visit_pattern(schema["pattern"], rule_name); @@ -680,14 +746,14 @@ class SchemaConverter { return _add_primitive(rule_name == "root" ? 
"root" : schema_format, PRIMITIVE_RULES.at("uuid")); } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) { auto prim_name = schema_format + "-string"; - return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); + return add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) { std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); + return add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); } else if (schema.empty() || schema_type == "object") { - return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); + return add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); } else { if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get()) == PRIMITIVE_RULES.end()) { _errors.push_back("Unrecognized schema: " + schema.dump()); @@ -724,3 +790,45 @@ std::string json_schema_to_grammar(const json & schema) { converter.check_errors(); return converter.format_grammar(); } + +std::string tool_call_grammar(const json & tools, bool allow_parallel_calls, bool allow_content) { + SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false); + + std::vector tool_rules; + + for (const auto & tool : tools) { + const auto & function = tool["function"]; + std::string name = function["name"]; + std::string description = function.contains("description") ? function["description"] : ""; + auto parameters_copy = function["parameters"]; + converter.resolve_refs(parameters_copy, name); + + tool_rules.push_back(converter.visit(json { + {"type", "object"}, + {"description", description}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters_copy}, + }}, + {"required", json::array({"name", "arguments"})}, + }, name + "-tool-call")); + } + + converter.add_rule( + "root", + (allow_content ? converter.not_literal("") + " | " : "") + + build_repetition( + converter.add_rule( + "tool_call", + "\"\" (" + + join(tool_rules.begin(), tool_rules.end(), " | ") + + ") \"\"" + ), + allow_content ? 0 : 1, + allow_parallel_calls ? std::numeric_limits::max() : 1, + " \"\\n\" " + )); + + converter.check_errors(); + return converter.format_grammar(); +} diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index 41623b3464528..e0219cecef809 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -5,4 +5,5 @@ #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +std::string tool_call_grammar(const nlohmann::ordered_json & tools, bool allow_parallel_calls = false, bool allow_content = true); std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/examples/agent/README.md b/examples/agent/README.md new file mode 100644 index 0000000000000..c2df9c8f55526 --- /dev/null +++ b/examples/agent/README.md @@ -0,0 +1,218 @@ +# examples.agent: Interactive agent that can use Python tools! 
+ +Have any LLM use local (sandboxed) tools, with a simple CLI. + +```bash +python -m examples.agent \ + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ + --tools examples/agent/tools/example_math_tools.py \ + --goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What's a third of the result?" \ + --greedy +``` + +
+Show output + +```bash +💭 First, I will calculate the square of 2535, then add it to 32222000403. After that, I will multiply the result by 1.5 and finally, I will divide the result by 3. +⚙️ pow(value=2535, power=2) -> 6426225.0 +💭 Now that I have calculated the square of 2535, I will calculate the sum of 6426225 and 32222000403. +⚙️ add(a=6426225, b=32222000403) -> 32228426628 +💭 Now that I have calculated the sum, I will multiply it by 1.5. +⚙️ multiply(a=32228426628, b=1.5) -> 48342639942.0 +💭 Now that I have calculated the product, I will divide it by 3. +⚙️ divide(a=48342639942.0, b=3) -> 16114213314.0 +➡️ "\nThe result of the calculation is 16114213314.0." +``` + +
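+Each ⚙️ line above is a call into a plain, type-annotated Python function from the `--tools` file. For reference, the `add` tool used here is declared in [example_math_tools.py](./tools/example_math_tools.py) as:
+
+```python
+def add(a: float, b: float) -> float:
+    """
+    Add a and b reliably.
+    Don't use this tool to compute the square of a number (use multiply or pow instead)
+    """
+    return a + b
+```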
+ +```bash +python -m examples.agent \ + --tools examples/agent/tools/fake_weather_tools.py \ + --goal "What is the weather going to be like in San Francisco and Glasgow over the next 4 days." \ + --greedy +``` + +
+Show output + +```bash +💭 I will first get the current weather in San Francisco, then get the 4-day weather forecast for both San Francisco and Glasgow. +⚙️ get_current_weather(location=San Francisco, format=fahrenheit) -> ... +💭 I will first get the current weather in San Francisco, then get the 4-day weather forecast for both San Francisco and Glasgow. +⚙️ get_n_day_weather_forecast(location=San Francisco, format=fahrenheit, num_days=4) -> ... +💭 I will first get the current weather in San Francisco, then get the 4-day weather forecast for both San Francisco and Glasgow. +⚙️ get_n_day_weather_forecast(location=Glasgow, format=celsius, num_days=4) -> ... +The current weather in San Francisco is sunny and 87.8F. Here is the 4-day weather forecast: + +For San Francisco: +- In 1 day: Cloudy, 60.8F +- In 2 days: Sunny, 73.4F +- In 3 days: Cloudy, 62.6F + +For Glasgow: +- In 1 day: Cloudy, 16C +- In 2 days: Sunny, 23C +- In 3 days: Cloudy, 17C +``` + +
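+The function's type hints and docstring are all the model gets to see: the agent turns each tool into an OpenAI-style function definition, deriving the JSON parameters schema with pydantic. A rough sketch of that derivation, mirroring what [agent.py](./agent.py) does (the exact schema layout depends on your pydantic version):
+
+```python
+from typing import Literal
+from pydantic import TypeAdapter
+
+def get_current_weather(location: str, format: Literal["celsius", "fahrenheit"]) -> str:
+    '''Get the current weather'''
+    return 'Sunny, 31C'
+
+ta = TypeAdapter(get_current_weather)
+print(get_current_weather.__doc__)  # becomes the tool's description
+print(ta.json_schema())             # becomes the tool's "parameters" schema
+```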
+ + +```bash +python -m examples.agent \ + --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf \ + --std_tools \ + --goal "Wait 10sec then say Hi out loud" \ + --greedy +``` + +
+Show output + +```bash +``` + +
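+Besides the built-in `--std_tools`, tool files passed via `--tools` are plain Python modules: every top-level, type-annotated function they define is exposed to the model, with its docstring used as the tool description. A minimal, hypothetical module you could drop next to the examples above:
+
+```python
+# my_tools.py (hypothetical)
+def count_words(text: str) -> int:
+    """Count the number of whitespace-separated words in a piece of text."""
+    return len(text.split())
+```
+
+Then point the agent at it with `--tools my_tools.py`.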
+ +## Prerequisites + +Note: To get conda, just install Miniforge (it's OSS): https://github.com/conda-forge/miniforge + +```bash +conda create -n agent python=3.11 +conda activate agent +pip install -r examples/agent/requirements.txt +pip install -r examples/openai/requirements.txt +``` + +## Components + +This example relies on the new [OpenAI compatibility server](../openai). + +``` + agent.py → examples.openai → server.cpp + → safe_tools.py + → ( run_sandboxed_tools.sh : Docker → fastify.py ) → unsafe_tools.py → code interpreter, etc... +``` + +The agent can use tools written in Python, or (soon) exposed under OpenAPI endpoints. Only has standard Python deps (e.g. no langchain) + +- Can call into any OpenAI endpoint that supports tool calling, spawns a local one if `--endpoint` isn't specified +(can pass all llama.cpp params) + +- [Standard tools](./tools/std.py) include "safe" TTS, wait for/until helpers, and *requesting user input*. + +- Tools are often "unsafe" (e.g. [Python execution functions](./tools/unsafe_python_tools.py)), +so we provide a script to run them in a Docker-sandboxed environment, exposed as an OpenAPI server: + + ```bash + # With limactl, the default sandbox location ~/.llama.cpp/sandbox won't be writable + # (see https://github.com/lima-vm/lima/discussions/393) + # export DATA_DIR=/tmp/lima/llama.cpp/sandbox + PORT=9999 examples/agent/run_sandboxed_tools.sh \ + examples/agent/tools/unsafe_python_tools.py & + + python -m examples.agent \ + --tools http://localhost:9999 \ + --goal "Whats cos(123) / 23 * 12.6 ?" + ``` + +
+ Show output + + ``` + 💭 Calculate the expression using Python + ⚙️ execute_python(source="import math\nresult = math.cos(123) / 23 * 12.6") -> {'result': -0.4864525314920599} + ➡️ "-0.4864525314920599" + ``` + +
+ + - [fastify.py](./fastify.py) turns a python module into an [OpenAPI](https://www.openapis.org/) endpoint using [FastAPI](https://fastapi.tiangolo.com/) + + - [run_sandboxed_tools.sh](./run_sandboxed_tools.sh) builds and runs a Docker environment with fastify inside it, and exposes its port locally + +- Beyond just "tools", output format can be constrained using [JSON schemas](https://json-schema.org/) or [Pydantic](https://docs.pydantic.dev/latest/) types + + ```bash + python -m examples.agent \ + --tools examples/agent/tools/example_summaries.py \ + --format PyramidalSummary \ + --goal "Create a pyramidal summary of Mankind's recent advancements" + ``` + +## Launch parts separately + +If you'd like to debug each binary separately (rather than have an agent spawning an OAI compat proxy spawning a C++ server), you can run these commands: + +```bash +# C++ server +make -j server +./server \ + --model mixtral.gguf \ + --metrics \ + -ctk q4_0 \ + -ctv f16 \ + -c 32768 \ + --port 8081 + +# OpenAI compatibility layer +python -m examples.openai \ + --port 8080 \ + --endpoint http://localhost:8081 \ + --template-hf-model-id-fallback mistralai/Mixtral-8x7B-Instruct-v0.1 + +# Or have the OpenAI compatibility layer spawn the C++ server under the hood: +# python -m examples.openai --model mixtral.gguf + +# Agent itself: +python -m examples.agent \ + --endpoint http://localhost:8080 \ + --tools examples/agent/tools/example_summaries.py \ + --format PyramidalSummary \ + --goal "Create a pyramidal summary of Mankind's recent advancements" +``` + +## Use existing tools (WIP) + +```bash +git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling +``` + +Then edit `examples/agents/hermes_function_calling/utils.py`: + +```py +log_folder = os.environ.get('LOG_FOLDER', os.path.join(script_dir, "inference_logs")) +``` + +Then run tools in a sandbox: + +```bash +REQUIREMENTS_FILE=<( cat examples/agents/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) \ + examples/agents/run_sandboxed_tools.sh \ + examples/agents/hermes_function_calling/functions.py \ + -e LOG_FOLDER=/data/inference_logs +``` + +## TODO + +- Wait for spawned servers to be healthy + +- Add model URL / HF loading support + +- Add Embedding endpoint + storage / retrieval tools (Faiss? 
ScaNN?), or spontaneous RAG + +- Auto discover tools exposed by an OpenAPI endpoint + +- Add a Python notebook tool example + +- Update `run_sandboxed_tools.sh` to support dev mode (`uvicorn fastify:app --reload`) + +- Follow-ups (depending on the vibe) + + - Remove OAI support from server + + - Remove non-Python json schema to grammar converters + diff --git a/examples/agent/__main__.py b/examples/agent/__main__.py new file mode 100644 index 0000000000000..299acbd43259e --- /dev/null +++ b/examples/agent/__main__.py @@ -0,0 +1,6 @@ +import typer + +from examples.agent.agent import main + +if __name__ == "__main__": + typer.run(main) diff --git a/examples/agent/agent.py b/examples/agent/agent.py new file mode 100644 index 0000000000000..b3a5c6d4b9c23 --- /dev/null +++ b/examples/agent/agent.py @@ -0,0 +1,329 @@ +from pathlib import Path +import sys +from time import sleep +import typer +from pydantic import BaseModel, Json, TypeAdapter +from pydantic_core import SchemaValidator, core_schema +from typing import Annotated, Any, Callable, Dict, List, Union, Optional, Type +import json, requests + +from examples.agent.openapi_client import OpenAPIMethod, openapi_methods_from_endpoint +from examples.agent.tools.std_tools import StandardTools +from examples.openai.api import ChatCompletionRequest, ChatCompletionResponse, Message, ResponseFormat, Tool, ToolFunction +from examples.agent.utils import collect_functions, load_module +from examples.openai.prompting import ToolsPromptStyle +from examples.openai.subprocesses import spawn_subprocess + +def make_call_adapter(ta: TypeAdapter, fn: Callable[..., Any]): + args_validator = SchemaValidator(core_schema.call_schema( + arguments=ta.core_schema['arguments_schema'], + function=fn, + )) + return lambda **kwargs: args_validator.validate_python(kwargs) + +def completion_with_tool_usage( + *, + response_model: Optional[Union[Json[Any], type]]=None, + max_iterations: Optional[int]=None, + tools: List[Callable[..., Any]], + endpoint: str, + messages: List[Message], + auth: Optional[str], + verbose: bool, + assume_llama_cpp_server: bool = False, + **kwargs): + ''' + Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support + (llama.cpp server, llama-cpp-python, Anyscale / Together...) 
+ + The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) + ''' + response_format = None + type_adapter = None + if response_model: + if isinstance(response_model, dict): + schema = response_model + else: + type_adapter = TypeAdapter(response_model) + schema = type_adapter.json_schema() + response_format=ResponseFormat(type="json_object", schema=schema) + + tool_map = {} + tools_schemas = [] + for fn in tools: + if isinstance(fn, OpenAPIMethod): + tool_map[fn.__name__] = fn + parameters_schema = fn.parameters_schema + else: + ta = TypeAdapter(fn) + tool_map[fn.__name__] = make_call_adapter(ta, fn) + parameters_schema = ta.json_schema() + if verbose: + sys.stderr.write(f'# PARAMS SCHEMA ({fn.__name__}): {json.dumps(parameters_schema, indent=2)}\n') + tools_schemas.append( + Tool( + type="function", + function=ToolFunction( + name=fn.__name__, + description=fn.__doc__ or '', + parameters=parameters_schema, + ) + ) + ) + + i = 0 + while (max_iterations is None or i < max_iterations): + request = ChatCompletionRequest( + messages=messages, + response_format=response_format, + tools=tools_schemas if tools_schemas else None, + cache_prompt=True, + **kwargs, + ) + if verbose: + sys.stderr.write(f'# REQUEST: {request.model_dump_json(indent=2)}\n') + headers = { + "Content-Type": "application/json", + } + if auth: + headers["Authorization"] = auth + + def drop_nones(o): + if isinstance(o, BaseModel): + return drop_nones(o.model_dump()) + if isinstance(o, list): + return [drop_nones(i) for i in o if i is not None] + if isinstance(o, dict): + return { + k: drop_nones(v) + for k, v in o.items() + if v is not None + } + return o + + if assume_llama_cpp_server: + body = request.model_dump() + else: + # request_dict = request.model_dump() + # body = drop_nones(request) + tools_arg = None + tool_choice = request.tool_choice + response_format = None + if request.tools: + tools_arg = drop_nones(request.tools) + if request.response_format: + response_format = { + 'type': request.response_format.type, + } + if request.response_format.schema: + assert tools_arg is None + assert tool_choice is None + tools_arg = [{ + "type": "function", + "function": { + "name": "output", + "description": "A JSON object", + "parameters": request.response_format.schema, + } + }] + tool_choice = "output" + + body = drop_nones(dict( + messages=drop_nones(request.messages), + model=request.model, + tools=tools_arg, + tool_choice=tool_choice, + temperature=request.temperature, + response_format=response_format, + )) + + if verbose: + sys.stderr.write(f'# POSTing to {endpoint}/v1/chat/completions\n') + sys.stderr.write(f'# HEADERS: {headers}\n') + sys.stderr.write(f'# BODY: {json.dumps(body, indent=2)}\n') + + response = requests.post( + f'{endpoint}/v1/chat/completions', + headers=headers, + json=body, + ) + response.raise_for_status() + response_json = response.json() + response = ChatCompletionResponse(**response_json) + if verbose: + sys.stderr.write(f'# RESPONSE: {response.model_dump_json(indent=2)}\n') + if response.error: + raise Exception(f'Inference failed: {response.error.message}') + + assert len(response.choices) == 1 + choice = response.choices[0] + + content = choice.message.content + if choice.finish_reason == "tool_calls": + messages.append(choice.message) + for tool_call in choice.message.tool_calls: + if content: + print(f'💭 {content}') + + args = json.loads(tool_call.function.arguments) + pretty_call = f'{tool_call.function.name}({", 
".join(f"{k}={v.model_dump_json() if isinstance(v, BaseModel) else json.dumps(v)}" for k, v in args.items())})' + sys.stdout.write(f'⚙️ {pretty_call}') + sys.stdout.flush() + tool_result = tool_map[tool_call.function.name](**args) + sys.stdout.write(f" → {tool_result}\n") + messages.append(Message( + tool_call_id=tool_call.id, + role="tool", + name=tool_call.function.name, + content=f'{tool_result}', + # content=f'{pretty_call} = {tool_result}', + )) + else: + assert content + result = type_adapter.validate_json(content) if type_adapter else content + return result + + i += 1 + + if max_iterations is not None: + raise Exception(f"Failed to get a valid response after {max_iterations} tool calls") + + +def main( + goal: Annotated[str, typer.Option()], + tools: Optional[List[str]] = None, + format: Annotated[Optional[str], typer.Option(help="The output format: either a Python type (e.g. 'float' or a Pydantic model defined in one of the tool files), or a JSON schema, e.g. '{\"format\": \"date\"}'")] = None, + max_iterations: Optional[int] = 10, + std_tools: Optional[bool] = False, + auth: Optional[str] = None, + parallel_calls: Optional[bool] = False, + verbose: bool = False, + style: Optional[ToolsPromptStyle] = None, + assume_llama_cpp_server: Optional[bool] = None, + + model: Optional[Annotated[str, typer.Option("--model", "-m")]] = None,# = "models/7B/ggml-model-f16.gguf", + model_url: Optional[Annotated[str, typer.Option("--model-url", "-mu")]] = None, + hf_repo: Optional[Annotated[str, typer.Option("--hf-repo", "-hfr")]] = None, + hf_file: Optional[Annotated[str, typer.Option("--hf-file", "-hff")]] = None, + + endpoint: Optional[str] = None, + context_length: Optional[int] = None, + # endpoint: str = 'http://localhost:8080/v1/chat/completions', + + greedy: Optional[bool] = True, + + n_predict: Optional[int] = 1000, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + min_p: Optional[float] = None, + tfs_z: Optional[float] = None, + typical_p: Optional[float] = None, + temperature: Optional[float] = 0, + dynatemp_range: Optional[float] = None, + dynatemp_exponent: Optional[float] = None, + repeat_last_n: Optional[int] = None, + repeat_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, + presense_penalty: Optional[float] = None, + mirostat: Optional[bool] = None, + mirostat_tau: Optional[float] = None, + mirostat_eta: Optional[float] = None, + penalize_nl: Optional[bool] = None, + n_keep: Optional[int] = None, + seed: Optional[int] = None, + n_probs: Optional[int] = None, + min_keep: Optional[int] = None, +): + if greedy: + top_k = 1 + top_p = 0.0 + + if not endpoint: + server_port = 8080 + server_host = 'localhost' + assume_llama_cpp_server = True + endpoint = f'http://{server_host}:{server_port}' + if verbose: + sys.stderr.write(f"# Starting C++ server with model {model} on {endpoint}\n") + cmd = [ + "python", "-m", "examples.openai.server", + "--model", model, + *(['--verbose'] if verbose else []), + *(['--parallel-calls'] if parallel_calls else []), + *([f'--context-length={context_length}'] if context_length else []), + *([f'--style={style.value}'] if style else []), + ] + spawn_subprocess(cmd) + sleep(5) + + tool_functions = [] + types: Dict[str, type] = {} + for f in (tools or []): + if f.startswith('http://') or f.startswith('https://'): + tool_functions.extend(openapi_methods_from_endpoint(f)) + else: + module = load_module(f) + tool_functions.extend(collect_functions(module)) + types.update({ + k: v + for k, v in module.__dict__.items() + 
if isinstance(v, type) + }) + + if std_tools: + tool_functions.extend(collect_functions(StandardTools)) + + sys.stdout.write(f'🛠️ {", ".join(fn.__name__ for fn in tool_functions)}\n') + + response_model: Union[type, Json[Any]] = None #str + if format: + if format in types: + response_model = types[format] + elif format == 'json': + response_model = {} + else: + try: + response_model = json.loads(format) + except: + response_model = eval(format) + + + result = completion_with_tool_usage( + model="gpt-4o", + endpoint=endpoint, + response_model=response_model, + max_iterations=max_iterations, + tools=tool_functions, + auth=auth, + verbose=verbose, + assume_llama_cpp_server=assume_llama_cpp_server or False, + + n_predict=n_predict, + top_k=top_k, + top_p=top_p, + min_p=min_p, + tfs_z=tfs_z, + typical_p=typical_p, + temperature=temperature, + dynatemp_range=dynatemp_range, + dynatemp_exponent=dynatemp_exponent, + repeat_last_n=repeat_last_n, + repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presense_penalty=presense_penalty, + mirostat=mirostat, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + penalize_nl=penalize_nl, + n_keep=n_keep, + seed=seed, + n_probs=n_probs, + min_keep=min_keep, + messages=[Message(role="user", content=goal)], + ) + print(result if response_model else f'➡️ {result}') + # exit(0) + +if __name__ == '__main__': + typer.run(main) + diff --git a/examples/agent/fastify-requirements.txt b/examples/agent/fastify-requirements.txt new file mode 100644 index 0000000000000..28604117432f8 --- /dev/null +++ b/examples/agent/fastify-requirements.txt @@ -0,0 +1,5 @@ +fastapi[all] +pydantic +sse-starlette +uvicorn[all] +typer[all] diff --git a/examples/agent/fastify.py b/examples/agent/fastify.py new file mode 100644 index 0000000000000..cf02ccc3102ff --- /dev/null +++ b/examples/agent/fastify.py @@ -0,0 +1,46 @@ +''' + Binds the functions of a python script as a FastAPI server. 
+ + This is useful in combination w/ the examples/agent/run_sandboxed_tools.sh +''' +import os +import fastapi, uvicorn +import typer +from typing import Type, List + +from examples.agent.utils import load_module + +def bind_functions(app, module): + for k in dir(module): + if k.startswith('_'): + continue + if k == k.capitalize(): + continue + v = getattr(module, k) + if not callable(v) or isinstance(v, type): + continue + if not hasattr(v, '__annotations__'): + continue + + vt = type(v) + if vt.__module__ == 'langchain_core.tools' and vt.__name__.endswith('Tool') and hasattr(v, 'func') and callable(v.func): + v = v.func + + print(f'INFO: Binding /{k}') + try: + app.post('/' + k)(v) + except Exception as e: + print(f'WARNING: Failed to bind /{k}\n\t{e}') + +def main(files: List[str], host: str = '0.0.0.0', port: int = 8000): + app = fastapi.FastAPI() + + for f in files: + bind_functions(app, load_module(f)) + + print(f'INFO: CWD = {os.getcwd()}') + uvicorn.run(app, host=host, port=port) + +if __name__ == '__main__': + typer.run(main) + diff --git a/examples/agent/openapi_client.py b/examples/agent/openapi_client.py new file mode 100644 index 0000000000000..d336c7436dec7 --- /dev/null +++ b/examples/agent/openapi_client.py @@ -0,0 +1,89 @@ + +import json +import requests +import urllib + + +class OpenAPIMethod: + def __init__(self, url, name, descriptor, catalog): + self.url = url + self.__name__ = name + + assert 'post' in descriptor, 'Only POST methods are supported' + post_descriptor = descriptor['post'] + + self.__doc__ = post_descriptor['description'] + parameters = post_descriptor.get('parameters', []) + request_body = post_descriptor.get('requestBody') + + self.parameters = {p['name']: p for p in parameters} + assert all(param['in'] == 'query' for param in self.parameters.values()), f'Only query path parameters are supported (path: {url}, descriptor: {json.dumps(descriptor)})' + + self.body = None + if request_body: + assert 'application/json' in request_body['content'], f'Only application/json is supported for request body (path: {url}, descriptor: {json.dumps(descriptor)})' + + body_name = 'body' + i = 2 + while body_name in self.parameters: + body_name = f'body{i}' + i += 1 + + self.body = dict( + name=body_name, + required=request_body['required'], + schema=request_body['content']['application/json']['schema'], + ) + + self.parameters_schema = dict( + type='object', + properties={ + **({ + self.body['name']: self.body['schema'] + } if self.body else {}), + **{ + name: param['schema'] + for name, param in self.parameters.items() + } + }, + components=catalog.get('components'), + required=[name for name, param in self.parameters.items() if param['required']] + ([self.body['name']] if self.body and self.body['required'] else []) + ) + + def __call__(self, **kwargs): + if self.body: + body = kwargs.pop(self.body['name'], None) + if self.body['required']: + assert body is not None, f'Missing required body parameter: {self.body["name"]}' + else: + body = None + + query_params = {} + for name, param in self.parameters.items(): + value = kwargs.pop(name, None) + if param['required']: + assert value is not None, f'Missing required parameter: {name}' + + assert param['in'] == 'query', 'Only query parameters are supported' + query_params[name] = value + + params = "&".join(f"{name}={urllib.parse.quote(value)}" for name, value in query_params.items()) + url = f'{self.url}?{params}' + response = requests.post(url, json=body) + response.raise_for_status() + response_json = response.json() + + 
return response_json + + +def openapi_methods_from_endpoint(url): + catalog_url = f'{url}/openapi.json' + catalog_response = requests.get(catalog_url) + catalog_response.raise_for_status() + catalog = catalog_response.json() + + methods = [ + OpenAPIMethod(url=f'{url}{path}', name=path.replace('/', ' ').strip().replace(' ', '_'), descriptor=descriptor, catalog=catalog) + for path, descriptor in catalog['paths'].items() + ] + return methods diff --git a/examples/agent/requirements.txt b/examples/agent/requirements.txt new file mode 100644 index 0000000000000..6ff121e93ec28 --- /dev/null +++ b/examples/agent/requirements.txt @@ -0,0 +1,8 @@ +fastapi[all] +gguf +jinja2 +pydantic +requests +sse-starlette +uvicorn[all] +typer[all] diff --git a/examples/agent/run_sandboxed_tools.sh b/examples/agent/run_sandboxed_tools.sh new file mode 100755 index 0000000000000..4f502e12dbe89 --- /dev/null +++ b/examples/agent/run_sandboxed_tools.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Runs a Python script in a sandboxed environment and makes its functions available as a web service. +# +# git submodule add https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling +# python examples/openai/fastify.py examples/openai/hermes_function_calling/functions.py +# REQUIREMENTS_FILE=<( cat examples/openai/hermes_function_calling/requirements.txt | grep -vE "bitsandbytes|flash-attn" ) examples/agents/run_sandboxed_tools.sh examples/agents/hermes_function_calling/functions.py -e LOG_FOLDER=/data/inference_logs +set -euo pipefail + +script="$( realpath "$1" )" +script_folder="$(dirname "$script")" +shift 1 + +function cleanup { + rm -rf "$BUILD_DIR" + echo "Deleted $BUILD_DIR" +} +trap cleanup EXIT +BUILD_DIR=$(mktemp -d) +DATA_DIR="${DATA_DIR:-$HOME/.llama.cpp/sandbox}" +SCRIPT_DIR=$( cd "$(dirname "$0")" ; pwd ) + +mkdir -p "$DATA_DIR" + +REQUIREMENTS_FILE="${REQUIREMENTS_FILE:-}" +if [[ -z "$REQUIREMENTS_FILE" && -f "$script_folder/requirements.txt" ]]; then + REQUIREMENTS_FILE="$script_folder/requirements.txt" +fi +if [[ -n "$REQUIREMENTS_FILE" ]]; then + cp "$REQUIREMENTS_FILE" "$BUILD_DIR/script-requirements.txt" +else + touch $BUILD_DIR/script-requirements.txt +fi + +echo "INFO: using DATA_DIR: $DATA_DIR" + +cp \ + "$SCRIPT_DIR/fastify-requirements.txt" \ + "$SCRIPT_DIR/fastify.py" \ + "$SCRIPT_DIR/utils.py" \ + "$BUILD_DIR" + +mkdir -p "$DATA_DIR" + +readonly PORT=${PORT:-8088} +readonly LLAMA_IMAGE_NAME=llama.cpp/tools-base + +echo " + FROM ${BASE_IMAGE:-python:3.11-slim} + RUN apt-get update + RUN apt-get install -y gcc python3-dev git cmake + RUN pip install --upgrade pip + RUN pip install packaging numpy + RUN mkdir /src /data + + # Copy resources in increasing likelihood of change, to keep as much as possible cached + COPY fastify-requirements.txt /root/ + RUN pip install -r /root/fastify-requirements.txt + COPY script-requirements.txt /root/ + RUN pip install -r /root/script-requirements.txt + COPY fastify.py utils.py /root/examples/agent/ + + WORKDIR /data + ENTRYPOINT PYTHONPATH=/src:/root python -m examples.agent.fastify --port=$PORT '/src/$( basename "$script" )' +" | docker build "$BUILD_DIR" -f - -t "$LLAMA_IMAGE_NAME" + +echo "#" +echo "# Binding $script to http://localhost:$PORT/" +echo "#" +set -x +docker run \ + "$@" \ + --mount "type=bind,source=$( realpath "$script_folder" ),target=/src,readonly" \ + --mount "type=bind,source=$DATA_DIR,target=/data" \ + -p "$PORT:$PORT" \ + -it "$LLAMA_IMAGE_NAME" diff --git a/examples/agent/tools/example_math_tools.py 
b/examples/agent/tools/example_math_tools.py new file mode 100644 index 0000000000000..4361328bc1c0f --- /dev/null +++ b/examples/agent/tools/example_math_tools.py @@ -0,0 +1,23 @@ +import math + +def add(a: float, b: float) -> float: + """ + Add a and b reliably. + Don't use this tool to compute the square of a number (use multiply or pow instead) + """ + return a + b + +def multiply(a: float, b: float) -> float: + """Multiply a with b reliably""" + return a * b + +def divide(a: float, b: float) -> float: + """Divide a by b reliably""" + return a / b + +def pow(value: float, power: float) -> float: + """ + Raise a value to a power (exponent) reliably. + The square of x is pow(x, 2), its cube is pow(x, 3), etc. + """ + return math.pow(value, power) diff --git a/examples/agent/tools/example_summaries.py b/examples/agent/tools/example_summaries.py new file mode 100644 index 0000000000000..a1df1121b7713 --- /dev/null +++ b/examples/agent/tools/example_summaries.py @@ -0,0 +1,16 @@ + +from typing import Annotated, List, Optional +from annotated_types import MinLen +from pydantic import BaseModel + + +class QAPair(BaseModel): + question: str + concise_answer: str + justification: str + +class PyramidalSummary(BaseModel): + title: str + summary: str + question_answers: Annotated[List[QAPair], MinLen(2)] + sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]] diff --git a/examples/agent/tools/fake_weather_tools.py b/examples/agent/tools/fake_weather_tools.py new file mode 100644 index 0000000000000..0154d966ded63 --- /dev/null +++ b/examples/agent/tools/fake_weather_tools.py @@ -0,0 +1,36 @@ + +import random +from typing import Literal + + +def _weather(w: str, temp, format): + return f'{w}, {temp}C' if format == 'celsius' \ + else f'{w}, {(temp * 9/5) + 32}F' + +def get_current_weather(location: str, format: Literal["celsius", "fahrenheit"]) -> str: + ''' + Get the current weather + + Args: + location: The city and state, e.g. San Francisco, CA + format: The temperature unit to use. Infer this from the users location. + ''' + return _weather('Sunny', 31, format) + +def get_n_day_weather_forecast(location: str, format: Literal["celsius", "fahrenheit"], num_days: int) -> str: + ''' + Get an N-day weather forecast + + Args: + location: The city and state, e.g. San Francisco, CA + format: The temperature unit to use. Infer this from the users location. 
+ num_days: The number of days to forecast + ''' + random.seed(123) + return '\n'.join([ + f'{num_days} forecast for {location}:', + *( + f'- in {i} day{"s" if i > 1 else ""}: {_weather("Sunny" if i % 2 == 0 else "Cloudy", random.randrange(15, 35), format)}' + for i in range(1, num_days) + ) + ]) diff --git a/examples/agent/tools/std_tools.py b/examples/agent/tools/std_tools.py new file mode 100644 index 0000000000000..f4ee850365c6d --- /dev/null +++ b/examples/agent/tools/std_tools.py @@ -0,0 +1,93 @@ +from datetime import date +import datetime +from pydantic import BaseModel +import subprocess +import sys +import time +import typer +from typing import Union, Optional + +class Duration(BaseModel): + seconds: Optional[int] = None + minutes: Optional[int] = None + hours: Optional[int] = None + days: Optional[int] = None + months: Optional[int] = None + years: Optional[int] = None + + def __str__(self) -> str: + return ', '.join([ + x + for x in [ + f"{self.years} years" if self.years else None, + f"{self.months} months" if self.months else None, + f"{self.days} days" if self.days else None, + f"{self.hours} hours" if self.hours else None, + f"{self.minutes} minutes" if self.minutes else None, + f"{self.seconds} seconds" if self.seconds else None, + ] + if x is not None + ]) + + @property + def get_total_seconds(self) -> int: + return sum([ + self.seconds or 0, + (self.minutes or 0)*60, + (self.hours or 0)*3600, + (self.days or 0)*86400, + (self.months or 0)*2592000, + (self.years or 0)*31536000, + ]) + +class WaitForDuration(BaseModel): + duration: Duration + + def __call__(self): + sys.stderr.write(f"Waiting for {self.duration}...\n") + time.sleep(self.duration.get_total_seconds) + +class StandardTools: + + @staticmethod + def ask_user(question: str) -> str: + ''' + Ask the user a question and return the answer. + This allows getting additional information, requesting disambiguation, etc. + ''' + return typer.prompt(question) + + @staticmethod + def wait_for_duration(duration: Duration) -> None: + 'Wait for a certain amount of time before continuing.' + + # sys.stderr.write(f"Waiting for {duration}...\n") + time.sleep(duration.get_total_seconds) + + @staticmethod + def wait_for_date(target_date: date) -> None: + f''' + Wait until a specific date is reached before continuing. + Today's date is {datetime.date.today()} + ''' + + # Get the current date + current_date = datetime.date.today() + + if target_date < current_date: + raise ValueError("Target date cannot be in the past.") + + time_diff = datetime.datetime.combine(target_date, datetime.time.min) - datetime.datetime.combine(current_date, datetime.time.min) + + days, seconds = time_diff.days, time_diff.seconds + + # sys.stderr.write(f"Waiting for {days} days and {seconds} seconds until {target_date}...\n") + time.sleep(days * 86400 + seconds) + # sys.stderr.write(f"Reached the target date: {target_date}\n") + + @staticmethod + def say_out_loud(something: str) -> None: + """ + Just says something. 
Used to say each thought out loud + """ + subprocess.check_call(["say", something]) diff --git a/examples/agent/tools/unsafe_python_tools.py b/examples/agent/tools/unsafe_python_tools.py new file mode 100644 index 0000000000000..b187f219e989d --- /dev/null +++ b/examples/agent/tools/unsafe_python_tools.py @@ -0,0 +1,44 @@ +import json +import sys +import types +from typing import Dict, Union + +def _is_serializable(obj) -> bool: + try: + json.dumps(obj) + return True + except Exception as e: + return False + +def execute_python(source: str) -> Union[Dict, str]: + """ + Evaluate a Python program and return the globals it declared. + Can be used to compute mathematical expressions (e.g. after importing math module). + + Args: + source: contain valid, executable and pure Python code. Should also import any required Python packages. + For example: "import math\nresult = math.cos(2) * 10" + + Returns: + dict | str: A dictionary containing variables declared, or an error message if an exception occurred. + """ + try: + namespace = {} + sys.stderr.write(f"Executing Python program:\n{source}\n") + exec(source, namespace) + results = { + k: v + for k, v in namespace.items() + if not k.startswith('_') \ + and not isinstance(v, type) \ + and not isinstance(v, types.ModuleType) \ + and not callable(v) \ + and _is_serializable(v) + } + sys.stderr.write(f"Results: {json.dumps(results, indent=2)}\n") + return results + except Exception as e: + msg = f"Error: {sys.exc_info()[1]}" + sys.stderr.write(f"{msg}\n") + return msg + diff --git a/examples/agent/utils.py b/examples/agent/utils.py new file mode 100644 index 0000000000000..b381e8ef6a171 --- /dev/null +++ b/examples/agent/utils.py @@ -0,0 +1,43 @@ +from pathlib import Path +import sys +import importlib.util +from typing import Type + +def load_source_as_module(source): + i = 0 + while (module_name := f'mod_{i}') in sys.modules: + i += 1 + + spec = importlib.util.spec_from_file_location(module_name, source) + assert spec, f'Failed to load {source} as module' + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + assert spec.loader, f'{source} spec has no loader' + spec.loader.exec_module(module) + return module + +def load_module(f: str): + if f.endswith('.py'): + sys.path.insert(0, str(Path(f).parent)) + + return load_source_as_module(f) + else: + return importlib.import_module(f) + +def collect_functions(module): + for k in dir(module): + if k.startswith('_'): + continue + if k == k.capitalize(): + continue + v = getattr(module, k) + if not callable(v) or isinstance(v, type): + continue + if not hasattr(v, '__annotations__'): + continue + + vt = type(v) + if vt.__module__ == 'langchain_core.tools' and vt.__name__.endswith('Tool') and hasattr(v, 'func') and callable(v.func): + v = v.func + + yield v diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 7d889c3fe1287..d9b624f6053b1 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -25,9 +25,9 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): class BuiltinRule: - def __init__(self, content: str, deps: list = None): + def __init__(self, content: str, deps: List[str]): self.content = content - self.deps = deps or [] + self.deps = deps # whitespace is constrained to a single space char to prevent model "running away" in # whitespace. Also maybe improves generation quality? 
@@ -86,7 +86,7 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal + lambda m: GRAMMAR_LITERAL_ESCAPES[m.group(0)], literal ) return f'"{escaped}"' @@ -125,13 +125,13 @@ def _add_rule(self, name, rule): self._rules[key] = rule return key - def resolve_refs(self, schema: dict, url: str): + def resolve_refs(self, schema: Any, url: str): ''' Resolves all $ref fields in the given schema, fetching any remote schemas, replacing $ref with absolute reference URL and populating self._refs with the respective referenced (sub)schema dictionaries. ''' - def visit(n: dict): + def visit(n: Any): if isinstance(n, list): return [visit(x) for x in n] elif isinstance(n, dict): @@ -191,7 +191,7 @@ def _visit_pattern(self, pattern, name): assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"' pattern = pattern[1:-1] - sub_rule_ids = {} + sub_rule_ids: Dict[str, str] = {} i = 0 length = len(pattern) diff --git a/examples/openai/README.md b/examples/openai/README.md new file mode 100644 index 0000000000000..c8bc0130d4edc --- /dev/null +++ b/examples/openai/README.md @@ -0,0 +1,189 @@ +# examples.openai: OpenAI-compatibility layer on top of server.cpp + +New Python OpenAI API compatibility server, which calls into / spawns the C++ server under the hood: + +```bash +python -m examples.openai.server --model model.gguf +``` + +## Prerequisites + +Note: To get conda, just install Miniforge (it's OSS): https://github.com/conda-forge/miniforge + +```bash +conda create -n agent python=3.11 +conda activate agent +pip install -r examples/openai/requirements.txt +``` + +## Features + +The new [examples/openai/server.py](./server.py): + +- Supports grammar-constrained tool calling for **all** models (incl. Mixtral 7x8B) + + - Optimised support for Functionary & Nous Hermes, easy to extend to other tool-calling schemes + + - Generic support w/ JSON schema that guides the model towards tool usage (at the cost of extra tokens): + + ```ts + { + // original_thought: string, + thought_about_next_step_only: string, + next_step: {tool_calls: {name: string, arguments: any}} | {result: T} + } + // Where T is the output JSON schema, or 'any' + ``` + + - Option to publicise schemas to models as TypeScript signatures (as for Functionary) or JSON schema. + + - Supports models that require user/assistant alternance (like Mixtral Instruct) by merging system messages into user messages. + +- Spawns the C++ [llama.cpp server](../server) under the hood (unless passed `--endpoint`), but only uses its non-chat endpoint + + (depending on the prompting strategy, we weave the tool & output schema along with the chat template into the raw model grammar constraints) + +- Uses the actual Jinja2 templates stored in the GGUF models + +- Will eventually also spawn `whisper.cpp` and another server subprocess for the embeddings endpoint + +Rationale: the C++ server lacks some OpenAI compatibility features (and can't realistically keep up with prompt templates w/o bringing in too many dependencies), this new layer could allow focusing the C++ server on serving efficiency and delegate OAI compliance to a layer easier to maintain. + +## Test + +If you want to see tools in action, look at the [agent example](../agent). 
Otherwise: + +Start the server in Terminal 1: + +```bash +python -m examples.openai --model ~/AI/Models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf +``` + +Query it in Terminal 2 (or use it from any framework that makes use of tools: note tool calls are guaranteed to comply to the schema, so retries are likely not necessary!): + +```bash +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-3.5-turbo", + "tools": [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location." + } + }, + "required": ["location", "format"] + } + } + }, { + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "description": "Get an N-day weather forecast", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location." + }, + "num_days": { + "type": "integer", + "description": "The number of days to forecast" + } + }, + "required": ["location", "format", "num_days"] + } + } + }], + "messages": [ + {"role": "system", "content": "Do not make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."}, + {"role": "user", "content": "what is the weather going to be like in San Francisco and Glasgow over the next 4 days"} + ] + }' +``` + +
+Show output + +```json +{ + "id": "chatcmpl-3095057176", + "object": "chat.completion", + "created": 1711726921, + "model": "gpt-3.5-turbo", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "name": null, + "tool_call_id": null, + "content": "In order to provide the required information, I need to call the get_n_day_weather_forecast function twice, once for San Francisco and once for Glasgow.", + "tool_calls": [ + { + "id": "call_970977", + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "arguments": { + "location": "San Francisco, CA", + "format": "celsius", + "num_days": 4 + } + } + } + ] + }, + "logprobs": null, + "finish_reason": "tool_calls" + } + ], + "usage": { + "prompt_tokens": 546, + "completion_tokens": 118, + "total_tokens": 664 + }, + "system_fingerprint": "...", + "error": null +} +``` + +
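+Since this layer speaks the standard OpenAI chat-completions protocol, existing clients can be pointed at it instead of `curl`. A minimal sketch using the official `openai` Python package (not part of this example's requirements; the API key is unused, and the server answers with whichever model it was started with):
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-unused")
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "user", "content": "What is the weather going to be like in Glasgow over the next 4 days?"},
+    ],
+    tools=[{
+        "type": "function",
+        "function": {
+            "name": "get_n_day_weather_forecast",
+            "description": "Get an N-day weather forecast",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"},
+                    "format": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    "num_days": {"type": "integer"},
+                },
+                "required": ["location", "format", "num_days"],
+            },
+        },
+    }],
+)
+print(response.choices[0].message.tool_calls)
+```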
+ +## TODO + +- Embedding endpoint w/ distinct server subprocess + +- Evaluate options for session caching + + - Pass session id & store / read from file? + + - Support parent session ids for trees of thought? + + - Support precaching long prompts from CLI / read session files? + +- Follow-ups + + - Remove OAI support from server + + - Remove non-Python json-schema-to-grammar versions + + - Reach out to frameworks to advertise new option. diff --git a/examples/openai/__main__.py b/examples/openai/__main__.py new file mode 100644 index 0000000000000..601eee3c4c6a6 --- /dev/null +++ b/examples/openai/__main__.py @@ -0,0 +1,7 @@ +import typer + +from examples.openai.server import main + +if __name__ == "__main__": + typer.run(main) + diff --git a/examples/openai/api.py b/examples/openai/api.py new file mode 100644 index 0000000000000..9508592c89467 --- /dev/null +++ b/examples/openai/api.py @@ -0,0 +1,92 @@ +from abc import ABC +from typing import Any, Dict, List, Literal, Optional, Union +from pydantic import BaseModel, Json, TypeAdapter + +class FunctionCall(BaseModel): + name: str + arguments: str + # arguments: Union[Dict[str, Any], str] + +class ToolCall(BaseModel): + id: Optional[str] = None + type: Literal["function"] = "function" + function: FunctionCall + +class Message(BaseModel): + role: str + name: Optional[str] = None + tool_call_id: Optional[str] = None + content: Optional[str] + tool_calls: Optional[List[ToolCall]] = None + +class ToolFunction(BaseModel): + name: str + parameters: dict[str, Any] + description: Optional[str] = None + +class Tool(BaseModel): + type: str + function: ToolFunction + +class ResponseFormat(BaseModel): + type: Literal["json_object"] + schema: Optional[dict[str, Any]] = None # type: ignore + +class LlamaCppParams(BaseModel): + n_predict: Optional[int] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + min_p: Optional[float] = None + tfs_z: Optional[float] = None + typical_p: Optional[float] = None + temperature: float = 1.0 + dynatemp_range: Optional[float] = None + dynatemp_exponent: Optional[float] = None + repeat_last_n: Optional[int] = None + repeat_penalty: Optional[float] = None + frequency_penalty: Optional[float] = None + presense_penalty: Optional[float] = None + mirostat: Optional[bool] = None + mirostat_tau: Optional[float] = None + mirostat_eta: Optional[float] = None + penalize_nl: Optional[bool] = None + n_keep: Optional[int] = None + seed: Optional[int] = None + n_probs: Optional[int] = None + min_keep: Optional[int] = None + +class ChatCompletionRequest(LlamaCppParams): + model: str + tools: Optional[List[Tool]] = None + tool_choice: Optional[str] = None + messages: Optional[List[Message]] = None + prompt: Optional[str] = None + response_format: Optional[ResponseFormat] = None + + stream: bool = False + cache_prompt: Optional[bool] = None + +class Choice(BaseModel): + index: int + message: Message + logprobs: Optional[dict[str, Any]] = None + finish_reason: Union[Literal["stop"], Literal["tool_calls"]] + +class Usage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + +class CompletionError(BaseModel): + message: str + # code: int + +class ChatCompletionResponse(BaseModel): + id: str + object: Literal["chat.completion"] + created: int + model: str + choices: List[Choice] + usage: Usage + system_fingerprint: Optional[str] = None + error: Optional[CompletionError] = None diff --git a/examples/openai/gguf_kvs.py b/examples/openai/gguf_kvs.py new file mode 100644 index 
0000000000000..4bb24b5e75d5f --- /dev/null +++ b/examples/openai/gguf_kvs.py @@ -0,0 +1,19 @@ +from pathlib import Path +import sys + +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "gguf-py")) + +from gguf.gguf_reader import GGUFReader +from gguf.constants import Keys + +class GGUFKeyValues: + def __init__(self, model: Path): + self.reader = GGUFReader(model.as_posix()) + def __getitem__(self, key: str): + if '{arch}' in key: + key = key.replace('{arch}', self[Keys.General.ARCHITECTURE]) + return self.reader.read_field(self.reader.fields[key]) + def __contains__(self, key: str): + return key in self.reader.fields + def keys(self): + return self.reader.fields.keys() diff --git a/examples/openai/llama_cpp_server_api.py b/examples/openai/llama_cpp_server_api.py new file mode 100644 index 0000000000000..db1c860411ae8 --- /dev/null +++ b/examples/openai/llama_cpp_server_api.py @@ -0,0 +1,12 @@ +from typing import Any, Optional +from pydantic import Json + +from examples.openai.api import LlamaCppParams + +class LlamaCppServerCompletionRequest(LlamaCppParams): + prompt: str + stream: Optional[bool] = None + cache_prompt: Optional[bool] = None + + grammar: Optional[str] = None + json_schema: Optional[Json[Any]] = None diff --git a/examples/openai/prompting.py b/examples/openai/prompting.py new file mode 100644 index 0000000000000..6aef7e437060d --- /dev/null +++ b/examples/openai/prompting.py @@ -0,0 +1,717 @@ +from abc import ABC, abstractmethod +from enum import Enum +import jinja2 +import json +from pathlib import Path +import random +import re +import sys +from typing import Annotated, Any, Optional +from pydantic import BaseModel, Field, Json + +from examples.json_schema_to_grammar import SchemaConverter +from examples.openai.api import Tool, Message, FunctionCall, ToolCall +from examples.openai.gguf_kvs import GGUFKeyValues, Keys # type: ignore +from examples.openai.ts_converter import SchemaToTypeScriptConverter + +# _THOUGHT_KEY = "thought" +_THOUGHT_KEY = "thought_about_next_step_only" + +# While the API will be usable with a generic tools usage like OpenAI, +# (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models), +# each model may need specific prompting (and/or constrained output, +# especially for models not fine-tuned for tool usage / function calling). +class ToolsPromptStyle(str, Enum): + # Short prompt w/ schemas, ... output + TOOLS_SHORT = "short" + + # Longer prompt w/ schemas, ... output + TOOLS_LONG = "long" + + # Bespoke constrained output format that favours thought and reasoning + # while allowing unambiguous parsing of parallel tool calling. + TOOLS_THOUGHTFUL_STEPS = "thoughtful_steps" + + # Large prompt for https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B + # ... output + # Requires: + # - git clone https://github.com/NousResearch/Hermes-Function-Calling examples/openai/hermes_function_calling + # - Set large context length as their prompts are super long + TOOLS_HERMES_2_PRO = "tools_hermes_2_pro" + + # Seems to want to escape underscores in tool names and in the ... 
tags + TOOLS_MIXTRAL = "mixtral" + + # Short prompt w/ TypeScript definitions for https://github.com/MeetKai/functionary + # https://github.com/MeetKai/functionary/blob/main/functionary/prompt_template/prompt_template_v2.py + # Note: see this prior attempt to support Functionary: https://github.com/ggerganov/llama.cpp/pull/5695 + TYPESCRIPT_FUNCTIONARY_V2 = "functionary_v2" + +def raise_exception(msg: str): + raise Exception(msg) + +class ChatTemplate(BaseModel): + template: str + eos_token: str + bos_token: str + + inferred_tool_style: Annotated[Optional['ToolsPromptStyle'], Field(exclude=True)] = None + expects_strict_user_assistant_alternance: Annotated[Optional[bool], Field(exclude=True)] = None + formats_tool_call: Annotated[Optional[bool], Field(exclude=True)] = None + formats_tool_call_content: Annotated[Optional[bool], Field(exclude=True)] = None + formats_tool_result: Optional[bool] = None + formats_tool_name: Optional[bool] = None + + @property + def potentially_supports_parallel_calls(self) -> bool: + return bool(self.formats_tool_result and self.formats_tool_name) + + def __init__(self, template: str, eos_token: str, bos_token: str): + super().__init__(template=template, eos_token=eos_token, bos_token=bos_token) + env = jinja2.Environment(loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True) + self._template = env.from_string(template) + # print(template) + + # self.expects_strict_user_assistant_alternance = "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception" in template + + self.probe_template_capabilities() + self.extract_prefix_suffix_from_template() + + if "<|recipient|>' + tool_call['function']['name']" in template: + self.inferred_tool_style = ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2 + else: + self.inferred_tool_style = ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS + # self.inferred_tool_style = ToolsPromptStyle.TOOLS_LONG + # self.inferred_tool_style = ToolsPromptStyle.TOOLS_HERMES_2_PRO + # self.inferred_tool_style = ToolsPromptStyle.TOOLS_MIXTRAL + + def probe_template_capabilities(self): + + def succeeds(messages: list[Message], strings_to_find = ()): + try: + result = self.raw_render(messages, add_generation_prompt=True) + # print(result) + for s in strings_to_find: + if s not in result: + return False + return True + except Exception as e: + # print(e) + return False + + # if self.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + user_msg = Message(role="user", content="Hey") + assistant_msg = Message(role="assistant", content="I, Robot") + + self.expects_strict_user_assistant_alternance = not succeeds([assistant_msg, user_msg]) and succeeds([user_msg, assistant_msg]) + + thought = "Precious thought" + fn_name = "callMeMaybe" + toolcall = ToolCall(id="call_531873", type="function", function=FunctionCall(name=fn_name, arguments=json.dumps({"lol": 123}))) + toolcall_msg = Message(role="assistant", content=None, tool_calls=[toolcall]) + tool_result = "Tool result" + tool_name = "additioner" + tool_msg = Message(role="tool", name=tool_name, content=tool_result) + stringified_toolcall_msg = Message(role="assistant", content=None, tool_calls=[ToolCall(function=FunctionCall(name=fn_name, arguments=json.dumps({"lol": 123})))]) + toolcall_content_msg = Message(role="assistant", content=thought, tool_calls=toolcall_msg.tool_calls) + + self.formats_tool_call = succeeds([user_msg, toolcall_msg], (fn_name,)) + if self.formats_tool_call: + self.formats_tool_call_content = succeeds([user_msg, toolcall_content_msg], 
(thought,)) + + self.formats_tool_result = succeeds([user_msg, assistant_msg, tool_msg], (tool_result,)) + self.formats_tool_name = succeeds([user_msg, assistant_msg, tool_msg], (tool_name,)) + # assert self.formats_tools or self.expects_strict_user_assistant_alternance + + def extract_prefix_suffix_from_template(self): + + delimiter = '<%$[SAMPLE]$%>' + user_msg = Message(role="user", content="Hey") + empty_prompt = self.raw_render([user_msg], add_generation_prompt=True).strip() + planted_prompt = self.raw_render([user_msg, Message(role="assistant", content=delimiter)], add_generation_prompt=False).strip() + assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}" + [prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter) + + # sys.stderr.write(f"\n# prefix={prefix}\n# suffix={suffix}\n\n") + + self._prefix = prefix + self._suffix = suffix + + + def strip_suffix(self, s: str) -> str: + if s.endswith(self._suffix): + return s[:-len(self._suffix)] + else: + sys.stderr.write(f"Expected suffix ({self._suffix}) not found: {s}\n") + return s + + @staticmethod + def from_gguf(metadata: GGUFKeyValues): + if Keys.Tokenizer.CHAT_TEMPLATE not in metadata: + raise NotImplementedError(f'Only supporting models with {Keys.Tokenizer.CHAT_TEMPLATE} entry in their GGUF key-values (TODO: add default template, maybe pick llama2\'s?)') + + tokens = metadata[Keys.Tokenizer.LIST] + return ChatTemplate( + template = metadata[Keys.Tokenizer.CHAT_TEMPLATE], + bos_token = tokens[metadata[Keys.Tokenizer.BOS_ID]], + eos_token = tokens[metadata[Keys.Tokenizer.EOS_ID]]) + + @staticmethod + def from_huggingface(model_id: str): + from transformers import LlamaTokenizer # type: ignore + tokenizer = LlamaTokenizer.from_pretrained(model_id) + return ChatTemplate( + template = tokenizer.chat_template or tokenizer.default_chat_template, + bos_token = tokenizer.bos_token, + eos_token = tokenizer.eos_token) + + def raw_render(self, messages: list[Message], add_generation_prompt: bool, omit_bos: bool = False): + result = self._template.render( + messages=[messages.model_dump() for messages in messages], + eos_token=self.eos_token, + bos_token='' if omit_bos else self.bos_token, + raise_exception=raise_exception, + add_generation_prompt=add_generation_prompt, + ) + return result + +class ChatHandlerArgs(BaseModel): + chat_template: ChatTemplate + response_schema: Optional[dict[str,Any]] = None + tools: Optional[list[Tool]] = None + +class ChatHandler(ABC): + def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle]): + self.args = args + self.style = style + self.output_format_prompt: Optional[Message] = None + self.grammar: Optional[str] = None + + @abstractmethod + def parse(self, s: str) -> Optional[Message]: + raise NotImplementedError() + + + def add_system_prompt(self, messages: list[Message], system_prompt: Message) -> list[Message]: + assert system_prompt.role == "system" + # TODO: add to last system message, or create a new one just before the last user message + system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None) + if system_message: + (i, m) = system_message + return messages[:i] + [Message(role="system", content=(system_prompt.content + '\n' if system_prompt.content else '') + (m.content or ''))] + messages[i+1:] + else: + return [system_prompt] + messages + + def render_prompt(self, messages: list[Message]) -> str: + + if self.output_format_prompt: + messages = 
self.add_system_prompt(messages, self.output_format_prompt) + + def normalize(m: Message): + if self.style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS and m.role == "assistant": + if m.tool_calls: + m = Message( + role=m.role, + content=json.dumps({ + _THOUGHT_KEY: m.content or '', + "next_step": { + "tool_calls": [tc.model_dump() for tc in m.tool_calls] + } + }, indent=2) + ) + else: + m = Message( + role=m.role, + content=json.dumps({ + _THOUGHT_KEY: '', + "next_step": { + "result": m.content + } + }, indent=2) + ) + # Fall through to benefit from role normalization + + if m.tool_calls: + if not self.args.chat_template.formats_tool_call or not self.args.chat_template.formats_tool_call_content: + return Message( + role=m.role, + content='\n'.join([ + *([m.content] if m.content else ()), + *([ + f'{json.dumps(tc.model_dump())}' + for tc in m.tool_calls + ]) + ]) + ) + else: + return m + elif self.args.chat_template.expects_strict_user_assistant_alternance and m.role not in ('user', 'assistant'): + if m.role == "system": + return Message(role="user", content=f'[SYS]{m.content}[/SYS]') + elif m.role == "tool": + return Message(role="user", content=f'[TOOL(name={m.name}, id={m.tool_call_id})]{m.content}[/TOOL]') + else: + sys.stderr.write(f'Unexpected message role: {message.role}\n') + return Message(role="user", content=f'[{m.role.upper()}]{m.content}[/{m.role.upper()}]') + else: + return m + + messages=[normalize(m) for m in messages] + + if self.args.chat_template.expects_strict_user_assistant_alternance: + new_messages=[] + current_role = 'user' + current_content: list[str] = [] + + def flush(): + nonlocal current_content + nonlocal current_role + + if self.args.chat_template.expects_strict_user_assistant_alternance or current_content: + new_messages.append(Message( + role=current_role, + content='\n'.join(current_content) + )) + current_content = [] + + for i, message in enumerate(messages): + assert message.role in ('user', 'assistant') + + if message.role == current_role: + if message.content: + current_content.append(message.content) + else: + flush() + current_role = 'assistant' if current_role == 'user' else 'user' + if message.content: + current_content.append(message.content) + if current_content: + flush() + messages = new_messages + + # JSON! 
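+        # At this point the messages have been normalized for the target template:
+        # tool calls and tool results are folded into plain text when the template
+        # cannot render them natively, and (for templates that require strict
+        # user/assistant alternation) consecutive same-role turns have been merged.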
+ # messages = [m.model_dump() for m in messages] + + return self.args.chat_template.raw_render( + messages=messages, + add_generation_prompt=True, + ) + +class NoToolsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs): + super().__init__(args, None) + assert not args.tools + + if args.response_schema: + self.output_format_prompt = Message( + role="system", + content=_please_respond_with_schema(args.response_schema) + ) + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + schema = converter.resolve_refs(args.response_schema, 'response') + converter.visit(schema, '') + self.grammar = converter.format_grammar() + else: + self.output_format_prompt = None + self.grammar = None + + def parse(self, s: str) -> Optional[Message]: + return Message(role="assistant", content=s) + +class ToolCallTagsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs, style: Optional[ToolsPromptStyle], escapes_underscores: bool, parallel_calls: bool): + super().__init__(args, style) + + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + tool_rules = [] + for tool in self.args.tools or []: + + parameters_schema = tool.function.parameters + parameters_schema = converter.resolve_refs(parameters_schema, tool.function.name) + + tool_rules.append(converter.visit( + dict( + type="object", + properties=dict( + name=dict(type="string", pattern='^' + tool.function.name.replace('_', f'\\?_') + '$') if escapes_underscores \ + else dict(const=tool.function.name), + arguments=parameters_schema, + ), + required=['name', 'arguments'] + ), + f'{tool.function.name}-tool-call' + )) + + def format_literal(s: str) -> str: + if escapes_underscores: + return ' "\\\\"? "_" '.join((converter._format_literal(part) for part in s.split('_'))) + else: + return converter._format_literal(s) + + tool_call_rule = converter._add_rule( + 'tool_call', + format_literal("") + " space (" + + ' | '.join(tool_rules) + + ") space " + format_literal(""))# + ' space') + + # Ideally we'd want a negative lookahead of //, but it's just too hard to express in GBNF for now. + # So we just over-constrain the content rule to not contain literals dangerously getting close to + content_rule = converter._add_rule('content', '[^<] | "<" [^t<] | "')) + converter._add_rule( + 'root', + # tool_call_rule) + f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' 
if parallel_calls \ + else f'{content_rule}* {tool_call_rule}?') + self.grammar = converter.format_grammar() + + def parse(self, s: str) -> Optional[Message]: + s = self.args.chat_template.strip_suffix(s) + + if r'' in s: + # Some weird escaping of underscores is happening w/ Mixtral 8x7B Instruct + s = s.replace(r'\_', '_') + + parts = _tool_call_re.split(s) + if len(parts) == 1: + return Message(role="assistant", content=s) + else: + content: list[str] = [] + tool_calls = [] + for i, part in enumerate(parts): + if i % 2 == 0: + content.append(part) + else: + try: + fc = json.loads(part) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse tool call as JSON: {part}\nFull string: {s}') + tool_calls.append( + ToolCall( + id=gen_callid(), + function=FunctionCall( + name=fc["name"], + arguments=json.dumps(fc["arguments"])))) + + content_str = '\n'.join(content).strip() + return Message(role="assistant", content=content_str if content_str else None, tool_calls=tool_calls) + + +class TemplatedToolsChatHandler(ToolCallTagsChatHandler): + def __init__(self, args: ChatHandlerArgs, template: str, parallel_calls: bool, escapes_underscores: bool = False, style: Optional[ToolsPromptStyle] = None): + super().__init__(args, style=style, escapes_underscores=escapes_underscores, parallel_calls=parallel_calls) + assert '{tools}' in template, 'Template must contain "{tools}"' + + self.output_format_prompt = Message( + role="system", + content=template.replace( + '{tools}', + '\n'.join(json.dumps(tool.model_dump(), indent=2) for tool in (self.args.tools or [])), + ) + ) + +class Hermes2ProToolsChatHandler(ToolCallTagsChatHandler): + def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): + super().__init__(args, style=ToolsPromptStyle.TOOLS_HERMES_2_PRO, escapes_underscores=False, parallel_calls=parallel_calls) + + # Hackily import https://github.com/NousResearch/Hermes-Function-Calling + path = str(Path(__file__).parent / "hermes_function_calling") + if path not in sys.path: sys.path.insert(0, path) + try: + from examples.openai.hermes_function_calling.prompter import PromptManager # type: ignore + except ImportError: + raise ImportError(f"Please `git clone https://github.com/NousResearch/Hermes-Function-Calling {path}`") + + prompt = PromptManager().generate_prompt(user_prompt=[], tools=[tool.model_dump_json() for tool in args.tools or []]) + assert len(prompt) == 1 and prompt[0]["role"] == "system" + self.output_format_prompt = Message(**prompt[0]) + +class FunctionaryToolsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): + super().__init__(args, ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) + + self.output_format_prompt = Message( + role="system", + content= '// Supported function definitions that should be called when necessary.\n' + + _tools_typescript_signatures(args.tools or []) + ) + + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + tool_rules = [ + converter._add_rule( + tool.function.name + '-call', + converter._format_literal(tool.function.name) + ' ' + converter._format_literal('\n<|content|>\n') + ' ' + + converter.visit(tool.function.parameters, tool.function.name + '-args') + ' ' + + converter._format_literal('\n')) + for i, tool in enumerate(self.args.tools or []) + ] + + not_from_rule = converter._add_rule('not_from', converter.not_literal("<|from|>")) + content_without_start_rule = converter._add_rule( + 'content_without_start', + converter._format_literal("all\n<|content|>") + 
' ' + not_from_rule + '*') + start_rule = converter._add_rule('start', converter._format_literal('<|from|>assistant\n<|recipient|>')) + content_rule = converter._add_rule('content', start_rule + ' ' + content_without_start_rule) + tool_call_without_start_rule = converter._add_rule( + 'tool_call_without_start', + ' | '.join(tool_rules)) + tool_call_rule = converter._add_rule('tool_call', f'{start_rule} {tool_call_without_start_rule}') + converter._add_rule( + 'root', + f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | ' + f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if parallel_calls \ + else f'{content_without_start_rule} {tool_call_rule}? | {tool_call_without_start_rule}') + + self.grammar = converter.format_grammar() + + def parse(self, s: str) -> Optional[Message]: + s = self.args.chat_template.strip_suffix(s) + + parts = _recipient_content_re.split(s) + if len(parts) == 1: + return Message(role="assistant", content=s) + else: + text_content = [] + tool_calls: list[ToolCall] = [] + for i in range((len(parts) - 1) // 3): + assert parts[i * 3].strip() == '', f'Unexpected content before tool call: {parts[i * 3]}' + recipient = parts[i * 3 + 1].strip() + content = parts[i * 3 + 2] + if recipient == 'all': + text_content.append(content) + else: + try: + arguments = json.loads(content) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse tool call content as JSON: {content}') + tool_calls.append( + ToolCall( + id=gen_callid(), + function=FunctionCall(name=recipient, arguments=arguments))) + + + assert parts[-1].strip() in ('', '<|stop|>'), f'Unexpected content after tool calls: {parts[-1]}\nFull string: {s}' + + content = '\n'.join(text_content).strip() + return Message(role="assistant", content=content if content else None, tool_calls=tool_calls if tool_calls else None) + +def _make_bespoke_schema(response_schema, tool_call_schema, parallel_calls): + return { + "type": "object", + "properties": { + # "original_goal": {"title": "Original Goal", "type": "string"}, + _THOUGHT_KEY: { + "title": "Thought about next step", + # "title": "Thought about how the next step brings us closer to achieving the original goal", + "type": "string" + }, + "next_step": { + "title": "Next Step: either a result or one or more tool calls to achieve the original goal", + "oneOf": [ + { + # "title": "Tool Calls", + "properties": { + # "type": { + # "const": "tool_calls" + # }, + "tool_calls": { + "prefixItems": tool_call_schema if parallel_calls \ + else [tool_call_schema], + } + }, + "required": ["tool_calls"] + }, + { + "title": "Result (achieving original goal)", + "properties": { + "result": response_schema, + }, + "required": ["result"] + }, + ] + }, + }, + "required": ["original_goal", _THOUGHT_KEY, "next_step"] + # "required": ["next_step"] + } + +class ThoughtfulStepsToolsChatHandler(ChatHandler): + def __init__(self, args: ChatHandlerArgs, parallel_calls: bool): + super().__init__(args, ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS) + + # args.response_schema = args.response_schema or {} + converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False) + + response_schema = converter.resolve_refs(args.response_schema or {"type": "string"}, 'response') + tool_parameter_schemas = { + tool.function.name: converter.resolve_refs(tool.function.parameters, tool.function.name) + for tool in self.args.tools or [] + } + # sys.stderr.write(f"# RESOLVED RESPONSE SCHEMA: {json.dumps(response_schema, indent=2)}\n") + # 
sys.stderr.write(f"# RESOLVED TOOL PARAMETER SCHEMA: {json.dumps(tool_parameter_schemas, indent=2)}\n") + converter.visit( + _make_bespoke_schema( + response_schema, + { + "oneOf": [ + { + "type": "object", + "properties": { + "name": {"const": tool_name}, + "arguments": tool_parameters, + }, + "required": ["name", "arguments"] + } + for tool_name, tool_parameters in tool_parameter_schemas.items() + ] + }, + parallel_calls=parallel_calls, + ), + '', + ) + self.grammar = converter.format_grammar() + + self.output_format_prompt = Message( + role="system", + content='\n'.join([ + 'You are a function calling AI model.', + 'Here are the tools available:', + _tools_schema_signatures(self.args.tools or [], indent=2), + # _tools_typescript_signatures(self.args.tools), + _please_respond_with_schema( + _make_bespoke_schema( + response_schema, + { + "properties": { + "name": { + "title": "Name of the tool to call", + "type": "string" + }, + "arguments": { + "title": "Arguments to pass to the tool", + "type": "object" + } + }, + "required": ["name", "arguments"] + }, + parallel_calls=parallel_calls, + ) + ), + ]) + ) + + def parse(self, s: str) -> Optional[Message]: + s = self.args.chat_template.strip_suffix(s) + try: + data = json.loads(s) + except json.JSONDecodeError: + raise ValueError(f'Failed to parse data as JSON: {s}') + + next_step = data['next_step'] + if 'result' in next_step: + return Message(role="assistant", content=json.dumps(next_step['result'])) + elif 'tool_calls' in next_step: + return Message( + role="assistant", + content=data.get(_THOUGHT_KEY), + tool_calls=[ + ToolCall( + id=gen_callid(), + function=FunctionCall( + name=tc["name"], + arguments=json.dumps(tc["arguments"]))) + for tc in next_step['tool_calls'] + ] + ) + else: + raise ValueError(f'Unexpected data: {data}') + +_SHORT_TEMPLATE='\n'.join([ + 'Here are the tools available:', + '', + '{tools}', + '', +]) + +_LONG_TEMPLATE='\n'.join([ + # '''You are a function calling AI model. You are provided with function signatures within XML tags.''', + # 'You may call one or more functions to assist with the user query. Don\'t make assumptions about what values to plug into functions. Here are the available tools:', + 'Call one or more functions to assist with the user query, every time this is possible. Don\'t make assumptions about what values to plug into functions. Here are the available tools:', + '', + '{tools}', + '', + '', + # 'Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}', + # '', + # 'For each function call return a json object with function name and arguments within XML tags as follows:', + 'To call each function, give its name and arguments within XML tags as follows:', + '', + '{"name": , "arguments": }', + '', + # 'This is not hypothetical, you're not asked what you would do. 
If you need a tool called, just call it with ....''', +]) + +def get_chat_handler(args: ChatHandlerArgs, parallel_calls: bool, tool_style: Optional[ToolsPromptStyle] = None, verbose=False) -> ChatHandler: + tool_style = tool_style if tool_style is not None else args.chat_template.inferred_tool_style + + if parallel_calls and not args.chat_template.potentially_supports_parallel_calls: + sys.stderr.write(f"# WARNING: Disabled parallel_calls as model does not seem to support it (will fall back to sequential calls)\n") + parallel_calls = False + + if verbose: + sys.stderr.write(f"# Using tool style: {tool_style}\n") + + if not args.tools: + return NoToolsChatHandler(args) + + elif tool_style == ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS: + return ThoughtfulStepsToolsChatHandler(args, parallel_calls=parallel_calls) + + elif tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2: + return FunctionaryToolsChatHandler(args, parallel_calls=parallel_calls) + + elif tool_style == ToolsPromptStyle.TOOLS_SHORT: + return TemplatedToolsChatHandler(args, _SHORT_TEMPLATE, parallel_calls=parallel_calls) + + elif tool_style == ToolsPromptStyle.TOOLS_LONG: + return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls) + + elif tool_style == ToolsPromptStyle.TOOLS_MIXTRAL: + return TemplatedToolsChatHandler(args, _LONG_TEMPLATE, parallel_calls=parallel_calls, escapes_underscores=True) + + elif tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO: + return Hermes2ProToolsChatHandler(args, parallel_calls=parallel_calls) + else: + raise ValueError(f"Unsupported tool call style: {tool_style}") + +# os.environ.get('NO_TS') +def _please_respond_with_schema(schema: dict[str, Any]) -> str: + sig = json.dumps(schema, indent=2) + # _ts_converter = SchemaToTypeScriptConverter() + # # _ts_converter.resolve_refs(schema, 'schema') + # sig = _ts_converter.visit(schema) + return f'Please respond in JSON format with the following schema: {sig}' + +def _tools_typescript_signatures(tools: list[Tool]) -> str: + _ts_converter = SchemaToTypeScriptConverter() + # for tool in tools: + # _ts_converter.resolve_refs(tool.function.parameters, tool.function.name) + + return 'namespace functions {\n' + '\n'.join( + '// ' + (tool.function.description or '').replace('\n', '\n// ') + '\n' + '' + 'type ' + tool.function.name + ' = (_: ' + _ts_converter.visit(tool.function.parameters) + ") => any;\n" + for tool in tools + ) + '} // namespace functions' + +def _tools_schema_signatures(tools: list[Tool], indent=None) -> str: + return '\n'.join( + json.dumps(tool.model_dump(), indent=indent) + for tool in tools + ) + +_tool_call_re = re.compile( + '(.*?)', re.DOTALL) +_recipient_content_re = re.compile(r'(?:(?:<\|(?:stop|from)\|>)+ *assistant\n<\|recipient\|>|^) *([^ <|>\n]+) *\n<\|content\|>(.*?)(?:$|<\|stop\|>\s*$|(?=(?:<\|(?:stop|from)\|>)+ *assistant\n))', re.DOTALL) + +def gen_callid(): + return f'call_{random.randint(0, 1000000)}' diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt new file mode 100644 index 0000000000000..fad994e0d9f09 --- /dev/null +++ b/examples/openai/requirements.txt @@ -0,0 +1,7 @@ +fastapi[all] +# gguf +jinja2 +pydantic +sse-starlette +uvicorn[all] +typer[all] diff --git a/examples/openai/server.py b/examples/openai/server.py new file mode 100644 index 0000000000000..21903f81272e3 --- /dev/null +++ b/examples/openai/server.py @@ -0,0 +1,175 @@ +import json, sys +from pathlib import Path +import time + +from examples.openai.llama_cpp_server_api import 
LlamaCppServerCompletionRequest +from examples.openai.gguf_kvs import GGUFKeyValues, Keys # type: ignore +from examples.openai.api import ChatCompletionResponse, Choice, ChatCompletionRequest, Usage +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler + +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse +import httpx +import random +from starlette.responses import StreamingResponse +from typing import Annotated, Optional +import typer + +from examples.openai.subprocesses import spawn_subprocess + +def generate_id(prefix): + return f"{prefix}{random.randint(0, 1 << 32)}" + +def main( + model: Annotated[str, typer.Option("--model", "-m")] = "models/7B/ggml-model-f16.gguf", + template_hf_model_id_fallback: Annotated[Optional[str], typer.Option(help="If the GGUF model does not contain a chat template, get it from this HuggingFace tokenizer")] = 'meta-llama/Llama-2-7b-chat-hf', + # model_url: Annotated[Optional[str], typer.Option("--model-url", "-mu")] = None, + host: str = "localhost", + port: int = 8080, + parallel_calls: bool = False, + style: Optional[ToolsPromptStyle] = None, + auth: Optional[str] = None, + verbose: bool = False, + context_length: Optional[int] = None, + endpoint: Optional[str] = None, + server_host: str = "localhost", + server_port: Optional[int] = 8081, +): + import uvicorn + + chat_template = None + if model: + metadata = GGUFKeyValues(Path(model)) + + if not context_length: + context_length = metadata[Keys.LLM.CONTEXT_LENGTH] + + if Keys.Tokenizer.CHAT_TEMPLATE in metadata: + chat_template = ChatTemplate.from_gguf(metadata) + else: + sys.stderr.write(f"# WARNING: Model does not contain a chat template, fetching it from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") + assert template_hf_model_id_fallback, "template_hf_model_id_fallback is required when the model does not contain a chat template" + chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) + + if verbose: + sys.stderr.write(f"# CHAT TEMPLATE:\n\n{chat_template}\n\n") + + if not chat_template: + sys.stderr.write(f"# WARNING: Unsure which model we're talking to, fetching its chat template from HuggingFace tokenizer of {template_hf_model_id_fallback}\n") + assert template_hf_model_id_fallback or chat_template, "template_hf_model_id_fallback is required when using an endpoint without a model" + chat_template = ChatTemplate.from_huggingface(template_hf_model_id_fallback) + + if not endpoint: + if verbose: + sys.stderr.write(f"# Starting C++ server with model {model} on {server_host}:{server_port}\n") + cmd = [ + "./server", "-m", model, + "--host", server_host, "--port", f'{server_port}', + # TODO: pass these from JSON / BaseSettings? 
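+                # '-ctk' / '-ctv' select the KV-cache types (q4_0 keys, f16 values) and
+                # '-c' sets the context size (CLI override or the GGUF context length).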
+ '-ctk', 'q4_0', '-ctv', 'f16', + "-c", f"{context_length}", + *([] if verbose else ["--log-disable"]), + ] + + spawn_subprocess(cmd) + endpoint = f"http://{server_host}:{server_port}" + + app = FastAPI() + + @app.post("/v1/chat/completions") + async def chat_completions(request: Request, chat_request: ChatCompletionRequest): + headers = { + "Content-Type": "application/json", + } + if (auth_value := request.headers.get("Authorization", auth)): + headers["Authorization"] = auth_value + + if chat_request.response_format is not None: + assert chat_request.response_format.type == "json_object", f"Unsupported response format: {chat_request.response_format.type}" + response_schema = chat_request.response_format.schema or {} + else: + response_schema = None + + chat_handler = get_chat_handler( + ChatHandlerArgs(chat_template=chat_template, response_schema=response_schema, tools=chat_request.tools), + parallel_calls=parallel_calls, + tool_style=style, + verbose=verbose, + ) + + prompt = chat_handler.render_prompt(chat_request.messages) if chat_request.messages else chat_request.prompt + assert prompt is not None, "One of prompt or messages field is required" + + if verbose: + sys.stderr.write(f'\n# REQUEST:\n\n{chat_request.model_dump_json(indent=2)}\n\n') + # sys.stderr.write(f'\n# MESSAGES:\n\n{TypeAdapter(list[Message]).dump_json(messages)}\n\n') + sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n') + sys.stderr.write(f'\n# GRAMMAR:\n\n{chat_handler.grammar}\n\n') + + data = LlamaCppServerCompletionRequest( + **{ + k: v + for k, v in chat_request.model_dump().items() + if k not in ( + "prompt", + "tools", + "messages", + "response_format", + ) + }, + prompt=prompt, + grammar=chat_handler.grammar, + ).model_dump() + # sys.stderr.write(json.dumps(data, indent=2) + "\n") + + async with httpx.AsyncClient() as client: + response = await client.post( + f'{endpoint}/completions', + json=data, + headers=headers, + timeout=None) + + if chat_request.stream: + # TODO: Remove suffix from streamed response using partial parser. + assert not chat_request.tools and not chat_request.response_format, "Streaming not supported yet with tools or response_format" + return StreamingResponse(generate_chunks(response), media_type="text/event-stream") + else: + result = response.json() + if verbose: + sys.stderr.write("# RESULT:\n\n" + json.dumps(result, indent=2) + "\n\n") + if 'content' not in result: + # print(json.dumps(result, indent=2)) + return JSONResponse(result) + + # print(json.dumps(result.get('content'), indent=2)) + message = chat_handler.parse(result["content"]) + assert message is not None, f"Failed to parse response:\n{response.text}\n\n" + + prompt_tokens=result['timings']['prompt_n'] + completion_tokens=result['timings']['predicted_n'] + return JSONResponse(ChatCompletionResponse( + id=generate_id('chatcmpl-'), + object="chat.completion", + created=int(time.time()), + model=chat_request.model, + choices=[Choice( + index=0, + message=message, + finish_reason="stop" if message.tool_calls is None else "tool_calls", + )], + usage=Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + system_fingerprint='...' 
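+                # placeholder: no fingerprint is obtained from the llama.cpp backend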
+ ).model_dump()) + + async def generate_chunks(response): + async for chunk in response.aiter_bytes(): + yield chunk + + uvicorn.run(app, host=host, port=port) + +if __name__ == "__main__": + typer.run(main) diff --git a/examples/openai/subprocesses.py b/examples/openai/subprocesses.py new file mode 100644 index 0000000000000..33ee8a50715eb --- /dev/null +++ b/examples/openai/subprocesses.py @@ -0,0 +1,30 @@ + +import atexit +import os +import signal +import subprocess +import sys + + +def _cleanup_process(p): + pid = p.pid + + if sys.platform == 'win32': + os.system(f'taskkill /PID {pid} /T /F') + else: + pgid = os.getpgid(pid) + os.killpg(pgid, signal.SIGTERM) + + p.wait() + if p.poll() is None: + os.killpg(pgid, signal.SIGKILL) + +def spawn_subprocess(cmd, **kwargs): + server_process = subprocess.Popen( + cmd, + stdout=sys.stderr, + start_new_session=True, + **kwargs + ) + atexit.register(_cleanup_process, server_process) + return server_process diff --git a/examples/openai/test_chat_handlers.md b/examples/openai/test_chat_handlers.md new file mode 100644 index 0000000000000..ffbf700103dcb --- /dev/null +++ b/examples/openai/test_chat_handlers.md @@ -0,0 +1,1141 @@ + +Messages: + +```js +[ + { + "role": "user", + "name": null, + "tool_call_id": null, + "content": "Add two numbers for the purpose of this test.", + "tool_calls": null + }, + { + "role": "assistant", + "name": null, + "tool_call_id": null, + "content": "I've thought a lot about this.", + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + }, + { + "role": "tool", + "name": "superSecretTool", + "tool_call_id": "call_531873", + "content": "32222002938", + "tool_calls": null + }, + { + "role": "assistant", + "name": null, + "tool_call_id": null, + "content": "The sum of 2535 and 32222000403 is 42.", + "tool_calls": null + } +] +``` + + +# mistral_instruct_v0_1 + + +Template: + +```js +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} +``` + + +## mistral_instruct_v0_1 / TOOLS_THOUGHTFUL_STEPS + + +### mistral_instruct_v0_1 / TOOLS_THOUGHTFUL_STEPS / with tools + + +Prompt: + +```js +[INST] [SYS]You are a function calling AI model. +Here are the tools available: +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +Please respond in JSON format with the following schema: { +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} +}[/SYS] +Add two numbers for the purpose of this test. 
[/INST]{ + "thought_about_next_step_only": "I've thought a lot about this.", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]{ + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +} +``` + + +Output format prompt: + +```json +You are a function calling AI model. +Here are the tools available: +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +Please respond in JSON format with the following schema: { +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +next-step ::= next-step-0 | next-step-1 +next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space +next-step-0-tool-calls ::= "[" space next-step-0-tool-calls-tuple-0 "]" space +next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-0-tool-calls-tuple-0 ::= next-step-0-tool-calls-tuple-0-0 | next-step-0-tool-calls-tuple-0-1 +next-step-0-tool-calls-tuple-0-0 ::= "{" space next-step-0-tool-calls-tuple-0-0-name-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments ::= "{" space next-step-0-tool-calls-tuple-0-0-arguments-a-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-b-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-0-arguments +next-step-0-tool-calls-tuple-0-0-name ::= "\"superSecretTool\"" +next-step-0-tool-calls-tuple-0-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-0-name +next-step-0-tool-calls-tuple-0-1 ::= "{" space next-step-0-tool-calls-tuple-0-1-name-kv "," space next-step-0-tool-calls-tuple-0-1-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments ::= "{" space next-step-0-tool-calls-tuple-0-1-arguments-text-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-1-arguments +next-step-0-tool-calls-tuple-0-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-tuple-0-1-name ::= "\"say\"" +next-step-0-tool-calls-tuple-0-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-1-name +next-step-1 ::= "{" space next-step-1-result-kv "}" space +next-step-1-result-kv ::= "\"result\"" space ":" space integer +next-step-kv ::= "\"next_step\"" space ":" space next-step +root ::= "{" space thought-about-next-step-only-kv "," space next-step-kv "}" space +space ::= " "? 
+string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string +``` + + +### mistral_instruct_v0_1 / TOOLS_THOUGHTFUL_STEPS / without tools + + +Prompt: + +```js +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] +Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +``` + + +Output format prompt: + +```json +Please respond in JSON format with the following schema: number +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## mistral_instruct_v0_1 / TOOLS_MIXTRAL + + +### mistral_instruct_v0_1 / TOOLS_MIXTRAL / with tools + + +Prompt: + +```js +[INST] [SYS]Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } +[/SYS] +Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +``` + + +Output format prompt: + +```json +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. 
Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } + +``` + + +Grammar: + +```js +content ::= [^<] | "<" [^t<] | "" space (superSecretTool-tool-call | say-tool-call) space "" +``` + + +### mistral_instruct_v0_1 / TOOLS_MIXTRAL / without tools + + +Prompt: + +```js +[INST] [SYS]Please respond in JSON format with the following schema: number[/SYS] +Add two numbers for the purpose of this test. [/INST]I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}[INST] [TOOL(name=superSecretTool, id=call_531873)]32222002938[/TOOL] [/INST]The sum of 2535 and 32222000403 is 42. +``` + + +Output format prompt: + +```json +Please respond in JSON format with the following schema: number +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +# functionary_v2_2 + + +**Might Support Parallel Tool Calls** + + +Template: + +```js +{#v2.2#} +{% for message in messages %} +{% if message['role'] == 'user' or message['role'] == 'system' %} +{{ '<|from|>' + message['role'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% elif message['role'] == 'tool' %} +{{ '<|from|>' + message['name'] + ' +<|recipient|>all +<|content|>' + message['content'] + ' +' }}{% else %} +{% set contain_content='no'%} +{% if message['content'] is not none %} +{{ '<|from|>assistant +<|recipient|>all +<|content|>' + message['content'] }}{% set contain_content='yes'%} +{% endif %} +{% if 'tool_calls' in message and message['tool_calls'] is not none %} +{% for tool_call in message['tool_calls'] %} +{% set prompt='<|from|>assistant +<|recipient|>' + tool_call['function']['name'] + ' +<|content|>' + tool_call['function']['arguments'] %} +{% if loop.index == 1 and contain_content == "no" %} +{{ prompt }}{% else %} +{{ ' +' + prompt}}{% endif %} +{% endfor %} +{% endif %} +{{ '<|stop|> +' }}{% endif %} +{% endfor %} +{% if add_generation_prompt %}{{ '<|from|>assistant +<|recipient|>' }}{% endif %} +``` + + +## functionary_v2_2 / TYPESCRIPT_FUNCTIONARY_V2 + + +### functionary_v2_2 / TYPESCRIPT_FUNCTIONARY_V2 / with tools + + +Prompt: + +```js +<|from|>system +<|recipient|>all +<|content|>// Supported function definitions that should be called when necessary. +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +<|from|>user +<|recipient|>all +<|content|>Add two numbers for the purpose of this test. +<|from|>assistant +<|recipient|>all +<|content|>I've thought a lot about this. 
+{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|from|>superSecretTool +<|recipient|>all +<|content|>32222002938 +<|from|>assistant +<|recipient|>all +<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> +<|from|>assistant +<|recipient|> +``` + + +Output format prompt: + +```json +// Supported function definitions that should be called when necessary. +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +``` + + +Grammar: + +```js +content ::= start content-without-start +content-without-start ::= "all\n<|content|>" not-from* +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +not-from ::= ([^<] | "<" ([^|] | "|" ([^f] | "f" ([^r] | "r" ([^o] | "o" ([^m] | "m" ([^|] | "|" ([^>])?)?)?)?)?)?)?) +root ::= content-without-start content* (tool-call+ content*)? | tool-call-without-start tool-call* content* +say-args ::= "{" space say-args-text-kv "}" space +say-args-text-kv ::= "\"text\"" space ":" space string +say-call ::= "say" "\n<|content|>\n" say-args "\n" +space ::= " "? +start ::= "<|from|>assistant\n<|recipient|>" +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +superSecretTool-args ::= "{" space superSecretTool-args-a-kv "," space superSecretTool-args-b-kv "}" space +superSecretTool-args-a-kv ::= "\"a\"" space ":" space integer +superSecretTool-args-b-kv ::= "\"b\"" space ":" space integer +superSecretTool-call ::= "superSecretTool" "\n<|content|>\n" superSecretTool-args "\n" +tool-call ::= start tool-call-without-start +tool-call-without-start ::= superSecretTool-call | say-call +``` + + +### functionary_v2_2 / TYPESCRIPT_FUNCTIONARY_V2 / without tools + + +Prompt: + +```js +<|from|>system +<|recipient|>all +<|content|>Please respond in JSON format with the following schema: number +<|from|>user +<|recipient|>all +<|content|>Add two numbers for the purpose of this test. +<|from|>assistant +<|recipient|>all +<|content|>I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|stop|> +<|from|>superSecretTool +<|recipient|>all +<|content|>32222002938 +<|from|>assistant +<|recipient|>all +<|content|>The sum of 2535 and 32222000403 is 42.<|stop|> +<|from|>assistant +<|recipient|> +``` + + +Output format prompt: + +```json +Please respond in JSON format with the following schema: number +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? 
+``` + + +# hermes_2_pro_mistral + + +Template: + +```js +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} +``` + + +## hermes_2_pro_mistral / TOOLS_SHORT + + +### hermes_2_pro_mistral / TOOLS_SHORT / with tools + + +Prompt: + +```js +<|im_start|>system +Here are the tools available: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} +<|im_end|> +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + +Output format prompt: + +```json +Here are the tools available: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + +``` + + +Grammar: + +```js +content ::= [^<] | "<" [^t<] | "" +``` + + +### hermes_2_pro_mistral / TOOLS_SHORT / without tools + + +Prompt: + +```js +<|im_start|>system +Please respond in JSON format with the following schema: number<|im_end|> +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + +Output format prompt: + +```json +Please respond in JSON format with the following schema: number +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## hermes_2_pro_mistral / TOOLS_LONG + + +### hermes_2_pro_mistral / TOOLS_LONG / with tools + + +Prompt: + +```js +<|im_start|>system +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. 
Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } +<|im_end|> +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + +Output format prompt: + +```json +Call one or more functions to assist with the user query, every time this is possible. Don't make assumptions about what values to plug into functions. Here are the available tools: + +{ + "type": "function", + "function": { + "name": "superSecretTool", + "description": "Adds two numbers", + "parameters": { + "properties": { + "a": { + "type": "integer" + }, + "b": { + "type": "integer" + } + }, + "required": [ + "a", + "b" + ] + } + } +} +{ + "type": "function", + "function": { + "name": "say", + "description": "Says something out loud (TTS)", + "parameters": { + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + } + }, + "required": [ + "text" + ] + } + } +} + + +To call each function, give its name and arguments within XML tags as follows: + +{"name": , "arguments": } + +``` + + +Grammar: + +```js +content ::= [^<] | "<" [^t<] | "" +``` + + +### hermes_2_pro_mistral / TOOLS_LONG / without tools + + +Prompt: + +```js +<|im_start|>system +Please respond in JSON format with the following schema: number<|im_end|> +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + +Output format prompt: + +```json +Please respond in JSON format with the following schema: number +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? +``` + + +## hermes_2_pro_mistral / TOOLS_THOUGHTFUL_STEPS + + +### hermes_2_pro_mistral / TOOLS_THOUGHTFUL_STEPS / with tools + + +Prompt: + +```js +<|im_start|>system +You are a function calling AI model. 
+Here are the tools available: +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +Please respond in JSON format with the following schema: { +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} +}<|im_end|> +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +{ + "thought_about_next_step_only": "I've thought a lot about this.", + "next_step": { + "tool_calls": [ + { + "id": "call_531873", + "type": "function", + "function": { + "name": "superSecretTool", + "arguments": { + "a": 2535, + "b": 32222000403 + } + } + } + ] + } +}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +{ + "thought_about_next_step_only": "", + "next_step": { + "result": "The sum of 2535 and 32222000403 is 42." + } +}<|im_end|> +<|im_start|>assistant + +``` + + +Output format prompt: + +```json +You are a function calling AI model. +Here are the tools available: +namespace functions { +// Adds two numbers +type superSecretTool = (_: { +a: number, +b: number +}) => any; + +// Says something out loud (TTS) +type say = (_: { +// The text to say out loud +text: string +}) => any; +} // namespace functions +Please respond in JSON format with the following schema: { +thought_about_next_step_only: string, +next_step: { +tool_calls: [{ +name: string, +arguments: any +}][] +}|{ +result: number +} +} +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integer ::= ("-"? integral-part) space +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? 
+next-step ::= next-step-0 | next-step-1 +next-step-0 ::= "{" space next-step-0-tool-calls-kv "}" space +next-step-0-tool-calls ::= "[" space next-step-0-tool-calls-tuple-0 "]" space +next-step-0-tool-calls-kv ::= "\"tool_calls\"" space ":" space next-step-0-tool-calls +next-step-0-tool-calls-tuple-0 ::= next-step-0-tool-calls-tuple-0-0 | next-step-0-tool-calls-tuple-0-1 +next-step-0-tool-calls-tuple-0-0 ::= "{" space next-step-0-tool-calls-tuple-0-0-name-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments ::= "{" space next-step-0-tool-calls-tuple-0-0-arguments-a-kv "," space next-step-0-tool-calls-tuple-0-0-arguments-b-kv "}" space +next-step-0-tool-calls-tuple-0-0-arguments-a-kv ::= "\"a\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-b-kv ::= "\"b\"" space ":" space integer +next-step-0-tool-calls-tuple-0-0-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-0-arguments +next-step-0-tool-calls-tuple-0-0-name ::= "\"superSecretTool\"" +next-step-0-tool-calls-tuple-0-0-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-0-name +next-step-0-tool-calls-tuple-0-1 ::= "{" space next-step-0-tool-calls-tuple-0-1-name-kv "," space next-step-0-tool-calls-tuple-0-1-arguments-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments ::= "{" space next-step-0-tool-calls-tuple-0-1-arguments-text-kv "}" space +next-step-0-tool-calls-tuple-0-1-arguments-kv ::= "\"arguments\"" space ":" space next-step-0-tool-calls-tuple-0-1-arguments +next-step-0-tool-calls-tuple-0-1-arguments-text-kv ::= "\"text\"" space ":" space string +next-step-0-tool-calls-tuple-0-1-name ::= "\"say\"" +next-step-0-tool-calls-tuple-0-1-name-kv ::= "\"name\"" space ":" space next-step-0-tool-calls-tuple-0-1-name +next-step-1 ::= "{" space next-step-1-result-kv "}" space +next-step-1-result-kv ::= "\"result\"" space ":" space integer +next-step-kv ::= "\"next_step\"" space ":" space next-step +root ::= "{" space thought-about-next-step-only-kv "," space next-step-kv "}" space +space ::= " "? +string ::= "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" space +thought-about-next-step-only-kv ::= "\"thought_about_next_step_only\"" space ":" space string +``` + + +### hermes_2_pro_mistral / TOOLS_THOUGHTFUL_STEPS / without tools + + +Prompt: + +```js +<|im_start|>system +Please respond in JSON format with the following schema: number<|im_end|> +<|im_start|>user +Add two numbers for the purpose of this test.<|im_end|> +<|im_start|>assistant +I've thought a lot about this. +{"id": "call_531873", "type": "function", "function": {"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}}<|im_end|> +<|im_start|>tool +32222002938<|im_end|> +<|im_start|>assistant +The sum of 2535 and 32222000403 is 42.<|im_end|> +<|im_start|>assistant + +``` + + +Output format prompt: + +```json +Please respond in JSON format with the following schema: number +``` + + +Grammar: + +```js +decimal-part ::= [0-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +integral-part ::= [0-9] | [1-9] [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? [0-9]? +root ::= ("-"? integral-part) space +space ::= " "? 
+``` + + +## hermes_2_pro_mistral / TOOLS_HERMES_2_PRO + diff --git a/examples/openai/test_chat_handlers.py b/examples/openai/test_chat_handlers.py new file mode 100644 index 0000000000000..50b39f47cbbd5 --- /dev/null +++ b/examples/openai/test_chat_handlers.py @@ -0,0 +1,235 @@ +# +# +# python -m examples.openai.test_chat_handlers | tee examples/openai/test_chat_handlers.md + +import json +import sys + +from examples.openai.api import FunctionCall, Message, Tool, ToolCall, ToolFunction +from examples.openai.prompting import ChatHandlerArgs, ChatTemplate, ToolsPromptStyle, get_chat_handler + +TEST_ARG_A = 2535 +TEST_ARG_B = 32222000403 +TEST_SUM = 32222002938 + +QUESTION = "Add two numbers for the purpose of this test." +ANSWER = "The sum of 2535 and 32222000403 is 42." + +PROMPT_MESSAGE = Message( + role="user", + content=QUESTION, +) +ASSIST_MESSAGE = Message( + role="assistant", + content=ANSWER, +) +TOOL_NAME = "superSecretTool" +TOOL_CALL = ToolCall( + id="call_531873", + type="function", + function=FunctionCall( + name=TOOL_NAME, + arguments={ + "a": TEST_ARG_A, + "b": TEST_ARG_B + } + ) +) +TOOL_CALL_MESSAGE = Message( + role="assistant", + content=None, + tool_calls=[TOOL_CALL], +) + +TEST_THOUGHT = "I've thought a lot about this." +THOUGHTFUL_TOOL_CALL_MESSAGE = Message( + role="assistant", + content=TEST_THOUGHT, + tool_calls=[TOOL_CALL], +) + +# UNDERSCORE_ESCAPED_TOOL_CALL_MESSAGE = Message(**{ +# **TOOL_CALL_MESSAGE.model_dump(), +# "tool_calls": [ +# json.loads(tc.model_dump_json().replace("_", "\\_")) +# for tc in TOOL_CALL_MESSAGE.tool_calls +# ], +# }) +TOOL_MESSAGE = Message( + role="tool", + name=TOOL_NAME, + tool_call_id="call_531873", + content=f'{TEST_SUM}', + tool_calls=None +) +TEST_MESSAGES = [ + PROMPT_MESSAGE, + TOOL_CALL_MESSAGE, + TOOL_MESSAGE, + ASSIST_MESSAGE, +] +TEST_MESSAGES_THOUGHT = [ + PROMPT_MESSAGE, + THOUGHTFUL_TOOL_CALL_MESSAGE, + TOOL_MESSAGE, + ASSIST_MESSAGE, +] + + +TEST_TOOLS = [ + Tool( + type="function", + function=ToolFunction( + name=TOOL_NAME, + description="Adds two numbers", + parameters={ + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + "required": ["a", "b"] + } + ) + ), + Tool( + type="function", + function=ToolFunction( + name="say", + description="Says something out loud (TTS)", + parameters={ + "properties": { + "text": { + "description": "The text to say out loud", + "type": "string" + }, + }, + "required": ["text"] + } + ) + ) +] + +TEST_OUTPUT_SCHEMA = {"type": "integer"} + +# Generate the JSON for TEST_TEMPLATES below by uncommenting this block: +# +# TEST_TEMPLATES = { +# 'mistral_instruct_v0_1': ChatTemplate.from_huggingface("mistralai/Mixtral-8x7B-Instruct-v0.1"), +# 'functionary_v2_2': ChatTemplate.from_huggingface("meetkai/functionary-small-v2.2"), +# 'hermes_2_pro_mistral': ChatTemplate.from_huggingface("NousResearch/Hermes-2-Pro-Mistral-7B"), +# 'llama2': ChatTemplate.from_huggingface("meta-llama/Llama-2-7b-chat-hf"), +# } +# print(json.dumps({k: v.model_dump() for k, v in TEST_TEMPLATES.items()}, indent=2)) +# exit(0) + +TEST_TEMPLATES = { + "mistral_instruct_v0_1": { + "template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and 
assistant roles are supported!') }}{% endif %}{% endfor %}", + "eos_token": "", + "bos_token": "" + }, + "functionary_v2_2": { + "template": "{#v2.2#}\n{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|from|>' + message['role'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% elif message['role'] == 'tool' %}\n{{ '<|from|>' + message['name'] + '\n<|recipient|>all\n<|content|>' + message['content'] + '\n' }}{% else %}\n{% set contain_content='no'%}\n{% if message['content'] is not none %}\n{{ '<|from|>assistant\n<|recipient|>all\n<|content|>' + message['content'] }}{% set contain_content='yes'%}\n{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{% set prompt='<|from|>assistant\n<|recipient|>' + tool_call['function']['name'] + '\n<|content|>' + tool_call['function']['arguments'] %}\n{% if loop.index == 1 and contain_content == \"no\" %}\n{{ prompt }}{% else %}\n{{ '\n' + prompt}}{% endif %}\n{% endfor %}\n{% endif %}\n{{ '<|stop|>\n' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|from|>assistant\n<|recipient|>' }}{% endif %}", + "eos_token": "", + "bos_token": "" + }, + "hermes_2_pro_mistral": { + "template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "eos_token": "<|im_end|>", + "bos_token": "" + }, + "llama2": { + "template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "eos_token": "", + "bos_token": "" + }, +} +MODELS_WITH_PARALLEL_CALLS = set(["functionary_v2_2"]) +TEST_TEMPLATES = {k: ChatTemplate(**v) for k, v in TEST_TEMPLATES.items()} + +if __name__ == "__main__": + + failures = [] + + print(f'\nMessages:\n\n```js\n{json.dumps([m.model_dump() for m in TEST_MESSAGES_THOUGHT], indent=2)}\n```\n') + + def check(b: bool, msg: str): + if not b: + sys.stderr.write(f'FAILURE: {msg}\n\n') + failures.append(msg) + + functionary_v2_2 = TEST_TEMPLATES["functionary_v2_2"] + check(functionary_v2_2.inferred_tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2, "functionary_v2_2 should be inferred as TYPESCRIPT_FUNCTIONARY_V2") + + for model_name, chat_template in TEST_TEMPLATES.items(): + check(chat_template.potentially_supports_parallel_calls == (model_name in MODELS_WITH_PARALLEL_CALLS), + f"{model_name} should {'not ' if model_name not in MODELS_WITH_PARALLEL_CALLS else ''} be detected as potentially supporting parallel calls") + + argss = { + "with tools": ChatHandlerArgs( + chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), + response_schema=TEST_OUTPUT_SCHEMA, + tools=TEST_TOOLS, 
+ ), + "without tools": ChatHandlerArgs( + chat_template=chat_template, #ChatTemplate.from_gguf(GGUFKeyValues(model)), + response_schema=TEST_OUTPUT_SCHEMA, + tools=[], + ), + } + + print(f"\n# {model_name}\n") + + if chat_template.potentially_supports_parallel_calls: + print("\n**Might Support Parallel Tool Calls**\n") + + print(f'\nTemplate:\n\n```js\n{chat_template.template}\n```\n') + + for style in ToolsPromptStyle: + if (style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2) != (model_name.startswith("functionary")): + continue + + if style == ToolsPromptStyle.TOOLS_MIXTRAL and model_name != "mistral_instruct_v0_1": + continue + + if model_name == "mistral_instruct_v0_1" and style not in (ToolsPromptStyle.TOOLS_THOUGHTFUL_STEPS, ToolsPromptStyle.TOOLS_MIXTRAL): + continue + + print(f'\n## {model_name} / {style.name}\n') + + + for tool_situation, args in argss.items(): + ch = get_chat_handler(args, parallel_calls=True, tool_style=style) + + print(f'\n### {model_name} / {style.name} / {tool_situation}\n') + + print(f'\nPrompt:\n\n```js\n{ch.render_prompt(TEST_MESSAGES_THOUGHT)}\n```\n') + + print(f'\nOutput format prompt:\n\n```json\n{ch.output_format_prompt.content}\n```\n') + + print(f'\nGrammar:\n\n```js\n{ch.grammar}\n```\n') + + + # if model_name == 'hermes_2_pro_mistral': + # print("Skipping hermes_2_pro_mistral") + # continue + def check_finds(msgs, strings_to_find): + prompt = ch.render_prompt(msgs) + for s in strings_to_find: + check(str(s) in prompt, f"Missing {s} in prompt for {model_name}:\n{prompt}") + + check_finds([PROMPT_MESSAGE], (QUESTION,)) + check_finds([ASSIST_MESSAGE], (ANSWER,)) + check_finds([TOOL_CALL_MESSAGE], (TEST_ARG_A, TEST_ARG_B, TOOL_NAME)) + check_finds([THOUGHTFUL_TOOL_CALL_MESSAGE], (TEST_THOUGHT, TEST_ARG_A, TEST_ARG_B, TOOL_NAME,)) + check_finds([TOOL_MESSAGE], (TEST_SUM,)) + if chat_template.potentially_supports_parallel_calls: + check_finds([TOOL_MESSAGE], (TOOL_NAME,)) + + + + if failures: + for f in failures: + print(f'{f}\n\n') + + assert not failures diff --git a/examples/openai/ts_converter.py b/examples/openai/ts_converter.py new file mode 100644 index 0000000000000..245e389c103c8 --- /dev/null +++ b/examples/openai/ts_converter.py @@ -0,0 +1,150 @@ +from typing import Any, Dict, List, Set, Tuple, Union +import json + +from pydantic import Json + +class SchemaToTypeScriptConverter: + # TODO: comments for arguments! + # // Get the price of a particular car model + # type get_car_price = (_: { + # // The name of the car model. + # car_name: string, + # }) => any; + + # // get the weather of a location + # type get_weather = (_: { + # // where to get weather. + # location: string, + # }) => any; + + def __init__(self, allow_fetch: bool = True): + self._refs: Dict[str, Json[Any]] = {} + self._refs_being_resolved: Set[str] = set() + self._allow_fetch = allow_fetch + + def resolve_refs(self, schema: Json[Any], url: str): + ''' + Resolves all $ref fields in the given schema, fetching any remote schemas, + replacing $ref with absolute reference URL and populating self._refs with the + respective referenced (sub)schema dictionaries. 
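+
+        Illustrative sketch of the local-ref ('#/...') branch below, using a
+        hypothetical URL:
+
+            converter = SchemaToTypeScriptConverter()
+            schema = {'definitions': {'foo': {'type': 'string'}},
+                      'properties': {'x': {'$ref': '#/definitions/foo'}}}
+            converter.resolve_refs(schema, 'file://local')
+            # schema['properties']['x']['$ref'] == 'file://local#/definitions/foo'
+            # converter._refs['file://local#/definitions/foo'] == {'type': 'string'}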
+ ''' + def visit(n: Json[Any]): + if isinstance(n, list): + return [visit(x) for x in n] + elif isinstance(n, dict): + ref = n.get('$ref') + if ref is not None and ref not in self._refs: + if ref.startswith('https://'): + assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)' + import requests + + frag_split = ref.split('#') + base_url = frag_split[0] + + target = self._refs.get(base_url) + if target is None: + target = self.resolve_refs(requests.get(ref).json(), base_url) + self._refs[base_url] = target + + if len(frag_split) == 1 or frag_split[-1] == '': + return target + elif ref.startswith('#/'): + target = schema + ref = f'{url}{ref}' + n['$ref'] = ref + else: + raise ValueError(f'Unsupported ref {ref}') + + for sel in ref.split('#')[-1].split('/')[1:]: + assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel] + + self._refs[ref] = target + else: + for v in n.values(): + visit(v) + + return n + return visit(schema) + + def _desc_comment(self, schema: Json[Any]): + desc = schema.get("description", "").replace("\n", "\n// ") if 'description' in schema else None + return f'// {desc}\n' if desc else '' + + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], additional_properties: Union[bool, Any]): + if additional_properties == True: + additional_properties = {} + elif additional_properties == False: + additional_properties = None + + return "{\n" + ',\n'.join([ + f'{self._desc_comment(prop_schema)}{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}' + for prop_name, prop_schema in properties + ] + ( + [f"{self._desc_comment(additional_properties) if isinstance(additional_properties, dict) else ''}[key: string]: {self.visit(additional_properties)}"] + if additional_properties is not None else [] + )) + "\n}" + + def visit(self, schema: Json[Any]): + def print_constant(v): + return json.dumps(v) + + schema_type = schema.get('type') + schema_format = schema.get('format') + + if 'oneOf' in schema or 'anyOf' in schema: + return '|'.join(self.visit(s) for s in schema.get('oneOf') or schema.get('anyOf') or []) + + elif isinstance(schema_type, list): + return '|'.join(self.visit({'type': t}) for t in schema_type) + + elif 'const' in schema: + return print_constant(schema['const']) + + elif 'enum' in schema: + return '|'.join((print_constant(v) for v in schema['enum'])) + + elif schema_type in (None, 'object') and \ + ('properties' in schema or \ + ('additionalProperties' in schema and schema['additionalProperties'] is not True)): + required = set(schema.get('required', [])) + properties = list(schema.get('properties', {}).items()) + return self._build_object_rule(properties, required, schema.get('additionalProperties')) + + elif schema_type in (None, 'object') and 'allOf' in schema: + required = set() + properties = [] + def add_component(comp_schema, is_required): + if (ref := comp_schema.get('$ref')) is not None: + comp_schema = self._refs[ref] + + if 'properties' in comp_schema: + for prop_name, prop_schema in comp_schema['properties'].items(): + properties.append((prop_name, prop_schema)) + if is_required: + required.add(prop_name) + + for t in schema['allOf']: + if 'anyOf' in t: + for tt in t['anyOf']: + add_component(tt, is_required=False) + else: + add_component(t, is_required=True) + + return self._build_object_rule(properties, required, additional_properties={}) + + elif schema_type in (None, 'array') and ('items' in schema 
or 'prefixItems' in schema): + items = schema.get('items') or schema['prefixItems'] + if isinstance(items, list): + return '[' + ', '.join(self.visit(item) for item in items) + '][]' + else: + return self.visit(items) + '[]' + + elif schema_type in (None, 'string') and schema_format == 'date-time': + return 'Date' + + elif (schema_type == 'object') or (len(schema) == 0): + return 'any' + + else: + return 'number' if schema_type == 'integer' else schema_type or 'any' diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6ffaa8d9fe637..ed90dbc3566eb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2600,7 +2600,7 @@ int main(int argc, char ** argv) { chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat); + const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat, ""); LOG_INFO("chat template", { {"chat_example", chat_example}, @@ -3078,7 +3078,7 @@ int main(int argc, char ** argv) { if (!result.error && result.stop) { json result_oai = format_final_response_oaicompat(data, result.data, completion_id); - res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); + res.set_content(result_oai.dump(2, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { res_error(res, result.data); } diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 26d9359d7f3f8..5c59d079bbfaa 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -66,6 +66,8 @@ def step_server_config(context, server_fqdn, server_port): context.server_seed = None context.user_api_key = None context.response_format = None + context.tools = None + context.tool_choice = None context.temperature = None context.tasks_result = [] @@ -337,6 +339,13 @@ def step_max_tokens(context, max_tokens): def step_response_format(context, response_format): context.response_format = json.loads(response_format) +@step('tools {tools}') +def step_tools(context, tools): + context.tools = json.loads(tools) + +@step('tool choice {tool_choice}') +def step_tool_choice(context, tool_choice): + context.tool_choice = tool_choice @step('{temperature:f} temperature') def step_temperature(context, temperature): @@ -471,6 +480,11 @@ async def step_oai_chat_completions(context, api_error): response_format=context.response_format if hasattr(context, 'response_format') else None, + tools=context.tools + if hasattr(context, 'tools') else None, + + tool_choice=context.tool_choice, + user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, @@ -541,6 +555,9 @@ async def step_oai_chat_completions(context): if hasattr(context, 'enable_streaming') else None, response_format=context.response_format if hasattr(context, 'response_format') else None, + tools=context.tools + if hasattr(context, 'tools') else None, + tool_choice=context.tool_choice, user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None) @@ -554,16 +571,18 @@ async def step_oai_chat_completions(context): context.base_url, '/chat/completions', True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, + 
model=context.model, + # if hasattr(context, 'model') else None, + n_predict=context.n_predict, + # if hasattr(context, 'n_predict') else None, enable_streaming=context.enable_streaming if hasattr(context, 'enable_streaming') else None, - response_format=context.response_format - if hasattr(context, 'response_format') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) + response_format=context.response_format, + # if hasattr(context, 'response_format') else None, + tools=context.tools,# if hasattr(context, 'tools') else None, + tool_choice=context.tool_choice, # if hasattr(context, 'tool_choice') else None, + user_api_key=context.user_api_key) + # if hasattr(context, 'user_api_key') else None) @step('all prompts are predicted') @@ -908,6 +927,8 @@ async def oai_chat_completions(user_prompt, n_predict=None, enable_streaming=None, response_format=None, + tools=None, + tool_choice=None, user_api_key=None, expect_api_error=None): if debug: @@ -935,6 +956,10 @@ async def oai_chat_completions(user_prompt, } if response_format is not None: payload['response_format'] = response_format + if tools is not None: + payload['tools'] = tools + if tool_choice is not None: + payload['tool_choice'] = tool_choice completion_response = { 'content': '', 'timings': { @@ -996,6 +1021,8 @@ async def oai_chat_completions(user_prompt, max_tokens=n_predict, stream=enable_streaming, response_format=payload.get('response_format'), + tools=payload.get('tools'), + tool_choice=payload.get('tool_choice'), seed=seed, temperature=payload['temperature'] ) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 63fde9c9faabe..b2780921513c2 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -6,11 +6,13 @@ // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +#include "json-schema-to-grammar.h" #include #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -117,7 +119,7 @@ static inline void server_log(const char * level, const char * function, int lin // // Format given chat. If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages, const std::string & extra_system_message) { size_t alloc_size = 0; // vector holding all allocated string to be passed to llama_chat_apply_template std::vector str(messages.size() * 2); @@ -132,6 +134,14 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat[i].content = str[i*2 + 1].c_str(); } + if (!extra_system_message.empty()) { + alloc_size += extra_system_message.size(); + + llama_chat_message msg { "system", extra_system_message.c_str() }; + chat.insert(chat.begin(), msg); + // chat.push_back(msg); + } + const char * ptr_tmpl = tmpl.empty() ? 
nullptr : tmpl.c_str(); std::vector buf(alloc_size * 2); @@ -374,8 +384,9 @@ static json oaicompat_completion_params_parse( llama_params["temperature"] = json_value(body, "temperature", 1.0); llama_params["top_p"] = json_value(body, "top_p", 1.0); + std::string extra_system_message; // Apply chat template to the list of messages - llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); + llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), extra_system_message); // Handle "stop" field if (body.contains("stop") && body.at("stop").is_string()) { @@ -390,9 +401,61 @@ static json oaicompat_completion_params_parse( std::string response_type = json_value(response_format, "type", std::string()); if (response_type == "json_object") { llama_params["json_schema"] = json_value(response_format, "schema", json::object()); + extra_system_message = (std::stringstream() + << "You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n\n" + << llama_params["json_schema"].dump().c_str() + << "\n" + ).str(); } else if (!response_type.empty() && response_type != "text") { throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); } + } else if (body.contains("tools") && body["tools"].is_array()) { + const auto & tools = body["tools"]; + bool built_grammar = false; + bool allow_parallel_calls = false; + bool allow_content = true; + if (body.contains("tool_choice") && body["tool_choice"].is_string() && body["tool_choice"] != "auto") { + std::string tool_choice = body["tool_choice"]; + if (tool_choice == "required") { + allow_content = false; + } else { + for (const auto & tool : tools) { + if (tool["name"] == tool_choice) { + llama_params["grammar"] = tool_call_grammar(json::array({ tool }), allow_parallel_calls, /* allow_content= */ false); + built_grammar = true; + break; + } + } + } + } + if (!built_grammar) { + llama_params["grammar"] = tool_call_grammar(tools, allow_parallel_calls, allow_content); + } + // TODO: pass a template file. + extra_system_message = (std::stringstream() + << "You are a function calling AI model. You are provided with function signatures within XML tags. " + << "You may call one or more functions to assist with the user query. " + // << "Don't make assumptions about what values to plug into functions. " + << "Here are the available tools: " + << tools.dump(2).c_str() + << "\n" + // << "To call a tool give a json object with function name and arguments within XML tags as follows:" + << "For each function call return a json object with function name and arguments within XML tags as follows:" + << "" + << "{\"name\": , \"arguments\": }" + << "" + << "Don't explain which tools you're going to call, just call them." 
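+            // The model is asked to echo each call back wrapped in XML tags
+            // (Hermes 2 Pro style), e.g.:
+            //   <tool_call>{"name": "superSecretTool", "arguments": {"a": 2535, "b": 32222000403}}</tool_call>
+            // format_final_response_oaicompat() below extracts these blocks into
+            // OpenAI-style tool_calls entries and sets finish_reason to "tool_calls".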
+ ).str(); + } + + // Apply chat template to the list of messages + llama_params["prompt"] = format_chat(model, chat_template, body["messages"], extra_system_message); + + // Handle "stop" field + if (body.contains("stop") && body["stop"].is_string()) { + llama_params["stop"] = json::array({body["stop"].get()}); + } else { + llama_params["stop"] = json_value(body, "stop", json::array()); } // Handle "n" field @@ -410,7 +473,7 @@ static json oaicompat_completion_params_parse( } // Params supported by OAI but unsupported by llama.cpp - static const std::vector unsupported_params { "tools", "tool_choice" }; + static const std::vector unsupported_params;// { "tool_choice" }; for (auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); @@ -437,10 +500,36 @@ static json format_final_response_oaicompat(const json & request, json result, c int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); std::string content = json_value(result, "content", std::string("")); + std::string finish_reason = "length"; if (stopped_word || stopped_eos) { finish_reason = "stop"; } + json tool_calls; + json message_content; + if (request.contains("tools")) { + std::regex pattern("(.*?)"); + std::sregex_iterator iter(content.begin(), content.end(), pattern); + std::sregex_iterator end; + while (iter != end) { + std::smatch match = *iter; + auto call = json::parse(match[1].str()); + if (tool_calls.is_null()) { + tool_calls = json::array(); + } + tool_calls.push_back({ + {"function", { + {"name", call["name"]}, + {"arguments", call["arguments"].dump()}, + }}, + }); + finish_reason = "tool_calls"; + ++iter; + } + } + if (tool_calls.is_null()) { + message_content = content; + } json choices = streaming ? json::array({json{{"finish_reason", finish_reason}, @@ -448,7 +537,8 @@ static json format_final_response_oaicompat(const json & request, json result, c {"delta", json::object()}}}) : json::array({json{{"finish_reason", finish_reason}, {"index", 0}, - {"message", json{{"content", content}, + {"message", json{{"content", message_content}, + {"tool_calls", tool_calls}, {"role", "assistant"}}}}}); std::time_t t = std::time(0); diff --git a/gguf-py/examples/read_template.py b/gguf-py/examples/read_template.py new file mode 100644 index 0000000000000..34a998ae5fee4 --- /dev/null +++ b/gguf-py/examples/read_template.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from gguf.gguf_reader import GGUFReader + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: read_template.py ") + sys.exit(1) + gguf_file_path = sys.argv[1] + + reader = GGUFReader(gguf_file_path) + print(reader.read_field(reader.fields['tokenizer.chat_template'])) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index e48bc00c388c8..acb43427e8a1f 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -54,6 +54,7 @@ class ReaderField(NamedTuple): types: list[GGUFValueType] = [] + class ReaderTensor(NamedTuple): name: str tensor_type: GGMLQuantizationType @@ -124,6 +125,21 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r # Fetch a key/value metadata field by key. 
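+    # Note: the new read_field helper below decodes a ReaderField into plain Python
+    # values (a str for strings, a list for arrays, a scalar otherwise), e.g.
+    #   reader.read_field(reader.fields['tokenizer.chat_template'])  # -> str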
def get_field(self, key: str) -> Union[ReaderField, None]: return self.fields.get(key, None) + + def read_field(self, field): + if not field.types: + return None + if field.types[:1] == [GGUFValueType.ARRAY]: + itype = field.types[-1] + if itype == GGUFValueType.STRING: + return [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] + else: + return [pv for idx in field.data for pv in field.parts[idx].tolist()] + elif field.types[0] == GGUFValueType.STRING: + return str(bytes(field.parts[-1]), encoding="utf-8") + else: + assert(field.types[0] in self.gguf_scalar_to_np) + return field.parts[-1].tolist()[0] # Fetch a tensor from the list by index. def get_tensor(self, idx: int) -> ReaderTensor: diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py index 1a37a7b91409d..ff76e7c6a0040 100755 --- a/gguf-py/scripts/gguf-dump.py +++ b/gguf-py/scripts/gguf-dump.py @@ -79,17 +79,7 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: metadata[field.name] = curr if field.types[:1] == [GGUFValueType.ARRAY]: curr["array_types"] = [t.name for t in field.types][1:] - if not args.json_array: - continue - itype = field.types[-1] - if itype == GGUFValueType.STRING: - curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] - else: - curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()] - elif field.types[0] == GGUFValueType.STRING: - curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") - else: - curr["value"] = field.parts[-1].tolist()[0] + curr["value"] = reader.read_field(field) if not args.no_tensors: for idx, tensor in enumerate(reader.tensors): tensors[tensor.name] = { diff --git a/llama.cpp b/llama.cpp index 8b675ea993a38..21410b624cf2a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13850,6 +13850,10 @@ static std::vector llama_grammar_reject_candidates_for_ } } + if (next_candidates.empty()) { + return rejects; + } + const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; // update top of stack to next element, if any @@ -13995,7 +13999,7 @@ struct llama_grammar * llama_grammar_init( // Important: vec_rules has to be moved here, not copied, because stacks contains // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar // then the pointers would be invalidated when the local vec_rules goes out of scope. 
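+    // The two extra empty initializers in the return below are the new
+    // token_pieces and token_codepoints caches added to llama_grammar (see llama.h);
+    // they are populated lazily on the first call to llama_sample_grammar.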
- return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} }; + return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, {}, {} }; } void llama_grammar_free(struct llama_grammar * grammar) { @@ -14003,18 +14007,24 @@ void llama_grammar_free(struct llama_grammar * grammar) { } struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) { - llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 }; + llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints }; + + std::unordered_map element_map; + element_map.reserve(std::accumulate( + grammar->rules.begin(), grammar->rules.end(), 0, + [](size_t acc, const std::vector & rule) { + return acc + rule.size(); + })); + for (size_t ir = 0; ir < grammar->rules.size(); ir++) { + for (size_t ie = 0; ie < grammar->rules[ir].size(); ie++) { + element_map[&grammar->rules[ir][ie]] = &result->rules[ir][ie]; + } + } // redirect elements in stacks to point to new rules for (size_t is = 0; is < result->stacks.size(); is++) { for (size_t ie = 0; ie < result->stacks[is].size(); ie++) { - for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) { - for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) { - if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) { - result->stacks[is][ie] = &result->rules[ir0][ir1]; - } - } - } + result->stacks[is][ie] = element_map.at(grammar->stacks[is][ie]); } } @@ -14484,7 +14494,7 @@ void llama_sample_repetition_penalties( } } -void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) { +void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, struct llama_grammar * grammar) { GGML_ASSERT(ctx); int64_t t_start_sample_us = ggml_time_us(); @@ -14496,8 +14506,20 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } + if (grammar->token_codepoints.empty()) { + auto n_vocab = llama_n_vocab(llama_get_model(ctx)); + grammar->token_codepoints.resize(n_vocab); + grammar->token_pieces.resize(n_vocab); + for (llama_token id = 0; id < n_vocab; ++id) { + const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id); + grammar->token_codepoints[id] = decode_utf8(piece, {0, 0}); + } + } + std::vector, llama_partial_utf8>> candidates_decoded; - candidates_decoded.reserve(candidates->size); + if (grammar->partial_utf8.n_remain > 0) { + candidates_decoded.reserve(candidates->size); + } std::vector candidates_grammar; candidates_grammar.reserve(candidates->size); @@ -14512,6 +14534,9 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; + } else if (grammar->partial_utf8.n_remain == 0){ + const auto & decoded = grammar->token_codepoints.at(id); + candidates_grammar.push_back({ i, decoded.first.data(), decoded.second }); } else { candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); @@ -14707,7 +14732,9 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(piece, grammar->partial_utf8); + 
const auto decoded = grammar->partial_utf8.n_remain == 0 + ? grammar->token_codepoints[token] + : decode_utf8(piece, grammar->partial_utf8); const auto & code_points = decoded.first; std::vector> tmp_new_stacks; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { diff --git a/llama.h b/llama.h index 62908261f2791..11fe84fee688c 100644 --- a/llama.h +++ b/llama.h @@ -1013,7 +1013,7 @@ extern "C" { LLAMA_API void llama_sample_grammar( struct llama_context * ctx, llama_token_data_array * candidates, - const struct llama_grammar * grammar); + struct llama_grammar * grammar); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @@ -1111,6 +1111,11 @@ struct llama_grammar { // buffer for partially generated UTF-8 sequence from accepted tokens llama_partial_utf8 partial_utf8; + + // caching the token pieces & their decoded codepoints. + std::vector token_pieces; + std::vector, + llama_partial_utf8>> token_codepoints; }; struct llama_grammar_candidate {