diff --git a/common/chat.cpp b/common/chat.cpp
index f138c7bcafcfa..8af41a8d454a6 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -26,13 +26,13 @@ struct common_chat_templates {
 struct templates_params {
     json messages;
     json tools;
-    common_chat_tool_choice tool_choice;
     json json_schema;
+    common_chat_tool_choice tool_choice;
     bool parallel_tool_calls;
     bool stream;
-    std::string grammar;
     bool add_generation_prompt = true;
     bool extract_reasoning = true;
+    std::string grammar;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -815,7 +815,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
         }
         builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
     });
-    data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+    data.grammar_triggers.push_back({"[TOOL_CALLS]", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
@@ -862,8 +862,8 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
         builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
     });
     data.grammar_triggers.push_back({
-        COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
         "<|START_ACTION|>",
+        COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
     });
     data.preserved_tokens = {
         "<|START_ACTION|>",
@@ -1004,11 +1004,11 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
     data.grammar_triggers.push_back({
-        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
         "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
+        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
     });
     if (!builtin_tools.empty()) {
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+        data.grammar_triggers.push_back({"<|python_tag|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
         data.preserved_tokens.push_back("<|python_tag|>");
     }
     // Allow a few empty lines on top of the usual constrained json schema space rule.
@@ -1085,10 +1085,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
             "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
             "\"<|tool▁calls▁end|>\""
             " space");
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"});
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"});
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"});
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"});
+        data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+        data.grammar_triggers.push_back({"<|tool_calls_begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+        data.grammar_triggers.push_back({"<|tool calls begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
+        data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
         data.preserved_tokens = {
             "<think>",
             "</think>",
@@ -1196,7 +1196,7 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
         }
         builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
" + builder.add_schema("tool_calls", schema)); }); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["}); + data.grammar_triggers.push_back({" functools[", COMMON_GRAMMAR_TRIGGER_TYPE_WORD}); data.preserved_tokens = { " functools[", }; @@ -1230,20 +1230,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule)); subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, regex_escape(name + "\n"), + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, regex_escape("assistant<|end_header_id|>\n" + name + "\n"), + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, regex_escape(">>>" + name + "\n"), + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ">>>assistant<|end_header_id|>\n" + name, + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, }); }); data.preserved_tokens = { @@ -1339,12 +1339,12 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con }); if (has_raw_python) { tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*")); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.grammar_triggers.push_back({"<|python_tag|>", COMMON_GRAMMAR_TRIGGER_TYPE_WORD}); data.preserved_tokens.push_back("<|python_tag|>"); } auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space"; builder.add_rule("root", inputs.parallel_tool_calls ? 
"(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "\" space")); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "", + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, }); auto escaped_name = regex_escape(name); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, ""}); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "", COMMON_GRAMMAR_TRIGGER_TYPE_WORD}); + data.grammar_triggers.push_back({"|||)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, }); data.preserved_tokens = { "", diff --git a/common/chat.h b/common/chat.h index d26a09c2f7c4f..01ce0a4741e94 100644 --- a/common/chat.h +++ b/common/chat.h @@ -68,18 +68,18 @@ struct common_chat_templates_inputs { bool add_generation_prompt = true; bool use_jinja = true; // Parameters below only supported when use_jinja is true - std::vector tools; - common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; bool extract_reasoning = true; + common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; + std::vector tools; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; struct common_chat_params { common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + bool grammar_lazy = false; std::string prompt; std::string grammar; - bool grammar_lazy = false; std::vector grammar_triggers; std::vector preserved_tokens; std::vector additional_stops; diff --git a/common/common.h b/common/common.h index 556ff5be40798..26c12997bd6b0 100644 --- a/common/common.h +++ b/common/common.h @@ -51,8 +51,8 @@ struct cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
     bool     mask_valid = false; // Default: any CPU
-    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     bool     strict_cpu = false; // Use strict CPU placement
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
 
@@ -119,8 +119,8 @@ enum common_grammar_trigger_type {
 };
 
 struct common_grammar_trigger {
-    common_grammar_trigger_type type;
     std::string value;
+    common_grammar_trigger_type type;
     llama_token token = LLAMA_TOKEN_NULL;
 };
 
@@ -156,6 +156,11 @@ struct common_params_sampling {
     bool no_perf = false; // disable performance metrics
     bool timing_per_token = false;
 
+    bool grammar_lazy = false;
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
+    std::set<llama_token> preserved_tokens;
+
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
@@ -171,11 +176,6 @@ struct common_params_sampling {
         COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-    bool grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
-    std::set<llama_token> preserved_tokens;
-
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
 
     // print the parameters into a string
@@ -240,14 +240,14 @@ struct common_params {
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -283,11 +283,10 @@ struct common_params {
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
-    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
-    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
+    int32_t verbosity = 0;
 
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
 
@@ -296,13 +295,15 @@ struct common_params {
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  // (which is more convenient to use for plotting)
     //
-    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
-    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+
+    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
     bool kl_divergence = false; // compute KL divergence
@@ -337,23 +338,28 @@ struct common_params {
     bool single_turn = false; // single turn chat conversation
 
-    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
-    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+    // options multimodal models (see tools/mtmd)
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
 
-    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+    // options embedding
+    bool embedding = false; // get only sentence embedding
+    bool reranking = false; // enable reranking support on server
 
     // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
-    bool mmproj_use_gpu = true; // use GPU for multimodal model
-    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
-    bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
+    // server params
     int32_t port = 8080; // server listens on this network port
 
@@ -362,19 +368,14 @@ struct common_params {
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
-    std::vector<std::string> api_keys;
-
-    std::string ssl_file_key = ""; // NOLINT
-    std::string ssl_file_cert = ""; // NOLINT
-
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
     bool endpoint_slots = false;
@@ -383,22 +384,20 @@ struct common_params {
     bool log_json = false;
 
-    std::string slot_save_path;
+    std::vector<std::string> api_keys;
 
-    float slot_prompt_similarity = 0.5f;
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
-    // batched-bench params
-    bool is_pp_shared = false;
+    std::string slot_save_path;
 
-    std::vector<int32_t> n_pp;
-    std::vector<int32_t> n_tg;
-    std::vector<int32_t> n_pl;
+    float slot_prompt_similarity = 0.5f;
 
     // retrieval params
-    std::vector<std::string> context_files; // context files to embed
-    int32_t chunk_size = 64; // chunk size for context embedding
+    std::vector<std::string> context_files; // context files to embed
+
     std::string chunk_separator = "\n"; // chunk separator for context embedding
 
     // passkey params
@@ -414,12 +413,19 @@ struct common_params {
     bool compute_ppl = true; // whether to compute perplexity
     bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
-    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
     std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
     std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
diff --git a/common/log.cpp b/common/log.cpp
index 52b31470c46bd..48111a047ece4 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -46,13 +46,13 @@ static std::vector<const char *> g_col = {
 };
 
 struct common_log_entry {
-    enum ggml_log_level level;
-
-    bool prefix;
+    std::vector<char> msg;
 
     int64_t timestamp;
 
-    std::vector<char> msg;
+    enum ggml_log_level level;
+
+    bool prefix;
 
     // signals the worker thread to stop
     bool is_end;
diff --git a/include/llama.h b/include/llama.h
index 52cd7a5a037ef..c324486ea9f39 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -302,6 +302,12 @@ extern "C" {
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
@@ -315,12 +321,6 @@ extern "C" {
 
         // override key-value pairs of the model meta data
        const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 85b4324b699e6..fdc66cf9ee269 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1246,7 +1246,6 @@ llm_graph_result_ptr llama_context::graph_build(
     return model.build_graph(
         {
             /*.ctx =*/ ctx,
-            /*.arch =*/ model.arch,
             /*.hparams =*/ model.hparams,
             /*.cparams =*/ cparams,
             /*.ubatch =*/ ubatch,
@@ -1256,6 +1255,7 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.loras =*/ &loras,
             /*.memory =*/ memory.get(),
             /*.cross =*/ &cross,
+            /*.arch =*/ model.arch,
             /*.n_outputs =*/ n_outputs,
             /*.cb =*/ graph_get_cb(),
         }, gf, gtype);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 13e36d161c614..29b1c6f6c49c2 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -414,7 +414,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 //
 
 llm_graph_context::llm_graph_context(const llm_graph_params & params) :
-    arch (params.arch),
     hparams (params.hparams),
     cparams (params.cparams),
     ubatch (params.ubatch),
@@ -430,6 +429,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_embd_v_gqa (hparams.n_embd_v_gqa()),
     n_expert (hparams.n_expert),
     n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
+    arch (params.arch),
     freq_base (cparams.rope_freq_base),
     freq_scale (cparams.rope_freq_scale),
     ext_factor (cparams.yarn_ext_factor),
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 2b85bb25befba..55af082d449e4 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -374,8 +374,6 @@ using llm_graph_cb = std::function samplers;
-    // timing
+    llama_sampler_chain_params params;
-    mutable int64_t t_sample_us;
+    // timing
 
     mutable int32_t n_sample;
+
+    mutable int64_t t_sample_us;
 };
 
 struct llama_sampler * llama_sampler_init_dry_testing(
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 087665e41411b..98a9a5430a6b2 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -393,7 +393,7 @@ struct server_task {
                     params.sampling.grammar_triggers.push_back(std::move(trigger));
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
-                    params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+                    params.sampling.grammar_triggers.push_back({word, COMMON_GRAMMAR_TRIGGER_TYPE_WORD});
                 }
             } else {
                 params.sampling.grammar_triggers.push_back(std::move(ct.value));
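Note on the mechanical call-site changes above (reviewer note, not part of the patch): common_grammar_trigger now declares value before type, and llm_graph_params now declares arch after cross, so every brace-initialized trigger and the /*.field =*/ initializer list in llama_context::graph_build have to be reordered to match, because C++ aggregate initialization assigns elements strictly in declaration order. A minimal, self-contained sketch of the same pitfall, using simplified stand-in names rather than the real llama.cpp types:

// sketch.cpp - simplified stand-ins for common_grammar_trigger; illustrative only
#include <string>
#include <vector>

enum trigger_type { TRIGGER_WORD, TRIGGER_PATTERN };

struct trigger {
    std::string  value;      // declared first after the reorder
    trigger_type type;       // declared second after the reorder
    int          token = -1; // trailing member keeps its default when omitted
};

int main() {
    std::vector<trigger> triggers;
    // Aggregate initialization fills members in declaration order, so the
    // string must now come first; the old {TRIGGER_WORD, "[TOOL_CALLS]"}
    // order would no longer compile against this layout.
    triggers.push_back({"[TOOL_CALLS]", TRIGGER_WORD});
    return (int) triggers.size();
}

The llama_model_params hunk follows the same reasoning in the other direction: rather than being reordered freely, its booleans are kept grouped, as its comment says, to avoid misalignment during copy-by-value across the C API.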