From ba839d1dd0e2dcce935df3c3d55e4c9b26032500 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Wed, 15 Nov 2023 11:51:27 -0700
Subject: [PATCH 01/10] feat: Allow overriding GGUF metadata when loading model

---
 common/common.cpp |  52 +++++++++
 common/common.h   |   2 +
 llama.cpp         | 263 ++++++++++++++++++++++++++++++++++------------
 llama.h           |  17 +++
 4 files changed, 265 insertions(+), 69 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 6a711420004b4..4262d5c8da0c6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -673,6 +673,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 std::istreambuf_iterator<char>(),
                 std::back_inserter(sparams.grammar)
             );
+        } else if (arg == "--override-kv") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            char * sep = strchr(argv[i], '=');
+            if (sep == nullptr || sep - argv[i] >= 128) {
+                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            struct llama_model_kv_override kvo;
+            std::strncpy(kvo.key, argv[i], sep - argv[i]);
+            kvo.key[sep - argv[i]] = 0;
+            sep++;
+            if (strncmp(sep, "int:", 4) == 0) {
+                sep += 4;
+                kvo.tag = LLAMA_KV_OVERRIDE_INT;
+                kvo.int_value = std::atol(sep);
+            } else if (strncmp(sep, "float:", 6) == 0) {
+                sep += 6;
+                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+                kvo.float_value = std::atof(sep);
+            } else if (strncmp(sep, "bool:", 5) == 0) {
+                sep += 5;
+                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+                if (std::strcmp(sep, "true")) {
+                    kvo.bool_value = true;
+                } else if (std::strcmp(sep, "false")) {
+                    kvo.bool_value = false;
+                } else {
+                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+                    invalid_param = true;
+                    break;
+                }
+            } else {
+                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
         // Parse args for logging parameters
         } else if ( log_param_single_parse( argv[i] ) ) {
@@ -716,6 +757,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back(llama_model_kv_override());
+        params.kv_overrides.back().key[0] = 0;
+    }
+
     return true;
 }
@@ -892,6 +938,12 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap     = params.use_mmap;
     mparams.use_mlock    = params.use_mlock;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }
 
     return mparams;
 }
diff --git a/common/common.h b/common/common.h
index dd6b002eb94ba..38417f3537a13 100644
--- a/common/common.h
+++ b/common/common.h
@@ -86,6 +86,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir = "";  // directory in which to save YAML log files
 
+    std::vector<llama_model_kv_override> kv_overrides;
+
     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
diff --git a/llama.cpp b/llama.cpp
index 01522fdb4e74f..66e5a6c70f82d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -569,21 +569,6 @@ struct LLM_TN {
 // gguf helpers
 //
 
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
-    } \
-} while (0)
-
 static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE,   "none"   },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -1712,21 +1697,34 @@ struct llama_model_loader {
     llama_fver  fver;
 
     std::unique_ptr<llama_mmap> mapping;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * ctx_gguf = NULL;
     struct ggml_context * ctx_meta = NULL;
 
-    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
+    std::string arch_name;
+    LLM_KV      llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
         };
 
+        if (param_overrides_p != nullptr) {
+            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+                kv_overrides.insert({std::string(p->key), *p});
+            }
+        }
+
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
@@ -1828,19 +1826,149 @@ struct llama_model_loader {
         }
     }
 
-    std::string get_arch_name() const {
-        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+    private:
+    template<typename T> struct gk_get_arrlen { T & output; };
+    template<typename TI, typename TO> struct gk_set_literal { TI & input; TO & output; };
+    template<typename T>
+    void gk_set(int kid, T & result) {
+        (void)result;
+        throw std::runtime_error(format("request for key id %d with unhandled result type: %s", kid, typeid(T).name()));
+    }
+
+    template<> void gk_set(int k, uint8_t     & r) { r = gguf_get_val_u8  (ctx_gguf, k); }
+    template<> void gk_set(int k, uint16_t    & r) { r = gguf_get_val_u16 (ctx_gguf, k); }
+    template<> void gk_set(int k, uint32_t    & r) { r = gguf_get_val_u32 (ctx_gguf, k); }
+    template<> void gk_set(int k, uint64_t    & r) { r = gguf_get_val_u64 (ctx_gguf, k); }
+    template<> void gk_set(int k, int8_t      & r) { r = gguf_get_val_i8  (ctx_gguf, k); }
+    template<> void gk_set(int k, int16_t     & r) { r = gguf_get_val_i16 (ctx_gguf, k); }
+    template<> void gk_set(int k, int32_t     & r) { r = gguf_get_val_i32 (ctx_gguf, k); }
+    template<> void gk_set(int k, int64_t     & r) { r = gguf_get_val_i64 (ctx_gguf, k); }
+    template<> void gk_set(int k, float       & r) { r = gguf_get_val_f32 (ctx_gguf, k); }
+    template<> void gk_set(int k, double      & r) { r = gguf_get_val_f64 (ctx_gguf, k); }
+    template<> void gk_set(int k, bool        & r) { r = gguf_get_val_bool(ctx_gguf, k); }
+    template<> void gk_set(int k, std::string & r) { r = std::string(gguf_get_val_str(ctx_gguf, k)); }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, void>::type
+    gk_set(int k, struct gk_get_arrlen<T> & r) { r.output = gguf_get_arr_n(ctx_gguf, k); }
+
+    template<typename TI, typename TO>
+    void gk_set_lit(TI i, TO o) {
+        (void)i; (void)o;
+        throw std::runtime_error(format("gk_set_lit can't handle types: in=%s, out=%s",
+            typeid(TI).name(), typeid(TO).name()));
+    }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, void>::type
+    gk_set_lit(const int64_t & i, T & o) { o = T(i); }
+
+    template<typename T>
+    typename std::enable_if<std::is_floating_point<T>::value, void>::type
+    gk_set_lit(const double & i, T & o) { o = T(i); }
+
+    template<typename T>
+    void gk_set_lit(const T & i, T & o) { o = i; }
+
+    public:
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = false) {
+        const auto & tt = typeid(T);
+        enum gguf_type gt = GGUF_TYPE_COUNT;
+        enum llama_model_kv_override_type ot = LLAMA_KV_OVERRIDE_INT;
+        bool is_signed = false, can_override = true;
+        if (tt == typeid(uint8_t)) {
+            gt = GGUF_TYPE_UINT8;
+        } else if (tt == typeid(uint16_t)) {
+            gt = GGUF_TYPE_UINT16;
+        } else if (tt == typeid(uint32_t)) {
+            gt = GGUF_TYPE_UINT32;
+        } else if (tt == typeid(uint64_t)) {
+            gt = GGUF_TYPE_UINT64;
+        } else if (tt == typeid(int8_t)) {
+            is_signed = true;
+            gt = GGUF_TYPE_INT8;
+        } else if (tt == typeid(int16_t)) {
+            is_signed = true;
+            gt = GGUF_TYPE_INT16;
+        } else if (tt == typeid(int32_t)) {
+            is_signed = true;
+            gt = GGUF_TYPE_INT32;
+        } else if (tt == typeid(int64_t)) {
+            is_signed = true;
+            gt = GGUF_TYPE_INT64;
+        } else if (tt == typeid(float)) {
+            is_signed = true;
+            gt = GGUF_TYPE_FLOAT32;
+            ot = LLAMA_KV_OVERRIDE_FLOAT;
+        } else if (tt == typeid(double)) {
+            is_signed = true;
+            gt = GGUF_TYPE_FLOAT64;
+            ot = LLAMA_KV_OVERRIDE_FLOAT;
+        } else if (tt == typeid(bool)) {
+            gt = GGUF_TYPE_BOOL;
+            ot = LLAMA_KV_OVERRIDE_BOOL;
+        } else if (tt == typeid(std::string)) {
+            can_override = false;
+            gt = GGUF_TYPE_STRING;
+        } else {
+            throw std::runtime_error(format("request for key '%s' with unknown result type: %s", key.c_str(), tt.name()));
+        }
+
+        if (can_override) {
+            auto it = kv_overrides.find(key);
+            if (it != kv_overrides.end()) {
+                struct llama_model_kv_override & po = it->second;
+                if (po.tag != ot) {
+                    // Bad type
+                } else if (ot == LLAMA_KV_OVERRIDE_INT && po.int_value < 0 && !is_signed) {
+                    // Out of range
+                } else {
+                    switch (po.tag) {
+                        case LLAMA_KV_OVERRIDE_INT:   gk_set_lit(po.int_value,   result); break;
+                        case LLAMA_KV_OVERRIDE_FLOAT: gk_set_lit(po.float_value, result); break;
+                        case LLAMA_KV_OVERRIDE_BOOL:  gk_set_lit(po.bool_value,  result); break;
+                        default: GGML_ASSERT(false && "Impossible: Unhandled override tag type");
+                    }
+                    return true;
+                }
+            }
+        }
+
+        const int kid = gguf_find_key(ctx_gguf, key.c_str());
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        const enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+        if (ktype == GGUF_TYPE_ARRAY && ot == LLAMA_KV_OVERRIDE_INT) {
+            gk_get_arrlen<T> arrlen = {result};
+            gk_set(kid, arrlen);
+            return true;
+        }
+        if (ktype != gt) {
+            throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                key.c_str(), gguf_type_name(ktype), gguf_type_name(gt)));
+        }
+        gk_set(kid, result);
 
-        std::string arch_name;
-        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+        return true;
+    }
+
+    template<typename T>
+    bool get_key(const enum llm_kv kid, T & result, const bool required = false) {
+        return get_key(llm_kv(kid), result, required);
+    }
 
+    std::string get_arch_name() const {
         return arch_name;
     }
 
     enum llm_arch get_arch() const {
-        const std::string arch_name = get_arch_name();
-
-        return llm_arch_from_string(arch_name);
+        return llm_kv.arch;
     }
 
     const char * get_tensor_name(int i) const {
@@ -2087,49 +2215,44 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(model.arch);
-
     auto & hparams = model.hparams;
 
     // get general kv
-    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
+    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+    ml.get_key(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab,     true);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train, true);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd,      true);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff,        true);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head,      true);
+    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer,     true);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
-    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
 
-    hparams.rope_finetuned = false;
-    GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
-                 kv(LLM_KV_ROPE_SCALING_FINETUNED));
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
 
     hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-    GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
-                 kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
 
     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
-    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
 
     std::string rope_scaling("linear");
-    GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
-    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
+    ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false);
     if (ropescale == 0.0f) { // try the old key name
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
@@ -2137,7 +2260,7 @@ static void llm_load_hparams(
     {
         hparams.n_rot = hparams.n_embd / hparams.n_head;
 
-        GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
         if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
             if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2152,7 +2275,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, true);
 
                 switch (hparams.n_layer) {
                     case 26: model.type = e_model::MODEL_3B; break;
@@ -2166,7 +2289,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2176,7 +2299,7 @@ static void llm_load_hparams(
             } break;
        case LLM_ARCH_BAICHUAN:
            {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, true);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2185,7 +2308,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STARCODER:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 36: model.type = e_model::MODEL_3B; break;
@@ -2196,7 +2319,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PERSIMMON:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
                 switch (hparams.n_layer) {
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2204,7 +2327,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_REFACT:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, true);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2212,7 +2335,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BLOOM:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
 
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
@@ -2227,9 +2350,9 @@ static void llm_load_hparams(
             {
                 hparams.f_clamp_kqv = 0.0f;
 
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
-                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
-                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, true);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2239,7 +2362,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STABLELM:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -2287,7 +2410,7 @@ static void llm_load_vocab(
 
     {
         std::string tokenizer_name;
-        GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name, true);
 
         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2377,16 +2500,17 @@ static void llm_load_vocab(
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it), old_id = id;
+            int32_t & id = std::get<1>(it);
 
-            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
-            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
-            // can only come from the default value, so there's no point in
-            // validating that.
-            if (size_t(id + 1) > vocab.id_to_token.size()) {
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
-                    __func__, key.c_str(), id, old_id);
-                id = old_id;
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
             }
         }
     }
@@ -3260,7 +3384,7 @@ static void llm_load_tensors(
 
 static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap);
+        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
@@ -7759,7 +7883,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap);
+    llama_model_loader ml(fname_inp, use_mmap, NULL);
     if (ml.use_mmap) {
         ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
     }
@@ -8055,7 +8179,7 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/NULL));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -8286,6 +8410,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only   =*/ false,
         /*.use_mmap     =*/ true,
        /*.use_mlock    =*/ false,
+        /*.kv_overrides =*/ nullptr,
    };
 
 #ifdef GGML_USE_METAL
diff --git a/llama.h b/llama.h
index e8dc04bb54b81..80c00e73ea0fd 100644
--- a/llama.h
+++ b/llama.h
@@ -158,6 +158,22 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
 
+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@@ -172,6 +188,7 @@ extern "C" {
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
+        const struct llama_model_kv_override * kv_overrides;
     };
 
     struct llama_context_params {
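For context before the follow-up fixes: a minimal sketch (not part of the series) of how a client could drive the public API that patch 01 adds. The model path and the overridden key are illustrative placeholders, and note the terminating element with an empty key, which the common.cpp glue above asserts on:

    // Sketch: override llama.context_length while loading a model.
    #include "llama.h"
    #include <cstring>

    int main(void) {
        struct llama_model_kv_override overrides[2];
        std::strcpy(overrides[0].key, "llama.context_length");
        overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
        overrides[0].int_value = 4096;
        overrides[1].key[0]    = 0; // terminator: an empty key ends the array

        struct llama_model_params mparams = llama_model_default_params();
        mparams.kv_overrides = overrides;

        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }

From the command line, the same override is spelled --override-kv llama.context_length=int:4096 and may be repeated for additional keys.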
From 9d39deab8fea3b3b0a1430768c7903a3170a9bcc Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Wed, 15 Nov 2023 12:14:19 -0700
Subject: [PATCH 02/10] Fix the one time GCC is stricter than clang about
 something

---
 llama.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 66e5a6c70f82d..c7e97f8bea8de 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1835,18 +1835,18 @@ struct llama_model_loader {
         throw std::runtime_error(format("request for key id %d with unhandled result type: %s", kid, typeid(T).name()));
     }
 
-    template<> void gk_set(int k, uint8_t     & r) { r = gguf_get_val_u8  (ctx_gguf, k); }
-    template<> void gk_set(int k, uint16_t    & r) { r = gguf_get_val_u16 (ctx_gguf, k); }
-    template<> void gk_set(int k, uint32_t    & r) { r = gguf_get_val_u32 (ctx_gguf, k); }
-    template<> void gk_set(int k, uint64_t    & r) { r = gguf_get_val_u64 (ctx_gguf, k); }
-    template<> void gk_set(int k, int8_t      & r) { r = gguf_get_val_i8  (ctx_gguf, k); }
-    template<> void gk_set(int k, int16_t     & r) { r = gguf_get_val_i16 (ctx_gguf, k); }
-    template<> void gk_set(int k, int32_t     & r) { r = gguf_get_val_i32 (ctx_gguf, k); }
-    template<> void gk_set(int k, int64_t     & r) { r = gguf_get_val_i64 (ctx_gguf, k); }
-    template<> void gk_set(int k, float       & r) { r = gguf_get_val_f32 (ctx_gguf, k); }
-    template<> void gk_set(int k, double      & r) { r = gguf_get_val_f64 (ctx_gguf, k); }
-    template<> void gk_set(int k, bool        & r) { r = gguf_get_val_bool(ctx_gguf, k); }
-    template<> void gk_set(int k, std::string & r) { r = std::string(gguf_get_val_str(ctx_gguf, k)); }
+    void gk_set(int k, uint8_t     & r) { r = gguf_get_val_u8  (ctx_gguf, k); }
+    void gk_set(int k, uint16_t    & r) { r = gguf_get_val_u16 (ctx_gguf, k); }
+    void gk_set(int k, uint32_t    & r) { r = gguf_get_val_u32 (ctx_gguf, k); }
+    void gk_set(int k, uint64_t    & r) { r = gguf_get_val_u64 (ctx_gguf, k); }
+    void gk_set(int k, int8_t      & r) { r = gguf_get_val_i8  (ctx_gguf, k); }
+    void gk_set(int k, int16_t     & r) { r = gguf_get_val_i16 (ctx_gguf, k); }
+    void gk_set(int k, int32_t     & r) { r = gguf_get_val_i32 (ctx_gguf, k); }
+    void gk_set(int k, int64_t     & r) { r = gguf_get_val_i64 (ctx_gguf, k); }
+    void gk_set(int k, float       & r) { r = gguf_get_val_f32 (ctx_gguf, k); }
+    void gk_set(int k, double      & r) { r = gguf_get_val_f64 (ctx_gguf, k); }
+    void gk_set(int k, bool        & r) { r = gguf_get_val_bool(ctx_gguf, k); }
+    void gk_set(int k, std::string & r) { r = std::string(gguf_get_val_str(ctx_gguf, k)); }
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, void>::type
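The portability issue patch 02 works around, reduced to a sketch (illustrative, not from the tree): explicit specializations of a member function template may not be declared at class scope, a rule GCC enforces with "explicit specialization in non-namespace scope" while clang has historically accepted the construct. Plain non-template overloads sidestep the rule and win overload resolution against the template anyway:

    struct loader {
        template <typename T> void gk_set(int k, T & r);

        // Rejected by GCC (accepted by clang); this is the shape patch 01
        // relied on and patch 02 removes:
        //
        //     template <> void gk_set<int>(int k, int & r) { r = k; }

        // Portable replacement: an ordinary overload, preferred over the
        // template whenever both match exactly.
        void gk_set(int k, int & r) { r = k; }
    };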
From 69be5c3d6d324e1d6f10c3e5ec336b5ad4942658 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 16 Nov 2023 01:09:41 -0700
Subject: [PATCH 03/10] Step1

---
 llama.cpp | 67 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index c7e97f8bea8de..a29f5b4199a98 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1834,26 +1834,25 @@ struct llama_model_loader {
         (void)result;
         throw std::runtime_error(format("request for key id %d with unhandled result type: %s", kid, typeid(T).name()));
     }
-
-    void gk_set(int k, uint8_t     & r) { r = gguf_get_val_u8  (ctx_gguf, k); }
-    void gk_set(int k, uint16_t    & r) { r = gguf_get_val_u16 (ctx_gguf, k); }
-    void gk_set(int k, uint32_t    & r) { r = gguf_get_val_u32 (ctx_gguf, k); }
-    void gk_set(int k, uint64_t    & r) { r = gguf_get_val_u64 (ctx_gguf, k); }
-    void gk_set(int k, int8_t      & r) { r = gguf_get_val_i8  (ctx_gguf, k); }
-    void gk_set(int k, int16_t     & r) { r = gguf_get_val_i16 (ctx_gguf, k); }
-    void gk_set(int k, int32_t     & r) { r = gguf_get_val_i32 (ctx_gguf, k); }
-    void gk_set(int k, int64_t     & r) { r = gguf_get_val_i64 (ctx_gguf, k); }
-    void gk_set(int k, float       & r) { r = gguf_get_val_f32 (ctx_gguf, k); }
-    void gk_set(int k, double      & r) { r = gguf_get_val_f64 (ctx_gguf, k); }
-    void gk_set(int k, bool        & r) { r = gguf_get_val_bool(ctx_gguf, k); }
-    void gk_set(int k, std::string & r) { r = std::string(gguf_get_val_str(ctx_gguf, k)); }
+    void gk_set(const int k, uint8_t     & r) { r = gguf_get_val_u8  (ctx_gguf, k); }
+    void gk_set(const int k, uint16_t    & r) { r = gguf_get_val_u16 (ctx_gguf, k); }
+    void gk_set(const int k, uint32_t    & r) { r = gguf_get_val_u32 (ctx_gguf, k); }
+    void gk_set(const int k, uint64_t    & r) { r = gguf_get_val_u64 (ctx_gguf, k); }
+    void gk_set(const int k, int8_t      & r) { r = gguf_get_val_i8  (ctx_gguf, k); }
+    void gk_set(const int k, int16_t     & r) { r = gguf_get_val_i16 (ctx_gguf, k); }
+    void gk_set(const int k, int32_t     & r) { r = gguf_get_val_i32 (ctx_gguf, k); }
+    void gk_set(const int k, int64_t     & r) { r = gguf_get_val_i64 (ctx_gguf, k); }
+    void gk_set(const int k, float       & r) { r = gguf_get_val_f32 (ctx_gguf, k); }
+    void gk_set(const int k, double      & r) { r = gguf_get_val_f64 (ctx_gguf, k); }
+    void gk_set(const int k, bool        & r) { r = gguf_get_val_bool(ctx_gguf, k); }
+    void gk_set(const int k, std::string & r) { r = std::string(gguf_get_val_str(ctx_gguf, k)); }
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, void>::type
-    gk_set(int k, struct gk_get_arrlen<T> & r) { r.output = gguf_get_arr_n(ctx_gguf, k); }
+    gk_set(const int k, struct gk_get_arrlen<T> r) { r.output = gguf_get_arr_n(ctx_gguf, k); }
 
     template<typename TI, typename TO>
-    void gk_set_lit(TI i, TO o) {
+    void gk_set_lit(const TI i, TO o) {
         (void)i; (void)o;
         throw std::runtime_error(format("gk_set_lit can't handle types: in=%s, out=%s",
             typeid(TI).name(), typeid(TO).name()));
@@ -1861,11 +1860,11 @@ struct llama_model_loader {
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, void>::type
-    gk_set_lit(const int64_t & i, T & o) { o = T(i); }
+    gk_set_lit(const int64_t i, T & o) { o = T(i); }
 
     template<typename T>
     typename std::enable_if<std::is_floating_point<T>::value, void>::type
-    gk_set_lit(const double & i, T & o) { o = T(i); }
+    gk_set_lit(const double i, T & o) { o = T(i); }
 
     template<typename T>
     void gk_set_lit(const T & i, T & o) { o = i; }
@@ -1873,46 +1872,45 @@ struct llama_model_loader {
     public:
     template<typename T>
     bool get_key(const std::string & key, T & result, const bool required = false) {
-        const auto & tt = typeid(T);
         enum gguf_type gt = GGUF_TYPE_COUNT;
         enum llama_model_kv_override_type ot = LLAMA_KV_OVERRIDE_INT;
         bool is_signed = false, can_override = true;
-        if (tt == typeid(uint8_t)) {
+        if (std::is_same<T, uint8_t>::value) {
             gt = GGUF_TYPE_UINT8;
-        } else if (tt == typeid(uint16_t)) {
+        } else if (std::is_same<T, uint16_t>::value) {
             gt = GGUF_TYPE_UINT16;
-        } else if (tt == typeid(uint32_t)) {
+        } else if (std::is_same<T, uint32_t>::value) {
             gt = GGUF_TYPE_UINT32;
-        } else if (tt == typeid(uint64_t)) {
+        } else if (std::is_same<T, uint64_t>::value) {
             gt = GGUF_TYPE_UINT64;
-        } else if (tt == typeid(int8_t)) {
+        } else if (std::is_same<T, int8_t>::value) {
             is_signed = true;
             gt = GGUF_TYPE_INT8;
-        } else if (tt == typeid(int16_t)) {
+        } else if (std::is_same<T, int16_t>::value) {
             is_signed = true;
             gt = GGUF_TYPE_INT16;
-        } else if (tt == typeid(int32_t)) {
+        } else if (std::is_same<T, int32_t>::value) {
             is_signed = true;
             gt = GGUF_TYPE_INT32;
-        } else if (tt == typeid(int64_t)) {
+        } else if (std::is_same<T, int64_t>::value) {
             is_signed = true;
             gt = GGUF_TYPE_INT64;
-        } else if (tt == typeid(float)) {
+        } else if (std::is_same<T, float>::value) {
             is_signed = true;
             gt = GGUF_TYPE_FLOAT32;
             ot = LLAMA_KV_OVERRIDE_FLOAT;
-        } else if (tt == typeid(double)) {
+        } else if (std::is_same<T, double>::value) {
             is_signed = true;
             gt = GGUF_TYPE_FLOAT64;
             ot = LLAMA_KV_OVERRIDE_FLOAT;
-        } else if (tt == typeid(bool)) {
+        } else if (std::is_same<T, bool>::value) {
             gt = GGUF_TYPE_BOOL;
             ot = LLAMA_KV_OVERRIDE_BOOL;
-        } else if (tt == typeid(std::string)) {
+        } else if (std::is_same<T, std::string>::value) {
             can_override = false;
             gt = GGUF_TYPE_STRING;
         } else {
-            throw std::runtime_error(format("request for key '%s' with unknown result type: %s", key.c_str(), tt.name()));
+            throw std::runtime_error(format("request for key '%s' with unknown result type: %s", key.c_str(), typeid(T).name()));
         }
 
         if (can_override) {
@@ -1921,13 +1919,16 @@ struct llama_model_loader {
                 struct llama_model_kv_override & po = it->second;
                 if (po.tag != ot) {
                     // Bad type
+                    // FIXME: Error reporting
                 } else if (ot == LLAMA_KV_OVERRIDE_INT && po.int_value < 0 && !is_signed) {
                     // Out of range
+                    // FIXME: Error reporting
                 } else {
+                    // FIXME: Possible informational output
                     switch (po.tag) {
-                        case LLAMA_KV_OVERRIDE_INT:   gk_set_lit(po.int_value,   result); break;
+                        case LLAMA_KV_OVERRIDE_INT:   gk_set_lit(po.int_value, result); break;
                         case LLAMA_KV_OVERRIDE_FLOAT: gk_set_lit(po.float_value, result); break;
-                        case LLAMA_KV_OVERRIDE_BOOL:  gk_set_lit(po.bool_value,  result); break;
+                        case LLAMA_KV_OVERRIDE_BOOL:  gk_set_lit(po.bool_value, result); break;
                         default: GGML_ASSERT(false && "Impossible: Unhandled override tag type");
                     }
                     return true;
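The typeid-to-std::is_same switch in patch 03 is not just cosmetic: typeid comparisons happen at run time, whereas std::is_same is a compile-time constant, so each get_key<T> instantiation can fold its whole type-dispatch chain down to the one branch that applies. A minimal illustration (hypothetical, not from the patch):

    #include <cstdint>
    #include <type_traits>

    template <typename T>
    constexpr bool is_unsigned_kv() {
        // Evaluated entirely during compilation; dead branches vanish
        // in every instantiation of the surrounding function template.
        return std::is_same<T, uint8_t>::value  || std::is_same<T, uint16_t>::value ||
               std::is_same<T, uint32_t>::value || std::is_same<T, uint64_t>::value;
    }

    static_assert( is_unsigned_kv<uint32_t>(), "decided at compile time");
    static_assert(!is_unsigned_kv<float>(),    "no run-time type_info involved");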
From 8c9f776952442ee121a4a0e4cff204dab43a2213 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 16 Nov 2023 19:08:48 -0700
Subject: [PATCH 04/10] Refactor... basically everything!

---
 llama.cpp | 345 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 204 insertions(+), 141 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index a29f5b4199a98..742b8b86bd0af 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -75,6 +75,7 @@
 #include <set>
 #include <sstream>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>
 
 #if defined(_MSC_VER)
@@ -1682,6 +1683,161 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     return buf;
 }
 
+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    struct GetArrayLen{int value;};
+    template<> struct GKV_Base<GetArrayLen> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static GetArrayLen getter(const gguf_context *ctx, const int k) {
+            return GetArrayLen{gguf_get_arr_n(ctx, k)};
+        }
+    };
+
+    struct ArrayInfo{
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            return ArrayInfo {
+                gguf_get_arr_type(ctx, k),
+                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV: public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        // This can't be uncommented.
+        // template<typename OT> static bool try_override(OT & target, const struct llama_model_kv_override *override) = delete;
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (!override) {
+                return false;
+            }
+            if (override->tag != LLAMA_KV_OVERRIDE_BOOL) {
+                return false;
+            }
+            target = override->bool_value;
+            return true;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (!override) {
+                return false;
+            }
+            if (override->tag != LLAMA_KV_OVERRIDE_INT) {
+                return false;
+            }
+            if (override->int_value < 0 && !std::is_signed<OT>::value) {
+                return false;
+            }
+            target = override->int_value;
+            return true;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            if (!override) {
+                return false;
+            }
+            if (override->tag != LLAMA_KV_OVERRIDE_FLOAT) {
+                return false;
+            }
+            target = override->float_value;
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
+            if (try_override<T>(target, override)) {
+                return true;
+            }
+            target = get_kv(ctx, k);
+            return true;
+        }
+
+        template<typename TT>
+        static bool set(const gguf_context * ctx, const char * key, TT & target, const struct llama_model_kv_override *override = nullptr) {
+            const int kid = gguf_find_key(ctx, key);
+            if (kid < 0) {
+                return false;
+            }
+            return GKV<TT>::set(ctx, kid, target, override);
+        }
+
+        template<typename TT>
+        static bool set(const gguf_context * ctx, const std::string & key, TT & target, const struct llama_model_kv_override *override = nullptr) {
+            return GKV<TT>::set(ctx, key.c_str(), target, override);
+        }
+    };
+
+    template<>
+    class GKV<std::string>: public GKV_Base<const char *> {
+        using BT = const char *;
+        public:
+        static bool set(const gguf_context * ctx, const int k, std::string & target, const struct llama_model_kv_override *override = nullptr) {
+            (void)override;
+            target = std::string(GKV<BT>::get_kv(ctx, k));
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const char * key, std::string & target, const struct llama_model_kv_override *override = nullptr) {
+            return GKV<BT>::set(ctx, key, target, override);
+        }
+
+        static bool set(const gguf_context * ctx, const std::string & key, std::string & target, const struct llama_model_kv_override *override = nullptr) {
+            return GKV<BT>::set(ctx, key, target, override);
+        }
+    };
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
@@ -1826,141 +1982,48 @@ struct llama_model_loader {
         }
     }
 
-    private:
-    template<typename T> struct gk_get_arrlen { T & output; };
-    template<typename TI, typename TO> struct gk_set_literal { TI & input; TO & output; };
-    template<typename T>
-    void gk_set(int kid, T & result) {
-        (void)result;
-        throw std::runtime_error(format("request for key id %d with unhandled result type: %s", kid, typeid(T).name()));
-    }
-    void gk_set(const int k, uint8_t     & r) { r = gguf_get_val_u8  (ctx_gguf, k); }
-    void gk_set(const int k, uint16_t    & r) { r = gguf_get_val_u16 (ctx_gguf, k); }
-    void gk_set(const int k, uint32_t    & r) { r = gguf_get_val_u32 (ctx_gguf, k); }
-    void gk_set(const int k, uint64_t    & r) { r = gguf_get_val_u64 (ctx_gguf, k); }
-    void gk_set(const int k, int8_t      & r) { r = gguf_get_val_i8  (ctx_gguf, k); }
-    void gk_set(const int k, int16_t     & r) { r = gguf_get_val_i16 (ctx_gguf, k); }
-    void gk_set(const int k, int32_t     & r) { r = gguf_get_val_i32 (ctx_gguf, k); }
-    void gk_set(const int k, int64_t     & r) { r = gguf_get_val_i64 (ctx_gguf, k); }
-    void gk_set(const int k, float       & r) { r = gguf_get_val_f32 (ctx_gguf, k); }
-    void gk_set(const int k, double      & r) { r = gguf_get_val_f64 (ctx_gguf, k); }
-    void gk_set(const int k, bool        & r) { r = gguf_get_val_bool(ctx_gguf, k); }
-    void gk_set(const int k, std::string & r) { r = std::string(gguf_get_val_str(ctx_gguf, k)); }
+    template<typename T> typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, const bool required = false) {
+        const int kid = gguf_find_key(ctx_gguf, key.c_str());
 
-    template<typename T>
-    typename std::enable_if<std::is_integral<T>::value, void>::type
-    gk_set(const int k, struct gk_get_arrlen<T> r) { r.output = gguf_get_arr_n(ctx_gguf, k); }
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
 
-    template<typename TI, typename TO>
-    void gk_set_lit(const TI i, TO o) {
-        (void)i; (void)o;
-        throw std::runtime_error(format("gk_set_lit can't handle types: in=%s, out=%s",
-            typeid(TI).name(), typeid(TO).name()));
-    }
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
 
-    template<typename T>
-    typename std::enable_if<std::is_integral<T>::value, void>::type
-    gk_set_lit(const int64_t i, T & o) { o = T(i); }
 
-    template<typename T>
-    typename std::enable_if<std::is_floating_point<T>::value, void>::type
-    gk_set_lit(const double i, T & o) { o = T(i); }
+        result = arr_info.length;
+        return true;
+    }
 
-    template<typename T>
-    void gk_set_lit(const T & i, T & o) { o = i; }
+    template<typename T> typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_arr_n(llm_kv(kid), result, required);
+    }
 
-    public:
     template<typename T>
-    bool get_key(const std::string & key, T & result, const bool required = false) {
-        enum gguf_type gt = GGUF_TYPE_COUNT;
-        enum llama_model_kv_override_type ot = LLAMA_KV_OVERRIDE_INT;
-        bool is_signed = false, can_override = true;
-        if (std::is_same<T, uint8_t>::value) {
-            gt = GGUF_TYPE_UINT8;
-        } else if (std::is_same<T, uint16_t>::value) {
-            gt = GGUF_TYPE_UINT16;
-        } else if (std::is_same<T, uint32_t>::value) {
-            gt = GGUF_TYPE_UINT32;
-        } else if (std::is_same<T, uint64_t>::value) {
-            gt = GGUF_TYPE_UINT64;
-        } else if (std::is_same<T, int8_t>::value) {
-            is_signed = true;
-            gt = GGUF_TYPE_INT8;
-        } else if (std::is_same<T, int16_t>::value) {
-            is_signed = true;
-            gt = GGUF_TYPE_INT16;
-        } else if (std::is_same<T, int32_t>::value) {
-            is_signed = true;
-            gt = GGUF_TYPE_INT32;
-        } else if (std::is_same<T, int64_t>::value) {
-            is_signed = true;
-            gt = GGUF_TYPE_INT64;
-        } else if (std::is_same<T, float>::value) {
-            is_signed = true;
-            gt = GGUF_TYPE_FLOAT32;
-            ot = LLAMA_KV_OVERRIDE_FLOAT;
-        } else if (std::is_same<T, double>::value) {
-            is_signed = true;
-            gt = GGUF_TYPE_FLOAT64;
-            ot = LLAMA_KV_OVERRIDE_FLOAT;
-        } else if (std::is_same<T, bool>::value) {
-            gt = GGUF_TYPE_BOOL;
-            ot = LLAMA_KV_OVERRIDE_BOOL;
-        } else if (std::is_same<T, std::string>::value) {
-            can_override = false;
-            gt = GGUF_TYPE_STRING;
-        } else {
-            throw std::runtime_error(format("request for key '%s' with unknown result type: %s", key.c_str(), typeid(T).name()));
-        }
-
-        if (can_override) {
-            auto it = kv_overrides.find(key);
-            if (it != kv_overrides.end()) {
-                struct llama_model_kv_override & po = it->second;
-                if (po.tag != ot) {
-                    // Bad type
-                    // FIXME: Error reporting
-                } else if (ot == LLAMA_KV_OVERRIDE_INT && po.int_value < 0 && !is_signed) {
-                    // Out of range
-                    // FIXME: Error reporting
-                } else {
-                    // FIXME: Possible informational output
-                    switch (po.tag) {
-                        case LLAMA_KV_OVERRIDE_INT:   gk_set_lit(po.int_value, result); break;
-                        case LLAMA_KV_OVERRIDE_FLOAT: gk_set_lit(po.float_value, result); break;
-                        case LLAMA_KV_OVERRIDE_BOOL:  gk_set_lit(po.bool_value, result); break;
-                        default: GGML_ASSERT(false && "Impossible: Unhandled override tag type");
-                    }
-                    return true;
-                }
-            }
-        }
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
 
-        const int kid = gguf_find_key(ctx_gguf, key.c_str());
-        if (kid < 0) {
-            if (required) {
-                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
-            }
-            return false;
-        }
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
 
-        const enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
-        if (ktype == GGUF_TYPE_ARRAY && ot == LLAMA_KV_OVERRIDE_INT) {
-            gk_get_arrlen<T> arrlen = {result};
-            gk_set(kid, arrlen);
-            return true;
-        }
-        if (ktype != gt) {
-            throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
-                key.c_str(), gguf_type_name(ktype), gguf_type_name(gt)));
+        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
         }
-        gk_set(kid, result);
 
-        return true;
+        return found;
     }
 
     template<typename T>
-    bool get_key(const enum llm_kv kid, T & result, const bool required = false) {
+    bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
         return get_key(llm_kv(kid), result, required);
     }
@@ -2222,12 +2285,12 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_key(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab,     true);
-    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train, true);
-    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd,      true);
-    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff,        true);
-    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head,      true);
-    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer,     true);
+    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,        hparams.n_vocab);
+    ml.get_key  (LLM_KV_CONTEXT_LENGTH,        hparams.n_ctx_train);
+    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,      hparams.n_embd);
+    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,   hparams.n_ff);
+    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT,  hparams.n_head);
+    ml.get_key  (LLM_KV_BLOCK_COUNT,           hparams.n_layer);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
@@ -2251,8 +2314,8 @@ static void llm_load_hparams(
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
-    ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false);
-    if (ropescale == 0.0f) { // try the old key name
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
         ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
@@ -2276,7 +2339,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
                     case 26: model.type = e_model::MODEL_3B; break;
@@ -2290,7 +2353,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2300,7 +2363,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2309,7 +2372,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STARCODER:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 36: model.type = e_model::MODEL_3B; break;
@@ -2320,7 +2383,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PERSIMMON:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2328,7 +2391,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_REFACT:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2336,7 +2399,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BLOOM:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
@@ -2351,9 +2414,9 @@ static void llm_load_hparams(
             {
                 hparams.f_clamp_kqv = 0.0f;
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps, true);
-                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
-                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2363,7 +2426,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STABLELM:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, true);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -2411,7 +2474,7 @@ static void llm_load_vocab(
 
     {
         std::string tokenizer_name;
-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name, true);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
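The heart of the refactor in patch 04, summarized: GKV_Base<T> is a traits table mapping each C++ result type to its expected GGUF tag plus getter function, and GKV<T>::set() consults the CLI override (if any) before falling back to the file. Assuming a loaded gguf_context named ctx_gguf, a read inside the loader reduces to something like this sketch (the key is an example):

    // GKV_Base<uint32_t> pins the tag to GGUF_TYPE_UINT32 and the getter
    // to gguf_get_val_u32, so get_kv() rejects a mistyped key up front.
    uint32_t n_layer = 0;
    const struct llama_model_kv_override * override = nullptr; // no CLI override here
    if (GGUFMeta::GKV<uint32_t>::set(ctx_gguf, "llama.block_count", n_layer, override)) {
        // n_layer holds the override value if one matched, else the on-disk value
    }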
From 184420786292ea5bb1a8b1c12c9ac70ddc2e121f Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 16 Nov 2023 19:32:18 -0700
Subject: [PATCH 05/10] Nuke obsolete GetArrayLen struct

---
 llama.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 742b8b86bd0af..626905426ee50 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1708,15 +1708,6 @@ namespace GGUFMeta {
     template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
     template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
 
-    struct GetArrayLen{int value;};
-    template<> struct GKV_Base<GetArrayLen> {
-        public:
-        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
-        static GetArrayLen getter(const gguf_context *ctx, const int k) {
-            return GetArrayLen{gguf_get_arr_n(ctx, k)};
-        }
-    };
-
     struct ArrayInfo{
         const gguf_type gt;
         const size_t length;

From dd89015c1300a8341e00b5f5cdfd10dc0861265d Mon Sep 17 00:00:00 2001
From: cebtenzzre
Date: Fri, 17 Nov 2023 07:57:11 -0500
Subject: [PATCH 06/10] simplify std::string specialization

---
 llama.cpp | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 626905426ee50..e42e5f5ef902a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1708,6 +1708,14 @@ namespace GGUFMeta {
     template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
     template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
 
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
     struct ArrayInfo{
         const gguf_type gt;
         const size_t length;
@@ -1741,9 +1749,6 @@ namespace GGUFMeta {
             return GKV::getter(ctx, k);
         }
 
-        // This can't be uncommented.
-        // template<typename OT> static bool try_override(OT & target, const struct llama_model_kv_override *override) = delete;
-
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
@@ -1786,6 +1791,14 @@ namespace GGUFMeta {
             return true;
         }
 
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            (void)target;
+            (void)override;
+            return false; // cannot override str
+        }
+
         static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
             if (try_override<T>(target, override)) {
                 return true;
@@ -1808,25 +1821,6 @@ namespace GGUFMeta {
             return GKV<TT>::set(ctx, key.c_str(), target, override);
         }
     };
-
-    template<>
-    class GKV<std::string>: public GKV_Base<const char *> {
-        using BT = const char *;
-        public:
-        static bool set(const gguf_context * ctx, const int k, std::string & target, const struct llama_model_kv_override *override = nullptr) {
-            (void)override;
-            target = std::string(GKV<BT>::get_kv(ctx, k));
-            return true;
-        }
-
-        static bool set(const gguf_context * ctx, const char * key, std::string & target, const struct llama_model_kv_override *override = nullptr) {
-            return GKV<BT>::set(ctx, key, target, override);
-        }
-
-        static bool set(const gguf_context * ctx, const std::string & key, std::string & target, const struct llama_model_kv_override *override = nullptr) {
-            return GKV<BT>::set(ctx, key, target, override);
-        }
-    };
 }
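After patch 06, strings flow through exactly the same GKV machinery as scalars: GKV_Base<std::string> supplies GGUF_TYPE_STRING plus a getter wrapping gguf_get_val_str, and the new try_override overload simply reports that strings cannot be overridden. An illustrative call (the key name is an example):

    std::string model_name;
    // Overrides are ignored for strings by construction; nullptr is typical.
    GGUFMeta::GKV<std::string>::set(ctx_gguf, "general.name", model_name, nullptr);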
From cb5bfe0c18c526db14474e4ba015b0a80647b082 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Sat, 18 Nov 2023 02:24:52 -0700
Subject: [PATCH 07/10] Various cleanups

Add informational output when overrides are applied

Warn user when an override with the wrong type is specified

---
 llama.cpp | 96 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 62 insertions(+), 34 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e42e5f5ef902a..45d23b3f19cbd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1749,46 +1749,71 @@ namespace GGUFMeta {
             return GKV::getter(ctx, k);
         }
 
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
+            if (!override) { return false; }
+            if (override->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(override->tag), override->key);
+                switch (override->tag) {
+                    case LLAMA_KV_OVERRIDE_BOOL:  {
+                        printf("%s\n", override->bool_value ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_INT:   {
+                        printf("%" PRId64 "\n", override->int_value);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_FLOAT: {
+                        printf("%.6f\n", override->float_value);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+                        throw std::runtime_error(
+                            format("Unsupported attempt to override %s type for metadata key %s\n",
+                                override_type_to_str(override->tag), override->key));
+                }
+                return true;
+            }
+            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+                __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
+            return false;
+        }
+
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
-            if (!override) {
-                return false;
-            }
-            if (override->tag != LLAMA_KV_OVERRIDE_BOOL) {
-                return false;
+            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+                target = override->bool_value;
+                return true;
             }
-            target = override->bool_value;
             return true;
         }
 
         template<typename OT>
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
-            if (!override) {
-                return false;
-            }
-            if (override->tag != LLAMA_KV_OVERRIDE_INT) {
-                return false;
-            }
-            if (override->int_value < 0 && !std::is_signed<OT>::value) {
-                return false;
+            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+                target = override->int_value;
+                return true;
             }
-            target = override->int_value;
-            return true;
+            return false;
         }
 
         template<typename OT>
         static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
         try_override(T & target, const struct llama_model_kv_override *override) {
-            if (!override) {
-                return false;
-            }
-            if (override->tag != LLAMA_KV_OVERRIDE_FLOAT) {
-                return false;
+            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+                target = override->float_value;
+                return true;
             }
-            target = override->float_value;
-            return true;
+            return false;
         }
 
         template<typename OT>
@@ -1796,7 +1821,10 @@ namespace GGUFMeta {
         try_override(T & target, const struct llama_model_kv_override *override) {
             (void)target;
             (void)override;
-            return false; // cannot override str
+            if (!override) { return false; }
+            // Currently, we should never end up here so it would be a bug if we do.
+            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
+                override ? override->key : "NULL"));
         }
 
         static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
@@ -1807,18 +1835,16 @@ namespace GGUFMeta {
             return true;
         }
 
-        template<typename TT>
-        static bool set(const gguf_context * ctx, const char * key, TT & target, const struct llama_model_kv_override *override = nullptr) {
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
             const int kid = gguf_find_key(ctx, key);
             if (kid < 0) {
                 return false;
             }
-            return GKV<TT>::set(ctx, kid, target, override);
+            return set(ctx, kid, target, override);
         }
 
-        template<typename TT>
-        static bool set(const gguf_context * ctx, const std::string & key, TT & target, const struct llama_model_kv_override *override = nullptr) {
-            return GKV<TT>::set(ctx, key.c_str(), target, override);
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, key.c_str(), target, override);
         }
     };
 }
@@ -1967,8 +1993,9 @@ struct llama_model_loader {
         }
     }
 
-    template<typename T> typename std::enable_if<std::is_integral<T>::value, bool>::type
-    get_arr_n(const std::string & key, T & result, const bool required = false) {
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, const bool required = true) {
         const int kid = gguf_find_key(ctx_gguf, key.c_str());
 
         if (kid < 0) {
@@ -1986,7 +2013,8 @@ struct llama_model_loader {
         return true;
     }
 
-    template<typename T> typename std::enable_if<std::is_integral<T>::value, bool>::type
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
     get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
         return get_arr_n(llm_kv(kid), result, required);
     }
@@ -8228,7 +8256,7 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/NULL));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
 
         size_t ctx_size;
         size_t mmapped_size;
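What the new validate_override() buys in practice, sketched with a deliberately mistyped override (hypothetical values): instead of the override being silently skipped, the user now gets a warning naming both the expected and the supplied type, and the loader falls back to the value stored in the GGUF file.

    struct llama_model_kv_override bad;
    std::strcpy(bad.key, "llama.rope.freq_base"); // read as a float by the loader
    bad.tag        = LLAMA_KV_OVERRIDE_BOOL;      // wrong tag on purpose
    bad.bool_value = true;
    // validate_override(LLAMA_KV_OVERRIDE_FLOAT, &bad) logs the mismatch,
    // returns false, and the on-disk float is used instead.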
From aa7cf3143be910d0c35d60023440c05a850e05c4 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Sat, 18 Nov 2023 03:07:03 -0700
Subject: [PATCH 08/10] Fix broken logic for parsing bool KV overrides

Fix issue where overrides didn't apply when key missing in GGUF metadata

Resolve merge changes

---
 common/common.cpp |  7 +++++--
 llama.cpp         | 32 +++++++++++++-------------------
 2 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 9d9d64a16e31a..e103fc9084e26 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -699,9 +699,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             } else if (strncmp(sep, "bool:", 5) == 0) {
                 sep += 5;
                 kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
-                if (std::strcmp(sep, "true")) {
+                if (std::strcmp(sep, "true") == 0) {
                     kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false")) {
+                } else if (std::strcmp(sep, "false") == 0) {
                     kvo.bool_value = false;
                 } else {
                     fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
@@ -888,6 +888,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
     printf("  -ld LOGDIR, --logdir LOGDIR\n");
     printf("                        path under which to save YAML logs (no logging if unset)\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
+    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("\n");
 #ifndef LOG_DISABLE_LOGS
     log_print_usage();
diff --git a/llama.cpp b/llama.cpp
index 2eb807aa677d0..6a7dd7cb82363 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -607,7 +607,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
     }
 }
 
-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
 
     switch (type) {
@@ -1895,16 +1895,13 @@ namespace GGUFMeta {
             if (try_override<T>(target, override)) {
                 return true;
             }
+            if (k < 0) { return false; }
             target = get_kv(ctx, k);
             return true;
         }
 
         static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
-            const int kid = gguf_find_key(ctx, key);
-            if (kid < 0) {
-                return false;
-            }
-            return set(ctx, kid, target, override);
+            return set(ctx, gguf_find_key(ctx, key), target, override);
         }
 
         static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
@@ -2367,6 +2364,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
+    const gguf_context * ctx = ml.ctx_gguf;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2678,19 +2676,15 @@ static void llm_load_vocab(
     }
 
     // Handle add_bos_token and add_eos_token
-    std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
-    int kid = gguf_find_key(ctx, key.c_str());
-    enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-    vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-    if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-        LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
-    }
-    key = kv(LLM_KV_TOKENIZER_ADD_EOS);
-    kid = gguf_find_key(ctx, key.c_str());
-    ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-    vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-    if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-        LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+    {
+        bool temp = true;
+
+        if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+            vocab.special_add_bos = int(temp);
+        }
+        if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+            vocab.special_add_eos = int(temp);
+        }
     }
 }
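The one-character logic error fixed at the top of patch 08 is easy to miss: strcmp() returns 0 when its arguments are equal, so the original `if (std::strcmp(sep, "true"))` took the true-branch for every input except "true". A self-contained check of the semantics (illustrative):

    #include <cassert>
    #include <cstring>

    int main(void) {
        assert(std::strcmp("true",  "true") == 0); // equal strings compare to zero
        assert(std::strcmp("false", "true") != 0); // nonzero means "different"
        // so `if (std::strcmp(sep, "true"))` fired precisely when sep was NOT "true"
        return 0;
    }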
From 14e0ba1daa9c913463efc1714a306146af7f8d6a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 5 Dec 2023 09:40:57 +0200
Subject: [PATCH 09/10] llama : rearrange model params

---
 llama.cpp | 2 +-
 llama.h   | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ae6d81b88fb73..578ee9bf40e85 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8542,10 +8542,10 @@ struct llama_model_params llama_model_default_params() {
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
-        /*.kv_overrides                =*/ nullptr,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/llama.h b/llama.h
index cc3fb53ba9aa1..8175c81399152 100644
--- a/llama.h
+++ b/llama.h
@@ -181,14 +181,17 @@ extern "C" {
 
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
+
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
 
+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
-        const struct llama_model_kv_override * kv_overrides;
     };
 
     struct llama_context_params {

From 7bbe60576a68dc8f7715b298a1c94a02f87e1811 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Tue, 5 Dec 2023 07:14:49 -0700
Subject: [PATCH 10/10] Update new GET_KEY call

Add note that metadata KV overrides aren't reflected in initial metadata
 KV info dump

---
 llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 94f7d6bb3ab4d..b77020e10d8a5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2056,6 +2056,7 @@ struct llama_model_loader {
             }
         }
 
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name         = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -2571,7 +2572,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;