@@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
25862586 case LLAMA_FTYPE_MOSTLY_Q8_0: return " Q8_0" ;
25872587
25882588 // K-quants
2589- case LLAMA_FTYPE_MOSTLY_Q2_K: return " Q2_K" ;
2589+ case LLAMA_FTYPE_MOSTLY_Q2_K: return " Q2_K - Medium" ;
2590+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return " Q2_K - Small" ;
25902591 case LLAMA_FTYPE_MOSTLY_Q3_K_S: return " Q3_K - Small" ;
25912592 case LLAMA_FTYPE_MOSTLY_Q3_K_M: return " Q3_K - Medium" ;
25922593 case LLAMA_FTYPE_MOSTLY_Q3_K_L: return " Q3_K - Large" ;
@@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
89558956 // TODO: explore better strategies
89568957 new_type = GGML_TYPE_Q8_0;
89578958 }
8958- } else if (name.find (" ffn_down.weight " ) != std::string::npos) {
8959+ } else if (name.find (" ffn_down" ) != std::string::npos) {
89598960 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8961+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
8962+ if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2 /8 ) new_type = GGML_TYPE_Q4_K;
8963+ }
89608964 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8961- new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
8965+ new_type = qs.i_feed_forward_w2 < qs. n_feed_forward_w2 / 16 ? GGML_TYPE_Q5_K
89628966 : arch != LLM_ARCH_FALCON || use_more_bits (qs.i_feed_forward_w2 , qs.n_feed_forward_w2 ) ? GGML_TYPE_Q4_K
89638967 : GGML_TYPE_Q3_K;
89648968 }
@@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
89678971 }
89688972 else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
89698973 if (arch == LLM_ARCH_FALCON) {
8970- new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
8974+ new_type = qs.i_feed_forward_w2 < qs. n_feed_forward_w2 / 16 ? GGML_TYPE_Q6_K :
89718975 use_more_bits (qs.i_feed_forward_w2 , qs.n_feed_forward_w2 ) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
89728976 } else {
89738977 if (use_more_bits (qs.i_feed_forward_w2 , qs.n_feed_forward_w2 )) new_type = GGML_TYPE_Q6_K;
89748978 }
89758979 }
89768980 else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits (qs.i_feed_forward_w2 , qs.n_feed_forward_w2 )) new_type = GGML_TYPE_Q6_K;
8977- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4 ) {
8981+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs. n_feed_forward_w2 / 8 ) {
89788982 new_type = GGML_TYPE_Q5_K;
89798983 }
89808984 ++qs.i_feed_forward_w2 ;
@@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
89928996 else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
89938997 else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
89948998 }
8995- else if (name.find (" ffn_gate.weight" ) != std::string::npos || name.find (" ffn_up.weight" ) != std::string::npos) {
8996- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8997- }
8999+ // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
9000+ // else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
9001+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9002+ // }
89989003 // This can be used to reduce the size of the Q5_K_S model.
89999004 // The associated PPL increase is fully in line with the size reduction
90009005 // else {
@@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
90439048
90449049 // K-quants
90459050 case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break ;
9051+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break ;
90469052 case LLAMA_FTYPE_MOSTLY_Q3_K_S:
90479053 case LLAMA_FTYPE_MOSTLY_Q3_K_M:
90489054 case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break ;
@@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
91019107 if (name.find (" attn_v.weight" ) != std::string::npos || name.find (" attn_qkv.weight" ) != std::string::npos) {
91029108 ++qs.n_attention_wv ;
91039109 }
9104- else if (name.find (" ffn_down.weight " ) != std::string::npos) {
9110+ else if (name.find (" ffn_down" ) != std::string::npos) {
91059111 ++qs.n_feed_forward_w2 ;
91069112 }
91079113 }
0 commit comments