From 91db53b64555753f6445eef675cfdbf93370322b Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 11 Aug 2024 16:41:23 +0200
Subject: [PATCH] IQ1_XL and some corrections, notably on attn_q and
 parentheses

---
 examples/quantize/quantize.cpp |   7 ++-
 include/llama.h                |   1 +
 src/llama.cpp                  | 101 +++++++++++++++++++--------------
 3 files changed, 64 insertions(+), 45 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index db04bf64edaf5..9e2e400716196 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -28,13 +28,14 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ1_XS",  LLAMA_FTYPE_MOSTLY_IQ1_XS,  " 1.6-1.7 bpw quantization mix", },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization", },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization", },
+    { "IQ1_XL",  LLAMA_FTYPE_MOSTLY_IQ1_XL,  " 1.90 bpw quantization", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
-    { "Q2_K_L",  LLAMA_FTYPE_MOSTLY_Q2_K_L,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "Q2_K_L",  LLAMA_FTYPE_MOSTLY_Q2_K_L,  " 3.20G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
     { "IQ3_S",   LLAMA_FTYPE_MOSTLY_IQ3_S,   " 3.44 bpw quantization", },
-    { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.66 bpw quantization mix", },
-    { "IQ3_XL",  LLAMA_FTYPE_MOSTLY_IQ3_XL,  " 3.85 bpw quantization mix", },
+    { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.70 bpw quantization mix", },
+    { "IQ3_XL",  LLAMA_FTYPE_MOSTLY_IQ3_XL,  " 3.95 bpw quantization mix", },
     { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
     { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization", },
     { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 3.41G, +1.6321 ppl @ Llama-3-8B", },
diff --git a/include/llama.h b/include/llama.h
index c9d7cb8a1d32a..4f5f2fc197b7b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -170,6 +170,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_XL  = 37, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_L  = 38, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_XS  = 39, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_XL  = 40, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index 7bfde358cbb06..e0f251a0f0616 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4483,11 +4483,12 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ1_XS:   return "IQ1_S mix - 1.6-1.7 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S:    return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_M:    return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_XL:   return "IQ1_XL - 1.90 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:   return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.96 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.95 bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -15475,7 +15476,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -15491,7 +15493,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
         } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
                 new_type = GGML_TYPE_IQ2_S;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_Q2_K;
@@ -15508,7 +15511,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K;
+                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+                new_type = GGML_TYPE_Q6_K;
+            }
             else new_type = GGML_TYPE_Q8_0;
         }
         else if (qs.model.hparams.n_gqa() >= 7) {
@@ -15519,10 +15524,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
                 new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
@@ -15534,7 +15539,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K;
@@ -15559,21 +15564,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // TODO: explore better strategies
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K;
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+                new_type = GGML_TYPE_Q6_K;
+            }
             else new_type = GGML_TYPE_Q8_0;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_IQ1_M;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_IQ2_XXS;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_IQ2_S;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) &&
@@ -15603,24 +15611,28 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K;
         }
     }
     else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
             if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS;
         }
@@ -15636,7 +15648,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                  (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
@@ -15674,11 +15686,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
-                    new_type = GGML_TYPE_IQ2_XXS;
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
             }
@@ -15701,7 +15713,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_IQ2_XS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = GGML_TYPE_IQ2_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S;
@@ -15715,29 +15729,31 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         ++qs.i_ffn_up;
     }
@@ -15881,6 +15897,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_XS: default_type = GGML_TYPE_IQ1_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S:  default_type = GGML_TYPE_IQ1_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_M:  default_type = GGML_TYPE_IQ1_M; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_XL: default_type = GGML_TYPE_IQ1_M; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
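
For reference, a minimal usage sketch (not part of the patch, and assuming it has been applied): quantizing a GGUF model to the new IQ1_XL mix through llama.cpp's public C API. The file names are placeholders.

    // quantize_iq1_xl.cpp -- minimal sketch, not part of the patch.
    #include "llama.h"

    int main() {
        // Start from the library defaults, then request the new ftype.
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_IQ1_XL; // ftype 40, added by this patch

        // Returns 0 on success, non-zero on failure.
        return llama_model_quantize("model-F16.gguf", "model-IQ1_XL.gguf", &params);
    }

The quantize example touched above exposes the same mix on the command line, e.g. "llama-quantize model-F16.gguf model-IQ1_XL.gguf IQ1_XL" (the binary name may differ depending on the build).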