Quantize: specify each major tensor quant in CLI for common LLMs #8917

Draft · wants to merge 10 commits into base: master
examples/quantize/quantize.cpp — 73 changes: 69 additions & 4 deletions
@@ -54,6 +54,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
{ "CQS", LLAMA_FTYPE_CQS, "Custom Quantization Scheme", },
// Note: Ensure COPY comes after F32 to keep ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
@@ -107,19 +108,35 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n");
printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS' (default is Q2_K):\n");
printf(" --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n");
printf(" --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n");
printf(" --attn-v-type ggml_type: use this ggml_type for the attn_v.weight tensor.\n");
printf(" --attn-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor.\n");
printf(" --attn-output-type ggml_type: use this ggml_type for the attn_output.weight tensor.\n");
printf(" --ffn-gate-type ggml_type: use this ggml_type for the ffn_gate tensor.\n");
printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
printf("Note: The token embeddings tensor is loaded in system RAM, even in the case of full GPU/VRAM offload.\n");
printf("Note: The recommended type for the output tensor is q6_K when the ffn types are above iq3_xxs and below q8_0.\n\n");
printf("Note for the Custom Quant Scheme FTYPE:\n");
printf(" Write the specific tensor legacy quants as qN_N, the K-Quants as qN_K, the IQ-Quants as iqN_xx.\n");
printf(" Usually, attn-q-type can be one type below the chosen ffn type, and attn-v-type should be one type above.\n");
printf(" attn-qkv-type replaces the types attn-q, attn-k and attn-v on some models.\n");
//TODO: eventually harmonize the all-caps naming of the FTYPEs with the lowercase naming of the GGML_TYPEs.
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {
@@ -279,6 +296,54 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--attn-q-type") == 0) {
if (arg_idx < argc-1) {
params.attn_q_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--attn-k-type") == 0) {
if (arg_idx < argc-1) {
params.attn_k_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--attn-v-type") == 0) {
if (arg_idx < argc-1) {
params.attn_v_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--attn-qkv-type") == 0) {
if (arg_idx < argc-1) {
params.attn_qkv_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--attn-output-type") == 0) {
if (arg_idx < argc-1) {
params.attn_output_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--ffn-gate-type") == 0) {
if (arg_idx < argc-1) {
params.ffn_gate_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--ffn-down-type") == 0) {
if (arg_idx < argc-1) {
params.ffn_down_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--ffn-up-type") == 0) {
if (arg_idx < argc-1) {
params.ffn_up_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
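For illustration only (not part of the patch): assuming the tool is built as llama-quantize, and using placeholder file names and illustrative type choices that follow the notes above (ffn tensors at q4_K, attn-v one step above, attn-q one step below, output at q6_K), a CQS run could look like this:

  ./llama-quantize --imatrix imatrix.dat \
      --attn-q-type q3_K --attn-k-type q4_K --attn-v-type q5_K --attn-output-type q4_K \
      --ffn-gate-type q4_K --ffn-down-type q4_K --ffn-up-type q4_K \
      --output-tensor-type q6_K --token-embedding-type q4_K \
      model-f32.gguf model-cqs.gguf CQS 8

Tensor classes left unspecified fall back to the CQS default of Q2_K (or to the existing per-tensor heuristics), so in practice it is worth setting every major tensor type explicitly.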
include/llama.h — 9 changes: 9 additions & 0 deletions
@@ -175,6 +175,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_CQS = 99, // except 1d tensors

LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@@ -360,6 +361,14 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
enum ggml_type attn_q_type; // attention query tensor type
enum ggml_type attn_k_type; // attention key tensor type
enum ggml_type attn_v_type; // attention value tensor type
enum ggml_type attn_qkv_type; // attention query-key-value tensor type
enum ggml_type attn_output_type; // attention output tensor type
enum ggml_type ffn_gate_type; // feedforward network gate type
enum ggml_type ffn_down_type; // feedforward network down type
enum ggml_type ffn_up_type; // feedforward network up type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
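To show how these new fields are meant to be consumed through the C API, here is a minimal, hypothetical sketch (assuming this patch is applied; the file names and type choices are placeholders, and error handling is omitted):

#include "llama.h"

int main(void) {
    llama_backend_init();

    // Start from the defaults: every per-tensor type is GGML_TYPE_COUNT ("not set"),
    // so the usual quantization heuristics stay in charge unless overridden.
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype              = LLAMA_FTYPE_CQS;  // enable the custom quantization scheme
    params.ffn_gate_type      = GGML_TYPE_Q4_K;
    params.ffn_down_type      = GGML_TYPE_Q4_K;
    params.ffn_up_type        = GGML_TYPE_Q4_K;
    params.attn_q_type        = GGML_TYPE_Q3_K;   // one step below the ffn type
    params.attn_v_type        = GGML_TYPE_Q5_K;   // one step above the ffn type
    params.output_tensor_type = GGML_TYPE_Q6_K;

    // Returns 0 on success.
    const uint32_t rc = llama_model_quantize("model-f32.gguf", "model-cqs.gguf", &params);

    llama_backend_free();
    return (int) rc;
}

The CLI flags added in examples/quantize/quantize.cpp simply populate these same fields before calling llama_model_quantize.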
src/llama.cpp — 76 changes: 68 additions & 8 deletions
@@ -5317,6 +5317,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
case LLAMA_FTYPE_CQS: return "Custom Quantization Scheme";

default: return "unknown, may not work";
}
@@ -18028,7 +18029,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
}
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_v_type < GGML_TYPE_COUNT) {
new_type = qs.params->attn_v_type;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
@@ -18066,7 +18070,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
++qs.i_attention_wv;
} else if (name.find("attn_k.weight") != std::string::npos) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_k_type < GGML_TYPE_COUNT) {
new_type = qs.params->attn_k_type;
}
else if (qs.model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
@@ -18078,7 +18085,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
new_type = GGML_TYPE_IQ2_S;
}
} else if (name.find("attn_q.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_q_type < GGML_TYPE_COUNT) {
new_type = qs.params->attn_q_type;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
new_type = GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -18087,7 +18097,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
} else if (name.find("ffn_down") != std::string::npos) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_down_type < GGML_TYPE_COUNT) {
new_type = qs.params->ffn_down_type;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
@@ -18130,7 +18143,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_output_type < GGML_TYPE_COUNT) {
new_type = qs.params->attn_output_type;
}
else if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
@@ -18150,7 +18166,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
}
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->attn_qkv_type < GGML_TYPE_COUNT) {
new_type = qs.params->attn_qkv_type;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
@@ -18159,15 +18178,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (name.find("ffn_gate") != std::string::npos) {
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_gate_type < GGML_TYPE_COUNT) {
new_type = qs.params->ffn_gate_type;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
new_type = GGML_TYPE_IQ3_XXS;
}
++qs.i_ffn_gate;
}
else if (name.find("ffn_up") != std::string::npos) {
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
if (ftype == LLAMA_FTYPE_CQS && qs.params->ffn_up_type < GGML_TYPE_COUNT) {
new_type = qs.params->ffn_up_type;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
new_type = GGML_TYPE_IQ3_XXS;
}
++qs.i_ffn_up;
@@ -18325,6 +18350,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

// Custom Quantization Scheme
case LLAMA_FTYPE_CQS: default_type = GGML_TYPE_Q2_K; break;

default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}

@@ -18583,6 +18611,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
new_type = params->output_tensor_type;
}
if (params->attn_q_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_q.weight") == 0) {
new_type = params->attn_q_type;
}
if (params->attn_k_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_k.weight") == 0) {
new_type = params->attn_k_type;
}
if (params->attn_v_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_v.weight") == 0) {
new_type = params->attn_v_type;
}
if (params->attn_qkv_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_qkv.weight") == 0) {
new_type = params->attn_qkv_type;
}
if (params->attn_output_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_output.weight") == 0) {
new_type = params->attn_output_type;
}
if (params->ffn_gate_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_gate") == 0) {
new_type = params->ffn_gate_type;
}
if (params->ffn_down_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_down") == 0) {
new_type = params->ffn_down_type;
}
if (params->ffn_up_type < GGML_TYPE_COUNT && strcmp(tensor->name, "ffn_up") == 0) {
new_type = params->ffn_up_type;
}

// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
@@ -18993,6 +19045,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
/*.attn_q_type =*/ GGML_TYPE_COUNT,
/*.attn_k_type =*/ GGML_TYPE_COUNT,
/*.attn_v_type =*/ GGML_TYPE_COUNT,
/*.attn_qkv_type =*/ GGML_TYPE_COUNT,
/*.attn_output_type =*/ GGML_TYPE_COUNT,
/*.ffn_gate_type =*/ GGML_TYPE_COUNT,
/*.ffn_down_type =*/ GGML_TYPE_COUNT,
/*.ffn_up_type =*/ GGML_TYPE_COUNT,
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,