From cd416660c5f75c8d1aa7f5d63f2d4bc327307a45 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 10 Sep 2024 16:35:14 -0600 Subject: [PATCH] feat(granitemoe): Implement granitemoe GraniteMoE follows the mixtral architecture (once the input_linear layers are split into gate_exps/up_exps). The main delta is the addition of the same four multipliers used in Granite. Branch: GraniteMoE Signed-off-by: Gabe Goodhart --- src/llama.cpp | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9ad4e261e14d01..bae09fe7c52518 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -215,6 +215,7 @@ enum llm_arch { LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, LLM_ARCH_GRANITE, + LLM_ARCH_GRANITE_MOE, LLM_ARCH_UNKNOWN, }; @@ -266,6 +267,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_GRANITE, "granite" }, + { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1478,6 +1480,23 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GRANITE_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2396,7 +2415,7 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; - // For Granite architecture + // For Granite architectures float f_residual_multiplier = 0.0f; float f_embedding_multiplier = 0.0f; float f_attention_multiplier = 0.0f; @@ -5439,6 +5458,7 @@ static void llm_load_hparams( switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5462,8 +5482,8 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } - // Extra multipliers for Granite architecture - if (model.arch == LLM_ARCH_GRANITE) { + // Extra multipliers for Granite architectures + if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier); ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier); @@ -6758,7 +6778,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } - if (model.arch == LLM_ARCH_GRANITE) { + if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier); LLAMA_LOG_INFO("%s: f_residual_multiplier = %f\n", __func__, hparams.f_residual_multiplier); LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier); @@ -6932,6 +6952,7 @@ static bool llm_load_tensors( case LLM_ARCH_REFACT: case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -15859,6 +15880,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: { result = llm.build_llama(); } break; @@ -19156,6 +19178,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2