From 8a4ca2313ca29b7f17f985bcfea98ff1dafbedaa Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 10 Sep 2024 14:45:51 -0600 Subject: [PATCH 01/10] feat(gguf-py): Add granitemoe architecture This includes the addition of new tensor names for the new moe layers. These may not be correct at this point due to the need for the hack in gguf_writer.py to double-check the length of the shape for these layers. Branch: GraniteMoE Signed-off-by: Gabe Goodhart --- gguf-py/gguf/constants.py | 15 +++++++++++++++ gguf-py/gguf/tensor_mapping.py | 27 +++++++++++++++------------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b36a60d497abd..95a07f0a36995 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -235,6 +235,7 @@ class MODEL_ARCH(IntEnum): NEMOTRON = auto() EXAONE = auto() GRANITE = auto() + GRANITE_MOE = auto() class MODEL_TENSOR(IntEnum): @@ -392,6 +393,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1242,6 +1244,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GRANITE_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 2ebfa2b43c471..db2d03e4d33e5 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -292,10 +292,11 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.input_linear", # granitemoe ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -324,10 +325,11 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe ), MODEL_TENSOR.FFN_GATE_SHEXP: ( @@ -364,10 +366,11 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx - "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) + "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) + 
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( From e0b72290d086c16a728163fcc668fc0434a70be7 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 10 Sep 2024 14:48:30 -0600 Subject: [PATCH 02/10] feat(convert_hf_to_gguf): Add GraniteMoeModel GraniteMoe has the same configuration deltas as Granite Branch: GraniteMoE Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 13 +++++++++++-- gguf-py/gguf/constants.py | 2 +- gguf-py/gguf/tensor_mapping.py | 12 ++++++------ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ff4c9226faedb..8a32bf510849e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4102,14 +4102,23 @@ def set_gguf_parameters(self): # consistency if attention_scale := self.hparams.get("attention_multiplier"): self.gguf_writer.add_attention_scale(attention_scale) + logger.info("gguf: (granite) attention_scale = %s", attention_scale) if embedding_scale := self.hparams.get("embedding_multiplier"): self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) if residual_scale := self.hparams.get("residual_multiplier"): self.gguf_writer.add_residual_scale(residual_scale) - if logits_scaling := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scaling) + logger.info("gguf: (granite) residual_scale = %s", residual_scale) + if logits_scale := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scale) + logger.info("gguf: (granite) logits_scale = %s", logits_scale) +@Model.register("GraniteMoeForCausalLM") +class GraniteMoeModel(GraniteModel): + """Conversion for IBM's GraniteMoeForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE_MOE + ###### CONVERSION LOGIC ###### # tree of lazy tensors diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 95a07f0a36995..4e109122080d8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1253,7 +1253,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index db2d03e4d33e5..fc5fb30109b4c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -251,11 +251,12 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "model.layers.{bid}.mlp.gate", # qwen2moe olmoe - "transformer.decoder_layer.{bid}.router", # Grok - "transformer.blocks.{bid}.ffn.router.layer", # dbrx + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer", # dbrx + "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -329,7 +330,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx 
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe ), MODEL_TENSOR.FFN_GATE_SHEXP: ( From 014e59d31d9813606dc346cbcd2c12460fe2f981 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 11 Sep 2024 10:03:43 -0600 Subject: [PATCH 03/10] fix(granitemoe convert): Split the double-sized input layer into gate and up After a lot of staring and squinting, it's clear that the standard mixtral expert implementation is equivalent to the vectorized parallel experts in granite. The difference is that in granite, the w1 and w3 are concatenated into a single tensor "input_linear." Rather than reimplementing all of the math on the llama.cpp side, the much simpler route is to just split this tensor during conversion and follow the standard mixtral route. Branch: GraniteMoE Co-Authored-By: alex.brooks@ibm.com Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 18 ++++++++++++++++++ gguf-py/gguf/constants.py | 1 + gguf-py/gguf/tensor_mapping.py | 19 ++++++++++--------- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8a32bf510849e..ea9daca826366 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4119,8 +4119,26 @@ class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE_MOE + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + """In modeling_granitemoe, the JetMoe implementation of parallel experts + is used. This essentially merges w1 and w3 into a single tensor with 2x + the hidden size that is then split during forward. To keep compativility + with existing mixtral support, we pull them apart here. 
+ """ + + if name.endswith("block_sparse_moe.input_linear.weight"): + gate, up = data_torch.chunk(2, dim=-2) + return [ + (self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.gate.weight"), gate), + (self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.up.weight"), up), + ] + + return super().modify_tensors(data_torch, name, bid) + + ###### CONVERSION LOGIC ###### + # tree of lazy tensors class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4e109122080d8..fed7418bee37c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1254,6 +1254,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index fc5fb30109b4c..901c03c7ce11c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -293,11 +293,11 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.input_linear", # granitemoe + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.input_linear.up", # granitemoe ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -326,10 +326,11 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.input_linear.gate", # granitemoe ), MODEL_TENSOR.FFN_GATE_SHEXP: ( From eca37cd4f2416735e31b8518949a97e64737d3e6 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 10 Sep 2024 16:35:14 -0600 Subject: [PATCH 04/10] feat(granitemoe): Implement granitemoe GraniteMoE follows the mixtral architecture (once the input_linear layers are split into gate_exps/up_exps). The main delta is the addition of the same four multipliers used in Granite. 
Branch: GraniteMoE Signed-off-by: Gabe Goodhart --- src/llama.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a718de054f934..65aa6cbebf819 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -215,6 +215,7 @@ enum llm_arch { LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, LLM_ARCH_GRANITE, + LLM_ARCH_GRANITE_MOE, LLM_ARCH_UNKNOWN, }; @@ -266,6 +267,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_GRANITE, "granite" }, + { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1478,6 +1480,23 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GRANITE_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2396,7 +2415,7 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; - // Additional scale factors (Granite) + // Additional scale factors (Granite/Granite MoE) float f_residual_scale = 0.0f; float f_embedding_scale = 0.0f; float f_attention_scale = 0.0f; @@ -6048,6 +6067,7 @@ static void llm_load_hparams( } } break; case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -6056,6 +6076,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_3B; break; case 40: model.type = e_model::MODEL_3B; break; // Add additional layer/vocab/etc checks here for other model sizes default: model.type = e_model::MODEL_UNKNOWN; @@ -6810,7 +6831,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } - if (model.arch == LLM_ARCH_GRANITE) { + if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); @@ -6984,6 +7005,7 @@ static bool llm_load_tensors( case LLM_ARCH_REFACT: case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -15930,6 +15952,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: { result = llm.build_llama(); } break; @@ -19231,6 +19254,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 
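Note on the four multipliers reused from Granite: in the conversion code above they come from the HF config keys attention_multiplier, embedding_multiplier, residual_multiplier and logits_scaling, and are stored as f_attention_scale, f_embedding_scale, f_residual_scale and f_logit_scale in the hparams. The sketch below is only an orientation for where each scale enters a decoder forward pass; the block callables and numeric values are placeholders, not the llama.cpp graph code.

import torch

def granite_scaled_forward(tok_embd, attn_block, moe_block, lm_head,
                           embedding_multiplier, residual_multiplier,
                           attention_multiplier, logits_scaling):
    # embedding_multiplier scales the token embeddings once, at the model input
    h = tok_embd * embedding_multiplier
    # attention_multiplier is used as the KQ (softmax) scale in place of 1/sqrt(head_dim);
    # residual_multiplier scales each sub-block's output before the residual add
    h = h + residual_multiplier * attn_block(h, scale=attention_multiplier)
    h = h + residual_multiplier * moe_block(h)
    # logits_scaling divides the final logits
    return lm_head(h) / logits_scaling

# placeholder blocks and values, just to make the sketch runnable
x = torch.randn(1, 4, 8)
logits = granite_scaled_forward(
    x,
    attn_block=lambda h, scale: h * scale,  # stand-in for self-attention
    moe_block=lambda h: h,                  # stand-in for the sparse MoE FFN
    lm_head=lambda h: h,
    embedding_multiplier=12.0, residual_multiplier=0.22,
    attention_multiplier=0.015625, logits_scaling=6.0)
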
From 71bc4c1f934aa1cc83c33b4b0dcf0fda6fbc1319 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 23 Sep 2024 09:32:23 -0600 Subject: [PATCH 05/10] Typo fix in docstring Co-Authored-By: ggerganov@gmail.com Co-authored-by: Georgi Gerganov Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ea9daca826366..d62ecb9979080 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4122,7 +4122,7 @@ class GraniteMoeModel(GraniteModel): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: """In modeling_granitemoe, the JetMoe implementation of parallel experts is used. This essentially merges w1 and w3 into a single tensor with 2x - the hidden size that is then split during forward. To keep compativility + the hidden size that is then split during forward. To keep compatibility with existing mixtral support, we pull them apart here. """ From 5eb28c4710b6737063fbc2ec0b9b3cdc1a82a0eb Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 23 Sep 2024 11:03:18 -0600 Subject: [PATCH 06/10] fix(conversion): Simplify tensor name mapping in conversion Branch: GraniteMoE Co-Authored-By: git@compilade.net Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d62ecb9979080..1cad8ef5475cc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4129,8 +4129,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("block_sparse_moe.input_linear.weight"): gate, up = data_torch.chunk(2, dim=-2) return [ - (self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.gate.weight"), gate), - (self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.up.weight"), up), + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), ] return super().modify_tensors(data_torch, name, bid) From f2360996ca4d86a962839141a1adbf75c8dee076 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 23 Sep 2024 12:54:50 -0600 Subject: [PATCH 07/10] fix(convert): Remove unused tensor name mappings Branch: GraniteMoE Co-Authored-By: git@compilade.net Signed-off-by: Gabe Goodhart --- gguf-py/gguf/tensor_mapping.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 901c03c7ce11c..4e850726e9ba4 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -293,11 +293,10 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.input_linear.up", # granitemoe + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -326,11 +325,10 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", # mixtral 
(merged) - "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) - "model.layers.{bid}.block_sparse_moe.input_linear.gate", # granitemoe + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( From 317b15bb6011c8210ebf425f2b0e5bd29d943c96 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 23 Sep 2024 13:56:39 -0600 Subject: [PATCH 08/10] fix(convert): Sanity check on merged FFN tensor sizes Branch: GraniteMoE Co-Authored-By: git@compilade.net Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1cad8ef5475cc..7be609054d6b8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4127,7 +4127,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter """ if name.endswith("block_sparse_moe.input_linear.weight"): - gate, up = data_torch.chunk(2, dim=-2) + ffn_dim = self.hparams["intermediate_size"] + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" + gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] return [ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), From 1c8b3e4c4464baab8507a7279bc1367985b3488f Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 23 Sep 2024 13:59:54 -0600 Subject: [PATCH 09/10] fix: Allow "output" layer in granite moe architecture (convert and cpp) Branch: GraniteMoE Co-Authored-By: git@compilade.net Signed-off-by: Gabe Goodhart --- gguf-py/gguf/constants.py | 1 + src/llama.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index fed7418bee37c..bd83be1441d79 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1247,6 +1247,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE_MOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/src/llama.cpp b/src/llama.cpp index 65aa6cbebf819..f268dd91e30e4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1485,6 +1485,7 @@ static const std::map> LLM_TENSOR_NA { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, From a843f1fac39acdd8575a698b0e5ec6a15694ae1b Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 24 Sep 2024 10:29:28 -0600 Subject: [PATCH 10/10] fix(granite): Add missing 'output' tensor for Granite This is a fix for the previous `granite` architecture PR. 
Recent snapshots have included this (`lm_head.weights`) as part of the architecture Branch: GraniteMoE Signed-off-by: Gabe Goodhart --- gguf-py/gguf/constants.py | 1 + src/llama.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bd83be1441d79..560eee916f27e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1234,6 +1234,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/src/llama.cpp b/src/llama.cpp index f268dd91e30e4..0accb1492efaa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1469,6 +1469,7 @@ static const std::map> LLM_TENSOR_NA { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
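
With the series applied end to end, GraniteMoeModel.modify_tensors splits the merged block_sparse_moe.input_linear expert tensor into the mixtral-style gate/up pair (gate rows first, up rows second along the second-to-last dimension), guarded by the intermediate_size sanity check from patch 08. Below is a standalone sketch of that split, runnable outside the converter; the expert count and dimensions are made-up example values.

import torch

def split_input_linear(data_torch: torch.Tensor, intermediate_size: int):
    # merged layout: [n_expert, 2 * ffn_dim, n_embd]; w1 (gate) rows first, w3 (up) rows second
    assert data_torch.shape[-2] == 2 * intermediate_size, \
        "Merged FFN tensor size must be 2 * intermediate_size"
    gate = data_torch[..., :intermediate_size, :]   # -> ffn_gate_exps
    up = data_torch[..., intermediate_size:, :]     # -> ffn_up_exps
    return gate, up

# dummy example: 8 experts, ffn_dim = 512, n_embd = 1024
merged = torch.randn(8, 2 * 512, 1024)
gate, up = split_input_linear(merged, intermediate_size=512)
assert gate.shape == up.shape == (8, 512, 1024)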