From 4f07702821e8dee982cce02e811df5a7dc3a6c21 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Wed, 11 Sep 2024 10:03:43 -0600
Subject: [PATCH] fix(granitemoe convert): Split the double-sized input layer
 into gate and up

After a lot of staring and squinting, it's clear that the standard
mixtral expert implementation is equivalent to the vectorized parallel
experts in granite. The difference is that in granite, the w1 and w3 are
concatenated into a single tensor "input_linear." Rather than
reimplementing all of the math on the llama.cpp side, the much simpler
route is to just split this tensor during conversion and follow the
standard mixtral route.
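
The equivalence can be checked directly. The snippet below is
illustrative only (made-up sizes, not part of the change) and assumes
the ordering used here: first half of input_linear = gate (w1), second
half = up (w3).

    # Splitting the merged per-expert input_linear weight along dim=-2 before
    # the matmul gives the same gate/up activations as projecting with the
    # merged weight and chunking the result (the JetMoe-style forward).
    import torch

    n_experts, n_tok, hidden, ffn_dim = 4, 3, 8, 16               # made-up sizes
    x = torch.randn(n_experts, n_tok, hidden)                     # per-expert inputs
    input_linear = torch.randn(n_experts, 2 * ffn_dim, hidden)    # merged w1|w3

    # GraniteMoE/JetMoe style: project with the merged weight, then split.
    h_gate, h_up = (x @ input_linear.transpose(-1, -2)).chunk(2, dim=-1)

    # Mixtral style (what the conversion produces): split the weight first.
    gate, up = input_linear.chunk(2, dim=-2)
    assert torch.allclose(x @ gate.transpose(-1, -2), h_gate)
    assert torch.allclose(x @ up.transpose(-1, -2), h_up)

The two orderings are interchangeable because each output column of the
projection depends only on the corresponding row of the weight, so
splitting the rows before the matmul and splitting the activations after
it give the same result.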
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.input_linear.gate", # granitemoe ), MODEL_TENSOR.FFN_GATE_SHEXP: (