Skip to content

Commit

Permalink
fix(granitemoe convert): Split the double-sized input layer into gate…
Browse files Browse the repository at this point in the history
… and up

After a lot of staring and squinting, it's clear that the standard mixtral
expert implementation is equivalent to the vectorized parallel experts in
granite. The difference is that in granite, the w1 and w3 are concatenated
into a single tensor "input_linear." Rather than reimplementing all of the
math on the llama.cpp side, the much simpler route is to just split this
tensor during conversion and follow the standard mixtral route.

Branch: GraniteMoE

Co-Authored-By: [email protected]

Signed-off-by: Gabe Goodhart <[email protected]>
  • Loading branch information
gabe-l-hart committed Sep 17, 2024
1 parent 1788212 commit 54ce8cd
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 9 deletions.
18 changes: 18 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4119,8 +4119,26 @@ class GraniteMoeModel(GraniteModel):
"""Conversion for IBM's GraniteMoeForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE_MOE

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
"""In modeling_granitemoe, the JetMoe implementation of parallel experts
is used. This essentially merges w1 and w3 into a single tensor with 2x
the hidden size that is then split during forward. To keep compativility
with existing mixtral support, we pull them apart here.
"""

if name.endswith("block_sparse_moe.input_linear.weight"):
gate, up = data_torch.chunk(2, dim=-2)
return [
(self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.gate.weight"), gate),
(self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.up.weight"), up),
]

return super().modify_tensors(data_torch, name, bid)


###### CONVERSION LOGIC ######


# tree of lazy tensors
class LazyTorchTensor(gguf.LazyBase):
_tensor_type = torch.Tensor
Expand Down
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1254,6 +1254,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
Expand Down
19 changes: 10 additions & 9 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,11 @@ class TensorNameMap:
),

MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.input_linear", # granitemoe
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.input_linear.up", # granitemoe
),

MODEL_TENSOR.FFN_UP_SHEXP: (
Expand Down Expand Up @@ -326,10 +326,11 @@ class TensorNameMap:
),

MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.input_linear.gate", # granitemoe
),

MODEL_TENSOR.FFN_GATE_SHEXP: (
Expand Down

0 comments on commit 54ce8cd

Please sign in to comment.