Initial OpenELM support (270M only so far)
icecream95 committed May 18, 2024
1 parent c1b295e commit 2d50f81
Showing 4 changed files with 322 additions and 3 deletions.
44 changes: 44 additions & 0 deletions convert-hf-to-gguf.py
@@ -2395,6 +2395,50 @@ def set_vocab(self, *args, **kwargs):
self.gguf_writer.add_add_eos_token(True)


@Model.register("OpenELMForCausalLM")
class OpenELMModel(Model):
model_arch = gguf.MODEL_ARCH.OPENELM

# Copied from LlamaModel
def set_vocab(self):
try:
            self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_llama_hf()

def set_gguf_parameters(self):
# TODO: Look closer at these

self.gguf_writer.add_name("OpenELM")
self.block_count = self.find_hparam(["num_transformer_layers"])
self.gguf_writer.add_layer_norm_eps(1e-5)
# https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
self.gguf_writer.add_layer_norm_rms_eps(1e-6)
n_embd = self.find_hparam(["model_dim"])
self.gguf_writer.add_embedding_length(n_embd)
head_dim = self.find_hparam(["head_dim"])
n_head = n_embd // head_dim
rot_pct = 1.0
self.gguf_writer.add_context_length(self.find_hparam(["max_context_length"]))
self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_head_count_kv(n_head * 10)
        self.gguf_writer.add_head_count(n_head * 10)
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_feed_forward_length(0) # dynamically calculated

def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
# TODO: Read configuration!
if "n_layers" in keys:
return 16 # num_transformer_layers
if "hidden_size" in keys:
return 1280 # model_dim
if "num_attention_heads" in keys:
return 64 # head_dim

return super().find_hparam(keys, optional)


###### CONVERSION LOGIC ######


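For orientation, here is a minimal sketch of the parameter arithmetic that the new set_gguf_parameters performs, plugging in the hard-coded OpenELM-270M values from the find_hparam override above (model_dim 1280, head_dim 64, 16 transformer layers). This is illustration only, not part of the commit; the * 10 head-count factor is reproduced exactly as it appears in the diff.

# Sketch: derived values written to the GGUF header for OpenELM-270M,
# mirroring OpenELMModel.set_gguf_parameters above.
n_embd = 1280       # model_dim
head_dim = 64       # head_dim
block_count = 16    # num_transformer_layers
rot_pct = 1.0

n_head = n_embd // head_dim                  # 20
rope_dim = int(rot_pct * n_embd) // n_head   # 64 (equals head_dim here)

print("embedding_length    ", n_embd)
print("block_count         ", block_count)
print("head_count          ", n_head * 10)   # 200, as written in the diff
print("head_count_kv       ", n_head * 10)   # 200
print("rope_dimension_count", rope_dim)
print("feed_forward_length ", 0)             # placeholder; calculated dynamically
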
14 changes: 14 additions & 0 deletions gguf-py/gguf/constants.py
@@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
COMMAND_R = auto()
DBRX = auto()
OLMO = auto()
OPENELM = auto()


class MODEL_TENSOR(IntEnum):
@@ -217,6 +218,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OPENELM: "openelm",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -743,6 +745,18 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.OPENELM: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_DOWN,
],
# TODO
}

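As a quick sanity check of the new registration, the snippet below looks up the architecture name and the per-tensor GGUF names added for OpenELM. A sketch only, assuming the gguf-py package is importable (e.g. pip install -e gguf-py from the repository root) and that MODEL_ARCH_NAMES, MODEL_TENSORS and TENSOR_NAMES are re-exported at package level as they are elsewhere in gguf-py.

# Sketch: inspect the OpenELM entries added to gguf-py/gguf/constants.py.
import gguf

arch = gguf.MODEL_ARCH.OPENELM
print(gguf.MODEL_ARCH_NAMES[arch])        # expected: "openelm"

# The tensor kinds registered for the architecture in this commit,
# printed as their GGUF base names (block tensors keep the {bid} placeholder).
for tensor in gguf.MODEL_TENSORS[arch]:
    print(gguf.TENSOR_NAMES[tensor])
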
17 changes: 14 additions & 3 deletions gguf-py/gguf/tensor_mapping.py
@@ -24,6 +24,7 @@ class TensorNameMap:
"backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
"transformer.token_embeddings", # openelm
),

# Token type embeddings
@@ -36,6 +37,7 @@ class TensorNameMap:
"word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert
"transformer.norm", # openelm
),

# Position embeddings
@@ -68,6 +70,7 @@ class TensorNameMap:
"model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
"transformer.norm", # openelm
),

# Rope frequencies
@@ -97,6 +100,7 @@ class TensorNameMap:
"backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"transformer.layers.{bid}.attn_norm", # openelm
),

# Attention norm 2
@@ -117,7 +121,8 @@ class TensorNameMap:
"h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj" # phi3
"model.layers.{bid}.self_attn.qkv_proj", # phi3
"transformer.layers.{bid}.attn.qkv_proj", # openelm
),

# Attention query
@@ -175,6 +180,7 @@ class TensorNameMap:
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"transformer.layers.{bid}.attn.out_proj", # openelm
),

# Attention output norm
@@ -206,6 +212,7 @@ class TensorNameMap:
"h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"transformer.layers.{bid}.ffn_norm", # openelm
),

MODEL_TENSOR.FFN_GATE_INP: (
@@ -244,6 +251,7 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"transformer.layers.{bid}.ffn.proj_1", # openelm
),

MODEL_TENSOR.FFN_UP_EXP: (
@@ -306,6 +314,7 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
"transformer.layers.{bid}.ffn.proj_2", # openelm
),

MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -324,15 +333,17 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
),

MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
),

MODEL_TENSOR.ROPE_FREQS: (
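To see the new mappings resolve, here is a short sketch that maps a few OpenELM checkpoint tensor names to their GGUF names through TensorNameMap. It assumes the get_tensor_name_map helper and TensorNameMap.get_name(..., try_suffixes=...) behave as they are used elsewhere in convert-hf-to-gguf.py; the layer count of 16 matches the hard-coded 270M values above.

# Sketch: resolve OpenELM tensor names through the updated TensorNameMap.
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.OPENELM, 16)

for hf_name in (
    "transformer.token_embeddings.weight",        # expected -> token_embd.weight
    "transformer.layers.0.attn.qkv_proj.weight",  # expected -> blk.0.attn_qkv.weight
    "transformer.layers.0.ffn.proj_1.weight",     # expected -> blk.0.ffn_up.weight
    "transformer.norm.weight",                    # expected -> output_norm.weight
):
    print(hf_name, "->", tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))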
