diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py
index 27f2323..0f120b7 100644
--- a/src/convert_hf_to_gguf.py
+++ b/src/convert_hf_to_gguf.py
@@ -4403,83 +4403,81 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a huggingface model to a GGML compatible file"
-    )
+    parser = argparse.ArgumentParser(description="")
     parser.add_argument(
         "--vocab-only",
         action="store_true",
-        help="extract only the vocab",
+        help="",
     )
     parser.add_argument(
         "--outfile",
         type=Path,
-        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
+        help="",
     )
     parser.add_argument(
         "--outtype",
         type=str,
         choices=["f32", "f16", "bf16", "q8_0", "auto"],
         default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        help="",
    )
     parser.add_argument(
         "--bigendian",
         action="store_true",
-        help="model is executed on big endian machine",
+        help="",
     )
     parser.add_argument(
         "model",
         type=Path,
-        help="directory containing model file",
+        help="",
     )
     parser.add_argument(
         "--use-temp-file",
         action="store_true",
-        help="use the tempfile library while processing (helpful when running out of memory, process killed)",
+        help="",
     )
     parser.add_argument(
         "--no-lazy",
         action="store_true",
-        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
+        help="",
     )
     parser.add_argument(
         "--model-name",
         type=str,
         default=None,
-        help="name of the model",
+        help="",
     )
     parser.add_argument(
         "--verbose",
         action="store_true",
-        help="increase output verbosity",
+        help="",
     )
     parser.add_argument(
         "--split-max-tensors",
         type=int,
         default=0,
-        help="max tensors in each split",
+        help="",
     )
     parser.add_argument(
         "--split-max-size",
         type=str,
         default="0",
-        help="max size per split N(M|G)",
+        help="",
     )
     parser.add_argument(
         "--dry-run",
         action="store_true",
-        help="only print out a split plan and exit, without writing any new files",
+        help="",
     )
     parser.add_argument(
         "--no-tensor-first-split",
         action="store_true",
-        help="do not add tensors to the first split (disabled by default)",
+        help="",
     )
     parser.add_argument(
         "--metadata",
         type=Path,
-        help="Specify the path for an authorship metadata override file",
+        help="",
     )

     return parser.parse_args()
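Review note on this hunk: with every help string emptied, `--help` output carries no descriptions, but the argument surface itself is unchanged and still parses the same way. A minimal standalone sketch of the post-change parser behavior (this reproduces the flag shapes from the hunk rather than importing the real module; the model path value is hypothetical):

```python
# Standalone sketch mirroring the post-change parser; flags copied from the hunk above.
import argparse
from pathlib import Path

parser = argparse.ArgumentParser(description="")
parser.add_argument("--vocab-only", action="store_true", help="")
parser.add_argument("--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", help="")
parser.add_argument("model", type=Path, help="")

# Parsing still works; only the --help text has been lost.
args = parser.parse_args(["--outtype", "q8_0", "models/my-model"])  # hypothetical path
assert args.outtype == "q8_0" and args.model == Path("models/my-model")
assert not args.vocab_only
```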
"general.license" - LICENSE_NAME = "general.license.name" - LICENSE_LINK = "general.license.link" - - # Typically represents the converted GGUF repo (Unless native) - URL = "general.url" # Model Website/Paper - DOI = "general.doi" - UUID = "general.uuid" - REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...) - - # Model Source during conversion - SOURCE_URL = "general.source.url" # Model Website/Paper - SOURCE_DOI = "general.source.doi" - SOURCE_UUID = "general.source.uuid" - SOURCE_REPO_URL = ( - "general.source.repo_url" # Model Source Repository (git/svn/etc...) - ) - - # Base Model Source. There can be more than one source if it's a merged - # model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in - # tracing linage of models as it is finetuned or merged over time. - BASE_MODEL_COUNT = "general.base_model.count" - BASE_MODEL_NAME = "general.base_model.{id}.name" - BASE_MODEL_AUTHOR = "general.base_model.{id}.author" - BASE_MODEL_VERSION = "general.base_model.{id}.version" - BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" - BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper - BASE_MODEL_DOI = "general.base_model.{id}.doi" - BASE_MODEL_UUID = "general.base_model.{id}.uuid" - BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...) - - # Array based KV stores - TAGS = "general.tags" - LANGUAGES = "general.languages" - DATASETS = "general.datasets" + TYPE = "general.type" + ARCHITECTURE = "general.architecture" + QUANTIZATION_VERSION = "general.quantization_version" + ALIGNMENT = "general.alignment" + FILE_TYPE = "general.file_type" + + NAME = "general.name" + AUTHOR = "general.author" + VERSION = "general.version" + ORGANIZATION = "general.organization" + + FINETUNE = "general.finetune" + BASENAME = "general.basename" + + DESCRIPTION = "general.description" + QUANTIZED_BY = "general.quantized_by" + + SIZE_LABEL = "general.size_label" + + LICENSE = "general.license" + LICENSE_NAME = "general.license.name" + LICENSE_LINK = "general.license.link" + + URL = "general.url" + DOI = "general.doi" + UUID = "general.uuid" + REPO_URL = "general.repo_url" + + SOURCE_URL = "general.source.url" + SOURCE_DOI = "general.source.doi" + SOURCE_UUID = "general.source.uuid" + SOURCE_REPO_URL = "general.source.repo_url" + + BASE_MODEL_COUNT = "general.base_model.count" + BASE_MODEL_NAME = "general.base_model.{id}.name" + BASE_MODEL_AUTHOR = "general.base_model.{id}.author" + BASE_MODEL_VERSION = "general.base_model.{id}.version" + BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" + BASE_MODEL_URL = "general.base_model.{id}.url" + BASE_MODEL_DOI = "general.base_model.{id}.doi" + BASE_MODEL_UUID = "general.base_model.{id}.uuid" + BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" + + TAGS = "general.tags" + LANGUAGES = "general.languages" + DATASETS = "general.datasets" class LLM: - VOCAB_SIZE = "{arch}.vocab_size" - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - BLOCK_COUNT = "{arch}.block_count" - LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" - EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" + VOCAB_SIZE = "{arch}.vocab_size" + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + BLOCK_COUNT = "{arch}.block_count" + LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" + FEED_FORWARD_LENGTH = 
"{arch}.feed_forward_length" + EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length" - USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" - EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" - POOLING_TYPE = "{arch}.pooling_type" - LOGIT_SCALE = "{arch}.logit_scale" - DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" - ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" - FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" + EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" + DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - CAUSAL = "{arch}.attention.causal" - Q_LORA_RANK = "{arch}.attention.q_lora_rank" - KV_LORA_RANK = "{arch}.attention.kv_lora_rank" + CAUSAL = "{arch}.attention.causal" + Q_LORA_RANK = "{arch}.attention.q_lora_rank" + KV_LORA_RANK = "{arch}.attention.kv_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" - SLIDING_WINDOW = "{arch}.attention.sliding_window" + SLIDING_WINDOW = "{arch}.attention.sliding_window" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" class Split: - LLM_KV_SPLIT_NO = "split.no" - LLM_KV_SPLIT_COUNT = "split.count" + LLM_KV_SPLIT_NO = "split.no" + LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = 
"split.tensors.count" class SSM: - CONV_KERNEL = "{arch}.ssm.conv_kernel" - INNER_SIZE = "{arch}.ssm.inner_size" - STATE_SIZE = "{arch}.ssm.state_size" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" class Tokenizer: - MODEL = "tokenizer.ggml.model" - PRE = "tokenizer.ggml.pre" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = ( - "tokenizer.ggml.token_type_count" # for BERT-style token types - ) - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - CLS_ID = "tokenizer.ggml.cls_token_id" - MASK_ID = "tokenizer.ggml.mask_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" + MODEL = "tokenizer.ggml.model" + PRE = "tokenizer.ggml.pre" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" - CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" - CHAT_TEMPLATES = "tokenizer.chat_templates" - # FIM/Infill special tokens constants - PREFIX_ID = "tokenizer.ggml.prefix_token_id" - SUFFIX_ID = "tokenizer.ggml.suffix_token_id" - MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" + CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" + CHAT_TEMPLATES = "tokenizer.chat_templates" + + PREFIX_ID = "tokenizer.ggml.prefix_token_id" + SUFFIX_ID = "tokenizer.ggml.suffix_token_id" + MIDDLE_ID = "tokenizer.ggml.middle_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: - TYPE = "adapter.type" + TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" - -# -# recommended mapping of model tensor names for storage in gguf -# - - class GGUFType: - MODEL = "model" + MODEL = "model" ADAPTER = "adapter" - class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() JINA_BERT_V2 = 
-

 class MODEL_ARCH(IntEnum):
-    LLAMA = auto()
-    FALCON = auto()
-    BAICHUAN = auto()
-    GROK = auto()
-    GPT2 = auto()
-    GPTJ = auto()
-    GPTNEOX = auto()
-    MPT = auto()
-    STARCODER = auto()
-    REFACT = auto()
-    BERT = auto()
-    NOMIC_BERT = auto()
-    JINA_BERT_V2 = auto()
-    BLOOM = auto()
-    STABLELM = auto()
-    QWEN = auto()
-    QWEN2 = auto()
-    QWEN2MOE = auto()
-    PHI2 = auto()
-    PHI3 = auto()
-    PLAMO = auto()
-    CODESHELL = auto()
-    ORION = auto()
-    INTERNLM2 = auto()
-    MINICPM = auto()
-    GEMMA = auto()
-    GEMMA2 = auto()
-    STARCODER2 = auto()
-    MAMBA = auto()
-    XVERSE = auto()
-    COMMAND_R = auto()
-    DBRX = auto()
-    OLMO = auto()
-    OPENELM = auto()
-    ARCTIC = auto()
-    DEEPSEEK2 = auto()
-    CHATGLM = auto()
-    BITNET = auto()
-    T5 = auto()
-    JAIS = auto()
-
+    LLAMA = auto()
+    FALCON = auto()
+    BAICHUAN = auto()
+    GROK = auto()
+    GPT2 = auto()
+    GPTJ = auto()
+    GPTNEOX = auto()
+    MPT = auto()
+    STARCODER = auto()
+    REFACT = auto()
+    BERT = auto()
+    NOMIC_BERT = auto()
+    JINA_BERT_V2 = auto()
+    BLOOM = auto()
+    STABLELM = auto()
+    QWEN = auto()
+    QWEN2 = auto()
+    QWEN2MOE = auto()
+    PHI2 = auto()
+    PHI3 = auto()
+    PLAMO = auto()
+    CODESHELL = auto()
+    ORION = auto()
+    INTERNLM2 = auto()
+    MINICPM = auto()
+    GEMMA = auto()
+    GEMMA2 = auto()
+    STARCODER2 = auto()
+    MAMBA = auto()
+    XVERSE = auto()
+    COMMAND_R = auto()
+    DBRX = auto()
+    OLMO = auto()
+    OPENELM = auto()
+    ARCTIC = auto()
+    DEEPSEEK2 = auto()
+    CHATGLM = auto()
+    BITNET = auto()
+    T5 = auto()
+    T5ENCODER = auto()
+    JAIS = auto()
+    NEMOTRON = auto()
+    EXAONE = auto()

 class MODEL_TENSOR(IntEnum):
-    TOKEN_EMBD = auto()
-    TOKEN_EMBD_NORM = auto()
-    TOKEN_TYPES = auto()
-    POS_EMBD = auto()
-    OUTPUT = auto()
-    OUTPUT_NORM = auto()
-    ROPE_FREQS = auto()
-    ROPE_FACTORS_LONG = auto()
-    ROPE_FACTORS_SHORT = auto()
-    ATTN_Q = auto()
-    ATTN_K = auto()
-    ATTN_V = auto()
-    ATTN_QKV = auto()
-    ATTN_OUT = auto()
-    ATTN_NORM = auto()
-    ATTN_NORM_2 = auto()
-    ATTN_OUT_NORM = auto()
-    ATTN_POST_NORM = auto()
-    ATTN_ROT_EMBD = auto()
-    FFN_GATE_INP = auto()
-    FFN_GATE_INP_SHEXP = auto()
-    FFN_NORM = auto()
-    FFN_PRE_NORM = auto()
-    FFN_POST_NORM = auto()
-    FFN_GATE = auto()
-    FFN_DOWN = auto()
-    FFN_UP = auto()
-    FFN_ACT = auto()
-    FFN_NORM_EXP = auto()
-    FFN_GATE_EXP = auto()
-    FFN_DOWN_EXP = auto()
-    FFN_UP_EXP = auto()
-    FFN_GATE_SHEXP = auto()
-    FFN_DOWN_SHEXP = auto()
-    FFN_UP_SHEXP = auto()
-    ATTN_Q_NORM = auto()
-    ATTN_K_NORM = auto()
-    LAYER_OUT_NORM = auto()
-    SSM_IN = auto()
-    SSM_CONV1D = auto()
-    SSM_X = auto()
-    SSM_DT = auto()
-    SSM_A = auto()
-    SSM_D = auto()
-    SSM_OUT = auto()
-    ATTN_Q_A = auto()
-    ATTN_Q_B = auto()
-    ATTN_KV_A_MQA = auto()
-    ATTN_KV_B = auto()
-    ATTN_Q_A_NORM = auto()
-    ATTN_KV_A_NORM = auto()
-    FFN_SUB_NORM = auto()
-    ATTN_SUB_NORM = auto()
-    DEC_ATTN_NORM = auto()
-    DEC_ATTN_Q = auto()
-    DEC_ATTN_K = auto()
-    DEC_ATTN_V = auto()
-    DEC_ATTN_OUT = auto()
-    DEC_ATTN_REL_B = auto()
-    DEC_CROSS_ATTN_NORM = auto()
-    DEC_CROSS_ATTN_Q = auto()
-    DEC_CROSS_ATTN_K = auto()
-    DEC_CROSS_ATTN_V = auto()
-    DEC_CROSS_ATTN_OUT = auto()
+    TOKEN_EMBD = auto()
+    TOKEN_EMBD_NORM = auto()
+    TOKEN_TYPES = auto()
+    POS_EMBD = auto()
+    OUTPUT = auto()
+    OUTPUT_NORM = auto()
+    ROPE_FREQS = auto()
+    ROPE_FACTORS_LONG = auto()
+    ROPE_FACTORS_SHORT = auto()
+    ATTN_Q = auto()
+    ATTN_K = auto()
+    ATTN_V = auto()
+    ATTN_QKV = auto()
+    ATTN_OUT = auto()
+    ATTN_NORM = auto()
+    ATTN_NORM_2 = auto()
+    ATTN_OUT_NORM = auto()
+    ATTN_POST_NORM = auto()
+    ATTN_ROT_EMBD = auto()
+    FFN_GATE_INP = auto()
+    FFN_GATE_INP_SHEXP = auto()
+    FFN_NORM = auto()
+    FFN_PRE_NORM = auto()
+    FFN_POST_NORM = auto()
+    FFN_GATE = auto()
+    FFN_DOWN = auto()
+    FFN_UP = auto()
+    FFN_ACT = auto()
+    FFN_NORM_EXP = auto()
+    FFN_GATE_EXP = auto()
+    FFN_DOWN_EXP = auto()
+    FFN_UP_EXP = auto()
+    FFN_GATE_SHEXP = auto()
+    FFN_DOWN_SHEXP = auto()
+    FFN_UP_SHEXP = auto()
+    ATTN_Q_NORM = auto()
+    ATTN_K_NORM = auto()
+    LAYER_OUT_NORM = auto()
+    SSM_IN = auto()
+    SSM_CONV1D = auto()
+    SSM_X = auto()
+    SSM_DT = auto()
+    SSM_A = auto()
+    SSM_D = auto()
+    SSM_OUT = auto()
+    ATTN_Q_A = auto()
+    ATTN_Q_B = auto()
+    ATTN_KV_A_MQA = auto()
+    ATTN_KV_B = auto()
+    ATTN_Q_A_NORM = auto()
+    ATTN_KV_A_NORM = auto()
+    FFN_SUB_NORM = auto()
+    ATTN_SUB_NORM = auto()
+    DEC_ATTN_NORM = auto()
+    DEC_ATTN_Q = auto()
+    DEC_ATTN_K = auto()
+    DEC_ATTN_V = auto()
+    DEC_ATTN_OUT = auto()
+    DEC_ATTN_REL_B = auto()
+    DEC_CROSS_ATTN_NORM = auto()
+    DEC_CROSS_ATTN_Q = auto()
+    DEC_CROSS_ATTN_K = auto()
+    DEC_CROSS_ATTN_V = auto()
+    DEC_CROSS_ATTN_OUT = auto()
     DEC_CROSS_ATTN_REL_B = auto()
-    DEC_FFN_NORM = auto()
-    DEC_FFN_GATE = auto()
-    DEC_FFN_DOWN = auto()
-    DEC_FFN_UP = auto()
-    DEC_OUTPUT_NORM = auto()
-    ENC_ATTN_NORM = auto()
-    ENC_ATTN_Q = auto()
-    ENC_ATTN_K = auto()
-    ENC_ATTN_V = auto()
-    ENC_ATTN_OUT = auto()
-    ENC_ATTN_REL_B = auto()
-    ENC_FFN_NORM = auto()
-    ENC_FFN_GATE = auto()
-    ENC_FFN_DOWN = auto()
-    ENC_FFN_UP = auto()
-    ENC_OUTPUT_NORM = auto()
-
+    DEC_FFN_NORM = auto()
+    DEC_FFN_GATE = auto()
+    DEC_FFN_DOWN = auto()
+    DEC_FFN_UP = auto()
+    DEC_OUTPUT_NORM = auto()
+    ENC_ATTN_NORM = auto()
+    ENC_ATTN_Q = auto()
+    ENC_ATTN_K = auto()
+    ENC_ATTN_V = auto()
+    ENC_ATTN_OUT = auto()
+    ENC_ATTN_REL_B = auto()
+    ENC_FFN_NORM = auto()
+    ENC_FFN_GATE = auto()
+    ENC_FFN_DOWN = auto()
+    ENC_FFN_UP = auto()
+    ENC_OUTPUT_NORM = auto()

 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
-    MODEL_ARCH.LLAMA: "llama",
-    MODEL_ARCH.FALCON: "falcon",
-    MODEL_ARCH.BAICHUAN: "baichuan",
-    MODEL_ARCH.GROK: "grok",
-    MODEL_ARCH.GPT2: "gpt2",
-    MODEL_ARCH.GPTJ: "gptj",
-    MODEL_ARCH.GPTNEOX: "gptneox",
-    MODEL_ARCH.MPT: "mpt",
-    MODEL_ARCH.STARCODER: "starcoder",
-    MODEL_ARCH.REFACT: "refact",
-    MODEL_ARCH.BERT: "bert",
-    MODEL_ARCH.NOMIC_BERT: "nomic-bert",
-    MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
-    MODEL_ARCH.BLOOM: "bloom",
-    MODEL_ARCH.STABLELM: "stablelm",
-    MODEL_ARCH.QWEN: "qwen",
-    MODEL_ARCH.QWEN2: "qwen2",
-    MODEL_ARCH.QWEN2MOE: "qwen2moe",
-    MODEL_ARCH.PHI2: "phi2",
-    MODEL_ARCH.PHI3: "phi3",
-    MODEL_ARCH.PLAMO: "plamo",
-    MODEL_ARCH.CODESHELL: "codeshell",
-    MODEL_ARCH.ORION: "orion",
-    MODEL_ARCH.INTERNLM2: "internlm2",
-    MODEL_ARCH.MINICPM: "minicpm",
-    MODEL_ARCH.GEMMA: "gemma",
-    MODEL_ARCH.GEMMA2: "gemma2",
-    MODEL_ARCH.STARCODER2: "starcoder2",
-    MODEL_ARCH.MAMBA: "mamba",
-    MODEL_ARCH.XVERSE: "xverse",
-    MODEL_ARCH.COMMAND_R: "command-r",
-    MODEL_ARCH.DBRX: "dbrx",
-    MODEL_ARCH.OLMO: "olmo",
-    MODEL_ARCH.OPENELM: "openelm",
-    MODEL_ARCH.ARCTIC: "arctic",
-    MODEL_ARCH.DEEPSEEK2: "deepseek2",
-    MODEL_ARCH.CHATGLM: "chatglm",
-    MODEL_ARCH.BITNET: "bitnet",
-    MODEL_ARCH.T5: "t5",
-    MODEL_ARCH.JAIS: "jais",
+    MODEL_ARCH.LLAMA: "llama",
+    MODEL_ARCH.FALCON: "falcon",
+    MODEL_ARCH.BAICHUAN: "baichuan",
+    MODEL_ARCH.GROK: "grok",
+    MODEL_ARCH.GPT2: "gpt2",
+    MODEL_ARCH.GPTJ: "gptj",
+    MODEL_ARCH.GPTNEOX: "gptneox",
+    MODEL_ARCH.MPT: "mpt",
+    MODEL_ARCH.STARCODER: "starcoder",
+    MODEL_ARCH.REFACT: "refact",
+    MODEL_ARCH.BERT: "bert",
+    MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+    MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
+    MODEL_ARCH.BLOOM: "bloom",
+    MODEL_ARCH.STABLELM: "stablelm",
+    MODEL_ARCH.QWEN: "qwen",
+    MODEL_ARCH.QWEN2: "qwen2",
+    MODEL_ARCH.QWEN2MOE: "qwen2moe",
+    MODEL_ARCH.PHI2: "phi2",
+    MODEL_ARCH.PHI3: "phi3",
+    MODEL_ARCH.PLAMO: "plamo",
+    MODEL_ARCH.CODESHELL: "codeshell",
+    MODEL_ARCH.ORION: "orion",
+    MODEL_ARCH.INTERNLM2: "internlm2",
+    MODEL_ARCH.MINICPM: "minicpm",
+    MODEL_ARCH.GEMMA: "gemma",
+    MODEL_ARCH.GEMMA2: "gemma2",
+    MODEL_ARCH.STARCODER2: "starcoder2",
+    MODEL_ARCH.MAMBA: "mamba",
+    MODEL_ARCH.XVERSE: "xverse",
+    MODEL_ARCH.COMMAND_R: "command-r",
+    MODEL_ARCH.DBRX: "dbrx",
+    MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.OPENELM: "openelm",
+    MODEL_ARCH.ARCTIC: "arctic",
+    MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.CHATGLM: "chatglm",
+    MODEL_ARCH.BITNET: "bitnet",
+    MODEL_ARCH.T5: "t5",
+    MODEL_ARCH.T5ENCODER: "t5encoder",
+    MODEL_ARCH.JAIS: "jais",
+    MODEL_ARCH.NEMOTRON: "nemotron",
+    MODEL_ARCH.EXAONE: "exaone",
 }
"dec.blk.{bid}.cross_attn_k", - MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", - MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", + MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", + MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", + MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", + MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", + MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", + MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", + MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", + MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", + MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", + MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", + MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", + MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", + MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", + MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", + MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", + MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", + MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", + MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", - 
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", - MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", - MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", - MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", - MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", - MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", - MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", - MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", - MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", - MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", - MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", - MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", - MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", - MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", - MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", - MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", + MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", + MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", + MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", + MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", + MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", + MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", + MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", + MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", + MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", + MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", + MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", + MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", + MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", + MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -981,7 +958,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], - MODEL_ARCH.CHATGLM: [ + MODEL_ARCH.CHATGLM : [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.OUTPUT_NORM, @@ -1040,6 +1017,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.T5ENCODER: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ENC_ATTN_NORM, + MODEL_TENSOR.ENC_ATTN_Q, + MODEL_TENSOR.ENC_ATTN_K, + MODEL_TENSOR.ENC_ATTN_V, + MODEL_TENSOR.ENC_ATTN_OUT, + MODEL_TENSOR.ENC_ATTN_REL_B, + MODEL_TENSOR.ENC_FFN_NORM, + MODEL_TENSOR.ENC_FFN_GATE, + MODEL_TENSOR.ENC_FFN_DOWN, + MODEL_TENSOR.ENC_FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + ], MODEL_ARCH.JAIS: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1052,10 +1044,40 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_UP, ], - # TODO + MODEL_ARCH.NEMOTRON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.EXAONE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + } -# tensors that will not be serialized MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.LLAMA: [ 
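Per-block entries in `TENSOR_NAMES` carry a `{bid}` placeholder and are expanded per layer when tensors are written; global tensors are plain strings. For example (sketch, same assumed import path):

```python
from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

# Block-local tensors are templates over the block id...
assert TENSOR_NAMES[MODEL_TENSOR.ATTN_Q].format(bid=3) == "blk.3.attn_q"
assert TENSOR_NAMES[MODEL_TENSOR.ENC_FFN_UP].format(bid=0) == "enc.blk.0.ffn_up"

# ...while global tensors need no formatting.
assert TENSOR_NAMES[MODEL_TENSOR.OUTPUT_NORM] == "output_norm"
```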

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -981,7 +958,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
-    MODEL_ARCH.CHATGLM: [
+    MODEL_ARCH.CHATGLM : [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1040,6 +1017,21 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ENC_FFN_UP,
         MODEL_TENSOR.ENC_OUTPUT_NORM,
     ],
+    MODEL_ARCH.T5ENCODER: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ENC_ATTN_NORM,
+        MODEL_TENSOR.ENC_ATTN_Q,
+        MODEL_TENSOR.ENC_ATTN_K,
+        MODEL_TENSOR.ENC_ATTN_V,
+        MODEL_TENSOR.ENC_ATTN_OUT,
+        MODEL_TENSOR.ENC_ATTN_REL_B,
+        MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
+        MODEL_TENSOR.ENC_FFN_DOWN,
+        MODEL_TENSOR.ENC_FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+    ],
     MODEL_ARCH.JAIS: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1052,10 +1044,40 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_UP,
     ],
-    # TODO
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.EXAONE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+
 }

-# tensors that will not be serialized
 MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.ROPE_FREQS,
@@ -1092,127 +1114,119 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.CHATGLM: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
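The deleted `# tensors that will not be serialized` comment described what `MODEL_TENSOR_SKIP` is for, so the new NEMOTRON entry keeps `rope_freqs` and `attn_rot_embd` out of the output file. Roughly the shape of the lookup a writer performs; the converter's real write path is more involved, this only sketches the intent:

```python
from gguf.constants import MODEL_ARCH, MODEL_TENSOR_SKIP, MODEL_TENSORS

def serializable_tensors(arch: MODEL_ARCH) -> list:
    # Everything the architecture defines, minus the entries marked as skip.
    skip = set(MODEL_TENSOR_SKIP.get(arch, []))
    return [t for t in MODEL_TENSORS[arch] if t not in skip]

# For NEMOTRON this drops the ROPE_FREQS and ATTN_ROT_EMBD entries added above.
tensors = serializable_tensors(MODEL_ARCH.NEMOTRON)
```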

-#
-# types
-#
-
-
 class TokenType(IntEnum):
-    NORMAL = 1
-    UNKNOWN = 2
-    CONTROL = 3
     USER_DEFINED = 4
-    UNUSED = 5
-    BYTE = 6
-
+    NORMAL = 1
+    UNKNOWN = 2
+    CONTROL = 3
     USER_DEFINED = 4
+    UNUSED = 5
+    BYTE = 6

 class RopeScalingType(Enum):
-    NONE = "none"
-    LINEAR = "linear"
-    YARN = "yarn"
-
+    NONE = 'none'
+    LINEAR = 'linear'
+    YARN = 'yarn'

 class PoolingType(IntEnum):
     NONE = 0
     MEAN = 1
-    CLS = 2
-
+    CLS = 2

 class GGMLQuantizationType(IntEnum):
-    F32 = 0
-    F16 = 1
-    Q4_0 = 2
-    Q4_1 = 3
-    Q5_0 = 6
-    Q5_1 = 7
-    Q8_0 = 8
-    Q8_1 = 9
-    Q2_K = 10
-    Q3_K = 11
-    Q4_K = 12
-    Q5_K = 13
-    Q6_K = 14
-    Q8_K = 15
     IQ2_XXS = 16
-    IQ2_XS = 17
     IQ3_XXS = 18
-    IQ1_S = 19
-    IQ4_NL = 20
-    IQ3_S = 21
-    IQ2_S = 22
-    IQ4_XS = 23
-    I8 = 24
-    I16 = 25
-    I32 = 26
-    I64 = 27
-    F64 = 28
-    IQ1_M = 29
-    BF16 = 30
-
+    F32 = 0
+    F16 = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15
     IQ2_XXS = 16
+    IQ2_XS = 17
     IQ3_XXS = 18
+    IQ1_S = 19
+    IQ4_NL = 20
+    IQ3_S = 21
+    IQ2_S = 22
+    IQ4_XS = 23
+    I8 = 24
+    I16 = 25
+    I32 = 26
+    I64 = 27
+    F64 = 28
+    IQ1_M = 29
+    BF16 = 30
+    Q4_0_4_4 = 31
+    Q4_0_4_8 = 32
+    Q4_0_8_8 = 33

-# TODO: add GGMLFileType from ggml_ftype in ggml.h
-
-
-# from llama_ftype in llama.h
-# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
 class LlamaFileType(IntEnum):
-    ALL_F32 = 0
-    MOSTLY_F16 = 1  # except 1d tensors
-    MOSTLY_Q4_0 = 2  # except 1d tensors
-    MOSTLY_Q4_1 = 3  # except 1d tensors
-    MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
-    # MOSTLY_Q4_2 = 5  # support has been removed
-    # MOSTLY_Q4_3 = 6  # support has been removed
-    MOSTLY_Q8_0 = 7  # except 1d tensors
-    MOSTLY_Q5_0 = 8  # except 1d tensors
-    MOSTLY_Q5_1 = 9  # except 1d tensors
-    MOSTLY_Q2_K = 10  # except 1d tensors
-    MOSTLY_Q3_K_S = 11  # except 1d tensors
-    MOSTLY_Q3_K_M = 12  # except 1d tensors
-    MOSTLY_Q3_K_L = 13  # except 1d tensors
-    MOSTLY_Q4_K_S = 14  # except 1d tensors
-    MOSTLY_Q4_K_M = 15  # except 1d tensors
-    MOSTLY_Q5_K_S = 16  # except 1d tensors
-    MOSTLY_Q5_K_M = 17  # except 1d tensors
-    MOSTLY_Q6_K = 18  # except 1d tensors
-    MOSTLY_IQ2_XXS = 19  # except 1d tensors
-    MOSTLY_IQ2_XS = 20  # except 1d tensors
-    MOSTLY_Q2_K_S = 21  # except 1d tensors
-    MOSTLY_IQ3_XS = 22  # except 1d tensors
-    MOSTLY_IQ3_XXS = 23  # except 1d tensors
-    MOSTLY_IQ1_S = 24  # except 1d tensors
-    MOSTLY_IQ4_NL = 25  # except 1d tensors
-    MOSTLY_IQ3_S = 26  # except 1d tensors
-    MOSTLY_IQ3_M = 27  # except 1d tensors
-    MOSTLY_IQ2_S = 28  # except 1d tensors
-    MOSTLY_IQ2_M = 29  # except 1d tensors
-    MOSTLY_IQ4_XS = 30  # except 1d tensors
-    MOSTLY_IQ1_M = 31  # except 1d tensors
-    MOSTLY_BF16 = 32  # except 1d tensors
-
-    GUESSED = 1024  # not specified in the model file
-
+    ALL_F32 = 0
+    MOSTLY_F16 = 1
+    MOSTLY_Q4_0 = 2
+    MOSTLY_Q4_1 = 3
+
+    MOSTLY_Q8_0 = 7
+    MOSTLY_Q5_0 = 8
+    MOSTLY_Q5_1 = 9
+    MOSTLY_Q2_K = 10
+    MOSTLY_Q3_K_S = 11
+    MOSTLY_Q3_K_M = 12
+    MOSTLY_Q3_K_L = 13
+    MOSTLY_Q4_K_S = 14
+    MOSTLY_Q4_K_M = 15
+    MOSTLY_Q5_K_S = 16
+    MOSTLY_Q5_K_M = 17
+    MOSTLY_Q6_K = 18
+    MOSTLY_IQ2_XXS = 19
+    MOSTLY_IQ2_XS = 20
+    MOSTLY_Q2_K_S = 21
+    MOSTLY_IQ3_XS = 22
+    MOSTLY_IQ3_XXS = 23
+    MOSTLY_IQ1_S = 24
+    MOSTLY_IQ4_NL = 25
+    MOSTLY_IQ3_S = 26
+    MOSTLY_IQ3_M = 27
+    MOSTLY_IQ2_S = 28
+    MOSTLY_IQ2_M = 29
+    MOSTLY_IQ4_XS = 30
+    MOSTLY_IQ1_M = 31
+    MOSTLY_BF16 = 32
+    MOSTLY_Q4_0_4_4 = 33
+    MOSTLY_Q4_0_4_8 = 34
+    MOSTLY_Q4_0_8_8 = 35
+
+    GUESSED = 1024

 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
-

 class GGUFValueType(IntEnum):
-    UINT8 = 0
-    INT8 = 1
-    UINT16 = 2
-    INT16 = 3
-    UINT32 = 4
-    INT32 = 5
+    UINT8 = 0
+    INT8 = 1
+    UINT16 = 2
+    INT16 = 3
+    UINT32 = 4
+    INT32 = 5
     FLOAT32 = 6
-    BOOL = 7
-    STRING = 8
-    ARRAY = 9
-    UINT64 = 10
-    INT64 = 11
+    BOOL = 7
+    STRING = 8
+    ARRAY = 9
+    UINT64 = 10
+    INT64 = 11
     FLOAT64 = 12

     @staticmethod
@@ -1227,108 +1241,101 @@ def get_type(val: Any) -> GGUFValueType:
             return GGUFValueType.BOOL
         elif isinstance(val, int):
             return GGUFValueType.INT32
-        # TODO: need help with 64-bit types in Python
+
         else:
             raise ValueError(f"Unknown type: {type(val)}")

-
-# Items here are (block size, type size)
 QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
-    GGMLQuantizationType.F32: (1, 4),
-    GGMLQuantizationType.F16: (1, 2),
-    GGMLQuantizationType.Q4_0: (32, 2 + 16),
-    GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
-    GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
-    GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
-    GGMLQuantizationType.Q8_0: (32, 2 + 32),
-    GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
-    GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
-    GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
-    GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
-    GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
-    GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
-    GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
+    GGMLQuantizationType.F32: (1, 4),
+    GGMLQuantizationType.F16: (1, 2),
+    GGMLQuantizationType.Q4_0: (32, 2 + 16),
+    GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
+    GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
+    GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
+    GGMLQuantizationType.Q8_0: (32, 2 + 32),
+    GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
+    GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
+    GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
     GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
-    GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
+    GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
     GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
-    GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
-    GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
-    GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
-    GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
-    GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
-    GGMLQuantizationType.I8: (1, 1),
-    GGMLQuantizationType.I16: (1, 2),
-    GGMLQuantizationType.I32: (1, 4),
-    GGMLQuantizationType.I64: (1, 8),
-    GGMLQuantizationType.F64: (1, 8),
-    GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
-    GGMLQuantizationType.BF16: (1, 2),
+    GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
+    GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
+    GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
+    GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8: (1, 1),
+    GGMLQuantizationType.I16: (1, 2),
+    GGMLQuantizationType.I32: (1, 4),
+    GGMLQuantizationType.I64: (1, 8),
+    GGMLQuantizationType.F64: (1, 8),
+    GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
+    GGMLQuantizationType.BF16: (1, 2),
+    GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
+    GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
+    GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
 }
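Each `GGML_QUANT_SIZES` entry is still `(block size, type size)`, as the deleted comment used to say, and the three new `Q4_0_4_4`/`Q4_0_4_8`/`Q4_0_8_8` layouts reuse Q4_0's 32-element, 18-byte blocks. The byte size of a quantized row follows directly; a small sketch:

```python
from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

def row_nbytes(n_elements: int, qtype: GGMLQuantizationType) -> int:
    # Entries are (block size in elements, bytes per block).
    block_size, type_size = GGML_QUANT_SIZES[qtype]
    assert n_elements % block_size == 0, "rows must be whole blocks"
    return n_elements // block_size * type_size

# A 4096x4096 Q8_0 tensor: 32-element blocks of 2 + 32 = 34 bytes.
assert row_nbytes(4096 * 4096, GGMLQuantizationType.Q8_0) == 17825792
# The new repacked Q4_0 variants cost the same as Q4_0 itself.
assert row_nbytes(4096, GGMLQuantizationType.Q4_0_4_4) == 4096 // 32 * 18
```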

-
-# Aliases for backward compatibility.
-
-# general
-KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
+KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
 KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
-KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
-KEY_GENERAL_NAME = Keys.General.NAME
-KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
-KEY_GENERAL_URL = Keys.General.URL
-KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
-KEY_GENERAL_LICENSE = Keys.General.LICENSE
-KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
-KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
-
-# LLM
-KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
-KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
-KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
-KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
-KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
+KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
+KEY_GENERAL_NAME = Keys.General.NAME
+KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
+KEY_GENERAL_URL = Keys.General.URL
+KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
+KEY_GENERAL_LICENSE = Keys.General.LICENSE
+KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
+KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
+
+KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
+KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
+KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
+KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
+KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
-KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT
+KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT

-# attention
-KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
-KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
-KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
-KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
-KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
+KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
+KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
+KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
+KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
+KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
 KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS

-# RoPE
-KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
-KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
-KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
-KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
+KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
+KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
+KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
+KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
 KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
-KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED
+KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED

-# SSM
-KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
-KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
-KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
+KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
+KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
+KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
 KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK

-# tokenization
-KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
-KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
-KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
+KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
+KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
+KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
 KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
-KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
-KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
-KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
-KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
-KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
-KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
-KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
-KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
-KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
-KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
-KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
-KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
-KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
-KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
-KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
+KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
+KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
+KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
+KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
+KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
+KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
+KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
+KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
+KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
+KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
+KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
+KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
+KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
\ No newline at end of file
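One behavioral nit worth calling out in the `get_type` hunk earlier in this file: the `bool` branch runs before the `int` branch (it has to, since `bool` subclasses `int` in Python), and with the TODO comment gone, an unsupported type now raises. A quick check of the behavior as shown in that hunk, assuming `gguf.constants` is importable:

```python
from gguf.constants import GGUFValueType

assert GGUFValueType.get_type(True) is GGUFValueType.BOOL  # bool checked before int
assert GGUFValueType.get_type(7) is GGUFValueType.INT32
try:
    GGUFValueType.get_type(object())
except ValueError as exc:
    print(exc)  # Unknown type: <class 'object'>
```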
diff --git a/src/gguf-py/gguf/tensor_mapping.py b/src/gguf-py/gguf/tensor_mapping.py
index 5389d5f..75d23b4 100644
--- a/src/gguf-py/gguf/tensor_mapping.py
+++ b/src/gguf-py/gguf/tensor_mapping.py
@@ -4,471 +4,569 @@
 from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES

-
 class TensorNameMap:
     mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
-        # Token embeddings
+
         MODEL_TENSOR.TOKEN_EMBD: (
-            "gpt_neox.embed_in",  # gptneox
-            "transformer.wte",  # gpt2 gpt-j mpt refact qwen dbrx jais
-            "transformer.word_embeddings",  # falcon
-            "word_embeddings",  # bloom
-            "model.embed_tokens",  # llama-hf
-            "tok_embeddings",  # llama-pth
-            "embeddings.word_embeddings",  # bert nomic-bert
-            "language_model.embedding.word_embeddings",  # persimmon
-            "wte",  # gpt2
-            "transformer.embd.wte",  # phi2
-            "model.tok_embeddings",  # internlm2
-            "model.embedding",  # mamba-qbert
-            "backbone.embedding",  # mamba
-            "backbone.embeddings",  # mamba-hf
-            "transformer.in_out_embed",  # Grok
-            "embedding.word_embeddings",  # chatglm
-            "transformer.token_embeddings",  # openelm
-            "shared",  # t5
-        ),
-        # Token type embeddings
+            "gpt_neox.embed_in",
+            "transformer.wte",
+            "transformer.word_embeddings",
+            "word_embeddings",
+            "model.embed_tokens",
+            "tok_embeddings",
+            "embeddings.word_embeddings",
+            "language_model.embedding.word_embeddings",
+            "wte",
+            "transformer.embd.wte",
+            "model.tok_embeddings",
+            "model.embedding",
+            "backbone.embedding",
+            "backbone.embeddings",
+            "transformer.in_out_embed",
+            "embedding.word_embeddings",
+            "transformer.token_embeddings",
+            "shared",
+        ),
+
         MODEL_TENSOR.TOKEN_TYPES: (
-            "embeddings.token_type_embeddings",  # bert nomic-bert
+            "embeddings.token_type_embeddings",
         ),
-        # Normalization of token embeddings
+
         MODEL_TENSOR.TOKEN_EMBD_NORM: (
-            "word_embeddings_layernorm",  # bloom
-            "embeddings.LayerNorm",  # bert
-            "emb_ln",  # nomic-bert
-            "transformer.norm",  # openelm
+            "word_embeddings_layernorm",
+            "embeddings.LayerNorm",
+            "emb_ln",
+            "transformer.norm",
         ),
-        # Position embeddings
+
         MODEL_TENSOR.POS_EMBD: (
-            "transformer.wpe",  # gpt2
-            "embeddings.position_embeddings",  # bert
-            "wpe",  # gpt2
+            "transformer.wpe",
+            "embeddings.position_embeddings",
+            "wpe",
         ),
-        # Output
+
         MODEL_TENSOR.OUTPUT: (
-            "embed_out",  # gptneox
-            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
-            "output",  # llama-pth bloom internlm2
-            "word_embeddings_for_head",  # persimmon
-            "lm_head.linear",  # phi2
-            "output_layer",  # chatglm
-        ),
-        # Output norm
+            "embed_out",
+            "lm_head",
+            "output",
+            "word_embeddings_for_head",
+            "lm_head.linear",
+            "output_layer",
+        ),
+
         MODEL_TENSOR.OUTPUT_NORM: (
-            "gpt_neox.final_layer_norm",  # gptneox
-            "transformer.ln_f",  # gpt2 gpt-j falcon jais
-            "model.norm",  # llama-hf baichuan internlm2
-            "norm",  # llama-pth
-            "transformer.norm_f",  # mpt dbrx
-            "ln_f",  # refact bloom qwen gpt2
-            "language_model.encoder.final_layernorm",  # persimmon
-            "model.final_layernorm",  # persimmon
-            "lm_head.ln",  # phi2
-            "model.norm_f",  # mamba-qbert
-            "backbone.norm_f",  # mamba
-            "transformer.rms_norm",  # Grok
-            "encoder.final_layernorm",  # chatglm
-            "transformer.norm",  # openelm
-        ),
-        # Rope frequencies
+            "gpt_neox.final_layer_norm",
+            "transformer.ln_f",
+            "model.norm",
+            "norm",
+            "transformer.norm_f",
+            "ln_f",
+            "language_model.encoder.final_layernorm",
+            "model.final_layernorm",
+            "lm_head.ln",
+            "model.norm_f",
+            "backbone.norm_f",
+            "transformer.rms_norm",
+            "encoder.final_layernorm",
+            "transformer.norm",
+            "model.norm",
+        ),
+
         MODEL_TENSOR.ROPE_FREQS: (
-            "rope.freqs",  # llama-pth
-            "rotary_pos_emb.inv_freq",  # chatglm
+            "rope.freqs",
+            "rotary_pos_emb.inv_freq",
         ),
     }

     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
-        # Attention norm
+
         MODEL_TENSOR.ATTN_NORM: (
-            "gpt_neox.layers.{bid}.input_layernorm",  # gptneox
-            "transformer.h.{bid}.ln_1",  # gpt2 gpt-j refact qwen jais
-            "transformer.blocks.{bid}.norm_1",  # mpt
-            "transformer.h.{bid}.input_layernorm",  # falcon7b
-            "h.{bid}.input_layernorm",  # bloom
-            "transformer.h.{bid}.ln_mlp",  # falcon40b
-            "model.layers.{bid}.input_layernorm",  # llama-hf
-            "layers.{bid}.attention_norm",  # llama-pth
-            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
-            "model.layers.{bid}.ln1",  # yi
-            "h.{bid}.ln_1",  # gpt2
-            "transformer.h.{bid}.ln",  # phi2
-            "model.layers.layers.{bid}.norm",  # plamo
-            "model.layers.{bid}.attention_norm",  # internlm2
-            "model.layers.{bid}.norm",  # mamba-qbert
-            "backbone.layers.{bid}.norm",  # mamba
-            "transformer.decoder_layer.{bid}.rms_norm",  # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.norm_1",  # dbrx
-            "encoder.layers.{bid}.input_layernorm",  # chatglm
-            "transformer.layers.{bid}.attn_norm",  # openelm
-        ),
-        # Attention norm 2
+            "gpt_neox.layers.{bid}.input_layernorm",
+            "transformer.h.{bid}.ln_1",
+            "transformer.blocks.{bid}.norm_1",
+            "transformer.h.{bid}.input_layernorm",
+            "h.{bid}.input_layernorm",
+            "transformer.h.{bid}.ln_mlp",
+            "model.layers.{bid}.input_layernorm",
+            "layers.{bid}.attention_norm",
+            "language_model.encoder.layers.{bid}.input_layernorm",
+            "model.layers.{bid}.ln1",
+            "h.{bid}.ln_1",
+            "transformer.h.{bid}.ln",
+            "model.layers.layers.{bid}.norm",
+            "model.layers.{bid}.attention_norm",
+            "model.layers.{bid}.norm",
+            "backbone.layers.{bid}.norm",
+            "transformer.decoder_layer.{bid}.rms_norm",
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",
+            "encoder.layers.{bid}.input_layernorm",
+            "transformer.layers.{bid}.attn_norm",
+        ),
+
         MODEL_TENSOR.ATTN_NORM_2: (
-            "transformer.h.{bid}.ln_attn",  # falcon40b
-            "encoder.layer.{bid}.layer_norm_1",  # jina-v2-code
+            "transformer.h.{bid}.ln_attn",
+            "encoder.layer.{bid}.layer_norm_1",
         ),
-        # Attention query-key-value
+
         MODEL_TENSOR.ATTN_QKV: (
-            "gpt_neox.layers.{bid}.attention.query_key_value",  # gptneox
-            "transformer.h.{bid}.attn.c_attn",  # gpt2 qwen jais
-            "transformer.blocks.{bid}.attn.Wqkv",  # mpt
-            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",  # dbrx
-            "transformer.h.{bid}.self_attention.query_key_value",  # falcon
-            "h.{bid}.self_attention.query_key_value",  # bloom
-            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
-            "model.layers.{bid}.self_attn.query_key_value",  # persimmon
-            "h.{bid}.attn.c_attn",  # gpt2
-            "transformer.h.{bid}.mixer.Wqkv",  # phi2
-            "encoder.layers.{bid}.attn.Wqkv",  # nomic-bert
-            "model.layers.{bid}.self_attn.qkv_proj",  # phi3
-            "encoder.layers.{bid}.self_attention.query_key_value",  # chatglm
-            "transformer.layers.{bid}.attn.qkv_proj",  # openelm
-        ),
-        # Attention query
+            "gpt_neox.layers.{bid}.attention.query_key_value",
+            "transformer.h.{bid}.attn.c_attn",
+            "transformer.blocks.{bid}.attn.Wqkv",
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",
+            "transformer.h.{bid}.self_attention.query_key_value",
+            "h.{bid}.self_attention.query_key_value",
+            "language_model.encoder.layers.{bid}.self_attention.query_key_value",
+            "model.layers.{bid}.self_attn.query_key_value",
+            "h.{bid}.attn.c_attn",
+            "transformer.h.{bid}.mixer.Wqkv",
+            "encoder.layers.{bid}.attn.Wqkv",
+            "model.layers.{bid}.self_attn.qkv_proj",
+            "encoder.layers.{bid}.self_attention.query_key_value",
+            "transformer.layers.{bid}.attn.qkv_proj",
+        ),
+
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",  # llama-hf
-            "layers.{bid}.attention.wq",  # llama-pth
-            "encoder.layer.{bid}.attention.self.query",  # bert
-            "transformer.h.{bid}.attn.q_proj",  # gpt-j
-            "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
-            "model.layers.{bid}.attention.wq",  # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
-        ),
-        # Attention key
+            "model.layers.{bid}.self_attn.q_proj",
+            "layers.{bid}.attention.wq",
+            "encoder.layer.{bid}.attention.self.query",
+            "transformer.h.{bid}.attn.q_proj",
+            "model.layers.layers.{bid}.self_attn.q_proj",
+            "model.layers.{bid}.attention.wq",
+            "transformer.decoder_layer.{bid}.multi_head_attention.query",
+            "transformer.h.{bid}.attn.attention.q_proj",
+        ),
+
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",  # llama-hf
-            "layers.{bid}.attention.wk",  # llama-pth
-            "encoder.layer.{bid}.attention.self.key",  # bert
-            "transformer.h.{bid}.attn.k_proj",  # gpt-j
-            "transformer.h.{bid}.attn.k",  # refact
-            "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
-            "model.layers.{bid}.attention.wk",  # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
-        ),
-        # Attention value
+            "model.layers.{bid}.self_attn.k_proj",
+            "layers.{bid}.attention.wk",
+            "encoder.layer.{bid}.attention.self.key",
+            "transformer.h.{bid}.attn.k_proj",
+            "transformer.h.{bid}.attn.k",
+            "model.layers.layers.{bid}.self_attn.k_proj",
+            "model.layers.{bid}.attention.wk",
+            "transformer.decoder_layer.{bid}.multi_head_attention.key",
+            "transformer.h.{bid}.attn.attention.k_proj",
+        ),
+
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",  # llama-hf
-            "layers.{bid}.attention.wv",  # llama-pth
-            "encoder.layer.{bid}.attention.self.value",  # bert
-            "transformer.h.{bid}.attn.v_proj",  # gpt-j
-            "transformer.h.{bid}.attn.v",  # refact
-            "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
-            "model.layers.{bid}.attention.wv",  # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
-        ),
-        # Attention output
+            "model.layers.{bid}.self_attn.v_proj",
+            "layers.{bid}.attention.wv",
+            "encoder.layer.{bid}.attention.self.value",
+            "transformer.h.{bid}.attn.v_proj",
+            "transformer.h.{bid}.attn.v",
+            "model.layers.layers.{bid}.self_attn.v_proj",
+            "model.layers.{bid}.attention.wv",
+            "transformer.decoder_layer.{bid}.multi_head_attention.value",
+            "transformer.h.{bid}.attn.attention.v_proj",
+        ),
+
         MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",  # gptneox
-            "transformer.h.{bid}.attn.c_proj",  # gpt2 refact qwen jais
-            "transformer.blocks.{bid}.attn.out_proj",  # mpt
-            "transformer.h.{bid}.self_attention.dense",  # falcon
-            "h.{bid}.self_attention.dense",  # bloom
-            "model.layers.{bid}.self_attn.o_proj",  # llama-hf
-            "layers.{bid}.attention.wo",  # llama-pth
-            "encoder.layer.{bid}.attention.output.dense",  # bert
-            "transformer.h.{bid}.attn.out_proj",  # gpt-j
-            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
-            "model.layers.{bid}.self_attn.dense",  # persimmon
-            "h.{bid}.attn.c_proj",  # gpt2
-            "transformer.h.{bid}.mixer.out_proj",  # phi2
-            "model.layers.layers.{bid}.self_attn.o_proj",  # plamo
-            "model.layers.{bid}.attention.wo",  # internlm2
-            "encoder.layers.{bid}.attn.out_proj",  # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",  # dbrx
-            "encoder.layers.{bid}.self_attention.dense",  # chatglm
-            "transformer.layers.{bid}.attn.out_proj",  # openelm
-        ),
-        # Attention output norm
+            "gpt_neox.layers.{bid}.attention.dense",
+            "transformer.h.{bid}.attn.c_proj",
+            "transformer.blocks.{bid}.attn.out_proj",
+            "transformer.h.{bid}.self_attention.dense",
+            "h.{bid}.self_attention.dense",
+            "model.layers.{bid}.self_attn.o_proj",
+            "layers.{bid}.attention.wo",
+            "encoder.layer.{bid}.attention.output.dense",
+            "transformer.h.{bid}.attn.out_proj",
+            "language_model.encoder.layers.{bid}.self_attention.dense",
+            "model.layers.{bid}.self_attn.dense",
+            "h.{bid}.attn.c_proj",
+            "transformer.h.{bid}.mixer.out_proj",
+            "model.layers.layers.{bid}.self_attn.o_proj",
+            "model.layers.{bid}.attention.wo",
+            "encoder.layers.{bid}.attn.out_proj",
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",
+            "encoder.layers.{bid}.self_attention.dense",
+            "transformer.layers.{bid}.attn.out_proj",
+            "transformer.h.{bid}.attn.attention.out_proj",
+        ),
+
         MODEL_TENSOR.ATTN_OUT_NORM: (
-            "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
-            "encoder.layers.{bid}.norm1",  # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
+            "encoder.layer.{bid}.attention.output.LayerNorm",
+            "encoder.layers.{bid}.norm1",
+            "transformer.decoder_layer.{bid}.rms_norm_1",
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",
         ),
+
         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",  # gemma2
+            "model.layers.{bid}.post_attention_layernorm",
         ),
-        # Rotary embeddings
+
         MODEL_TENSOR.ATTN_ROT_EMBD: (
-            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
-            "layers.{bid}.attention.inner_attention.rope.freqs",  # llama-pth
-            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",  # plamo
-            "transformer.h.{bid}.attn.rotary_emb.inv_freq",  # codeshell
+            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",
+            "layers.{bid}.attention.inner_attention.rope.freqs",
+            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",
+            "transformer.h.{bid}.attn.rotary_emb.inv_freq",
        ),
-        # Feed-forward norm
+
         MODEL_TENSOR.FFN_NORM: (
-            "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
-            "transformer.h.{bid}.ln_2",  # gpt2 refact qwen jais
-            "h.{bid}.post_attention_layernorm",  # bloom
-            "transformer.blocks.{bid}.norm_2",  # mpt
-            "model.layers.{bid}.post_attention_layernorm",  # llama-hf
-            "layers.{bid}.ffn_norm",  # llama-pth
-            "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
-            "model.layers.{bid}.ln2",  # yi
-            "h.{bid}.ln_2",  # gpt2
-            "model.layers.{bid}.ffn_norm",  # internlm2
-            "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
-            "encoder.layers.{bid}.post_attention_layernorm",  # chatglm
-            "transformer.layers.{bid}.ffn_norm",  # openelm
-        ),
-        # Post feed-forward norm
+            "gpt_neox.layers.{bid}.post_attention_layernorm",
+            "transformer.h.{bid}.ln_2",
+            "h.{bid}.post_attention_layernorm",
+            "transformer.blocks.{bid}.norm_2",
+            "model.layers.{bid}.post_attention_layernorm",
+            "layers.{bid}.ffn_norm",
+            "language_model.encoder.layers.{bid}.post_attention_layernorm",
+            "model.layers.{bid}.ln2",
+            "h.{bid}.ln_2",
+            "model.layers.{bid}.ffn_norm",
+            "transformer.decoder_layer.{bid}.rms_norm_2",
+            "encoder.layers.{bid}.post_attention_layernorm",
+            "transformer.layers.{bid}.ffn_norm",
+        ),
+
         MODEL_TENSOR.FFN_PRE_NORM: (
-            "model.layers.{bid}.pre_feedforward_layernorm",  # gemma2
+            "model.layers.{bid}.pre_feedforward_layernorm",
         ),
-        # Post feed-forward norm
+
         MODEL_TENSOR.FFN_POST_NORM: (
-            "model.layers.{bid}.post_feedforward_layernorm",  # gemma2
+            "model.layers.{bid}.post_feedforward_layernorm",
         ),
+
         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",  # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
-            "model.layers.{bid}.mlp.gate",  # qwen2moe
-            "transformer.decoder_layer.{bid}.router",  # Grok
-            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+            "layers.{bid}.feed_forward.gate",
+            "model.layers.{bid}.block_sparse_moe.gate",
+            "model.layers.{bid}.mlp.gate",
+            "transformer.decoder_layer.{bid}.router",
+            "transformer.blocks.{bid}.ffn.router.layer",
         ),
+
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert_gate",  # qwen2moe
+            "model.layers.{bid}.mlp.shared_expert_gate",
         ),
-        # Feed-forward up
+
         MODEL_TENSOR.FFN_UP: (
-            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",  # gptneox
-            "transformer.h.{bid}.mlp.c_fc",  # gpt2 jais
-            "transformer.blocks.{bid}.ffn.up_proj",  # mpt
-            "transformer.h.{bid}.mlp.dense_h_to_4h",  # falcon
-            "h.{bid}.mlp.dense_h_to_4h",  # bloom
-            "model.layers.{bid}.mlp.up_proj",  # llama-hf refact
-            "layers.{bid}.feed_forward.w3",  # llama-pth
-            "encoder.layer.{bid}.intermediate.dense",  # bert
-            "transformer.h.{bid}.mlp.fc_in",  # gpt-j
-            "transformer.h.{bid}.mlp.linear_3",  # refact
-            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
-            "model.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
-            "transformer.h.{bid}.mlp.w1",  # qwen
-            "h.{bid}.mlp.c_fc",  # gpt2
-            "transformer.h.{bid}.mlp.fc1",  # phi2
-            "model.layers.{bid}.mlp.fc1",  # phi2
-            "model.layers.{bid}.mlp.gate_up_proj",  # phi3
-            "model.layers.layers.{bid}.mlp.up_proj",  # plamo
-            "model.layers.{bid}.feed_forward.w3",  # internlm2
-            "encoder.layers.{bid}.mlp.fc11",  # nomic-bert
-            "model.layers.{bid}.mlp.c_fc",  # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v",  # jina-bert-v2
-            "model.layers.{bid}.residual_mlp.w3",  # arctic
-            "encoder.layers.{bid}.mlp.dense_h_to_4h",  # chatglm
+            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",
+            "transformer.h.{bid}.mlp.c_fc",
+            "transformer.blocks.{bid}.ffn.up_proj",
+            "transformer.h.{bid}.mlp.dense_h_to_4h",
+            "h.{bid}.mlp.dense_h_to_4h",
+            "model.layers.{bid}.mlp.up_proj",
+            "layers.{bid}.feed_forward.w3",
+            "encoder.layer.{bid}.intermediate.dense",
+            "transformer.h.{bid}.mlp.fc_in",
+            "transformer.h.{bid}.mlp.linear_3",
+            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",
+            "model.layers.{bid}.mlp.dense_h_to_4h",
+            "transformer.h.{bid}.mlp.w1",
+            "h.{bid}.mlp.c_fc",
+            "transformer.h.{bid}.mlp.fc1",
+            "model.layers.{bid}.mlp.fc1",
+            "model.layers.{bid}.mlp.gate_up_proj",
+            "model.layers.layers.{bid}.mlp.up_proj",
+            "model.layers.{bid}.feed_forward.w3",
+            "encoder.layers.{bid}.mlp.fc11",
+            "model.layers.{bid}.mlp.c_fc",
+            "encoder.layer.{bid}.mlp.gated_layers_v",
+            "model.layers.{bid}.residual_mlp.w3",
+            "encoder.layers.{bid}.mlp.dense_h_to_4h",
+            "transformer.h.{bid}.mlp.c_fc_1",
         ),
+
         MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.w3",  # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj",  # qwen2moe (merged)
+            "layers.{bid}.feed_forward.experts.w3",
+            "transformer.decoder_layer.{bid}.moe.linear_v",
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",
+            "model.layers.{bid}.mlp.experts.up_proj",
        ),
+
         MODEL_TENSOR.FFN_UP_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.up_proj",  # deepseek2
+            "model.layers.{bid}.mlp.shared_expert.up_proj",
+            "model.layers.{bid}.mlp.shared_experts.up_proj",
+        ),
+
+        MODEL_TENSOR.FFN_ACT: (
+            "transformer.blocks.{bid}.ffn.act",
         ),
-        # AWQ-activation gate
-        MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",),  # mpt
-        # Feed-forward gate
+
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
-            "layers.{bid}.feed_forward.w1",  # llama-pth
-            "transformer.h.{bid}.mlp.w2",  # qwen
-            "transformer.h.{bid}.mlp.c_fc2",  # jais
-            "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
-            "model.layers.{bid}.feed_forward.w1",  # internlm2
-            "encoder.layers.{bid}.mlp.fc12",  # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w",  # jina-bert-v2
-            "transformer.h.{bid}.mlp.linear_1",  # refact
-            "model.layers.{bid}.residual_mlp.w1",  # arctic
+            "model.layers.{bid}.mlp.gate_proj",
+            "layers.{bid}.feed_forward.w1",
+            "transformer.h.{bid}.mlp.w2",
+            "transformer.h.{bid}.mlp.c_fc2",
+            "model.layers.layers.{bid}.mlp.gate_proj",
+            "model.layers.{bid}.feed_forward.w1",
+            "encoder.layers.{bid}.mlp.fc12",
+            "encoder.layer.{bid}.mlp.gated_layers_w",
+            "transformer.h.{bid}.mlp.linear_1",
+            "model.layers.{bid}.residual_mlp.w1",
+            "transformer.h.{bid}.mlp.c_fc_0",
         ),
+
         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",  # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",  # qwen2moe (merged)
+            "layers.{bid}.feed_forward.experts.w1",
+            "transformer.decoder_layer.{bid}.moe.linear",
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",
+            "model.layers.{bid}.mlp.experts.gate_proj",
         ),
+
         MODEL_TENSOR.FFN_GATE_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.gate_proj",  # deepseek2
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",
+            "model.layers.{bid}.mlp.shared_experts.gate_proj",
         ),
-        # Feed-forward down
+
         MODEL_TENSOR.FFN_DOWN: (
-            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",  # gptneox
-            "transformer.h.{bid}.mlp.c_proj",  # gpt2 refact qwen jais
-            "transformer.blocks.{bid}.ffn.down_proj",  # mpt
-            "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
-            "h.{bid}.mlp.dense_4h_to_h",  # bloom
-            "model.layers.{bid}.mlp.down_proj",  # llama-hf
-            "layers.{bid}.feed_forward.w2",  # llama-pth
-            "encoder.layer.{bid}.output.dense",  # bert
-            "transformer.h.{bid}.mlp.fc_out",  # gpt-j
-            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
-            "model.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
-            "h.{bid}.mlp.c_proj",  # gpt2
-            "transformer.h.{bid}.mlp.fc2",  # phi2
-            "model.layers.{bid}.mlp.fc2",  # phi2
-            "model.layers.layers.{bid}.mlp.down_proj",  # plamo
-            "model.layers.{bid}.feed_forward.w2",  # internlm2
-            "encoder.layers.{bid}.mlp.fc2",  # nomic-bert
-            "model.layers.{bid}.mlp.c_proj",  # starcoder2
-            "encoder.layer.{bid}.mlp.wo",  # jina-bert-v2
-            "transformer.layers.{bid}.ffn.proj_2",  # openelm
-            "model.layers.{bid}.residual_mlp.w2",  # arctic
-            "encoder.layer.{bid}.mlp.down_layer",  # jina-bert-v2
-            "encoder.layers.{bid}.mlp.dense_4h_to_h",  # chatglm
+            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",
+            "transformer.h.{bid}.mlp.c_proj",
+            "transformer.blocks.{bid}.ffn.down_proj",
+            "transformer.h.{bid}.mlp.dense_4h_to_h",
+            "h.{bid}.mlp.dense_4h_to_h",
+            "model.layers.{bid}.mlp.down_proj",
+            "layers.{bid}.feed_forward.w2",
+            "encoder.layer.{bid}.output.dense",
+            "transformer.h.{bid}.mlp.fc_out",
+            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",
+            "model.layers.{bid}.mlp.dense_4h_to_h",
+            "h.{bid}.mlp.c_proj",
+            "transformer.h.{bid}.mlp.fc2",
+            "model.layers.{bid}.mlp.fc2",
+            "model.layers.layers.{bid}.mlp.down_proj",
+            "model.layers.{bid}.feed_forward.w2",
+            "encoder.layers.{bid}.mlp.fc2",
+            "model.layers.{bid}.mlp.c_proj",
+            "encoder.layer.{bid}.mlp.wo",
+            "transformer.layers.{bid}.ffn.proj_2",
+            "model.layers.{bid}.residual_mlp.w2",
+            "encoder.layer.{bid}.mlp.down_layer",
+            "encoder.layers.{bid}.mlp.dense_4h_to_h",
+            "model.layers.h.{bid}.mlp.c_proj",
         ),
+
         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",  # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",  # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",  # qwen2moe (merged)
+            "layers.{bid}.feed_forward.experts.w2",
+            "transformer.decoder_layer.{bid}.moe.linear_1",
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",
+            "model.layers.{bid}.mlp.experts.down_proj",
         ),
+
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.down_proj",  # deepseek2
+            "model.layers.{bid}.mlp.shared_expert.down_proj",
+            "model.layers.{bid}.mlp.shared_experts.down_proj",
         ),
+
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
-            "model.layers.{bid}.self_attn.q_layernorm",  # persimmon
-            "model.layers.{bid}.self_attn.q_norm",  # cohere
-            "transformer.blocks.{bid}.attn.q_ln",  # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_q",  # jina-bert-v2
-
"transformer.layers.{bid}.attn.q_norm", # openelm + "model.layers.{bid}.self_attn.q_layernorm", + "model.layers.{bid}.self_attn.q_norm", + "transformer.blocks.{bid}.attn.q_ln", + "encoder.layer.{bid}.attention.self.layer_norm_q", + "transformer.layers.{bid}.attn.q_norm", ), + MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere - "transformer.blocks.{bid}.attn.k_ln", # sea-lion - "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 - "transformer.layers.{bid}.attn.k_norm", # openelm + "model.layers.{bid}.self_attn.k_layernorm", + "model.layers.{bid}.self_attn.k_norm", + "transformer.blocks.{bid}.attn.k_ln", + "encoder.layer.{bid}.attention.self.layer_norm_k", + "transformer.layers.{bid}.attn.k_norm", ), + MODEL_TENSOR.ROPE_FREQS: ( - "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon + "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", ), + MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok - "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 - "encoder.layer.{bid}.layer_norm_2", # jina-v2-code + "encoder.layer.{bid}.output.LayerNorm", + "encoder.layers.{bid}.norm2", + "transformer.decoder_layer.{bid}.rms_norm_3", + "encoder.layer.{bid}.mlp.layernorm", + "encoder.layer.{bid}.layer_norm_2" ), + MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", ), + MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", ), + MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", ), + MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", ), + MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", ), + MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", ), + MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", ), - MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2 - MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2 + + MODEL_TENSOR.ATTN_Q_A: ( + "model.layers.{bid}.self_attn.q_a_proj", + ), + + MODEL_TENSOR.ATTN_Q_B: ( + "model.layers.{bid}.self_attn.q_b_proj", + ), + MODEL_TENSOR.ATTN_KV_A_MQA: ( - "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2 + "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", ), + MODEL_TENSOR.ATTN_KV_B: ( - "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 + "model.layers.{bid}.self_attn.kv_b_proj", ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( - "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 + "model.layers.{bid}.self_attn.q_a_layernorm", ), + MODEL_TENSOR.ATTN_KV_A_NORM: ( - "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2 + "model.layers.{bid}.self_attn.kv_a_layernorm", ), + MODEL_TENSOR.ATTN_SUB_NORM: ( - "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet + "model.layers.{bid}.self_attn.inner_attn_ln", + ), + + MODEL_TENSOR.FFN_SUB_NORM: ( + "model.layers.{bid}.mlp.ffn_layernorm", + ), + + MODEL_TENSOR.DEC_ATTN_NORM: ( + "decoder.block.{bid}.layer.0.layer_norm", ), - MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), # bitnet - MODEL_TENSOR.DEC_ATTN_NORM: 
("decoder.block.{bid}.layer.0.layer_norm",), # t5 - MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",), # t5 - MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), # t5 - MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), # t5 + + MODEL_TENSOR.DEC_ATTN_Q: ( + "decoder.block.{bid}.layer.0.SelfAttention.q", + ), + + MODEL_TENSOR.DEC_ATTN_K: ( + "decoder.block.{bid}.layer.0.SelfAttention.k", + ), + + MODEL_TENSOR.DEC_ATTN_V: ( + "decoder.block.{bid}.layer.0.SelfAttention.v", + ), + MODEL_TENSOR.DEC_ATTN_OUT: ( - "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.o", ), + MODEL_TENSOR.DEC_ATTN_REL_B: ( - "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", ), + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( - "decoder.block.{bid}.layer.1.layer_norm", # t5 + "decoder.block.{bid}.layer.1.layer_norm", ), + MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( - "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.q", ), + MODEL_TENSOR.DEC_CROSS_ATTN_K: ( - "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.k", ), + MODEL_TENSOR.DEC_CROSS_ATTN_V: ( - "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.v", ), + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( - "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.o", ), + MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( - "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5 + "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", + ), + + MODEL_TENSOR.DEC_FFN_NORM: ( + "decoder.block.{bid}.layer.2.layer_norm", ), - MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), # t5 + MODEL_TENSOR.DEC_FFN_GATE: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", ), + MODEL_TENSOR.DEC_FFN_UP: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 - "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi", + "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", ), + MODEL_TENSOR.DEC_FFN_DOWN: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wo", + ), + + MODEL_TENSOR.DEC_OUTPUT_NORM: ( + "decoder.final_layer_norm", + ), + + MODEL_TENSOR.ENC_ATTN_NORM: ( + "encoder.block.{bid}.layer.0.layer_norm", + ), + + MODEL_TENSOR.ENC_ATTN_Q: ( + "encoder.block.{bid}.layer.0.SelfAttention.q", + ), + + MODEL_TENSOR.ENC_ATTN_K: ( + "encoder.block.{bid}.layer.0.SelfAttention.k", + ), + + MODEL_TENSOR.ENC_ATTN_V: ( + "encoder.block.{bid}.layer.0.SelfAttention.v", ), - MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), # t5 - MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), # t5 - MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), # t5 - MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), # t5 - MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), # t5 + MODEL_TENSOR.ENC_ATTN_OUT: ( - "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 + "encoder.block.{bid}.layer.0.SelfAttention.o", ), + MODEL_TENSOR.ENC_ATTN_REL_B: ( - "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + 
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + ), + + MODEL_TENSOR.ENC_FFN_NORM: ( + "encoder.block.{bid}.layer.1.layer_norm", ), - MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), # t5 + MODEL_TENSOR.ENC_FFN_GATE: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", ), + MODEL_TENSOR.ENC_FFN_UP: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 - "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi", + "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", ), + MODEL_TENSOR.ENC_FFN_DOWN: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wo", + ), + + MODEL_TENSOR.ENC_OUTPUT_NORM: ( + "encoder.final_layer_norm", ), - MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5 } - # architecture-specific block mappings arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { MODEL_ARCH.ARCTIC: { - MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",), - MODEL_TENSOR.FFN_NORM_EXP: ("model.layers.{bid}.post_attention_layernorm",), + MODEL_TENSOR.FFN_NORM: ( + "model.layers.{bid}.residual_layernorm", + ), + MODEL_TENSOR.FFN_NORM_EXP: ( + "model.layers.{bid}.post_attention_layernorm", + ), }, } @@ -490,35 +588,31 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): if tensor not in MODEL_TENSORS[arch]: continue - tensor_name = TENSOR_NAMES[tensor].format(bid=bid) + tensor_name = TENSOR_NAMES[tensor].format(bid = bid) self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: - key = key.format(bid=bid) + key = key.format(bid = bid) self.mapping[key] = (tensor, tensor_name) - def get_type_and_name( - self, key: str, try_suffixes: Sequence[str] = () - ) -> tuple[MODEL_TENSOR, str] | None: + def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result for suffix in try_suffixes: if key.endswith(suffix): - result = self.mapping.get(key[: -len(suffix)]) + result = self.mapping.get(key[:-len(suffix)]) if result is not None: return result[0], result[1] + suffix return None def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: - result = self.get_type_and_name(key, try_suffixes=try_suffixes) + result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[1] - def get_type( - self, key: str, try_suffixes: Sequence[str] = () - ) -> MODEL_TENSOR | None: - result = self.get_type_and_name(key, try_suffixes=try_suffixes) + def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[0] @@ -535,6 +629,5 @@ def __contains__(self, key: str) -> bool: def __repr__(self) -> str: return repr(self.mapping) - def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: - return TensorNameMap(arch, n_blocks) + return TensorNameMap(arch, n_blocks) \ No newline at end of file