diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py
index 0a0215c..c1d2f97 100644
--- a/src/convert_hf_to_gguf.py
+++ b/src/convert_hf_to_gguf.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -165,14 +166,14 @@ def set_vocab(self):
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if len(self.part_names) > 1:
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
             self.tensor_names = set()
-            index_name = (
-                "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            )
-            index_name += ".index.json"
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -180,6 +181,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -217,17 +219,20 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                     data = LazyTorchTensor.from_eager(data)
                 yield name, data
 
-        if (
-            len(
-                sym_diff := tensor_names_from_parts.symmetric_difference(
-                    self.tensor_names
-                )
-            )
-            > 0
-        ):
-            raise ValueError(
-                f"Mismatch between weight map and model parts for tensor names: {sym_diff}"
-            )
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(
+                set(weight_map[n] for n in missing if n in weight_map)
+            )
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError(
+                    "Mismatch between weight map and model parts for tensor names:\n"
+                    f"Missing tensors: {missing}\n"
+                    f"Extra tensors: {extra}"
+                )
 
     def format_tensor_name(
         self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight"
@@ -383,12 +388,31 @@ def prepare_tensors(self):
                         gguf.MODEL_TENSOR.POS_EMBD,
                         gguf.MODEL_TENSOR.TOKEN_TYPES,
                         gguf.MODEL_TENSOR.SSM_CONV1D,
+                        gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                        gguf.MODEL_TENSOR.TIME_MIX_W1,
+                        gguf.MODEL_TENSOR.TIME_MIX_W2,
+                        gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                        gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                     )
                 )
-                or not name.endswith(".weight")
+                or not new_name.endswith(".weight")
             ):
                 data_qtype = gguf.GGMLQuantizationType.F32
 
+            if data_qtype is False and any(
+                self.match_model_tensor_name(new_name, key, bid)
+                for key in (
+                    gguf.MODEL_TENSOR.TOKEN_EMBD,
+                    gguf.MODEL_TENSOR.OUTPUT,
+                )
+            ):
+                if self.ftype in (
+                    gguf.LlamaFileType.MOSTLY_TQ1_0,
+                    gguf.LlamaFileType.MOSTLY_TQ2_0,
+                ):
+                    # token embeddings and output weights stay in F16 for the ternary types
+                    data_qtype = gguf.GGMLQuantizationType.F16
+
             if isinstance(data_qtype, bool):
                 if self.ftype == gguf.LlamaFileType.ALL_F32:
                     data_qtype = gguf.GGMLQuantizationType.F32
@@ -398,6 +422,10 @@ def prepare_tensors(self):
                     data_qtype = gguf.GGMLQuantizationType.BF16
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                    data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                    data_qtype = gguf.GGMLQuantizationType.TQ2_0
                 else:
                     raise ValueError(f"Unknown file type: {self.ftype.name}")
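The elif chain above maps the requested file type to a concrete quantization type; for reference, the same dispatch in dictionary form, as a minimal standalone sketch (the helper name is hypothetical, not part of this diff):

```python
import gguf

# Hypothetical helper mirroring the elif chain above (illustration only).
FTYPE_TO_QTYPE = {
    gguf.LlamaFileType.ALL_F32: gguf.GGMLQuantizationType.F32,
    gguf.LlamaFileType.MOSTLY_F16: gguf.GGMLQuantizationType.F16,
    gguf.LlamaFileType.MOSTLY_BF16: gguf.GGMLQuantizationType.BF16,
    gguf.LlamaFileType.MOSTLY_Q8_0: gguf.GGMLQuantizationType.Q8_0,
    gguf.LlamaFileType.MOSTLY_TQ1_0: gguf.GGMLQuantizationType.TQ1_0,
    gguf.LlamaFileType.MOSTLY_TQ2_0: gguf.GGMLQuantizationType.TQ2_0,
}

def default_qtype(ftype: gguf.LlamaFileType) -> gguf.GGMLQuantizationType:
    try:
        return FTYPE_TO_QTYPE[ftype]
    except KeyError:
        raise ValueError(f"Unknown file type: {ftype.name}") from None
```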
@@ -417,7 +445,7 @@ def prepare_tensors(self):
             shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
             logger.info(
-                f"{f'%s-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
+                f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
             )
 
             self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
@@ -704,6 +732,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
 
         if res is None:
             logger.warning("\n")
@@ -1130,6 +1161,7 @@ def set_vocab(self):
         try:
             self._set_vocab_gpt2()
         except Exception:
+            self._set_vocab_sentencepiece()
             self.gguf_writer.add_add_bos_token(False)
             self.gguf_writer.add_pad_token_id(3)
@@ -1710,7 +1742,9 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register(
+    "LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM"
+)
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1891,15 +1925,14 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-        s = 1 / weight.abs().mean().clamp(min=1e-5)
-        weight = (weight * s).round().clamp(-1, 1) / s
-        scale = weight.abs().max().unsqueeze(0)
-        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
-        weight = torch.sign(weight).type(dtype)
-        return weight.type(dtype), scale.type(torch.float32)
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # round-trip through the inverse scale to snap weights onto {-1, 0, 1} * scale
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(
         self, data_torch: Tensor, name: str, bid: int | None
     ) -> Iterable[tuple[str, Tensor]]:
@@ -1919,11 +1952,9 @@ def modify_tensors(
             ]
         ):
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            # ternary tensors are quantized in place; all others pass through unchanged
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
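A minimal round-trip sketch of the ternary quantization performed by weight_quant above (plain PyTorch; the final assertion is illustrative):

```python
import torch

def weight_quant(weight: torch.Tensor) -> torch.Tensor:
    # snap weights onto {-1, 0, +1} * scale, where scale is the mean magnitude
    dtype = weight.dtype
    weight = weight.float()
    scale = weight.abs().mean().clamp(min=1e-5)
    iscale = 1 / scale
    return ((weight * iscale).round().clamp(-1, 1) / iscale).type(dtype)

w = torch.randn(4, 8)
q = weight_quant(w)
# every quantized value is one of {-scale, 0, +scale}
scale = w.abs().mean().clamp(min=1e-5)
assert set(torch.unique(q / scale).round().tolist()) <= {-1.0, 0.0, 1.0}
```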
hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length( + hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"] + ) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + rope_scaling = self.find_hparam(["rope_scaling"], True) + if rope_scaling is None: + return + + long_factors = rope_scaling.get("long_factor", None) + short_factors = rope_scaling.get("short_factor", None) + + if long_factors is None or short_factors is None: + raise KeyError( + "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" + ) + + if ( + len(long_factors) != len(short_factors) + or len(long_factors) != rope_dims / 2 + ): + raise ValueError( + f"The length of rope long and short factors must be {rope_dims / 2}" + ) + + self.gguf_writer.add_tensor( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", + np.array(long_factors, dtype=np.float32), + ) + self.gguf_writer.add_tensor( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", + np.array(short_factors, dtype=np.float32), + ) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute( + self, weights: Tensor, n_head: int, n_kv_head: int | None = None + ) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN @@ -3087,6 +3191,100 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 +@Model.register("Rwkv6ForCausalLM") +class Rwkv6Model(Model): + model_arch = gguf.MODEL_ARCH.RWKV6 + + def set_vocab(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = ["".encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open( + self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8" + ) as f: + lines = f.readlines() + for line in lines: + parts = line.split(" ") + assert len(parts) >= 3 + token, token_len = ast.literal_eval(" ".join(parts[1:-1])), int( + parts[-1] + ) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_size = self.hparams["head_size"] + hidden_size = self.hparams["hidden_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + rescale_every_n_layers = self.hparams["rescale_every"] + intermediate_size = ( + self.hparams["intermediate_size"] + if self.hparams["intermediate_size"] is not None + else int((hidden_size * 3.5) // 32 * 32) + ) + time_mix_extra_dim = 64 if 
@@ -3087,6 +3191,100 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ["".encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(
+            self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8"
+        ) as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(" ")
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(" ".join(parts[1:-1])), int(
+                    parts[-1]
+                )
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = (
+            self.hparams["intermediate_size"]
+            if self.hparams["intermediate_size"] is not None
+            else int((hidden_size * 3.5) // 32 * 32)
+        )
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # head_count is required by the loader but unused for RWKV
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if (
+            new_name.endswith("time_mix_w1.weight")
+            or new_name.endswith("time_mix_decay_w1.weight")
+            or new_name.endswith("time_mix_decay_w2.weight")
+        ):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith(
+                "channel_mix_value.weight"
+            ):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3216,6 +3414,65 @@ def modify_tensors(
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
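The RWKV vocab file stores one token per line as `<id> <python-literal token> <byte length>`; a small sketch of the parse step used above, fed a hypothetical line:

```python
import ast

# Hypothetical vocab line: id, Python-literal token, byte length
line = "33 '!' 1"
parts = line.split(" ")
token = ast.literal_eval(" ".join(parts[1:-1]))  # -> "!"
token = token.encode("utf-8") if isinstance(token, str) else token
assert isinstance(token, bytes) and len(token) == int(parts[-1])
```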
@@ -4122,7 +4379,7 @@ def set_vocab(self):
             if len(token) == 1:
                 continue
             merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert 2 <= len(merged) <= 7
+            assert len(merged) >= 2 and len(merged) <= 7
             merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged)))
 
         added_vocab = tokenizer.get_added_vocab()
@@ -4334,6 +4591,152 @@ def prepare_tensors(self):
         super().prepare_tensors()
 
 
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+            - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+        if logits_scaling := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scaling)
+
+
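For context, the four multipliers read above come straight from the HF config; a sketch of the matching writer calls with hypothetical values (the inline comments describe where each scale acts in the reference model, which is background knowledge rather than something stated in this diff):

```python
import gguf

# Hypothetical Granite config values
cfg = {
    "attention_multiplier": 0.0078125,  # used in place of 1/sqrt(head_dim)
    "embedding_multiplier": 12.0,       # token embeddings are scaled by this
    "residual_multiplier": 0.22,        # each residual branch is scaled by this
    "logits_scaling": 16.0,             # final logits are divided by this
}

writer = gguf.GGUFWriter("granite-example.gguf", "granite")
writer.add_attention_scale(cfg["attention_multiplier"])
writer.add_embedding_scale(cfg["embedding_multiplier"])
writer.add_residual_scale(cfg["residual_multiplier"])
writer.add_logit_scale(cfg["logits_scaling"])
```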
name.replace(".moe.", ".feed_forward.") + if bid is not None: + moe_offset = self.hparams["expert_layer_offset"] + moe_period = self.hparams["expert_layer_period"] + + if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): + name = name.replace(".experts.0.", ".") + + if ".feed_forward.experts." in name: + n_experts = self.hparams["num_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + + for wid in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield new_name, data_torch + return + + new_name = self.map_tensor_name(name) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + yield new_name, data_torch + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor @@ -4399,26 +4802,67 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument("--vocab-only", action="store_true") - parser.add_argument("--outfile", type=Path) + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file" + ) + parser.add_argument( + "--vocab-only", + action="store_true", + ) + parser.add_argument( + "--outfile", + type=Path, + ) parser.add_argument( "--outtype", type=str, - choices=["f32", "f16", "bf16", "q8_0", "auto"], + choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", ) - parser.add_argument("--bigendian", action="store_true") - parser.add_argument("model", type=Path) + parser.add_argument( + "--bigendian", + action="store_true", + ) + parser.add_argument( + "model", + type=Path, + ) parser.add_argument("--use-temp-file", action="store_true") - parser.add_argument("--no-lazy", action="store_true") - parser.add_argument("--model-name", type=str, default=None) - parser.add_argument("--verbose", action="store_true") - parser.add_argument("--split-max-tensors", type=int, default=0) - parser.add_argument("--split-max-size", type=str, default="0") - parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--no-tensor-first-split", action="store_true") - parser.add_argument("--metadata", type=Path) + parser.add_argument( + "--no-lazy", + action="store_true", + ) + parser.add_argument( + "--model-name", + type=str, + default=None, + ) + parser.add_argument( + "--verbose", + action="store_true", + ) + parser.add_argument( + "--split-max-tensors", + type=int, + default=0, + ) + parser.add_argument( + "--split-max-size", + type=str, + default="0", + ) + parser.add_argument( + "--dry-run", + action="store_true", + ) + parser.add_argument( + "--no-tensor-first-split", + action="store_true", + ) + parser.add_argument( + "--metadata", + type=Path, + ) return parser.parse_args() @@ -4462,6 +4906,8 @@ 
@@ -4399,26 +4802,67 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 
 
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--vocab-only", action="store_true")
-    parser.add_argument("--outfile", type=Path)
+    parser = argparse.ArgumentParser(
+        description="Convert a huggingface model to a GGML compatible file"
+    )
+    parser.add_argument(
+        "--vocab-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--outfile",
+        type=Path,
+    )
     parser.add_argument(
         "--outtype",
         type=str,
-        choices=["f32", "f16", "bf16", "q8_0", "auto"],
+        choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"],
         default="f16",
     )
-    parser.add_argument("--bigendian", action="store_true")
-    parser.add_argument("model", type=Path)
+    parser.add_argument(
+        "--bigendian",
+        action="store_true",
+    )
+    parser.add_argument(
+        "model",
+        type=Path,
+    )
     parser.add_argument("--use-temp-file", action="store_true")
-    parser.add_argument("--no-lazy", action="store_true")
-    parser.add_argument("--model-name", type=str, default=None)
-    parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--split-max-tensors", type=int, default=0)
-    parser.add_argument("--split-max-size", type=str, default="0")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--no-tensor-first-split", action="store_true")
-    parser.add_argument("--metadata", type=Path)
+    parser.add_argument(
+        "--no-lazy",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default=None,
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--split-max-tensors",
+        type=int,
+        default=0,
+    )
+    parser.add_argument(
+        "--split-max-size",
+        type=str,
+        default="0",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--metadata",
+        type=Path,
+    )
 
     return parser.parse_args()
 
@@ -4462,6 +4906,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
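End to end, the two new --outtype choices reach the writer through the ftype map in main; a quick sanity check plus a hypothetical invocation:

```python
import gguf

# the new CLI values resolve to the new file types added in constants.py below
assert gguf.LlamaFileType.MOSTLY_TQ1_0.value == 36
assert gguf.LlamaFileType.MOSTLY_TQ2_0.value == 37

# hypothetical invocation once this diff is applied:
#   python src/convert_hf_to_gguf.py /models/some-ternary-model --outtype tq1_0
```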
diff --git a/src/gguf/constants.py b/src/gguf/constants.py
index 51ea4e4..fe7c905 100644
--- a/src/gguf/constants.py
+++ b/src/gguf/constants.py
@@ -78,6 +78,11 @@ class LLM:
         DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
         ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
         FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
+        RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
+        TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
+        TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
+        RESIDUAL_SCALE = "{arch}.residual_scale"
+        EMBEDDING_SCALE = "{arch}.embedding_scale"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -93,6 +98,7 @@ class Attention:
         KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
         SLIDING_WINDOW = "{arch}.attention.sliding_window"
+        SCALE = "{arch}.attention.scale"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -114,6 +120,10 @@ class SSM:
         INNER_SIZE = "{arch}.ssm.inner_size"
         STATE_SIZE = "{arch}.ssm.state_size"
         TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
+        DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
+
+    class WKV:
+        HEAD_SIZE = "{arch}.wkv.head_size"
 
     class Tokenizer:
         MODEL = "tokenizer.ggml.model"
@@ -183,14 +193,18 @@ class MODEL_ARCH(IntEnum):
     ORION = auto()
     INTERNLM2 = auto()
     MINICPM = auto()
+    MINICPM3 = auto()
     GEMMA = auto()
     GEMMA2 = auto()
     STARCODER2 = auto()
+    RWKV6 = auto()
     MAMBA = auto()
+    JAMBA = auto()
     XVERSE = auto()
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    OLMOE = auto()
     OPENELM = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
@@ -201,6 +215,7 @@ class MODEL_ARCH(IntEnum):
     JAIS = auto()
     NEMOTRON = auto()
     EXAONE = auto()
+    GRANITE = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -246,9 +261,35 @@ class MODEL_TENSOR(IntEnum):
     SSM_CONV1D = auto()
     SSM_X = auto()
     SSM_DT = auto()
+    SSM_DT_NORM = auto()
     SSM_A = auto()
+    SSM_B_NORM = auto()
+    SSM_C_NORM = auto()
     SSM_D = auto()
     SSM_OUT = auto()
+    TIME_MIX_W1 = auto()
+    TIME_MIX_W2 = auto()
+    TIME_MIX_LERP_X = auto()
+    TIME_MIX_LERP_K = auto()
+    TIME_MIX_LERP_V = auto()
+    TIME_MIX_LERP_R = auto()
+    TIME_MIX_LERP_G = auto()
+    TIME_MIX_LERP_W = auto()
+    TIME_MIX_FIRST = auto()
+    TIME_MIX_DECAY = auto()
+    TIME_MIX_DECAY_W1 = auto()
+    TIME_MIX_DECAY_W2 = auto()
+    TIME_MIX_KEY = auto()
+    TIME_MIX_VALUE = auto()
+    TIME_MIX_RECEPTANCE = auto()
+    TIME_MIX_GATE = auto()
+    TIME_MIX_LN = auto()
+    TIME_MIX_OUTPUT = auto()
+    CHANNEL_MIX_LERP_K = auto()
+    CHANNEL_MIX_LERP_R = auto()
+    CHANNEL_MIX_KEY = auto()
+    CHANNEL_MIX_RECEPTANCE = auto()
+    CHANNEL_MIX_VALUE = auto()
     ATTN_Q_A = auto()
     ATTN_Q_B = auto()
     ATTN_KV_A_MQA = auto()
@@ -313,14 +354,18 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ORION: "orion",
     MODEL_ARCH.INTERNLM2: "internlm2",
     MODEL_ARCH.MINICPM: "minicpm",
+    MODEL_ARCH.MINICPM3: "minicpm3",
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.STARCODER2: "starcoder2",
+    MODEL_ARCH.RWKV6: "rwkv6",
     MODEL_ARCH.MAMBA: "mamba",
+    MODEL_ARCH.JAMBA: "jamba",
     MODEL_ARCH.XVERSE: "xverse",
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.OLMOE: "olmoe",
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
@@ -331,6 +376,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.JAIS: "jais",
     MODEL_ARCH.NEMOTRON: "nemotron",
     MODEL_ARCH.EXAONE: "exaone",
+    MODEL_ARCH.GRANITE: "granite",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -376,9 +422,35 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
     MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm",
     MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm",
+    MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm",
     MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
     MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
+    MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
+    MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
+    MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
+    MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
+    MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
+    MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
+    MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
+    MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
+    MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
+    MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
+    MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
+    MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
+    MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
+    MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
+    MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
+    MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
+    MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
+    MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
+    MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
+    MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
+    MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
+    MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
+    MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
     MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
     MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
     MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
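Each TENSOR_NAMES value is a format string keyed by block id; a quick check of one of the new RWKV entries:

```python
import gguf

# the converter fills in the block id to get the on-disk tensor name
name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TIME_MIX_W1].format(bid=0)
assert name == "blk.0.time_mix_w1"
```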
@@ -792,6 +864,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.MINICPM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GEMMA: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -835,6 +924,37 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.RWKV6: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_LERP_X,
+        MODEL_TENSOR.TIME_MIX_LERP_K,
+        MODEL_TENSOR.TIME_MIX_LERP_V,
+        MODEL_TENSOR.TIME_MIX_LERP_R,
+        MODEL_TENSOR.TIME_MIX_LERP_G,
+        MODEL_TENSOR.TIME_MIX_LERP_W,
+        MODEL_TENSOR.TIME_MIX_FIRST,
+        MODEL_TENSOR.TIME_MIX_DECAY,
+        MODEL_TENSOR.TIME_MIX_DECAY_W1,
+        MODEL_TENSOR.TIME_MIX_DECAY_W2,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_GATE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_K,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_R,
+        MODEL_TENSOR.CHANNEL_MIX_KEY,
+        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
+        MODEL_TENSOR.CHANNEL_MIX_VALUE,
+    ],
     MODEL_ARCH.MAMBA: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -848,6 +968,34 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.SSM_D,
         MODEL_TENSOR.SSM_OUT,
     ],
+    MODEL_ARCH.JAMBA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_X,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_DT_NORM,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_B_NORM,
+        MODEL_TENSOR.SSM_C_NORM,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_OUT,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.XVERSE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -902,6 +1050,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.OLMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+    ],
     MODEL_ARCH.OPENELM: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1080,6 +1245,19 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
 }
 
 MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1179,6 +1357,8 @@ class GGMLQuantizationType(IntEnum):
     Q4_0_4_4 = 31
     Q4_0_4_8 = 32
     Q4_0_8_8 = 33
+    TQ1_0 = 34
+    TQ2_0 = 35
 
 
 class LlamaFileType(IntEnum):
@@ -1216,6 +1396,8 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q4_0_4_4 = 33
     MOSTLY_Q4_0_4_8 = 34
     MOSTLY_Q4_0_8_8 = 35
+    MOSTLY_TQ1_0 = 36
+    MOSTLY_TQ2_0 = 37
 
     GUESSED = 1024
 
@@ -1291,6 +1473,8 @@ def get_type(val: Any) -> GGUFValueType:
     GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
     GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
     GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
+    GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
+    GGMLQuantizationType.TQ2_0: (256, 2 + 64),
 }
 
 KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
@@ -1330,6 +1514,7 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
 KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
 KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
+KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
 
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
 KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
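From the two new block-size entries, the ternary types' footprint follows directly; a small sketch, assuming the table above is the package's GGML_QUANT_SIZES and using a hypothetical tensor shape:

```python
import gguf

def tensor_nbytes(n_elements: int, qtype: gguf.GGMLQuantizationType) -> int:
    block_size, type_size = gguf.GGML_QUANT_SIZES[qtype]
    assert n_elements % block_size == 0
    return n_elements // block_size * type_size

n = 4096 * 4096  # hypothetical weight matrix
print(tensor_nbytes(n, gguf.GGMLQuantizationType.TQ1_0))  # 54 bytes / 256 weights, ~1.69 bpw
print(tensor_nbytes(n, gguf.GGMLQuantizationType.TQ2_0))  # 66 bytes / 256 weights, ~2.06 bpw
```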
diff --git a/src/gguf/gguf_writer.py b/src/gguf/gguf_writer.py
index df0c894..a8754cf 100644
--- a/src/gguf/gguf_writer.py
+++ b/src/gguf/gguf_writer.py
@@ -32,7 +32,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
 
 
@@ -136,7 +135,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
                 continue
             elif name.endswith(".lora_b"):
                 if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
-                    # Bail when the LoRA pair can't be found trivially
+
                     logger.warning(
                         "can't measure LoRA size correctly, tensor order is unusual"
                     )
@@ -155,14 +154,11 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
 
             total_params += size
 
-        # Hopefully this should work even for variable-expert-count models
         expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
 
-        # Negate the total to signal it's likely not exact
         if last_lora_a is not None:
             total_params = -total_params
 
-        # NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
         return total_params, shared_params, expert_params, expert_count
 
     def format_shard_names(self, path: Path) -> list[Path]:
@@ -181,7 +177,7 @@ def open_output_file(self, path: Path | None = None) -> None:
             and self.fout is not None
             and (path is None or path == self.path)
         ):
-            # allow calling this multiple times as long as the path is the same
+
             return
 
         if self.state is not WriterState.NO_FILE:
@@ -210,7 +206,7 @@ def print_plan(self) -> list[Path]:
         if self.dry_run:
             logger.info("Dry run, not writing files")
             for name in filenames:
-                print(name)  # noqa: NP100
+                print(name)
             exit()
 
         return filenames
@@ -394,12 +390,11 @@ def add_tensor_info(
         if tensor_dtype == np.uint8:
             tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
 
-        # make sure there is at least one tensor before splitting
         if len(self.tensors[-1]) > 0:
-            if (  # split when over tensor limit
+            if (
                 self.split_max_tensors != 0
                 and len(self.tensors[-1]) >= self.split_max_tensors
-            ) or (  # split when over size limit
+            ) or (
                 self.split_max_size != 0
                 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes
                 > self.split_max_size
@@ -465,8 +460,6 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
 
         fout = self.fout[file_id]
 
-        # pop the first tensor info
-        # TODO: cleaner way to get the first key
         first_tensor_name = [
             name for name, _ in zip(self.tensors[file_id].keys(), range(1))
         ][0]
@@ -513,11 +506,8 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
                     total = sum(ti.nbytes for ti in tensors.values())
                     shard_bar.reset(total=(total if total > 0 else None))
 
-                # relying on the fact that Python dicts preserve insertion order (since 3.7)
                 for ti in tensors.values():
-                    assert (
-                        ti.tensor is not None
-                    )  # can only iterate once over the tensors
+                    assert ti.tensor is not None
                     assert ti.tensor.nbytes == ti.nbytes
                     ti.tensor.tofile(fout)
                     if shard_bar is not None:
@@ -749,6 +739,24 @@ def add_expert_shared_count(self, count: int) -> None:
     def add_expert_weights_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
 
+    def add_rescale_every_n_layers(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
+
+    def add_time_mix_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_time_decay_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_residual_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
+
+    def add_embedding_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
+
+    def add_wkv_head_size(self, size: int) -> None:
+        self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
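Taken together, the new setters let a converter describe an RWKV-style model's metadata; a minimal sketch (file name and values are hypothetical):

```python
import gguf

writer = gguf.GGUFWriter("rwkv6-example.gguf", "rwkv6")
writer.add_wkv_head_size(64)          # {arch}.wkv.head_size
writer.add_time_mix_extra_dim(32)     # {arch}.time_mix_extra_dim
writer.add_time_decay_extra_dim(64)   # {arch}.time_decay_extra_dim
writer.add_rescale_every_n_layers(6)  # {arch}.rescale_every_n_layers
```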
@@ -770,6 +778,9 @@ def add_relative_attn_buckets_count(self, value: int) -> None:
     def add_sliding_window(self, value: int) -> None:
         self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
 
+    def add_attention_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
@@ -809,6 +820,9 @@ def add_ssm_state_size(self, value: int) -> None:
     def add_ssm_time_step_rank(self, value: int) -> None:
         self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
 
+    def add_ssm_dt_b_c_rms(self, value: bool) -> None:
+        self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
+
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
 
@@ -879,7 +893,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
                 name = choice.get("name", "")
                 template = choice.get("template")
 
-                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
                 name = "".join(
                     (c if c in ascii_letters + digits else "_" for c in name)
                 )
@@ -915,6 +928,9 @@ def add_middle_token_id(self, id: int) -> None:
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
+    def add_eom_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOM_ID, id)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ""
         if not skip_pack_prefix:
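The tensor_mapping changes below make the new checkpoint names resolvable through the usual lookup; a small sketch, assuming gguf.get_tensor_name_map as the entry point (the block count is hypothetical):

```python
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.RWKV6, 24)
name = tmap.get_name(
    "rwkv.blocks.0.attention.time_maa_w1", try_suffixes=(".weight", ".bias")
)
assert name == "blk.0.time_mix_w1"
```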
"model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", + "model.layers.{bid}.mamba.dt_proj", ), + MODEL_TENSOR.SSM_DT_NORM: ("model.layers.{bid}.mamba.dt_layernorm",), MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", + "model.layers.{bid}.mamba.A_log", + ), + MODEL_TENSOR.SSM_B_NORM: ( + "model.layers.{bid}.mamba.b_layernorm", + "model.layers.{bid}.mamba.B_layernorm", + ), + MODEL_TENSOR.SSM_C_NORM: ( + "model.layers.{bid}.mamba.c_layernorm", + "model.layers.{bid}.mamba.C_layernorm", ), MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", + "model.layers.{bid}.mamba.D", ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", - ), + "model.layers.{bid}.mamba.out_proj", + ), + MODEL_TENSOR.TIME_MIX_W1: ("rwkv.blocks.{bid}.attention.time_maa_w1",), + MODEL_TENSOR.TIME_MIX_W2: ("rwkv.blocks.{bid}.attention.time_maa_w2",), + MODEL_TENSOR.TIME_MIX_LERP_X: ("rwkv.blocks.{bid}.attention.time_maa_x",), + MODEL_TENSOR.TIME_MIX_LERP_K: ("rwkv.blocks.{bid}.attention.time_maa_k",), + MODEL_TENSOR.TIME_MIX_LERP_V: ("rwkv.blocks.{bid}.attention.time_maa_v",), + MODEL_TENSOR.TIME_MIX_LERP_R: ("rwkv.blocks.{bid}.attention.time_maa_r",), + MODEL_TENSOR.TIME_MIX_LERP_G: ("rwkv.blocks.{bid}.attention.time_maa_g",), + MODEL_TENSOR.TIME_MIX_LERP_W: ("rwkv.blocks.{bid}.attention.time_maa_w",), + MODEL_TENSOR.TIME_MIX_FIRST: ("rwkv.blocks.{bid}.attention.time_faaaa",), + MODEL_TENSOR.TIME_MIX_DECAY: ("rwkv.blocks.{bid}.attention.time_decay",), + MODEL_TENSOR.TIME_MIX_DECAY_W1: ("rwkv.blocks.{bid}.attention.time_decay_w1",), + MODEL_TENSOR.TIME_MIX_DECAY_W2: ("rwkv.blocks.{bid}.attention.time_decay_w2",), + MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), + MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), + MODEL_TENSOR.TIME_MIX_RECEPTANCE: ("rwkv.blocks.{bid}.attention.receptance",), + MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), + MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), + MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), + MODEL_TENSOR.CHANNEL_MIX_LERP_K: ("rwkv.blocks.{bid}.feed_forward.time_maa_k",), + MODEL_TENSOR.CHANNEL_MIX_LERP_R: ("rwkv.blocks.{bid}.feed_forward.time_maa_r",), + MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( + "rwkv.blocks.{bid}.feed_forward.receptance", + ), + MODEL_TENSOR.CHANNEL_MIX_VALUE: ("rwkv.blocks.{bid}.feed_forward.value",), MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), MODEL_TENSOR.ATTN_KV_A_MQA: (