diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7cd60c0a2bec3..482ba264dd3c5 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2624,6 +2624,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @Model.register("RobertaModel")
 class RobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
 
     def set_vocab(self):
         """Support BPE tokenizers for roberta models"""
@@ -2641,6 +2651,19 @@ def set_vocab(self):
         else:
             return super().set_vocab()
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
 
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
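
As a quick sanity check of the patch above, the following is a minimal, self-contained sketch (not part of the conversion script) of the position-offset logic: RoBERTa numbers positions starting at pad_token_id + 1, so the converter subtracts that offset from max_position_embeddings and chops the same number of rows off the position embedding matrix. The hparams values here are illustrative RoBERTa-base-style defaults (pad_token_id=1, max_position_embeddings=514), and hidden_size is a toy value; real checkpoints may differ.

# illustration only: mirrors the logic of the patch, not the patch itself
import torch

hparams = {"pad_token_id": 1, "max_position_embeddings": 514}
hidden_size = 8  # toy width for the example

# mirrors RobertaModel.__init__: the usable context is smaller than the raw matrix
position_offset = 1 + hparams["pad_token_id"]                 # -> 2
n_ctx = hparams["max_position_embeddings"] - position_offset  # -> 512

# mirrors modify_tensors: drop the first `position_offset` rows of the
# position embedding matrix so row 0 corresponds to the first real position
position_embd = torch.randn(hparams["max_position_embeddings"], hidden_size)
trimmed = position_embd[position_offset:, :]

assert trimmed.shape == (n_ctx, hidden_size)  # (512, 8)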