Patch summary: convert RoBERTa checkpoints using the GPT-2 BPE vocabulary.

  * get_vocab_base_pre: the tokenizer hash referenced for
    sentence-transformers/stsb-roberta-base now yields "gpt-2" instead of
    "roberta-bpe".
  * BertModel: no longer registered for the "RobertaModel" architecture.
  * RobertaModel: new BertModel subclass. When tokenizer.json exists in the
    model directory, set_vocab() calls _set_vocab_gpt2(), sets the add-BOS and
    add-EOS flags to True and records token_type_count (hparams
    "type_vocab_size", default 1); otherwise it defers to
    BertModel.set_vocab().

NOTE(review): this patch had been flattened onto one line. Line breaks and
indentation below were reconstructed from the hunk line counts and Python
syntax -- confirm against the target file before applying.

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c63d929c187a8..7cd60c0a2bec3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -663,7 +663,11 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
-            res = "roberta-bpe"
+            # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
+            # adds the cls/sep tokens as bos/eos. This is handled as a
+            # post-processor in tokenizers, so the chkhsh is different, but
+            # it still maps to gpt-2 internally.
+            res = "gpt-2"
 
         if res is None:
             logger.warning("\n")
@@ -2544,7 +2548,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2617,6 +2621,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT