From 334ddfd97da0970f73ba95776cd8c0548d47133b Mon Sep 17 00:00:00 2001
From: Sukriti-Sharma4
Date: Wed, 18 Dec 2024 18:37:00 -0700
Subject: [PATCH] map roberta-bpe to gpt-2

Signed-off-by: Sukriti-Sharma4
---
 convert_hf_to_gguf.py | 6 +-----
 src/llama.cpp         | 3 ++-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 482ba264dd3c5..c74b47d096a0c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -663,11 +663,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
-            # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
-            # adds the cls/sep tokens as bos/eos. This is handled as a
-            # post-processor in tokenizers, so the chkhsh is different, but
-            # it still maps to gpt-2 internally.
-            res = "gpt-2"
+            res = "roberta-bpe"
 
         if res is None:
             logger.warning("\n")
diff --git a/src/llama.cpp b/src/llama.cpp
index 49ef5b78a515c..87a246097720f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6406,7 +6406,8 @@ static void llm_load_vocab(
                     tokenizer_pre == "jina-v1-en" ||
                     tokenizer_pre == "jina-v2-es" ||
                     tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "jina-v2-code") {
+                    tokenizer_pre == "jina-v2-code" ||
+                    tokenizer_pre == "roberta-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                     tokenizer_pre == "refact") {