Skip to content

Commit

Permalink
map roberta-bpe to gpt-2
Browse files (browse the repository at this point in the history)
Signed-off-by: Sukriti-Sharma4 <[email protected]>
  • Loading branch information
Ssukriti committed Dec 19, 2024
1 parent d5f69e8 commit 334ddfd
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 6 deletions.
6 changes: 1 addition & 5 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,11 +663,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
res = "minerva-7b"
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
- # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
- # adds the cls/sep tokens as bos/eos. This is handled as a
- # post-processor in tokenizers, so the chkhsh is different, but
- # it still maps to gpt-2 internally.
- res = "gpt-2"
+ res = "roberta-bpe"

if res is None:
logger.warning("\n")
Expand Down
3 changes: 2 additions & 1 deletion src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6406,7 +6406,8 @@ static void llm_load_vocab(
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
- tokenizer_pre == "jina-v2-code") {
+ tokenizer_pre == "jina-v2-code" ||
+ tokenizer_pre == "roberta-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
Expand Down

0 comments on commit 334ddfd

Please sign in to comment.