Skip to content

Commit

Permalink
convert_hf : fix Gemma v1 not setting BOS and EOS tokens
Browse files: browse the repository at this point in the history
  • Loading branch information
compilade committed Jul 20, 2024
1 parent 5a9cb57 commit 50d1a03
Showing 1 changed file with 3 additions and 7 deletions.
10 changes: 3 additions & 7 deletions convert_hf_to_gguf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1310,6 +1310,7 @@ def set_vocab(self):
special_vocab._set_special_token("prefix", 1)
special_vocab._set_special_token("suffix", 3)
special_vocab._set_special_token("middle", 2)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer)

def set_gguf_parameters(self):
Expand Down Expand Up @@ -2466,13 +2467,7 @@ class GemmaModel(Model):
model_arch = gguf.MODEL_ARCH.GEMMA

def set_vocab(self):
tokens, scores, toktypes = self._create_vocab_sentencepiece()

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self._set_vocab_sentencepiece()

# TODO: these special tokens should be exported only for the CodeGemma family
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
Expand All @@ -2482,6 +2477,7 @@ def set_vocab(self):
special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("fsep", 70)
special_vocab._set_special_token("eot", 107)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer)

self.gguf_writer.add_add_space_prefix(False)
Expand Down

0 comments on commit 50d1a03

Please sign in to comment.