From c69c63039cd75f8f33f253ab7485d82a8b4cd403 Mon Sep 17 00:00:00 2001 From: compilade Date: Sat, 20 Jul 2024 21:53:01 -0400 Subject: [PATCH] convert_hf : fix Gemma v1 conversion (#8597) * convert_hf : fix Gemma v1 conversion * convert_hf : allow renaming tokens, but with a warning * convert_hf : fix Gemma v1 not setting BOS and EOS tokens --- convert_hf_to_gguf.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a0661f120eea7..fba8dbbedebbd 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -753,7 +753,8 @@ def _create_vocab_sentencepiece(self): token_id = int(token_id) token: str = token_data["content"] if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token.encode("utf-8") + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: @@ -1312,6 +1313,7 @@ def set_vocab(self): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -2014,7 +2016,8 @@ def set_vocab(self): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2030,7 +2033,8 @@ def set_vocab(self): token_id = int(foken_data["id"]) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2269,7 +2273,8 @@ def set_vocab(self): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2288,7 +2293,8 @@ def set_vocab(self): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2474,6 +2480,7 @@ def set_vocab(self): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False)