diff --git a/llama.cpp b/llama.cpp
index 4beb140bfb253..b73eab1ac8b84 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3230,6 +3230,16 @@ static void llm_load_vocab(
             vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKCODER;
         } else if (tokenizer_name == "deepseek_llm") {
             vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKLLM;
+        } else if (tokenizer_name == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            vocab.special_bos_id = 101;
+            vocab.special_eos_id = 102;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.add_space_prefix = false;
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3267,21 +3277,6 @@ static void llm_load_vocab(
             vocab.special_unk_id = -1;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
-        } else if (tokenizer_name == "bert") {
-            vocab.type = LLAMA_VOCAB_TYPE_WPM;
-
-            // default special tokens
-            vocab.special_bos_id = 101;
-            vocab.special_eos_id = 102;
-            vocab.special_unk_id = 100;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
-            vocab.add_space_prefix = false;
-        } else {
-            LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
         }
     }
 
diff --git a/llama.h b/llama.h
index 3b9552138a01e..5efe6cc17c002 100644
--- a/llama.h
+++ b/llama.h
@@ -61,8 +61,9 @@ extern "C" {
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // Deepseek Coder
-        LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // Deepseek LLM
+        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 3, // Deepseek Coder
+        LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 4, // Deepseek LLM
     };
 
     enum llama_token_type {