Skip to content

Commit

Permalink
Updated/merged the deepseek coder pr
Browse files (browse the repository at this point in the history)
  • Loading branch information
jaggzh committed Feb 12, 2024
1 parent b16a391 commit 19a03e0
Showing 2 changed files with 13 additions and 17 deletions.
25 changes: 10 additions & 15 deletions llama.cpp
Original file line number Diff line number Diff line change
@@ -3230,6 +3230,16 @@ static void llm_load_vocab(
vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKCODER;
} else if (tokenizer_name == "deepseek_llm") {
vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKLLM;
} else if (tokenizer_name == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;

// default special tokens
vocab.special_bos_id = 101;
vocab.special_eos_id = 102;
vocab.special_unk_id = 100;
vocab.special_sep_id = -1;
vocab.special_pad_id = -1;
vocab.add_space_prefix = false;
} else {
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3267,21 +3277,6 @@ static void llm_load_vocab(
vocab.special_unk_id = -1;
vocab.special_sep_id = -1;
vocab.special_pad_id = -1;
} else if (tokenizer_name == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;

// default special tokens
vocab.special_bos_id = 101;
vocab.special_eos_id = 102;
vocab.special_unk_id = 100;
vocab.special_sep_id = -1;
vocab.special_pad_id = -1;
vocab.add_space_prefix = false;
} else {
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);

vocab.type = LLAMA_VOCAB_TYPE_SPM;
}
}

5 changes: 3 additions & 2 deletions llama.h
Original file line number Diff line number Diff line change
@@ -61,8 +61,9 @@ extern "C" {
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // Deepseek Coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // Deepseek LLM
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 3, // Deepseek Coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 4, // Deepseek LLM
};

enum llama_token_type {

0 comments on commit 19a03e0

Please sign in to comment.