tokenizer : special token handling #3538

Merged: 11 commits, Oct 17, 2023
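In short: this PR threads a new `special` flag through the `llama_tokenize` helpers in `common/` and their call sites. When the flag is set, special tokens (BOS/EOS and any other control tokens the vocabulary defines) appearing in the input text are parsed and mapped to their token ids; when it is unset, the same text is tokenized as plain text. Prompts, guidance prompts, instruct templates, and input prefixes/suffixes opt in; interactive user input and training data do not.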
12 changes: 7 additions & 5 deletions common/common.cpp
@@ -865,21 +865,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(
 std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
-    bool add_bos) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos);
+    bool add_bos,
+    bool special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
 }

 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
-    bool add_bos) {
+    bool add_bos,
+    bool special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
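For reference, a minimal sketch (not part of the diff) of the size-negotiation contract the wrapper above relies on: the C API returns the token count on success, or the negative of the required buffer size when the buffer is too small. The helper name `tokenize_once` is hypothetical; it assumes `model` was loaded via the usual llama.cpp API.

    // Sketch only, not part of the diff. Assumes llama.h and a loaded `model`.
    #include "llama.h"
    #include <string>
    #include <vector>

    std::vector<llama_token> tokenize_once(const llama_model * model, const std::string & text) {
        std::vector<llama_token> toks(text.length() + 1);   // upper bound: one token per byte, +1 for BOS
        int n = llama_tokenize(model, text.data(), text.length(),
                               toks.data(), toks.size(), /*add_bos=*/true, /*special=*/false);
        if (n < 0) {          // buffer too small; -n is the exact count required
            toks.resize(-n);
            n = llama_tokenize(model, text.data(), text.length(),
                               toks.data(), toks.size(), /*add_bos=*/true, /*special=*/false);
        }
        toks.resize(n);       // keep only the valid tokens
        return toks;
    }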
6 changes: 4 additions & 2 deletions common/common.h
@@ -133,12 +133,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
-    bool add_bos);
+    bool add_bos,
+    bool special = false);

 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
-    bool add_bos);
+    bool add_bos,
+    bool special = false);

 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
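A hedged usage sketch of the new parameter (the prompt string below is illustrative, not from the diff): because `special` defaults to `false`, every existing call site keeps compiling and behaving as before, and callers opt in per call.

    // default: special markup such as "<s>" is treated as plain text
    auto plain  = llama_tokenize(ctx, "<s>question</s>", true);
    // opt in: "<s>" / "</s>" may now match the vocab's BOS/EOS control tokens
    auto parsed = llama_tokenize(ctx, "<s>question</s>", true, /*special=*/true);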
8 changes: 4 additions & 4 deletions common/train.cpp
@@ -863,7 +863,7 @@ size_t tokenize_file(
             (int) buf.size(),
             out_tokens.data(),
             (int) out_tokens.size(),
-            false);
+            false, false);
     if (n_tokens < 0) {
         out_tokens.resize(-n_tokens);
         n_tokens = llama_tokenize(
@@ -872,7 +872,7 @@
             (int) buf.size(),
             out_tokens.data(),
             (int) out_tokens.size(),
-            false);
+            false, false);
     }
     if (n_tokens >= 0) {
         out_tokens.resize(n_tokens);
@@ -966,15 +966,15 @@ size_t tokenize_file(
             (int) buf_sample.size(),
             tok_sample.data(),
             (int) tok_sample.size(),
-            false);
+            false, false);
     if (n_tokens < 0) {
         tok_sample.resize(-n_tokens);
         n_tokens = llama_tokenize(llama_get_model(lctx),
             buf_sample.data(),
             (int) buf_sample.size(),
             tok_sample.data(),
             (int) tok_sample.size(),
-            false);
+            false, false);
         GGML_ASSERT(n_tokens >= 0);
     }
     GGML_ASSERT(n_tokens <= (int) tok_sample.size());
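Note that `tokenize_file` passes `special = false` explicitly, so this change is behavior-preserving for training: special-token strings that happen to appear in the training corpus keep being tokenized as literal text rather than collapsing into control tokens.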
2 changes: 1 addition & 1 deletion examples/batched.swift/Sources/main.swift
@@ -209,7 +209,7 @@ llama_print_timings(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let n_tokens = text.count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
40 changes: 30 additions & 10 deletions examples/main/main.cpp
@@ -238,7 +238,7 @@ int main(int argc, char ** argv) {

     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
     } else {
         LOG("use session tokens\n");
         embd_inp = session_tokens;
@@ -260,10 +260,10 @@
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));

         original_prompt_len = original_inp.size();
@@ -320,8 +320,8 @@ int main(int argc, char ** argv) {
     }

     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);

     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
@@ -383,6 +383,12 @@ int main(int argc, char ** argv) {
         if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+                if (params.verbose_prompt) {
+                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    for (int i = 0; i < (int) tmp.size(); i++) {
+                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    }
+                }
             }
         }

@@ -392,10 +398,22 @@

         if (!params.input_prefix.empty()) {
             LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
         }

         if (!params.input_suffix.empty()) {
             LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
@@ -717,7 +735,7 @@ int main(int argc, char ** argv) {
         if (params.interactive) {
             if (!params.antiprompt.empty()) {
                 // tokenize and inject first reverse prompt
-                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
                 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                 is_antiprompt = true;
             }
@@ -744,8 +762,7 @@
             std::string buffer;
             if (!params.input_prefix.empty()) {
                 LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                buffer += params.input_prefix;
-                printf("%s", buffer.c_str());
+                printf("%s", params.input_prefix.c_str());
             }

             // color user input only
@@ -767,7 +784,6 @@
             // append input suffix if any
             if (!params.input_suffix.empty()) {
                 LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                buffer += params.input_suffix;
                 printf("%s", params.input_suffix.c_str());
             }

@@ -782,10 +798,14 @@
                 embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
             }

-            const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));

+            embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+            embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

             // instruct mode: insert response suffix
             if (params.instruct) {
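The net effect in interactive mode: the configured `input_prefix`/`input_suffix` are tokenized with `special = true` (so chat templates can use control tokens), while the user-typed `buffer` is tokenized with `special = false`, so text a user types cannot be smuggled in as special tokens. This also explains the `printf` changes above: the prefix and suffix are no longer folded into `buffer`, since they are now tokenized separately.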