diff --git a/common/sampling.cpp b/common/sampling.cpp
index 0b07ad01bac23..a4f250d990b92 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -354,7 +354,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
     // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const auto & penalty_tokens = prev;
     const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
     if (penalty_tokens_used_size) {
         const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
diff --git a/common/sampling.h b/common/sampling.h
index f3ffa090fd63e..79b8f502012a1 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -56,9 +56,6 @@ typedef struct gpt_sampling_params {
     float cfg_scale = 1.f; // how strong is guidance
 
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool use_penalty_prompt_tokens = false;
 } gpt_sampling_params;
 
 // general sampler context
diff --git a/examples/server/README.md b/examples/server/README.md
index e17595fe87f25..d161c0b99abe0 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -424,8 +424,6 @@ node index.js
 
     `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
 
-    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
-
     `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
     `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
@@ -672,7 +670,6 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
       "stopping_word": ""
     },
     "penalize_nl": true,
-    "penalty_prompt_tokens": [],
     "presence_penalty": 0.0,
     "prompt": "Say hello to llama.cpp",
     "repeat_last_n": 64,
@@ -696,8 +693,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
     "tfs_z": 1.0,
     "top_k": 40,
     "top_p": 0.949999988079071,
-    "typical_p": 1.0,
-    "use_penalty_prompt_tokens": false
+    "typical_p": 1.0
   }
 ]
 ```
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 18e66e6c8cd7f..5de903986bf13 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -986,51 +986,6 @@ struct server_context {
             }
         }
 
-        // penalize user-provided tokens
-        {
-            slot.sparams.penalty_prompt_tokens.clear();
-            slot.sparams.use_penalty_prompt_tokens = false;
-
-            const auto & penalty_prompt = data.find("penalty_prompt");
-
-            if (penalty_prompt != data.end()) {
-                if (penalty_prompt->is_string()) {
-                    const auto penalty_prompt_string = penalty_prompt->get<std::string>();
-                    slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
-
-                    if (slot.params.n_predict > 0) {
-                        slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
-                    }
-                    slot.sparams.use_penalty_prompt_tokens = true;
-
-                    LOG_VERBOSE("penalty_prompt_tokens", {
-                        {"id_slot", slot.id},
-                        {"tokens", slot.sparams.penalty_prompt_tokens},
-                    });
-                }
-                else if (penalty_prompt->is_array()) {
-                    const auto n_tokens = penalty_prompt->size();
-                    slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
-
-                    const int n_vocab = llama_n_vocab(model);
-                    for (const auto & penalty_token : *penalty_prompt) {
-                        if (penalty_token.is_number_integer()) {
-                            const auto tok = penalty_token.get<llama_token>();
-                            if (tok >= 0 && tok < n_vocab) {
-                                slot.sparams.penalty_prompt_tokens.push_back(tok);
-                            }
-                        }
-                    }
-                    slot.sparams.use_penalty_prompt_tokens = true;
-
-                    LOG_VERBOSE("penalty_prompt_tokens", {
-                        {"id_slot", slot.id},
-                        {"tokens", slot.sparams.penalty_prompt_tokens},
-                    });
-                }
-            }
-        }
-
         {
             slot.sparams.logit_bias.clear();
 
@@ -1201,11 +1156,6 @@ struct server_context {
             slot.generated_text += token_str;
             slot.has_next_token = true;
 
-            if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
-                // we can change penalty_prompt_tokens because it is always created from scratch each request
-                slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
-            }
-
             // check if there is incomplete UTF-8 character at the end
             bool incomplete = false;
             for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
@@ -1346,8 +1296,6 @@ struct server_context {
             {"repeat_penalty", slot.sparams.penalty_repeat},
             {"presence_penalty", slot.sparams.penalty_present},
             {"frequency_penalty", slot.sparams.penalty_freq},
-            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
             {"mirostat", slot.sparams.mirostat},
             {"mirostat_tau", slot.sparams.mirostat_tau},
             {"mirostat_eta", slot.sparams.mirostat_eta},
diff --git a/include/llama.h b/include/llama.h
index 2df6c6ca72792..a9db3eebfd422 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -386,6 +386,7 @@ extern "C" {
         bool ignore_eos; // ignore the end-of-sequence token
 
         const char * grammar;
+        const char * grammar_root;
 
         int32_t n_logit_bias;
         const llama_logit_bias * logit_bias;
diff --git a/src/llama.cpp b/src/llama.cpp
index e2e265228d29a..773008c711d5a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16514,6 +16514,7 @@ struct llama_sampling_params llama_sampling_default_params() {
        /*.penalize_nl =*/ false,
        /*.ignore_eos =*/ false,
        /*.grammar =*/ nullptr,
+       /*.grammar_root =*/ nullptr,
        /*.n_logit_bias =*/ 0,
        /*.logit_bias =*/ nullptr,
    };
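
For reference, a minimal sketch of how a caller might fill in the new `grammar_root` field added to `llama_sampling_params` above. Only `llama_sampling_default_params()`, `grammar`, and `grammar_root` come from this patch; the helper name and the GBNF/root-rule strings are illustrative, and how the sampler consumes `grammar_root` is not shown here.

```cpp
#include "llama.h"

// Illustrative only: build sampling params that carry a GBNF grammar plus an
// explicit root rule via the new grammar_root field. The defaults initialize
// both grammar and grammar_root to nullptr (grammar disabled).
static struct llama_sampling_params make_grammar_sparams(const char * gbnf_text, const char * root_rule) {
    struct llama_sampling_params sparams = llama_sampling_default_params();

    sparams.grammar      = gbnf_text; // GBNF grammar source text
    sparams.grammar_root = root_rule; // name of the rule to start from, e.g. "root" (assumed)

    return sparams;
}
```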