Skip to content

Commit

Permalink
chore(deps): update llama.cpp
Browse files Browse the repository at this point in the history
Signed-off-by: Ettore Di Giacinto <[email protected]>
  • Loading branch information
mudler committed Sep 8, 2024
1 parent 5139dad commit bb76578
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 58 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04
CPPLLAMA_VERSION?=daa9623ab051a8162ae750b150b9522571b55f21

# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
Expand Down
105 changes: 48 additions & 57 deletions backend/cpp/llama/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "common.h"
#include "json.hpp"
#include "llama.h"
#include "grammar-parser.h"
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "utils.hpp"
Expand Down Expand Up @@ -203,8 +202,8 @@ struct llama_client_slot
std::string stopping_word;

// sampling
struct llama_sampling_params sparams;
llama_sampling_context *ctx_sampling = nullptr;
struct gpt_sampler_params sparams;
gpt_sampler *ctx_sampling = nullptr;

int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
Expand Down Expand Up @@ -619,7 +618,7 @@ struct llama_server_context

bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params;
llama_sampling_params default_sparams;
gpt_sampler_params default_sparams;

slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
Expand All @@ -628,7 +627,7 @@ struct llama_server_context
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
Expand All @@ -641,7 +640,7 @@ struct llama_server_context
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
Expand All @@ -665,6 +664,7 @@ struct llama_server_context
slot->params.input_prefix = "";
}


if (data.count("input_suffix") != 0)
{
slot->params.input_suffix = data["input_suffix"];
Expand All @@ -683,6 +683,10 @@ struct llama_server_context
slot->prompt = "";
}

if (json_value(data, "ignore_eos", false)) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
}
/*
slot->sparams.penalty_prompt_tokens.clear();
slot->sparams.use_penalty_prompt_tokens = false;
const auto &penalty_prompt = data.find("penalty_prompt");
Expand Down Expand Up @@ -718,14 +722,10 @@ struct llama_server_context
slot->sparams.use_penalty_prompt_tokens = true;
}
}
*/

slot->sparams.logit_bias.clear();

if (json_value(data, "ignore_eos", false))
{
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}

const auto &logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array())
{
Expand Down Expand Up @@ -753,21 +753,21 @@ struct llama_server_context
llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab)
{
slot->sparams.logit_bias[tok] = bias;
slot->sparams.logit_bias.push_back({tok, bias});
}
}
else if (el[0].is_string())
{
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias[tok] = bias;
slot->sparams.logit_bias.push_back({tok, bias});
}
}
}
}
}

slot->params.antiprompt.clear();

const auto &stop = data.find("stop");
Expand All @@ -781,24 +781,22 @@ struct llama_server_context
}
}
}

const auto &samplers_sequence = data.find("samplers");
if (samplers_sequence != data.end() && samplers_sequence->is_array())
{

const auto & samplers = data.find("samplers");
if (samplers != data.end() && samplers->is_array()) {
std::vector<std::string> sampler_names;
for (const auto &sampler_name : *samplers_sequence)
{
if (sampler_name.is_string())
{
sampler_names.emplace_back(sampler_name);
for (const auto & name : *samplers) {
if (name.is_string()) {
sampler_names.emplace_back(name);
}
}
}
slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
}
else
{
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
slot->sparams.samplers = default_sparams.samplers;
}


if (multimodal)
{
Expand Down Expand Up @@ -875,10 +873,10 @@ struct llama_server_context

if (slot->ctx_sampling != nullptr)
{
llama_sampling_free(slot->ctx_sampling);
gpt_sampler_free(slot->ctx_sampling);
}
slot->ctx_sampling = llama_sampling_init(slot->sparams);
llama_set_rng_seed(ctx, slot->params.seed);
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
//llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT;

all_slots_are_idle = false;
Expand All @@ -888,7 +886,7 @@ struct llama_server_context
{"task_id", slot->task_id},
});

LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
// LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

return true;
}
Expand Down Expand Up @@ -1006,11 +1004,13 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;

/*
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
{
// we can change penalty_prompt_tokens because it is always created from scratch each request
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
}
*/

// check if there is incomplete UTF-8 character at the end
bool incomplete = false;
Expand Down Expand Up @@ -1144,13 +1144,11 @@ struct llama_server_context

json get_formated_generation(llama_client_slot &slot)
{
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
std::vector<std::string> samplers;
samplers.reserve(slot.sparams.samplers.size());
for (const auto & sampler : slot.sparams.samplers)
{
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
}

return json {
Expand All @@ -1165,27 +1163,25 @@ struct llama_server_context
{"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p},
{"tfs_z", slot.sparams.tfs_z},
{"typical_p", slot.sparams.typical_p},
{"typical_p", slot.sparams.typ_p},
{"repeat_last_n", slot.sparams.penalty_last_n},
{"repeat_penalty", slot.sparams.penalty_repeat},
{"presence_penalty", slot.sparams.penalty_present},
{"frequency_penalty", slot.sparams.penalty_freq},
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
{"mirostat", slot.sparams.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict},
{"n_keep", params.n_keep},
{"ignore_eos", ignore_eos},
{"ignore_eos", slot.sparams.ignore_eos},
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
// {"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
{"samplers", samplers}
};
}

Expand Down Expand Up @@ -1714,7 +1710,7 @@ struct llama_server_context

if (!slot.params.cache_prompt)
{
llama_sampling_reset(slot.ctx_sampling);
gpt_sampler_reset(slot.ctx_sampling);

slot.n_past = 0;
slot.n_past_se = 0;
Expand All @@ -1726,7 +1722,7 @@ struct llama_server_context
// push the prompt into the sampling context (do not apply grammar)
for (auto &token : prompt_tokens)
{
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
gpt_sampler_accept(slot.ctx_sampling, token, false);
}

slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
Expand Down Expand Up @@ -1934,9 +1930,9 @@ struct llama_server_context
}

completion_token_output result;
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
gpt_sampler_accept(slot.ctx_sampling, id, true);

slot.n_decoded += 1;
if (slot.n_decoded == 1)
Expand All @@ -1946,19 +1942,14 @@ struct llama_server_context
metrics.on_prompt_eval(slot);
}

llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
result.tok = id;
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);

const int32_t n_probs = slot.sparams.n_probs;
if (slot.sparams.temp <= 0 && n_probs > 0)
{
// for llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &cur_p);
}

for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
{
result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
result.probs.push_back({
cur_p->data[i].id,
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
});
}

if (!process_token(result, slot))
Expand Down

0 comments on commit bb76578

Please sign in to comment.