diff --git a/Makefile b/Makefile index fe05dc1a5741..26f711f676e5 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04 +CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1 # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index e1b6f868b2e8..a46b4ee0a335 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -17,11 +17,10 @@ #include "common.h" #include "json.hpp" #include "llama.h" -#include "grammar-parser.h" #include "backend.pb.h" #include "backend.grpc.pb.h" #include "utils.hpp" - +#include "sampling.h" // include std::regex #include #include @@ -203,8 +202,8 @@ struct llama_client_slot std::string stopping_word; // sampling - struct llama_sampling_params sparams; - llama_sampling_context *ctx_sampling = nullptr; + struct gpt_sampler_params sparams; + gpt_sampler *ctx_sampling = nullptr; int32_t ga_i = 0; // group-attention state int32_t ga_n = 1; // group-attention factor @@ -619,7 +618,7 @@ struct llama_server_context bool launch_slot_with_data(llama_client_slot* &slot, json data) { slot_params default_params; - llama_sampling_params default_sparams; + gpt_sampler_params default_sparams; slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); @@ -628,7 +627,7 @@ struct llama_server_context slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.typ_p = json_value(data, "typical_p", 
default_sparams.typ_p); slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); @@ -641,7 +640,7 @@ struct llama_server_context slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -665,6 +664,7 @@ struct llama_server_context slot->params.input_prefix = ""; } + if (data.count("input_suffix") != 0) { slot->params.input_suffix = data["input_suffix"]; @@ -683,6 +683,7 @@ struct llama_server_context slot->prompt = ""; } + /* slot->sparams.penalty_prompt_tokens.clear(); slot->sparams.use_penalty_prompt_tokens = false; const auto &penalty_prompt = data.find("penalty_prompt"); @@ -718,14 +719,13 @@ struct llama_server_context slot->sparams.use_penalty_prompt_tokens = true; } } + */ slot->sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) - { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - + if (json_value(data, "ignore_eos", false)) { /* must run after clear() above, otherwise the EOS bias is wiped */ + slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + } const auto &logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { @@ -753,7 +753,7 @@ struct llama_server_context llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { -
slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) @@ -761,13 +761,13 @@ struct llama_server_context auto toks = llama_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot->sparams.logit_bias[tok] = bias; + slot->sparams.logit_bias.push_back({tok, bias}); } } } } } - + slot->params.antiprompt.clear(); const auto &stop = data.find("stop"); @@ -781,24 +781,22 @@ struct llama_server_context } } } - - const auto &samplers_sequence = data.find("samplers"); - if (samplers_sequence != data.end() && samplers_sequence->is_array()) - { + + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { std::vector sampler_names; - for (const auto &sampler_name : *samplers_sequence) - { - if (sampler_name.is_string()) - { - sampler_names.emplace_back(sampler_name); + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); + } } - } - slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); } else { - slot->sparams.samplers_sequence = default_sparams.samplers_sequence; + slot->sparams.samplers = default_sparams.samplers; } + if (multimodal) { @@ -875,10 +873,10 @@ struct llama_server_context if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); + gpt_sampler_free(slot->ctx_sampling); } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); + slot->ctx_sampling = gpt_sampler_init(model, slot->sparams); + //llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -888,7 +886,7 @@ struct llama_server_context {"task_id", slot->task_id}, }); - LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); + // LOG_TEE("sampling: \n%s\n", 
llama_sampling_print(slot->sparams).c_str()); return true; } @@ -1006,11 +1004,13 @@ struct llama_server_context slot.generated_text += token_str; slot.has_next_token = true; +/* if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) { // we can change penalty_prompt_tokens because it is always created from scratch each request slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); } + */ // check if there is incomplete UTF-8 character at the end bool incomplete = false; @@ -1144,13 +1144,11 @@ struct llama_server_context json get_formated_generation(llama_client_slot &slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - std::vector samplers_sequence; - for (const auto &sampler_type : slot.sparams.samplers_sequence) + std::vector samplers; + samplers.reserve(slot.sparams.samplers.size()); + for (const auto & sampler : slot.sparams.samplers) { - samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); + samplers.emplace_back(gpt_sampler_type_to_str(sampler)); } return json { @@ -1165,13 +1163,11 @@ struct llama_server_context {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, + {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1179,13 +1175,13 @@ struct llama_server_context {"stop", slot.params.antiprompt}, {"n_predict", 
slot.params.n_predict}, {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, + {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, - {"samplers", samplers_sequence} + {"samplers", samplers} }; } @@ -1714,7 +1710,7 @@ struct llama_server_context if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); + gpt_sampler_reset(slot.ctx_sampling); slot.n_past = 0; slot.n_past_se = 0; @@ -1726,7 +1722,7 @@ struct llama_server_context // push the prompt into the sampling context (do not apply grammar) for (auto &token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + gpt_sampler_accept(slot.ctx_sampling, token, false); } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); @@ -1934,9 +1930,9 @@ struct llama_server_context } completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); + const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + gpt_sampler_accept(slot.ctx_sampling, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) @@ -1946,19 +1942,14 @@ struct llama_server_context metrics.on_prompt_eval(slot); } - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; result.tok = id; + const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling); - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) - { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + for 
(size_t i = 0; i < std::min(cur_p->size, (size_t) slot.sparams.n_probs); ++i) { /* clamp to the valid candidates so data[i] is never read past cur_p->size */ + result.probs.push_back({ + cur_p->data[i].id, + cur_p->data[i].p, + }); } if (!process_token(result, slot)) diff --git a/backend/cpp/llama/patches/01-llava.patch b/backend/cpp/llama/patches/01-llava.patch new file mode 100644 index 000000000000..fa122da257cd --- /dev/null +++ b/backend/cpp/llama/patches/01-llava.patch @@ -0,0 +1,13 @@ +diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp +index 342042ff..224db9b5 100644 +--- a/examples/llava/clip.cpp ++++ b/examples/llava/clip.cpp +@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { +- patches_data[i] = i + 1; ++ patches_data[i] = i; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); \ No newline at end of file diff --git a/backend/cpp/llama/prepare.sh b/backend/cpp/llama/prepare.sh index 6c00f27caa38..4c8393b908d7 100644 --- a/backend/cpp/llama/prepare.sh +++ b/backend/cpp/llama/prepare.sh @@ -1,5 +1,14 @@ #!/bin/bash +## Patches +## Apply patches from the `patches` directory +## Uses a quoted glob instead of parsing `ls`, so file names containing spaces or glob characters are handled +for patch in patches/*; do + [ -e "$patch" ] || continue + echo "Applying patch ${patch##*/}" + patch -d llama.cpp/ -p1 < "$patch" +done + cp -r CMakeLists.txt llama.cpp/examples/grpc-server/ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/ cp -rfv json.hpp llama.cpp/examples/grpc-server/ diff --git a/backend/cpp/llama/utils.hpp b/backend/cpp/llama/utils.hpp index c5dafbf0f9ce..198b6f265957 100644 --- a/backend/cpp/llama/utils.hpp +++ b/backend/cpp/llama/utils.hpp @@ -480,31 +480,4 @@ static inline std::vector base64_decode(const std::string & encoded_str } return ret; -} - -// -// random string / id -// - -static std::string random_string() -{ - static const std::string
str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() -{ - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); } \ No newline at end of file