diff --git a/Makefile b/Makefile
index 72fdc6ba46bc71..ec0b0d5306db0a 100644
--- a/Makefile
+++ b/Makefile
@@ -800,7 +800,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o ngram-cache.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp
index 3ca112ef1613d8..3d033d4d71eec6 100644
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -216,12 +216,11 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 }

-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
-        throw std::ifstream::failure("Unable to open file " + filename);
+        return false;
     }
-    llama_ngram_cache ngram_cache;

     llama_ngram ngram;
     int32_t     ntokens;
@@ -251,7 +250,7 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     }
     GGML_ASSERT(hashmap_file.eof());

-    return ngram_cache;
+    return true;
 }

 void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
index e4fa4cbd12f11e..1e07e93c6381fc 100644
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -84,9 +84,10 @@ void llama_ngram_cache_draft(
 void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

 // Load an ngram cache saved with llama_ngram_cache_save.
+// ngram_cache: the ngram cache to load the data into.
 // filename: the path from which to load the ngram cache.
-// returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+// returns: true if the ngram cache could be loaded from filename, false otherwise.
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
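
For reference, a minimal sketch of the new call pattern that the remaining hunks migrate to: the loader now fills a caller-owned cache and reports failure through its return value instead of throwing. The variable names and the path below are illustrative, not taken from the patch.

    #include "common/ngram-cache.h"

    #include <string>

    llama_ngram_cache cache;              // default-constructed, starts out empty
    std::string path = "lookup.bin";      // illustrative path
    if (!llama_ngram_cache_load(cache, path)) {
        // file missing or unreadable: the cache simply stays empty, no exception is thrown
    }
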
diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp
index 07c93eb8d057bb..17e33ee09e1f7b 100644
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@@ -33,11 +33,13 @@ int main(int argc, char ** argv){
     }

     fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-    llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+    llama_ngram_cache ngram_cache_merged;
+    GGML_ASSERT(llama_ngram_cache_load(ngram_cache_merged, args[0]));

     for (size_t i = 1; i < args.size()-1; ++i) {
         fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-        llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+        llama_ngram_cache ngram_cache;
+        GGML_ASSERT(llama_ngram_cache_load(ngram_cache, args[i]));

         llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
     }
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 41b62c2fe9f76b..39cd43cd61cf02 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -47,18 +47,15 @@ int main(int argc, char ** argv){
         const int64_t t_start_draft_us = ggml_time_us();

         if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
+            if (!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }

         if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+            // If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+            llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
         }

         t_draft_flat_us += ggml_time_us() - t_start_draft_us;
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 9526e898fe7638..54e1ec52f83741 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -57,18 +57,15 @@ int main(int argc, char ** argv){
         llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

         if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
+            if (!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }

         if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+            // If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+            llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
        }

        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
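
Taken together, the lookup examples above use the small API from common/ngram-cache.h in four steps: index the context, draft a continuation, merge the per-context statistics into the dynamic cache, and persist it. A rough, self-contained sketch of that round trip, with parameter meanings inferred from the call sites in this diff (the function wrapper and the fixed draft size of 5 are assumptions for illustration; `tokens` must be non-empty):

    #include "common/ngram-cache.h"

    #include <string>
    #include <vector>

    static void lookup_round_trip(std::vector<llama_token> & tokens, std::string & dynamic_path) {
        llama_ngram_cache nc_context, nc_dynamic, nc_static;

        // index the whole (non-empty) context once
        llama_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, tokens, tokens.size(), false);

        // propose up to 5 continuation tokens; draft[0] must be the last accepted token
        std::vector<llama_token> draft = { tokens.back() };
        llama_ngram_cache_draft(tokens, draft, 5, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, nc_context, nc_dynamic, nc_static);

        // fold the per-context statistics into the dynamic cache and write it back out
        llama_ngram_cache_merge(nc_dynamic, nc_context);
        llama_ngram_cache_save(nc_dynamic, dynamic_path);
    }
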
parser.add_argument("--scenario", type=str, help="Scenario to run", required=True) parser.add_argument("--duration", type=str, help="Bench scenario", required=True) + parser.add_argument("--draft", type=int, help="Max. number of additional tokens to draft for lookup decoding", required=False, default=5) + parser.add_argument("-lcs", "--lookup-cache-static", type=str, help="Path to optional static lookup cache to use.", required=False, default=None) + parser.add_argument("-lcd", "--lookup-cache-dynamic", type=str, help="Path to optional dynamic lookup cache to use. Will be overwritten upon server shutdown.", required=False, default=None) args = parser.parse_args(args_in) @@ -269,6 +272,11 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.extend(['--log-format', "text"]) + server_args.extend(['--draft', args.draft]) + if args.lookup_cache_static is not None: + server_args.extend(['--lookup-cache-static', args.lookup_cache_static]) + if args.lookup_cache_dynamic is not None: + server_args.extend(['--lookup-cache-dynamic', args.lookup_cache_dynamic]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 25bc2963967725..872577ffc216e8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,9 +1,14 @@ +#include "common/common.h" +#include "common/ngram-cache.h" #include "utils.hpp" #include "common.h" #include "json-schema-to-grammar.h" -#include "llama.h" #include "grammar-parser.h" +#include "llama.h" +#include "ngram-cache.h" +#include +#include #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -163,6 +168,10 @@ struct server_slot { // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; + llama_ngram_cache nc_context; + std::vector draft; + std::vector context_tokens; + std::string generated_text; std::vector cache_tokens; std::vector generated_token_probs; @@ -218,6 +227,8 @@ struct server_slot { n_past_se = 0; generated_token_probs.clear(); + + nc_context.clear(); } bool has_budget(gpt_params &global_params) { @@ -258,7 +269,7 @@ struct server_slot { } } - json get_formated_timings() const { + json get_formatted_timings() const { return json { {"prompt_n", n_prompt_tokens_processed}, {"prompt_ms", t_prompt_processing}, @@ -423,7 +434,7 @@ struct server_queue { queue_tasks_deferred.push_back(std::move(task)); } - // Get the next id for creating anew task + // Get the next id for creating a new task int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; @@ -539,7 +550,7 @@ struct server_queue { queue_multitasks.push_back(multi); } - // updatethe remaining subtasks, while appending results to multitask + // update the remaining subtasks, while appending results to multitask void update_multitask(int id_multi, int id_sub, server_task_result & result) { std::lock_guard lock(mutex_tasks); for (auto & multitask : queue_multitasks) { @@ -572,7 +583,7 @@ struct server_response { waiting_task_ids.insert(id_task); } - // when the request is finished, we can remove task associated with it + // when the request is finished, we can remove the task associated with it void remove_waiting_task_id(int id_task) { LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); @@ -656,6 +667,10 @@ struct server_context { std::vector slots; json 
@@ -656,6 +667,10 @@ struct server_context {
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;

+    int32_t n_draft = 3;
+    llama_ngram_cache nc_dynamic;
+    llama_ngram_cache nc_static;
+
     server_queue    queue_tasks;
     server_response queue_results;
@@ -714,6 +729,8 @@
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;

+            slot.context_tokens.resize(n_ctx_slot);
+
             LOG_INFO("new slot", {
                 {"id_slot",    slot.id},
                 {"n_ctx_slot", slot.n_ctx}
@@ -744,7 +761,7 @@
             slots.push_back(slot);
         }

-        default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props = get_formatted_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;

         // the update_slots() logic will always submit a maximum of n_batch tokens
@@ -1066,6 +1083,10 @@
             for (int i = 0; i < (int)system_tokens.size(); ++i) {
                 llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }
+            for (auto & slot : slots) {
+                memcpy(slot.context_tokens.data(), system_tokens.data(), system_tokens.size()*sizeof(llama_token));
+                llama_ngram_cache_update(slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, system_tokens, system_tokens.size(), false);
+            }

             const int32_t n_batch = llama_n_batch(ctx);
@@ -1225,7 +1246,7 @@
         return slot.has_next_token; // continue
     }

-    json get_formated_generation(const server_slot & slot) const {
+    json get_formatted_generation(const server_slot & slot) const {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@@ -1347,7 +1368,7 @@
             {"model",               params.model_alias},
             {"tokens_predicted",    slot.n_decoded},
             {"tokens_evaluated",    slot.n_prompt_tokens},
-            {"generation_settings", get_formated_generation(slot)},
+            {"generation_settings", get_formatted_generation(slot)},
             {"prompt",              slot.prompt},
             {"truncated",           slot.truncated},
             {"stopped_eos",         slot.stopped_eos},
@@ -1355,7 +1376,7 @@
             {"stopped_limit",       slot.stopped_limit},
             {"stopping_word",       slot.stopping_word},
             {"tokens_cached",       slot.n_past},
-            {"timings",             slot.get_formated_timings()}
+            {"timings",             slot.get_formatted_timings()}
         };

         if (slot.sparams.n_probs > 0) {
@@ -1553,7 +1574,7 @@
         int n_processing_slots = 0;

         for (server_slot & slot : slots) {
-            json slot_data = get_formated_generation(slot);
+            json slot_data = get_formatted_generation(slot);
             slot_data["id"]      = slot.id;
             slot_data["id_task"] = slot.id_task;
             slot_data["state"]   = slot.state;
@@ -1755,6 +1776,7 @@
                 if (slot.command == SLOT_COMMAND_RELEASE) {
                     slot.state = SLOT_STATE_IDLE;
                     slot.command = SLOT_COMMAND_NONE;
+                    llama_ngram_cache_merge(nc_dynamic, slot.nc_context);
                     slot.t_last_used = ggml_time_us();

                     LOG_INFO("slot released", {
@@ -1826,6 +1848,9 @@
                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
                     llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    for (int j = n_keep; j < slot.n_past - n_discard; ++j) {
+                        slot.context_tokens[j] = slot.context_tokens[j + n_discard];
+                    }

                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
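
One subtlety in the context-shift hunk above: llama_kv_cache_seq_add moves the kept tokens down by n_discard positions, so the slot's flat token copy has to be shifted the same way, otherwise later ngram lookups would key on tokens that are no longer in the KV cache. A standalone sketch of that idea (the function name and parameters are illustrative; llama_token as declared in llama.h):

    #include <vector>

    // Mirror a KV-cache shift in the slot's flat copy of its tokens:
    // positions [n_keep, n_keep + n_discard) are dropped, everything after them moves down.
    static void shift_context_copy(std::vector<llama_token> & context_tokens, int n_keep, int n_discard, int n_past) {
        for (int j = n_keep; j < n_past - n_discard; ++j) {
            context_tokens[j] = context_tokens[j + n_discard];
        }
    }
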
@@ -1845,7 +1870,7 @@
         // start populating the batch for this iteration
         llama_batch_clear(batch);

-        // frist, add sampled tokens from any ongoing sequences
+        // first, add sampled tokens from any ongoing sequences
         for (auto & slot : slots) {
             if (slot.state == SLOT_STATE_IDLE) {
                 continue;
             }
@@ -1858,6 +1883,9 @@
             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
             llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            slot.context_tokens[system_tokens.size() + slot_npast] = slot.sampled;
+            std::vector<llama_token> tail(slot.context_tokens.begin(), slot.context_tokens.begin() + system_tokens.size() + slot_npast);
+            llama_ngram_cache_update(slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, tail, 1, false);

             slot.n_past += 1;
@@ -1885,7 +1913,7 @@
         for (auto & slot : slots) {
             // this slot still has a prompt to be processed
             if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
-                auto & prompt_tokens = slot.prompt_tokens;
+                std::vector<llama_token> & prompt_tokens = slot.prompt_tokens;

                 // we haven't tokenized the prompt yet - do it now:
                 if (prompt_tokens.empty()) {
@@ -2087,6 +2115,9 @@
                 }

                 llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                slot.context_tokens[system_tokens.size() + slot_npast] = prompt_tokens[slot.n_past];
+                std::vector<llama_token> tail(slot.context_tokens.begin(), slot.context_tokens.begin() + slot_npast);
+                llama_ngram_cache_update(slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, tail, 1, false);

                 if (slot.params.cache_prompt) {
                     slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2185,6 +2216,42 @@
                 0, 0, 0, // unused
             };

+            for (auto & slot : slots) {
+                if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+                    continue; // continue loop of slots
+                }
+
+                const int32_t tail_start = std::max(slot.n_past - LLAMA_NGRAM_MAX, 0);
+                std::vector<llama_token> context_tail(slot.context_tokens.begin() + tail_start, slot.context_tokens.begin() + slot.n_past);
+
+                slot.draft.clear();
+                slot.draft.push_back(slot.context_tokens[slot.n_past - 1]);
+                llama_ngram_cache_draft(
+                    context_tail, slot.draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, slot.nc_context, nc_dynamic, nc_static);
+                // fprintf(stderr, "draft post: slot.draft.size()=%d\n", (int)slot.draft.size());
+
+                // if (slot.draft.size() > 1) {
+                //     fprintf(stderr, "context_tail: ");
+                //     for (llama_token t : context_tail) {
+                //         const std::string s = llama_token_to_piece(ctx, t);
+                //         fprintf(stderr, "'%s' ", s.c_str());
+                //     }
+                //     fprintf(stderr, "\n");
+
+                //     fprintf(stderr, "draft:");
+                //     for (llama_token t : slot.draft) {
+                //         const std::string s = llama_token_to_piece(ctx, t);
+                //         fprintf(stderr, "'%s' ", s.c_str());
+                //     }
+                //     fprintf(stderr, "\n");
+                // }
+
+                for (int j = 1; j < (int)slot.draft.size(); ++j) {
+                    llama_batch_add(batch_view, slot.draft[j], slot.n_past, {slot.id + 1}, true);
+                    slot.n_past++;
+                }
+            }
+
             const int ret = llama_decode(ctx, batch_view);

             if (ret != 0) {
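
Condensed, the drafting block above does the following per slot (an editorial restatement with the commented-out debug output dropped; names as in the hunk):

    // use at most the last LLAMA_NGRAM_MAX tokens of the slot's context as the lookup window
    const int32_t tail_start = std::max(slot.n_past - LLAMA_NGRAM_MAX, 0);
    std::vector<llama_token> context_tail(slot.context_tokens.begin() + tail_start,
                                          slot.context_tokens.begin() + slot.n_past);

    // draft[0] is the token whose logits are already being computed; only draft[1..] are speculative
    slot.draft = { slot.context_tokens[slot.n_past - 1] };
    llama_ngram_cache_draft(context_tail, slot.draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                            slot.nc_context, nc_dynamic, nc_static);

    // append the speculated tokens to the same batch, with logits enabled, so a single
    // llama_decode call produces the distributions needed to verify them
    for (int j = 1; j < (int) slot.draft.size(); ++j) {
        llama_batch_add(batch_view, slot.draft[j], slot.n_past++, { slot.id + 1 }, true);
    }
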
@@ -2230,39 +2297,56 @@
                 continue; // continue loop of slots
             }

-            completion_token_output result;
-            const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+            int j = 0;
+            do { // while (j < slot.draft.size() && slot.sampled == draft[j])
+                completion_token_output result;
+                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i + j);
+                const std::string s = llama_token_to_piece(ctx, id);
+                // fprintf(stderr, "Sampled: j=%d '%s'\n", j, s.c_str());
+                // if (j >= 1) {
+                //     const std::string d0 = llama_token_to_piece(ctx, slot.draft[j-1]);
+                //     const std::string d1 = llama_token_to_piece(ctx, slot.draft[j-0]);
+                //     fprintf(stderr, "Prediction correct: '%s' -> '%s'\n", d0.c_str(), d1.c_str());
+                // }
+
+                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
+                }

-            llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+                result.tok = id;

-            slot.n_decoded += 1;
-            if (slot.n_decoded == 1) {
-                slot.t_start_generation = ggml_time_us();
-                slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                metrics.on_prompt_eval(slot);
-            }
+                const int32_t n_probs = slot.sparams.n_probs;
+                if (slot.sparams.temp <= 0 && n_probs > 0) {
+                    // for llama_sample_token_greedy we need to sort candidates
+                    llama_sample_softmax(ctx, &cur_p);
+                }

-            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
-            result.tok = id;
+                for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
+                    result.probs.push_back({
+                        cur_p.data[i].id,
+                        cur_p.data[i].p
+                    });
+                }

-            const int32_t n_probs = slot.sparams.n_probs;
-            if (slot.sparams.temp <= 0 && n_probs > 0) {
-                // for llama_sample_token_greedy we need to sort candidates
-                llama_sample_softmax(ctx, &cur_p);
-            }
+                if (!process_token(result, slot)) {
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
+                }

-            for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
-                result.probs.push_back({
-                    cur_p.data[i].id,
-                    cur_p.data[i].p
-                });
-            }
+                ++j;
+            } while (j < (int)slot.draft.size() && slot.sampled == slot.draft[j]);

-            if (!process_token(result, slot)) {
-                slot.release();
-                slot.print_timings();
-                send_final_response(slot);
-                metrics.on_prediction(slot);
+            if (j < (int)slot.draft.size()) {
+                slot.n_past -= slot.draft.size() - j;
+                llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1);
             }

             slot.i_batch = -1;
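
The accept/roll-back arithmetic in the loop above can be illustrated with a small worked example. Suppose a slot entered the decode with draft = {t_last, d1, d2, d3} (one accepted token plus three speculated ones), and sampling from the logits at offsets 0, 1, 2 yields x0 == d1 and x1 == d2, but x2 != d3 (the names here are illustrative):

    // j = 0: sample x0 from the logits of t_last, emit it; x0 == d1, so keep going
    // j = 1: sample x1 from the logits of d1,     emit it; x1 == d2, so keep going
    // j = 2: sample x2 from the logits of d2,     emit it; x2 != d3, so the loop exits with j == 3
    //
    // Three tokens were produced by a single llama_decode call instead of one.
    const int draft_size = 4;                       // slot.draft.size()
    const int j_accepted = 3;                       // loop iterations that ran
    const int rollback   = draft_size - j_accepted; // = 1: n_past is reduced by 1 and the KV cache
                                                    //      entries past the new n_past are removed
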
@@ -2317,6 +2401,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  - distribute: spread execution evenly over all nodes\n");
     printf("  - isolate: only spawn threads on CPUs on the node that execution started on\n");
     printf("  - numactl: use the CPU map provided my numactl\n");
+    printf("  --draft N                 max. number of additional tokens to draft for lookup decoding (default: %d)\n", params.n_draft);
+    printf("  -lcs FNAME, --lookup-cache-static FNAME\n");
+    printf("                            path to static lookup cache to use for lookup decoding (not updated by generation)\n");
+    printf("  -lcd FNAME, --lookup-cache-dynamic FNAME\n");
+    printf("                            path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
     if (llama_supports_gpu_offload()) {
         printf("  -ngl N, --n-gpu-layers N\n");
         printf("                            number of layers to store in VRAM\n");
@@ -2718,6 +2807,24 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { invalid_param = true; break; }
         }
+    } else if (arg == "-lcs" || arg == "--lookup-cache-static") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.lookup_cache_static = argv[i];
+    } else if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.lookup_cache_dynamic = argv[i];
+    } else if (arg == "--draft") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.n_draft = std::stoi(argv[i]);
     } else if (arg == "--embedding" || arg == "--embeddings") {
         params.embedding = true;
     } else if (arg == "-cb" || arg == "--cont-batching") {
@@ -3020,6 +3127,23 @@ int main(int argc, char ** argv) {
     LOG_INFO("model loaded", {});

+    ctx_server.n_draft = params.n_draft;
+
+    if (!params.lookup_cache_static.empty()) {
+        LOG_INFO("Loading static lookup cache from %s", {params.lookup_cache_static.c_str()});
+        if (!llama_ngram_cache_load(ctx_server.nc_static, params.lookup_cache_static)) {
+            LOG_ERROR("Did not find a lookup cache under %s", {params.lookup_cache_static.c_str()});
+            return 1;
+        }
+    }
+
+    if (!params.lookup_cache_dynamic.empty()) {
+        LOG_INFO("Loading dynamic lookup cache from %s", {params.lookup_cache_dynamic.c_str()});
+        if (!llama_ngram_cache_load(ctx_server.nc_dynamic, params.lookup_cache_dynamic)) {
+            LOG_INFO("Did not find a lookup cache under %s. It will be created on server shutdown.", {params.lookup_cache_dynamic.c_str()});
+        }
+    }
+
     const auto model_meta = ctx_server.model_meta();

     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
@@ -3820,6 +3944,11 @@ int main(int argc, char ** argv) {
     svr->stop();
     t.join();

+    if (!params.lookup_cache_dynamic.empty()) {
+        LOG_INFO("Saving dynamic lookup cache to %s", {params.lookup_cache_dynamic.c_str()});
+        llama_ngram_cache_save(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
+    }
+
     llama_backend_free();

     return 0;
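
Putting the main()-side pieces together, the lifecycle of the two caches on the server is roughly the following. This is a condensed editorial sketch of the hunks above (logging omitted, not a drop-in replacement), with ctx_server and params as in server.cpp:

    // startup: a static cache that was explicitly requested but cannot be read is a fatal error
    if (!params.lookup_cache_static.empty() &&
        !llama_ngram_cache_load(ctx_server.nc_static, params.lookup_cache_static)) {
        return 1;
    }
    // startup: the dynamic cache is best effort; if the file is missing it starts out empty
    if (!params.lookup_cache_dynamic.empty()) {
        llama_ngram_cache_load(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
    }

    // ... serve requests; each released slot merges its slot.nc_context into ctx_server.nc_dynamic ...

    // shutdown: persist everything learned during this run so the next run can reuse it
    if (!params.lookup_cache_dynamic.empty()) {
        llama_ngram_cache_save(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
    }
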