diff --git a/Makefile b/Makefile
index 72fdc6ba46bc71..ec0b0d5306db0a 100644
--- a/Makefile
+++ b/Makefile
@@ -800,7 +800,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o ngram-cache.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp
index 3ca112ef1613d8..3d033d4d71eec6 100644
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -216,12 +216,11 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 }

-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
-        throw std::ifstream::failure("Unable to open file " + filename);
+        return false;
     }
-    llama_ngram_cache ngram_cache;

     llama_ngram ngram;
     int32_t     ntokens;
@@ -251,7 +250,7 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     }
     GGML_ASSERT(hashmap_file.eof());

-    return ngram_cache;
+    return true;
 }

 void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
index e4fa4cbd12f11e..1e07e93c6381fc 100644
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -84,9 +84,10 @@ void llama_ngram_cache_draft(
 void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

 // Load an ngram cache saved with llama_ngram_cache_save.
+// ngram_cache: the ngram cache to load the data into.
 // filename: the path from which to load the ngram cache.
-// returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+// returns: true if the ngram cache could be loaded from filename, false otherwise.
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
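
For reference, a minimal sketch of the new call pattern that the remaining hunks migrate to: the loader now fills a caller-owned cache and reports failure through its return value instead of throwing. The variable names and the path below are illustrative, not taken from the patch.

    #include "common/ngram-cache.h"

    #include <string>

    llama_ngram_cache cache;              // default-constructed, starts out empty
    std::string path = "lookup.bin";      // illustrative path
    if (!llama_ngram_cache_load(cache, path)) {
        // file missing or unreadable: the cache simply stays empty, no exception is thrown
    }
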
diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp
index 07c93eb8d057bb..17e33ee09e1f7b 100644
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@@ -33,11 +33,13 @@ int main(int argc, char ** argv){
     }

     fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-    llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+    llama_ngram_cache ngram_cache_merged;
+    GGML_ASSERT(llama_ngram_cache_load(ngram_cache_merged, args[0]));

     for (size_t i = 1; i < args.size()-1; ++i) {
         fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-        llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+        llama_ngram_cache ngram_cache;
+        GGML_ASSERT(llama_ngram_cache_load(ngram_cache, args[i]));

         llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
     }
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 41b62c2fe9f76b..39cd43cd61cf02 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -47,18 +47,15 @@ int main(int argc, char ** argv){
         const int64_t t_start_draft_us = ggml_time_us();

         if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
+            if (!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }

         if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+            // If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+            llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
         }

         t_draft_flat_us += ggml_time_us() - t_start_draft_us;
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 9526e898fe7638..54e1ec52f83741 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -57,18 +57,15 @@ int main(int argc, char ** argv){
         llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

         if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
+            if (!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }

         if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+            // If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+            llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
        }

        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
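
Taken together, the lookup examples above use the small API from common/ngram-cache.h in four steps: index the context, draft a continuation, merge the per-context statistics into the dynamic cache, and persist it. A rough, self-contained sketch of that round trip, with parameter meanings inferred from the call sites in this diff (the function wrapper and the fixed draft size of 5 are assumptions for illustration; `tokens` must be non-empty):

    #include "common/ngram-cache.h"

    #include <string>
    #include <vector>

    static void lookup_round_trip(std::vector<llama_token> & tokens, std::string & dynamic_path) {
        llama_ngram_cache nc_context, nc_dynamic, nc_static;

        // index the whole (non-empty) context once
        llama_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, tokens, tokens.size(), false);

        // propose up to 5 continuation tokens; draft[0] must be the last accepted token
        std::vector<llama_token> draft = { tokens.back() };
        llama_ngram_cache_draft(tokens, draft, 5, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, nc_context, nc_dynamic, nc_static);

        // fold the per-context statistics into the dynamic cache and write it back out
        llama_ngram_cache_merge(nc_dynamic, nc_context);
        llama_ngram_cache_save(nc_dynamic, dynamic_path);
    }
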
parser.add_argument("--scenario", type=str, help="Scenario to run", required=True) parser.add_argument("--duration", type=str, help="Bench scenario", required=True) + parser.add_argument("--draft", type=int, help="Max. number of additional tokens to draft for lookup decoding", required=False, default=5) + parser.add_argument("-lcs", "--lookup-cache-static", type=str, help="Path to optional static lookup cache to use.", required=False, default=None) + parser.add_argument("-lcd", "--lookup-cache-dynamic", type=str, help="Path to optional dynamic lookup cache to use. Will be overwritten upon server shutdown.", required=False, default=None) args = parser.parse_args(args_in) @@ -269,6 +272,11 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.extend(['--log-format', "text"]) + server_args.extend(['--draft', args.draft]) + if args.lookup_cache_static is not None: + server_args.extend(['--lookup-cache-static', args.lookup_cache_static]) + if args.lookup_cache_dynamic is not None: + server_args.extend(['--lookup-cache-dynamic', args.lookup_cache_dynamic]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 25bc2963967725..872577ffc216e8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,9 +1,14 @@ +#include "common/common.h" +#include "common/ngram-cache.h" #include "utils.hpp" #include "common.h" #include "json-schema-to-grammar.h" -#include "llama.h" #include "grammar-parser.h" +#include "llama.h" +#include "ngram-cache.h" +#include +#include #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -163,6 +168,10 @@ struct server_slot { // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; + llama_ngram_cache nc_context; + std::vector draft; + std::vector context_tokens; + std::string generated_text; std::vector cache_tokens; std::vector generated_token_probs; @@ -218,6 +227,8 @@ struct server_slot { n_past_se = 0; generated_token_probs.clear(); + + nc_context.clear(); } bool has_budget(gpt_params &global_params) { @@ -258,7 +269,7 @@ struct server_slot { } } - json get_formated_timings() const { + json get_formatted_timings() const { return json { {"prompt_n", n_prompt_tokens_processed}, {"prompt_ms", t_prompt_processing}, @@ -423,7 +434,7 @@ struct server_queue { queue_tasks_deferred.push_back(std::move(task)); } - // Get the next id for creating anew task + // Get the next id for creating a new task int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; @@ -539,7 +550,7 @@ struct server_queue { queue_multitasks.push_back(multi); } - // updatethe remaining subtasks, while appending results to multitask + // update the remaining subtasks, while appending results to multitask void update_multitask(int id_multi, int id_sub, server_task_result & result) { std::lock_guard lock(mutex_tasks); for (auto & multitask : queue_multitasks) { @@ -572,7 +583,7 @@ struct server_response { waiting_task_ids.insert(id_task); } - // when the request is finished, we can remove task associated with it + // when the request is finished, we can remove the task associated with it void remove_waiting_task_id(int id_task) { LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); @@ -656,6 +667,10 @@ struct server_context { std::vector slots; json 
@@ -656,6 +667,10 @@ struct server_context {
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;

+    int32_t n_draft = 3;
+    llama_ngram_cache nc_dynamic;
+    llama_ngram_cache nc_static;
+
     server_queue    queue_tasks;
     server_response queue_results;
@@ -714,6 +729,8 @@
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;

+            slot.context_tokens.resize(n_ctx_slot);
+
             LOG_INFO("new slot", {
                 {"id_slot",    slot.id},
                 {"n_ctx_slot", slot.n_ctx}
@@ -744,7 +761,7 @@
             slots.push_back(slot);
         }

-        default_generation_settings_for_props = get_formated_generation(slots.front());
+        default_generation_settings_for_props = get_formatted_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;

         // the update_slots() logic will always submit a maximum of n_batch tokens
@@ -1066,6 +1083,10 @@
             for (int i = 0; i < (int)system_tokens.size(); ++i) {
                 llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }
+            for (auto & slot : slots) {
+                memcpy(slot.context_tokens.data(), system_tokens.data(), system_tokens.size()*sizeof(llama_token));
+                llama_ngram_cache_update(slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, system_tokens, system_tokens.size(), false);
+            }

             const int32_t n_batch = llama_n_batch(ctx);
@@ -1225,7 +1246,7 @@
         return slot.has_next_token; // continue
     }

-    json get_formated_generation(const server_slot & slot) const {
+    json get_formatted_generation(const server_slot & slot) const {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@@ -1347,7 +1368,7 @@
             {"model",               params.model_alias},
             {"tokens_predicted",    slot.n_decoded},
             {"tokens_evaluated",    slot.n_prompt_tokens},
-            {"generation_settings", get_formated_generation(slot)},
+            {"generation_settings", get_formatted_generation(slot)},
             {"prompt",              slot.prompt},
             {"truncated",           slot.truncated},
             {"stopped_eos",         slot.stopped_eos},
@@ -1355,7 +1376,7 @@
             {"stopped_limit",       slot.stopped_limit},
             {"stopping_word",       slot.stopping_word},
             {"tokens_cached",       slot.n_past},
-            {"timings",             slot.get_formated_timings()}
+            {"timings",             slot.get_formatted_timings()}
         };

         if (slot.sparams.n_probs > 0) {
@@ -1553,7 +1574,7 @@
         int n_processing_slots = 0;

         for (server_slot & slot : slots) {
-            json slot_data = get_formated_generation(slot);
+            json slot_data = get_formatted_generation(slot);
             slot_data["id"]      = slot.id;
             slot_data["id_task"] = slot.id_task;
             slot_data["state"]   = slot.state;
@@ -1755,6 +1776,7 @@
                 if (slot.command == SLOT_COMMAND_RELEASE) {
                     slot.state = SLOT_STATE_IDLE;
                     slot.command = SLOT_COMMAND_NONE;
+                    llama_ngram_cache_merge(nc_dynamic, slot.nc_context);
                     slot.t_last_used = ggml_time_us();

                     LOG_INFO("slot released", {
@@ -1826,6 +1848,9 @@
                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
                     llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    for (int j = n_keep; j < slot.n_past - n_discard; ++j) {
+                        slot.context_tokens[j] = slot.context_tokens[j + n_discard];
+                    }

                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
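
One subtlety in the context-shift hunk above: llama_kv_cache_seq_add moves the kept tokens down by n_discard positions, so the slot's flat token copy has to be shifted the same way, otherwise later ngram lookups would key on tokens that are no longer in the KV cache. A standalone sketch of that idea (the function name and parameters are illustrative; llama_token as declared in llama.h):

    #include <vector>

    // Mirror a KV-cache shift in the slot's flat copy of its tokens:
    // positions [n_keep, n_keep + n_discard) are dropped, everything after them moves down.
    static void shift_context_copy(std::vector<llama_token> & context_tokens, int n_keep, int n_discard, int n_past) {
        for (int j = n_keep; j < n_past - n_discard; ++j) {
            context_tokens[j] = context_tokens[j + n_discard];
        }
    }
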
@@ -1845,7 +1870,7 @@
         // start populating the batch for this iteration
         llama_batch_clear(batch);

-        // frist, add sampled tokens from any ongoing sequences
+        // first, add sampled tokens from any ongoing sequences
         for (auto & slot : slots) {
             if (slot.state == SLOT_STATE_IDLE) {
                 continue;
             }
@@ -1858,6 +1883,9 @@
             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
             llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            slot.context_tokens[system_tokens.size() + slot_npast] = slot.sampled;
+            std::vector<llama_token> tail(slot.context_tokens.begin(), slot.context_tokens.begin() + system_tokens.size() + slot_npast);
+            llama_ngram_cache_update(slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, tail, 1, false);

             slot.n_past += 1;
@@ -1885,7 +1913,7 @@
         for (auto & slot : slots) {
             // this slot still has a prompt to be processed
             if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
-                auto & prompt_tokens = slot.prompt_tokens;
+                std::vector<llama_token> & prompt_tokens = slot.prompt_tokens;

                 // we haven't tokenized the prompt yet - do it now:
                 if (prompt_tokens.empty()) {
@@ -2087,6 +2115,9 @@
                 }

                 llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                slot.context_tokens[system_tokens.size() + slot_npast] = prompt_tokens[slot.n_past];
+                std::vector<llama_token> tail(slot.context_tokens.begin(), slot.context_tokens.begin() + slot_npast);
+                llama_ngram_cache_update(slot.nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, tail, 1, false);

                 if (slot.params.cache_prompt) {
                     slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2185,6 +2216,42 @@
                 0, 0, 0, // unused
             };

+            for (auto & slot : slots) {
+                if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+                    continue; // continue loop of slots
+                }
+
+                const int32_t tail_start = std::max(slot.n_past - LLAMA_NGRAM_MAX, 0);
+                std::vector<llama_token> context_tail(slot.context_tokens.begin() + tail_start, slot.context_tokens.begin() + slot.n_past);
+
+                slot.draft.clear();
+                slot.draft.push_back(slot.context_tokens[slot.n_past - 1]);
+                llama_ngram_cache_draft(
+                    context_tail, slot.draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, slot.nc_context, nc_dynamic, nc_static);
+                // fprintf(stderr, "draft post: slot.draft.size()=%d\n", (int)slot.draft.size());
+
+                // if (slot.draft.size() > 1) {
+                //     fprintf(stderr, "context_tail: ");
+                //     for (llama_token t : context_tail) {
+                //         const std::string s = llama_token_to_piece(ctx, t);
+                //         fprintf(stderr, "'%s' ", s.c_str());
+                //     }
+                //     fprintf(stderr, "\n");
+
+                //     fprintf(stderr, "draft:");
+                //     for (llama_token t : slot.draft) {
+                //         const std::string s = llama_token_to_piece(ctx, t);
+                //         fprintf(stderr, "'%s' ", s.c_str());
+                //     }
+                //     fprintf(stderr, "\n");
+                // }
+
+                for (int j = 1; j < (int)slot.draft.size(); ++j) {
+                    llama_batch_add(batch_view, slot.draft[j], slot.n_past, {slot.id + 1}, true);
+                    slot.n_past++;
+                }
+            }
+
             const int ret = llama_decode(ctx, batch_view);

             if (ret != 0) {
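
Condensed, the drafting block above does the following per slot (an editorial restatement with the commented-out debug output dropped; names as in the hunk):

    // use at most the last LLAMA_NGRAM_MAX tokens of the slot's context as the lookup window
    const int32_t tail_start = std::max(slot.n_past - LLAMA_NGRAM_MAX, 0);
    std::vector<llama_token> context_tail(slot.context_tokens.begin() + tail_start,
                                          slot.context_tokens.begin() + slot.n_past);

    // draft[0] is the token whose logits are already being computed; only draft[1..] are speculative
    slot.draft = { slot.context_tokens[slot.n_past - 1] };
    llama_ngram_cache_draft(context_tail, slot.draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                            slot.nc_context, nc_dynamic, nc_static);

    // append the speculated tokens to the same batch, with logits enabled, so a single
    // llama_decode call produces the distributions needed to verify them
    for (int j = 1; j < (int) slot.draft.size(); ++j) {
        llama_batch_add(batch_view, slot.draft[j], slot.n_past++, { slot.id + 1 }, true);
    }
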
@@ -2230,39 +2297,56 @@
                 continue; // continue loop of slots
             }

-            completion_token_output result;
-            const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+            int j = 0;
+            do { // while (j < slot.draft.size() && slot.sampled == draft[j])
+                completion_token_output result;
+                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i + j);
+                const std::string s = llama_token_to_piece(ctx, id);
+                // fprintf(stderr, "Sampled: j=%d '%s'\n", j, s.c_str());
+                // if (j >= 1) {
+                //     const std::string d0 = llama_token_to_piece(ctx, slot.draft[j-1]);
+                //     const std::string d1 = llama_token_to_piece(ctx, slot.draft[j-0]);
+                //     fprintf(stderr, "Prediction correct: '%s' -> '%s'\n", d0.c_str(), d1.c_str());
+                // }
+
+                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
+                }

-            llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+                result.tok = id;

-            slot.n_decoded += 1;
-            if (slot.n_decoded == 1) {
-                slot.t_start_generation = ggml_time_us();
-                slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                metrics.on_prompt_eval(slot);
-            }
+                const int32_t n_probs = slot.sparams.n_probs;
+                if (slot.sparams.temp <= 0 && n_probs > 0) {
+                    // for llama_sample_token_greedy we need to sort candidates
+                    llama_sample_softmax(ctx, &cur_p);
+                }

-            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
-            result.tok = id;
+                for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
+                    result.probs.push_back({
+                        cur_p.data[i].id,
+                        cur_p.data[i].p
+                    });
+                }

-            const int32_t n_probs = slot.sparams.n_probs;
-            if (slot.sparams.temp <= 0 && n_probs > 0) {
-                // for llama_sample_token_greedy we need to sort candidates
-                llama_sample_softmax(ctx, &cur_p);
-            }
+                if (!process_token(result, slot)) {
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
+                }

-            for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
-                result.probs.push_back({
-                    cur_p.data[i].id,
-                    cur_p.data[i].p
-                });
-            }
+                ++j;
+            } while (j < (int)slot.draft.size() && slot.sampled == slot.draft[j]);

-            if (!process_token(result, slot)) {
-                slot.release();
-                slot.print_timings();
-                send_final_response(slot);
-                metrics.on_prediction(slot);
+            if (j < (int)slot.draft.size()) {
+                slot.n_past -= slot.draft.size() - j;
+                llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1);
             }

             slot.i_batch = -1;
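
The accept/roll-back arithmetic in the loop above can be illustrated with a small worked example. Suppose a slot entered the decode with draft = {t_last, d1, d2, d3} (one accepted token plus three speculated ones), and sampling from the logits at offsets 0, 1, 2 yields x0 == d1 and x1 == d2, but x2 != d3 (the names here are illustrative):

    // j = 0: sample x0 from the logits of t_last, emit it; x0 == d1, so keep going
    // j = 1: sample x1 from the logits of d1,     emit it; x1 == d2, so keep going
    // j = 2: sample x2 from the logits of d2,     emit it; x2 != d3, so the loop exits with j == 3
    //
    // Three tokens were produced by a single llama_decode call instead of one.
    const int draft_size = 4;                       // slot.draft.size()
    const int j_accepted = 3;                       // loop iterations that ran
    const int rollback   = draft_size - j_accepted; // = 1: n_past is reduced by 1 and the KV cache
                                                    //      entries past the new n_past are removed
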
@@ -2317,6 +2401,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  - distribute: spread execution evenly over all nodes\n");
     printf("  - isolate: only spawn threads on CPUs on the node that execution started on\n");
     printf("  - numactl: use the CPU map provided my numactl\n");
+    printf("  --draft N                 max. number of additional tokens to draft for lookup decoding (default: %d)\n", params.n_draft);
+    printf("  -lcs FNAME, --lookup-cache-static FNAME\n");
+    printf("                            path to static lookup cache to use for lookup decoding (not updated by generation)\n");
+    printf("  -lcd FNAME, --lookup-cache-dynamic FNAME\n");
+    printf("                            path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
     if (llama_supports_gpu_offload()) {
         printf("  -ngl N, --n-gpu-layers N\n");
         printf("                            number of layers to store in VRAM\n");
@@ -2718,6 +2807,24 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { invalid_param = true; break; }
         }
+    } else if (arg == "-lcs" || arg == "--lookup-cache-static") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.lookup_cache_static = argv[i];
+    } else if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.lookup_cache_dynamic = argv[i];
+    } else if (arg == "--draft") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.n_draft = std::stoi(argv[i]);
     } else if (arg == "--embedding" || arg == "--embeddings") {
         params.embedding = true;
     } else if (arg == "-cb" || arg == "--cont-batching") {
@@ -3020,6 +3127,23 @@ int main(int argc, char ** argv) {
     LOG_INFO("model loaded", {});

+    ctx_server.n_draft = params.n_draft;
+
+    if (!params.lookup_cache_static.empty()) {
+        LOG_INFO("Loading static lookup cache from %s", {params.lookup_cache_static.c_str()});
+        if (!llama_ngram_cache_load(ctx_server.nc_static, params.lookup_cache_static)) {
+            LOG_ERROR("Did not find a lookup cache under %s", {params.lookup_cache_static.c_str()});
+            return 1;
+        }
+    }
+
+    if (!params.lookup_cache_dynamic.empty()) {
+        LOG_INFO("Loading dynamic lookup cache from %s", {params.lookup_cache_dynamic.c_str()});
+        if (!llama_ngram_cache_load(ctx_server.nc_dynamic, params.lookup_cache_dynamic)) {
+            LOG_INFO("Did not find a lookup cache under %s. It will be created on server shutdown.", {params.lookup_cache_dynamic.c_str()});
+        }
+    }
+
     const auto model_meta = ctx_server.model_meta();

     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
@@ -3820,6 +3944,11 @@ int main(int argc, char ** argv) {
     svr->stop();
     t.join();

+    if (!params.lookup_cache_dynamic.empty()) {
+        LOG_INFO("Saving dynamic lookup cache to %s", {params.lookup_cache_dynamic.c_str()});
+        llama_ngram_cache_save(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
+    }
+
     llama_backend_free();

     return 0;
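
Putting the main()-side pieces together, the lifecycle of the two caches on the server is roughly the following. This is a condensed editorial sketch of the hunks above (logging omitted, not a drop-in replacement), with ctx_server and params as in server.cpp:

    // startup: a static cache that was explicitly requested but cannot be read is a fatal error
    if (!params.lookup_cache_static.empty() &&
        !llama_ngram_cache_load(ctx_server.nc_static, params.lookup_cache_static)) {
        return 1;
    }
    // startup: the dynamic cache is best effort; if the file is missing it starts out empty
    if (!params.lookup_cache_dynamic.empty()) {
        llama_ngram_cache_load(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
    }

    // ... serve requests; each released slot merges its slot.nc_context into ctx_server.nc_dynamic ...

    // shutdown: persist everything learned during this run so the next run can reuse it
    if (!params.lookup_cache_dynamic.empty()) {
        llama_ngram_cache_save(ctx_server.nc_dynamic, params.lookup_cache_dynamic);
    }
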