From 7eda5583fae1ffdbe573262858887e137e031877 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Sun, 11 Aug 2024 14:18:17 -0400
Subject: [PATCH 1/3] server : fix segfault on long system prompt

---
 examples/server/server.cpp | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 360f571e42867..a1bbafbc9e4e2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1136,28 +1136,19 @@ struct server_context {
         if (!system_prompt.empty()) {
             system_tokens = ::llama_tokenize(ctx, system_prompt, true);
 
-            llama_batch_clear(batch);
+            const int32_t n_batch = llama_n_batch(ctx);
+            const int32_t n_tokens_prompt = system_tokens.size();
 
-            for (int i = 0; i < (int)system_tokens.size(); ++i) {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-            }
+            for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
+                const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
 
-            const int32_t n_batch = llama_n_batch(ctx);
+                llama_batch_clear(batch);
 
-            for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
-                const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token + i,
-                    nullptr,
-                    batch.pos + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id + i,
-                    batch.logits + i,
-                    0, 0, 0, // unused
-                };
+                for (int32_t j = 0; j < n_tokens; ++j) {
+                    llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
+                }
 
-                if (llama_decode(ctx, batch_view) != 0) {
+                if (llama_decode(ctx, batch) != 0) {
                     LOG_ERROR("llama_decode() failed", {});
                     return;
                 }

From c1b738ef43540e7102a14abe38ce894be9ac803c Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 13 Aug 2024 22:03:57 -0400
Subject: [PATCH 2/3] server : fix parallel generation with very small batch sizes

---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a1bbafbc9e4e2..b2bf5bb5f043b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -753,13 +753,13 @@ struct server_context {
         default_generation_settings_for_props = get_formated_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;
 
-        // the update_slots() logic will always submit a maximum of n_batch tokens
+        // the update_slots() logic will always submit a maximum of n_batch or n_parralel tokens
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
             const int32_t n_batch = llama_n_batch(ctx);
 
             // only a single seq_id per token is needed
-            batch = llama_batch_init(n_batch, 0, 1);
+            batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
         }
 
         metrics.init();

From 93ec58b932ae132f1ce65d0a6d932466e2bd3818 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 13 Aug 2024 22:12:26 -0400
Subject: [PATCH 3/3] server : fix typo in comment

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 637b403372fb6..c25338f573767 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -754,7 +754,7 @@ struct server_context {
         default_generation_settings_for_props = get_formated_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;
 
-        // the update_slots() logic will always submit a maximum of n_batch or n_parralel tokens
+        // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
             const int32_t n_batch = llama_n_batch(ctx);
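
For reference, the first patch boils down to a chunked-decode loop: instead of queuing the entire system prompt into a single batch (which overruns a batch allocated for n_batch tokens and segfaults), it feeds the tokens to llama_decode() in chunks of at most n_batch. Below is a minimal sketch of that pattern, assuming the llama.cpp common helpers llama_batch_clear()/llama_batch_add() from common.h and a batch created with llama_batch_init(); the function name decode_prompt_chunked is hypothetical and not part of the patch.

    #include <algorithm>
    #include <vector>

    #include "common.h" // llama_batch_clear / llama_batch_add helpers
    #include "llama.h"

    // Decode an arbitrarily long token sequence in n_batch-sized chunks,
    // reusing one pre-allocated batch for every chunk.
    static bool decode_prompt_chunked(llama_context * ctx, llama_batch & batch, const std::vector<llama_token> & tokens) {
        const int32_t n_batch         = llama_n_batch(ctx);      // max tokens per llama_decode() call
        const int32_t n_tokens_prompt = (int32_t) tokens.size(); // may be much larger than n_batch

        for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);

            llama_batch_clear(batch);

            for (int32_t j = 0; j < n_tokens; ++j) {
                // absolute position i + j, sequence 0, no logits needed for prompt tokens
                llama_batch_add(batch, tokens[i + j], i + j, { 0 }, false);
            }

            if (llama_decode(ctx, batch) != 0) {
                return false; // decoding failed on this chunk
            }
        }

        return true;
    }

Because the batch is cleared and refilled on every iteration, its capacity only has to cover n_batch tokens no matter how long the prompt is, which is exactly why the old code (one llama_batch_add per prompt token into a single batch) crashed once the system prompt exceeded the batch size.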
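
The second patch concerns how that shared batch is allocated in the first place: during token generation, update_slots() adds one token per active slot to the batch, so with n_parallel slots and a very small --batch-size the batch sized for only n_batch tokens can overflow. The fix is simply to size it for the larger of the two limits. A small illustrative sketch of that sizing rule, with n_parallel standing in for the server's params.n_parallel and a made-up helper name:

    #include <algorithm>

    #include "llama.h"

    // Allocate the server-side batch so it can hold either a full prompt chunk
    // (n_batch tokens) or one generated token per parallel slot, whichever is larger.
    static llama_batch make_server_batch(llama_context * ctx, int32_t n_parallel) {
        const int32_t n_batch = llama_n_batch(ctx);

        // only a single seq_id per token is needed; no embeddings input
        return llama_batch_init(std::max(n_batch, n_parallel), 0, 1);
    }

The returned batch should eventually be released with llama_batch_free().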