From eb403d528d9439e578ef1cdf42ff4bb2efd600d1 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Wed, 17 Jan 2024 22:27:21 +0700
Subject: [PATCH 1/3] feat: temporary fix to add artificial queue into nitro

---
 controllers/llamaCPP.cc | 25 +++++++++++++++++++++++--
 controllers/llamaCPP.h  |  4 +++-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 291ec2ad7..4d2880349 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "log.h"
 #include "utils/nitro_utils.h"
+#include <thread>
 
 using namespace inferences;
 using json = nlohmann::json;
@@ -293,20 +294,38 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  const int task_id = llama.request_completion(data, false, false, -1);
+  int task_id;
+
+  if (llama.params.n_parallel == 1) {
+    while (true) {
+      if (!single_queue_is_busy) {
+        task_id = llama.request_completion(data, false, false, -1);
+        single_queue_is_busy = true;
+        break;
+      } else {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(100)); // Sleep for 500 milliseconds
+      }
+    }
+  } else {
+    task_id = llama.request_completion(data, false, false, -1);
+  }
+
   LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = createState(task_id, this);
 
     auto chunked_content_provider =
-        [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [this, state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!pBuffer) {
         LOG_INFO << "Connection closed or buffer is null. Reset context";
         state->instance->llama.request_cancel(state->task_id);
+        single_queue_is_busy = false;
         return 0;
       }
 
       if (state->isStopped) {
+        single_queue_is_busy = false;
         return 0;
       }
@@ -339,8 +358,10 @@ void llamaCPP::chatCompletion(
         }
         return nRead;
       } else {
+        single_queue_is_busy = false;
         return 0;
       }
+      single_queue_is_busy = false;
       return 0;
     };
     auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index c07bc144c..09ec676f7 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2560,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
 private:
   llama_server_context llama;
-  //std::atomic<bool> model_loaded = false;
+  // std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
@@ -2572,5 +2572,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   bool caching_enabled;
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
+  std::atomic<bool> single_queue_is_busy; // This value only used under the
+                                          // condition n_parallel is 1
 };
 
 }; // namespace inferences

From d51089a0355f2966af4a9bfabe7913a53ee8f88f Mon Sep 17 00:00:00 2001
From: tikikun
Date: Wed, 17 Jan 2024 22:28:13 +0700
Subject: [PATCH 2/3] remove redundant include

---
 controllers/llamaCPP.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 4d2880349..a0100e917 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -2,7 +2,6 @@
 #include "llama.h"
 #include "log.h"
 #include "utils/nitro_utils.h"
-#include <thread>
 
 using namespace inferences;
 using json = nlohmann::json;

From 8edf8ae2003a8c4202c4c440cca3a0eefbbdfaa6 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Wed, 17 Jan 2024 22:31:28 +0700
Subject: [PATCH 3/3] sleep for 0.5s

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index a0100e917..3f7eb9dd0 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -303,7 +303,7 @@ void llamaCPP::chatCompletion(
         break;
       } else {
         std::this_thread::sleep_for(
-            std::chrono::milliseconds(100)); // Sleep for 500 milliseconds
+            std::chrono::milliseconds(500)); // Sleep for 500 milliseconds
       }
     }
   } else {
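
Note (not part of the patches above): together the three commits implement a single-slot "artificial queue" — when n_parallel is 1, a new chat completion spins on an atomic flag until the previous request releases it, and the streaming lambda clears the flag on every exit path. Below is a minimal stand-alone sketch of that gating idea, assuming C++11 <atomic>/<thread>; the helper names submit_request and finish_request are invented for illustration, and only single_queue_is_busy comes from the patch.

// Sketch: single-slot admission gate for one request at a time.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

// Mirrors the flag added to llamaCPP.h; false means the slot is free.
std::atomic<bool> single_queue_is_busy{false};

// Hypothetical helper: poll until the slot is free, then claim it.
int submit_request(int request_id) {
  while (single_queue_is_busy.load()) {
    // Same 500 ms poll interval that PATCH 3/3 settles on.
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }
  single_queue_is_busy.store(true);
  std::printf("request %d admitted\n", request_id);
  return request_id; // stands in for llama.request_completion(...)
}

// Hypothetical helper: release the slot, as the streaming lambda does when
// the connection closes, the state is stopped, or the stream finishes.
void finish_request(int request_id) {
  std::printf("request %d finished\n", request_id);
  single_queue_is_busy.store(false);
}

int main() {
  int id = submit_request(1);
  finish_request(id);
  return 0;
}

As in the patch, the check and the set of the flag are separate steps; a single compare_exchange_strong on the atomic would make the claim itself race-free if two handler threads ever reach the gate at the same time.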