From eb403d528d9439e578ef1cdf42ff4bb2efd600d1 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Wed, 17 Jan 2024 22:27:21 +0700
Subject: [PATCH 1/3] feat: temporary fix to add artificial queue into nitro

---
 controllers/llamaCPP.cc | 25 +++++++++++++++++++++++--
 controllers/llamaCPP.h  |  4 +++-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 291ec2ad7..4d2880349 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "log.h"
 #include "utils/nitro_utils.h"
+#include <thread>
 
 using namespace inferences;
 using json = nlohmann::json;
@@ -293,20 +294,38 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  const int task_id = llama.request_completion(data, false, false, -1);
+  int task_id;
+
+  if (llama.params.n_parallel == 1) {
+    while (true) {
+      if (!single_queue_is_busy) {
+        task_id = llama.request_completion(data, false, false, -1);
+        single_queue_is_busy = true;
+        break;
+      } else {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(100)); // Sleep for 500 milliseconds
+      }
+    }
+  } else {
+    task_id = llama.request_completion(data, false, false, -1);
+  }
+
   LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = createState(task_id, this);
 
     auto chunked_content_provider =
-        [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [this, state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!pBuffer) {
         LOG_INFO << "Connection closed or buffer is null. Reset context";
         state->instance->llama.request_cancel(state->task_id);
+        single_queue_is_busy = false;
         return 0;
       }
 
       if (state->isStopped) {
+        single_queue_is_busy = false;
         return 0;
       }
@@ -339,8 +358,10 @@ void llamaCPP::chatCompletion(
         }
         return nRead;
       } else {
+        single_queue_is_busy = false;
         return 0;
       }
+      single_queue_is_busy = false;
       return 0;
     };
     auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index c07bc144c..09ec676f7 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2560,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
 private:
   llama_server_context llama;
-  //std::atomic<bool> model_loaded = false;
+  // std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
@@ -2572,5 +2572,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   bool caching_enabled;
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
+  std::atomic<bool> single_queue_is_busy; // This value only used under the
+                                          // condition n_parallel is 1
 };
 
 }; // namespace inferences

From d51089a0355f2966af4a9bfabe7913a53ee8f88f Mon Sep 17 00:00:00 2001
From: tikikun
Date: Wed, 17 Jan 2024 22:28:13 +0700
Subject: [PATCH 2/3] remove redundant include

---
 controllers/llamaCPP.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 4d2880349..a0100e917 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -2,7 +2,6 @@
 #include "llama.h"
 #include "log.h"
 #include "utils/nitro_utils.h"
-#include <thread>
 
 using namespace inferences;
 using json = nlohmann::json;

From 8edf8ae2003a8c4202c4c440cca3a0eefbbdfaa6 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Wed, 17 Jan 2024 22:31:28 +0700
Subject: [PATCH 3/3] sleep for 0.5s

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index a0100e917..3f7eb9dd0 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -303,7 +303,7 @@ void llamaCPP::chatCompletion(
         break;
       } else {
         std::this_thread::sleep_for(
-            std::chrono::milliseconds(100)); // Sleep for 500 milliseconds
+            std::chrono::milliseconds(500)); // Sleep for 500 milliseconds
       }
     }
   } else {
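
Note (not part of the patches above): together the three commits implement a single-slot "artificial queue" — when n_parallel is 1, a new chat completion spins on an atomic flag until the previous request releases it, and the streaming lambda clears the flag on every exit path. Below is a minimal stand-alone sketch of that gating idea, assuming C++11 <atomic>/<thread>; the helper names submit_request and finish_request are invented for illustration, and only single_queue_is_busy comes from the patch.

// Sketch: single-slot admission gate for one request at a time.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

// Mirrors the flag added to llamaCPP.h; false means the slot is free.
std::atomic<bool> single_queue_is_busy{false};

// Hypothetical helper: poll until the slot is free, then claim it.
int submit_request(int request_id) {
  while (single_queue_is_busy.load()) {
    // Same 500 ms poll interval that PATCH 3/3 settles on.
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }
  single_queue_is_busy.store(true);
  std::printf("request %d admitted\n", request_id);
  return request_id; // stands in for llama.request_completion(...)
}

// Hypothetical helper: release the slot, as the streaming lambda does when
// the connection closes, the state is stopped, or the stream finishes.
void finish_request(int request_id) {
  std::printf("request %d finished\n", request_id);
  single_queue_is_busy.store(false);
}

int main() {
  int id = submit_request(1);
  finish_request(id);
  return 0;
}

As in the patch, the check and the set of the flag are separate steps; a single compare_exchange_strong on the atomic would make the claim itself race-free if two handler threads ever reach the gate at the same time.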