From 323181f2abb7ab9f03f03b8ee160efe66f5624eb Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Sun, 11 Aug 2024 11:20:32 -0700
Subject: [PATCH] threadpool: add support for hybrid polling

The poll params (--poll, ...) now specify a "polling level", i.e. how
aggressively we poll before waiting on the cond.var: poll=0 means no
polling, 1 means poll for 128K rounds then wait, 2 means 256K rounds,
and so on.

The default of 50 (i.e. 50x128K rounds) seems to work well across
modern platforms. We can tune this further as things evolve.
---
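Note (not part of the commit; illustration only): a rough standalone C sketch of how a
polling level maps to busy-wait rounds before a worker falls back to the blocking
cond.var wait. The helper name below is hypothetical and simply mirrors the
128K-rounds-per-level factor used in ggml_graph_compute_poll_for_work():

    #include <stdint.h>
    #include <stdio.h>

    // Hypothetical helper (not a ggml API): level 0 disables polling entirely,
    // every level above that adds 128K busy-wait rounds before blocking.
    static uint64_t poll_level_to_rounds(uint32_t poll) {
        return 1024ULL * 128 * poll;
    }

    int main(void) {
        const uint32_t levels[] = { 0, 1, 50, 100 };
        for (size_t i = 0; i < sizeof(levels)/sizeof(levels[0]); i++) {
            printf("poll=%3u -> %llu spin rounds before waiting on the cond.var\n",
                   (unsigned) levels[i],
                   (unsigned long long) poll_level_to_rounds(levels[i]));
        }
        return 0;
    }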
 common/common.cpp   |  2 +-
 common/common.h     |  2 +-
 ggml/include/ggml.h | 14 ++++----
 ggml/src/ggml.c     | 86 +++++++++++++++++++++++++-------------------
 4 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index df6e1624ef7b5..3a202be1b73d8 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "-Cr,  --cpu-range lo-hi",       "range of CPUs for affinity. Complements --cpu-mask"});
     options.push_back({ "*",           "      --cpu-strict <0|1>",      "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
     options.push_back({ "*",           "      --priority N",            "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
-    options.push_back({ "*",           "      --poll <0|1>",            "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
+    options.push_back({ "*",           "      --poll <0...100>",        "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
     options.push_back({ "*",           "-tb,  --threads-batch N",       "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "*",           "-Cb,  --cpu-mask-batch M",      "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
     options.push_back({ "*",           "-Crb, --cpu-range-batch lo-hi",
diff --git a/common/common.h b/common/common.h
index b0c32f949b273..eb87e8880ffa5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -73,7 +73,7 @@ struct cpu_params {
     bool      mask_valid   = false;   // Default: any CPU
     int32_t   priority     = 0;       // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     bool      strict_cpu   = false;   // Use strict CPU placement
-    bool      poll         = true;    // Use polling (busywait) to wait for work (default matches OpenMP)
+    uint32_t  poll         = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
 
 struct gpt_params {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 923182d9d9710..910171c07c00c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -625,13 +625,13 @@ extern "C" {
     typedef bool (*ggml_abort_callback)(void * data);
 
     struct ggml_threadpool_params {
-        bool                cpumask[GGML_MAX_N_THREADS];
-        bool                mask_specified;
-        int32_t             n_threads;
-        int32_t             prio;
-        bool                poll;
-        bool                strict_cpu;
-        bool                paused;
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
+        bool                mask_specified;              // mask is non-empty
+        int32_t             n_threads;                   // number of threads
+        int32_t             prio;                        // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
     };
 
     struct ggml_compute_threadpool;     // forward declaration, see ggml.c
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 308e569856c70..f0c0f4fb0a29a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool {
 
     int32_t  prio;        // Scheduling priority
     bool     disposable;  // Doesn't initialize a conv-var
-    bool     poll;        // Use polling (busywait) // TODO
+    uint32_t poll;        // Polling level (0 - no polling)
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -19235,35 +19235,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return 0;
 }
 
+#ifndef GGML_USE_OPENMP
+static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
+}
 
-#ifndef GGML_USE_OPENMP
+static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    if (threadpool->stop || threadpool->pause) return true;
+    return ggml_graph_compute_got_work(state);
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+
+    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
+    // Perhaps, we can adjust it dynamically based on load and things.
+    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+    for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
+        __cpu_relax();
+    }
+
+    return ggml_graph_compute_ready(state);
+}
+
 static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
     struct ggml_compute_threadpool * threadpool = state->threadpool;
 
-    if (threadpool->poll) {
-        while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
-                  threadpool->stop ||
-                  threadpool->pause
-                )
-              ) {
-            // No new work. Yield and keep polling.
-            __cpu_relax();
-        }
-    } else {
-        ggml_mutex_lock_shared(&threadpool->mutex);
-        while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
-                  threadpool->stop ||
-                  threadpool->pause
-                )
-              ) {
-            // No new work. Wait for the signal.
-            ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
-        }
-        ggml_mutex_unlock_shared(&threadpool->mutex);
+    if (ggml_graph_compute_poll_for_work(state)) {
+        return ggml_graph_compute_got_work(state);
     }
-    return threadpool->new_work;
+
+    ggml_mutex_lock_shared(&threadpool->mutex);
+    while (!ggml_graph_compute_ready(state)) {
+        // No new work. Wait for the signal.
+        GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
+        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+    }
+    ggml_mutex_unlock_shared(&threadpool->mutex);
+
+    return ggml_graph_compute_got_work(state);
 }
 
 static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
@@ -19483,24 +19498,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         __thread_affinity(threadpool->workers[0].cpumask);
     }
 
-    if (!threadpool->poll) {
-        ggml_mutex_lock(&threadpool->mutex);
-        threadpool->new_work = true;
-        if (threadpool->pause) {
-            __ggml_resume_threadpool(threadpool);
-        } else {
-            ggml_cond_broadcast(&threadpool->cond);
-        }
-        ggml_mutex_unlock(&threadpool->mutex);
+    // always take the mutex here because the worker threads are doing hybrid poll/wait
+
+    ggml_mutex_lock(&threadpool->mutex);
+    threadpool->new_work = true;
+    if (!threadpool->pause) {
+        ggml_cond_broadcast(&threadpool->cond);
     } else {
-        threadpool->new_work = true;
-        if (threadpool->pause) {
-            ggml_mutex_lock(&threadpool->mutex);
-            __ggml_resume_threadpool(threadpool);
-            ggml_mutex_unlock(&threadpool->mutex);
-        }
+        // resume does cond broadcast
+        __ggml_resume_threadpool(threadpool);
     }
+    ggml_mutex_unlock(&threadpool->mutex);
+
     // this is a work thread too
     ggml_graph_compute_thread(&threadpool->workers[0]);
 #endif