From 323181f2abb7ab9f03f03b8ee160efe66f5624eb Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Sun, 11 Aug 2024 11:20:32 -0700
Subject: [PATCH] threadpool: add support for hybrid polling

The poll params (--poll, ...) now specify a "polling level", i.e. how
aggressively we poll before waiting on the cond.var: poll=0 means no
polling, 1 means poll for 128K rounds then wait, 2 means 256K rounds,
and so on.

The default of 50 (i.e. 50x128K rounds) seems to work well across
modern platforms. We can tune this further as things evolve.
---
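Note (not part of the commit; illustration only): a rough standalone C sketch of how a
polling level maps to busy-wait rounds before a worker falls back to the blocking
cond.var wait. The helper name below is hypothetical and simply mirrors the
128K-rounds-per-level factor used in ggml_graph_compute_poll_for_work():

    #include <stdint.h>
    #include <stdio.h>

    // Hypothetical helper (not a ggml API): level 0 disables polling entirely,
    // every level above that adds 128K busy-wait rounds before blocking.
    static uint64_t poll_level_to_rounds(uint32_t poll) {
        return 1024ULL * 128 * poll;
    }

    int main(void) {
        const uint32_t levels[] = { 0, 1, 50, 100 };
        for (size_t i = 0; i < sizeof(levels)/sizeof(levels[0]); i++) {
            printf("poll=%3u -> %llu spin rounds before waiting on the cond.var\n",
                   (unsigned) levels[i],
                   (unsigned long long) poll_level_to_rounds(levels[i]));
        }
        return 0;
    }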
 common/common.cpp   |  2 +-
 common/common.h     |  2 +-
 ggml/include/ggml.h | 14 ++++----
 ggml/src/ggml.c     | 86 +++++++++++++++++++++++++-------------------
 4 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index df6e1624ef7b5..3a202be1b73d8 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "-Cr,  --cpu-range lo-hi",       "range of CPUs for affinity. Complements --cpu-mask"});
     options.push_back({ "*",           "      --cpu-strict <0|1>",      "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
     options.push_back({ "*",           "      --priority N",            "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
-    options.push_back({ "*",           "      --poll <0|1>",            "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
+    options.push_back({ "*",           "      --poll <0...100>",        "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
     options.push_back({ "*",           "-tb,  --threads-batch N",       "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "*",           "-Cb,  --cpu-mask-batch M",      "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
     options.push_back({ "*",           "-Crb, --cpu-range-batch lo-hi",
diff --git a/common/common.h b/common/common.h
index b0c32f949b273..eb87e8880ffa5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -73,7 +73,7 @@ struct cpu_params {
     bool      mask_valid   = false;   // Default: any CPU
     int32_t   priority     = 0;       // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     bool      strict_cpu   = false;   // Use strict CPU placement
-    bool      poll         = true;    // Use polling (busywait) to wait for work (default matches OpenMP)
+    uint32_t  poll         = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
 
 struct gpt_params {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 923182d9d9710..910171c07c00c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -625,13 +625,13 @@ extern "C" {
     typedef bool (*ggml_abort_callback)(void * data);
 
     struct ggml_threadpool_params {
-        bool                cpumask[GGML_MAX_N_THREADS];
-        bool                mask_specified;
-        int32_t             n_threads;
-        int32_t             prio;
-        bool                poll;
-        bool                strict_cpu;
-        bool                paused;
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
+        bool                mask_specified;              // mask is non-empty
+        int32_t             n_threads;                   // number of threads
+        int32_t             prio;                        // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
     };
 
     struct ggml_compute_threadpool;     // forward declaration, see ggml.c
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 308e569856c70..f0c0f4fb0a29a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool {
 
     int32_t  prio;        // Scheduling priority
     bool     disposable;  // Doesn't initialize a conv-var
-    bool     poll;        // Use polling (busywait) // TODO
+    uint32_t poll;        // Polling level (0 - no polling)
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -19235,35 +19235,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return 0;
 }
 
+#ifndef GGML_USE_OPENMP
+static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
+}
 
-#ifndef GGML_USE_OPENMP
+static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    if (threadpool->stop || threadpool->pause) return true;
+    return ggml_graph_compute_got_work(state);
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+
+    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
+    // Perhaps, we can adjust it dynamically based on load and things.
+    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+    for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
+        __cpu_relax();
+    }
+
+    return ggml_graph_compute_ready(state);
+}
+
 static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
     struct ggml_compute_threadpool * threadpool = state->threadpool;
 
-    if (threadpool->poll) {
-        while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
-                  threadpool->stop ||
-                  threadpool->pause
-                )
-              ) {
-            // No new work. Yield and keep polling.
-            __cpu_relax();
-        }
-    } else {
-        ggml_mutex_lock_shared(&threadpool->mutex);
-        while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
-                  threadpool->stop ||
-                  threadpool->pause
-                )
-              ) {
-            // No new work. Wait for the signal.
-            ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
-        }
-        ggml_mutex_unlock_shared(&threadpool->mutex);
+    if (ggml_graph_compute_poll_for_work(state)) {
+        return ggml_graph_compute_got_work(state);
     }
-    return threadpool->new_work;
+
+    ggml_mutex_lock_shared(&threadpool->mutex);
+    while (!ggml_graph_compute_ready(state)) {
+        // No new work. Wait for the signal.
+        GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
+        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+    }
+    ggml_mutex_unlock_shared(&threadpool->mutex);
+
+    return ggml_graph_compute_got_work(state);
 }
 
 static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
@@ -19483,24 +19498,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         __thread_affinity(threadpool->workers[0].cpumask);
     }
 
-    if (!threadpool->poll) {
-        ggml_mutex_lock(&threadpool->mutex);
-        threadpool->new_work = true;
-        if (threadpool->pause) {
-            __ggml_resume_threadpool(threadpool);
-        } else {
-            ggml_cond_broadcast(&threadpool->cond);
-        }
-        ggml_mutex_unlock(&threadpool->mutex);
+    // always take the mutex here because the worker threads are doing hybrid poll/wait
+
+    ggml_mutex_lock(&threadpool->mutex);
+    threadpool->new_work = true;
+    if (!threadpool->pause) {
+        ggml_cond_broadcast(&threadpool->cond);
     } else {
-        threadpool->new_work = true;
-        if (threadpool->pause) {
-            ggml_mutex_lock(&threadpool->mutex);
-            __ggml_resume_threadpool(threadpool);
-            ggml_mutex_unlock(&threadpool->mutex);
-        }
+        // resume does cond broadcast
+        __ggml_resume_threadpool(threadpool);
     }
+    ggml_mutex_unlock(&threadpool->mutex);
+
     // this is a work thread too
     ggml_graph_compute_thread(&threadpool->workers[0]);
 #endif