Skip to content

Commit

Permalink
threadpool: improve thread sync for new-graphs
Browse files Browse the repository at this point in the history
Using the same tricks as ggml_barrier. All the polling is done with relaxed memory order
to keep it efficient; once a new graph is detected, we perform a full fence via a
read-modify-write with seq-cst memory order.
  • Loading branch information
max-krasnyansky committed Sep 16, 2024
1 parent c4411d5 commit b9763b3
Showing 1 changed file with 15 additions and 2 deletions.
17 changes: 15 additions & 2 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -19963,12 +19963,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

#ifndef GGML_USE_OPENMP

// Returns true when this worker thread participates in the current graph,
// i.e. its index is below the currently active thread count.
// The relaxed load is intentional: this is called from a polling loop and a
// momentarily stale value is tolerated (the kickoff path provides ordering).
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
    const int active = atomic_load_explicit(&state->threadpool->n_threads_cur, memory_order_relaxed);
    return state->ith < active;
}

// check if thread is ready to proceed (exit from polling or sleeping)
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;

Expand All @@ -19984,6 +19986,14 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
return state->pending;
}

// Establish a full ordering point after polling has detected a new graph.
// A plain atomic_thread_fence(memory_order_seq_cst) would be the natural
// choice, but it confuses thread-sanitizer, so we issue a no-op seq-cst
// read-modify-write on n_graph instead; it pairs with the seq-cst increment
// performed by the kickoff path.
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
}

static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;

Expand All @@ -20008,6 +20018,7 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
struct ggml_threadpool * threadpool = state->threadpool;

if (ggml_graph_compute_poll_for_work(state)) {
ggml_graph_compute_thread_sync(state);
return state->pending;
}

Expand Down Expand Up @@ -20063,7 +20074,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
// Start processing new graph
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
{
// always take the mutex here because the worker threads are doing hybrid poll/wait
// Always take the mutex here because the worker threads are doing hybrid poll/wait

ggml_mutex_lock(&threadpool->mutex);

Expand All @@ -20072,7 +20083,9 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
// Update the number of active threads
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);

atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
// Indicate the graph is ready to be processed
// We need the full seq-cst fence here because of the polling threads (used in thread_sync)
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);

if (threadpool->pause) {
// Update main thread prio and affinity to match the threadpool settings
Expand Down

0 comments on commit b9763b3

Please sign in to comment.