threadpool: use relaxed order for chunk sync

A full memory barrier is overkill here, since each thread works on a different chunk.
ggerganov · Aug 22, 2024 · 6b22b53 · 6b22b53
1 parent d90a9c3
commit 6b22b53
Showing 1 changed file with 6 additions and 2 deletions.
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -88,6 +88,10 @@ typedef enum {
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
+static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
+    // TODO: add support for explicit memory order; `mo` is ignored for now, so this is identical to atomic_store()
+    InterlockedExchange(ptr, val);
+}
 static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
@@ -12471,7 +12475,7 @@ UseGgmlGemm1:;
 
     if (ith == 0) {
         // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        atomic_store(&params->threadpool->current_chunk, nth);
+        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
     }
 
     ggml_barrier(params->threadpool);
@@ -12582,7 +12586,7 @@ UseGgmlGemm2:;
             break;
         }
 
-        current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1);
+        current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
     }
 }