threadpool : skip polling for unused threads (ggerganov#9461)
* threadpool: skip polling for unused threads

  Currently all threads do N polling rounds even if only 1 thread is active (n_threads_cur == 1). This commit adds a check to skip the polling for unused threads (ith >= n_threads_cur). n_threads_cur is now an atomic_int to explicitly tell the thread sanitizer that it is written from one thread and read from other threads (not a race condition). A hedged sketch of this check follows this list.

* threadpool: further simplify and improve ggml_barrier

  Avoid using strict memory order while polling, yet make sure that all threads go through a full memory barrier (memory fence) on ggml_barrier entrance and exit. An illustrative sketch of this pattern appears just before the test source below.

* threads: add simple barrier test

  This test does lots of small, parallel matmul ops where the barriers in between dominate the overhead.

* threadpool: improve thread sync for new graphs

  Using the same tricks as ggml_barrier: all the polling is done with relaxed memory order to keep it efficient; once the new graph is detected, we do a full fence using a read-modify-write with strict memory order.

* threadpool: improve abort handling

  Do not use threadpool->ec (exit code) to decide whether to exit the compute loop. threadpool->ec is not atomic, which makes the thread sanitizer rightfully unhappy about it. Instead, introduce an atomic threadpool->abort flag for this. This is consistent with how we handle threadpool->stop or pause. While at it, add an explicit atomic_load for n_threads_cur for consistency.

* test-barrier: release the threadpool before releasing the context

  Fixes a use-after-free detected by the gcc thread sanitizer on x86-64; for some reason the llvm sanitizer does not detect this issue.
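The threadpool changes themselves (the skip check, the atomic n_threads_cur, and the atomic abort flag) live in ggml's compute-loop code, which is not part of the diff shown below. As a rough, hypothetical illustration only, with made-up names (demo_threadpool, poll_for_work, n_polling_rounds) rather than the real ggml identifiers, the checks described above might look like this:

// NOTE: hypothetical sketch, not the actual ggml threadpool code.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct demo_threadpool {
    std::atomic<int>  n_threads_cur{1};   // written by one thread, read by all -> atomic
    std::atomic<bool> abort_flag{false};  // stands in for the commit's atomic abort flag
    std::atomic<int>  n_graph{0};         // bumped when a new graph is submitted
};

// Returns true if thread `ith` should start computing the current graph.
static bool poll_for_work(demo_threadpool & tp, int ith, int last_graph) {
    const int n_polling_rounds = 1000;
    for (int r = 0; r < n_polling_rounds; r++) {
        // unused threads (ith >= n_threads_cur) skip the polling rounds entirely
        if (ith >= tp.n_threads_cur.load(std::memory_order_relaxed)) {
            return false;
        }
        // the abort flag is atomic, so the thread sanitizer sees no data race here
        if (tp.abort_flag.load(std::memory_order_relaxed)) {
            return false;
        }
        // relaxed polling for a new graph; upgrade to a full fence once detected
        if (tp.n_graph.load(std::memory_order_relaxed) != last_graph) {
            std::atomic_thread_fence(std::memory_order_seq_cst);
            return true;
        }
        std::this_thread::yield();
    }
    return false;
}

int main() {
    demo_threadpool tp;
    tp.n_threads_cur.store(2, std::memory_order_relaxed);
    tp.n_graph.store(1, std::memory_order_relaxed); // pretend a new graph was submitted

    std::vector<std::thread> workers;
    for (int ith = 0; ith < 4; ith++) {
        workers.emplace_back([&tp, ith] {
            bool got_work = poll_for_work(tp, ith, /*last_graph=*/0);
            std::printf("thread %d: %s\n", ith, got_work ? "computes" : "skips polling");
        });
    }
    for (auto & w : workers) {
        w.join();
    }
    return 0;
}

In the real threadpool an unused thread would go on to wait rather than simply return; the sketch only shows the two atomic checks the commit message describes.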
1 parent a611cbc · commit 87163c3
Showing 3 changed files with 168 additions and 48 deletions.
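For the ggml_barrier and new-graph sync items in the commit message above, the pattern being described is: spin on relaxed atomic loads while waiting, but guarantee a full memory fence on barrier entrance and exit (a seq_cst read-modify-write for the arriving and releasing threads, an explicit seq_cst fence for the waiters). Below is a minimal self-contained sketch of that pattern, using assumed names (demo_barrier, barrier_wait) rather than the actual ggml implementation:

// NOTE: illustrative sketch of the described barrier pattern, not ggml_barrier itself.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct demo_barrier {
    std::atomic<int> n_arrived{0};  // threads currently inside the barrier
    std::atomic<int> n_passed{0};   // generation counter, bumped by the last thread
};

static void barrier_wait(demo_barrier & b, int n_threads) {
    const int passed_old = b.n_passed.load(std::memory_order_relaxed);

    // entrance: a seq_cst read-modify-write acts as a full memory fence
    if (b.n_arrived.fetch_add(1, std::memory_order_seq_cst) == n_threads - 1) {
        // last thread to arrive: reset the counter and release everyone
        b.n_arrived.store(0, std::memory_order_relaxed);
        // exit: another seq_cst read-modify-write, again a full fence
        b.n_passed.fetch_add(1, std::memory_order_seq_cst);
        return;
    }

    // waiters poll with relaxed loads to keep the spin cheap ...
    while (b.n_passed.load(std::memory_order_relaxed) == passed_old) {
        std::this_thread::yield();
    }
    // ... and issue an explicit full fence on the way out
    std::atomic_thread_fence(std::memory_order_seq_cst);
}

int main() {
    const int n_threads = 4;
    demo_barrier bar;
    std::vector<std::thread> workers;
    for (int ith = 0; ith < n_threads; ith++) {
        workers.emplace_back([&bar, ith] {
            for (int step = 0; step < 3; step++) {
                // ... per-thread work for this step would go here ...
                barrier_wait(bar, n_threads);
                std::printf("thread %d passed barrier %d\n", ith, step);
            }
        });
    }
    for (auto & w : workers) {
        w.join();
    }
    return 0;
}

The generation counter (n_passed) is what lets the waiters poll with relaxed loads: they only need to notice that the value changed, and the fence on the way out restores the ordering guarantees a strict spin loop would have provided.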
New file (+93 lines): the simple barrier test described in the commit message.
#include "ggml.h"
#include "ggml-backend.h"

#include <chrono>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>

#define MAX_NARGS 2

int main(int argc, char *argv[]) {

    int n_threads = 4;
    int n_rounds = 100;

    if (argc > 1) {
        n_threads = std::atoi(argv[1]);
    }

    if (argc > 2) {
        n_rounds = std::atoi(argv[2]);
    }

    struct ggml_init_params params = {
        /* .mem_size = */ 1024*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc = */ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // Create graph
    struct ggml_cgraph * gf = ggml_new_graph(ctx);

    // Lots of small, parallel ops where barriers in between will dominate
    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    for (int i = 0; i < 1000; i++) {
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
        out = ggml_mul_mat(ctx, a, out);

        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
        out = ggml_mul_mat(ctx, d, out);
    }

    ggml_build_forward_expand(gf, out);
    int n_nodes = ggml_graph_n_nodes(gf);

    // Create threadpool
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
    if (!threadpool) {
        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
        exit(1);
    }

    // Create compute plan
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);

    std::vector<uint8_t> work_data(cplan.work_size);
    cplan.work_data = work_data.data();

    std::cerr << "graph-compute with"
              << "\n n_threads: " << n_threads
              << "\n n_nodes: " << n_nodes
              << "\n n_rounds: " << n_rounds
              << "\n";
    // ggml_graph_print(gf);

    // Warmup
    ggml_graph_compute(gf, &cplan);

    auto t0 = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < n_rounds; i++) {
        ggml_graph_compute(gf, &cplan);
    }

    auto t1 = std::chrono::high_resolution_clock::now();

    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
    auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cerr << "graph-compute took " << usec << " usec "
              << "\n " << (float) usec / n_rounds << " usec per-iter"
              << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
              << "\n";

    ggml_threadpool_free(threadpool);
    ggml_free(ctx);

    return 0;
}
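Once built against ggml (the binary name test-barrier is assumed here, following the commit message), the test takes the thread count and round count as optional positional arguments, for example:

    ./test-barrier 8 200

It prints the total time, the average time per iteration, and the average time per graph node to stderr, which makes it straightforward to compare barrier overhead before and after the threadpool changes.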