Skip to content

Commit

Permalink
threads: add simple barrier test
Browse files Browse the repository at this point in the history
This test does lots of small, parallel matmul ops where the barriers in between dominate the overhead.
  • Loading branch information
max-krasnyansky committed Sep 16, 2024
1 parent ed094a5 commit c4411d5
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 0 deletions.
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-llama-grammar.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-grad0.cpp)
llama_target_and_test(test-barrier.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)

Expand Down
93 changes: 93 additions & 0 deletions tests/test-barrier.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#include "ggml.h"
#include "ggml-backend.h"

#include <chrono>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>

#define MAX_NARGS 2

int main(int argc, char *argv[]) {

int n_threads = 4;
int n_rounds = 100;

if (argc > 1) {
n_threads = std::atoi(argv[1]);
}

if (argc > 2) {
n_rounds = std::atoi(argv[2]);
}

struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};

struct ggml_context * ctx = ggml_init(params);

// Create graph
struct ggml_cgraph * gf = ggml_new_graph(ctx);

// Lots of small, parallel ops where barriers in between will dominate
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
for (int i = 0; i < 1000; i++) {
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
out = ggml_mul_mat(ctx, a, out);

struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
out = ggml_mul_mat(ctx, d, out);
}

ggml_build_forward_expand(gf, out);
int n_nodes = ggml_graph_n_nodes(gf);

// Create threadpool
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
exit(1);
}

// Create compute plan
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);

std::vector<uint8_t> work_data(cplan.work_size);
cplan.work_data = work_data.data();

std::cerr << "graph-compute with"
<< "\n n_threads: " << n_threads
<< "\n n_nodes: " << n_nodes
<< "\n n_rounds: " << n_rounds
<< "\n";
// ggml_graph_print(gf);

// Warmup
ggml_graph_compute(gf, &cplan);

auto t0 = std::chrono::high_resolution_clock::now();

for (int i=0; i < n_rounds; i++) {
ggml_graph_compute(gf, &cplan);
}

auto t1 = std::chrono::high_resolution_clock::now();

auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
std::cerr << "graph-compute took " << usec << " usec "
<< "\n " << (float) usec / n_rounds << " usec per-iter"
<< "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
<< "\n";

ggml_free(ctx);
ggml_threadpool_free(threadpool);

return 0;
}

0 comments on commit c4411d5

Please sign in to comment.