ggml : remove ggml_cplan + rework ggml_cgraph
ggml-ci
ggerganov committed Sep 11, 2024
1 parent 92a9686 commit 28aa818
Showing 10 changed files with 272 additions and 211 deletions.
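
In short, the single-shot plan API is replaced by a prepare/init/compute/free sequence carried by the graph itself. A minimal before/after sketch of the migration, based solely on the changes below (error handling omitted):

// before this commit: the caller managed a separate ggml_cplan and work buffer
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads, nullptr);
std::vector<uint8_t> buf(plan.work_size);
plan.work_data = buf.data();
ggml_graph_compute(gf, &plan);

// after this commit: the graph carries its own plan state
ggml_graph_prepare  (gf, n_threads, nullptr); // determine work size + thread count
ggml_graph_work_init(gf, nullptr);            // nullptr -> work buffer is heap-allocated
ggml_graph_compute  (gf);
ggml_graph_work_free(gf);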
28 changes: 12 additions & 16 deletions examples/baby-llama/baby-llama.cpp
@@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static struct ggml_tensor * randomize_tensor(
         struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
 ) {
@@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
-    std::vector<uint8_t> work_buffer;
-
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size   =*/ compute_size,
@@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 
@@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
 
@@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
 
         ggml_build_forward_expand(gf, logits);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
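
If an example ever wants the removed one-liner back, an equivalent helper on top of the new API could look like the sketch below (hypothetical, not part of this commit):

// hypothetical stand-in for the removed ggml_graph_compute_helper()
static enum ggml_status graph_compute_once(struct ggml_cgraph * gf, int n_threads) {
    ggml_graph_prepare(gf, n_threads, nullptr);
    ggml_graph_work_init(gf, nullptr);             // heap-allocated work buffer
    const enum ggml_status status = ggml_graph_compute(gf);
    ggml_graph_work_free(gf);                      // required: the buffer is not ctx-backed
    return status;
}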
23 changes: 7 additions & 16 deletions examples/benchmark/benchmark-matmult.cpp
@@ -20,17 +20,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
@@ -179,9 +168,8 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
+    ggml_graph_work_init(gf, nullptr);
 
     TENSOR_DUMP(ggml_graph_node(gf, 0));
 
@@ -234,7 +222,7 @@ int main(int argc, char ** argv) {
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute(gf31);
 
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
@@ -267,8 +255,11 @@ int main(int argc, char ** argv) {
         }
 
         // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute(gf32);
     }
+
+    ggml_graph_work_free(gf);
+
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
     printf("=====================================================================================\n");
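
The benchmark now prepares the graph and allocates the work buffer once, outside the timing loop, and frees them once at the end; only ggml_graph_compute() runs per iteration. Condensed to a single graph for clarity (a sketch of the pattern, not the literal benchmark code):

ggml_graph_prepare  (gf, n_threads, nullptr);
ggml_graph_work_init(gf, nullptr);      // one allocation, reused by every iteration
for (int i = 0; i < n_iterations; i++) {
    ggml_graph_compute(gf);             // timed region: no per-iteration planning or allocation
}
ggml_graph_work_free(gf);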
4 changes: 3 additions & 1 deletion examples/llava/llava.cpp
@@ -183,7 +183,9 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+    ggml_graph_prepare(gf, 1, nullptr);
+    ggml_graph_work_init(gf, model.ctx);
+    ggml_graph_compute(gf);
     struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
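
Here the work buffer is placed inside model.ctx, so there is no matching ggml_graph_work_free(): the memory is reclaimed when the context is freed. A sketch of a defensive variant (assuming, as the return type suggests, that ggml_graph_work_init() reports allocation failure via ggml_status):

ggml_graph_prepare(gf, 1, nullptr);
if (ggml_graph_work_init(gf, model.ctx) != GGML_STATUS_SUCCESS) {
    // context too small for the work buffer -> fall back to a heap allocation
    ggml_graph_work_init(gf, nullptr);  // must then be paired with ggml_graph_work_free()
}
ggml_graph_compute(gf);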
82 changes: 51 additions & 31 deletions ggml/include/ggml.h
@@ -644,20 +644,6 @@ extern "C" {
 
     typedef struct ggml_threadpool * ggml_threadpool_t;
 
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -2068,23 +2054,57 @@ extern "C" {
     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
     GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
-                    struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API struct ggml_threadpool * ggml_threadpool_new           (struct ggml_threadpool_params * params);
+    GGML_API void                     ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_API int                      ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void                     ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_API void                     ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+
+    // loops through the graph and determines:
+    //
+    //  - work size needed for CPU computation
+    //  - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+                struct ggml_cgraph * cgraph,
+                               int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+            struct ggml_threadpool * threadpool /* = NULL */ );
+
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void             ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //  - no dynamic allocations:
+    //
+    //      ... prepare ggml_context ctx ...
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, ctx);
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //  - dynamic allocations:
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
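
The abort callback now travels with the graph instead of with a per-call plan. A sketch of how it might be wired up (the deadline logic is illustrative only; the callback returns true to abort, per the comment in ggml_cgraph):

static bool abort_when_past_deadline(void * data) {
    const int64_t deadline_us = *(const int64_t *) data;
    return ggml_time_us() > deadline_us; // true -> abort ggml_graph_compute()
}

// ...
int64_t deadline_us = ggml_time_us() + 5*1000*1000; // 5 second budget
ggml_graph_set_abort_callback(gf, abort_when_past_deadline, &deadline_us);
if (ggml_graph_compute(gf) == GGML_STATUS_ABORTED) {
    // computation was cut short; the graph can be computed again later
}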
37 changes: 20 additions & 17 deletions ggml/src/ggml-backend.c
@@ -751,8 +751,10 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
     GGML_UNUSED(backend);
 }
 
+// TODO: this struct should no longer be needed
+//       instead, the new ggml_graph_work_init() + ggml_graph_work_free() API should be enough to replace this
+//       for now, keeping the implementation as it is, to avoid making a mistake
 struct ggml_backend_plan_cpu {
-    struct ggml_cplan  cplan;
     struct ggml_cgraph cgraph;
 };
 
@@ -761,27 +763,27 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
             free(cpu_plan);
             return NULL;
         }
     }
 
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;
 
     return cpu_plan;
 }
 
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    free(cpu_plan->cplan.work_data);
+    free(cpu_plan->cgraph.work_data);
     free(cpu_plan);
 
     GGML_UNUSED(backend);
@@ -790,31 +792,32 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph);
 
     GGML_UNUSED(backend);
 }
 
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_ctx->work_size < cplan.work_size) {
+    if (cpu_ctx->work_size < cgraph->work_size) {
         free(cpu_ctx->work_data);
-        cpu_ctx->work_data = malloc(cplan.work_size);
+        cpu_ctx->work_data = malloc(cgraph->work_size);
         if (cpu_ctx->work_data == NULL) {
             cpu_ctx->work_size = 0;
             return GGML_STATUS_ALLOC_FAILED;
         }
-        cpu_ctx->work_size = cplan.work_size;
+        cpu_ctx->work_size = cgraph->work_size;
     }
-    cplan.work_data = cpu_ctx->work_data;
+    cgraph->work_data = cpu_ctx->work_data;
+    cgraph->work_own  = false; // the buffer is owned and freed by the CPU backend context
 
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cgraph->abort_callback      = cpu_ctx->abort_callback;
+    cgraph->abort_callback_data = cpu_ctx->abort_callback_data;
 
-    return ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
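
Note that the backend-level entry points keep their signatures; only the internals switch from ggml_cplan to the graph's own fields. A caller still drives the CPU backend the same way (names from ggml-backend.h; error checks omitted):

ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, gf);
ggml_backend_graph_plan_compute(backend, plan);  // may be invoked repeatedly
ggml_backend_graph_plan_free(backend, plan);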
11 changes: 11 additions & 0 deletions ggml/src/ggml-impl.h
@@ -773,6 +773,17 @@ struct ggml_cgraph {
     struct ggml_hash_set visited_hash_set;
 
     enum ggml_cgraph_eval_order order;
+
+    bool      work_own;  // true if work_data was heap-allocated by `ggml_graph_work_init()` and must be freed with `ggml_graph_work_free()`
+    size_t    work_size; // size of work buffer, calculated by `ggml_graph_prepare()`
+    uint8_t * work_data; // work buffer, allocated by `ggml_graph_work_init()`
+
+    int n_threads;
+    struct ggml_threadpool * threadpool;
+
+    // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
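
The new work_own flag records whether the graph allocated work_data itself (ggml_graph_work_init() with ctx == NULL) or borrowed it, as the CPU backend does above. A plausible shape for ggml_graph_work_free() under that convention (a sketch; the real implementation lives in ggml.c, which is not part of this excerpt):

void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
    if (cgraph->work_own) {
        free(cgraph->work_data); // only free buffers the graph allocated itself
    }
    cgraph->work_own  = false;
    cgraph->work_data = NULL;
    cgraph->work_size = 0;
}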
(Diffs for the remaining 4 changed files were not loaded in this view.)