From cc1c017191b70f6f888fb3c6730ee8a3f0dba595 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 09:11:42 +0300 Subject: [PATCH] naming : normalize the name of callback-related identifiers ggml-ci --- common/common.cpp | 2 +- common/common.h | 4 +- .../cvector-generator/cvector-generator.cpp | 26 ++--- examples/eval-callback/eval-callback.cpp | 26 ++--- examples/imatrix/imatrix.cpp | 2 +- ggml/include/ggml-backend.h | 10 +- ggml/include/ggml-metal.h | 4 +- ggml/include/ggml.h | 14 +-- ggml/src/ggml-backend.c | 52 ++++----- ggml/src/ggml-metal.m | 90 +++++++-------- ggml/src/ggml.c | 42 +++---- include/llama.h | 39 +++---- src/llama.cpp | 104 +++++++++--------- tests/test-model-load-cancel.cpp | 2 +- 14 files changed, 202 insertions(+), 215 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 8d0ed4f95a737..dc571037c396a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1018,7 +1018,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.attention_type = params.attention_type; cparams.defrag_thold = params.defrag_thold; cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.cb_eval_ctx = params.cb_eval_ctx; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; diff --git a/common/common.h b/common/common.h index e100c8fa73ecd..e882890133436 100644 --- a/common/common.h +++ b/common/common.h @@ -173,8 +173,8 @@ struct gpt_params { struct cpu_params draft_cpuparams; struct cpu_params draft_cpuparams_batch; - ggml_backend_sched_eval_callback cb_eval = nullptr; - void * cb_eval_user_data = nullptr; + ggml_backend_sched_eval_callback cb_eval = nullptr; + void * cb_eval_ctx = nullptr; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 41bf4eb2a406c..c29b4a83738a9 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -50,7 +50,7 @@ static void print_usage(int, char ** argv) { // cb_eval is reused for each pair of positive - negative prompt -struct callback_data { +struct callback_context { ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered int n_layers = 0; @@ -155,7 +155,7 @@ struct callback_data { return diff_filtered; } - // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors + // we don't implement destructor, because we want to reuse callback_context. 
we just want to free the tensors void reset() { for (auto ptr : v_pos) free(ptr->data); for (auto ptr : v_neg) free(ptr->data); @@ -320,7 +320,7 @@ static std::vector ctrlvec_load_prompt_file(std::string path, bool ////////////////////////////////////////////////// static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; + auto * cb_ctx = (callback_context *) user_data; static const char * l_out_name = "l_out"; const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; @@ -328,12 +328,12 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { return is_l_out; } - if (!is_l_out || t->ne[1] != cb_data->n_tokens) { + if (!is_l_out || t->ne[1] != cb_ctx->n_tokens) { return true; } // save the tensor to current context - cb_data->save_tensor_for_layer(t); + cb_ctx->save_tensor_for_layer(t); return true; } @@ -400,12 +400,12 @@ int main(int argc, char ** argv) { } - callback_data cb_data; + callback_context cb_ctx; // pass the callback to the backend scheduler // it will be executed for each node during the graph computation params.cb_eval = cb_eval; - params.cb_eval_user_data = &cb_data; + params.cb_eval_ctx = &cb_ctx; params.warmup = false; print_build_info(); @@ -445,8 +445,8 @@ int main(int argc, char ** argv) { for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { bool success = false; tokenized_prompt t = tokenized_prompts[i]; - cb_data.n_layers = n_layers; - cb_data.n_tokens = t.max_seq_len; + cb_ctx.n_layers = n_layers; + cb_ctx.n_tokens = t.max_seq_len; printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", (int) i+1, (int) ctx_train.positive_entries.size(), @@ -454,22 +454,22 @@ int main(int argc, char ** argv) { tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), (int) t.max_seq_len); - cb_data.is_eval_pos = true; + cb_ctx.is_eval_pos = true; success = get_hidden_layers(ctx, t.tokens_pos); if (!success) break; - cb_data.is_eval_pos = false; + cb_ctx.is_eval_pos = false; success = get_hidden_layers(ctx, t.tokens_neg); if (!success) break; // calculate diff and remove all zero rows - auto v_diff_filtered = cb_data.calc_diff(); + auto v_diff_filtered = cb_ctx.calc_diff(); // save & concat the filtered v_diff to ctx_train ctx_train.concat_diff_tmp(v_diff_filtered); // reset for next iteration - cb_data.reset(); + cb_ctx.reset(); } // done with the model, we can now free it to make gain some memory diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6d629fe4ef189..25b18f60e0bf3 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -12,7 +12,7 @@ * This the arbitrary data which will be passed to each callback. * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. 
*/ -struct callback_data { +struct callback_context { std::vector data; }; @@ -27,7 +27,7 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { +static void ggml_print_tensor(const uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { @@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; float v; if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); } else if (type == GGML_TYPE_F32) { - v = *(float *) &data[i]; + v = *(const float *) &data[i]; } else if (type == GGML_TYPE_I32) { - v = (float) *(int32_t *) &data[i]; + v = (float) *(const int32_t *) &data[i]; } else if (type == GGML_TYPE_I16) { - v = (float) *(int16_t *) &data[i]; + v = (float) *(const int16_t *) &data[i]; } else if (type == GGML_TYPE_I8) { - v = (float) *(int8_t *) &data[i]; + v = (float) *(const int8_t *) &data[i]; } else { GGML_ABORT("fatal error"); } @@ -88,7 +88,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne * @return true to receive data or continue the graph, false otherwise */ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; + auto * cb_ctx = (callback_context *) user_data; const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; @@ -114,12 +114,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (!is_host) { auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + cb_ctx->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_ctx->data.data(), 0, n_bytes); } if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); + uint8_t * data = is_host ? 
(uint8_t *) t->data : cb_ctx->data.data(); ggml_print_tensor(data, t->type, t->ne, t->nb, 3); } @@ -140,7 +140,7 @@ static bool run(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - callback_data cb_data; + callback_context cb_ctx; gpt_params params; @@ -156,7 +156,7 @@ int main(int argc, char ** argv) { // pass the callback to the backend scheduler // it will be executed for each node during the graph computation params.cb_eval = ggml_debug; - params.cb_eval_user_data = &cb_data; + params.cb_eval_ctx = &cb_ctx; params.warmup = false; // init diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 26528169978f8..5f2f6bfb63401 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -602,7 +602,7 @@ int main(int argc, char ** argv) { // pass the callback to the backend scheduler // it will be executed for each node during the graph computation params.cb_eval = ik_collect_imatrix; - params.cb_eval_user_data = NULL; + params.cb_eval_ctx = NULL; params.warmup = false; // init diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index e497b6d02388a..b1f54f7e559b8 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -104,7 +104,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback cb, void * cb_ctx); // Create a backend buffer from an existing pointer GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); @@ -177,7 +177,7 @@ extern "C" { // when ask == false, the scheduler is passing the node tensor to the user for observation // if the user returns false, the scheduler will cancel the graph compute // - typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); + typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * cb_ctx); // Initialize a backend scheduler GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); @@ -208,7 +208,7 @@ extern "C" { GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); // Set a callback to be called for each resulting node during graph compute - GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback cb, void * cb_ctx); // // Utils @@ -225,10 +225,10 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * cb_ctx); // 
Compare the output of two backends - GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback cb_eval, void * cb_eval_ctx); // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index d483cf1ac40c6..19266d3219ed3 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -40,7 +40,7 @@ extern "C" { // user-code should use only these functions // -GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); +GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback cb, void * cb_ctx); GGML_API ggml_backend_t ggml_backend_metal_init(void); @@ -50,7 +50,7 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); -GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); +GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback cb, void * cb_ctx); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a413df35750b1..b05f01bea766f 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -620,7 +620,7 @@ extern "C" { // Abort callback // If not NULL, called before ggml computation // If it returns true, the computation is aborted - typedef bool (*ggml_abort_callback)(void * data); + typedef bool (*ggml_abort_callback)(void * cb_ctx); // Scheduling priorities enum ggml_sched_priority { @@ -655,8 +655,8 @@ extern "C" { struct ggml_threadpool * threadpool; // abort ggml_graph_compute when true - ggml_abort_callback abort_callback; - void * abort_callback_data; + ggml_abort_callback cb_abort; + void * cb_abort_ctx; }; // scratch buffer @@ -2143,8 +2143,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; - typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); - typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); + typedef void (*ggml_opt_callback)(void * cb_ctx, int accum_step, float * sched, bool * cancel); + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * cb_ctx); // optimization parameters // @@ -2281,8 +2281,8 @@ extern "C" { struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data); + ggml_opt_callback cb_opt, + void * cb_opt_ctx); // // tensor flags diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index b5d9301a78762..b4d2a5b8d056c 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -728,8 +728,8 @@ struct ggml_backend_cpu_context { void * work_data; size_t work_size; - ggml_abort_callback abort_callback; - void * abort_callback_data; + ggml_abort_callback cb_abort; + void * cb_abort_ctx; }; GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { @@ -772,8 +772,8 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg } } - 
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; - cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; + cpu_plan->cplan.cb_abort = cpu_ctx->cb_abort; + cpu_plan->cplan.cb_abort_ctx = cpu_ctx->cb_abort_ctx; return cpu_plan; } @@ -811,8 +811,8 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t } cplan.work_data = cpu_ctx->work_data; - cplan.abort_callback = cpu_ctx->abort_callback; - cplan.abort_callback_data = cpu_ctx->abort_callback_data; + cplan.cb_abort = cpu_ctx->cb_abort; + cplan.cb_abort_ctx = cpu_ctx->cb_abort_ctx; return ggml_graph_compute(cgraph, &cplan); } @@ -878,12 +878,12 @@ ggml_backend_t ggml_backend_cpu_init(void) { return NULL; } - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->threadpool = NULL; - ctx->work_data = NULL; - ctx->work_size = 0; - ctx->abort_callback = NULL; - ctx->abort_callback_data = NULL; + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->cb_abort = NULL; + ctx->cb_abort_ctx = NULL; ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); if (cpu_backend == NULL) { @@ -922,12 +922,12 @@ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool ctx->threadpool = threadpool; } -void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { +void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback cb, void * cb_ctx) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; + ctx->cb_abort = cb; + ctx->cb_abort_ctx = cb_ctx; } GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { @@ -1093,8 +1093,8 @@ struct ggml_backend_sched { struct ggml_context * ctx; - ggml_backend_sched_eval_callback callback_eval; - void * callback_eval_user_data; + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_ctx; char * context_buffer; size_t context_buffer_size; @@ -1814,7 +1814,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } - if (!sched->callback_eval) { + if (!sched->cb_eval) { enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { return ec; @@ -1825,14 +1825,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_tensor * t = split->graph.nodes[j0]; // check if the user needs data from this node - bool need = sched->callback_eval(t, true, sched->callback_eval_user_data); + bool need = sched->cb_eval(t, true, sched->cb_eval_ctx); int j1 = j0; // determine the range [j0, j1] of nodes that can be computed together while (!need && j1 < split->graph.n_nodes - 1) { t = split->graph.nodes[++j1]; - need = sched->callback_eval(t, true, sched->callback_eval_user_data); + need = sched->cb_eval(t, true, sched->cb_eval_ctx); } struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); @@ -1845,7 +1845,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // TODO: pass backend to the callback, then the user can decide if they want to synchronize ggml_backend_synchronize(split_backend); - if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { + if (need && !sched->cb_eval(t, false, 
sched->cb_eval_ctx)) { break; } @@ -2012,9 +2012,9 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { } } -void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { - sched->callback_eval = callback; - sched->callback_eval_user_data = user_data; +void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback cb, void * cb_ctx) { + sched->cb_eval = cb; + sched->cb_eval_ctx = cb_ctx; } int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { @@ -2229,7 +2229,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback cb_eval, void * cb_eval_ctx) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); if (copy.buffer == NULL) { return false; @@ -2258,7 +2258,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t } // compare results, calculate rms etc - if (!callback(i, t1, t2, user_data)) { + if (!cb_eval(i, t1, t2, cb_eval_ctx)) { break; } } diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f87181d19332e..41d9790f39222 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -236,8 +236,8 @@ bool should_capture_next_compute; // abort ggml_metal_graph_compute if callback returns true - ggml_abort_callback abort_callback; - void * abort_callback_data; + ggml_abort_callback cb_abort; + void * cb_abort_ctx; }; // MSL code @@ -251,32 +251,32 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { +static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * cb_ctx) { fprintf(stderr, "%s", msg); UNUSED(level); - UNUSED(user_data); + UNUSED(cb_ctx); } -ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback; -void * ggml_metal_log_user_data = NULL; +static ggml_log_callback ggml_metal_log_cb = ggml_metal_default_log_callback; +static void * ggml_metal_log_cb_ctx = NULL; GGML_ATTRIBUTE_FORMAT(2, 3) static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ - if (ggml_metal_log_callback != NULL) { + if (ggml_metal_log_cb != NULL) { va_list args; va_start(args, format); char buffer[128]; int len = vsnprintf(buffer, 128, format, args); if (len < 128) { - ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); + ggml_metal_log_cb(level, buffer, ggml_metal_log_cb_ctx); } else { char* buffer2 = malloc(len+1); va_end(args); va_start(args, format); vsnprintf(buffer2, len+1, format, args); buffer2[len] = 0; - ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); + ggml_metal_log_cb(level, buffer2, ggml_metal_log_cb_ctx); free(buffer2); } va_end(args); @@ -910,7 +910,7 @@ static enum ggml_status ggml_metal_graph_compute( // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort - if (cb_idx < 2 || ctx->abort_callback == NULL) { + if (cb_idx < 2 || ctx->cb_abort == NULL) { [command_buffer enqueue]; } } @@ -3026,7 +3026,7 @@ static enum 
ggml_status ggml_metal_graph_compute( [encoder endEncoding]; - if (cb_idx < 2 || ctx->abort_callback == NULL) { + if (cb_idx < 2 || ctx->cb_abort == NULL) { [command_buffer commit]; } }); @@ -3058,7 +3058,7 @@ static enum ggml_status ggml_metal_graph_compute( continue; } - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + if (ctx->cb_abort && ctx->cb_abort(ctx->cb_abort_ctx)) { GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); return GGML_STATUS_ABORTED; } @@ -3225,19 +3225,15 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = nil; - - if (size_aligned > 0) { - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; - } + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; } - if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { + if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); @@ -3314,17 +3310,14 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].metal = nil; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; - if (size_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; - } + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; } ggml_backend_metal_log_allocated_size(device, size_aligned); @@ -3340,17 +3333,14 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, for (size_t i = 0; i < size; i += size_step) { const size_t size_step_aligned = (i + size_view <= size) ? 
size_view : (size_aligned - i); - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - ctx->buffers[ctx->n_buffers].metal = nil; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; - if (size_step_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; - } + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; } ggml_backend_metal_log_allocated_size(device, size_step_aligned); @@ -3427,9 +3417,9 @@ GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, g /* .event_synchronize = */ NULL, }; -void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { - ggml_metal_log_callback = log_callback; - ggml_metal_log_user_data = user_data; +void ggml_backend_metal_log_set_callback(ggml_log_callback cb, void * cb_ctx) { + ggml_metal_log_cb = cb; + ggml_metal_log_cb_ctx = cb_ctx; } static ggml_guid_t ggml_backend_metal_guid(void) { @@ -3467,13 +3457,13 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } -void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { +void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback cb, void * cb_ctx) { GGML_ASSERT(ggml_backend_is_metal(backend)); struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = user_data; + ctx->cb_abort = cb; + ctx->cb_abort_ctx = cb_ctx; } bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { @@ -3491,11 +3481,11 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { ctx->should_capture_next_compute = true; } -GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning +GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * cb_ctx); // silence warning -GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { +GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * cb_ctx) { return ggml_backend_metal_init(); GGML_UNUSED(params); - GGML_UNUSED(user_data); + GGML_UNUSED(cb_ctx); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 490c8d602853b..bdf412f632a45 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -184,7 +184,7 @@ struct backtrace_state { void ** end; }; -static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) { +static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context * context, void * arg) { struct backtrace_state * state = (struct backtrace_state *)arg; uintptr_t pc = 
_Unwind_GetIP(context); if (pc) { @@ -19951,7 +19951,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + if (state->ith == 0 && cplan->cb_abort && cplan->cb_abort(cplan->cb_abort_ctx)) { state->threadpool->ec = GGML_STATUS_ABORTED; } @@ -21011,8 +21011,8 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data) { + ggml_opt_callback cb_opt, + void * cb_opt_ctx) { GGML_ASSERT(ggml_is_scalar(f)); GGML_ASSERT(f->type == GGML_TYPE_F32); @@ -21066,8 +21066,8 @@ static enum ggml_opt_result ggml_opt_adam( float fx = 0; ggml_set_zero(opt->adam.g); for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { - callback(callback_data, accum_step, &sched, &cancel); + if (cb_opt) { + cb_opt(cb_opt_ctx, accum_step, &sched, &cancel); if (cancel) { return GGML_OPT_RESULT_CANCEL; } @@ -21157,8 +21157,8 @@ static enum ggml_opt_result ggml_opt_adam( fx = 0; ggml_set_zero(opt->adam.g); for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { - callback(callback_data, accum_step, &sched, &cancel); + if (cb_opt) { + cb_opt(cb_opt_ctx, accum_step, &sched, &cancel); if (cancel) { return GGML_OPT_RESULT_CANCEL;; } @@ -21254,8 +21254,8 @@ static enum ggml_opt_result linesearch_backtracking( const int np, struct ggml_tensor * ps[], bool * cancel, - ggml_opt_callback callback, - void * callback_data) { + ggml_opt_callback cb_opt, + void * cb_opt_ctx) { int count = 0; float width = 0.0f; @@ -21297,10 +21297,10 @@ static enum ggml_opt_result linesearch_backtracking( *fx = 0; memset(g, 0, sizeof(float)*nx); for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { + if (cb_opt) { // LBFG-S does not support learning rate -> ignore learning schedule float sched = 0; - callback(callback_data, accum_step, &sched, cancel); + cb_opt(cb_opt_ctx, accum_step, &sched, cancel); if (*cancel) { return GGML_OPT_RESULT_CANCEL; } @@ -21370,8 +21370,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data) { + ggml_opt_callback cb_opt, + void * cb_opt_ctx) { if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { @@ -21440,10 +21440,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( fx = 0; memset(g, 0, sizeof(float)*nx); for (int accum_step = 0; accum_step < n_accum; ++accum_step) { - if (callback) { + if (cb_opt) { // LBFG-S does not support learning rate -> ignore learning schedule float sched = 0; - callback(callback_data, accum_step, &sched, &cancel); + cb_opt(cb_opt_ctx, accum_step, &sched, &cancel); if (cancel) { return GGML_OPT_RESULT_CANCEL; } @@ -21516,7 +21516,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // to determine if the optimization should be cancelled // this is a simple change, but not doing this atm, since I don't have a nice // way to test and don't want to break something with so many changes lined up - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, 
&cancel, cb_opt, cb_opt_ctx); if (cancel) { return GGML_OPT_RESULT_CANCEL; } @@ -21834,8 +21834,8 @@ enum ggml_opt_result ggml_opt_resume_g( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, - ggml_opt_callback callback, - void * callback_data) { + ggml_opt_callback cb_opt, + void * cb_opt_ctx) { GGML_ASSERT(f->grad && "ggml_set_param must be called for at least one ancestor"); @@ -21845,11 +21845,11 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_TYPE_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, cb_opt, cb_opt_ctx); } break; case GGML_OPT_TYPE_LBFGS: { - result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, cb_opt, cb_opt_ctx); } break; } diff --git a/include/llama.h b/include/llama.h index cfc8d85dc0474..83820769c31d0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -221,7 +221,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void * user_data); + typedef bool (*llama_progress_callback)(float progress, void * cb_ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -290,12 +290,10 @@ extern "C" { const char * rpc_servers; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. - // If the provided progress_callback returns true, model loading continues. + // If the provided cb_progress returns true, model loading continues. // If it returns false, model loading is immediately aborted. - llama_progress_callback progress_callback; - - // context pointer passed to the progress callback - void * progress_callback_user_data; + llama_progress_callback cb_progress; + void * cb_progress_ctx; // override key-value pairs of the model meta data const struct llama_model_kv_override * kv_overrides; @@ -331,25 +329,24 @@ extern "C" { uint32_t yarn_orig_ctx; // YaRN original context size float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default) - ggml_backend_sched_eval_callback cb_eval; - void * cb_eval_user_data; - enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] - // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. - // TODO: move at the end of the struct - bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - bool embeddings; // if true, extract embeddings (together with logits) - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_ctx; // Abort callback // if it returns true, execution of llama_decode() will be aborted // currently works only with CPU execution - ggml_abort_callback abort_callback; - void * abort_callback_data; + ggml_abort_callback cb_abort; + void * cb_abort_ctx; + + // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. 
+ bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) + bool embeddings; // if true, extract embeddings (together with logits) + bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU + bool flash_attn; // enable flash attention [EXPERIMENTAL] + bool no_perf; // disable performance timings }; // model quantization parameters @@ -373,7 +370,7 @@ extern "C" { } llama_logit_bias; typedef struct llama_sampler_chain_params { - bool no_perf; // whether to measure performance timings + bool no_perf; // disable performance timings } llama_sampler_chain_params; // used in chat template @@ -833,7 +830,7 @@ extern "C" { LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); // Set abort callback - LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); + LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback cb, void * cb_ctx); // Wait until all computations are finished // This is automatically done when using one of the functions below to obtain the computation results @@ -1168,7 +1165,7 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_set(ggml_log_callback cb, void * cb_ctx); // // Performance utils diff --git a/src/llama.cpp b/src/llama.cpp index c917d1c7b5781..af6e4e539a486 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2183,17 +2183,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer struct llama_state { llama_state() { #ifdef GGML_USE_METAL - ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data); + ggml_backend_metal_log_set_callback(cb_log, cb_log_ctx); #elif defined(GGML_USE_CUDA) - ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data); + ggml_backend_cuda_log_set_callback(cb_log, cb_log_ctx); #elif defined(GGML_USE_CANN) - ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data); + ggml_backend_cann_log_set_callback(cb_log, cb_log_ctx); #endif } // We save the log callback globally - ggml_log_callback log_callback = llama_log_callback_default; - void * log_callback_user_data = nullptr; + ggml_log_callback cb_log = llama_log_callback_default; + void * cb_log_ctx = nullptr; }; static llama_state g_state; @@ -2491,7 +2491,7 @@ struct llama_cparams { enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; - void * cb_eval_user_data; + void * cb_eval_ctx; }; // TODO: separate into "llama_layer_enc" and "llama_layer_dec" @@ -3263,8 +3263,8 @@ struct llama_context { std::vector buf_compute_meta; ggml_backend_sched_t sched = nullptr; - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; + ggml_abort_callback cb_abort = nullptr; + void * cb_abort_ctx = nullptr; // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] @@ -4901,13 +4901,13 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; - // Returns false if cancelled by progress_callback + // Returns false if cancelled by cb_progress bool load_all_data( struct ggml_context * ctx, llama_buf_map & bufs_mmap, llama_mlocks * lmlocks, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { + 
llama_progress_callback cb_progress, + void * cb_progress_ctx) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -4958,8 +4958,8 @@ struct llama_model_loader { continue; } - if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + if (cb_progress) { + if (!cb_progress((float) size_done / size_data, cb_progress_ctx)) { return false; } } @@ -5081,10 +5081,10 @@ struct llama_model_loader { } } } - if (progress_callback) { + if (cb_progress) { // Even though the model is done loading, we still honor // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); + return cb_progress(1.0f, cb_progress_ctx); } } @@ -6651,7 +6651,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } } -// Returns false if cancelled by progress_callback +// Returns false if cancelled by cb_progress static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, @@ -6660,8 +6660,8 @@ static bool llm_load_tensors( int main_gpu, const float * tensor_split, bool use_mlock, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { + llama_progress_callback cb_progress, + void * cb_progress_ctx) { auto & hparams = model.hparams; model.split_mode = split_mode; @@ -8581,7 +8581,7 @@ static bool llm_load_tensors( for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { + if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, cb_progress, cb_progress_ctx)) { return false; } } @@ -8595,7 +8595,7 @@ static bool llm_load_tensors( return true; } -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_cb_progress static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { model.t_start_us = ggml_time_us(); @@ -8651,7 +8651,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, - params.progress_callback, params.progress_callback_user_data + params.cb_progress, params.cb_progress_ctx )) { return -2; } @@ -16046,9 +16046,9 @@ static void llama_graph_compute( #endif if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); - ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); - ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); + ggml_backend_cpu_set_n_threads (lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool (lctx.backend_cpu, threadpool); + ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.cb_abort, lctx.cb_abort_ctx); } #ifdef GGML_USE_BLAS if (lctx.backend_blas != nullptr) { @@ -16208,7 +16208,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_ctx); ggml_cgraph * gf = 
llama_build_graph(lctx, ubatch, false); @@ -16432,7 +16432,7 @@ static int llama_encode_internal( GGML_ASSERT(n_threads > 0); ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_ctx); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); @@ -17907,8 +17907,8 @@ struct llama_model_params llama_model_default_params() { /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.rpc_servers =*/ nullptr, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, + /*.cb_progress =*/ nullptr, + /*.cb_progress_ctx =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, /*.use_mmap =*/ true, @@ -17943,17 +17943,17 @@ struct llama_context_params llama_context_default_params() { /*.yarn_beta_slow =*/ 1.0f, /*.yarn_orig_ctx =*/ 0, /*.defrag_thold =*/ -1.0f, - /*.cb_eval =*/ nullptr, - /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, + /*.cb_eval =*/ nullptr, + /*.cb_eval_ctx =*/ nullptr, + /*.cb_abort =*/ nullptr, + /*.cb_abort_ctx =*/ nullptr, /*.logits_all =*/ false, /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, }; return result; @@ -18067,9 +18067,9 @@ struct llama_model * llama_load_model_from_file( llama_model * model = new llama_model; unsigned cur_percentage = 0; - if (params.progress_callback == NULL) { - params.progress_callback_user_data = &cur_percentage; - params.progress_callback = [](float progress, void * ctx) { + if (params.cb_progress == NULL) { + params.cb_progress_ctx = &cur_percentage; + params.cb_progress = [](float progress, void * ctx) { unsigned * cur_percentage_p = (unsigned *) ctx; unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { @@ -18189,8 +18189,8 @@ struct llama_context * llama_new_context_with_model( hparams.n_ctx_orig_yarn != 0 ? 
hparams.n_ctx_orig_yarn : hparams.n_ctx_train; - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_ctx = params.cb_eval_ctx; auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { @@ -18228,8 +18228,8 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - ctx->abort_callback = params.abort_callback; - ctx->abort_callback_data = params.abort_callback_data; + ctx->cb_abort = params.cb_abort; + ctx->cb_abort_ctx = params.cb_abort_ctx; ctx->logits_all = params.logits_all; @@ -19971,9 +19971,9 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) { return ctx->cparams.n_threads_batch; } -void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; +void llama_set_abort_callback(struct llama_context * ctx, bool (*cb)(void * data), void * cb_ctx) { + ctx->cb_abort = cb; + ctx->cb_abort_ctx = cb_ctx; } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { @@ -20761,15 +20761,15 @@ const std::vector> & llama_internal return ctx->model.tensors_by_name; } -void llama_log_set(ggml_log_callback log_callback, void * user_data) { - g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; - g_state.log_callback_user_data = user_data; +void llama_log_set(ggml_log_callback cb, void * cb_ctx) { + g_state.cb_log = cb ? cb : llama_log_callback_default; + g_state.cb_log_ctx = cb_ctx; #ifdef GGML_USE_METAL - ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); + ggml_backend_metal_log_set_callback(g_state.cb_log, g_state.cb_log_ctx); #elif defined(GGML_USE_CUDA) - ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); + ggml_backend_cuda_log_set_callback(g_state.cb_log, g_state.cb_log_ctx); #elif defined(GGML_USE_CANN) - ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); + ggml_backend_cann_log_set_callback(g_state.cb_log, g_state.cb_log_ctx); #endif } @@ -20779,12 +20779,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l char buffer[128]; int len = vsnprintf(buffer, 128, format, args); if (len < 128) { - g_state.log_callback(level, buffer, g_state.log_callback_user_data); + g_state.cb_log(level, buffer, g_state.cb_log_ctx); } else { char * buffer2 = new char[len + 1]; vsnprintf(buffer2, len + 1, format, args_copy); buffer2[len] = 0; - g_state.log_callback(level, buffer2, g_state.log_callback_user_data); + g_state.cb_log(level, buffer2, g_state.cb_log_ctx); delete[] buffer2; } va_end(args_copy); @@ -20797,9 +20797,9 @@ void llama_log_internal(ggml_log_level level, const char * format, ...) 
{ va_end(args); } -void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) { +void llama_log_callback_default(ggml_log_level level, const char * text, void * cb_ctx) { (void) level; - (void) user_data; + (void) cb_ctx; fputs(text, stderr); fflush(stderr); } diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 858535c3c4020..f44928b24c5db 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -17,7 +17,7 @@ int main(int argc, char *argv[] ) { llama_backend_init(); auto params = llama_model_params{}; params.use_mmap = false; - params.progress_callback = [](float progress, void * ctx){ + params.cb_progress = [](float progress, void * ctx){ (void) ctx; return progress > 0.50; };
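
Usage sketch for the renamed parameters (illustrative only, not part of the diff above): the snippet below wires an eval callback and an abort callback into a llama_context using the post-rename field names cb_eval/cb_eval_ctx and cb_abort/cb_abort_ctx. The model path, the counter passed as cb_eval_ctx, and the stop flag passed as cb_abort_ctx are placeholders; the types, signatures, and function names follow llama.h and ggml-backend.h as shown in the diff.

#include "llama.h"

#include <cstdio>

// ggml_backend_sched_eval_callback: called with ask == true to decide whether the
// scheduler should provide this node's data, then again with ask == false for
// observation. Returning false from the observation call cancels the graph compute.
static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * cb_ctx) {
    int * n_seen = (int *) cb_ctx;
    if (ask) {
        return true; // request data for every node
    }
    fprintf(stderr, "node %4d: %s\n", (*n_seen)++, t->name);
    return true; // keep computing
}

// ggml_abort_callback: returning true aborts llama_decode()
// (per llama.h, this currently takes effect only for CPU execution)
static bool my_abort_cb(void * cb_ctx) {
    return *(const bool *) cb_ctx;
}

int main() {
    llama_backend_init();

    // placeholder model path
    llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
    if (model == NULL) {
        return 1;
    }

    int  n_seen = 0;
    bool stop   = false;

    llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval      = my_eval_cb;   // was cb_eval / cb_eval_user_data
    cparams.cb_eval_ctx  = &n_seen;
    cparams.cb_abort     = my_abort_cb;  // was abort_callback / abort_callback_data
    cparams.cb_abort_ctx = &stop;

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize and call llama_decode() as usual; cb_eval fires for each graph node ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

The same cb/cb_ctx spelling applies at the ggml level, e.g. ggml_backend_sched_set_eval_callback(sched, cb, cb_ctx) and ggml_backend_cpu_set_abort_callback(backend_cpu, cb, cb_ctx).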