From 471e7e1e594aab1ebf41b391cbb8fa618961de57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Sep 2024 20:50:23 +0300 Subject: [PATCH 1/7] llama : llama_perf + option to disable timings during decode ggml-ci --- common/common.cpp | 1 + common/common.h | 1 + common/sampling.cpp | 2 +- common/sampling.h | 1 + include/llama.h | 21 +++++++++++- src/llama.cpp | 84 ++++++++++++++++++++++++++++++--------------- 6 files changed, 81 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c5c4d7508f033..3203faddd634a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2810,6 +2810,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); diff --git a/common/common.h b/common/common.h index d7c08f20a124b..4e2924a56d94d 100644 --- a/common/common.h +++ b/common/common.h @@ -204,6 +204,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool no_perf = false; // no perf (TODO: add llama_arg) bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/common/sampling.cpp b/common/sampling.cpp index 7806b77e06a9f..c66a4582be237 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -139,7 +139,7 @@ std::string gpt_sampler_params::print() const { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - lparams.no_perf = false; // TODO: control via params + lparams.no_perf = params.no_perf; auto * result = new gpt_sampler { /* .params = */ params, diff --git a/common/sampling.h b/common/sampling.h index 654e0c513904d..67ad0add41d90 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -39,6 +39,7 @@ struct gpt_sampler_params { float mirostat_eta = 0.10f; // learning rate bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; + bool no_perf = false; // disable performance metrics std::vector samplers = { GPT_SAMPLER_TYPE_TOP_K, diff --git a/include/llama.h b/include/llama.h index 6334fc30d413c..e21a62e260322 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,7 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - //bool no_perf; // whether to measure performance timings, TODO: implement + bool no_perf; // whether to measure performance timings // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -1168,11 +1168,30 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. 
// + // performance timing information + struct llama_perf_data { + // llama_context + double t_start_ms; + double t_load_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_p_eval; + int32_t n_eval; + + // llama_sampler_chain + double t_sample_ms; + + int32_t n_sample; + }; + enum llama_perf_type { LLAMA_PERF_TYPE_CONTEXT = 0, LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, }; + LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); diff --git a/src/llama.cpp b/src/llama.cpp index f590bcd3b9047..c68f5e826ef76 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2482,6 +2482,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool no_perf; enum llama_pooling_type pooling_type; @@ -6657,8 +6658,6 @@ static bool llm_load_tensors( bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - auto & hparams = model.hparams; model.split_mode = split_mode; @@ -8589,14 +8588,13 @@ static bool llm_load_tensors( } } - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { + model.t_start_us = ggml_time_us(); + try { llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); @@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam return -1; } + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; + return 0; } @@ -17939,6 +17941,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -18149,6 +18152,7 @@ struct llama_context * llama_new_context_with_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? 
hparams.n_ctx_train : params.n_ctx; @@ -20067,10 +20071,14 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_eval++; } else if (ctx->n_queued_tokens > 1) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_p_eval += ctx->n_queued_tokens; } @@ -20677,39 +20685,61 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { +llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) { + llama_perf_data data = {}; + + if (ctx == nullptr) { + return data; + } + switch (type) { case LLAMA_PERF_TYPE_CONTEXT: { const auto * p = (const struct llama_context *) ctx; - const double t_start_ms = 1e-3 * p->t_start_us; - const double t_end_ms = 1.00 * ggml_time_ms(); - const double t_load_ms = 1e-3 * p->t_load_us; - const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; - const double t_eval_ms = 1e-3 * p->t_eval_us; + data.t_start_ms = 1e-3 * p->t_start_us; + data.t_load_ms = 1e-3 * p->t_load_us;; + data.t_p_eval_ms = 1e-3 * p->t_p_eval_us; + data.t_eval_ms = 1e-3 * p->t_eval_us; + data.n_p_eval = std::max(1, p->n_p_eval); + data.n_eval = std::max(1, p->n_eval); + } break; + case LLAMA_PERF_TYPE_SAMPLER_CHAIN: + { + const auto * smpl = (const struct llama_sampler *) ctx; + const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - const int32_t n_p_eval = std::max(0, p->n_p_eval); - const int32_t n_eval = std::max(1, p->n_eval); + data.t_sample_ms = 1e-3 * p->t_sample_us; + data.n_sample = std::max(0, p->n_sample); + } break; + default: + GGML_ABORT("invalid perf type"); + } + + return data; +} - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms); +void llama_perf_print(const void * ctx, enum llama_perf_type type) { + switch (type) { + case LLAMA_PERF_TYPE_CONTEXT: + { + const auto data = llama_perf_get(ctx, type); + + const double t_end_ms = 1e-3 * ggml_time_us(); + + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } break; case LLAMA_PERF_TYPE_SAMPLER_CHAIN: { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - - const double t_sampler_ms = 1e-3 * p->t_sample_us; - - const int32_t n_sampler = std::max(0, p->n_sample); + const auto data = llama_perf_get(ctx, type); LLAMA_LOG_INFO("%s: 
sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } break; default: GGML_ABORT("invalid perf type"); @@ -20729,7 +20759,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) { case LLAMA_PERF_TYPE_SAMPLER_CHAIN: { auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; + auto * p = (struct llama_sampler_chain *) smpl->ctx; p->t_sample_us = p->n_sample = 0; } break; From ade52b6cc6cbb32cd6820bf37e0301cefd6e155c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Sep 2024 08:57:56 +0300 Subject: [PATCH 2/7] common : add llama_arg --- common/common.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 3203faddd634a..c2c79433dd8e6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -995,6 +995,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"--no-perf"}, + format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](gpt_params & params) { + params.no_perf = true; + params.sparams.no_perf = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(llama_arg( {"-p", "--prompt"}, "PROMPT", ex == LLAMA_EXAMPLE_MAIN From fd46535314b8b1b49daa812320cb1489dbed3464 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Sep 2024 13:03:54 +0300 Subject: [PATCH 3/7] Update src/llama.cpp Co-authored-by: Xuan Son Nguyen --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index b640951294b9b..22f7cbbf0736d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20700,7 +20700,7 @@ llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) { const auto * p = (const struct llama_context *) ctx; data.t_start_ms = 1e-3 * p->t_start_us; - data.t_load_ms = 1e-3 * p->t_load_us;; + data.t_load_ms = 1e-3 * p->t_load_us; data.t_p_eval_ms = 1e-3 * p->t_p_eval_us; data.t_eval_ms = 1e-3 * p->t_eval_us; data.n_p_eval = std::max(1, p->n_p_eval); From f42de2426e63a183c5003846fc297992f0759b57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Sep 2024 09:56:41 +0300 Subject: [PATCH 4/7] perf : separate functions in the API ggml-ci --- common/common.cpp | 2 +- common/sampling.cpp | 4 +- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 4 +- examples/batched/batched.cpp | 4 +- examples/embedding/embedding.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +- examples/llava/minicpmv-cli.cpp | 2 +- examples/lookup/lookup.cpp | 3 +- examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/simple/simple.cpp | 4 +- examples/speculative/speculative.cpp | 2 +- include/llama.h | 20 ++-- src/llama.cpp | 107 +++++++++------------- 20 files changed, 76 insertions(+), 98 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4982150361569..b294dd6bf2ac5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -828,7 +828,7 @@ struct 
llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_reset_context(lctx); } iparams.model = model; diff --git a/common/sampling.cpp b/common/sampling.cpp index ee290f82a53b6..21403e213fb01 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_print_sampler(gsmpl->chain); } if (ctx) { - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a91e7f4bdea08..931a05286c64e 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -209,7 +209,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 9f7c49492dda1..a6c1b64e947a5 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) -llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) +llama_perf_print_sampler(smpl) +llama_perf_print_context(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 5d32153fe1a9a..8dc35e73d649e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_sampler(smpl); + llama_perf_print_context(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index da7c7925362af..e7134608a2a7c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index bc72031434103..6f0e59dd853ca 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 032a901365640..e9eda9575c098 100644 --- a/examples/imatrix/imatrix.cpp +++ 
b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d7db5af722a60..451b8d9ffaba8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e9108a9bdbd4b..3419da4d43222 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3475bbce58562..56135cf2f898e 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index fff44a499e4bc..be6f8d7d7b6e9 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -240,8 +240,7 @@ int main(int argc, char ** argv){ LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index bc6301311d941..8277487814156 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d3d5ab46fa0db..e53513b41ce45 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index c7d617988b2ed..cfb3b5e056ec4 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2049,7 
+2049,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 7a360b731916a..ef20aa86bf162 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3fdc0439445e8..dabd619ead4ad 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_sampler(smpl); + llama_perf_print_context(ctx); fprintf(stderr, "\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 214e4932ba2ca..f82c21ce85297 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index cc488f5a601b0..e63daea9b0648 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1169,9 +1169,7 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. 
// - // performance timing information - struct llama_perf_data { - // llama_context + struct llama_perf_data_context { double t_start_ms; double t_load_ms; double t_p_eval_ms; @@ -1179,22 +1177,22 @@ extern "C" { int32_t n_p_eval; int32_t n_eval; + }; - // llama_sampler_chain + struct llama_perf_data_sampler { double t_sample_ms; int32_t n_sample; }; - enum llama_perf_type { - LLAMA_PERF_TYPE_CONTEXT = 0, - LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, - }; + LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx); + LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain); - LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_print_context(const struct llama_context * ctx); + LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain); - LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); - LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_reset_context(struct llama_context * ctx); + LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 22f7cbbf0736d..085a8cd3b909c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20687,87 +20687,68 @@ const char * llama_print_system_info(void) { return s.c_str(); } -llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) { - llama_perf_data data = {}; +struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_data_context data = {}; if (ctx == nullptr) { return data; } - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto * p = (const struct llama_context *) ctx; - - data.t_start_ms = 1e-3 * p->t_start_us; - data.t_load_ms = 1e-3 * p->t_load_us; - data.t_p_eval_ms = 1e-3 * p->t_p_eval_us; - data.t_eval_ms = 1e-3 * p->t_eval_us; - data.n_p_eval = std::max(1, p->n_p_eval); - data.n_eval = std::max(1, p->n_eval); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; + data.t_start_ms = 1e-3 * ctx->t_start_us; + data.t_load_ms = 1e-3 * ctx->t_load_us; + data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; + data.t_eval_ms = 1e-3 * ctx->t_eval_us; + data.n_p_eval = std::max(1, ctx->n_p_eval); + data.n_eval = std::max(1, ctx->n_eval); - data.t_sample_ms = 1e-3 * p->t_sample_us; - data.n_sample = std::max(0, p->n_sample); - } break; - default: - GGML_ABORT("invalid perf type"); + return data; +} + +struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_data_sampler data = {}; + + if (chain == nullptr) { + return data; } + const auto * p = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * p->t_sample_us; + data.n_sample = std::max(0, p->n_sample); + return data; } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto data = llama_perf_get(ctx, type); +void llama_perf_print_context(const struct llama_context * ctx) { + const auto data = llama_perf_context(ctx); - const double t_end_ms = 1e-3 * ggml_time_us(); + const double t_end_ms = 1e-3 * ggml_time_us(); - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); - 
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto data = llama_perf_get(ctx, type); + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +} - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); - } break; - default: - GGML_ABORT("invalid perf type"); - } +void llama_perf_print_sampler(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } -void llama_perf_reset(void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - auto * p = (struct llama_context *) ctx; +void llama_perf_reset_context(struct llama_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; +} - p->t_start_us = ggml_time_us(); - p->t_eval_us = p->n_eval = 0; - p->t_p_eval_us = p->n_p_eval = 0; - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; +void llama_perf_reset_sampler(struct llama_sampler * chain) { + auto * p = (struct llama_sampler_chain *) chain->ctx; - p->t_sample_us = p->n_sample = 0; - } break; - default: - GGML_ABORT("invalid perf type"); - } + p->t_sample_us = p->n_sample = 0; } void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { From 7362f288337f2b1c6e8ecc1c10e7349a51fc7aab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 09:19:41 +0300 Subject: [PATCH 5/7] perf : safer pointer handling + naming update ggml-ci --- common/common.cpp | 2 +- common/sampling.cpp | 4 +-- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 4 +-- examples/batched/batched.cpp | 4 +-- examples/embedding/embedding.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +-- examples/llava/minicpmv-cli.cpp | 2 +- 
examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/simple/simple.cpp | 4 +-- examples/speculative/speculative.cpp | 2 +- include/llama.h | 18 ++++++------ src/llama-sampling.cpp | 36 +++++++++++++++++++++++ src/llama.cpp | 36 +++-------------------- 20 files changed, 71 insertions(+), 63 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b294dd6bf2ac5..11fc1e2233388 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -828,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset_context(lctx); + llama_perf_context_reset(lctx); } iparams.model = model; diff --git a/common/sampling.cpp b/common/sampling.cpp index 21403e213fb01..8e429cac59072 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print_sampler(gsmpl->chain); + llama_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 931a05286c64e..c2e854ea46a04 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -209,7 +209,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index a6c1b64e947a5..10f2e7fd117a1 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print_sampler(smpl) -llama_perf_print_context(context) +llama_perf_sampler_print(smpl) +llama_perf_context_print(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 8dc35e73d649e..f1df20c6ecf09 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print_sampler(smpl); - llama_perf_print_context(ctx); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e7134608a2a7c..5661cf0b78142 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6f0e59dd853ca..af389abe1aac1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ 
b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e9eda9575c098..73b54da7fd4a9 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 451b8d9ffaba8..2d90f65a07e52 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 3419da4d43222..12fe7345ff76c 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print_context(ctx_llava->ctx_llama); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print_context(ctx_llava->ctx_llama); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 56135cf2f898e..f36caa42e11a3 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print_context(ctx_llava->ctx_llama); + llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 8277487814156..758393c3d767a 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e53513b41ce45..52aa68bfcdf3c 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index cfb3b5e056ec4..1bdb6521c640f 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2049,7 +2049,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git 
a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index ef20aa86bf162..d08679edb3d14 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index dabd619ead4ad..0c923d4edf68f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print_sampler(smpl); - llama_perf_print_context(ctx); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f82c21ce85297..843579acd2222 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print_context(ctx_dft); + llama_perf_context_print(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index e63daea9b0648..d0e0b3a6105e4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1169,7 +1169,7 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // - struct llama_perf_data_context { + struct llama_perf_context_data { double t_start_ms; double t_load_ms; double t_p_eval_ms; @@ -1179,20 +1179,20 @@ extern "C" { int32_t n_eval; }; - struct llama_perf_data_sampler { + struct llama_perf_sampler_data { double t_sample_ms; int32_t n_sample; }; - LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx); - LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain); + LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); + LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); + LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); - LLAMA_API void llama_perf_print_context(const struct llama_context * ctx); - LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain); - - LLAMA_API void llama_perf_reset_context(struct llama_context * ctx); - LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain); + // NOTE: the following work only with samplers constructed via llama_sampler_chain_init + LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6f448b80c44c1..d17e4427ea75a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1599,3 +1599,39 @@ struct llama_sampler * llama_sampler_init_logit_bias( }, }; } + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || 
chain->iface != &llama_sampler_chain_i) { + // TODO: return empty data, or GGML_ABORT() ? + return data; + } + + const auto * p = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * p->t_sample_us; + data.n_sample = std::max(0, p->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + // TODO: return empty data, or GGML_ABORT() ? + return; + } + + auto * p = (struct llama_sampler_chain *) chain->ctx; + + p->t_sample_us = p->n_sample = 0; +} diff --git a/src/llama.cpp b/src/llama.cpp index 085a8cd3b909c..d3ba80ebab085 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20687,8 +20687,8 @@ const char * llama_print_system_info(void) { return s.c_str(); } -struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) { - struct llama_perf_data_context data = {}; +struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_context_data data = {}; if (ctx == nullptr) { return data; @@ -20704,22 +20704,7 @@ struct llama_perf_data_context llama_perf_context(const struct llama_context * c return data; } -struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) { - struct llama_perf_data_sampler data = {}; - - if (chain == nullptr) { - return data; - } - - const auto * p = (const struct llama_sampler_chain *) chain->ctx; - - data.t_sample_ms = 1e-3 * p->t_sample_us; - data.n_sample = std::max(0, p->n_sample); - - return data; -} - -void llama_perf_print_context(const struct llama_context * ctx) { +void llama_perf_context_print(const struct llama_context * ctx) { const auto data = llama_perf_context(ctx); const double t_end_ms = 1e-3 * ggml_time_us(); @@ -20732,25 +20717,12 @@ void llama_perf_print_context(const struct llama_context * ctx) { LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } -void llama_perf_print_sampler(const struct llama_sampler * chain) { - const auto data = llama_perf_sampler(chain); - - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); -} - -void llama_perf_reset_context(struct llama_context * ctx) { +void llama_perf_context_reset(struct llama_context * ctx) { ctx->t_start_us = ggml_time_us(); ctx->t_eval_us = ctx->n_eval = 0; ctx->t_p_eval_us = ctx->n_p_eval = 0; } -void llama_perf_reset_sampler(struct llama_sampler * chain) { - auto * p = (struct llama_sampler_chain *) chain->ctx; - - p->t_sample_us = p->n_sample = 0; -} - void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { fprintf(stream, "\n"); fprintf(stream, "###########\n"); From f35e9b87cd259cbe9fa618ac9125f040ebc3e4b8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 09:23:50 +0300 Subject: [PATCH 6/7] minor : better local var name --- src/llama-sampling.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 87d975f6083f4..ea83f6de2ead6 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1667,10 +1667,10 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c return data; } - const auto * p = (const struct llama_sampler_chain *) chain->ctx; + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; - data.t_sample_ms = 1e-3 * p->t_sample_us; - data.n_sample = std::max(0, p->n_sample); + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); return data; } @@ -1688,7 +1688,7 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) { return; } - auto * p = (struct llama_sampler_chain *) chain->ctx; + auto * ctx = (struct llama_sampler_chain *) chain->ctx; - p->t_sample_us = p->n_sample = 0; + ctx->t_sample_us = ctx->n_sample = 0; } From 444b757bce023e42cae4bd179302bc81132adbe5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 15:08:48 +0300 Subject: [PATCH 7/7] perf : abort on invalid sampler pointer ggml-ci --- src/llama-sampling.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ea83f6de2ead6..dd86f2661fc09 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1663,8 +1663,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c struct llama_perf_sampler_data data = {}; if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { - // TODO: return empty data, or GGML_ABORT() ? - return data; + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); } const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; @@ -1684,8 +1683,7 @@ void llama_perf_sampler_print(const struct llama_sampler * chain) { void llama_perf_sampler_reset(struct llama_sampler * chain) { if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { - // TODO: return empty data, or GGML_ABORT() ? - return; + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); } auto * ctx = (struct llama_sampler_chain *) chain->ctx;
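Usage sketch (illustrative only, not part of the patches above): the fragment below shows how the perf API, as it stands after PATCH 7/7, is meant to be driven from application code. Only the llama_perf_* calls, the llama_perf_*_data structs and the no_perf context parameter come from this series; the helper name report_perf and the surrounding setup are assumptions made for the example. It presumes ctx was created with no_perf = false (note that llama_context_default_params() now defaults no_perf to true, while the common-code gpt_params default keeps timings on unless --no-perf is passed) and that smpl was built with llama_sampler_chain_init(), since the sampler variants abort on any other sampler.

#include <cstdio>
#include "llama.h"

// Hypothetical helper for illustration; creation of ctx/smpl is omitted.
static void report_perf(struct llama_context * ctx, struct llama_sampler * smpl) {
    // programmatic access to the timings (added by this series)
    const struct llama_perf_context_data cdata = llama_perf_context(ctx);
    const struct llama_perf_sampler_data sdata = llama_perf_sampler(smpl); // aborts unless smpl is a sampler chain

    fprintf(stderr, "prompt eval: %d tokens in %.2f ms\n", cdata.n_p_eval, cdata.t_p_eval_ms);
    fprintf(stderr, "eval:        %d runs   in %.2f ms\n", cdata.n_eval,   cdata.t_eval_ms);
    fprintf(stderr, "sampling:    %d runs   in %.2f ms\n", sdata.n_sample, sdata.t_sample_ms);

    // or let libllama print its standard report
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);

    // reset the counters before the next measured run
    llama_perf_sampler_reset(smpl);
    llama_perf_context_reset(ctx);
}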