diff --git a/common/arg.cpp b/common/arg.cpp index ce6a276148c7d..a1cd5830f9303 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.prompt = value; } )); + add_opt(llama_arg( + {"--no-perf"}, + format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](gpt_params & params) { + params.no_perf = true; + params.sparams.no_perf = true; + } + ).set_env("LLAMA_ARG_NO_PERF")); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", diff --git a/common/common.cpp b/common/common.cpp index 30c6e84c795f7..f3304b3575026 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_reset(lctx); } iparams.model = model; @@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); diff --git a/common/common.h b/common/common.h index 23babdd09f1d1..e8025aeef57fa 100644 --- a/common/common.h +++ b/common/common.h @@ -124,6 +124,7 @@ struct gpt_sampler_params { float mirostat_eta = 0.10f; // learning rate bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; + bool no_perf = false; // disable performance metrics std::vector samplers = { GPT_SAMPLER_TYPE_TOP_K, @@ -246,6 +247,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool no_perf = false; // disable performance metrics bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/common/sampling.cpp b/common/sampling.cpp index 4498feb117b7e..c07b5e9409bef 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - lparams.no_perf = false; // TODO: control via params + lparams.no_perf = params.no_perf; auto * result = new gpt_sampler { /* .params = */ params, @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 89a4566c4b827..ec00fcf78d7ac 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -187,7 +187,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 9f7c49492dda1..10f2e7fd117a1 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) -llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) +llama_perf_sampler_print(smpl) +llama_perf_context_print(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 5d32153fe1a9a..f1df20c6ecf09 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index db00c636330fc..e94ae295558ba 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -306,7 +306,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index bc72031434103..af389abe1aac1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 032a901365640..73b54da7fd4a9 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d7db5af722a60..2d90f65a07e52 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e9108a9bdbd4b..12fe7345ff76c 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index afc74d27939cc..3ac455e69c800 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index fff44a499e4bc..be6f8d7d7b6e9 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -240,8 +240,7 @@ int main(int argc, char ** argv){ LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index bc6301311d941..758393c3d767a 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d3d5ab46fa0db..52aa68bfcdf3c 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 04df65b0a5892..29ff86bbc358e 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 7a360b731916a..d08679edb3d14 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3fdc0439445e8..0c923d4edf68f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 214e4932ba2ca..843579acd2222 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index 405af912c4686..c3408aaf1e4ee 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,7 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - //bool no_perf; // whether to measure performance timings, TODO: implement + bool no_perf; // whether to measure performance timings // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -1173,13 +1173,30 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // - enum llama_perf_type { - LLAMA_PERF_TYPE_CONTEXT = 0, - LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, + struct llama_perf_context_data { + double t_start_ms; + double t_load_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_p_eval; + int32_t n_eval; }; - LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); - LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + struct llama_perf_sampler_data { + double t_sample_ms; + + int32_t n_sample; + }; + + LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); + LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); + LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); + + // NOTE: the following work only with samplers constructed via llama_sampler_chain_init + LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index fd1b7f9196f37..dd86f2661fc09 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1656,3 +1656,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return LLAMA_DEFAULT_SEED; } + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + auto * ctx = (struct llama_sampler_chain *) chain->ctx; + + ctx->t_sample_us = ctx->n_sample = 0; +} diff --git a/src/llama.cpp b/src/llama.cpp index f1a95b3a3d09f..4801e4f2c0acf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2482,6 +2482,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool no_perf; enum llama_pooling_type pooling_type; @@ -6657,8 +6658,6 @@ static bool llm_load_tensors( bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - auto & hparams = model.hparams; model.split_mode = split_mode; @@ -8589,14 +8588,13 @@ static bool llm_load_tensors( } } - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { + model.t_start_us = ggml_time_us(); + try { llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); @@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam return -1; } + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; + return 0; } @@ -17945,6 +17947,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -18155,6 +18158,7 @@ struct llama_context * llama_new_context_with_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; @@ -20073,10 +20077,14 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_eval++; } else if (ctx->n_queued_tokens > 1) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_p_eval += ctx->n_queued_tokens; } @@ -20683,65 +20691,40 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto * p = (const struct llama_context *) ctx; - - const double t_start_ms = 1e-3 * p->t_start_us; - const double t_end_ms = 1.00 * ggml_time_ms(); - const double t_load_ms = 1e-3 * p->t_load_us; - const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; - const double t_eval_ms = 1e-3 * p->t_eval_us; - - const int32_t n_p_eval = std::max(0, p->n_p_eval); - const int32_t n_eval = std::max(1, p->n_eval); - - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms); - LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; +struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_context_data data = {}; - const double t_sampler_ms = 1e-3 * p->t_sample_us; + if (ctx == nullptr) { + return data; + } - const int32_t n_sampler = std::max(0, p->n_sample); + data.t_start_ms = 1e-3 * ctx->t_start_us; + data.t_load_ms = 1e-3 * ctx->t_load_us; + data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; + data.t_eval_ms = 1e-3 * ctx->t_eval_us; + data.n_p_eval = std::max(1, ctx->n_p_eval); + data.n_eval = std::max(1, ctx->n_eval); - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); - } break; - default: - GGML_ABORT("invalid perf type"); - } + return data; } -void llama_perf_reset(void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - auto * p = (struct llama_context *) ctx; +void llama_perf_context_print(const struct llama_context * ctx) { + const auto data = llama_perf_context(ctx); - p->t_start_us = ggml_time_us(); - p->t_eval_us = p->n_eval = 0; - p->t_p_eval_us = p->n_p_eval = 0; - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; + const double t_end_ms = 1e-3 * ggml_time_us(); - p->t_sample_us = p->n_sample = 0; - } break; - default: - GGML_ABORT("invalid perf type"); - } + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +} + +void llama_perf_context_reset(struct llama_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; } void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {