diff --git a/common/common.cpp b/common/common.cpp
index b294dd6bf2ac5..11fc1e2233388 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -828,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset_context(lctx);
+        llama_perf_context_reset(lctx);
     }
 
     iparams.model   = model;
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 21403e213fb01..8e429cac59072 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print_sampler(gsmpl->chain);
+        llama_perf_sampler_print(gsmpl->chain);
     }
 
     if (ctx) {
-        llama_perf_print_context(ctx);
+        llama_perf_context_print(ctx);
     }
 }
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 931a05286c64e..c2e854ea46a04 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -209,7 +209,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index a6c1b64e947a5..10f2e7fd117a1 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
 
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
 
-llama_perf_print_sampler(smpl)
-llama_perf_print_context(context)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 8dc35e73d649e..f1df20c6ecf09 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print_sampler(smpl);
-    llama_perf_print_context(ctx);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index e7134608a2a7c..5661cf0b78142 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     // clean up
     llama_batch_free(batch);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 6f0e59dd853ca..af389abe1aac1 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index e9eda9575c098..73b54da7fd4a9 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 451b8d9ffaba8..2d90f65a07e52 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
             fflush(p_err->fout);
         }
 
-        llama_perf_print_context(ctx);
+        llama_perf_context_print(ctx);
 
         llama_free(ctx);
 
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 3419da4d43222..12fe7345ff76c 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-        llama_perf_print_context(ctx_llava->ctx_llama);
+        llama_perf_context_print(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_perf_print_context(ctx_llava->ctx_llama);
+            llama_perf_context_print(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 56135cf2f898e..f36caa42e11a3 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print_context(ctx_llava->ctx_llama);
+    llama_perf_context_print(ctx_llava->ctx_llama);
 
     ctx_llava->model = NULL;
     llava_free(ctx_llava);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 8277487814156..758393c3d767a 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index e53513b41ce45..52aa68bfcdf3c 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index cfb3b5e056ec4..1bdb6521c640f 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -2049,7 +2049,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index ef20aa86bf162..d08679edb3d14 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     // clean up
     llama_batch_free(query_batch);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index dabd619ead4ad..0c923d4edf68f 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print_sampler(smpl);
-    llama_perf_print_context(ctx);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index f82c21ce85297..843579acd2222 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
 
     LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    llama_perf_print_context(ctx_dft);
+    llama_perf_context_print(ctx_dft);
 
     LOG_TEE("\ntarget:\n\n");
     gpt_perf_print(ctx_tgt, smpl);
diff --git a/include/llama.h b/include/llama.h
index e63daea9b0648..d0e0b3a6105e4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1169,7 +1169,7 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    struct llama_perf_data_context {
+    struct llama_perf_context_data {
         double t_start_ms;
         double t_load_ms;
         double t_p_eval_ms;
@@ -1179,20 +1179,20 @@ extern "C" {
         int32_t n_eval;
     };
 
-    struct llama_perf_data_sampler {
+    struct llama_perf_sampler_data {
         double t_sample_ms;
         int32_t n_sample;
     };
 
-    LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx);
-    LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain);
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
 
-    LLAMA_API void llama_perf_print_context(const struct llama_context * ctx);
-    LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain);
-
-    LLAMA_API void llama_perf_reset_context(struct llama_context * ctx);
-    LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain);
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 6f448b80c44c1..d17e4427ea75a 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1599,3 +1599,39 @@ struct llama_sampler * llama_sampler_init_logit_bias(
         },
     };
 }
+
+// perf
+
+struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_sampler_data data = {};
+
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        // TODO: return empty data, or GGML_ABORT() ?
+        return data;
+    }
+
+    const auto * p = (const struct llama_sampler_chain *) chain->ctx;
+
+    data.t_sample_ms = 1e-3 * p->t_sample_us;
+    data.n_sample    = std::max(0, p->n_sample);
+
+    return data;
+}
+
+void llama_perf_sampler_print(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+
+    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
+
+void llama_perf_sampler_reset(struct llama_sampler * chain) {
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        // TODO: return empty data, or GGML_ABORT() ?
+        return;
+    }
+
+    auto * p = (struct llama_sampler_chain *) chain->ctx;
+
+    p->t_sample_us = p->n_sample = 0;
+}
diff --git a/src/llama.cpp b/src/llama.cpp
index 085a8cd3b909c..d3ba80ebab085 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20687,8 +20687,8 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) {
-    struct llama_perf_data_context data = {};
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
 
     if (ctx == nullptr) {
         return data;
@@ -20704,22 +20704,7 @@ struct llama_perf_data_context llama_perf_context(const struct llama_context * c
     return data;
 }
 
-struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) {
-    struct llama_perf_data_sampler data = {};
-
-    if (chain == nullptr) {
-        return data;
-    }
-
-    const auto * p = (const struct llama_sampler_chain *) chain->ctx;
-
-    data.t_sample_ms = 1e-3 * p->t_sample_us;
-    data.n_sample    = std::max(0, p->n_sample);
-
-    return data;
-}
-
-void llama_perf_print_context(const struct llama_context * ctx) {
+void llama_perf_context_print(const struct llama_context * ctx) {
     const auto data = llama_perf_context(ctx);
 
     const double t_end_ms = 1e-3 * ggml_time_us();
@@ -20732,25 +20717,12 @@ void llama_perf_print_context(const struct llama_context * ctx) {
     LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 }
 
-void llama_perf_print_sampler(const struct llama_sampler * chain) {
-    const auto data = llama_perf_sampler(chain);
-
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
-}
-
-void llama_perf_reset_context(struct llama_context * ctx) {
+void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_start_us  = ggml_time_us();
     ctx->t_eval_us   = ctx->n_eval   = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
-void llama_perf_reset_sampler(struct llama_sampler * chain) {
-    auto * p = (struct llama_sampler_chain *) chain->ctx;
-
-    p->t_sample_us = p->n_sample = 0;
-}
-
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
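For reference, a minimal usage sketch of the renamed API, based only on the declarations added to include/llama.h in this diff. It assumes `ctx` was created with llama_new_context_with_model() and `smpl` with llama_sampler_chain_init() (the llama_perf_sampler_* calls only report data for sampler chains); the report_perf helper below is hypothetical and not part of this change.

// Hypothetical helper illustrating the renamed llama_perf_* API (sketch only).
#include "llama.h"
#include <cstdio>

static void report_perf(struct llama_context * ctx, struct llama_sampler * smpl) {
    // one-line summaries, as printed by the examples touched in this diff
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);

    // raw counters, for callers that want to format the numbers themselves
    const llama_perf_context_data cdata = llama_perf_context(ctx);
    const llama_perf_sampler_data sdata = llama_perf_sampler(smpl);
    fprintf(stderr, "eval: %d tokens in %.2f ms, sampling: %d runs in %.2f ms\n",
            cdata.n_eval, cdata.t_eval_ms, sdata.n_sample, sdata.t_sample_ms);

    // start a fresh measurement window
    llama_perf_sampler_reset(smpl);
    llama_perf_context_reset(ctx);
}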