From 471e7e1e594aab1ebf41b391cbb8fa618961de57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Sep 2024 20:50:23 +0300 Subject: [PATCH 1/7] llama : llama_perf + option to disable timings during decode ggml-ci --- common/common.cpp | 1 + common/common.h | 1 + common/sampling.cpp | 2 +- common/sampling.h | 1 + include/llama.h | 21 +++++++++++- src/llama.cpp | 84 ++++++++++++++++++++++++++++++--------------- 6 files changed, 81 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c5c4d7508f033..3203faddd634a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2810,6 +2810,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); diff --git a/common/common.h b/common/common.h index d7c08f20a124b..4e2924a56d94d 100644 --- a/common/common.h +++ b/common/common.h @@ -204,6 +204,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool no_perf = false; // no perf (TODO: add llama_arg) bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/common/sampling.cpp b/common/sampling.cpp index 7806b77e06a9f..c66a4582be237 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -139,7 +139,7 @@ std::string gpt_sampler_params::print() const { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - lparams.no_perf = false; // TODO: control via params + lparams.no_perf = params.no_perf; auto * result = new gpt_sampler { /* .params = */ params, diff --git a/common/sampling.h b/common/sampling.h index 654e0c513904d..67ad0add41d90 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -39,6 +39,7 @@ struct gpt_sampler_params { float mirostat_eta = 0.10f; // learning rate bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; + bool no_perf = false; // disable performance metrics std::vector samplers = { GPT_SAMPLER_TYPE_TOP_K, diff --git a/include/llama.h b/include/llama.h index 6334fc30d413c..e21a62e260322 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,7 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - //bool no_perf; // whether to measure performance timings, TODO: implement + bool no_perf; // whether to measure performance timings // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -1168,11 +1168,30 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. 
// + // performance timing information + struct llama_perf_data { + // llama_context + double t_start_ms; + double t_load_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_p_eval; + int32_t n_eval; + + // llama_sampler_chain + double t_sample_ms; + + int32_t n_sample; + }; + enum llama_perf_type { LLAMA_PERF_TYPE_CONTEXT = 0, LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, }; + LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); diff --git a/src/llama.cpp b/src/llama.cpp index f590bcd3b9047..c68f5e826ef76 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2482,6 +2482,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool no_perf; enum llama_pooling_type pooling_type; @@ -6657,8 +6658,6 @@ static bool llm_load_tensors( bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - auto & hparams = model.hparams; model.split_mode = split_mode; @@ -8589,14 +8588,13 @@ static bool llm_load_tensors( } } - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { + model.t_start_us = ggml_time_us(); + try { llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); @@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam return -1; } + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; + return 0; } @@ -17939,6 +17941,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -18149,6 +18152,7 @@ struct llama_context * llama_new_context_with_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? 
hparams.n_ctx_train : params.n_ctx; @@ -20067,10 +20071,14 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_eval++; } else if (ctx->n_queued_tokens > 1) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_p_eval += ctx->n_queued_tokens; } @@ -20677,39 +20685,61 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { +llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) { + llama_perf_data data = {}; + + if (ctx == nullptr) { + return data; + } + switch (type) { case LLAMA_PERF_TYPE_CONTEXT: { const auto * p = (const struct llama_context *) ctx; - const double t_start_ms = 1e-3 * p->t_start_us; - const double t_end_ms = 1.00 * ggml_time_ms(); - const double t_load_ms = 1e-3 * p->t_load_us; - const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; - const double t_eval_ms = 1e-3 * p->t_eval_us; + data.t_start_ms = 1e-3 * p->t_start_us; + data.t_load_ms = 1e-3 * p->t_load_us;; + data.t_p_eval_ms = 1e-3 * p->t_p_eval_us; + data.t_eval_ms = 1e-3 * p->t_eval_us; + data.n_p_eval = std::max(1, p->n_p_eval); + data.n_eval = std::max(1, p->n_eval); + } break; + case LLAMA_PERF_TYPE_SAMPLER_CHAIN: + { + const auto * smpl = (const struct llama_sampler *) ctx; + const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - const int32_t n_p_eval = std::max(0, p->n_p_eval); - const int32_t n_eval = std::max(1, p->n_eval); + data.t_sample_ms = 1e-3 * p->t_sample_us; + data.n_sample = std::max(0, p->n_sample); + } break; + default: + GGML_ABORT("invalid perf type"); + } + + return data; +} - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms); +void llama_perf_print(const void * ctx, enum llama_perf_type type) { + switch (type) { + case LLAMA_PERF_TYPE_CONTEXT: + { + const auto data = llama_perf_get(ctx, type); + + const double t_end_ms = 1e-3 * ggml_time_us(); + + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } break; case LLAMA_PERF_TYPE_SAMPLER_CHAIN: { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - - const double t_sampler_ms = 1e-3 * p->t_sample_us; - - const int32_t n_sampler = std::max(0, p->n_sample); + const auto data = llama_perf_get(ctx, type); LLAMA_LOG_INFO("%s: 
sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } break; default: GGML_ABORT("invalid perf type"); @@ -20729,7 +20759,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) { case LLAMA_PERF_TYPE_SAMPLER_CHAIN: { auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; + auto * p = (struct llama_sampler_chain *) smpl->ctx; p->t_sample_us = p->n_sample = 0; } break; From ade52b6cc6cbb32cd6820bf37e0301cefd6e155c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Sep 2024 08:57:56 +0300 Subject: [PATCH 2/7] common : add llama_arg --- common/common.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 3203faddd634a..c2c79433dd8e6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -995,6 +995,14 @@ std::vector gpt_params_parser_init(gpt_params & params, llama_example params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"--no-perf"}, + format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](gpt_params & params) { + params.no_perf = true; + params.sparams.no_perf = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(llama_arg( {"-p", "--prompt"}, "PROMPT", ex == LLAMA_EXAMPLE_MAIN From fd46535314b8b1b49daa812320cb1489dbed3464 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Sep 2024 13:03:54 +0300 Subject: [PATCH 3/7] Update src/llama.cpp Co-authored-by: Xuan Son Nguyen --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index b640951294b9b..22f7cbbf0736d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20700,7 +20700,7 @@ llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) { const auto * p = (const struct llama_context *) ctx; data.t_start_ms = 1e-3 * p->t_start_us; - data.t_load_ms = 1e-3 * p->t_load_us;; + data.t_load_ms = 1e-3 * p->t_load_us; data.t_p_eval_ms = 1e-3 * p->t_p_eval_us; data.t_eval_ms = 1e-3 * p->t_eval_us; data.n_p_eval = std::max(1, p->n_p_eval); From f42de2426e63a183c5003846fc297992f0759b57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Sep 2024 09:56:41 +0300 Subject: [PATCH 4/7] perf : separate functions in the API ggml-ci --- common/common.cpp | 2 +- common/sampling.cpp | 4 +- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 4 +- examples/batched/batched.cpp | 4 +- examples/embedding/embedding.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +- examples/llava/minicpmv-cli.cpp | 2 +- examples/lookup/lookup.cpp | 3 +- examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/simple/simple.cpp | 4 +- examples/speculative/speculative.cpp | 2 +- include/llama.h | 20 ++-- src/llama.cpp | 107 +++++++++------------- 20 files changed, 76 insertions(+), 98 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4982150361569..b294dd6bf2ac5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -828,7 +828,7 @@ struct 
llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_reset_context(lctx); } iparams.model = model; diff --git a/common/sampling.cpp b/common/sampling.cpp index ee290f82a53b6..21403e213fb01 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_print_sampler(gsmpl->chain); } if (ctx) { - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a91e7f4bdea08..931a05286c64e 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -209,7 +209,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 9f7c49492dda1..a6c1b64e947a5 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) -llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) +llama_perf_print_sampler(smpl) +llama_perf_print_context(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 5d32153fe1a9a..8dc35e73d649e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_sampler(smpl); + llama_perf_print_context(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index da7c7925362af..e7134608a2a7c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index bc72031434103..6f0e59dd853ca 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 032a901365640..e9eda9575c098 100644 --- a/examples/imatrix/imatrix.cpp +++ 
b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d7db5af722a60..451b8d9ffaba8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e9108a9bdbd4b..3419da4d43222 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3475bbce58562..56135cf2f898e 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index fff44a499e4bc..be6f8d7d7b6e9 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -240,8 +240,7 @@ int main(int argc, char ** argv){ LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index bc6301311d941..8277487814156 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d3d5ab46fa0db..e53513b41ce45 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index c7d617988b2ed..cfb3b5e056ec4 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2049,7 
+2049,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 7a360b731916a..ef20aa86bf162 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3fdc0439445e8..dabd619ead4ad 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_sampler(smpl); + llama_perf_print_context(ctx); fprintf(stderr, "\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 214e4932ba2ca..f82c21ce85297 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_print_context(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index cc488f5a601b0..e63daea9b0648 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1169,9 +1169,7 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. 
// - // performance timing information - struct llama_perf_data { - // llama_context + struct llama_perf_data_context { double t_start_ms; double t_load_ms; double t_p_eval_ms; @@ -1179,22 +1177,22 @@ extern "C" { int32_t n_p_eval; int32_t n_eval; + }; - // llama_sampler_chain + struct llama_perf_data_sampler { double t_sample_ms; int32_t n_sample; }; - enum llama_perf_type { - LLAMA_PERF_TYPE_CONTEXT = 0, - LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, - }; + LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx); + LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain); - LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_print_context(const struct llama_context * ctx); + LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain); - LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); - LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_reset_context(struct llama_context * ctx); + LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 22f7cbbf0736d..085a8cd3b909c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20687,87 +20687,68 @@ const char * llama_print_system_info(void) { return s.c_str(); } -llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) { - llama_perf_data data = {}; +struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_data_context data = {}; if (ctx == nullptr) { return data; } - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto * p = (const struct llama_context *) ctx; - - data.t_start_ms = 1e-3 * p->t_start_us; - data.t_load_ms = 1e-3 * p->t_load_us; - data.t_p_eval_ms = 1e-3 * p->t_p_eval_us; - data.t_eval_ms = 1e-3 * p->t_eval_us; - data.n_p_eval = std::max(1, p->n_p_eval); - data.n_eval = std::max(1, p->n_eval); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; + data.t_start_ms = 1e-3 * ctx->t_start_us; + data.t_load_ms = 1e-3 * ctx->t_load_us; + data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; + data.t_eval_ms = 1e-3 * ctx->t_eval_us; + data.n_p_eval = std::max(1, ctx->n_p_eval); + data.n_eval = std::max(1, ctx->n_eval); - data.t_sample_ms = 1e-3 * p->t_sample_us; - data.n_sample = std::max(0, p->n_sample); - } break; - default: - GGML_ABORT("invalid perf type"); + return data; +} + +struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_data_sampler data = {}; + + if (chain == nullptr) { + return data; } + const auto * p = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * p->t_sample_us; + data.n_sample = std::max(0, p->n_sample); + return data; } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto data = llama_perf_get(ctx, type); +void llama_perf_print_context(const struct llama_context * ctx) { + const auto data = llama_perf_context(ctx); - const double t_end_ms = 1e-3 * ggml_time_us(); + const double t_end_ms = 1e-3 * ggml_time_us(); - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); - 
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto data = llama_perf_get(ctx, type); + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +} - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); - } break; - default: - GGML_ABORT("invalid perf type"); - } +void llama_perf_print_sampler(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } -void llama_perf_reset(void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - auto * p = (struct llama_context *) ctx; +void llama_perf_reset_context(struct llama_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; +} - p->t_start_us = ggml_time_us(); - p->t_eval_us = p->n_eval = 0; - p->t_p_eval_us = p->n_p_eval = 0; - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; +void llama_perf_reset_sampler(struct llama_sampler * chain) { + auto * p = (struct llama_sampler_chain *) chain->ctx; - p->t_sample_us = p->n_sample = 0; - } break; - default: - GGML_ABORT("invalid perf type"); - } + p->t_sample_us = p->n_sample = 0; } void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { From 7362f288337f2b1c6e8ecc1c10e7349a51fc7aab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 09:19:41 +0300 Subject: [PATCH 5/7] perf : safer pointer handling + naming update ggml-ci --- common/common.cpp | 2 +- common/sampling.cpp | 4 +-- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 4 +-- examples/batched/batched.cpp | 4 +-- examples/embedding/embedding.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +-- examples/llava/minicpmv-cli.cpp | 2 +- 
examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/simple/simple.cpp | 4 +-- examples/speculative/speculative.cpp | 2 +- include/llama.h | 18 ++++++------ src/llama-sampling.cpp | 36 +++++++++++++++++++++++ src/llama.cpp | 36 +++-------------------- 20 files changed, 71 insertions(+), 63 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b294dd6bf2ac5..11fc1e2233388 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -828,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset_context(lctx); + llama_perf_context_reset(lctx); } iparams.model = model; diff --git a/common/sampling.cpp b/common/sampling.cpp index 21403e213fb01..8e429cac59072 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print_sampler(gsmpl->chain); + llama_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 931a05286c64e..c2e854ea46a04 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -209,7 +209,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index a6c1b64e947a5..10f2e7fd117a1 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print_sampler(smpl) -llama_perf_print_context(context) +llama_perf_sampler_print(smpl) +llama_perf_context_print(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 8dc35e73d649e..f1df20c6ecf09 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print_sampler(smpl); - llama_perf_print_context(ctx); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e7134608a2a7c..5661cf0b78142 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6f0e59dd853ca..af389abe1aac1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ 
b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e9eda9575c098..73b54da7fd4a9 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 451b8d9ffaba8..2d90f65a07e52 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 3419da4d43222..12fe7345ff76c 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print_context(ctx_llava->ctx_llama); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print_context(ctx_llava->ctx_llama); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 56135cf2f898e..f36caa42e11a3 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print_context(ctx_llava->ctx_llama); + llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 8277487814156..758393c3d767a 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e53513b41ce45..52aa68bfcdf3c 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index cfb3b5e056ec4..1bdb6521c640f 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2049,7 +2049,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git 
a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index ef20aa86bf162..d08679edb3d14 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print_context(ctx); + llama_perf_context_print(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index dabd619ead4ad..0c923d4edf68f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print_sampler(smpl); - llama_perf_print_context(ctx); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f82c21ce85297..843579acd2222 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print_context(ctx_dft); + llama_perf_context_print(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index e63daea9b0648..d0e0b3a6105e4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1169,7 +1169,7 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // - struct llama_perf_data_context { + struct llama_perf_context_data { double t_start_ms; double t_load_ms; double t_p_eval_ms; @@ -1179,20 +1179,20 @@ extern "C" { int32_t n_eval; }; - struct llama_perf_data_sampler { + struct llama_perf_sampler_data { double t_sample_ms; int32_t n_sample; }; - LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx); - LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain); + LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); + LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); + LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); - LLAMA_API void llama_perf_print_context(const struct llama_context * ctx); - LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain); - - LLAMA_API void llama_perf_reset_context(struct llama_context * ctx); - LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain); + // NOTE: the following work only with samplers constructed via llama_sampler_chain_init + LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6f448b80c44c1..d17e4427ea75a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1599,3 +1599,39 @@ struct llama_sampler * llama_sampler_init_logit_bias( }, }; } + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || 
chain->iface != &llama_sampler_chain_i) { + // TODO: return empty data, or GGML_ABORT() ? + return data; + } + + const auto * p = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * p->t_sample_us; + data.n_sample = std::max(0, p->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + // TODO: return empty data, or GGML_ABORT() ? + return; + } + + auto * p = (struct llama_sampler_chain *) chain->ctx; + + p->t_sample_us = p->n_sample = 0; +} diff --git a/src/llama.cpp b/src/llama.cpp index 085a8cd3b909c..d3ba80ebab085 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20687,8 +20687,8 @@ const char * llama_print_system_info(void) { return s.c_str(); } -struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) { - struct llama_perf_data_context data = {}; +struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_context_data data = {}; if (ctx == nullptr) { return data; @@ -20704,22 +20704,7 @@ struct llama_perf_data_context llama_perf_context(const struct llama_context * c return data; } -struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) { - struct llama_perf_data_sampler data = {}; - - if (chain == nullptr) { - return data; - } - - const auto * p = (const struct llama_sampler_chain *) chain->ctx; - - data.t_sample_ms = 1e-3 * p->t_sample_us; - data.n_sample = std::max(0, p->n_sample); - - return data; -} - -void llama_perf_print_context(const struct llama_context * ctx) { +void llama_perf_context_print(const struct llama_context * ctx) { const auto data = llama_perf_context(ctx); const double t_end_ms = 1e-3 * ggml_time_us(); @@ -20732,25 +20717,12 @@ void llama_perf_print_context(const struct llama_context * ctx) { LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } -void llama_perf_print_sampler(const struct llama_sampler * chain) { - const auto data = llama_perf_sampler(chain); - - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); -} - -void llama_perf_reset_context(struct llama_context * ctx) { +void llama_perf_context_reset(struct llama_context * ctx) { ctx->t_start_us = ggml_time_us(); ctx->t_eval_us = ctx->n_eval = 0; ctx->t_p_eval_us = ctx->n_p_eval = 0; } -void llama_perf_reset_sampler(struct llama_sampler * chain) { - auto * p = (struct llama_sampler_chain *) chain->ctx; - - p->t_sample_us = p->n_sample = 0; -} - void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { fprintf(stream, "\n"); fprintf(stream, "###########\n"); From f35e9b87cd259cbe9fa618ac9125f040ebc3e4b8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 09:23:50 +0300 Subject: [PATCH 6/7] minor : better local var name --- src/llama-sampling.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 87d975f6083f4..ea83f6de2ead6 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1667,10 +1667,10 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c return data; } - const auto * p = (const struct llama_sampler_chain *) chain->ctx; + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; - data.t_sample_ms = 1e-3 * p->t_sample_us; - data.n_sample = std::max(0, p->n_sample); + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); return data; } @@ -1688,7 +1688,7 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) { return; } - auto * p = (struct llama_sampler_chain *) chain->ctx; + auto * ctx = (struct llama_sampler_chain *) chain->ctx; - p->t_sample_us = p->n_sample = 0; + ctx->t_sample_us = ctx->n_sample = 0; } From 444b757bce023e42cae4bd179302bc81132adbe5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 15:08:48 +0300 Subject: [PATCH 7/7] perf : abort on invalid sampler pointer ggml-ci --- src/llama-sampling.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ea83f6de2ead6..dd86f2661fc09 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1663,8 +1663,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c struct llama_perf_sampler_data data = {}; if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { - // TODO: return empty data, or GGML_ABORT() ? - return data; + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); } const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; @@ -1684,8 +1683,7 @@ void llama_perf_sampler_print(const struct llama_sampler * chain) { void llama_perf_sampler_reset(struct llama_sampler * chain) { if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { - // TODO: return empty data, or GGML_ABORT() ? - return; + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); } auto * ctx = (struct llama_sampler_chain *) chain->ctx;
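Usage sketch (illustrative only, not part of the patches above): the fragment below shows how the perf API, as it stands after PATCH 7/7, is meant to be driven from application code. Only the llama_perf_* calls, the llama_perf_*_data structs and the no_perf context parameter come from this series; the helper name report_perf and the surrounding setup are assumptions made for the example. It presumes ctx was created with no_perf = false (note that llama_context_default_params() now defaults no_perf to true, while the common-code gpt_params default keeps timings on unless --no-perf is passed) and that smpl was built with llama_sampler_chain_init(), since the sampler variants abort on any other sampler.

#include <cstdio>
#include "llama.h"

// Hypothetical helper for illustration; creation of ctx/smpl is omitted.
static void report_perf(struct llama_context * ctx, struct llama_sampler * smpl) {
    // programmatic access to the timings (added by this series)
    const struct llama_perf_context_data cdata = llama_perf_context(ctx);
    const struct llama_perf_sampler_data sdata = llama_perf_sampler(smpl); // aborts unless smpl is a sampler chain

    fprintf(stderr, "prompt eval: %d tokens in %.2f ms\n", cdata.n_p_eval, cdata.t_p_eval_ms);
    fprintf(stderr, "eval:        %d runs   in %.2f ms\n", cdata.n_eval,   cdata.t_eval_ms);
    fprintf(stderr, "sampling:    %d runs   in %.2f ms\n", sdata.n_sample, sdata.t_sample_ms);

    // or let libllama print its standard report
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);

    // reset the counters before the next measured run
    llama_perf_sampler_reset(smpl);
    llama_perf_context_reset(ctx);
}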