From 7362f288337f2b1c6e8ecc1c10e7349a51fc7aab Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 12 Sep 2024 09:19:41 +0300
Subject: [PATCH] perf : safer pointer handling + naming update

ggml-ci
---
 common/common.cpp                         |  2 +-
 common/sampling.cpp                       |  4 +--
 examples/batched-bench/batched-bench.cpp  |  2 +-
 examples/batched.swift/Sources/main.swift |  4 +--
 examples/batched/batched.cpp              |  4 +--
 examples/embedding/embedding.cpp          |  2 +-
 examples/eval-callback/eval-callback.cpp  |  2 +-
 examples/imatrix/imatrix.cpp              |  2 +-
 examples/llama-bench/llama-bench.cpp      |  2 +-
 examples/llava/llava-cli.cpp              |  4 +--
 examples/llava/minicpmv-cli.cpp           |  2 +-
 examples/parallel/parallel.cpp            |  2 +-
 examples/passkey/passkey.cpp              |  2 +-
 examples/perplexity/perplexity.cpp        |  2 +-
 examples/retrieval/retrieval.cpp          |  2 +-
 examples/simple/simple.cpp                |  4 +--
 examples/speculative/speculative.cpp      |  2 +-
 include/llama.h                           | 18 ++++++------
 src/llama-sampling.cpp                    | 36 +++++++++++++++++++++++
 src/llama.cpp                             | 36 +++--------------------
 20 files changed, 71 insertions(+), 63 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b294dd6bf2ac5..11fc1e2233388 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -828,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset_context(lctx);
+        llama_perf_context_reset(lctx);
     }
 
     iparams.model = model;
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 21403e213fb01..8e429cac59072 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print_sampler(gsmpl->chain);
+        llama_perf_sampler_print(gsmpl->chain);
     }
 
     if (ctx) {
-        llama_perf_print_context(ctx);
+        llama_perf_context_print(ctx);
     }
 }
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 931a05286c64e..c2e854ea46a04 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -209,7 +209,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index a6c1b64e947a5..10f2e7fd117a1 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
 
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
 
-llama_perf_print_sampler(smpl)
-llama_perf_print_context(context)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 8dc35e73d649e..f1df20c6ecf09 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print_sampler(smpl);
-    llama_perf_print_context(ctx);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index e7134608a2a7c..5661cf0b78142 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     // clean up
     llama_batch_free(batch);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 6f0e59dd853ca..af389abe1aac1 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index e9eda9575c098..73b54da7fd4a9 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 451b8d9ffaba8..2d90f65a07e52 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
             fflush(p_err->fout);
         }
 
-        llama_perf_print_context(ctx);
+        llama_perf_context_print(ctx);
 
         llama_free(ctx);
 
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 3419da4d43222..12fe7345ff76c 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_perf_print_context(ctx_llava->ctx_llama);
+            llama_perf_context_print(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_perf_print_context(ctx_llava->ctx_llama);
+            llama_perf_context_print(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 56135cf2f898e..f36caa42e11a3 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print_context(ctx_llava->ctx_llama);
+    llama_perf_context_print(ctx_llava->ctx_llama);
 
     ctx_llava->model = NULL;
     llava_free(ctx_llava);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 8277487814156..758393c3d767a 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index e53513b41ce45..52aa68bfcdf3c 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index cfb3b5e056ec4..1bdb6521c640f 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -2049,7 +2049,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index ef20aa86bf162..d08679edb3d14 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print_context(ctx);
+    llama_perf_context_print(ctx);
 
     // clean up
     llama_batch_free(query_batch);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index dabd619ead4ad..0c923d4edf68f 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print_sampler(smpl);
-    llama_perf_print_context(ctx);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index f82c21ce85297..843579acd2222 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\ndraft:\n\n");
 
     // TODO: print sampling/grammar timings for all drafts
-    llama_perf_print_context(ctx_dft);
+    llama_perf_context_print(ctx_dft);
 
     LOG_TEE("\ntarget:\n\n");
     gpt_perf_print(ctx_tgt, smpl);
diff --git a/include/llama.h b/include/llama.h
index e63daea9b0648..d0e0b3a6105e4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1169,7 +1169,7 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    struct llama_perf_data_context {
+    struct llama_perf_context_data {
         double t_start_ms;
         double t_load_ms;
         double t_p_eval_ms;
@@ -1179,20 +1179,20 @@ extern "C" {
         int32_t n_eval;
     };
 
-    struct llama_perf_data_sampler {
+    struct llama_perf_sampler_data {
         double t_sample_ms;
 
         int32_t n_sample;
     };
 
-    LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx);
-    LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain);
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
 
-    LLAMA_API void llama_perf_print_context(const struct llama_context * ctx);
-    LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain);
-
-    LLAMA_API void llama_perf_reset_context(struct llama_context * ctx);
-    LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain);
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 6f448b80c44c1..d17e4427ea75a 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1599,3 +1599,39 @@ struct llama_sampler * llama_sampler_init_logit_bias(
         },
     };
 }
+
+// perf
+
+struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_sampler_data data = {};
+
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        // TODO: return empty data, or GGML_ABORT() ?
+        return data;
+    }
+
+    const auto * p = (const struct llama_sampler_chain *) chain->ctx;
+
+    data.t_sample_ms = 1e-3 * p->t_sample_us;
+    data.n_sample = std::max(0, p->n_sample);
+
+    return data;
+}
+
+void llama_perf_sampler_print(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+
+    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
+
+void llama_perf_sampler_reset(struct llama_sampler * chain) {
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        // TODO: return empty data, or GGML_ABORT() ?
+        return;
+    }
+
+    auto * p = (struct llama_sampler_chain *) chain->ctx;
+
+    p->t_sample_us = p->n_sample = 0;
+}
diff --git a/src/llama.cpp b/src/llama.cpp
index 085a8cd3b909c..d3ba80ebab085 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20687,8 +20687,8 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) {
-    struct llama_perf_data_context data = {};
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
 
     if (ctx == nullptr) {
         return data;
@@ -20704,22 +20704,7 @@ struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) {
     return data;
 }
 
-struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) {
-    struct llama_perf_data_sampler data = {};
-
-    if (chain == nullptr) {
-        return data;
-    }
-
-    const auto * p = (const struct llama_sampler_chain *) chain->ctx;
-
-    data.t_sample_ms = 1e-3 * p->t_sample_us;
-    data.n_sample = std::max(0, p->n_sample);
-
-    return data;
-}
-
-void llama_perf_print_context(const struct llama_context * ctx) {
+void llama_perf_context_print(const struct llama_context * ctx) {
     const auto data = llama_perf_context(ctx);
 
     const double t_end_ms = 1e-3 * ggml_time_us();
@@ -20732,25 +20717,12 @@ void llama_perf_print_context(const struct llama_context * ctx) {
     LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 }
 
-void llama_perf_print_sampler(const struct llama_sampler * chain) {
-    const auto data = llama_perf_sampler(chain);
-
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
-}
-
-void llama_perf_reset_context(struct llama_context * ctx) {
+void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
     ctx->t_eval_us = ctx->n_eval = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
-void llama_perf_reset_sampler(struct llama_sampler * chain) {
-    auto * p = (struct llama_sampler_chain *) chain->ctx;
-
-    p->t_sample_us = p->n_sample = 0;
-}
-
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");