From 5449c1720d0214d33766ede463bea1aee5851d2c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 11 Sep 2024 16:41:45 +0300
Subject: [PATCH] cont : update all examples except server

ggml-ci
---
 Makefile                                      |   2 +-
 common/arg.cpp                                |   6 +-
 common/common.cpp                             |  26 +-
 common/common.h                               |   6 +-
 examples/batched-bench/batched-bench.cpp      |  39 +-
 examples/batched/batched.cpp                  |  54 +--
 .../convert-llama2c-to-ggml.cpp               |  96 ++---
 examples/embedding/embedding.cpp              | 109 +++---
 examples/eval-callback/eval-callback.cpp      |  54 +--
 examples/imatrix/imatrix.cpp                  |  69 ++--
 examples/infill/infill.cpp                    | 184 +++++-----
 examples/llava/clip.cpp                       | 140 +++----
 examples/llava/llava-cli.cpp                  |  72 ++--
 examples/llava/llava.cpp                      |  47 +--
 examples/llava/minicpmv-cli.cpp               |  90 +++--
 examples/lookahead/lookahead.cpp              |  59 +--
 examples/lookup/lookup-stats.cpp              |  33 +-
 examples/lookup/lookup.cpp                    |  59 +--
 examples/main/main.cpp                        |  14 +-
 examples/parallel/parallel.cpp                |  70 ++--
 examples/passkey/passkey.cpp                  |  63 ++--
 examples/perplexity/perplexity.cpp            | 343 +++++++++---------
 examples/retrieval/retrieval.cpp              |  72 ++--
 examples/server/server.cpp                    |  20 +-
 examples/simple/simple.cpp                    |   4 +-
 examples/speculative/speculative.cpp          | 105 +++---
 examples/tokenize/tokenize.cpp                |  49 ++-
 27 files changed, 975 insertions(+), 910 deletions(-)

diff --git a/Makefile b/Makefile
index ae883e989d8093..8a85662d16c547 100644
--- a/Makefile
+++ b/Makefile
@@ -1348,7 +1348,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_GGML) $(OBJ_LLAMA)
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
diff --git a/common/arg.cpp b/common/arg.cpp
index c71b31eef1a0f8..c71e730df5ef9a 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -419,7 +419,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.use_color = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(llama_arg(
         {"-t", "--threads"}, "N",
         format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -870,7 +870,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -878,7 +878,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
diff --git a/common/common.cpp b/common/common.cpp
index ecebb05eb9d11c..a6869778d63732 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -442,7 +442,29 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
-std::string string_from_tokens(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+std::string string_from(bool value) {
+    return value ? "true" : "false";
+}
+
+std::string string_from(const std::vector<int> & values) {
+    std::stringstream buf;
+
+    buf << "[ ";
+    bool first = true;
+    for (auto e : values) {
+        if (first) {
+            first = false;
+        } else {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
     std::stringstream buf;
 
     buf << "[ ";
@@ -473,7 +495,7 @@ std::string string_from_tokens(const struct llama_context * ctx, const std::vect
     return buf.str();
 }
 
-std::string string_from_batch(const struct llama_context * ctx, const struct llama_batch & batch) {
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
     std::stringstream buf;
 
     buf << "[ ";
diff --git a/common/common.h b/common/common.h
index 8feb9cf9129d78..06ea7abad4c74f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -374,8 +374,10 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
-std::string string_from_tokens(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
-std::string string_from_batch (const struct llama_context * ctx, const struct llama_batch & batch);
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
 
 //
 // Filesystem utils
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 89a4566c4b8272..61562a387754e1 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
@@ -8,12 +9,18 @@
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
@@ -76,7 +83,7 @@ int main(int argc, char ** argv) {
 
             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                 return false;
             }
 
@@ -93,17 +100,17 @@ int main(int argc, char ** argv) {
         }
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
     }
 
     if (!params.batched_bench_output_jsonl) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG_TEE("\n");
-        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
     }
 
     for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
@@ -133,7 +140,7 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_clear(ctx);
 
                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }
 
@@ -155,7 +162,7 @@ int main(int argc, char ** argv) {
                     }
 
                     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_TEE("%s: llama_decode() failed\n", __func__);
+                        LOG_ERR("%s: llama_decode() failed\n", __func__);
                         return 1;
                     }
                 }
@@ -173,20 +180,20 @@ int main(int argc, char ** argv) {
                 const float speed    = n_kv / t;
 
                 if(params.batched_bench_output_jsonl) {
-                    LOG_TEE(
+                    LOG(
                         "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
                         "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
                         n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
                         pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
                     );
                 } else {
-                    LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                    LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
                 }
             }
         }
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
@@ -196,7 +203,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 5d32153fe1a9a1..413a6e571a822e 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
@@ -8,12 +9,18 @@
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     params.prompt = "Hello my name is";
@@ -42,7 +49,7 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
@@ -72,31 +79,29 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
 
     if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
     const int n_ctx = llama_n_ctx(ctx);
 
-    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_ERR("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }
 
     // print the prompt token-by-token
 
-    fprintf(stderr, "\n");
+    LOG("\n");
 
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
-    fflush(stderr);
-
     // create a llama_batch
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -114,7 +119,7 @@ int main(int argc, char ** argv) {
 
     if (llama_model_has_encoder(model)) {
         if (llama_encode(ctx, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+            LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }
 
@@ -131,7 +136,7 @@ int main(int argc, char ** argv) {
     batch.logits[batch.n_tokens - 1] = true;
 
     if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
@@ -142,7 +147,7 @@ int main(int argc, char ** argv) {
     //}
 
     if (n_parallel > 1) {
-        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
     }
 
     // main loop
@@ -175,9 +180,9 @@ int main(int argc, char ** argv) {
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
-                LOG_TEE("\n");
+                LOG("\n");
                 if (n_parallel > 1) {
-                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }
 
                 continue;
@@ -185,8 +190,7 @@ int main(int argc, char ** argv) {
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-                fflush(stdout);
+                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
             }
 
             streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -208,27 +212,25 @@ int main(int argc, char ** argv) {
 
         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
     }
 
-    LOG_TEE("\n");
-
     if (n_parallel > 1) {
-        LOG_TEE("\n");
+        LOG("\n");
 
         for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
         }
     }
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
     llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
     llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
 
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 8ca9f8915916cc..3d4df24dcb2422 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -9,6 +9,7 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
+#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
     const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
     try {
         w->token_embedding_table.resize(p->vocab_size * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
 
         w->rms_att_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
 
         w->rms_ffn_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
 
         w->wq.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
         w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
         w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
         w->wo.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
         w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
         w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
 
         w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
         w->rms_final_weight.resize(p->dim);
-        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
 
         if (shared_weights) {
             w->wcls = {};
         } else {
             w->wcls.resize(p->vocab_size * p->dim);
-            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
         }
     }
     catch (std::length_error &) {
@@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
     fseek(f, 0, SEEK_END);
     auto end = ftell(f);
     if (curr != end) {
-        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
+        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
         return 1;
     }
 
@@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }
 
 static void print_sample_weights(TransformerWeights *w){
-    LOG("----- Quick print of first of the weight vales of all the variables\n");
-    LOG("%f\n", w->token_embedding_table[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    LOG("%f\n", w->rms_ffn_weight[0]);
-
-    LOG("%f\n", w->wq[0]);
-    LOG("%f\n", w->wk[0]);
-    LOG("%f\n", w->wv[0]);
-    LOG("%f\n", w->wo[0]);
-    LOG("%f\n", w->w1[0]);
-    LOG("%f\n", w->w2[0]);
-    LOG("%f\n", w->w3[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+    LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+    LOG_INF("%f\n", w->token_embedding_table[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    LOG_INF("%f\n", w->rms_ffn_weight[0]);
+
+    LOG_INF("%f\n", w->wq[0]);
+    LOG_INF("%f\n", w->wk[0]);
+    LOG_INF("%f\n", w->wv[0]);
+    LOG_INF("%f\n", w->wo[0]);
+    LOG_INF("%f\n", w->w1[0]);
+    LOG_INF("%f\n", w->w2[0]);
+    LOG_INF("%f\n", w->w3[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -318,20 +319,20 @@ struct train_params {
 };
 
 static void print_params(struct my_llama_hparams * params) {
-    LOG("%s: n_vocab:   %u\n", __func__, params->n_vocab);
-    LOG("%s: n_ctx:     %u\n", __func__, params->n_ctx);
-    LOG("%s: n_embd:    %u\n", __func__, params->n_embd);
-    LOG("%s: n_mult:    %u\n", __func__, params->n_mult);
-    LOG("%s: n_head:    %u\n", __func__, params->n_head);
-    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-    LOG("%s: n_ff:      %u\n", __func__, params->n_ff);
-    LOG("%s: n_layer:   %u\n", __func__, params->n_layer);
-    LOG("%s: n_rot:     %u\n", __func__, params->n_rot);
+    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
+    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
+    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
+    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
+    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
+    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
 }
 
 static void print_tensor_info(const struct ggml_context * ctx) {
     for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        LOG("%s: Allocating ", __func__);
+        LOG_INF("%s: Allocating ", __func__);
         int64_t total = 1;
         int i = 0;
         for (; i < ggml_n_dims(t); ++i) {
@@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
 
 static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
     if (is_ggml_file(filename)) {
-        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
         struct ggml_context * ctx_data = NULL;
 
         struct gguf_init_params params = {
@@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
         gguf_free(ctx);
     } else {
         // assume llama2.c vocabulary
-        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
         llama_file file(filename, "rb");
         if (!file.fp) {
             die_fmt("%s: %s", strerror(errno), filename);
@@ -871,23 +872,28 @@ static std::string basename(const std::string &path) {
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     struct train_params params = get_default_train_params();
     if (!params_parse(argc, argv, &params)) {
         return 1;
     }
-    log_set_target(stdout);
     Config config;
     TransformerWeights weights = {};
     {
-        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
         FILE * file = fopen(params.fn_llama2c_model, "rb");
         if (!file) {
-            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
             return 1;
         }
         // read in the config header
         if (fread(&config, sizeof(Config), 1, file) != 1) {
-            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
             return 1;
         }
         auto shared_weights = config.vocab_size > 0;
@@ -896,7 +902,7 @@ int main(int argc, char ** argv) {
         // read in the Transformer weights
         alloc_weights(&weights, &config, shared_weights);
         if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-            LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
             return 1;
         }
         fclose(file);
@@ -929,7 +935,7 @@ int main(int argc, char ** argv) {
     model.name = basename(params.fn_llama2c_model);
     save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
 
-    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
 
     ggml_free(model.ctx);
     return 0;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index db00c636330fc0..716fb712722805 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <ctime>
@@ -39,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
         // encoder-only model
         if (llama_encode(ctx, batch) < 0) {
-            fprintf(stderr, "%s : failed to encode\n", __func__);
+            LOG_ERR("%s : failed to encode\n", __func__);
         }
     } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
         // decoder-only model
         if (llama_decode(ctx, batch) < 0) {
-            fprintf(stderr, "%s : failed to decode\n", __func__);
+            LOG_ERR("%s : failed to decode\n", __func__);
         }
     }
 
@@ -78,6 +79,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
@@ -99,7 +106,7 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -109,19 +116,19 @@ int main(int argc, char ** argv) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
 
     if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-        fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
         return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
@@ -136,7 +143,7 @@ int main(int argc, char ** argv) {
     for (const auto & prompt : prompts) {
         auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -147,20 +154,20 @@ int main(int argc, char ** argv) {
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s:          'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
 
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
         }
     }
 
@@ -211,57 +218,57 @@ int main(int argc, char ** argv) {
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
     if (params.embd_out.empty()) {
-        fprintf(stdout, "\n");
+        LOG("\n");
 
         if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
             for (int j = 0; j < n_embd_count; j++) {
-                fprintf(stdout, "embedding %d: ", j);
+                LOG("embedding %d: ", j);
                 for (int i = 0; i < std::min(3, n_embd); i++) {
                     if (params.embd_normalize == 0) {
-                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                     } else {
-                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                     }
                 }
-                fprintf(stdout, " ... ");
+                LOG(" ... ");
                 for (int i = n_embd - 3; i < n_embd; i++) {
                     if (params.embd_normalize == 0) {
-                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                     } else {
-                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                     }
                 }
-                fprintf(stdout, "\n");
+                LOG("\n");
             }
         } else {
             // print the first part of the embeddings or for a single prompt, the full embedding
             for (int j = 0; j < n_prompts; j++) {
-                fprintf(stdout, "embedding %d: ", j);
+                LOG("embedding %d: ", j);
                 for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
                     if (params.embd_normalize == 0) {
-                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd + i]);
                     } else {
-                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd + i]);
                     }
                 }
-                fprintf(stdout, "\n");
+                LOG("\n");
             }
 
             // print cosine similarity matrix
             if (n_prompts > 1) {
-                fprintf(stdout, "\n");
-                printf("cosine similarity matrix:\n\n");
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
                 for (int i = 0; i < n_prompts; i++) {
-                    fprintf(stdout, "%6.6s ", prompts[i].c_str());
+                    LOG("%6.6s ", prompts[i].c_str());
                 }
-                fprintf(stdout, "\n");
+                LOG("\n");
                 for (int i = 0; i < n_prompts; i++) {
                     for (int j = 0; j < n_prompts; j++) {
                         float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                        fprintf(stdout, "%6.2f ", sim);
+                        LOG("%6.2f ", sim);
                     }
-                    fprintf(stdout, "%1.10s", prompts[i].c_str());
-                    fprintf(stdout, "\n");
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
                 }
             }
         }
@@ -270,42 +277,42 @@ int main(int argc, char ** argv) {
     if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
         const bool notArray = params.embd_out != "array";
 
-        fprintf(stdout, notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        LOG(notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
         for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) fprintf(stdout, "    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
-            fprintf(stdout, "[");
+            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            LOG("[");
             for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                 i++;
-                if (i < n_embd) fprintf(stdout, ","); else break;
+                if (i < n_embd) LOG(","); else break;
             }
-            fprintf(stdout, notArray ? "]\n    }" : "]");
+            LOG(notArray ? "]\n    }" : "]");
             j++;
-            if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
         }
-        fprintf(stdout, notArray ? "\n  ]" : "]\n");
+        LOG(notArray ? "\n  ]" : "]\n");
 
         if (params.embd_out == "json+" && n_prompts > 1) {
-            fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
+            LOG(",\n  \"cosineSimilarity\": [\n");
             for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
-                fprintf(stdout, "    [");
+                LOG("    [");
                 for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f", sim);
+                    LOG("%6.2f", sim);
                     j++;
-                    if (j < n_embd_count) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) LOG(", "); else break;
                 }
-                fprintf(stdout, " ]");
+                LOG(" ]");
                 i++;
-                if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) LOG(",\n"); else break;
             }
-            fprintf(stdout, "\n  ]");
+            LOG("\n  ]");
         }
 
-        if (notArray) fprintf(stdout, "\n}\n");
+        if (notArray) LOG("\n}\n");
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     // clean up
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index bc720314341031..24ef2a6149344d 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,12 +1,11 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"
 
 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
 
 /**
@@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     GGML_ASSERT(n > 0);
     float sum = 0;
     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf("                                     [\n");
+        LOG("                                     [\n");
         for (int64_t i2 = 0; i2 < ne[2]; i2++) {
             if (i2 == n && ne[2] > 2*n) {
-                printf("                                      ..., \n");
+                LOG("                                      ..., \n");
                 i2 = ne[2] - n;
             }
-            printf("                                      [\n");
+            LOG("                                      [\n");
             for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                 if (i1 == n && ne[1] > 2*n) {
-                    printf("                                       ..., \n");
+                    LOG("                                       ..., \n");
                     i1 = ne[1] - n;
                 }
-                printf("                                       [");
+                LOG("                                       [");
                 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                     if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
+                        LOG("..., ");
                         i0 = ne[0] - n;
                     }
                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else {
                         GGML_ABORT("fatal error");
                     }
-                    printf("%12.4f", v);
+                    LOG("%12.4f", v);
                     sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1) LOG(", ");
                 }
-                printf("],\n");
+                LOG("],\n");
             }
-            printf("                                      ],\n");
+            LOG("                                      ],\n");
         }
-        printf("                                     ]\n");
-        printf("                                     sum = %f\n", sum);
+        LOG("                                     ]\n");
+        LOG("                                     sum = %f\n", sum);
     }
 }
 
@@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
-    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-           t->name, ggml_type_name(t->type), ggml_op_desc(t),
-           src0->name, ggml_ne_string(src0).c_str(),
-           src1 ? src1_str : "",
-           ggml_ne_string(t).c_str());
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+         t->name, ggml_type_name(t->type), ggml_op_desc(t),
+         src0->name, ggml_ne_string(src0).c_str(),
+         src1 ? src1_str : "",
+         ggml_ne_string(t).c_str());
 
 
     // copy the data from the GPU memory if needed
@@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
+        LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
 
@@ -141,6 +140,12 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     callback_data cb_data;
 
     gpt_params params;
@@ -166,14 +171,15 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     bool OK = run(ctx, params);
@@ -181,7 +187,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 032a9013656407..cdd34059e8c23a 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -19,12 +20,12 @@
 #endif
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s \\\n"
+    LOG("\nexample usage:\n");
+    LOG("\n    %s \\\n"
             "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
             "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
             "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-    LOG_TEE("\n");
+    LOG("\n");
 }
 
 struct Stats {
@@ -125,7 +126,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             e.counts.resize(src1->ne[0]*n_as, 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
             exit(1); //GGML_ABORT("fatal error");
         }
         if (m_params.verbosity > 1) {
@@ -151,7 +152,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                         e.values[e_start + j] += x[j]*x[j];
                         e.counts[e_start + j]++;
                         if (!std::isfinite(e.values[e_start + j])) {
-                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                            LOG("\n");
+                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
                             exit(1);
                         }
                     }
@@ -174,12 +176,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             e.counts.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            LOG("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
@@ -187,7 +189,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 e.values[j] += x[j]*x[j];
                 e.counts[j]++;
                 if (!std::isfinite(e.values[j])) {
-                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
                     exit(1);
                 }
             }
@@ -239,17 +241,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         }
 
         if (n_zeros != 0 && is_first) {
-            fprintf(stderr, "\n");
+            LOG_INF("\n");
             is_first = false;
         }
 
         if (n_zeros == n_all) {
-            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
             continue;
         }
 
         if (n_zeros > 0) {
-            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
             continue;
         }
 
@@ -258,7 +260,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     }
 
     if (to_store.size() < m_stats.size()) {
-        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
     }
 
     std::ofstream out(fname, std::ios::binary);
@@ -291,7 +293,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     }
 
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+        LOG_INF("\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
     }
 }
 
@@ -437,26 +439,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
     if (params.i_chunk > 0) {
         if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
             return false;
         }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
         tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
     }
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
         return false;
     }
 
@@ -478,7 +479,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     double nll = 0.0;
     double nll2 = 0.0;
 
-    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -514,7 +515,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
             // TODO: use batch.logits to save computations instead of relying on logits_all == true
             if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_ERR("%s : failed to eval\n", __func__);
                 return false;
             }
 
@@ -531,13 +532,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
 
         if (params.compute_ppl) {
@@ -572,6 +573,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     params.n_ctx = 512;
@@ -589,7 +596,7 @@ int main(int argc, char ** argv) {
     for (const auto & in_file : params.in_files) {
         printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
         if (!g_collector.load_imatrix(in_file.c_str())) {
-            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
+            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
             return 1;
         }
     }
@@ -614,20 +621,20 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     if (!compute_imatrix(ctx, params)) {
@@ -636,7 +643,7 @@ int main(int argc, char ** argv) {
 
     g_collector.save_imatrix();
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 7e252ce093d759..213618f09ca740 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "console.h"
 #include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cassert>
@@ -55,7 +56,7 @@ static void write_logfile(
 
     const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
                 __func__, params.logdir.c_str());
         return;
     }
@@ -64,7 +65,7 @@ static void write_logfile(
     FILE * logfile = fopen(logfile_path.c_str(), "w");
 
     if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
         return;
     }
 
@@ -93,7 +94,7 @@ static void sigint_handler(int signo) {
             is_interacting = true;
         } else {
             console::cleanup();
-            printf("\n");
+            LOG("\n");
             gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
@@ -103,6 +104,12 @@ static void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
     g_params = &params;
 
@@ -112,54 +119,49 @@ int main(int argc, char ** argv) {
 
     auto & sparams = params.sparams;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("infill", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
     if (params.logits_all) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
+
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-        printf("\n************\n");
-        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.rope_freq_base != 0.0) {
-        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
     }
 
     if (params.rope_freq_scale != 0.0) {
-        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
     print_build_info();
 
-    LOG("%s: llama backend init\n", __func__);
+    LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
 
@@ -172,34 +174,32 @@ int main(int argc, char ** argv) {
     g_smpl = &smpl;
 
     // load the model and apply lora adapter, if any
-    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     llama_init_result llama_init = llama_init_from_gpt_params(params);
 
     model = llama_init.model;
     ctx = llama_init.context;
 
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
+    LOG_DBG("n_ctx: %d\n", n_ctx);
 
     if (n_ctx > n_ctx_train) {
-        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        LOG_TEE("\n");
-        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
     const bool add_bos = llama_add_bos_token(model);
     GGML_ASSERT(!llama_add_eos_token(model));
-    LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
@@ -224,18 +224,19 @@ int main(int argc, char ** argv) {
         embd_inp.push_back(middle_token);
     }
 
-    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
-    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    LOG_DBG("add_bos: %d\n", add_bos);
+    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
         embd_inp.push_back(llama_token_bos(model));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
     }
 
     if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
 
@@ -244,9 +245,8 @@ int main(int argc, char ** argv) {
         params.n_keep = (int)embd_inp.size();
     }
 
-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
 
     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -254,21 +254,21 @@ int main(int argc, char ** argv) {
     }
 
     if (params.verbose_prompt) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_INF("\n");
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > 0) {
-        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+        LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-            LOG_TEE("'\n");
+            LOG("'\n");
         }
-        LOG_TEE("\n");
+        LOG_INF("\n");
     }
 
     if (params.interactive) {
@@ -285,28 +285,30 @@ int main(int argc, char ** argv) {
         SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-        LOG_TEE("%s: interactive mode on.\n", __func__);
+        LOG_INF("%s: interactive mode on.\n", __func__);
 
         if (params.input_prefix_bos) {
-            LOG_TEE("Input prefix with BOS\n");
+            LOG_INF("Input prefix with BOS\n");
         }
 
         if (!params.input_prefix.empty()) {
-            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
         }
 
         if (!params.input_suffix.empty()) {
-            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
     smpl = gpt_sampler_init(model, sparams);
 
-    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
-    LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    LOG_TEE("\n\n");
+    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
-    LOG_TEE("\n#####  Infill mode  #####\n\n");
+    LOG("\n");
+    LOG("\n#####  Infill mode  #####\n\n");
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
@@ -317,11 +319,11 @@ int main(int argc, char ** argv) {
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
-        LOG_TEE("== Running in interactive mode. ==\n");
+        LOG("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
+        LOG(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_TEE(       "%s\n", control_message);
+        LOG(       "%s\n", control_message);
 
         is_interacting = params.interactive_first;
     }
@@ -354,9 +356,8 @@ int main(int argc, char ** argv) {
                 embd.resize(max_embd_size);
 
                 console::set_display(console::error);
-                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                 console::set_display(console::reset);
-                fflush(stdout);
             }
 
             // infinite text generation via context swapping
@@ -365,14 +366,14 @@ int main(int argc, char ** argv) {
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
             if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
-                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
                 }
 
                 const int n_left    = n_past - params.n_keep - 1;
                 const int n_discard = n_left/2;
 
-                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                     n_past, n_left, n_ctx, params.n_keep, n_discard);
 
                 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
@@ -380,9 +381,9 @@ int main(int argc, char ** argv) {
 
                 n_past -= n_discard;
 
-                LOG("after swap: n_past = %d\n", n_past);
+                LOG_DBG("after swap: n_past = %d\n", n_past);
 
-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
             }
 
@@ -394,16 +395,16 @@ int main(int argc, char ** argv) {
                     n_eval = params.n_batch;
                 }
 
-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
                 if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                    LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }
 
                 n_past += n_eval;
 
-                LOG("n_past = %d\n", n_past);
+                LOG_DBG("n_past = %d\n", n_past);
             }
 
         }
@@ -415,7 +416,7 @@ int main(int argc, char ** argv) {
 
             gpt_sampler_accept(smpl, id, true);
 
-            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
             embd.push_back(id);
 
@@ -425,10 +426,10 @@ int main(int argc, char ** argv) {
             // decrement remaining sampling budget
             --n_remain;
 
-            LOG("n_remain: %d\n", n_remain);
+            LOG_DBG("n_remain: %d\n", n_remain);
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);
 
@@ -447,7 +448,7 @@ int main(int argc, char ** argv) {
         if (input_echo) {
             for (auto id : embd) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
-                printf("%s", token_str.c_str());
+                LOG("%s", token_str.c_str());
 
                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
@@ -456,7 +457,6 @@ int main(int argc, char ** argv) {
                     output_ss << token_str;
                 }
             }
-            fflush(stdout);
         }
         // reset color to default if we there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -469,10 +469,9 @@ int main(int argc, char ** argv) {
             if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
-                fflush(stdout);
-                printf("\n");
+                LOG("\n");
                 console::set_display(console::user_input);
                 std::string buffer;
                 std::string line;
@@ -528,35 +527,33 @@ int main(int argc, char ** argv) {
                 n_remain = params.n_predict;
                 n_past = 0;
                 n_consumed = 0;
-                // LOG_TEE("took new input\n");
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
             else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
-                LOG("found EOS token\n");
+                LOG_DBG("found EOS token\n");
 
                 if (params.interactive) {
 
                     is_interacting = true;
-                    printf("\n");
+                    LOG("\n");
                     console::set_display(console::user_input);
-                    fflush(stdout);
                }
             }
 
             if (n_past > 0 && is_interacting && !params.interactive) {
-                LOG("waiting for user input\n");
+                LOG_DBG("waiting for user input\n");
 
                 if (params.input_prefix_bos) {
-                    LOG("adding input prefix BOS token\n");
+                    LOG_DBG("adding input prefix BOS token\n");
                     embd_inp.push_back(llama_token_bos(model));
                 }
 
                 std::string buffer;
                 if (!params.input_prefix.empty()) {
-                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                     buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
+                    LOG("%s", buffer.c_str());
                 }
 
                 std::string line;
@@ -574,17 +571,17 @@ int main(int argc, char ** argv) {
                 if (buffer.length() > 1) {
                     // append input suffix if any
                     if (!params.input_suffix.empty()) {
-                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                         buffer += params.input_suffix;
-                        printf("%s", params.input_suffix.c_str());
+                        LOG("%s", params.input_suffix.c_str());
                     }
 
-                    LOG("buffer: '%s'\n", buffer.c_str());
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
 
                     const size_t original_size = embd_inp.size();
 
                     const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
@@ -595,9 +592,9 @@ int main(int argc, char ** argv) {
                     }
 
                     n_remain -= line_inp.size();
-                    LOG("n_remain: %d\n", n_remain);
+                    LOG_DBG("n_remain: %d\n", n_remain);
                 } else {
-                    LOG("empty line, passing control back\n");
+                    LOG_DBG("empty line, passing control back\n");
                 }
 
                 input_echo = false; // do not echo this again
@@ -624,11 +621,10 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
-        fflush(stdout);
+        LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
@@ -638,9 +634,5 @@ int main(int argc, char ** argv) {
     gpt_sampler_free(smpl);
     llama_backend_free();
 
-#ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
     return 0;
 }
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 9b890571eee9c8..d8dcfc7d9c7c20 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,7 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
-#include "log.h"
+#include "log.h" // TODO: clip should not depend on common/log
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -165,7 +165,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-        LOG_TEE("key %s not found in file\n", key);
+        LOG_ERR("key %s not found in file\n", key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }
 
@@ -270,7 +270,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 
 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
     size_t tensor_size = ggml_nbytes(tensor);
-    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
             prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
             tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -288,7 +288,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
 
@@ -307,7 +307,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
 
@@ -568,7 +568,7 @@ struct clip_ctx {
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
     }
 
@@ -582,7 +582,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         if (load_image_size == nullptr) {
             load_image_size = clip_image_size_init();
         }
-        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
         image_size_width  = load_image_size->width;
         image_size_height = load_image_size->height;
         if (is_inf) {
@@ -1047,21 +1047,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         const int idx_name = gguf_find_key(ctx, KEY_NAME);
         if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
             const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
         }
-        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
-        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
-        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
-        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        LOG_TEE("\n");
+        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_INF("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_INF("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_INF("\n");
     }
     const int n_tensors = gguf_get_n_tensors(ctx);
 
     // kv
     const int n_kv = gguf_get_n_kv(ctx);
-    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
         __func__, n_kv, n_tensors, fname);
     {
         std::map<enum ggml_type, uint32_t> n_type;
@@ -1072,7 +1072,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             n_type[type]++;
         }
 
-        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(ctx, i);
             const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -1088,7 +1088,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             replace_all(value, "\n", "\\n");
 
-            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -1097,7 +1097,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 continue;
             }
 
-            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
     }
 
@@ -1112,7 +1112,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             size_t tensor_size = ggml_nbytes(cur);
             model_size += tensor_size;
             if (verbosity >= 3) {
-                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                        __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
             }
         }
@@ -1139,27 +1139,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
 #ifdef GGML_USE_CUDA
     new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_METAL
     new_clip->backend = ggml_backend_metal_init();
-    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_CANN
     new_clip->backend = ggml_backend_cann_init(0);
-    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_VULKAN
     new_clip->backend = ggml_backend_vk_init(0);
-    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
-        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+        LOG_INF("%s: CLIP using CPU backend\n", __func__);
     }
 
     // model size and capabilities
@@ -1194,16 +1194,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
 
         if (verbosity >= 1) {
-            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            LOG_TEE("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
-            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
         }
     }
 
-    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
 
     // load tensors
     {
@@ -1216,7 +1216,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
         new_clip->ctx_data = ggml_init(params);
         if (!new_clip->ctx_data) {
-            LOG_TEE("%s: ggml_init() failed\n", __func__);
+            LOG_ERR("%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
             gguf_free(ctx);
             return nullptr;
@@ -1224,7 +1224,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
         auto fin = std::ifstream(fname, std::ios::binary);
         if (!fin) {
-            LOG_TEE("cannot open model file for loading tensors\n");
+            LOG_ERR("cannot open model file for loading tensors\n");
             clip_free(new_clip);
             gguf_free(ctx);
             return nullptr;
@@ -1246,7 +1246,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
             fin.seekg(offset, std::ios::beg);
             if (!fin) {
-                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
                 clip_free(new_clip);
                 gguf_free(ctx);
                 return nullptr;
@@ -1317,23 +1317,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
 
         if (verbosity >= 2) {
-            LOG_TEE("\n%s: vision model hparams\n", __func__);
-            LOG_TEE("image_size         %d\n", hparams.image_size);
-            LOG_TEE("patch_size         %d\n", hparams.patch_size);
-            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
-            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
-            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
-            LOG_TEE("v_n_head           %d\n", hparams.n_head);
-            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
-            LOG_TEE("v_eps              %f\n", hparams.eps);
-            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            LOG_TEE("v_image_grid_pinpoints: ");
+            LOG_INF("\n%s: vision model hparams\n", __func__);
+            LOG_INF("image_size         %d\n", hparams.image_size);
+            LOG_INF("patch_size         %d\n", hparams.patch_size);
+            LOG_INF("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_INF("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_INF("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_INF("v_n_head           %d\n", hparams.n_head);
+            LOG_INF("v_n_layer          %d\n", hparams.n_layer);
+            LOG_INF("v_eps              %f\n", hparams.eps);
+            LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_INF("v_image_grid_pinpoints: ");
             for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
             }
-            LOG_TEE("\n");
-            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_INF("\n");
+            LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
 
         }
 
@@ -1371,7 +1371,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
         } catch(const std::exception& /*e*/) {
-            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
         }
 
         // LLaVA projection
@@ -1400,7 +1400,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             } catch (std::runtime_error & /*e*/) { }
             try {
                 vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
             } catch (std::runtime_error & /*e*/) { }
         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
@@ -1501,7 +1501,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
     }
 
     return new_clip;
@@ -1552,7 +1552,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
     auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
     if (!data) {
-        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
+        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1564,7 +1564,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     int nx, ny, nc;
     auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
-        LOG_TEE("%s: failed to decode image bytes\n", __func__);
+        LOG_ERR("%s: failed to decode image bytes\n", __func__);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1754,7 +1754,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -1872,7 +1872,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
     const int multiple = fmin(ceil(ratio), max_slice_nums);
 
     std::vector<std::vector<clip_image_u8 *>> images;
-    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    LOG_INF("%s: multiple %d\n", __func__, multiple);
     images.push_back(std::vector<clip_image_u8 *>());
 
     if (multiple <= 1) {
@@ -1887,17 +1887,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
         clip_image_u8 * source_image = clip_image_u8_init();
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
         images[images.size()-1].push_back(source_image);
 
         std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
 
         auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
         clip_image_u8 * refine_image = clip_image_u8_init();
         bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
 
-        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
 
         // split_to_patches
         int width = refine_image->nx;
@@ -1954,7 +1954,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         int idx = 0;
         for (size_t i = 0; i < imgs.size(); ++i) {
             for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
                 clip_image_f32 * res = clip_image_f32_init();
                 normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
                 res_imgs->data[idx++] = *res;
@@ -1966,7 +1966,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
 
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
     auto & params = ctx->vision_model.hparams;
@@ -2043,7 +2043,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             }
 
             for (size_t i = 0; i < patches.size(); i++) {
-                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                 clip_image_u8_free(patches[i]);
             }
 
@@ -2279,7 +2279,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
 
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -2291,7 +2291,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
 
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -2521,7 +2521,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             new_type = type;
             if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                 new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
             }
             const size_t n_elms = ggml_nelements(cur);
             float * f32_data;
@@ -2540,7 +2540,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 f32_data = (float *)conv_buf.data();
                 break;
             default:
-                LOG_TEE("Please use an input file in f32 or f16\n");
+                LOG_ERR("Please use an input file in f32 or f16\n");
                 gguf_free(ctx_out);
                 return false;
             }
@@ -2567,7 +2567,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             fout.put(0);
         }
 
-        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
     }
 
@@ -2583,8 +2583,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     gguf_free(ctx_out);
 
     {
-        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_INF("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
     }
 
     return true;
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index e9108a9bdbd4b9..73ddbad319022c 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -20,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
             n_eval = n_batch;
         }
         if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
@@ -75,7 +75,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
         return NULL;
     }
 
@@ -89,7 +89,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
 
     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
     if (!embed) {
-        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
         return NULL;
     }
 
@@ -114,9 +114,9 @@ struct llava_context {
 };
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\n example usage:\n");
-    LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG("\n example usage:\n");
+    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -126,11 +126,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-            LOG_TEE("using base64 encoded image instead of command line image path\n");
+            LOG_INF("using base64 encoded image instead of command line image path\n");
         }
         embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
-            LOG_TEE("%s: can't load image from prompt\n", __func__);
+            LOG_ERR("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
@@ -156,18 +156,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
-        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -177,7 +177,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
@@ -188,11 +188,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     // generate the response
 
-    LOG_TEE("\n");
+    LOG("\n");
 
     struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
     if (!smpl) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
     }
 
@@ -202,7 +202,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         response += tmp;
         if (strcmp(tmp, "</s>") == 0) break;
         if (strstr(tmp, "###")) break; // Yi-VL behavior
-        printf("%s", tmp);
+        LOG("%s", tmp);
         if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
         if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
         if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -211,7 +211,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     }
 
     gpt_sampler_free(smpl);
-    printf("\n");
+    LOG("\n");
 }
 
 static struct llama_model * llava_init(gpt_params * params) {
@@ -222,7 +222,7 @@ static struct llama_model * llava_init(gpt_params * params) {
 
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
     }
     return model;
@@ -245,11 +245,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
-        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return NULL;
     }
 
-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
@@ -268,13 +268,13 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     ggml_time_init();
 
     gpt_params params;
@@ -283,27 +283,21 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
-
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
-    auto model = llava_init(&params);
+
+    auto * model = llava_init(&params);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }
 
     if (prompt_contains_image(params.prompt)) {
-        auto ctx_llava = llava_init_context(&params, model);
+        auto * ctx_llava = llava_init_context(&params, model);
 
-        auto image_embed = load_image(ctx_llava, &params, "");
+        auto * image_embed = load_image(ctx_llava, &params, "");
 
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -314,11 +308,11 @@ int main(int argc, char ** argv) {
         llava_free(ctx_llava);
     } else {
         for (auto & image : params.image) {
-            auto ctx_llava = llava_init_context(&params, model);
+            auto * ctx_llava = llava_init_context(&params, model);
 
-            auto image_embed = load_image(ctx_llava, &params, image);
+            auto * image_embed = load_image(ctx_llava, &params, image);
             if (!image_embed) {
-                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
                 return 1;
             }
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 851af0f004a691..2a61b2b0a66c8c 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,8 +1,11 @@
 #include "clip.h"
-#include "common.h"
-#include "llama.h"
 #include "llava.h"
+
 #include "base64.hpp"
+#include "common.h"
+#include "log.h"
+
+#include "llama.h"
 
 #include <cstdio>
 #include <cstdlib>
@@ -54,7 +57,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -236,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     img_res_v.size = 0;
     img_res_v.data = nullptr;
     if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        LOG_TEE("%s: unable to preprocess image\n", __func__);
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
     }
@@ -265,14 +268,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
             }
             if (!encoded) {
-                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
             }
             const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
         int n_img_pos_out = 0;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -287,7 +290,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         load_image_size->width = img->nx;
         load_image_size->height = img->ny;
         clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
@@ -295,7 +298,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
         delete[] img_res_v.data;
         if (!encoded) {
-            LOG_TEE("Unable to encode image\n");
+            LOG_ERR("Unable to encode image\n");
 
             return false;
         }
@@ -309,12 +312,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
             if (!encoded) {
-                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
             }
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
         const int32_t * image_grid = clip_image_grid(ctx_clip);
 
@@ -347,12 +350,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
     }
 
-    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
 
     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
 
-    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
 
     return true;
 }
@@ -362,7 +365,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
-        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
         return false;
     }
     return true;
@@ -375,13 +378,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     }
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     if (!image_embd) {
-        LOG_TEE("Unable to allocate memory for image embeddings\n");
+        LOG_ERR("Unable to allocate memory for image embeddings\n");
         return false;
     }
 
     int n_img_pos;
     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
@@ -401,7 +404,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         }
         llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+            LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
         *n_past += n_eval;
@@ -413,7 +416,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
-        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
     }
 
@@ -422,7 +425,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        LOG_TEE("%s: coulnd't embed the image\n", __func__);
+        LOG_ERR("%s: coulnd't embed the image\n", __func__);
         return NULL;
     }
 
@@ -436,7 +439,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
     auto file = fopen(path, "rb");
     if (file == NULL) {
-        LOG_TEE("%s: can't read file %s\n", __func__, path);
+        LOG_ERR("%s: can't read file %s\n", __func__, path);
         return false;
     }
 
@@ -446,7 +449,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
 
     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
     if (buffer == NULL) {
-        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
         perror("Memory allocation error");
         fclose(file);
         return false;
@@ -471,7 +474,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
     long image_bytes_length;
     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
     if (!loaded) {
-        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
         return NULL;
     }
 
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 3475bbce58562c..7ca1ef9b98c6e4 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
+#include <iostream> // TODO: remove me
 
 struct llava_context {
     struct clip_ctx * ctx_clip = NULL;
@@ -18,14 +19,8 @@ struct llava_context {
 };
 
 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
+    LOG("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
 static struct llama_model * llava_init(gpt_params * params) {
@@ -36,7 +31,7 @@ static struct llama_model * llava_init(gpt_params * params) {
 
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
     }
     return model;
@@ -51,7 +46,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     if (params->n_ctx < 2048) {
         // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
-        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
         ctx_params.n_ctx = 2048;
     } else {
         ctx_params.n_ctx = params->n_ctx;
@@ -60,11 +55,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
-        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return NULL;
     }
 
-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->model = model;
@@ -89,7 +84,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
     return ctx_clip;
 }
 
@@ -101,7 +96,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
             n_eval = n_batch;
         }
         if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
@@ -125,7 +120,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
     std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
 
-    auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
     slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
     llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
@@ -143,7 +138,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     else if (has_minicpmv_projector == 3) {
         system_prompt = "<|im_start|>user\n";
     }
-    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
     eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
     eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@@ -162,7 +157,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
         }
         eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
     }
-    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }
 
 static const char * sample(struct gpt_sampler * smpl,
@@ -181,42 +176,42 @@ static const char * sample(struct gpt_sampler * smpl,
 }
 
 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
-    auto ctx_clip = clip_init_context(params);
-    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
     if (!embeds) {
-        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
         return NULL;
     }
 
     // process the prompt
     if (params->prompt.empty() && params->interactive == false) {
-        LOG_TEE("prompt should be given or interactive mode should be on");
+        LOG_ERR("prompt should be given or interactive mode should be on");
         return NULL;
     }
 
-    auto model = llava_init(params);
+    auto * model = llava_init(params);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
         return NULL;
     }
     const int64_t t_llava_init_start_us = ggml_time_us();
-    auto ctx_llava = llava_init_context(params, model);
+    auto * ctx_llava = llava_init_context(params, model);
     ctx_llava->ctx_clip = ctx_clip;
     const int64_t t_llava_init_end_us = ggml_time_us();
     float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
-    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
 
     const int64_t t_process_image_start_us = ggml_time_us();
     process_image(ctx_llava, embeds, params, n_past);
     const int64_t t_process_image_end_us = ggml_time_us();
     float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
-    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
 
     llava_image_embed_free(embeds);
     return ctx_llava;
 }
 
-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
     std::string user_prompt = prompt;
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (!is_first) {
@@ -238,7 +233,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
 
     // generate the response
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
 
     struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
     return smpl;
@@ -251,6 +246,12 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     ggml_time_init();
 
     gpt_params params;
@@ -259,13 +260,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
-
     if (params.mmproj.empty() || (params.image.empty())) {
         show_additional_info(argc, argv);
         return 1;
@@ -273,21 +267,23 @@ int main(int argc, char ** argv) {
 
     for (auto & image : params.image) {
         int n_past = 0;
-        auto ctx_llava = minicpmv_init(&params, image, n_past);
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
 
         if (!params.prompt.empty()) {
-            LOG_TEE("<user>%s\n", params.prompt.c_str());
-            LOG_TEE("<assistant>");
-            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+            LOG("<user>%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
             const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-            std::string response = "";
+            std::string response;
             bool have_tmp = false;
             for (int i = 0; i < max_tgt_len; i++) {
-                auto tmp = llama_loop(ctx_llava, smpl, n_past);
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
                 response += tmp;
                 if (strcmp(tmp, "</s>") == 0){
-                    if(!have_tmp)continue;
-                    else break;
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
                 }
                 if (strstr(tmp, "###")) break; // Yi-VL behavior
                 have_tmp = true;
@@ -299,15 +295,15 @@ int main(int argc, char ** argv) {
             gpt_sampler_free(smpl);
         }else {
             while (true) {
-                LOG_TEE("<user>");
+                LOG("<user>");
                 std::string prompt;
                 std::getline(std::cin, prompt);
-                LOG_TEE("<assistant>");
-                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
                 const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-                std::string response = "";
+                std::string response;
                 for (int i = 0; i < max_tgt_len; i++) {
-                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
                     response += tmp;
                     if (strcmp(tmp, "</s>") == 0) break;
                     if (strstr(tmp, "###")) break; // Yi-VL behavior
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index de8b792f237144..6e936a7d008b48 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,6 +1,7 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cstdio>
@@ -36,6 +37,12 @@ struct ngram_container {
 };
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
@@ -48,12 +55,6 @@ int main(int argc, char ** argv) {
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookahead", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -75,14 +76,14 @@ int main(int argc, char ** argv) {
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -166,7 +167,7 @@ int main(int argc, char ** argv) {
         {
             const std::string token_str = llama_token_to_piece(ctx, id);
 
-            printf("%s", token_str.c_str());
+            LOG("%s", token_str.c_str());
             fflush(stdout);
         }
     }
@@ -256,7 +257,7 @@ int main(int argc, char ** argv) {
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
             return 1;
         }
 
@@ -293,10 +294,10 @@ int main(int argc, char ** argv) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
 
                 if (v == 0) {
-                    printf("%s", token_str.c_str());
+                    LOG("%s", token_str.c_str());
                 } else {
                     // print light cyan
-                    printf("\033[0;96m%s\033[0m", token_str.c_str());
+                    LOG("\033[0;96m%s\033[0m", token_str.c_str());
                 }
                 fflush(stdout);
 
@@ -330,21 +331,21 @@ int main(int argc, char ** argv) {
             // print known n-grams starting with token id (debug)
             if (0 && v == 0) {
                 if (ngrams_observed.cnt[id] > 0) {
-                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
                 }
 
                 for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-                    printf("   - ngram %2d: ", i);
+                    LOG("   - ngram %2d: ", i);
 
                     const int idx = id*(N - 1)*G + i*(N - 1);
 
                     for (int j = 0; j < N - 1; j++) {
                         const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
-                        printf("%s", token_str.c_str());
+                        LOG("%s", token_str.c_str());
                     }
 
-                    printf("\n");
+                    LOG("\n");
                 }
             }
 
@@ -455,20 +456,20 @@ int main(int argc, char ** argv) {
 
     auto t_dec_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    LOG("\n\n");
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
 
-    LOG_TEE("\n");
-    LOG_TEE("W = %2d\n", W);
-    LOG_TEE("N = %2d\n", N);
-    LOG_TEE("G = %2d\n", G);
-    LOG_TEE("\n");
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_accept  = %d\n", n_accept);
+    LOG_INF("\n");
+    LOG_INF("W = %2d\n", W);
+    LOG_INF("N = %2d\n", N);
+    LOG_INF("G = %2d\n", G);
+    LOG_INF("\n");
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_accept  = %d\n", n_accept);
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
     gpt_perf_print(ctx, smpl);
 
     gpt_sampler_free(smpl);
@@ -482,7 +483,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index f299d68a93ed95..1c486903e78901 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -5,15 +5,20 @@
 #include "llama.h"
 #include "ggml.h"
 
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>
 
 int main(int argc, char ** argv){
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
@@ -49,7 +54,7 @@ int main(int argc, char ** argv){
             try {
                 ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
-                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }
@@ -128,7 +133,7 @@ int main(int argc, char ** argv){
             const int64_t eta_min  = eta_ms / (60*1000);
             const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;
 
-            LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+            LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
         }
 
         // After each chunk, update the dynamic ngram cache with the context ngram cache:
@@ -136,24 +141,24 @@ int main(int argc, char ** argv){
         ngram_cache_context.clear();
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
 
-    LOG_TEE("\n");
-    LOG_TEE("n_draft      = %d\n", n_draft);
-    LOG_TEE("n_predict    = %d\n", n_input - n_input % n_ctx);
-    LOG_TEE("n_drafted    = %d\n", n_drafted);
-    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_INF("\n");
+    LOG_INF("n_draft      = %d\n", n_draft);
+    LOG_INF("n_predict    = %d\n", n_input - n_input % n_ctx);
+    LOG_INF("n_drafted    = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
             t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept     = %d\n", n_accept);
-    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_INF("n_accept     = %d\n", n_accept);
+    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     llama_free(ctx);
     llama_free_model(model);
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index fff44a499e4bcf..11ddb37ccf919a 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "ngram-cache.h"
 #include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cstdint>
@@ -12,6 +13,12 @@
 #include <vector>
 
 int main(int argc, char ** argv){
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
@@ -23,12 +30,6 @@ int main(int argc, char ** argv){
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookup", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -58,7 +59,7 @@ int main(int argc, char ** argv){
             try {
                 ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
-                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }
@@ -76,14 +77,14 @@ int main(int argc, char ** argv){
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -124,7 +125,7 @@ int main(int argc, char ** argv){
         }
 
         // print current draft sequence
-        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 
         int i_dft = 0;
         while (true) {
@@ -136,7 +137,7 @@ int main(int argc, char ** argv){
             const std::string token_str = llama_token_to_piece(ctx, id);
 
             if (!params.use_color) {
-                printf("%s", token_str.c_str());
+                LOG("%s", token_str.c_str());
             }
 
             if (llama_token_is_eog(model, id)) {
@@ -147,7 +148,7 @@ int main(int argc, char ** argv){
 
             // check if the target token matches the draft
             if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                 ++n_accept;
                 ++n_past;
                 ++i_dft;
@@ -161,19 +162,19 @@ int main(int argc, char ** argv){
 
                 if (params.use_color) {
                     // color accepted draft token
-                    printf("\033[34m%s\033[0m", token_str.c_str());
+                    LOG("\033[34m%s\033[0m", token_str.c_str());
                     fflush(stdout);
                 }
                 continue;
             }
 
             if (params.use_color) {
-                printf("%s", token_str.c_str());
+                LOG("%s", token_str.c_str());
             }
             fflush(stdout);
 
 
-            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
 
             draft.clear();
             draft.push_back(id);
@@ -224,22 +225,22 @@ int main(int argc, char ** argv){
     llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
     llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
-    LOG_TEE("\n\n");
+    LOG("\n\n");
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
 
-    LOG_TEE("\n");
-    LOG_TEE("n_draft      = %d\n", n_draft);
-    LOG_TEE("n_predict    = %d\n", n_predict);
-    LOG_TEE("n_drafted    = %d\n", n_drafted);
-    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_INF("\n");
+    LOG_INF("n_draft      = %d\n", n_draft);
+    LOG_INF("n_predict    = %d\n", n_predict);
+    LOG_INF("n_drafted    = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
             t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept     = %d\n", n_accept);
-    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_INF("n_accept     = %d\n", n_accept);
+    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ntarget:\n\n");
+    LOG_INF("\ntarget:\n\n");
     llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
     llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
 
@@ -252,7 +253,7 @@ int main(int argc, char ** argv){
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 4f3e3640016d23..33a81acb09253f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -126,7 +126,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
     llama_chat_msg new_msg{role, content};
     auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
-    LOG("formatted: %s\n", formatted.c_str());
+    LOG_DBG("formatted: %s\n", formatted.c_str());
     return formatted;
 }
 
@@ -304,14 +304,14 @@ int main(int argc, char ** argv) {
         }
 
         LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
-        LOG_DBG("tokens: %s\n", string_from_tokens(ctx, embd_inp).c_str());
+        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
     }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
         if (add_bos) {
             embd_inp.push_back(llama_token_bos(model));
-            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from_tokens(ctx, embd_inp).c_str());
+            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
         } else {
             LOG_ERR("input is empty\n");
             return -1;
@@ -584,7 +584,7 @@ int main(int argc, char ** argv) {
 
                     LOG_DBG("after swap: n_past = %d\n", n_past);
 
-                    LOG_DBG("embd: %s\n", string_from_tokens(ctx, embd).c_str());
+                    LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
                     LOG_DBG("clear session path\n");
                     path_session.clear();
@@ -641,7 +641,7 @@ int main(int argc, char ** argv) {
                     n_eval = params.n_batch;
                 }
 
-                LOG_DBG("eval: %s\n", string_from_tokens(ctx, embd).c_str());
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
                 if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                     LOG_ERR("%s : failed to eval\n", __func__);
@@ -678,7 +678,7 @@ int main(int argc, char ** argv) {
 
             gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
 
-            // LOG_DBG("last: %s\n", string_from_tokens(ctx, smpl->prev.to_vector()).c_str());
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
             embd.push_back(id);
 
@@ -861,7 +861,7 @@ int main(int argc, char ** argv) {
                     const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
                     const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
-                    LOG_DBG("input tokens: %s\n", string_from_tokens(ctx, line_inp).c_str());
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
                     // if user stop generation mid-way, we must add EOT to finish model's last response
                     if (need_insert_eot && format_chat) {
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index bc6301311d9419..519e97a560313c 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -4,6 +4,7 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -83,7 +84,9 @@ static void print_date_time() {
     char buffer[80];
     strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
 
-    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+    LOG_INF("\n");
+    LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
+    LOG_INF("\n");
 }
 
 // Define a split string function to ...
@@ -100,6 +103,12 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 int main(int argc, char ** argv) {
     srand(1234);
 
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
@@ -120,12 +129,6 @@ int main(int argc, char ** argv) {
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("parallel", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -138,23 +141,22 @@ int main(int argc, char ** argv) {
 
     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
-        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+        LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
     } else {
         // Output each line of the input params.prompts vector and copy to k_prompts
         int index = 0;
-        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+        LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
 
         std::vector<std::string> prompts = split_string(params.prompt, '\n');
         for (const auto& prompt : prompts) {
             k_prompts.resize(index + 1);
             k_prompts[index] = prompt;
             index++;
-            printf("%3d prompt: %s\n", index, prompt.c_str());
+            LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
         }
     }
 
-    fprintf(stderr, "\n\n");
-    fflush(stderr);
+    LOG_INF("\n\n");
 
     const int n_ctx = llama_n_ctx(ctx);
 
@@ -183,19 +185,19 @@ int main(int argc, char ** argv) {
 
     const auto t_main_start = ggml_time_us();
 
-    LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
-    LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
-    LOG_TEE("\n");
+    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
+    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_INF("\n");
 
     {
-        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
+        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
         for (int32_t i = 0; i < n_tokens_system; ++i) {
             llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
@@ -204,10 +206,10 @@ int main(int argc, char ** argv) {
             llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
         }
 
-        LOG_TEE("\n");
+        LOG_INF("\n");
     }
 
-    LOG_TEE("Processing requests ...\n\n");
+    LOG_INF("Processing requests ...\n\n");
 
     while (true) {
         if (dump_kv_cache) {
@@ -238,7 +240,7 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
             }
 
-            LOG_TEE("%s: clearing the KV cache\n", __func__);
+            LOG_INF("%s: clearing the KV cache\n", __func__);
         }
 
         // insert new sequences for decoding
@@ -273,7 +275,7 @@ int main(int argc, char ** argv) {
                     client.n_decoded = 0;
                     client.i_batch   = batch.n_tokens - 1;
 
-                    LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
 
                     g_seq_id += 1;
 
@@ -317,11 +319,11 @@ int main(int argc, char ** argv) {
             if (ret != 0) {
                 if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                     return 1;
                 }
 
-                LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
@@ -332,7 +334,7 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-            LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
 
             for (auto & client : clients) {
                 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@@ -377,7 +379,7 @@ int main(int argc, char ** argv) {
 
                     const auto t_main_end = ggml_time_us();
 
-                    LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
+                    LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
                             client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                             (t_main_end - client.t_start_prompt) / 1e6,
                             (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@@ -400,19 +402,19 @@ int main(int argc, char ** argv) {
 
     print_date_time();
 
-    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
     if (params.prompt_file.empty()) {
         params.prompt_file = "used built-in defaults";
     }
-    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
 
-    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
-    LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
-    LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
-    LOG_TEE("Cache misses:        %6d\n", n_cache_miss);
+    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
+    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
+    LOG_INF("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+    LOG_INF("Cache misses:        %6d\n", n_cache_miss);
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
 
     // TODO: print sampling/grammar timings for all clients
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
@@ -424,7 +426,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index d3d5ab46fa0dbf..5df8ad7a4fd390 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -8,12 +9,18 @@
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
-    LOG_TEE("\n");
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     params.n_junk = 250;
@@ -63,7 +70,7 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return 1;
     }
 
@@ -77,7 +84,7 @@ int main(int argc, char ** argv) {
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
     if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
@@ -107,14 +114,14 @@ int main(int argc, char ** argv) {
     const int n_batch     = ctx_params.n_batch;
     const int n_batch_grp = ctx_params.n_batch/n_grp;
 
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+    LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
 
     // print the prompt token-by-token
 
-    LOG_TEE("\n");
-    LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
-    LOG_TEE("prompt tokens: %d\n", n_tokens_all);
-    //LOG_TEE("prompt: %s\n", params.prompt.c_str());
+    LOG_INF("\n");
+    LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
+    LOG_INF("prompt tokens: %d\n", n_tokens_all);
+    //LOG_INF("prompt: %s\n", params.prompt.c_str());
 
     llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
@@ -145,11 +152,11 @@ int main(int argc, char ** argv) {
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_INF("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
-        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
 
         if (i + n_batch >= n_tokens_all) {
             break;
@@ -159,7 +166,7 @@ int main(int argc, char ** argv) {
     for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
         const int n_discard = n_batch;
 
-        LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
+        LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
 
         llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
         llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -179,18 +186,18 @@ int main(int argc, char ** argv) {
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
-        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
     }
 
     {
         const int n_discard = n_past - n_ctx + n_predict;
 
         if (n_discard > 0) {
-            LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+            LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
 
             llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
             llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -201,17 +208,16 @@ int main(int argc, char ** argv) {
         }
     }
 
-    LOG_TEE("\n");
-    LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
-    LOG_TEE("\n");
+    LOG_INF("\n");
+    LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+    LOG_INF("\n");
 
     // main loop
 
     int n_cur    = n_tokens_all;
     int n_decode = 0;
 
-    LOG_TEE("%s", prompt_suffix.c_str());
-    fflush(stdout);
+    LOG_INF("%s", prompt_suffix.c_str());
 
     const auto t_main_start = ggml_time_us();
 
@@ -222,13 +228,12 @@ int main(int argc, char ** argv) {
 
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-                LOG_TEE("\n");
+                LOG("\n");
 
                 break;
             }
 
-            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-            fflush(stdout);
+            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
 
             n_decode += 1;
 
@@ -243,22 +248,22 @@ int main(int argc, char ** argv) {
 
         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
-    fprintf(stderr, "\n");
+    LOG("\n");
 
     llama_sampler_free(smpl);
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 04df65b0a58922..199c45fcead359 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <array>
@@ -41,7 +42,7 @@ static void write_logfile(
     }
 
     if (params.hellaswag) {
-        fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
         return;
     }
 
@@ -49,7 +50,7 @@ static void write_logfile(
 
     const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
                 __func__, params.logdir.c_str());
         return;
     }
@@ -58,7 +59,7 @@ static void write_logfile(
     FILE * logfile = fopen(logfile_path.c_str(), "w");
 
     if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
         return;
     }
 
@@ -344,16 +345,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                 n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
         return {std::move(tokens), 0., {}, {}};
     }
 
@@ -364,16 +365,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     prob_history.resize(tokens.size());
 
     if (params.ppl_stride <= 0) {
-        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
         return {tokens, -1, logit_history, prob_history};
     }
 
     const int calc_chunk = n_ctx;
 
-    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+    LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
 
     if (int(tokens.size()) <= calc_chunk) {
-        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+        LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
                 tokens.size(), n_ctx, params.ppl_stride);
         return {tokens, -1, logit_history, prob_history};
     }
@@ -387,14 +388,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     int count = 0;
     double nll = 0.0;
 
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     for (int i = 0; i < n_chunk; ++i) {
         const int start =     i * params.ppl_stride;
         const int end   = start + calc_chunk;
 
         const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+        //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
 
         std::vector<float> logits;
 
@@ -407,10 +408,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
 
-            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
             // TODO: use llama_batch.logits instead of relying on logits_all == true
             if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                //fprintf(stderr, "%s : failed to eval\n", __func__);
+                //LOG_ERR("%s : failed to eval\n", __func__);
                 return {tokens, -1, logit_history, prob_history};
             }
 
@@ -434,16 +435,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
+        LOG("\n");
 
-        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
         for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
 
             // Calculate probability of next token, given the previous ones.
@@ -460,13 +462,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         }
         // perplexity is e^(average negative log-likelihood)
         if (params.ppl_output_type == 0) {
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
         } else {
-            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+            LOG("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
         }
-        fflush(stdout);
     }
-    printf("\n");
+    LOG("\n");
 
     return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
@@ -488,26 +489,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     if (!params.logits_file.empty()) {
         logits_stream.open(params.logits_file.c_str(), std::ios::binary);
         if (!logits_stream.is_open()) {
-            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
             return {};
         }
-        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
         logits_stream.write("_logits_", 8);
         logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
     }
 
     auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                 n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
         return {std::move(tokens), 0., {}, {}};
     }
 
@@ -540,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         logits.reserve((size_t)n_ctx * n_vocab);
     }
 
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -613,7 +614,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             }
 
             if (llama_decode(ctx, batch)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_INF("%s : failed to eval\n", __func__);
                 return {tokens, -1, logit_history, prob_history};
             }
 
@@ -628,14 +629,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             llama_synchronize(ctx);
             const auto t_end = std::chrono::high_resolution_clock::now();
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total*n_chunk/n_seq);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
+        LOG("\n");
 
         for (int seq = 0; seq < n_seq_batch; seq++) {
             const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
@@ -656,19 +658,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
             // perplexity is e^(average negative log-likelihood)
             if (params.ppl_output_type == 0) {
-                printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
             } else {
                 double av = nll/count;
                 double av2 = nll2/count - av*av;
                 if (av2 > 0) av2 = sqrt(av2/(count-1));
-                printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+                LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
             }
         }
-        fflush(stdout);
 
         logits.clear();
     }
-    printf("\n");
+    LOG("\n");
 
     nll2 /= count;
     nll /= count;
@@ -676,9 +677,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     nll2 -= nll * nll;
     if (nll2 > 0) {
         nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
     } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
+        LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
     }
 
     llama_batch_free(batch);
@@ -704,7 +705,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
 
         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {
-            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
 
@@ -790,15 +791,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     }
 
     if (prompt_lines.size() % 6 != 0) {
-        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
         return;
     }
 
     size_t hs_task_count = prompt_lines.size()/6;
-    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
     const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+    LOG_INF("================================= is_spm = %d\n", is_spm);
 
     // The tasks should be randomized so the score stabilizes quickly.
     bool randomize_tasks = true;
@@ -825,7 +826,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         std::vector<llama_token> seq_tokens[4];
     };
 
-    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+    LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
 
     // Select and read data from prompt lines
     std::vector<hs_data_t> hs_data(hs_task_count);
@@ -871,9 +872,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
     }
 
-    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
 
-    printf("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\n");
 
     double acc = 0.0f;
 
@@ -941,7 +942,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
 
         if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
@@ -949,7 +950,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return;
         }
 
@@ -999,7 +1000,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 }
             }
 
-            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+            //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
 
             // If the gold ending got the maximum logprobe add one accuracy point
             if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -1007,8 +1008,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             }
 
             // Print the accumulated accuracy mean x 100
-            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
-            fflush(stdout);
+            LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
         }
 
         i0 = i1 - 1;
@@ -1016,7 +1016,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
     llama_batch_free(batch);
 
-    printf("\n");
+    LOG("\n");
 }
 
 struct winogrande_entry {
@@ -1060,7 +1060,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
             }
         }
         if (ipos != 4) {
-            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
             continue;
         }
         auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -1074,13 +1074,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
             if (sentence[where] == '_') break;
         }
         if (where == int(sentence.size())) {
-            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
             continue;
         }
         std::istringstream stream(answer.c_str());
         int i_answer; stream >> i_answer;
         if (stream.fail() || i_answer < 1 || i_answer > 2) {
-            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
             continue;
         }
         result.emplace_back();
@@ -1109,14 +1109,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
     auto data = load_winogrande_from_csv(params.prompt);
     if (data.empty()) {
-        fprintf(stderr, "%s: no tasks\n", __func__);
+        LOG_ERR("%s: no tasks\n", __func__);
         return;
     }
 
-    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
 
     if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
         std::mt19937 rng(1);
         std::vector<int> aux(data.size());
         for (int i = 0; i < int(data.size()); ++i) {
@@ -1134,7 +1134,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         data = std::move(selected);
     }
 
-    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+    LOG_INF("%s : tokenizing selected tasks\n", __func__);
 
     for (auto & task : data) {
         task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@@ -1157,7 +1157,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
     }
 
-    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+    LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
@@ -1218,7 +1218,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         }
 
         if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
@@ -1226,7 +1226,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return;
         }
 
@@ -1286,20 +1286,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             ++n_done;
 
             // print the accumulated accuracy mean x 100
-            printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
-            fflush(stdout);
+            LOG("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
         }
 
         i0 = i1 - 1;
     }
 
-    printf("\n");
+    LOG("\n");
 
     if (n_done < 100) return;
 
     const float p = 1.f*n_correct/n_done;
     const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+
+    LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }
 
 static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1348,7 +1348,7 @@ struct multiple_choice_task {
 static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
     if (task.question.empty() || task.mc1.answers.empty()) {
         if (log_error) {
-            printf("%s: found bad task with empty question and/or answers\n", __func__);
+            LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
         }
         return false;
     }
@@ -1356,7 +1356,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
     for (auto& answer : task.mc1.answers) {
         if (answer.empty()) {
             if (log_error) {
-                printf("%s: found empty answer\n", __func__);
+                LOG_ERR("%s: found empty answer\n", __func__);
             }
             return false;
         }
@@ -1410,14 +1410,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     uint32_t n_task;
     strstream.read((char *)&n_task, sizeof(n_task));
     if (strstream.fail() || n_task == 0) {
-        printf("%s: no tasks\n", __func__);
+        LOG_ERR("%s: no tasks\n", __func__);
         return;
     }
-    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+    LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
     std::vector<uint32_t> task_pos(n_task);
     strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
     if (strstream.fail()) {
-        printf("%s: failed to read task positions from prompt\n", __func__);
+        LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
         return;
     }
 
@@ -1425,21 +1425,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
         // Use all tasks
         tasks.resize(n_task);
-        printf("%s: reading tasks", __func__);
+        LOG_INF("%s: reading tasks", __func__);
         int n_dot = std::max((int) n_task/100, 1);
         int i = 0;
         for (auto& task : tasks) {
             ++i;
             if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
                 return;
             }
-            if (i%n_dot == 0) printf(".");
+            if (i%n_dot == 0) LOG(".");
         }
-        printf("done\n");
+        LOG("done\n");
     }
     else {
-        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
         std::mt19937 rng(1);
         std::vector<int> aux(n_task);
         for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1452,18 +1452,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             aux.pop_back();
             strstream.seekg(task_pos[idx], std::ios::beg);
             if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                 return;
             }
         }
         n_task = params.multiple_choice_tasks;
     }
 
-    printf("%s: preparing task data", __func__);
-    fflush(stdout);
+    LOG_INF("%s: preparing task data", __func__);
     if (n_task > 500) {
-        printf("...");
-        fflush(stdout);
+        LOG("...");
         std::atomic<int> counter(0);
         std::atomic<int> n_bad(0);
         auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@@ -1487,11 +1485,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         for (auto& w : workers) w = std::thread(prepare);
         prepare();
         for (auto& w : workers) w.join();
-        printf("done\n");
-        fflush(stdout);
+        LOG("done\n");
         int nbad = n_bad;
         if (nbad > 0) {
-            printf("%s: found %d malformed tasks\n", __func__, nbad);
+            LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
             return;
         }
     } else {
@@ -1503,16 +1500,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 return;
             }
             if (i_task%n_dot == 0) {
-                printf(".");
-                fflush(stdout);
+                LOG(".");
             }
         }
-        printf("done\n");
+        LOG("done\n");
     }
 
-    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+    LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
 
-    printf("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\n");
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
@@ -1591,7 +1587,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         }
 
         if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
@@ -1599,7 +1595,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return;
         }
 
@@ -1623,13 +1619,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         // compute the logprobs for each ending of the decoded tasks
         for (size_t i = i0; i < i1; ++i) {
             auto & cur_task = tasks[i];
-            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
             //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
             //    if (cur_task.mc1.labels[j] == 1) {
-            //        printf("%d", j+1);
+            //        LOG("%d", j+1);
             //    }
             //}
-            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
+            //LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);
 
             // get the logits of the last token of the common prefix
             std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@@ -1641,13 +1637,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 size_t count = 1;
                 float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
                 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    //printf("        %zu  %g\n", ir, eval_results[ir]);
+                    //LOG("        %zu  %g\n", ir, eval_results[ir]);
                     ++count;
                     log_prob += eval_results[ir++];
                 }
                 cur_task.log_probs[s] = log_prob / count;
-                //printf("        Final: %g\n", log_prob / count);
-                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+                //LOG("        Final: %g\n", log_prob / count);
+                //LOG("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
             }
 
             // Find the ending with maximum logprob
@@ -1667,8 +1663,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             ++n_done;
 
             // Print the accumulated accuracy mean x 100
-            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
-            fflush(stdout);
+            LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
         }
 
         i0 = i1 - 1;
@@ -1680,29 +1675,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
     float p = 1.f*n_correct/n_done;
     float sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    LOG("\n");
+    LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
     p = 1.f*n_done/n_tot_answers;
     sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
 
-    printf("\n");
+    LOG_INF("\n");
 }
 
 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     if (params.logits_file.empty()) {
-        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
         return;
     }
     std::ifstream in(params.logits_file.c_str(), std::ios::binary);
     if (!in) {
-        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
         return;
     }
     {
         char check[9]; check[8] = 0;
         in.read(check, 8);
         if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
             return;
         }
     }
@@ -1710,7 +1706,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     uint32_t n_ctx;
     in.read((char *)&n_ctx, sizeof(n_ctx));
     if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+        LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                 __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }
 
@@ -1718,16 +1714,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {
-        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
         return;
     }
     if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
     }
 
     std::vector<llama_token> tokens(n_ctx * n_chunk);
     if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
         return;
     }
 
@@ -1776,7 +1772,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
             return;
         }
 
@@ -1797,7 +1793,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
             // TODO: use llama_batch.logits instead of relying on logits_all == true
             if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_ERR("%s : failed to eval\n", __func__);
                 return;
             }
 
@@ -1814,16 +1810,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
-            printf("\nchunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
+        LOG("\n");
+        LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
 
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@@ -1832,79 +1828,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         p_diff_ptr += n_ctx - 1 - first;
         kld_ptr    += n_ctx - 1 - first;
 
-        printf("%4d", i+1);
+        LOG("%4d", i+1);
 
         auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
         const double ppl_val = exp(log_ppl.first);
         const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-        printf("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+        LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
 
         auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
         const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
         const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
         const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-        printf("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+        LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
 
         auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        printf("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+        LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
 
         auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
         const double p_diff_rms_val = sqrt(p_diff_mse.first);
         const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-        printf("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
 
         double p_top_val = 1.*kld.n_same_top/kld.count;
         double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-        printf("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
-
-        printf("\n");
+        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
 
-        fflush(stdout);
+        LOG("\n");
 
         logits.clear();
     }
-    printf("\n");
+    LOG("\n");
 
     if (kld.count < 100) return; // we do not wish to do statistics on so few values
 
     std::sort(kld_values.begin(), kld_values.end());
     std::sort(p_diff_values.begin(), p_diff_values.end());
 
-    printf("====== Perplexity statistics ======\n");
+    LOG("====== Perplexity statistics ======\n");
 
     auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
     const double ppl_val = exp(log_ppl.first);
     const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-    printf("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+    LOG("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
 
     auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
     const double ppl_base_val = exp(log_ppl_base.first);
     const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
-    printf("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+    LOG("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
 
     const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
     const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
-    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+    LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
 
     const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
     const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-    printf("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+    LOG("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
 
     const double ppl_ratio_val = exp(log_ppl_ratio_val);
     const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
-    printf("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+    LOG("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
 
     const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
     const double ppl_diff_val = ppl_val - ppl_base_val;
     const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
-    printf("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+    LOG("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
 
-    printf("\n");
+    LOG("\n");
 
-    printf("====== KL divergence statistics ======\n");
+    LOG("====== KL divergence statistics ======\n");
     auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+    LOG("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
     auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                                : kld_values[kld_values.size()/2];
 
@@ -1916,53 +1910,58 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
     };
 
-    printf("Maximum KLD: %10.6f\n", kld_values.back());
-    printf("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
-    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("Median  KLD: %10.6f\n", kld_median);
-    printf("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
-    printf(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
-    printf(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
-    printf("Minimum KLD: %10.6f\n", kld_values.front());
+    LOG("Maximum KLD: %10.6f\n", kld_values.back());
+    LOG("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("Median  KLD: %10.6f\n", kld_median);
+    LOG("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    LOG(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    LOG(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    LOG("Minimum KLD: %10.6f\n", kld_values.front());
 
-    printf("\n");
+    LOG("\n");
 
-    printf("====== Token probability statistics ======\n");
+    LOG("====== Token probability statistics ======\n");
 
     auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-    printf("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
+    LOG("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
 
     auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
                                                : p_diff_values[p_diff_values.size()/2];
 
-    printf("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
-    printf("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
-    printf("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
-    printf("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
-    printf("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
-    printf("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
-    printf("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
-    printf("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
-    printf("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
-    printf(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
-    printf(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
-    printf(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
-    printf("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
+    LOG("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
+    LOG("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    LOG("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    LOG("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    LOG("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    LOG("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    LOG("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
+    LOG("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    LOG("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    LOG(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    LOG(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    LOG(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    LOG("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
 
     auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-    // printf("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+    // LOG("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
 
     const double p_diff_rms_val = sqrt(p_diff_mse.first);
     const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-    printf("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+    LOG("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
 
     const double same_top_p = 1.0*kld.n_same_top/kld.count;
-    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
-
+    LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     params.n_ctx = 512;
@@ -1975,7 +1974,7 @@ int main(int argc, char ** argv) {
     const int32_t n_ctx = params.n_ctx;
 
     if (n_ctx <= 0) {
-        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
         return 1;
     }
 
@@ -2000,7 +1999,7 @@ int main(int argc, char ** argv) {
     }
 
     if (params.ppl_stride > 0) {
-        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+        LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
                 params.n_ctx, params.n_ctx + params.ppl_stride/2);
         params.n_ctx += params.ppl_stride/2;
     }
@@ -2016,21 +2015,21 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);
 
     if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     struct results_perplexity results;
@@ -2046,7 +2045,7 @@ int main(int argc, char ** argv) {
         results = perplexity(ctx, params, n_ctx);
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
     write_logfile(ctx, params, model, results);
 
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 7a360b731916a8..ed5af43070a809 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -1,14 +1,16 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
 #include <fstream>
+#include <iostream> // TODO: remove me
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG_TEE("\n");
+    LOG("\nexample usage:\n");
+    LOG("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG("\n");
 }
 
 struct chunk {
@@ -17,7 +19,7 @@ struct chunk {
     // original file position
     size_t filepos;
     // original text data
-    std::string textdata = "";
+    std::string textdata;
     // tokenized text data
     std::vector<llama_token> tokens;
     // embedding
@@ -31,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     std::ifstream f(filename.c_str());
 
     if (!f.is_open()) {
-        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+        LOG_ERR("could not open file %s\n", filename.c_str());
         return chunks;
     }
 
     chunk current_chunk;
     char buffer[1024];
     int64_t filepos = 0;
-    std::string current = "";
+    std::string current;
     while (f.read(buffer, 1024)) {
         current += std::string(buffer, f.gcount());
         size_t pos;
@@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+        LOG_ERR("%s : failed to decode\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         if (embd == NULL) {
             embd = llama_get_embeddings_ith(ctx, i);
             if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
                 continue;
             }
         }
@@ -110,6 +112,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
@@ -121,19 +129,19 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     if (params.chunk_size <= 0) {
-        fprintf(stderr, "chunk_size must be positive\n");
+        LOG_ERR("chunk_size must be positive\n");
         return 1;
     }
     if (params.context_files.empty()) {
-        fprintf(stderr, "context_files must be specified\n");
+        LOG_ERR("context_files must be specified\n");
         return 1;
     }
 
     print_build_info();
 
-    printf("processing files:\n");
+    LOG_INF("processing files:\n");
     for (auto & context_file : params.context_files) {
-        printf("%s\n", context_file.c_str());
+        LOG_INF("%s\n", context_file.c_str());
     }
 
     std::vector<chunk> chunks;
@@ -141,7 +149,7 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-    printf("Number of chunks: %ld\n", chunks.size());
+    LOG_INF("Number of chunks: %ld\n", chunks.size());
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -153,7 +161,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx = llama_init.context;
 
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -162,19 +170,19 @@ int main(int argc, char ** argv) {
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
         return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     // max batch size
@@ -185,7 +193,7 @@ int main(int argc, char ** argv) {
     for (auto & chunk : chunks) {
         auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -199,12 +207,12 @@ int main(int argc, char ** argv) {
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) chunks.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG_INF("\n\n");
         }
     }
 
@@ -256,7 +264,7 @@ int main(int argc, char ** argv) {
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
-        printf("Enter query: ");
+        LOG("Enter query: ");
         std::getline(std::cin, query);
         std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
 
@@ -280,18 +288,18 @@ int main(int argc, char ** argv) {
                 return a.second > b.second;
             });
 
-            printf("Top %d similar chunks:\n", params.sparams.top_k);
+            LOG("Top %d similar chunks:\n", params.sparams.top_k);
             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
-                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
-                printf("similarity: %f\n", similarities[i].second);
-                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
-                printf("--------------------\n");
+                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                LOG("similarity: %f\n", similarities[i].second);
+                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                LOG("--------------------\n");
             }
         }
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     // clean up
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5b263f646979bb..52c4a713446f2f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2,6 +2,7 @@
 
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -2252,10 +2253,10 @@ struct server_context {
                         const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                         const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
 
-                        LOG_TEE("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG_DBG("\n");
+                        LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 
                         llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
                         llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
@@ -2265,7 +2266,7 @@ struct server_context {
 
                         slot.ga_i += slot.ga_w / slot.ga_n;
 
-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                     }
 
                     slot.n_past_se += n_tokens;
@@ -2420,9 +2421,12 @@ inline void signal_handler(int signal) {
 }
 
 int main(int argc, char ** argv) {
-#if SERVER_VERBOSE != 1
-    log_disable();
-#endif
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     // own arguments required by this example
     gpt_params params;
 
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index ceb3eba24e626c..0e96fe7e8746e9 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -13,7 +13,9 @@ static void print_usage(int, char ** argv) {
 
 int main(int argc, char ** argv) {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        gpt_log_add(gpt_log_main(), level, 0, text, NULL);
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
     }, NULL);
 
     gpt_params params;
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 214e4932ba2ca2..10bb8b654a3848 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,6 +1,7 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cstdio>
@@ -27,6 +28,12 @@ struct seq_draft {
 };
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
@@ -34,7 +41,7 @@ int main(int argc, char ** argv) {
     }
 
     if (params.model_draft.empty()) {
-        fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
+        LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
 
@@ -47,12 +54,6 @@ int main(int argc, char ** argv) {
     std::default_random_engine rng(params.sparams.seed);
     std::uniform_real_distribution<> u_dist;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("speculative", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -81,14 +82,14 @@ int main(int argc, char ** argv) {
     ctx_dft = llama_init_dft.context;
 
     const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-    LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+    LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
 
     const bool vocab_type_dft = llama_vocab_type(model_dft);
-    LOG("vocab_type dft: %d\n", vocab_type_dft);
+    LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
-        fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
-        fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
+        LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
         return 1;
     }
 
@@ -98,7 +99,7 @@ int main(int argc, char ** argv) {
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
         llama_token_eos(model_tgt) != llama_token_eos(model_dft)
     ) {
-        fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
         return 1;
     }
 
@@ -110,8 +111,8 @@ int main(int argc, char ** argv) {
             : n_vocab_dft - n_vocab_tgt;
 
         if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
-            fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+            LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                     n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
             return 1;
         }
@@ -120,8 +121,8 @@ int main(int argc, char ** argv) {
             const char * token_text_tgt = llama_token_get_text(model_tgt, i);
             const char * token_text_dft = llama_token_get_text(model_dft, i);
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
-                fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
+                LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
+                LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
                         llama_token_to_piece(ctx_tgt, i).c_str(),
                         llama_token_to_piece(ctx_dft, i).c_str());
                 return 1;
@@ -138,18 +139,16 @@ int main(int argc, char ** argv) {
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
     }
 
-    fflush(stderr);
-
     const int n_input = inp.size();
 
     const auto t_enc_start = ggml_time_us();
@@ -211,7 +210,7 @@ int main(int argc, char ** argv) {
             active_seqs.insert(s);
             const auto & tokens = drafts[s].tokens;
 
-            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
+            LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
         }
 
         int i_dft  = 0;
@@ -254,7 +253,7 @@ int main(int argc, char ** argv) {
                             continue;
                         }
 
-                        LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+                        LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                         float r = u_dist(rng);
                         llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
 
@@ -272,7 +271,7 @@ int main(int argc, char ** argv) {
                                 break;
                             }
                         }
-                        LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+                        LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
                         if (r <= p_tgt / p_dft) {
                             s_keep = s;
                             accept = true;
@@ -280,10 +279,10 @@ int main(int argc, char ** argv) {
                             token_str = llama_token_to_piece(ctx_tgt, token_id);
                             gpt_sampler_accept(smpl, token_id, true);
 
-                            LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+                            LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                             break;
                         } else {
-                            LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
                             drafts[s].active = false;
 
                             // calculate residual probability
@@ -338,7 +337,7 @@ int main(int argc, char ** argv) {
                     if (!accept) {
                         // all drafted tokens were rejected
                         // sample from the target model
-                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
+                        LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
                         std::vector<float> probs(dist_tgt.size);
                         for (size_t i = 0; i < dist_tgt.size; ++i) {
                             probs[i] = dist_tgt.data[i].p;
@@ -356,13 +355,11 @@ int main(int argc, char ** argv) {
                     // greedy verification
 
                     // sample from the target model
-                    LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+                    LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
                     token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
 
                     gpt_sampler_accept(smpl, token_id, true);
 
-                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
-
                     token_str = llama_token_to_piece(ctx_tgt, token_id);
 
                     for (int s = 0; s < n_seq_dft; ++s) {
@@ -371,7 +368,7 @@ int main(int argc, char ** argv) {
                         }
 
                         if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
-                            LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+                            LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
 
                             s_keep = s;
                             accept = true;
@@ -393,26 +390,24 @@ int main(int argc, char ** argv) {
                     ++i_dft;
                     if (params.use_color) {
                         // Color token according to its origin sequence
-                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
                     } else {
-                        printf("%s", token_str.c_str());
+                        LOG("%s", token_str.c_str());
                     }
-                    fflush(stdout);
                     continue;
                 } else {
-                    printf("%s", token_str.c_str());
-                    fflush(stdout);
+                    LOG("%s", token_str.c_str());
                     break;
                 }
             }
         }
 
         {
-            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
 
             // TODO: simplify
             {
-                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+                LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
 
                 llama_kv_cache_seq_keep(ctx_dft, s_keep);
                 llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
@@ -439,7 +434,7 @@ int main(int argc, char ** argv) {
             llama_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
 
             llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
             llama_decode(ctx_dft, batch_dft);
 
             ++n_past_dft;
@@ -486,7 +481,7 @@ int main(int argc, char ** argv) {
                 const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
 
                 for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
-                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
                             k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                 }
 
@@ -495,7 +490,7 @@ int main(int argc, char ** argv) {
                 // attempt to split the branch if the probability is high enough
                 for (int f = 1; f < 8; ++f) {
                     if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
-                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
+                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
                         llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
                         llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@@ -584,7 +579,7 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
             }
 
-            // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
             llama_decode(ctx_tgt, batch_tgt);
             ++n_past_tgt;
         }
@@ -602,23 +597,25 @@ int main(int argc, char ** argv) {
 
     auto t_dec_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    LOG("\n\n");
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
 
-    LOG_TEE("\n");
-    LOG_TEE("n_draft   = %d\n", n_draft);
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_drafted = %d\n", n_drafted);
-    LOG_TEE("n_accept  = %d\n", n_accept);
-    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_INF("\n");
+    LOG_INF("n_draft   = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ndraft:\n\n");
+    LOG_INF("\n");
+    LOG_INF("draft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
     llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
 
-    LOG_TEE("\ntarget:\n\n");
+    LOG_INF("\n");
+    LOG_INF("target:\n\n");
     gpt_perf_print(ctx_tgt, smpl);
 
     gpt_sampler_free(smpl);
@@ -637,7 +634,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index c817be566cf548..02addc5b2c366a 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -1,11 +1,12 @@
 #include "common.h"
+//#include "log.h" // TODO: start using log.h
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <iostream> // TODO: remove me
 
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -13,25 +14,25 @@
 #include <shellapi.h>   // For CommandLineToArgvW
 #endif
 
-static void print_usage_information(const char * argv0, FILE * stream) {
-    fprintf(stream, "usage: %s [options]\n\n", argv0);
-    fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
-    fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
-    fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
-    fprintf(stream, "to control the behavior of the tokenizer.\n\n");
-    fprintf(stream, "    The possible options are:\n");
-    fprintf(stream, "\n");
-    fprintf(stream, "    -h, --help                           print this help and exit\n");
-    fprintf(stream, "    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
-    fprintf(stream, "    --ids                                if given, only print numerical token IDs, and not token strings.\n");
-    fprintf(stream, "                                         The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
-    fprintf(stream, "    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
-    fprintf(stream, "    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
-    fprintf(stream, "    --stdin                              read prompt from standard input.\n");
-    fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
-    fprintf(stream, "    --no-parse-special                   do not parse control tokens.\n");
-    fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
-    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
+static void print_usage_information(const char * argv0) {
+    printf("usage: %s [options]\n\n", argv0);
+    printf("The tokenize program tokenizes a prompt using a given model,\n");
+    printf("and prints the resulting tokens to standard output.\n\n");
+    printf("It needs a model file, a prompt, and optionally other flags\n");
+    printf("to control the behavior of the tokenizer.\n\n");
+    printf("    The possible options are:\n");
+    printf("\n");
+    printf("    -h, --help                           print this help and exit\n");
+    printf("    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    printf("    --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    printf("                                         The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    printf("    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    printf("    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    printf("    --stdin                              read prompt from standard input.\n");
+    printf("    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    printf("    --no-parse-special                   do not parse control tokens.\n");
+    printf("    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    printf("    --show-count                         print the total number of tokens.\n");
 }
 
 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -185,7 +186,7 @@ int main(int raw_argc, char ** raw_argv) {
     const int argc = argv.size();
 
     if (argc <= 1) {
-        print_usage_information(argv[0].c_str(), stderr);
+        print_usage_information(argv[0].c_str());
         return 1;
     }
 
@@ -214,7 +215,7 @@ int main(int raw_argc, char ** raw_argv) {
     for (; iarg < argc; ++iarg) {
         std::string arg{argv[iarg]};
         if (arg == "-h" || arg == "--help") {
-            print_usage_information(argv[0].c_str(), stdout);
+            print_usage_information(argv[0].c_str());
             return 0;
         }
         else if (arg == "--ids") {
@@ -323,10 +324,6 @@ int main(int raw_argc, char ** raw_argv) {
     // Start actually doing the tokenizing stuff.
     //////
 
-#ifdef LOG_DISABLE_LOGS
-    disable_logging = true;
-#endif
-
     if (disable_logging) {
         llama_log_set(llama_log_callback_null, NULL);
     }